author    Paolo Bonzini <pbonzini@redhat.com>	2013-10-28 08:15:55 -0400
committer Paolo Bonzini <pbonzini@redhat.com>	2013-10-28 08:15:55 -0400
commit    5bb3398dd2df2c26261b2156c98cf4c95b3f91fe
tree      526d914e0e1cc62249b4a0d2fea31558cc17fcd7 /arch/arm
parent    e0230e1327fb862c9b6cde24ae62d55f9db62c9b
parent    9b5fdb9781f74fb15827e465bfb5aa63211953c8
Merge tag 'kvm-arm-for-3.13-2' of git://git.linaro.org/people/cdall/linux-kvm-arm into kvm-queue
Updates for KVM/ARM, take 2, including:
- Transparent Huge Pages and hugetlbfs support for KVM/ARM
- Yield CPU when guest executes WFE to speed up CPU overcommit
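
The hugetlbfs side of this series relies on the VMM backing guest RAM with huge pages in the first place. Below is a minimal, illustrative userspace sketch of how that could be done with an anonymous MAP_HUGETLB mapping registered through the standard KVM_SET_USER_MEMORY_REGION ioctl; the sizes, the slot number, and the vm_fd/guest_phys_addr parameters are assumptions for the example, not part of this commit.

/*
 * Illustrative sketch only: back guest RAM with 2MB huge pages so that
 * stage-2 faults can be resolved with PMD-sized block mappings.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

#define GUEST_RAM_SIZE	(256ULL << 20)	/* 256 MB, a multiple of 2 MB */

static void *alloc_guest_ram(int vm_fd, uint64_t guest_phys_addr)
{
	/* hugetlbfs-backed; drop MAP_HUGETLB to rely on THP instead */
	void *ram = mmap(NULL, GUEST_RAM_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (ram == MAP_FAILED)
		return NULL;

	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.guest_phys_addr = guest_phys_addr,
		.memory_size = GUEST_RAM_SIZE,
		.userspace_addr = (uint64_t)(uintptr_t)ram,
	};
	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0) {
		munmap(ram, GUEST_RAM_SIZE);
		return NULL;
	}
	return ram;
}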
Diffstat (limited to 'arch/arm')
-rw-r--r--  arch/arm/include/asm/kvm_arm.h         |   5
-rw-r--r--  arch/arm/include/asm/kvm_mmu.h         |  17
-rw-r--r--  arch/arm/include/asm/pgtable-3level.h  |   2
-rw-r--r--  arch/arm/kvm/Kconfig                   |   1
-rw-r--r--  arch/arm/kvm/handle_exit.c             |  20
-rw-r--r--  arch/arm/kvm/mmu.c                     | 223
6 files changed, 219 insertions(+), 49 deletions(-)
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index d556f03bca17..1d3153c7eb41 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -57,6 +57,7 @@
  * TSC: Trap SMC
  * TSW: Trap cache operations by set/way
  * TWI: Trap WFI
+ * TWE: Trap WFE
  * TIDCP: Trap L2CTLR/L2ECTLR
  * BSU_IS: Upgrade barriers to the inner shareable domain
  * FB: Force broadcast of all maintainance operations
@@ -67,7 +68,7 @@
  */
 #define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \
			HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \
-			HCR_SWIO | HCR_TIDCP)
+			HCR_TWE | HCR_SWIO | HCR_TIDCP)
 #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
 
 /* System Control Register (SCTLR) bits */
@@ -208,6 +209,8 @@
 #define HSR_EC_DABT	(0x24)
 #define HSR_EC_DABT_HYP	(0x25)
 
+#define HSR_WFI_IS_WFE	(1U << 0)
+
 #define HSR_HVC_IMM_MASK	((1UL << 16) - 1)
 
 #define HSR_DABT_S1PTW	(1U << 7)
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 9b28c41f4ba9..77de4a41cc50 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -62,6 +62,12 @@ phys_addr_t kvm_get_idmap_vector(void);
 int kvm_mmu_init(void);
 void kvm_clear_hyp_idmap(void);
 
+static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd)
+{
+	*pmd = new_pmd;
+	flush_pmd_entry(pmd);
+}
+
 static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
 {
 	*pte = new_pte;
@@ -103,9 +109,15 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
 	pte_val(*pte) |= L_PTE_S2_RDWR;
 }
 
+static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+{
+	pmd_val(*pmd) |= L_PMD_S2_RDWR;
+}
+
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
+					      unsigned long size)
 {
 	/*
 	 * If we are going to insert an instruction page and the icache is
@@ -120,8 +132,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
 	 * need any kind of flushing (DDI 0406C.b - Page B3-1392).
 	 */
 	if (icache_is_pipt()) {
-		unsigned long hva = gfn_to_hva(kvm, gfn);
-		__cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
+		__cpuc_coherent_user_range(hva, hva + size);
 	} else if (!icache_is_vivt_asid_tagged()) {
 		/* any kind of VIPT cache */
 		__flush_icache_all();
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 5689c18c85f5..a331d2527342 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -126,6 +126,8 @@
 #define L_PTE_S2_RDONLY		(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
 #define L_PTE_S2_RDWR		(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
 
+#define L_PMD_S2_RDWR		(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
+
 /*
  * Hyp-mode PL2 PTE definitions for LPAE.
  */
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index ebf5015508b5..466bd299b1a8 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -20,6 +20,7 @@ config KVM
	bool "Kernel-based Virtual Machine (KVM) support"
	select PREEMPT_NOTIFIERS
	select ANON_INODES
+	select HAVE_KVM_CPU_RELAX_INTERCEPT
	select KVM_MMIO
	select KVM_ARM_HOST
	depends on ARM_VIRT_EXT && ARM_LPAE
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index df4c82d47ad7..a92079011a83 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -73,23 +73,29 @@ static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
 }
 
 /**
- * kvm_handle_wfi - handle a wait-for-interrupts instruction executed by a guest
+ * kvm_handle_wfx - handle a WFI or WFE instruction trapped from a guest
  * @vcpu:	the vcpu pointer
  * @run:	the kvm_run structure pointer
  *
- * Simply sets the wait_for_interrupts flag on the vcpu structure, which will
- * halt execution of world-switches and schedule other host processes until
- * there is an incoming IRQ or FIQ to the VM.
+ * WFE: Yield the CPU and come back to this vcpu when the scheduler
+ * decides to.
+ * WFI: Simply call kvm_vcpu_block(), which will halt execution of
+ * world-switches and schedule other host processes until there is an
+ * incoming IRQ or FIQ to the VM.
  */
-static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run)
+static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
	trace_kvm_wfi(*vcpu_pc(vcpu));
-	kvm_vcpu_block(vcpu);
+	if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE)
+		kvm_vcpu_on_spin(vcpu);
+	else
+		kvm_vcpu_block(vcpu);
+
	return 1;
 }
 
 static exit_handle_fn arm_exit_handlers[] = {
-	[HSR_EC_WFI]		= kvm_handle_wfi,
+	[HSR_EC_WFI]		= kvm_handle_wfx,
	[HSR_EC_CP15_32]	= kvm_handle_cp15_32,
	[HSR_EC_CP15_64]	= kvm_handle_cp15_64,
	[HSR_EC_CP14_MR]	= kvm_handle_cp14_access,
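
For context on the WFE change: a vCPU waiting for a spinlock typically sits in a WFE loop, and with more vCPUs than physical CPUs the lock holder may not even be running. With HCR_TWE set, each guest WFE now traps to the handler above, which calls kvm_vcpu_on_spin() so the host can run another vCPU instead of burning the timeslice. The sketch below shows the kind of guest-side wait loop involved; it is illustrative only and is not the kernel's actual spinlock implementation.

/*
 * Illustrative guest-side sketch: while "lock" is held by another vCPU,
 * this vCPU executes WFE.  With HCR_TWE set, each WFE exits to KVM,
 * which yields via kvm_vcpu_on_spin() so a runnable vCPU (e.g. the lock
 * holder) can make progress.
 */
static inline void guest_spin_lock(volatile unsigned int *lock)
{
	while (__atomic_exchange_n(lock, 1, __ATOMIC_ACQUIRE) != 0) {
		do {
			asm volatile("wfe" ::: "memory");
		} while (*lock != 0);
	}
}

static inline void guest_spin_unlock(volatile unsigned int *lock)
{
	__atomic_store_n(lock, 0, __ATOMIC_RELEASE);
	asm volatile("sev" ::: "memory");	/* wake up WFE waiters */
}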
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index b0de86b56c13..371958370de4 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -19,6 +19,7 @@
 #include <linux/mman.h>
 #include <linux/kvm_host.h>
 #include <linux/io.h>
+#include <linux/hugetlb.h>
 #include <trace/events/kvm.h>
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
@@ -41,6 +42,8 @@ static unsigned long hyp_idmap_start;
 static unsigned long hyp_idmap_end;
 static phys_addr_t hyp_idmap_vector;
 
+#define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
	/*
@@ -93,19 +96,29 @@ static bool page_empty(void *ptr)
 
 static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 {
-	pmd_t *pmd_table = pmd_offset(pud, 0);
-	pud_clear(pud);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pmd_free(NULL, pmd_table);
+	if (pud_huge(*pud)) {
+		pud_clear(pud);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	} else {
+		pmd_t *pmd_table = pmd_offset(pud, 0);
+		pud_clear(pud);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+		pmd_free(NULL, pmd_table);
+	}
	put_page(virt_to_page(pud));
 }
 
 static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 {
-	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	pmd_clear(pmd);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pte_free_kernel(NULL, pte_table);
+	if (kvm_pmd_huge(*pmd)) {
+		pmd_clear(pmd);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	} else {
+		pte_t *pte_table = pte_offset_kernel(pmd, 0);
+		pmd_clear(pmd);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+		pte_free_kernel(NULL, pte_table);
+	}
	put_page(virt_to_page(pmd));
 }
 
@@ -136,18 +149,32 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
			continue;
		}
 
+		if (pud_huge(*pud)) {
+			/*
+			 * If we are dealing with a huge pud, just clear it and
+			 * move on.
+			 */
+			clear_pud_entry(kvm, pud, addr);
+			addr = pud_addr_end(addr, end);
+			continue;
+		}
+
		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			addr = pmd_addr_end(addr, end);
			continue;
		}
 
-		pte = pte_offset_kernel(pmd, addr);
-		clear_pte_entry(kvm, pte, addr);
-		next = addr + PAGE_SIZE;
+		if (!kvm_pmd_huge(*pmd)) {
+			pte = pte_offset_kernel(pmd, addr);
+			clear_pte_entry(kvm, pte, addr);
+			next = addr + PAGE_SIZE;
+		}
 
-		/* If we emptied the pte, walk back up the ladder */
-		if (page_empty(pte)) {
+		/*
+		 * If the pmd entry is to be cleared, walk back up the ladder
+		 */
+		if (kvm_pmd_huge(*pmd) || page_empty(pte)) {
			clear_pmd_entry(kvm, pmd, addr);
			next = pmd_addr_end(addr, end);
			if (page_empty(pmd) && !page_empty(pud)) {
@@ -420,29 +447,71 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
	kvm->arch.pgd = NULL;
 }
 
-
-static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			     phys_addr_t addr)
 {
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
-	pte_t *pte, old_pte;
 
-	/* Create 2nd stage page table mapping - Level 1 */
	pgd = kvm->arch.pgd + pgd_index(addr);
	pud = pud_offset(pgd, addr);
	if (pud_none(*pud)) {
		if (!cache)
-			return 0; /* ignore calls from kvm_set_spte_hva */
+			return NULL;
		pmd = mmu_memory_cache_alloc(cache);
		pud_populate(NULL, pud, pmd);
		get_page(virt_to_page(pud));
	}
 
-	pmd = pmd_offset(pud, addr);
+	return pmd_offset(pud, addr);
+}
+
+static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
+			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
+{
+	pmd_t *pmd, old_pmd;
+
+	pmd = stage2_get_pmd(kvm, cache, addr);
+	VM_BUG_ON(!pmd);
+
+	/*
+	 * Mapping in huge pages should only happen through a fault. If a
+	 * page is merged into a transparent huge page, the individual
+	 * subpages of that huge page should be unmapped through MMU
+	 * notifiers before we get here.
+	 *
+	 * Merging of CompoundPages is not supported; they should be split
+	 * first, unmapped, merged, and mapped back in on-demand.
+	 */
+	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
+
+	old_pmd = *pmd;
+	kvm_set_pmd(pmd, *new_pmd);
+	if (pmd_present(old_pmd))
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	else
+		get_page(virt_to_page(pmd));
+	return 0;
+}
+
+static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+{
+	pmd_t *pmd;
+	pte_t *pte, old_pte;
 
-	/* Create 2nd stage page table mapping - Level 2 */
+	/* Create stage-2 page table mapping - Level 1 */
+	pmd = stage2_get_pmd(kvm, cache, addr);
+	if (!pmd) {
+		/*
+		 * Ignore calls from kvm_set_spte_hva for unallocated
+		 * address ranges.
+		 */
+		return 0;
+	}
+
+	/* Create stage-2 page mappings - Level 2 */
	if (pmd_none(*pmd)) {
		if (!cache)
			return 0; /* ignore calls from kvm_set_spte_hva */
@@ -507,16 +576,60 @@ out:
	return ret;
 }
 
+static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
+{
+	pfn_t pfn = *pfnp;
+	gfn_t gfn = *ipap >> PAGE_SHIFT;
+
+	if (PageTransCompound(pfn_to_page(pfn))) {
+		unsigned long mask;
+		/*
+		 * The address we faulted on is backed by a transparent huge
+		 * page. However, because we map the compound huge page and
+		 * not the individual tail page, we need to transfer the
+		 * refcount to the head page. We have to be careful that the
+		 * THP doesn't start to split while we are adjusting the
+		 * refcounts.
+		 *
+		 * We are sure this doesn't happen, because mmu_notifier_retry
+		 * was successful and we are holding the mmu_lock, so if this
+		 * THP is trying to split, it will be blocked in the mmu
+		 * notifier before touching any of the pages, specifically
+		 * before being able to call __split_huge_page_refcount().
+		 *
+		 * We can therefore safely transfer the refcount from PG_tail
+		 * to PG_head and switch the pfn from a tail page to the head
+		 * page accordingly.
+		 */
+		mask = PTRS_PER_PMD - 1;
+		VM_BUG_ON((gfn & mask) != (pfn & mask));
+		if (pfn & mask) {
+			*ipap &= PMD_MASK;
+			kvm_release_pfn_clean(pfn);
+			pfn &= ~mask;
+			kvm_get_pfn(pfn);
+			*pfnp = pfn;
+		}
+
+		return true;
+	}
+
+	return false;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-			  gfn_t gfn, struct kvm_memory_slot *memslot,
+			  struct kvm_memory_slot *memslot,
			  unsigned long fault_status)
 {
-	pte_t new_pte;
-	pfn_t pfn;
	int ret;
-	bool write_fault, writable;
+	bool write_fault, writable, hugetlb = false, force_pte = false;
	unsigned long mmu_seq;
+	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+	unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
+	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+	struct vm_area_struct *vma;
+	pfn_t pfn;
 
	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
	if (fault_status == FSC_PERM && !write_fault) {
@@ -524,6 +637,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
		return -EFAULT;
	}
 
+	/* Let's check if we will get back a huge page backed by hugetlbfs */
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma_intersection(current->mm, hva, hva + 1);
+	if (is_vm_hugetlb_page(vma)) {
+		hugetlb = true;
+		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
+	} else {
+		/*
+		 * Pages belonging to VMAs not aligned to the PMD mapping
+		 * granularity cannot be mapped using block descriptors even
+		 * if the pages belong to a THP for the process, because the
+		 * stage-2 block descriptor will cover more than a single THP
+		 * and we lose atomicity for unmapping, updates, and splits
+		 * of the THP or other pages in the stage-2 block range.
+		 */
+		if (vma->vm_start & ~PMD_MASK)
+			force_pte = true;
+	}
+	up_read(&current->mm->mmap_sem);
+
	/* We need minimum second+third level pages */
	ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
	if (ret)
@@ -541,26 +674,40 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
	 */
	smp_rmb();
 
-	pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
+	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
	if (is_error_pfn(pfn))
		return -EFAULT;
 
-	new_pte = pfn_pte(pfn, PAGE_S2);
-	coherent_icache_guest_page(vcpu->kvm, gfn);
-
-	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+	spin_lock(&kvm->mmu_lock);
+	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;
-	if (writable) {
-		kvm_set_s2pte_writable(&new_pte);
-		kvm_set_pfn_dirty(pfn);
+	if (!hugetlb && !force_pte)
+		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
+
+	if (hugetlb) {
+		pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
+		new_pmd = pmd_mkhuge(new_pmd);
+		if (writable) {
+			kvm_set_s2pmd_writable(&new_pmd);
+			kvm_set_pfn_dirty(pfn);
+		}
+		coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE);
+		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
+	} else {
+		pte_t new_pte = pfn_pte(pfn, PAGE_S2);
+		if (writable) {
+			kvm_set_s2pte_writable(&new_pte);
+			kvm_set_pfn_dirty(pfn);
+		}
+		coherent_icache_guest_page(kvm, hva, PAGE_SIZE);
+		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
	}
-	stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false);
+
 
 out_unlock:
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	spin_unlock(&kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
-	return 0;
+	return ret;
 }
 
 /**
@@ -629,7 +776,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
 
-	ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status);
+	ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status);
	if (ret == 0)
		ret = 1;
 out_unlock:
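
To make the arithmetic in transparent_hugepage_adjust() concrete, here is a small self-contained sketch with made-up example values; the EX_* macros and the specific addresses are assumptions for illustration (it mirrors, but is not, the kernel code above), assuming 4K pages and 2MB PMDs.

/*
 * Sketch of the pfn/IPA adjustment: a fault on a tail page of a THP is
 * rounded down so one 2MB stage-2 block maps the whole huge page.
 */
#include <stdio.h>
#include <stdint.h>

#define EX_PAGE_SHIFT	12
#define EX_PTRS_PER_PMD	512ULL			/* 4K pages, 2MB PMDs */
#define EX_PMD_SIZE	(EX_PTRS_PER_PMD << EX_PAGE_SHIFT)
#define EX_PMD_MASK	(~(EX_PMD_SIZE - 1))

int main(void)
{
	uint64_t fault_ipa = 0x40123000;	/* guest physical fault address */
	uint64_t pfn = 0x89b23;			/* host pfn of the faulting tail page */
	uint64_t gfn = fault_ipa >> EX_PAGE_SHIFT;
	uint64_t mask = EX_PTRS_PER_PMD - 1;

	/* Both offsets into the 2MB region must agree (the VM_BUG_ON above). */
	if ((gfn & mask) != (pfn & mask))
		return 1;

	if (pfn & mask) {
		fault_ipa &= EX_PMD_MASK;	/* round IPA down to the 2MB block: 0x40000000 */
		pfn &= ~mask;			/* switch from tail page to head page: 0x89a00 */
	}

	/* One PMD block now maps IPA 0x40000000..0x401fffff to pfn 0x89a00..0x89bff. */
	printf("ipa=%#llx pfn=%#llx\n",
	       (unsigned long long)fault_ipa, (unsigned long long)pfn);
	return 0;
}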