author    Paolo Bonzini <pbonzini@redhat.com>	2013-10-28 08:15:55 -0400
committer Paolo Bonzini <pbonzini@redhat.com>	2013-10-28 08:15:55 -0400
commit    5bb3398dd2df2c26261b2156c98cf4c95b3f91fe
tree      526d914e0e1cc62249b4a0d2fea31558cc17fcd7 /arch/arm
parent    e0230e1327fb862c9b6cde24ae62d55f9db62c9b
parent    9b5fdb9781f74fb15827e465bfb5aa63211953c8
Merge tag 'kvm-arm-for-3.13-2' of git://git.linaro.org/people/cdall/linux-kvm-arm into kvm-queue
Updates for KVM/ARM, take 2, including:
- Transparent Huge Pages and hugetlbfs support for KVM/ARM
- Yield CPU when guest executes WFE to speed up CPU overcommit
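
The hugetlbfs side of this series relies on the VMM backing guest RAM with huge pages in the first place. Below is a minimal, illustrative userspace sketch of how that could be done with an anonymous MAP_HUGETLB mapping registered through the standard KVM_SET_USER_MEMORY_REGION ioctl; the sizes, the slot number, and the vm_fd/guest_phys_addr parameters are assumptions for the example, not part of this commit.

/*
 * Illustrative sketch only: back guest RAM with 2MB huge pages so that
 * stage-2 faults can be resolved with PMD-sized block mappings.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

#define GUEST_RAM_SIZE	(256ULL << 20)	/* 256 MB, a multiple of 2 MB */

static void *alloc_guest_ram(int vm_fd, uint64_t guest_phys_addr)
{
	/* hugetlbfs-backed; drop MAP_HUGETLB to rely on THP instead */
	void *ram = mmap(NULL, GUEST_RAM_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (ram == MAP_FAILED)
		return NULL;

	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.guest_phys_addr = guest_phys_addr,
		.memory_size = GUEST_RAM_SIZE,
		.userspace_addr = (uint64_t)(uintptr_t)ram,
	};
	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0) {
		munmap(ram, GUEST_RAM_SIZE);
		return NULL;
	}
	return ram;
}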
Diffstat (limited to 'arch/arm')
-rw-r--r--  arch/arm/include/asm/kvm_arm.h         |   5
-rw-r--r--  arch/arm/include/asm/kvm_mmu.h         |  17
-rw-r--r--  arch/arm/include/asm/pgtable-3level.h  |   2
-rw-r--r--  arch/arm/kvm/Kconfig                   |   1
-rw-r--r--  arch/arm/kvm/handle_exit.c             |  20
-rw-r--r--  arch/arm/kvm/mmu.c                     | 223
6 files changed, 219 insertions(+), 49 deletions(-)
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index d556f03bca17..1d3153c7eb41 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -57,6 +57,7 @@
  * TSC: Trap SMC
  * TSW: Trap cache operations by set/way
  * TWI: Trap WFI
+ * TWE: Trap WFE
  * TIDCP: Trap L2CTLR/L2ECTLR
  * BSU_IS: Upgrade barriers to the inner shareable domain
  * FB: Force broadcast of all maintainance operations
@@ -67,7 +68,7 @@
  */
 #define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \
			HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \
-			HCR_SWIO | HCR_TIDCP)
+			HCR_TWE | HCR_SWIO | HCR_TIDCP)
 #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
 
 /* System Control Register (SCTLR) bits */
@@ -208,6 +209,8 @@
 #define HSR_EC_DABT	(0x24)
 #define HSR_EC_DABT_HYP	(0x25)
 
+#define HSR_WFI_IS_WFE	(1U << 0)
+
 #define HSR_HVC_IMM_MASK	((1UL << 16) - 1)
 
 #define HSR_DABT_S1PTW	(1U << 7)
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 9b28c41f4ba9..77de4a41cc50 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -62,6 +62,12 @@ phys_addr_t kvm_get_idmap_vector(void);
 int kvm_mmu_init(void);
 void kvm_clear_hyp_idmap(void);
 
+static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd)
+{
+	*pmd = new_pmd;
+	flush_pmd_entry(pmd);
+}
+
 static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
 {
 	*pte = new_pte;
@@ -103,9 +109,15 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
 	pte_val(*pte) |= L_PTE_S2_RDWR;
 }
 
+static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+{
+	pmd_val(*pmd) |= L_PMD_S2_RDWR;
+}
+
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
+					      unsigned long size)
 {
 	/*
 	 * If we are going to insert an instruction page and the icache is
@@ -120,8 +132,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
 	 * need any kind of flushing (DDI 0406C.b - Page B3-1392).
 	 */
 	if (icache_is_pipt()) {
-		unsigned long hva = gfn_to_hva(kvm, gfn);
-		__cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
+		__cpuc_coherent_user_range(hva, hva + size);
 	} else if (!icache_is_vivt_asid_tagged()) {
 		/* any kind of VIPT cache */
 		__flush_icache_all();
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 5689c18c85f5..a331d2527342 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -126,6 +126,8 @@
 #define L_PTE_S2_RDONLY		(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
 #define L_PTE_S2_RDWR		(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
 
+#define L_PMD_S2_RDWR		(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
+
 /*
  * Hyp-mode PL2 PTE definitions for LPAE.
  */
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index ebf5015508b5..466bd299b1a8 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -20,6 +20,7 @@ config KVM
	bool "Kernel-based Virtual Machine (KVM) support"
	select PREEMPT_NOTIFIERS
	select ANON_INODES
+	select HAVE_KVM_CPU_RELAX_INTERCEPT
	select KVM_MMIO
	select KVM_ARM_HOST
	depends on ARM_VIRT_EXT && ARM_LPAE
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index df4c82d47ad7..a92079011a83 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -73,23 +73,29 @@ static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
 }
 
 /**
- * kvm_handle_wfi - handle a wait-for-interrupts instruction executed by a guest
+ * kvm_handle_wfx - handle a WFI or WFE instruction trapped from a guest
  * @vcpu:	the vcpu pointer
  * @run:	the kvm_run structure pointer
  *
- * Simply sets the wait_for_interrupts flag on the vcpu structure, which will
- * halt execution of world-switches and schedule other host processes until
- * there is an incoming IRQ or FIQ to the VM.
+ * WFE: Yield the CPU and come back to this vcpu when the scheduler
+ * decides to.
+ * WFI: Simply call kvm_vcpu_block(), which will halt execution of
+ * world-switches and schedule other host processes until there is an
+ * incoming IRQ or FIQ to the VM.
  */
-static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run)
+static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
	trace_kvm_wfi(*vcpu_pc(vcpu));
-	kvm_vcpu_block(vcpu);
+	if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE)
+		kvm_vcpu_on_spin(vcpu);
+	else
+		kvm_vcpu_block(vcpu);
+
	return 1;
 }
 
 static exit_handle_fn arm_exit_handlers[] = {
-	[HSR_EC_WFI]		= kvm_handle_wfi,
+	[HSR_EC_WFI]		= kvm_handle_wfx,
	[HSR_EC_CP15_32]	= kvm_handle_cp15_32,
	[HSR_EC_CP15_64]	= kvm_handle_cp15_64,
	[HSR_EC_CP14_MR]	= kvm_handle_cp14_access,
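
For context on the WFE change: a vCPU waiting for a spinlock typically sits in a WFE loop, and with more vCPUs than physical CPUs the lock holder may not even be running. With HCR_TWE set, each guest WFE now traps to the handler above, which calls kvm_vcpu_on_spin() so the host can run another vCPU instead of burning the timeslice. The sketch below shows the kind of guest-side wait loop involved; it is illustrative only and is not the kernel's actual spinlock implementation.

/*
 * Illustrative guest-side sketch: while "lock" is held by another vCPU,
 * this vCPU executes WFE.  With HCR_TWE set, each WFE exits to KVM,
 * which yields via kvm_vcpu_on_spin() so a runnable vCPU (e.g. the lock
 * holder) can make progress.
 */
static inline void guest_spin_lock(volatile unsigned int *lock)
{
	while (__atomic_exchange_n(lock, 1, __ATOMIC_ACQUIRE) != 0) {
		do {
			asm volatile("wfe" ::: "memory");
		} while (*lock != 0);
	}
}

static inline void guest_spin_unlock(volatile unsigned int *lock)
{
	__atomic_store_n(lock, 0, __ATOMIC_RELEASE);
	asm volatile("sev" ::: "memory");	/* wake up WFE waiters */
}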
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index b0de86b56c13..371958370de4 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -19,6 +19,7 @@
 #include <linux/mman.h>
 #include <linux/kvm_host.h>
 #include <linux/io.h>
+#include <linux/hugetlb.h>
 #include <trace/events/kvm.h>
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
@@ -41,6 +42,8 @@ static unsigned long hyp_idmap_start;
 static unsigned long hyp_idmap_end;
 static phys_addr_t hyp_idmap_vector;
 
+#define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
	/*
@@ -93,19 +96,29 @@ static bool page_empty(void *ptr)
 
 static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 {
-	pmd_t *pmd_table = pmd_offset(pud, 0);
-	pud_clear(pud);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pmd_free(NULL, pmd_table);
+	if (pud_huge(*pud)) {
+		pud_clear(pud);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	} else {
+		pmd_t *pmd_table = pmd_offset(pud, 0);
+		pud_clear(pud);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+		pmd_free(NULL, pmd_table);
+	}
	put_page(virt_to_page(pud));
 }
 
 static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 {
-	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	pmd_clear(pmd);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pte_free_kernel(NULL, pte_table);
+	if (kvm_pmd_huge(*pmd)) {
+		pmd_clear(pmd);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	} else {
+		pte_t *pte_table = pte_offset_kernel(pmd, 0);
+		pmd_clear(pmd);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+		pte_free_kernel(NULL, pte_table);
+	}
	put_page(virt_to_page(pmd));
 }
 
@@ -136,18 +149,32 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
			continue;
		}
 
+		if (pud_huge(*pud)) {
+			/*
+			 * If we are dealing with a huge pud, just clear it and
+			 * move on.
+			 */
+			clear_pud_entry(kvm, pud, addr);
+			addr = pud_addr_end(addr, end);
+			continue;
+		}
+
		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			addr = pmd_addr_end(addr, end);
			continue;
		}
 
-		pte = pte_offset_kernel(pmd, addr);
-		clear_pte_entry(kvm, pte, addr);
-		next = addr + PAGE_SIZE;
+		if (!kvm_pmd_huge(*pmd)) {
+			pte = pte_offset_kernel(pmd, addr);
+			clear_pte_entry(kvm, pte, addr);
+			next = addr + PAGE_SIZE;
+		}
 
-		/* If we emptied the pte, walk back up the ladder */
-		if (page_empty(pte)) {
+		/*
+		 * If the pmd entry is to be cleared, walk back up the ladder
+		 */
+		if (kvm_pmd_huge(*pmd) || page_empty(pte)) {
			clear_pmd_entry(kvm, pmd, addr);
			next = pmd_addr_end(addr, end);
			if (page_empty(pmd) && !page_empty(pud)) {
@@ -420,29 +447,71 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
	kvm->arch.pgd = NULL;
 }
 
-
-static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			     phys_addr_t addr)
 {
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
-	pte_t *pte, old_pte;
 
-	/* Create 2nd stage page table mapping - Level 1 */
	pgd = kvm->arch.pgd + pgd_index(addr);
	pud = pud_offset(pgd, addr);
	if (pud_none(*pud)) {
		if (!cache)
-			return 0; /* ignore calls from kvm_set_spte_hva */
+			return NULL;
		pmd = mmu_memory_cache_alloc(cache);
		pud_populate(NULL, pud, pmd);
		get_page(virt_to_page(pud));
	}
 
-	pmd = pmd_offset(pud, addr);
+	return pmd_offset(pud, addr);
+}
+
+static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
+			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
+{
+	pmd_t *pmd, old_pmd;
+
+	pmd = stage2_get_pmd(kvm, cache, addr);
+	VM_BUG_ON(!pmd);
+
+	/*
+	 * Mapping in huge pages should only happen through a fault. If a
+	 * page is merged into a transparent huge page, the individual
+	 * subpages of that huge page should be unmapped through MMU
+	 * notifiers before we get here.
+	 *
+	 * Merging of CompoundPages is not supported; they should be split
+	 * first, unmapped, merged, and mapped back in on-demand.
+	 */
+	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
+
+	old_pmd = *pmd;
+	kvm_set_pmd(pmd, *new_pmd);
+	if (pmd_present(old_pmd))
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	else
+		get_page(virt_to_page(pmd));
+	return 0;
+}
+
+static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+{
+	pmd_t *pmd;
+	pte_t *pte, old_pte;
 
-	/* Create 2nd stage page table mapping - Level 2 */
+	/* Create stage-2 page table mapping - Level 1 */
+	pmd = stage2_get_pmd(kvm, cache, addr);
+	if (!pmd) {
+		/*
+		 * Ignore calls from kvm_set_spte_hva for unallocated
+		 * address ranges.
+		 */
+		return 0;
+	}
+
+	/* Create stage-2 page mappings - Level 2 */
	if (pmd_none(*pmd)) {
		if (!cache)
			return 0; /* ignore calls from kvm_set_spte_hva */
@@ -507,16 +576,60 @@ out:
	return ret;
 }
 
+static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
+{
+	pfn_t pfn = *pfnp;
+	gfn_t gfn = *ipap >> PAGE_SHIFT;
+
+	if (PageTransCompound(pfn_to_page(pfn))) {
+		unsigned long mask;
+		/*
+		 * The address we faulted on is backed by a transparent huge
+		 * page. However, because we map the compound huge page and
+		 * not the individual tail page, we need to transfer the
+		 * refcount to the head page. We have to be careful that the
+		 * THP doesn't start to split while we are adjusting the
+		 * refcounts.
+		 *
+		 * We are sure this doesn't happen, because mmu_notifier_retry
+		 * was successful and we are holding the mmu_lock, so if this
+		 * THP is trying to split, it will be blocked in the mmu
+		 * notifier before touching any of the pages, specifically
+		 * before being able to call __split_huge_page_refcount().
+		 *
+		 * We can therefore safely transfer the refcount from PG_tail
+		 * to PG_head and switch the pfn from a tail page to the head
+		 * page accordingly.
+		 */
+		mask = PTRS_PER_PMD - 1;
+		VM_BUG_ON((gfn & mask) != (pfn & mask));
+		if (pfn & mask) {
+			*ipap &= PMD_MASK;
+			kvm_release_pfn_clean(pfn);
+			pfn &= ~mask;
+			kvm_get_pfn(pfn);
+			*pfnp = pfn;
+		}
+
+		return true;
+	}
+
+	return false;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-			  gfn_t gfn, struct kvm_memory_slot *memslot,
+			  struct kvm_memory_slot *memslot,
			  unsigned long fault_status)
 {
-	pte_t new_pte;
-	pfn_t pfn;
	int ret;
-	bool write_fault, writable;
+	bool write_fault, writable, hugetlb = false, force_pte = false;
	unsigned long mmu_seq;
+	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+	unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
+	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+	struct vm_area_struct *vma;
+	pfn_t pfn;
 
	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
	if (fault_status == FSC_PERM && !write_fault) {
@@ -524,6 +637,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
		return -EFAULT;
	}
 
+	/* Let's check if we will get back a huge page backed by hugetlbfs */
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma_intersection(current->mm, hva, hva + 1);
+	if (is_vm_hugetlb_page(vma)) {
+		hugetlb = true;
+		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
+	} else {
+		/*
+		 * Pages belonging to VMAs not aligned to the PMD mapping
+		 * granularity cannot be mapped using block descriptors even
+		 * if the pages belong to a THP for the process, because the
+		 * stage-2 block descriptor will cover more than a single THP
+		 * and we lose atomicity for unmapping, updates, and splits
+		 * of the THP or other pages in the stage-2 block range.
+		 */
+		if (vma->vm_start & ~PMD_MASK)
+			force_pte = true;
+	}
+	up_read(&current->mm->mmap_sem);
+
	/* We need minimum second+third level pages */
	ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
	if (ret)
@@ -541,26 +674,40 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
	 */
	smp_rmb();
 
-	pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
+	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
	if (is_error_pfn(pfn))
		return -EFAULT;
 
-	new_pte = pfn_pte(pfn, PAGE_S2);
-	coherent_icache_guest_page(vcpu->kvm, gfn);
-
-	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+	spin_lock(&kvm->mmu_lock);
+	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;
-	if (writable) {
-		kvm_set_s2pte_writable(&new_pte);
-		kvm_set_pfn_dirty(pfn);
+	if (!hugetlb && !force_pte)
+		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
+
+	if (hugetlb) {
+		pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
+		new_pmd = pmd_mkhuge(new_pmd);
+		if (writable) {
+			kvm_set_s2pmd_writable(&new_pmd);
+			kvm_set_pfn_dirty(pfn);
+		}
+		coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE);
+		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
+	} else {
+		pte_t new_pte = pfn_pte(pfn, PAGE_S2);
+		if (writable) {
+			kvm_set_s2pte_writable(&new_pte);
+			kvm_set_pfn_dirty(pfn);
+		}
+		coherent_icache_guest_page(kvm, hva, PAGE_SIZE);
+		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
	}
-	stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false);
+
 
 out_unlock:
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	spin_unlock(&kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
-	return 0;
+	return ret;
 }
 
 /**
@@ -629,7 +776,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
 
-	ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status);
+	ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status);
	if (ret == 0)
		ret = 1;
 out_unlock:
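
To make the arithmetic in transparent_hugepage_adjust() concrete, here is a small self-contained sketch with made-up example values; the EX_* macros and the specific addresses are assumptions for illustration (it mirrors, but is not, the kernel code above), assuming 4K pages and 2MB PMDs.

/*
 * Sketch of the pfn/IPA adjustment: a fault on a tail page of a THP is
 * rounded down so one 2MB stage-2 block maps the whole huge page.
 */
#include <stdio.h>
#include <stdint.h>

#define EX_PAGE_SHIFT	12
#define EX_PTRS_PER_PMD	512ULL			/* 4K pages, 2MB PMDs */
#define EX_PMD_SIZE	(EX_PTRS_PER_PMD << EX_PAGE_SHIFT)
#define EX_PMD_MASK	(~(EX_PMD_SIZE - 1))

int main(void)
{
	uint64_t fault_ipa = 0x40123000;	/* guest physical fault address */
	uint64_t pfn = 0x89b23;			/* host pfn of the faulting tail page */
	uint64_t gfn = fault_ipa >> EX_PAGE_SHIFT;
	uint64_t mask = EX_PTRS_PER_PMD - 1;

	/* Both offsets into the 2MB region must agree (the VM_BUG_ON above). */
	if ((gfn & mask) != (pfn & mask))
		return 1;

	if (pfn & mask) {
		fault_ipa &= EX_PMD_MASK;	/* round IPA down to the 2MB block: 0x40000000 */
		pfn &= ~mask;			/* switch from tail page to head page: 0x89a00 */
	}

	/* One PMD block now maps IPA 0x40000000..0x401fffff to pfn 0x89a00..0x89bff. */
	printf("ipa=%#llx pfn=%#llx\n",
	       (unsigned long long)fault_ipa, (unsigned long long)pfn);
	return 0;
}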