author		Christoffer Dall <christoffer.dall@linaro.org>	2013-10-02 18:32:01 -0400
committer	Christoffer Dall <christoffer.dall@linaro.org>	2013-10-17 20:06:30 -0400
commit		9b5fdb9781f74fb15827e465bfb5aa63211953c8 (patch)
tree		0b2cec09a004720b520d39ff5db34ebb79e860ee
parent		ad361f093c1e31d0b43946210a32ab4ff5c49850 (diff)
KVM: ARM: Transparent huge page (THP) support
Support transparent huge pages in KVM/ARM and KVM/ARM64. The
transparent_hugepage_adjust is not very pretty, but this is also how
it's solved on x86 and seems to be simply an artifact of how THPs
behave. This should eventually be shared across architectures if
possible, but that can always be changed down the road.

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
-rw-r--r--	arch/arm/kvm/mmu.c	58
1 file changed, 56 insertions, 2 deletions
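For readers who want to see the arithmetic in isolation, the following is a minimal userspace sketch (not part of the patch) of the rounding that transparent_hugepage_adjust() in the diff below performs when a fault lands on a tail page of a THP. It assumes 4K pages and 2MB PMD-level blocks (PTRS_PER_PMD == 512), as on ARM with LPAE; the example addresses and the simplified PMD_MASK definition are illustrative, not taken from the kernel.

/*
 * Hypothetical standalone sketch of the THP head-page rounding.
 * Assumes 4K pages and 2MB PMD-level blocks (PTRS_PER_PMD == 512).
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PTRS_PER_PMD	512ULL
#define PMD_MASK	(~((PTRS_PER_PMD << PAGE_SHIFT) - 1))	/* clears the low 21 bits */

int main(void)
{
	uint64_t fault_ipa = 0x40123000ULL;	/* guest IPA that faulted (example value) */
	uint64_t pfn       = 0x89123ULL;	/* host pfn backing it, a THP tail page (example value) */
	uint64_t gfn       = fault_ipa >> PAGE_SHIFT;
	uint64_t mask      = PTRS_PER_PMD - 1;

	/*
	 * The guest and host offsets within the 2MB region must agree;
	 * this is what the VM_BUG_ON() in the patch asserts.
	 */
	if ((gfn & mask) != (pfn & mask))
		return 1;

	if (pfn & mask) {
		fault_ipa &= PMD_MASK;	/* round the IPA down to the 2MB block base */
		pfn &= ~mask;		/* switch from the tail page to the head page */
	}

	/* Prints: ipa=0x40000000 pfn=0x89000 */
	printf("ipa=0x%llx pfn=0x%llx\n",
	       (unsigned long long)fault_ipa, (unsigned long long)pfn);
	return 0;
}

The real function additionally moves the page reference from the tail page to the head page with kvm_release_pfn_clean()/kvm_get_pfn(); only the address rounding is shown here.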
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 745d8b1630cc..371958370de4 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -42,7 +42,7 @@ static unsigned long hyp_idmap_start;
 static unsigned long hyp_idmap_end;
 static phys_addr_t hyp_idmap_vector;
 
-#define kvm_pmd_huge(_x)	(pmd_huge(_x))
+#define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
 
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
@@ -576,12 +576,53 @@ out:
 	return ret;
 }
 
+static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
+{
+	pfn_t pfn = *pfnp;
+	gfn_t gfn = *ipap >> PAGE_SHIFT;
+
+	if (PageTransCompound(pfn_to_page(pfn))) {
+		unsigned long mask;
+		/*
+		 * The address we faulted on is backed by a transparent huge
+		 * page. However, because we map the compound huge page and
+		 * not the individual tail page, we need to transfer the
+		 * refcount to the head page. We have to be careful that the
+		 * THP doesn't start to split while we are adjusting the
+		 * refcounts.
+		 *
+		 * We are sure this doesn't happen, because mmu_notifier_retry
+		 * was successful and we are holding the mmu_lock, so if this
+		 * THP is trying to split, it will be blocked in the mmu
+		 * notifier before touching any of the pages, specifically
+		 * before being able to call __split_huge_page_refcount().
+		 *
+		 * We can therefore safely transfer the refcount from PG_tail
+		 * to PG_head and switch the pfn from a tail page to the head
+		 * page accordingly.
+		 */
+		mask = PTRS_PER_PMD - 1;
+		VM_BUG_ON((gfn & mask) != (pfn & mask));
+		if (pfn & mask) {
+			*ipap &= PMD_MASK;
+			kvm_release_pfn_clean(pfn);
+			pfn &= ~mask;
+			kvm_get_pfn(pfn);
+			*pfnp = pfn;
+		}
+
+		return true;
+	}
+
+	return false;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot,
 			  unsigned long fault_status)
 {
 	int ret;
-	bool write_fault, writable, hugetlb = false;
+	bool write_fault, writable, hugetlb = false, force_pte = false;
 	unsigned long mmu_seq;
 	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
 	unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
@@ -602,6 +643,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_vm_hugetlb_page(vma)) {
 		hugetlb = true;
 		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
+	} else {
+		/*
+		 * Pages belonging to VMAs not aligned to the PMD mapping
+		 * granularity cannot be mapped using block descriptors even
+		 * if the pages belong to a THP for the process, because the
+		 * stage-2 block descriptor will cover more than a single THP
+		 * and we lose atomicity for unmapping, updates, and splits
+		 * of the THP or other pages in the stage-2 block range.
+		 */
+		if (vma->vm_start & ~PMD_MASK)
+			force_pte = true;
 	}
 	up_read(&current->mm->mmap_sem);
 
@@ -629,6 +681,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	spin_lock(&kvm->mmu_lock);
 	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
+	if (!hugetlb && !force_pte)
+		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
 
 	if (hugetlb) {
 		pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
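As a rough, hypothetical illustration of the force_pte check added in user_mem_abort() (again assuming 2MB PMD-level blocks; the helper name and example addresses are invented for this sketch, not taken from the patch), a memslot backed by a VMA whose start is not 2MB-aligned cannot be mapped with stage-2 block descriptors:

/*
 * Hypothetical illustration (not kernel code) of the force_pte test
 * added above. Assumes 2MB PMD-level blocks; must_force_pte() and the
 * example addresses are invented for this sketch.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE	(2ULL * 1024 * 1024)	/* 2MB block size */
#define PMD_MASK	(~(PMD_SIZE - 1))

static bool must_force_pte(uint64_t vm_start)
{
	/* Mirrors "if (vma->vm_start & ~PMD_MASK) force_pte = true;" */
	return (vm_start & ~PMD_MASK) != 0;
}

int main(void)
{
	/* 2MB-aligned VMA start: block (PMD) mappings are allowed. */
	printf("0x40000000 -> force_pte=%d\n", must_force_pte(0x40000000ULL));
	/* Unaligned VMA start: fall back to page (PTE) mappings only. */
	printf("0x40013000 -> force_pte=%d\n", must_force_pte(0x40013000ULL));
	return 0;
}

In the patch itself this is just the test vma->vm_start & ~PMD_MASK; when it is non-zero, the fault handler sticks to page-granularity stage-2 mappings so that a single block descriptor never covers more than one host THP, as explained in the comment above.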