author     Christoffer Dall <christoffer.dall@linaro.org>    2013-10-17 23:41:44 -0400
committer  Christoffer Dall <christoffer.dall@linaro.org>    2013-10-17 23:41:44 -0400
commit     e4b3c9c21bd5674e96988f7507fd924e00087cd0 (patch)
tree       0b2cec09a004720b520d39ff5db34ebb79e860ee
parent     2f8d01a1475cfede058c6a92b5d3dad576da2827 (diff)
parent     9b5fdb9781f74fb15827e465bfb5aa63211953c8 (diff)

Merge branch 'kvm-arm-next-3.13-2' into kvm-arm-next
-rw-r--r--   arch/arm/include/asm/kvm_mmu.h           |  17
-rw-r--r--   arch/arm/include/asm/pgtable-3level.h    |   2
-rw-r--r--   arch/arm/kvm/mmu.c                       | 223
-rw-r--r--   arch/arm64/include/asm/kvm_mmu.h         |  12
-rw-r--r--   arch/arm64/include/asm/pgtable-hwdef.h   |   2
5 files changed, 212 insertions(+), 44 deletions(-)
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 9b28c41f4ba9..77de4a41cc50 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -62,6 +62,12 @@ phys_addr_t kvm_get_idmap_vector(void);
 int kvm_mmu_init(void);
 void kvm_clear_hyp_idmap(void);
 
+static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd)
+{
+	*pmd = new_pmd;
+	flush_pmd_entry(pmd);
+}
+
 static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
 {
 	*pte = new_pte;
@@ -103,9 +109,15 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
 	pte_val(*pte) |= L_PTE_S2_RDWR;
 }
 
+static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+{
+	pmd_val(*pmd) |= L_PMD_S2_RDWR;
+}
+
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
+					      unsigned long size)
 {
 	/*
 	 * If we are going to insert an instruction page and the icache is
@@ -120,8 +132,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
 	 * need any kind of flushing (DDI 0406C.b - Page B3-1392).
 	 */
 	if (icache_is_pipt()) {
-		unsigned long hva = gfn_to_hva(kvm, gfn);
-		__cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
+		__cpuc_coherent_user_range(hva, hva + size);
 	} else if (!icache_is_vivt_asid_tagged()) {
 		/* any kind of VIPT cache */
 		__flush_icache_all();
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 5689c18c85f5..a331d2527342 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -126,6 +126,8 @@
 #define L_PTE_S2_RDONLY		(_AT(pteval_t, 1) << 6)   /* HAP[1]   */
 #define L_PTE_S2_RDWR		(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
 
+#define L_PMD_S2_RDWR		(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
+
 /*
  * Hyp-mode PL2 PTE definitions for LPAE.
  */
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index b0de86b56c13..371958370de4 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -19,6 +19,7 @@
 #include <linux/mman.h>
 #include <linux/kvm_host.h>
 #include <linux/io.h>
+#include <linux/hugetlb.h>
 #include <trace/events/kvm.h>
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
@@ -41,6 +42,8 @@ static unsigned long hyp_idmap_start;
 static unsigned long hyp_idmap_end;
 static phys_addr_t hyp_idmap_vector;
 
+#define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
 	/*
@@ -93,19 +96,29 @@ static bool page_empty(void *ptr)
 
 static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 {
-	pmd_t *pmd_table = pmd_offset(pud, 0);
-	pud_clear(pud);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pmd_free(NULL, pmd_table);
+	if (pud_huge(*pud)) {
+		pud_clear(pud);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	} else {
+		pmd_t *pmd_table = pmd_offset(pud, 0);
+		pud_clear(pud);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+		pmd_free(NULL, pmd_table);
+	}
 	put_page(virt_to_page(pud));
 }
 
 static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 {
-	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	pmd_clear(pmd);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pte_free_kernel(NULL, pte_table);
+	if (kvm_pmd_huge(*pmd)) {
+		pmd_clear(pmd);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	} else {
+		pte_t *pte_table = pte_offset_kernel(pmd, 0);
+		pmd_clear(pmd);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+		pte_free_kernel(NULL, pte_table);
+	}
 	put_page(virt_to_page(pmd));
 }
 
@@ -136,18 +149,32 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
 			continue;
 		}
 
+		if (pud_huge(*pud)) {
+			/*
+			 * If we are dealing with a huge pud, just clear it and
+			 * move on.
+			 */
+			clear_pud_entry(kvm, pud, addr);
+			addr = pud_addr_end(addr, end);
+			continue;
+		}
+
 		pmd = pmd_offset(pud, addr);
 		if (pmd_none(*pmd)) {
 			addr = pmd_addr_end(addr, end);
 			continue;
 		}
 
-		pte = pte_offset_kernel(pmd, addr);
-		clear_pte_entry(kvm, pte, addr);
-		next = addr + PAGE_SIZE;
+		if (!kvm_pmd_huge(*pmd)) {
+			pte = pte_offset_kernel(pmd, addr);
+			clear_pte_entry(kvm, pte, addr);
+			next = addr + PAGE_SIZE;
+		}
 
-		/* If we emptied the pte, walk back up the ladder */
-		if (page_empty(pte)) {
+		/*
+		 * If the pmd entry is to be cleared, walk back up the ladder
+		 */
+		if (kvm_pmd_huge(*pmd) || page_empty(pte)) {
 			clear_pmd_entry(kvm, pmd, addr);
 			next = pmd_addr_end(addr, end);
 			if (page_empty(pmd) && !page_empty(pud)) {
@@ -420,29 +447,71 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
 	kvm->arch.pgd = NULL;
 }
 
-
-static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			     phys_addr_t addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte, old_pte;
 
-	/* Create 2nd stage page table mapping - Level 1 */
 	pgd = kvm->arch.pgd + pgd_index(addr);
 	pud = pud_offset(pgd, addr);
 	if (pud_none(*pud)) {
 		if (!cache)
-			return 0; /* ignore calls from kvm_set_spte_hva */
+			return NULL;
 		pmd = mmu_memory_cache_alloc(cache);
 		pud_populate(NULL, pud, pmd);
 		get_page(virt_to_page(pud));
 	}
 
-	pmd = pmd_offset(pud, addr);
+	return pmd_offset(pud, addr);
+}
+
+static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
+			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
+{
+	pmd_t *pmd, old_pmd;
+
+	pmd = stage2_get_pmd(kvm, cache, addr);
+	VM_BUG_ON(!pmd);
+
+	/*
+	 * Mapping in huge pages should only happen through a fault.  If a
+	 * page is merged into a transparent huge page, the individual
+	 * subpages of that huge page should be unmapped through MMU
+	 * notifiers before we get here.
+	 *
+	 * Merging of CompoundPages is not supported; they should be split
+	 * first, unmapped, merged, and mapped back in on demand.
+	 */
+	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
+
+	old_pmd = *pmd;
+	kvm_set_pmd(pmd, *new_pmd);
+	if (pmd_present(old_pmd))
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	else
+		get_page(virt_to_page(pmd));
+	return 0;
+}
+
+static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+{
+	pmd_t *pmd;
+	pte_t *pte, old_pte;
 
-	/* Create 2nd stage page table mapping - Level 2 */
+	/* Create stage-2 page table mapping - Level 1 */
+	pmd = stage2_get_pmd(kvm, cache, addr);
+	if (!pmd) {
+		/*
+		 * Ignore calls from kvm_set_spte_hva for unallocated
+		 * address ranges.
+		 */
+		return 0;
+	}
+
+	/* Create stage-2 page mappings - Level 2 */
 	if (pmd_none(*pmd)) {
 		if (!cache)
 			return 0; /* ignore calls from kvm_set_spte_hva */
@@ -507,16 +576,60 @@ out:
 	return ret;
 }
 
+static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
+{
+	pfn_t pfn = *pfnp;
+	gfn_t gfn = *ipap >> PAGE_SHIFT;
+
+	if (PageTransCompound(pfn_to_page(pfn))) {
+		unsigned long mask;
+		/*
+		 * The address we faulted on is backed by a transparent huge
+		 * page.  However, because we map the compound huge page and
+		 * not the individual tail page, we need to transfer the
+		 * refcount to the head page.  We have to be careful that the
+		 * THP doesn't start to split while we are adjusting the
+		 * refcounts.
+		 *
+		 * We are sure this doesn't happen, because mmu_notifier_retry
+		 * was successful and we are holding the mmu_lock, so if this
+		 * THP is trying to split, it will be blocked in the mmu
+		 * notifier before touching any of the pages, specifically
+		 * before being able to call __split_huge_page_refcount().
+		 *
+		 * We can therefore safely transfer the refcount from PG_tail
+		 * to PG_head and switch the pfn from a tail page to the head
+		 * page accordingly.
+		 */
+		mask = PTRS_PER_PMD - 1;
+		VM_BUG_ON((gfn & mask) != (pfn & mask));
+		if (pfn & mask) {
+			*ipap &= PMD_MASK;
+			kvm_release_pfn_clean(pfn);
+			pfn &= ~mask;
+			kvm_get_pfn(pfn);
+			*pfnp = pfn;
+		}
+
+		return true;
+	}
+
+	return false;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-			  gfn_t gfn, struct kvm_memory_slot *memslot,
+			  struct kvm_memory_slot *memslot,
 			  unsigned long fault_status)
 {
-	pte_t new_pte;
-	pfn_t pfn;
 	int ret;
-	bool write_fault, writable;
+	bool write_fault, writable, hugetlb = false, force_pte = false;
 	unsigned long mmu_seq;
+	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+	unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
+	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+	struct vm_area_struct *vma;
+	pfn_t pfn;
 
 	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
 	if (fault_status == FSC_PERM && !write_fault) {
@@ -524,6 +637,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
+	/* Let's check if we will get back a huge page backed by hugetlbfs */
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma_intersection(current->mm, hva, hva + 1);
+	if (is_vm_hugetlb_page(vma)) {
+		hugetlb = true;
+		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
+	} else {
+		/*
+		 * Pages belonging to VMAs not aligned to the PMD mapping
+		 * granularity cannot be mapped using block descriptors even
+		 * if the pages belong to a THP for the process, because the
+		 * stage-2 block descriptor will cover more than a single THP
+		 * and we lose atomicity for unmapping, updates, and splits
+		 * of the THP or other pages in the stage-2 block range.
+		 */
+		if (vma->vm_start & ~PMD_MASK)
+			force_pte = true;
+	}
+	up_read(&current->mm->mmap_sem);
+
 	/* We need minimum second+third level pages */
 	ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
 	if (ret)
@@ -541,26 +674,40 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	 */
 	smp_rmb();
 
-	pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
+	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
 	if (is_error_pfn(pfn))
 		return -EFAULT;
 
-	new_pte = pfn_pte(pfn, PAGE_S2);
-	coherent_icache_guest_page(vcpu->kvm, gfn);
-
-	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+	spin_lock(&kvm->mmu_lock);
+	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
-	if (writable) {
-		kvm_set_s2pte_writable(&new_pte);
-		kvm_set_pfn_dirty(pfn);
+	if (!hugetlb && !force_pte)
+		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
+
+	if (hugetlb) {
+		pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
+		new_pmd = pmd_mkhuge(new_pmd);
+		if (writable) {
+			kvm_set_s2pmd_writable(&new_pmd);
+			kvm_set_pfn_dirty(pfn);
+		}
+		coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE);
+		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
+	} else {
+		pte_t new_pte = pfn_pte(pfn, PAGE_S2);
+		if (writable) {
+			kvm_set_s2pte_writable(&new_pte);
+			kvm_set_pfn_dirty(pfn);
+		}
+		coherent_icache_guest_page(kvm, hva, PAGE_SIZE);
+		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
 	}
-	stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false);
+
 
 out_unlock:
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	spin_unlock(&kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
-	return 0;
+	return ret;
 }
 
 /**
@@ -629,7 +776,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
 
-	ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status);
+	ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status);
 	if (ret == 0)
 		ret = 1;
 out_unlock:
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index efe609c6a3c9..680f74e67497 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -91,6 +91,7 @@ int kvm_mmu_init(void);
 void kvm_clear_hyp_idmap(void);
 
 #define	kvm_set_pte(ptep, pte)		set_pte(ptep, pte)
+#define	kvm_set_pmd(pmdp, pmd)		set_pmd(pmdp, pmd)
 
 static inline bool kvm_is_write_fault(unsigned long esr)
 {
@@ -116,13 +117,18 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
 	pte_val(*pte) |= PTE_S2_RDWR;
 }
 
+static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+{
+	pmd_val(*pmd) |= PMD_S2_RDWR;
+}
+
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
+					      unsigned long size)
 {
 	if (!icache_is_aliasing()) {		/* PIPT */
-		unsigned long hva = gfn_to_hva(kvm, gfn);
-		flush_icache_range(hva, hva + PAGE_SIZE);
+		flush_icache_range(hva, hva + size);
 	} else if (!icache_is_aivivt()) {	/* non ASID-tagged VIVT */
 		/* any kind of VIPT cache */
 		__flush_icache_all();
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index d57e66845c86..755f86143320 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -85,6 +85,8 @@
 #define PTE_S2_RDONLY		(_AT(pteval_t, 1) << 6)   /* HAP[2:1] */
 #define PTE_S2_RDWR		(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
 
+#define PMD_S2_RDWR		(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
+
 /*
  * Memory Attribute override for Stage-2 (MemAttr[3:0])
  */