aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm
diff options
context:
space:
mode:
authorChristoffer Dall <christoffer.dall@linaro.org>2012-11-01 12:14:45 -0400
committerChristoffer Dall <christoffer.dall@linaro.org>2013-10-17 20:06:20 -0400
commitad361f093c1e31d0b43946210a32ab4ff5c49850 (patch)
tree2bfef51798c8e6a916c9a961dd293fd919b0258a /arch/arm
parent86ed81aa2e1ce05a4e7f0819f0dfc34e8d8fb910 (diff)
KVM: ARM: Support hugetlbfs backed huge pages
Support huge pages in KVM/ARM and KVM/ARM64. The pud_huge checking on
the unmap path may feel a bit silly as the pud_huge check is always
defined to false, but the compiler should be smart about this.

Note: This deals only with VMAs marked as huge which are allocated by
users through hugetlbfs only. Transparent huge pages can only be
detected by looking at the underlying pages (or the page tables
themselves) and this patch so far simply maps these on a page-by-page
level in the Stage-2 page tables.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Diffstat (limited to 'arch/arm')
-rw-r--r--arch/arm/include/asm/kvm_mmu.h17
-rw-r--r--arch/arm/include/asm/pgtable-3level.h2
-rw-r--r--arch/arm/kvm/mmu.c169
3 files changed, 147 insertions, 41 deletions
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 9b28c41f4ba9..77de4a41cc50 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -62,6 +62,12 @@ phys_addr_t kvm_get_idmap_vector(void);
62int kvm_mmu_init(void); 62int kvm_mmu_init(void);
63void kvm_clear_hyp_idmap(void); 63void kvm_clear_hyp_idmap(void);
64 64
65static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd)
66{
67 *pmd = new_pmd;
68 flush_pmd_entry(pmd);
69}
70
65static inline void kvm_set_pte(pte_t *pte, pte_t new_pte) 71static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
66{ 72{
67 *pte = new_pte; 73 *pte = new_pte;
@@ -103,9 +109,15 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
103 pte_val(*pte) |= L_PTE_S2_RDWR; 109 pte_val(*pte) |= L_PTE_S2_RDWR;
104} 110}
105 111
112static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
113{
114 pmd_val(*pmd) |= L_PMD_S2_RDWR;
115}
116
106struct kvm; 117struct kvm;
107 118
108static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) 119static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
120 unsigned long size)
109{ 121{
110 /* 122 /*
111 * If we are going to insert an instruction page and the icache is 123 * If we are going to insert an instruction page and the icache is
@@ -120,8 +132,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
120 * need any kind of flushing (DDI 0406C.b - Page B3-1392). 132 * need any kind of flushing (DDI 0406C.b - Page B3-1392).
121 */ 133 */
122 if (icache_is_pipt()) { 134 if (icache_is_pipt()) {
123 unsigned long hva = gfn_to_hva(kvm, gfn); 135 __cpuc_coherent_user_range(hva, hva + size);
124 __cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
125 } else if (!icache_is_vivt_asid_tagged()) { 136 } else if (!icache_is_vivt_asid_tagged()) {
126 /* any kind of VIPT cache */ 137 /* any kind of VIPT cache */
127 __flush_icache_all(); 138 __flush_icache_all();
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 5689c18c85f5..a331d2527342 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -126,6 +126,8 @@
126#define L_PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[1] */ 126#define L_PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[1] */
127#define L_PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ 127#define L_PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */
128 128
129#define L_PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */
130
129/* 131/*
130 * Hyp-mode PL2 PTE definitions for LPAE. 132 * Hyp-mode PL2 PTE definitions for LPAE.
131 */ 133 */
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index b0de86b56c13..745d8b1630cc 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -19,6 +19,7 @@
19#include <linux/mman.h> 19#include <linux/mman.h>
20#include <linux/kvm_host.h> 20#include <linux/kvm_host.h>
21#include <linux/io.h> 21#include <linux/io.h>
22#include <linux/hugetlb.h>
22#include <trace/events/kvm.h> 23#include <trace/events/kvm.h>
23#include <asm/pgalloc.h> 24#include <asm/pgalloc.h>
24#include <asm/cacheflush.h> 25#include <asm/cacheflush.h>
@@ -41,6 +42,8 @@ static unsigned long hyp_idmap_start;
41static unsigned long hyp_idmap_end; 42static unsigned long hyp_idmap_end;
42static phys_addr_t hyp_idmap_vector; 43static phys_addr_t hyp_idmap_vector;
43 44
45#define kvm_pmd_huge(_x) (pmd_huge(_x))
46
44static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) 47static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
45{ 48{
46 /* 49 /*
@@ -93,19 +96,29 @@ static bool page_empty(void *ptr)
93 96
94static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) 97static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
95{ 98{
96 pmd_t *pmd_table = pmd_offset(pud, 0); 99 if (pud_huge(*pud)) {
97 pud_clear(pud); 100 pud_clear(pud);
98 kvm_tlb_flush_vmid_ipa(kvm, addr); 101 kvm_tlb_flush_vmid_ipa(kvm, addr);
99 pmd_free(NULL, pmd_table); 102 } else {
103 pmd_t *pmd_table = pmd_offset(pud, 0);
104 pud_clear(pud);
105 kvm_tlb_flush_vmid_ipa(kvm, addr);
106 pmd_free(NULL, pmd_table);
107 }
100 put_page(virt_to_page(pud)); 108 put_page(virt_to_page(pud));
101} 109}
102 110
103static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) 111static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
104{ 112{
105 pte_t *pte_table = pte_offset_kernel(pmd, 0); 113 if (kvm_pmd_huge(*pmd)) {
106 pmd_clear(pmd); 114 pmd_clear(pmd);
107 kvm_tlb_flush_vmid_ipa(kvm, addr); 115 kvm_tlb_flush_vmid_ipa(kvm, addr);
108 pte_free_kernel(NULL, pte_table); 116 } else {
117 pte_t *pte_table = pte_offset_kernel(pmd, 0);
118 pmd_clear(pmd);
119 kvm_tlb_flush_vmid_ipa(kvm, addr);
120 pte_free_kernel(NULL, pte_table);
121 }
109 put_page(virt_to_page(pmd)); 122 put_page(virt_to_page(pmd));
110} 123}
111 124
@@ -136,18 +149,32 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
136 continue; 149 continue;
137 } 150 }
138 151
152 if (pud_huge(*pud)) {
153 /*
154 * If we are dealing with a huge pud, just clear it and
155 * move on.
156 */
157 clear_pud_entry(kvm, pud, addr);
158 addr = pud_addr_end(addr, end);
159 continue;
160 }
161
139 pmd = pmd_offset(pud, addr); 162 pmd = pmd_offset(pud, addr);
140 if (pmd_none(*pmd)) { 163 if (pmd_none(*pmd)) {
141 addr = pmd_addr_end(addr, end); 164 addr = pmd_addr_end(addr, end);
142 continue; 165 continue;
143 } 166 }
144 167
145 pte = pte_offset_kernel(pmd, addr); 168 if (!kvm_pmd_huge(*pmd)) {
146 clear_pte_entry(kvm, pte, addr); 169 pte = pte_offset_kernel(pmd, addr);
147 next = addr + PAGE_SIZE; 170 clear_pte_entry(kvm, pte, addr);
171 next = addr + PAGE_SIZE;
172 }
148 173
149 /* If we emptied the pte, walk back up the ladder */ 174 /*
150 if (page_empty(pte)) { 175 * If the pmd entry is to be cleared, walk back up the ladder
176 */
177 if (kvm_pmd_huge(*pmd) || page_empty(pte)) {
151 clear_pmd_entry(kvm, pmd, addr); 178 clear_pmd_entry(kvm, pmd, addr);
152 next = pmd_addr_end(addr, end); 179 next = pmd_addr_end(addr, end);
153 if (page_empty(pmd) && !page_empty(pud)) { 180 if (page_empty(pmd) && !page_empty(pud)) {
@@ -420,29 +447,71 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
420 kvm->arch.pgd = NULL; 447 kvm->arch.pgd = NULL;
421} 448}
422 449
423 450static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
424static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 451 phys_addr_t addr)
425 phys_addr_t addr, const pte_t *new_pte, bool iomap)
426{ 452{
427 pgd_t *pgd; 453 pgd_t *pgd;
428 pud_t *pud; 454 pud_t *pud;
429 pmd_t *pmd; 455 pmd_t *pmd;
430 pte_t *pte, old_pte;
431 456
432 /* Create 2nd stage page table mapping - Level 1 */
433 pgd = kvm->arch.pgd + pgd_index(addr); 457 pgd = kvm->arch.pgd + pgd_index(addr);
434 pud = pud_offset(pgd, addr); 458 pud = pud_offset(pgd, addr);
435 if (pud_none(*pud)) { 459 if (pud_none(*pud)) {
436 if (!cache) 460 if (!cache)
437 return 0; /* ignore calls from kvm_set_spte_hva */ 461 return NULL;
438 pmd = mmu_memory_cache_alloc(cache); 462 pmd = mmu_memory_cache_alloc(cache);
439 pud_populate(NULL, pud, pmd); 463 pud_populate(NULL, pud, pmd);
440 get_page(virt_to_page(pud)); 464 get_page(virt_to_page(pud));
441 } 465 }
442 466
443 pmd = pmd_offset(pud, addr); 467 return pmd_offset(pud, addr);
468}
469
470static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
471 *cache, phys_addr_t addr, const pmd_t *new_pmd)
472{
473 pmd_t *pmd, old_pmd;
474
475 pmd = stage2_get_pmd(kvm, cache, addr);
476 VM_BUG_ON(!pmd);
444 477
445 /* Create 2nd stage page table mapping - Level 2 */ 478 /*
479 * Mapping in huge pages should only happen through a fault. If a
480 * page is merged into a transparent huge page, the individual
481 * subpages of that huge page should be unmapped through MMU
482 * notifiers before we get here.
483 *
484 * Merging of CompoundPages is not supported; they should become
485 * splitting first, unmapped, merged, and mapped back in on-demand.
486 */
487 VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
488
489 old_pmd = *pmd;
490 kvm_set_pmd(pmd, *new_pmd);
491 if (pmd_present(old_pmd))
492 kvm_tlb_flush_vmid_ipa(kvm, addr);
493 else
494 get_page(virt_to_page(pmd));
495 return 0;
496}
497
498static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
499 phys_addr_t addr, const pte_t *new_pte, bool iomap)
500{
501 pmd_t *pmd;
502 pte_t *pte, old_pte;
503
504 /* Create stage-2 page table mapping - Level 1 */
505 pmd = stage2_get_pmd(kvm, cache, addr);
506 if (!pmd) {
507 /*
508 * Ignore calls from kvm_set_spte_hva for unallocated
509 * address ranges.
510 */
511 return 0;
512 }
513
514 /* Create stage-2 page mappings - Level 2 */
446 if (pmd_none(*pmd)) { 515 if (pmd_none(*pmd)) {
447 if (!cache) 516 if (!cache)
448 return 0; /* ignore calls from kvm_set_spte_hva */ 517 return 0; /* ignore calls from kvm_set_spte_hva */
@@ -508,15 +577,18 @@ out:
508} 577}
509 578
510static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 579static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
511 gfn_t gfn, struct kvm_memory_slot *memslot, 580 struct kvm_memory_slot *memslot,
512 unsigned long fault_status) 581 unsigned long fault_status)
513{ 582{
514 pte_t new_pte;
515 pfn_t pfn;
516 int ret; 583 int ret;
517 bool write_fault, writable; 584 bool write_fault, writable, hugetlb = false;
518 unsigned long mmu_seq; 585 unsigned long mmu_seq;
586 gfn_t gfn = fault_ipa >> PAGE_SHIFT;
587 unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
588 struct kvm *kvm = vcpu->kvm;
519 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; 589 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
590 struct vm_area_struct *vma;
591 pfn_t pfn;
520 592
521 write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); 593 write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
522 if (fault_status == FSC_PERM && !write_fault) { 594 if (fault_status == FSC_PERM && !write_fault) {
@@ -524,6 +596,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
524 return -EFAULT; 596 return -EFAULT;
525 } 597 }
526 598
599 /* Let's check if we will get back a huge page backed by hugetlbfs */
600 down_read(&current->mm->mmap_sem);
601 vma = find_vma_intersection(current->mm, hva, hva + 1);
602 if (is_vm_hugetlb_page(vma)) {
603 hugetlb = true;
604 gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
605 }
606 up_read(&current->mm->mmap_sem);
607
527 /* We need minimum second+third level pages */ 608 /* We need minimum second+third level pages */
528 ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS); 609 ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
529 if (ret) 610 if (ret)
@@ -541,26 +622,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
541 */ 622 */
542 smp_rmb(); 623 smp_rmb();
543 624
544 pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable); 625 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
545 if (is_error_pfn(pfn)) 626 if (is_error_pfn(pfn))
546 return -EFAULT; 627 return -EFAULT;
547 628
548 new_pte = pfn_pte(pfn, PAGE_S2); 629 spin_lock(&kvm->mmu_lock);
549 coherent_icache_guest_page(vcpu->kvm, gfn); 630 if (mmu_notifier_retry(kvm, mmu_seq))
550
551 spin_lock(&vcpu->kvm->mmu_lock);
552 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
553 goto out_unlock; 631 goto out_unlock;
554 if (writable) { 632
555 kvm_set_s2pte_writable(&new_pte); 633 if (hugetlb) {
556 kvm_set_pfn_dirty(pfn); 634 pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
635 new_pmd = pmd_mkhuge(new_pmd);
636 if (writable) {
637 kvm_set_s2pmd_writable(&new_pmd);
638 kvm_set_pfn_dirty(pfn);
639 }
640 coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE);
641 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
642 } else {
643 pte_t new_pte = pfn_pte(pfn, PAGE_S2);
644 if (writable) {
645 kvm_set_s2pte_writable(&new_pte);
646 kvm_set_pfn_dirty(pfn);
647 }
648 coherent_icache_guest_page(kvm, hva, PAGE_SIZE);
649 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
557 } 650 }
558 stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false); 651
559 652
560out_unlock: 653out_unlock:
561 spin_unlock(&vcpu->kvm->mmu_lock); 654 spin_unlock(&kvm->mmu_lock);
562 kvm_release_pfn_clean(pfn); 655 kvm_release_pfn_clean(pfn);
563 return 0; 656 return ret;
564} 657}
565 658
566/** 659/**
@@ -629,7 +722,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
629 722
630 memslot = gfn_to_memslot(vcpu->kvm, gfn); 723 memslot = gfn_to_memslot(vcpu->kvm, gfn);
631 724
632 ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status); 725 ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status);
633 if (ret == 0) 726 if (ret == 0)
634 ret = 1; 727 ret = 1;
635out_unlock: 728out_unlock: