aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNicholas Piggin <npiggin@gmail.com>2018-09-11 06:48:34 -0400
committerPaul Mackerras <paulus@ozlabs.org>2018-09-11 18:50:50 -0400
commit71d29f43b6332badc5598c656616a62575e83342 (patch)
tree49a72048094fc32aaddd5bbdd6332a3a79b9b7e1
parent425333bf3a7743715c17e503049d0837d6c4a603 (diff)
KVM: PPC: Book3S HV: Don't use compound_order to determine host mapping size
THP paths can defer splitting compound pages until after the actual remap and TLB flushes to split a huge PMD/PUD. This causes radix partition scope page table mappings to get out of synch with the host qemu page table mappings. This results in random memory corruption in the guest when running with THP. The easiest way to reproduce is use KVM balloon to free up a lot of memory in the guest and then shrink the balloon to give the memory back, while some work is being done in the guest. Cc: David Gibson <david@gibson.dropbear.id.au> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> Cc: kvm-ppc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_radix.c91
1 file changed, 37 insertions, 54 deletions
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index fd6e8c13685f..933c574e1cf7 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -525,8 +525,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
525 unsigned long ea, unsigned long dsisr) 525 unsigned long ea, unsigned long dsisr)
526{ 526{
527 struct kvm *kvm = vcpu->kvm; 527 struct kvm *kvm = vcpu->kvm;
528 unsigned long mmu_seq, pte_size; 528 unsigned long mmu_seq;
529 unsigned long gpa, gfn, hva, pfn; 529 unsigned long gpa, gfn, hva;
530 struct kvm_memory_slot *memslot; 530 struct kvm_memory_slot *memslot;
531 struct page *page = NULL; 531 struct page *page = NULL;
532 long ret; 532 long ret;
@@ -623,9 +623,10 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
623 */ 623 */
624 hva = gfn_to_hva_memslot(memslot, gfn); 624 hva = gfn_to_hva_memslot(memslot, gfn);
625 if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) { 625 if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
626 pfn = page_to_pfn(page);
627 upgrade_write = true; 626 upgrade_write = true;
628 } else { 627 } else {
628 unsigned long pfn;
629
629 /* Call KVM generic code to do the slow-path check */ 630 /* Call KVM generic code to do the slow-path check */
630 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, 631 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
631 writing, upgrade_p); 632 writing, upgrade_p);
@@ -639,63 +640,45 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
639 } 640 }
640 } 641 }
641 642
642 /* See if we can insert a 1GB or 2MB large PTE here */
643 level = 0;
644 if (page && PageCompound(page)) {
645 pte_size = PAGE_SIZE << compound_order(compound_head(page));
646 if (pte_size >= PUD_SIZE &&
647 (gpa & (PUD_SIZE - PAGE_SIZE)) ==
648 (hva & (PUD_SIZE - PAGE_SIZE))) {
649 level = 2;
650 pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
651 } else if (pte_size >= PMD_SIZE &&
652 (gpa & (PMD_SIZE - PAGE_SIZE)) ==
653 (hva & (PMD_SIZE - PAGE_SIZE))) {
654 level = 1;
655 pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
656 }
657 }
658
659 /* 643 /*
660 * Compute the PTE value that we need to insert. 644 * Read the PTE from the process' radix tree and use that
645 * so we get the shift and attribute bits.
661 */ 646 */
662 if (page) { 647 local_irq_disable();
663 pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE | 648 ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
664 _PAGE_ACCESSED; 649 pte = *ptep;
665 if (writing || upgrade_write) 650 local_irq_enable();
666 pgflags |= _PAGE_WRITE | _PAGE_DIRTY; 651
667 pte = pfn_pte(pfn, __pgprot(pgflags)); 652 /* Get pte level from shift/size */
653 if (shift == PUD_SHIFT &&
654 (gpa & (PUD_SIZE - PAGE_SIZE)) ==
655 (hva & (PUD_SIZE - PAGE_SIZE))) {
656 level = 2;
657 } else if (shift == PMD_SHIFT &&
658 (gpa & (PMD_SIZE - PAGE_SIZE)) ==
659 (hva & (PMD_SIZE - PAGE_SIZE))) {
660 level = 1;
668 } else { 661 } else {
669 /* 662 level = 0;
670 * Read the PTE from the process' radix tree and use that 663 if (shift > PAGE_SHIFT) {
671 * so we get the attribute bits. 664 /*
672 */ 665 * If the pte maps more than one page, bring over
673 local_irq_disable(); 666 * bits from the virtual address to get the real
674 ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); 667 * address of the specific single page we want.
675 pte = *ptep; 668 */
676 local_irq_enable(); 669 unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
677 if (shift == PUD_SHIFT && 670 pte = __pte(pte_val(pte) | (hva & rpnmask));
678 (gpa & (PUD_SIZE - PAGE_SIZE)) ==
679 (hva & (PUD_SIZE - PAGE_SIZE))) {
680 level = 2;
681 } else if (shift == PMD_SHIFT &&
682 (gpa & (PMD_SIZE - PAGE_SIZE)) ==
683 (hva & (PMD_SIZE - PAGE_SIZE))) {
684 level = 1;
685 } else if (shift && shift != PAGE_SHIFT) {
686 /* Adjust PFN */
687 unsigned long mask = (1ul << shift) - PAGE_SIZE;
688 pte = __pte(pte_val(pte) | (hva & mask));
689 }
690 pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
691 if (writing || upgrade_write) {
692 if (pte_val(pte) & _PAGE_WRITE)
693 pte = __pte(pte_val(pte) | _PAGE_DIRTY);
694 } else {
695 pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
696 } 671 }
697 } 672 }
698 673
674 pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
675 if (writing || upgrade_write) {
676 if (pte_val(pte) & _PAGE_WRITE)
677 pte = __pte(pte_val(pte) | _PAGE_DIRTY);
678 } else {
679 pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
680 }
681
699 /* Allocate space in the tree and write the PTE */ 682 /* Allocate space in the tree and write the PTE */
700 ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); 683 ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
701 684