author	Paul Mackerras <paulus@samba.org>	2012-11-21 18:28:08 -0500
committer	Alexander Graf <agraf@suse.de>	2012-12-05 19:34:05 -0500
commit	1b400ba0cd24a5994d792c7cfa0ee24cac266d3c (patch)
tree	b973122d899c1b6cc27d358fa3f2750777182879 /arch
parent	6a7f972dfe8de97c00f3196d0b87fb68bd8cf35e (diff)
KVM: PPC: Book3S HV: Improve handling of local vs. global TLB invalidations
When we change or remove a HPT (hashed page table) entry, we can do either a global TLB invalidation (tlbie) that works across the whole machine, or a local invalidation (tlbiel) that only affects this core. Currently we do local invalidations if the VM has only one vcpu or if the guest requests it with the H_LOCAL flag, though the guest Linux kernel currently doesn't ever use H_LOCAL. Then, to cope with the possibility that vcpus moving around to different physical cores might expose stale TLB entries, there is some code in kvmppc_hv_entry to flush the whole TLB of entries for this VM if either this vcpu is now running on a different physical core from where it last ran, or if this physical core last ran a different vcpu.

There are a number of problems on POWER7 with this as it stands:

- The TLB invalidation is done per thread, whereas it only needs to be done per core, since the TLB is shared between the threads.

- With the possibility of the host paging out guest pages, the use of H_LOCAL by an SMP guest is dangerous since the guest could possibly retain and use a stale TLB entry pointing to a page that had been removed from the guest.

- The TLB invalidations that we do when a vcpu moves from one physical core to another are unnecessary in the case of an SMP guest that isn't using H_LOCAL.

- The optimization of using local invalidations rather than global should apply to guests with one virtual core, not just one vcpu.

(None of this applies on PPC970, since there we always have to invalidate the whole TLB when entering and leaving the guest, and we can't support paging out guest memory.)

To fix these problems and simplify the code, we now maintain a simple cpumask of which cpus need to flush the TLB on entry to the guest. (This is indexed by cpu, though we only ever use the bits for thread 0 of each core.) Whenever we do a local TLB invalidation, we set the bits for every cpu except the bit for thread 0 of the core that we're currently running on. Whenever we enter a guest, we test and clear the bit for our core, and flush the TLB if it was set.

On initial startup of the VM, and when resetting the HPT, we set all the bits in the need_tlb_flush cpumask, since any core could potentially have stale TLB entries from the previous VM to use the same LPID, or the previous contents of the HPT.

Then, we maintain a count of the number of online virtual cores, and use that when deciding whether to use a local invalidation rather than the number of online vcpus. The code to make that decision is extracted out into a new function, global_invalidates(). For multi-core guests on POWER7 (i.e. when we are using mmu notifiers), we now never do local invalidations regardless of the H_LOCAL flag.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
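In simplified C, the protocol described above has two cooperating halves, sketched below. This is only an illustration with hypothetical helper names (mark_other_cores_stale(), flush_tlb_if_stale(), flush_partition_tlb()); the authoritative code is global_invalidates() in book3s_hv_rm_mmu.c and the real-mode assembly added to kvmppc_hv_entry, both shown in the diff below.

/* Illustrative sketch of the need_tlb_flush protocol; not the kernel code. */

/* After a local invalidation (tlbiel): mark every other core as stale. */
static void mark_other_cores_stale(struct kvm *kvm)
{
	/* pcpu of thread 0 of the core we are currently running on */
	int this_core = local_paca->kvm_hstate.kvm_vcore->pcpu;

	smp_wmb();
	cpumask_setall(&kvm->arch.need_tlb_flush);
	cpumask_clear_cpu(this_core, &kvm->arch.need_tlb_flush);
}

/* On guest entry, on thread 0 of each core: flush if our bit is set. */
static void flush_tlb_if_stale(struct kvm *kvm, int pcpu)
{
	if (cpumask_test_and_clear_cpu(pcpu, &kvm->arch.need_tlb_flush))
		flush_partition_tlb();	/* hypothetical: the 128-iteration tlbiel loop */
}

Because only the thread-0 bit of each core is ever tested on entry, setting all the bits and then clearing just our own core's bit is enough to force every other core to flush before it next runs this guest.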
Diffstat (limited to 'arch')
-rw-r--r--	arch/powerpc/include/asm/kvm_host.h	5
-rw-r--r--	arch/powerpc/kernel/asm-offsets.c	4
-rw-r--r--	arch/powerpc/kvm/book3s_64_mmu_hv.c	7
-rw-r--r--	arch/powerpc/kvm/book3s_hv.c	9
-rw-r--r--	arch/powerpc/kvm/book3s_hv_rm_mmu.c	37
-rw-r--r--	arch/powerpc/kvm/book3s_hv_rmhandlers.S	56
6 files changed, 73 insertions(+), 45 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 58c72646c445..62fbd38b15fa 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -246,11 +246,12 @@ struct kvm_arch {
 	int using_mmu_notifiers;
 	u32 hpt_order;
 	atomic_t vcpus_running;
+	u32 online_vcores;
 	unsigned long hpt_npte;
 	unsigned long hpt_mask;
 	atomic_t hpte_mod_interest;
 	spinlock_t slot_phys_lock;
-	unsigned short last_vcpu[NR_CPUS];
+	cpumask_t need_tlb_flush;
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 	struct kvmppc_linear_info *hpt_li;
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
@@ -275,6 +276,7 @@ struct kvmppc_vcore {
 	int nap_count;
 	int napping_threads;
 	u16 pcpu;
+	u16 last_cpu;
 	u8 vcore_state;
 	u8 in_guest;
 	struct list_head runnable_threads;
@@ -523,7 +525,6 @@ struct kvm_vcpu_arch {
 	u64 dec_jiffies;
 	u64 dec_expires;
 	unsigned long pending_exceptions;
-	u16 last_cpu;
 	u8 ceded;
 	u8 prodded;
 	u32 last_inst;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 7523539cfe9f..4e23ba2f3ca7 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -441,8 +441,7 @@ int main(void)
 	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
 	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
 	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
-	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
-	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+	DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
 	DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
 	DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
 	DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
@@ -470,7 +469,6 @@ int main(void)
 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
 	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
 	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
-	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
 	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index ac6b5acb99b9..8cc18abd6dde 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -148,11 +148,8 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
 	 * Reset all the reverse-mapping chains for all memslots
 	 */
 	kvmppc_rmap_reset(kvm);
-	/*
-	 * Set the whole last_vcpu array to an invalid vcpu number.
-	 * This ensures that each vcpu will flush its TLB on next entry.
-	 */
-	memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
+	/* Ensure that each vcpu will flush its TLB on next entry. */
+	cpumask_setall(&kvm->arch.need_tlb_flush);
 	*htab_orderp = order;
 	err = 0;
 } else {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a4f59dbcd800..ddbec60cb0d2 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -853,7 +853,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 		goto free_vcpu;
 
 	vcpu->arch.shared = &vcpu->arch.shregs;
-	vcpu->arch.last_cpu = -1;
 	vcpu->arch.mmcr[0] = MMCR0_FC;
 	vcpu->arch.ctrl = CTRL_RUNLATCH;
 	/* default to host PVR, since we can't spoof it */
@@ -880,6 +879,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 			vcore->preempt_tb = TB_NIL;
 		}
 		kvm->arch.vcores[core] = vcore;
+		kvm->arch.online_vcores++;
 	}
 	mutex_unlock(&kvm->lock);
 
@@ -1802,6 +1802,13 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 		return -ENOMEM;
 	kvm->arch.lpid = lpid;
 
+	/*
+	 * Since we don't flush the TLB when tearing down a VM,
+	 * and this lpid might have previously been used,
+	 * make sure we flush on each core before running the new VM.
+	 */
+	cpumask_setall(&kvm->arch.need_tlb_flush);
+
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
 
 	kvm->arch.rma = NULL;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 7a57ea49172d..19c93bae1aea 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -35,6 +35,37 @@ static void *real_vmalloc_addr(void *x)
 	return __va(addr);
 }
 
+/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
+static int global_invalidates(struct kvm *kvm, unsigned long flags)
+{
+	int global;
+
+	/*
+	 * If there is only one vcore, and it's currently running,
+	 * we can use tlbiel as long as we mark all other physical
+	 * cores as potentially having stale TLB entries for this lpid.
+	 * If we're not using MMU notifiers, we never take pages away
+	 * from the guest, so we can use tlbiel if requested.
+	 * Otherwise, don't use tlbiel.
+	 */
+	if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
+		global = 0;
+	else if (kvm->arch.using_mmu_notifiers)
+		global = 1;
+	else
+		global = !(flags & H_LOCAL);
+
+	if (!global) {
+		/* any other core might now have stale TLB entries... */
+		smp_wmb();
+		cpumask_setall(&kvm->arch.need_tlb_flush);
+		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
+				  &kvm->arch.need_tlb_flush);
+	}
+
+	return global;
+}
+
 /*
  * Add this HPTE into the chain for the real page.
  * Must be called with the chain locked; it unlocks the chain.
@@ -390,7 +421,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	if (v & HPTE_V_VALID) {
 		hpte[0] &= ~HPTE_V_VALID;
 		rb = compute_tlbie_rb(v, hpte[1], pte_index);
-		if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) {
+		if (global_invalidates(kvm, flags)) {
 			while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
 				cpu_relax();
 			asm volatile("ptesync" : : : "memory");
@@ -565,8 +596,6 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 		return H_NOT_FOUND;
 	}
 
-	if (atomic_read(&kvm->online_vcpus) == 1)
-		flags |= H_LOCAL;
 	v = hpte[0];
 	bits = (flags << 55) & HPTE_R_PP0;
 	bits |= (flags << 48) & HPTE_R_KEY_HI;
@@ -587,7 +616,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	if (v & HPTE_V_VALID) {
 		rb = compute_tlbie_rb(v, r, pte_index);
 		hpte[0] = v & ~HPTE_V_VALID;
-		if (!(flags & H_LOCAL)) {
+		if (global_invalidates(kvm, flags)) {
 			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
 				cpu_relax();
 			asm volatile("ptesync" : : : "memory");
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 690d1120402d..b48bd53dd771 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -313,7 +313,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
 	mtspr	SPRN_LPID,r7
 	isync
+
+	/* See if we need to flush the TLB */
+	lhz	r6,PACAPACAINDEX(r13)	/* test_bit(cpu, need_tlb_flush) */
+	clrldi	r7,r6,64-6		/* extract bit number (6 bits) */
+	srdi	r6,r6,6			/* doubleword number */
+	sldi	r6,r6,3			/* address offset */
+	add	r6,r6,r9
+	addi	r6,r6,KVM_NEED_FLUSH	/* dword in kvm->arch.need_tlb_flush */
 	li	r0,1
+	sld	r0,r0,r7
+	ld	r7,0(r6)
+	and.	r7,r7,r0
+	beq	22f
+23:	ldarx	r7,0,r6			/* if set, clear the bit */
+	andc	r7,r7,r0
+	stdcx.	r7,0,r6
+	bne	23b
+	li	r6,128			/* and flush the TLB */
+	mtctr	r6
+	li	r7,0x800		/* IS field = 0b10 */
+	ptesync
+28:	tlbiel	r7
+	addi	r7,r7,0x1000
+	bdnz	28b
+	ptesync
+
+22:	li	r0,1
 	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
 	b	10f
 
@@ -336,36 +362,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mr	r9,r4
 	blt	hdec_soon
 
-	/*
-	 * Invalidate the TLB if we could possibly have stale TLB
-	 * entries for this partition on this core due to the use
-	 * of tlbiel.
-	 * XXX maybe only need this on primary thread?
-	 */
-	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
-	lwz	r5,VCPU_VCPUID(r4)
-	lhz	r6,PACAPACAINDEX(r13)
-	rldimi	r6,r5,0,62		/* XXX map as if threads 1:1 p:v */
-	lhz	r8,VCPU_LAST_CPU(r4)
-	sldi	r7,r6,1			/* see if this is the same vcpu */
-	add	r7,r7,r9		/* as last ran on this pcpu */
-	lhz	r0,KVM_LAST_VCPU(r7)
-	cmpw	r6,r8			/* on the same cpu core as last time? */
-	bne	3f
-	cmpw	r0,r5			/* same vcpu as this core last ran? */
-	beq	1f
-3:	sth	r6,VCPU_LAST_CPU(r4)	/* if not, invalidate partition TLB */
-	sth	r5,KVM_LAST_VCPU(r7)
-	li	r6,128
-	mtctr	r6
-	li	r7,0x800		/* IS field = 0b10 */
-	ptesync
-2:	tlbiel	r7
-	addi	r7,r7,0x1000
-	bdnz	2b
-	ptesync
-1:
-
 	/* Save purr/spurr */
 	mfspr	r5,SPRN_PURR
 	mfspr	r6,SPRN_SPURR
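For readers less used to PowerPC assembly, the bit lookup added to kvmppc_hv_entry above is the usual doubleword-and-bit computation over kvm->arch.need_tlb_flush.bits (KVM_NEED_FLUSH is its offset, per the asm-offsets.c change). A rough C equivalent is sketched below; the helper name is hypothetical, and the real code runs in real mode with r9 already holding the struct kvm pointer.

/* Illustrative C equivalent of the bit test done in assembly above. */
static int this_core_needs_flush(struct kvm *kvm)
{
	unsigned int cpu = local_paca->paca_index;	/* lhz    r6,PACAPACAINDEX(r13) */
	unsigned int bit = cpu & 63;			/* clrldi r7,r6,64-6 */
	unsigned long dw = kvm->arch.need_tlb_flush.bits[cpu >> 6];
							/* srdi/sldi/add/addi + ld */
	return (dw >> bit) & 1;				/* li/sld/and. */
}

For example, with paca_index 68 (thread 0 of a core when there are four threads per core), the code tests bit 4 of the second doubleword of the bitmap; if it is set, the ldarx/stdcx. loop clears it and the 128 tlbiel iterations flush the partition's entries from the TLB.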