author	Paul Mackerras <paulus@samba.org>	2012-11-21 18:28:08 -0500
committer	Alexander Graf <agraf@suse.de>	2012-12-05 19:34:05 -0500
commit	1b400ba0cd24a5994d792c7cfa0ee24cac266d3c (patch)
tree	b973122d899c1b6cc27d358fa3f2750777182879 /arch
parent	6a7f972dfe8de97c00f3196d0b87fb68bd8cf35e (diff)
KVM: PPC: Book3S HV: Improve handling of local vs. global TLB invalidations
When we change or remove a HPT (hashed page table) entry, we can do either a global TLB invalidation (tlbie) that works across the whole machine, or a local invalidation (tlbiel) that only affects this core. Currently we do local invalidations if the VM has only one vcpu or if the guest requests it with the H_LOCAL flag, though the guest Linux kernel currently doesn't ever use H_LOCAL. Then, to cope with the possibility that vcpus moving around to different physical cores might expose stale TLB entries, there is some code in kvmppc_hv_entry to flush the whole TLB of entries for this VM if either this vcpu is now running on a different physical core from where it last ran, or if this physical core last ran a different vcpu.

There are a number of problems on POWER7 with this as it stands:

- The TLB invalidation is done per thread, whereas it only needs to be done per core, since the TLB is shared between the threads.

- With the possibility of the host paging out guest pages, the use of H_LOCAL by an SMP guest is dangerous since the guest could possibly retain and use a stale TLB entry pointing to a page that had been removed from the guest.

- The TLB invalidations that we do when a vcpu moves from one physical core to another are unnecessary in the case of an SMP guest that isn't using H_LOCAL.

- The optimization of using local invalidations rather than global should apply to guests with one virtual core, not just one vcpu.

(None of this applies on PPC970, since there we always have to invalidate the whole TLB when entering and leaving the guest, and we can't support paging out guest memory.)

To fix these problems and simplify the code, we now maintain a simple cpumask of which cpus need to flush the TLB on entry to the guest. (This is indexed by cpu, though we only ever use the bits for thread 0 of each core.) Whenever we do a local TLB invalidation, we set the bits for every cpu except the bit for thread 0 of the core that we're currently running on. Whenever we enter a guest, we test and clear the bit for our core, and flush the TLB if it was set.

On initial startup of the VM, and when resetting the HPT, we set all the bits in the need_tlb_flush cpumask, since any core could potentially have stale TLB entries from the previous VM to use the same LPID, or the previous contents of the HPT.

Then, we maintain a count of the number of online virtual cores, and use that when deciding whether to use a local invalidation rather than the number of online vcpus. The code to make that decision is extracted out into a new function, global_invalidates(). For multi-core guests on POWER7 (i.e. when we are using mmu notifiers), we now never do local invalidations regardless of the H_LOCAL flag.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
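In simplified C, the protocol described above has two cooperating halves, sketched below. This is only an illustration with hypothetical helper names (mark_other_cores_stale(), flush_tlb_if_stale(), flush_partition_tlb()); the authoritative code is global_invalidates() in book3s_hv_rm_mmu.c and the real-mode assembly added to kvmppc_hv_entry, both shown in the diff below.

/* Illustrative sketch of the need_tlb_flush protocol; not the kernel code. */

/* After a local invalidation (tlbiel): mark every other core as stale. */
static void mark_other_cores_stale(struct kvm *kvm)
{
	/* pcpu of thread 0 of the core we are currently running on */
	int this_core = local_paca->kvm_hstate.kvm_vcore->pcpu;

	smp_wmb();
	cpumask_setall(&kvm->arch.need_tlb_flush);
	cpumask_clear_cpu(this_core, &kvm->arch.need_tlb_flush);
}

/* On guest entry, on thread 0 of each core: flush if our bit is set. */
static void flush_tlb_if_stale(struct kvm *kvm, int pcpu)
{
	if (cpumask_test_and_clear_cpu(pcpu, &kvm->arch.need_tlb_flush))
		flush_partition_tlb();	/* hypothetical: the 128-iteration tlbiel loop */
}

Because only the thread-0 bit of each core is ever tested on entry, setting all the bits and then clearing just our own core's bit is enough to force every other core to flush before it next runs this guest.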
Diffstat (limited to 'arch')
-rw-r--r--	arch/powerpc/include/asm/kvm_host.h	5
-rw-r--r--	arch/powerpc/kernel/asm-offsets.c	4
-rw-r--r--	arch/powerpc/kvm/book3s_64_mmu_hv.c	7
-rw-r--r--	arch/powerpc/kvm/book3s_hv.c	9
-rw-r--r--	arch/powerpc/kvm/book3s_hv_rm_mmu.c	37
-rw-r--r--	arch/powerpc/kvm/book3s_hv_rmhandlers.S	56
6 files changed, 73 insertions(+), 45 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 58c72646c445..62fbd38b15fa 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -246,11 +246,12 @@ struct kvm_arch {
 	int using_mmu_notifiers;
 	u32 hpt_order;
 	atomic_t vcpus_running;
+	u32 online_vcores;
 	unsigned long hpt_npte;
 	unsigned long hpt_mask;
 	atomic_t hpte_mod_interest;
 	spinlock_t slot_phys_lock;
-	unsigned short last_vcpu[NR_CPUS];
+	cpumask_t need_tlb_flush;
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 	struct kvmppc_linear_info *hpt_li;
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
@@ -275,6 +276,7 @@ struct kvmppc_vcore {
 	int nap_count;
 	int napping_threads;
 	u16 pcpu;
+	u16 last_cpu;
 	u8 vcore_state;
 	u8 in_guest;
 	struct list_head runnable_threads;
@@ -523,7 +525,6 @@ struct kvm_vcpu_arch {
 	u64 dec_jiffies;
 	u64 dec_expires;
 	unsigned long pending_exceptions;
-	u16 last_cpu;
 	u8 ceded;
 	u8 prodded;
 	u32 last_inst;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 7523539cfe9f..4e23ba2f3ca7 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -441,8 +441,7 @@ int main(void)
 	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
 	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
 	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
-	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
-	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+	DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
 	DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
 	DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
 	DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
@@ -470,7 +469,6 @@ int main(void)
 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
 	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
 	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
-	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
 	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index ac6b5acb99b9..8cc18abd6dde 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -148,11 +148,8 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
 	 * Reset all the reverse-mapping chains for all memslots
 	 */
 	kvmppc_rmap_reset(kvm);
-	/*
-	 * Set the whole last_vcpu array to an invalid vcpu number.
-	 * This ensures that each vcpu will flush its TLB on next entry.
-	 */
-	memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
+	/* Ensure that each vcpu will flush its TLB on next entry. */
+	cpumask_setall(&kvm->arch.need_tlb_flush);
 	*htab_orderp = order;
 	err = 0;
 } else {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a4f59dbcd800..ddbec60cb0d2 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -853,7 +853,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 		goto free_vcpu;
 
 	vcpu->arch.shared = &vcpu->arch.shregs;
-	vcpu->arch.last_cpu = -1;
 	vcpu->arch.mmcr[0] = MMCR0_FC;
 	vcpu->arch.ctrl = CTRL_RUNLATCH;
 	/* default to host PVR, since we can't spoof it */
@@ -880,6 +879,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 			vcore->preempt_tb = TB_NIL;
 		}
 		kvm->arch.vcores[core] = vcore;
+		kvm->arch.online_vcores++;
 	}
 	mutex_unlock(&kvm->lock);
 
@@ -1802,6 +1802,13 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 		return -ENOMEM;
 	kvm->arch.lpid = lpid;
 
+	/*
+	 * Since we don't flush the TLB when tearing down a VM,
+	 * and this lpid might have previously been used,
+	 * make sure we flush on each core before running the new VM.
+	 */
+	cpumask_setall(&kvm->arch.need_tlb_flush);
+
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
 
 	kvm->arch.rma = NULL;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 7a57ea49172d..19c93bae1aea 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -35,6 +35,37 @@ static void *real_vmalloc_addr(void *x)
 	return __va(addr);
 }
 
+/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
+static int global_invalidates(struct kvm *kvm, unsigned long flags)
+{
+	int global;
+
+	/*
+	 * If there is only one vcore, and it's currently running,
+	 * we can use tlbiel as long as we mark all other physical
+	 * cores as potentially having stale TLB entries for this lpid.
+	 * If we're not using MMU notifiers, we never take pages away
+	 * from the guest, so we can use tlbiel if requested.
+	 * Otherwise, don't use tlbiel.
+	 */
+	if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
+		global = 0;
+	else if (kvm->arch.using_mmu_notifiers)
+		global = 1;
+	else
+		global = !(flags & H_LOCAL);
+
+	if (!global) {
+		/* any other core might now have stale TLB entries... */
+		smp_wmb();
+		cpumask_setall(&kvm->arch.need_tlb_flush);
+		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
+				  &kvm->arch.need_tlb_flush);
+	}
+
+	return global;
+}
+
 /*
  * Add this HPTE into the chain for the real page.
  * Must be called with the chain locked; it unlocks the chain.
@@ -390,7 +421,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	if (v & HPTE_V_VALID) {
 		hpte[0] &= ~HPTE_V_VALID;
 		rb = compute_tlbie_rb(v, hpte[1], pte_index);
-		if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) {
+		if (global_invalidates(kvm, flags)) {
 			while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
 				cpu_relax();
 			asm volatile("ptesync" : : : "memory");
@@ -565,8 +596,6 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 		return H_NOT_FOUND;
 	}
 
-	if (atomic_read(&kvm->online_vcpus) == 1)
-		flags |= H_LOCAL;
 	v = hpte[0];
 	bits = (flags << 55) & HPTE_R_PP0;
 	bits |= (flags << 48) & HPTE_R_KEY_HI;
@@ -587,7 +616,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	if (v & HPTE_V_VALID) {
 		rb = compute_tlbie_rb(v, r, pte_index);
 		hpte[0] = v & ~HPTE_V_VALID;
-		if (!(flags & H_LOCAL)) {
+		if (global_invalidates(kvm, flags)) {
 			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
 				cpu_relax();
 			asm volatile("ptesync" : : : "memory");
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 690d1120402d..b48bd53dd771 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -313,7 +313,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
 	mtspr	SPRN_LPID,r7
 	isync
+
+	/* See if we need to flush the TLB */
+	lhz	r6,PACAPACAINDEX(r13)	/* test_bit(cpu, need_tlb_flush) */
+	clrldi	r7,r6,64-6		/* extract bit number (6 bits) */
+	srdi	r6,r6,6			/* doubleword number */
+	sldi	r6,r6,3			/* address offset */
+	add	r6,r6,r9
+	addi	r6,r6,KVM_NEED_FLUSH	/* dword in kvm->arch.need_tlb_flush */
 	li	r0,1
+	sld	r0,r0,r7
+	ld	r7,0(r6)
+	and.	r7,r7,r0
+	beq	22f
+23:	ldarx	r7,0,r6			/* if set, clear the bit */
+	andc	r7,r7,r0
+	stdcx.	r7,0,r6
+	bne	23b
+	li	r6,128			/* and flush the TLB */
+	mtctr	r6
+	li	r7,0x800		/* IS field = 0b10 */
+	ptesync
+28:	tlbiel	r7
+	addi	r7,r7,0x1000
+	bdnz	28b
+	ptesync
+
+22:	li	r0,1
 	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
 	b	10f
 
@@ -336,36 +362,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mr	r9,r4
 	blt	hdec_soon
 
-	/*
-	 * Invalidate the TLB if we could possibly have stale TLB
-	 * entries for this partition on this core due to the use
-	 * of tlbiel.
-	 * XXX maybe only need this on primary thread?
-	 */
-	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
-	lwz	r5,VCPU_VCPUID(r4)
-	lhz	r6,PACAPACAINDEX(r13)
-	rldimi	r6,r5,0,62		/* XXX map as if threads 1:1 p:v */
-	lhz	r8,VCPU_LAST_CPU(r4)
-	sldi	r7,r6,1			/* see if this is the same vcpu */
-	add	r7,r7,r9		/* as last ran on this pcpu */
-	lhz	r0,KVM_LAST_VCPU(r7)
-	cmpw	r6,r8			/* on the same cpu core as last time? */
-	bne	3f
-	cmpw	r0,r5			/* same vcpu as this core last ran? */
-	beq	1f
-3:	sth	r6,VCPU_LAST_CPU(r4)	/* if not, invalidate partition TLB */
-	sth	r5,KVM_LAST_VCPU(r7)
-	li	r6,128
-	mtctr	r6
-	li	r7,0x800		/* IS field = 0b10 */
-	ptesync
-2:	tlbiel	r7
-	addi	r7,r7,0x1000
-	bdnz	2b
-	ptesync
-1:
-
 	/* Save purr/spurr */
 	mfspr	r5,SPRN_PURR
 	mfspr	r6,SPRN_SPURR
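For readers less used to PowerPC assembly, the bit lookup added to kvmppc_hv_entry above is the usual doubleword-and-bit computation over kvm->arch.need_tlb_flush.bits (KVM_NEED_FLUSH is its offset, per the asm-offsets.c change). A rough C equivalent is sketched below; the helper name is hypothetical, and the real code runs in real mode with r9 already holding the struct kvm pointer.

/* Illustrative C equivalent of the bit test done in assembly above. */
static int this_core_needs_flush(struct kvm *kvm)
{
	unsigned int cpu = local_paca->paca_index;	/* lhz    r6,PACAPACAINDEX(r13) */
	unsigned int bit = cpu & 63;			/* clrldi r7,r6,64-6 */
	unsigned long dw = kvm->arch.need_tlb_flush.bits[cpu >> 6];
							/* srdi/sldi/add/addi + ld */
	return (dw >> bit) & 1;				/* li/sld/and. */
}

For example, with paca_index 68 (thread 0 of a core when there are four threads per core), the code tests bit 4 of the second doubleword of the bitmap; if it is set, the ldarx/stdcx. loop clears it and the 128 tlbiel iterations flush the partition's entries from the TLB.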