author     Paolo Bonzini <pbonzini@redhat.com>    2018-10-10 12:38:32 -0400
committer  Paolo Bonzini <pbonzini@redhat.com>    2018-10-10 12:38:32 -0400
commit     7dd2157cb61a38bee83e3bc4f9bc3311f7053b4b (patch)
tree       d9f4edd5ed3899d8270b329990292ba5b52079d4
parent     dd5bd0a65ff6f22a32b35ca3fa1bcf7a6bc7104f (diff)
parent     901f8c3f6feb0225c14b3bc6237850fb921d2f2d (diff)
Merge tag 'kvm-ppc-next-4.20-1' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc into HEAD
PPC KVM update for 4.20.

The major new feature here is nested HV KVM support. This allows the HV KVM
module to load inside a radix guest on POWER9 and run radix guests underneath
it. These nested guests can run in supervisor mode and don't require any
additional instructions to be emulated, unlike with PR KVM, and so performance
is much better than with PR KVM, and is very close to the performance of a
non-nested guest.

A nested hypervisor (a guest with nested guests) can be migrated to another
host and will bring all its nested guests along with it. A nested guest can
also itself run guests, and so on down to any desired depth of nesting.

Apart from that there are a series of updates for IOMMU handling from Alexey
Kardashevskiy, a "one VM per core" mode for HV KVM for security-paranoid
applications, and a small fix for PR KVM.
-rw-r--r--  Documentation/virtual/kvm/api.txt | 19
-rw-r--r--  arch/powerpc/include/asm/asm-prototypes.h | 21
-rw-r--r--  arch/powerpc/include/asm/book3s/64/mmu-hash.h | 12
-rw-r--r--  arch/powerpc/include/asm/book3s/64/tlbflush-radix.h | 1
-rw-r--r--  arch/powerpc/include/asm/hvcall.h | 41
-rw-r--r--  arch/powerpc/include/asm/kvm_asm.h | 4
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 45
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h | 118
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_asm.h | 3
-rw-r--r--  arch/powerpc/include/asm/kvm_booke.h | 4
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 16
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 8
-rw-r--r--  arch/powerpc/include/asm/ppc-opcode.h | 1
-rw-r--r--  arch/powerpc/include/asm/reg.h | 2
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm.h | 1
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 5
-rw-r--r--  arch/powerpc/kernel/cpu_setup_power.S | 4
-rw-r--r--  arch/powerpc/kvm/Makefile | 3
-rw-r--r--  arch/powerpc/kvm/book3s.c | 46
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 7
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_radix.c | 718
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio.c | 89
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio_hv.c | 81
-rw-r--r--  arch/powerpc/kvm/book3s_emulate.c | 13
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 864
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c | 92
-rw-r--r--  arch/powerpc/kvm/book3s_hv_interrupts.S | 95
-rw-r--r--  arch/powerpc/kvm/book3s_hv_nested.c | 1291
-rw-r--r--  arch/powerpc/kvm/book3s_hv_ras.c | 10
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_xics.c | 13
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 809
-rw-r--r--  arch/powerpc/kvm/book3s_hv_tm.c | 6
-rw-r--r--  arch/powerpc/kvm/book3s_hv_tm_builtin.c | 5
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 5
-rw-r--r--  arch/powerpc/kvm/book3s_xics.c | 14
-rw-r--r--  arch/powerpc/kvm/book3s_xive.c | 63
-rw-r--r--  arch/powerpc/kvm/book3s_xive_template.c | 8
-rw-r--r--  arch/powerpc/kvm/bookehv_interrupts.S | 8
-rw-r--r--  arch/powerpc/kvm/emulate_loadstore.c | 1
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 15
-rw-r--r--  arch/powerpc/kvm/tm.S | 250
-rw-r--r--  arch/powerpc/kvm/trace_book3s.h | 1
-rw-r--r--  arch/powerpc/mm/tlb-radix.c | 9
-rw-r--r--  include/uapi/linux/kvm.h | 2
-rw-r--r--  tools/perf/arch/powerpc/util/book3s_hv_exits.h | 1
45 files changed, 3926 insertions, 898 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 647f94128a85..df98b6304769 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1922,6 +1922,7 @@ registers, find a list below:
1922 PPC | KVM_REG_PPC_TIDR | 64 1922 PPC | KVM_REG_PPC_TIDR | 64
1923 PPC | KVM_REG_PPC_PSSCR | 64 1923 PPC | KVM_REG_PPC_PSSCR | 64
1924 PPC | KVM_REG_PPC_DEC_EXPIRY | 64 1924 PPC | KVM_REG_PPC_DEC_EXPIRY | 64
1925 PPC | KVM_REG_PPC_PTCR | 64
1925 PPC | KVM_REG_PPC_TM_GPR0 | 64 1926 PPC | KVM_REG_PPC_TM_GPR0 | 64
1926 ... 1927 ...
1927 PPC | KVM_REG_PPC_TM_GPR31 | 64 1928 PPC | KVM_REG_PPC_TM_GPR31 | 64
@@ -2269,6 +2270,10 @@ The supported flags are:
2269 The emulated MMU supports 1T segments in addition to the 2270 The emulated MMU supports 1T segments in addition to the
2270 standard 256M ones. 2271 standard 256M ones.
2271 2272
2273 - KVM_PPC_NO_HASH
2274 This flag indicates that HPT guests are not supported by KVM,
2275 thus all guests must use radix MMU mode.
2276
2272The "slb_size" field indicates how many SLB entries are supported 2277The "slb_size" field indicates how many SLB entries are supported
2273 2278
2274The "sps" array contains 8 entries indicating the supported base 2279The "sps" array contains 8 entries indicating the supported base
@@ -4531,6 +4536,20 @@ With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise,
4531a #GP would be raised when the guest tries to access. Currently, this 4536a #GP would be raised when the guest tries to access. Currently, this
4532capability does not enable write permissions of this MSR for the guest. 4537capability does not enable write permissions of this MSR for the guest.
4533 4538
45397.16 KVM_CAP_PPC_NESTED_HV
4540
4541Architectures: ppc
4542Parameters: none
4543Returns: 0 on success, -EINVAL when the implementation doesn't support
4544 nested-HV virtualization.
4545
4546HV-KVM on POWER9 and later systems allows for "nested-HV"
4547virtualization, which provides a way for a guest VM to run guests that
4548can run using the CPU's supervisor mode (privileged non-hypervisor
4549state). Enabling this capability on a VM depends on the CPU having
4550the necessary functionality and on the facility being enabled with a
4551kvm-hv module parameter.
4552
45348. Other capabilities. 45538. Other capabilities.
4535---------------------- 4554----------------------
4536 4555
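For reference, a minimal userspace sketch of consuming the new capability (not taken from the patch; the helper name and error handling are illustrative only) would probe with KVM_CHECK_EXTENSION and then enable it on the VM file descriptor with KVM_ENABLE_CAP:

/*
 * Illustrative userspace sketch: probe for KVM_CAP_PPC_NESTED_HV and
 * enable it on an already-created KVM VM fd.  Error handling elided.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int enable_nested_hv(int vm_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_NESTED_HV };

	/* KVM_CHECK_EXTENSION returns 0 when kvm-hv cannot offer nesting */
	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_NESTED_HV) <= 0)
		return -1;

	/* fails (errno == EINVAL) if the CPU or the kvm-hv module parameter forbid it */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}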
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 1f4691ce4126..c55ba3b4873b 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -150,4 +150,25 @@ extern s32 patch__memset_nocache, patch__memcpy_nocache;
150 150
151extern long flush_count_cache; 151extern long flush_count_cache;
152 152
153#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
154void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
155void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
156#else
157static inline void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
158 bool preserve_nv) { }
159static inline void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
160 bool preserve_nv) { }
161#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
162
163void kvmhv_save_host_pmu(void);
164void kvmhv_load_host_pmu(void);
165void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
166void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
167
168int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);
169
170long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
171long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr,
172 unsigned long dabrx);
173
153#endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */ 174#endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index b3520b549cba..66db23e2f4dc 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -203,6 +203,18 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
203 BUG(); 203 BUG();
204} 204}
205 205
206static inline unsigned int ap_to_shift(unsigned long ap)
207{
208 int psize;
209
210 for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
211 if (mmu_psize_defs[psize].ap == ap)
212 return mmu_psize_defs[psize].shift;
213 }
214
215 return -1;
216}
217
206static inline unsigned long get_sllp_encoding(int psize) 218static inline unsigned long get_sllp_encoding(int psize)
207{ 219{
208 unsigned long sllp; 220 unsigned long sllp;
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 1154a6dc6d26..671316f9e95d 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -53,6 +53,7 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid,
53 unsigned long addr, 53 unsigned long addr,
54 unsigned long page_size); 54 unsigned long page_size);
55extern void radix__flush_pwc_lpid(unsigned int lpid); 55extern void radix__flush_pwc_lpid(unsigned int lpid);
56extern void radix__flush_tlb_lpid(unsigned int lpid);
56extern void radix__local_flush_tlb_lpid(unsigned int lpid); 57extern void radix__local_flush_tlb_lpid(unsigned int lpid);
57extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid); 58extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
58 59
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index a0b17f9f1ea4..45e8789bb770 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -322,6 +322,11 @@
322#define H_GET_24X7_DATA 0xF07C 322#define H_GET_24X7_DATA 0xF07C
323#define H_GET_PERF_COUNTER_INFO 0xF080 323#define H_GET_PERF_COUNTER_INFO 0xF080
324 324
325/* Platform-specific hcalls used for nested HV KVM */
326#define H_SET_PARTITION_TABLE 0xF800
327#define H_ENTER_NESTED 0xF804
328#define H_TLB_INVALIDATE 0xF808
329
325/* Values for 2nd argument to H_SET_MODE */ 330/* Values for 2nd argument to H_SET_MODE */
326#define H_SET_MODE_RESOURCE_SET_CIABR 1 331#define H_SET_MODE_RESOURCE_SET_CIABR 1
327#define H_SET_MODE_RESOURCE_SET_DAWR 2 332#define H_SET_MODE_RESOURCE_SET_DAWR 2
@@ -461,6 +466,42 @@ struct h_cpu_char_result {
461 u64 behaviour; 466 u64 behaviour;
462}; 467};
463 468
469/* Register state for entering a nested guest with H_ENTER_NESTED */
470struct hv_guest_state {
471 u64 version; /* version of this structure layout */
472 u32 lpid;
473 u32 vcpu_token;
474 /* These registers are hypervisor privileged (at least for writing) */
475 u64 lpcr;
476 u64 pcr;
477 u64 amor;
478 u64 dpdes;
479 u64 hfscr;
480 s64 tb_offset;
481 u64 dawr0;
482 u64 dawrx0;
483 u64 ciabr;
484 u64 hdec_expiry;
485 u64 purr;
486 u64 spurr;
487 u64 ic;
488 u64 vtb;
489 u64 hdar;
490 u64 hdsisr;
491 u64 heir;
492 u64 asdr;
493 /* These are OS privileged but need to be set late in guest entry */
494 u64 srr0;
495 u64 srr1;
496 u64 sprg[4];
497 u64 pidr;
498 u64 cfar;
499 u64 ppr;
500};
501
502/* Latest version of hv_guest_state structure */
503#define HV_GUEST_STATE_VERSION 1
504
464#endif /* __ASSEMBLY__ */ 505#endif /* __ASSEMBLY__ */
465#endif /* __KERNEL__ */ 506#endif /* __KERNEL__ */
466#endif /* _ASM_POWERPC_HVCALL_H */ 507#endif /* _ASM_POWERPC_HVCALL_H */
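As a rough illustration of how hv_guest_state is meant to be used (an assumption drawn from the description above, not something defined by this header), an L1 hypervisor fills in the structure for the L2 vCPU it wants to run and hands it to the L0 hypervisor via H_ENTER_NESTED; the exact argument-passing convention is defined by the kvm-hv implementation in book3s_hv_nested.c:

/*
 * Illustrative sketch only: populate hv_guest_state for one L2 vCPU.
 * Field values are placeholders.
 */
#include <linux/types.h>
#include <linux/string.h>

static void l1_prepare_guest_state(struct hv_guest_state *hvs, u32 l2_lpid,
				   u32 vcpu_id, u64 lpcr, u64 hdec_expiry)
{
	memset(hvs, 0, sizeof(*hvs));
	hvs->version = HV_GUEST_STATE_VERSION;	/* L0 rejects layouts it does not know */
	hvs->lpid = l2_lpid;			/* LPID that L1 assigned to the L2 guest */
	hvs->vcpu_token = vcpu_id;		/* opaque value handed back to L1 on exit */
	hvs->lpcr = lpcr;			/* HV-privileged state requested for the L2 */
	hvs->hdec_expiry = hdec_expiry;		/* timebase value at which L0 must return to L1 */
	/* srr0/srr1, sprg[0..3], pidr, cfar and ppr would be copied from the
	 * L2 vCPU state just before the hcall; omitted here for brevity. */
}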
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index a790d5cf6ea3..1f321914676d 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -84,7 +84,6 @@
84#define BOOK3S_INTERRUPT_INST_STORAGE 0x400 84#define BOOK3S_INTERRUPT_INST_STORAGE 0x400
85#define BOOK3S_INTERRUPT_INST_SEGMENT 0x480 85#define BOOK3S_INTERRUPT_INST_SEGMENT 0x480
86#define BOOK3S_INTERRUPT_EXTERNAL 0x500 86#define BOOK3S_INTERRUPT_EXTERNAL 0x500
87#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL 0x501
88#define BOOK3S_INTERRUPT_EXTERNAL_HV 0x502 87#define BOOK3S_INTERRUPT_EXTERNAL_HV 0x502
89#define BOOK3S_INTERRUPT_ALIGNMENT 0x600 88#define BOOK3S_INTERRUPT_ALIGNMENT 0x600
90#define BOOK3S_INTERRUPT_PROGRAM 0x700 89#define BOOK3S_INTERRUPT_PROGRAM 0x700
@@ -134,8 +133,7 @@
134#define BOOK3S_IRQPRIO_EXTERNAL 14 133#define BOOK3S_IRQPRIO_EXTERNAL 14
135#define BOOK3S_IRQPRIO_DECREMENTER 15 134#define BOOK3S_IRQPRIO_DECREMENTER 15
136#define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 16 135#define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 16
137#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL 17 136#define BOOK3S_IRQPRIO_MAX 17
138#define BOOK3S_IRQPRIO_MAX 18
139 137
140#define BOOK3S_HFLAG_DCBZ32 0x1 138#define BOOK3S_HFLAG_DCBZ32 0x1
141#define BOOK3S_HFLAG_SLB 0x2 139#define BOOK3S_HFLAG_SLB 0x2
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 83a9aa3cf689..09f8e9ba69bc 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -188,14 +188,37 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
188extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, 188extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
189 struct kvm_vcpu *vcpu, 189 struct kvm_vcpu *vcpu,
190 unsigned long ea, unsigned long dsisr); 190 unsigned long ea, unsigned long dsisr);
191extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
192 struct kvmppc_pte *gpte, u64 root,
193 u64 *pte_ret_p);
194extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
195 struct kvmppc_pte *gpte, u64 table,
196 int table_index, u64 *pte_ret_p);
191extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, 197extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
192 struct kvmppc_pte *gpte, bool data, bool iswrite); 198 struct kvmppc_pte *gpte, bool data, bool iswrite);
199extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
200 unsigned int shift, struct kvm_memory_slot *memslot,
201 unsigned int lpid);
202extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
203 bool writing, unsigned long gpa,
204 unsigned int lpid);
205extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
206 unsigned long gpa,
207 struct kvm_memory_slot *memslot,
208 bool writing, bool kvm_ro,
209 pte_t *inserted_pte, unsigned int *levelp);
193extern int kvmppc_init_vm_radix(struct kvm *kvm); 210extern int kvmppc_init_vm_radix(struct kvm *kvm);
194extern void kvmppc_free_radix(struct kvm *kvm); 211extern void kvmppc_free_radix(struct kvm *kvm);
212extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
213 unsigned int lpid);
195extern int kvmppc_radix_init(void); 214extern int kvmppc_radix_init(void);
196extern void kvmppc_radix_exit(void); 215extern void kvmppc_radix_exit(void);
197extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, 216extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
198 unsigned long gfn); 217 unsigned long gfn);
218extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
219 unsigned long gpa, unsigned int shift,
220 struct kvm_memory_slot *memslot,
221 unsigned int lpid);
199extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, 222extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
200 unsigned long gfn); 223 unsigned long gfn);
201extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, 224extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
@@ -271,6 +294,21 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {}
271static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {} 294static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
272#endif 295#endif
273 296
297long kvmhv_nested_init(void);
298void kvmhv_nested_exit(void);
299void kvmhv_vm_nested_init(struct kvm *kvm);
300long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
301void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
302void kvmhv_release_all_nested(struct kvm *kvm);
303long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
304long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
305int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu,
306 u64 time_limit, unsigned long lpcr);
307void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
308void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
309 struct hv_guest_state *hr);
310long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
311
274void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); 312void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
275 313
276extern int kvm_irq_bypass; 314extern int kvm_irq_bypass;
@@ -301,12 +339,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
301 339
302static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) 340static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
303{ 341{
304 vcpu->arch.cr = val; 342 vcpu->arch.regs.ccr = val;
305} 343}
306 344
307static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) 345static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
308{ 346{
309 return vcpu->arch.cr; 347 return vcpu->arch.regs.ccr;
310} 348}
311 349
312static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val) 350static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
@@ -384,9 +422,6 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
384/* TO = 31 for unconditional trap */ 422/* TO = 31 for unconditional trap */
385#define INS_TW 0x7fe00008 423#define INS_TW 0x7fe00008
386 424
387/* LPIDs we support with this build -- runtime limit may be lower */
388#define KVMPPC_NR_LPIDS (LPID_RSVD + 1)
389
390#define SPLIT_HACK_MASK 0xff000000 425#define SPLIT_HACK_MASK 0xff000000
391#define SPLIT_HACK_OFFS 0xfb000000 426#define SPLIT_HACK_OFFS 0xfb000000
392 427
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index dc435a5af7d6..6d298145d564 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -23,6 +23,108 @@
23#include <linux/string.h> 23#include <linux/string.h>
24#include <asm/bitops.h> 24#include <asm/bitops.h>
25#include <asm/book3s/64/mmu-hash.h> 25#include <asm/book3s/64/mmu-hash.h>
26#include <asm/cpu_has_feature.h>
27#include <asm/ppc-opcode.h>
28
29#ifdef CONFIG_PPC_PSERIES
30static inline bool kvmhv_on_pseries(void)
31{
32 return !cpu_has_feature(CPU_FTR_HVMODE);
33}
34#else
35static inline bool kvmhv_on_pseries(void)
36{
37 return false;
38}
39#endif
40
41/*
42 * Structure for a nested guest, that is, for a guest that is managed by
43 * one of our guests.
44 */
45struct kvm_nested_guest {
46 struct kvm *l1_host; /* L1 VM that owns this nested guest */
47 int l1_lpid; /* lpid L1 guest thinks this guest is */
48 int shadow_lpid; /* real lpid of this nested guest */
49 pgd_t *shadow_pgtable; /* our page table for this guest */
50 u64 l1_gr_to_hr; /* L1's addr of part'n-scoped table */
51 u64 process_table; /* process table entry for this guest */
52 long refcnt; /* number of pointers to this struct */
53 struct mutex tlb_lock; /* serialize page faults and tlbies */
54 struct kvm_nested_guest *next;
55 cpumask_t need_tlb_flush;
56 cpumask_t cpu_in_guest;
57 short prev_cpu[NR_CPUS];
58};
59
60/*
61 * We define a nested rmap entry as a single 64-bit quantity
62 * 0xFFF0000000000000 12-bit lpid field
63 * 0x000FFFFFFFFFF000 40-bit guest 4k page frame number
64 * 0x0000000000000001 1-bit single entry flag
65 */
66#define RMAP_NESTED_LPID_MASK 0xFFF0000000000000UL
67#define RMAP_NESTED_LPID_SHIFT (52)
68#define RMAP_NESTED_GPA_MASK 0x000FFFFFFFFFF000UL
69#define RMAP_NESTED_IS_SINGLE_ENTRY 0x0000000000000001UL
70
71/* Structure for a nested guest rmap entry */
72struct rmap_nested {
73 struct llist_node list;
74 u64 rmap;
75};
76
77/*
78 * for_each_nest_rmap_safe - iterate over the list of nested rmap entries
79 * safe against removal of the list entry or NULL list
80 * @pos: a (struct rmap_nested *) to use as a loop cursor
81 * @node: pointer to the first entry
82 * NOTE: this can be NULL
83 * @rmapp: an (unsigned long *) in which to return the rmap entries on each
84 * iteration
85 * NOTE: this must point to already allocated memory
86 *
87 * The nested_rmap is a llist of (struct rmap_nested) entries pointed to by the
88 * rmap entry in the memslot. The list is always terminated by a "single entry"
89 * stored in the list element of the final entry of the llist. If there is ONLY
90 * a single entry then this is itself in the rmap entry of the memslot, not a
91 * llist head pointer.
92 *
93 * Note that the iterator below assumes that a nested rmap entry is always
94 * non-zero. This is true for our usage because the LPID field is always
95 * non-zero (zero is reserved for the host).
96 *
97 * This should be used to iterate over the list of rmap_nested entries with
98 * processing done on the u64 rmap value given by each iteration. This is safe
99 * against removal of list entries and it is always safe to call free on (pos).
100 *
101 * e.g.
102 * struct rmap_nested *cursor;
103 * struct llist_node *first;
104 * unsigned long rmap;
105 * for_each_nest_rmap_safe(cursor, first, &rmap) {
106 * do_something(rmap);
107 * free(cursor);
108 * }
109 */
110#define for_each_nest_rmap_safe(pos, node, rmapp) \
111 for ((pos) = llist_entry((node), typeof(*(pos)), list); \
112 (node) && \
113 (*(rmapp) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ? \
114 ((u64) (node)) : ((pos)->rmap))) && \
115 (((node) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ? \
116 ((struct llist_node *) ((pos) = NULL)) : \
117 (pos)->list.next)), true); \
118 (pos) = llist_entry((node), typeof(*(pos)), list))
119
120struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
121 bool create);
122void kvmhv_put_nested(struct kvm_nested_guest *gp);
123int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid);
124
125/* Encoding of first parameter for H_TLB_INVALIDATE */
126#define H_TLBIE_P1_ENC(ric, prs, r) (___PPC_RIC(ric) | ___PPC_PRS(prs) | \
127 ___PPC_R(r))
26 128
27/* Power architecture requires HPT is at least 256kiB, at most 64TiB */ 129/* Power architecture requires HPT is at least 256kiB, at most 64TiB */
28#define PPC_MIN_HPT_ORDER 18 130#define PPC_MIN_HPT_ORDER 18
@@ -435,6 +537,7 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
435} 537}
436 538
437extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); 539extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
540extern void kvmhv_radix_debugfs_init(struct kvm *kvm);
438 541
439extern void kvmhv_rm_send_ipi(int cpu); 542extern void kvmhv_rm_send_ipi(int cpu);
440 543
@@ -482,7 +585,7 @@ static inline u64 sanitize_msr(u64 msr)
482#ifdef CONFIG_PPC_TRANSACTIONAL_MEM 585#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
483static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu) 586static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
484{ 587{
485 vcpu->arch.cr = vcpu->arch.cr_tm; 588 vcpu->arch.regs.ccr = vcpu->arch.cr_tm;
486 vcpu->arch.regs.xer = vcpu->arch.xer_tm; 589 vcpu->arch.regs.xer = vcpu->arch.xer_tm;
487 vcpu->arch.regs.link = vcpu->arch.lr_tm; 590 vcpu->arch.regs.link = vcpu->arch.lr_tm;
488 vcpu->arch.regs.ctr = vcpu->arch.ctr_tm; 591 vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
@@ -499,7 +602,7 @@ static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
499 602
500static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu) 603static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
501{ 604{
502 vcpu->arch.cr_tm = vcpu->arch.cr; 605 vcpu->arch.cr_tm = vcpu->arch.regs.ccr;
503 vcpu->arch.xer_tm = vcpu->arch.regs.xer; 606 vcpu->arch.xer_tm = vcpu->arch.regs.xer;
504 vcpu->arch.lr_tm = vcpu->arch.regs.link; 607 vcpu->arch.lr_tm = vcpu->arch.regs.link;
505 vcpu->arch.ctr_tm = vcpu->arch.regs.ctr; 608 vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
@@ -515,6 +618,17 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
515} 618}
516#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ 619#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
517 620
621extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
622 unsigned long gpa, unsigned int level,
623 unsigned long mmu_seq, unsigned int lpid,
624 unsigned long *rmapp, struct rmap_nested **n_rmap);
625extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
626 struct rmap_nested **n_rmap);
627extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
628 struct kvm_memory_slot *memslot,
629 unsigned long gpa, unsigned long hpa,
630 unsigned long nbytes);
631
518#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 632#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
519 633
520#endif /* __ASM_KVM_BOOK3S_64_H__ */ 634#endif /* __ASM_KVM_BOOK3S_64_H__ */
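To make the nested rmap encoding above concrete, an entry for a given nested LPID and guest page can be composed and decomposed with the masks defined in this header; the helpers below are purely illustrative and not part of the patch:

/* Hypothetical helpers illustrating the nested rmap layout defined above. */
static inline u64 n_rmap_pack(unsigned int lpid, u64 gpa)
{
	/* 12-bit LPID in the top bits, 4k-aligned guest page number below it */
	return (((u64)lpid << RMAP_NESTED_LPID_SHIFT) & RMAP_NESTED_LPID_MASK) |
	       (gpa & RMAP_NESTED_GPA_MASK);
}

static inline unsigned int n_rmap_lpid(u64 rmap)
{
	return (rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
}

static inline u64 n_rmap_gpa(u64 rmap)
{
	return rmap & RMAP_NESTED_GPA_MASK;
}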
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index d978fdf698af..eb3ba6390108 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -25,6 +25,9 @@
25#define XICS_MFRR 0xc 25#define XICS_MFRR 0xc
26#define XICS_IPI 2 /* interrupt source # for IPIs */ 26#define XICS_IPI 2 /* interrupt source # for IPIs */
27 27
28/* LPIDs we support with this build -- runtime limit may be lower */
29#define KVMPPC_NR_LPIDS (LPID_RSVD + 1)
30
28/* Maximum number of threads per physical core */ 31/* Maximum number of threads per physical core */
29#define MAX_SMT_THREADS 8 32#define MAX_SMT_THREADS 8
30 33
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index d513e3ed1c65..f0cef625f17c 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -46,12 +46,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
46 46
47static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) 47static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
48{ 48{
49 vcpu->arch.cr = val; 49 vcpu->arch.regs.ccr = val;
50} 50}
51 51
52static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) 52static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
53{ 53{
54 return vcpu->arch.cr; 54 return vcpu->arch.regs.ccr;
55} 55}
56 56
57static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val) 57static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 906bcbdfd2a1..fac6f631ed29 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -46,6 +46,7 @@
46#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 46#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
47#include <asm/kvm_book3s_asm.h> /* for MAX_SMT_THREADS */ 47#include <asm/kvm_book3s_asm.h> /* for MAX_SMT_THREADS */
48#define KVM_MAX_VCPU_ID (MAX_SMT_THREADS * KVM_MAX_VCORES) 48#define KVM_MAX_VCPU_ID (MAX_SMT_THREADS * KVM_MAX_VCORES)
49#define KVM_MAX_NESTED_GUESTS KVMPPC_NR_LPIDS
49 50
50#else 51#else
51#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS 52#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
@@ -94,6 +95,7 @@ struct dtl_entry;
94 95
95struct kvmppc_vcpu_book3s; 96struct kvmppc_vcpu_book3s;
96struct kvmppc_book3s_shadow_vcpu; 97struct kvmppc_book3s_shadow_vcpu;
98struct kvm_nested_guest;
97 99
98struct kvm_vm_stat { 100struct kvm_vm_stat {
99 ulong remote_tlb_flush; 101 ulong remote_tlb_flush;
@@ -287,10 +289,12 @@ struct kvm_arch {
287 u8 radix; 289 u8 radix;
288 u8 fwnmi_enabled; 290 u8 fwnmi_enabled;
289 bool threads_indep; 291 bool threads_indep;
292 bool nested_enable;
290 pgd_t *pgtable; 293 pgd_t *pgtable;
291 u64 process_table; 294 u64 process_table;
292 struct dentry *debugfs_dir; 295 struct dentry *debugfs_dir;
293 struct dentry *htab_dentry; 296 struct dentry *htab_dentry;
297 struct dentry *radix_dentry;
294 struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */ 298 struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */
295#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 299#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
296#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE 300#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
@@ -311,6 +315,9 @@ struct kvm_arch {
311#endif 315#endif
312 struct kvmppc_ops *kvm_ops; 316 struct kvmppc_ops *kvm_ops;
313#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 317#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
318 u64 l1_ptcr;
319 int max_nested_lpid;
320 struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
314 /* This array can grow quite large, keep it at the end */ 321 /* This array can grow quite large, keep it at the end */
315 struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; 322 struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
316#endif 323#endif
@@ -360,7 +367,9 @@ struct kvmppc_pte {
360 bool may_write : 1; 367 bool may_write : 1;
361 bool may_execute : 1; 368 bool may_execute : 1;
362 unsigned long wimg; 369 unsigned long wimg;
370 unsigned long rc;
363 u8 page_size; /* MMU_PAGE_xxx */ 371 u8 page_size; /* MMU_PAGE_xxx */
372 u8 page_shift;
364}; 373};
365 374
366struct kvmppc_mmu { 375struct kvmppc_mmu {
@@ -537,8 +546,6 @@ struct kvm_vcpu_arch {
537 ulong tar; 546 ulong tar;
538#endif 547#endif
539 548
540 u32 cr;
541
542#ifdef CONFIG_PPC_BOOK3S 549#ifdef CONFIG_PPC_BOOK3S
543 ulong hflags; 550 ulong hflags;
544 ulong guest_owned_ext; 551 ulong guest_owned_ext;
@@ -707,6 +714,7 @@ struct kvm_vcpu_arch {
707 u8 hcall_needed; 714 u8 hcall_needed;
708 u8 epr_flags; /* KVMPPC_EPR_xxx */ 715 u8 epr_flags; /* KVMPPC_EPR_xxx */
709 u8 epr_needed; 716 u8 epr_needed;
717 u8 external_oneshot; /* clear external irq after delivery */
710 718
711 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ 719 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
712 720
@@ -781,6 +789,10 @@ struct kvm_vcpu_arch {
781 u32 emul_inst; 789 u32 emul_inst;
782 790
783 u32 online; 791 u32 online;
792
793 /* For support of nested guests */
794 struct kvm_nested_guest *nested;
795 u32 nested_vcpu_id;
784#endif 796#endif
785 797
786#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 798#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index e991821dd7fa..9b89b1918dfc 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -194,9 +194,7 @@ extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
194 (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \ 194 (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \
195 (stt)->size, (ioba), (npages)) ? \ 195 (stt)->size, (ioba), (npages)) ? \
196 H_PARAMETER : H_SUCCESS) 196 H_PARAMETER : H_SUCCESS)
197extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, 197extern long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
198 unsigned long tce);
199extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
200 unsigned long *ua, unsigned long **prmap); 198 unsigned long *ua, unsigned long **prmap);
201extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt, 199extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
202 unsigned long idx, unsigned long tce); 200 unsigned long idx, unsigned long tce);
@@ -327,6 +325,7 @@ struct kvmppc_ops {
327 int (*set_smt_mode)(struct kvm *kvm, unsigned long mode, 325 int (*set_smt_mode)(struct kvm *kvm, unsigned long mode,
328 unsigned long flags); 326 unsigned long flags);
329 void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr); 327 void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr);
328 int (*enable_nested)(struct kvm *kvm);
330}; 329};
331 330
332extern struct kvmppc_ops *kvmppc_hv_ops; 331extern struct kvmppc_ops *kvmppc_hv_ops;
@@ -585,6 +584,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
585 584
586extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, 585extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
587 int level, bool line_status); 586 int level, bool line_status);
587extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
588#else 588#else
589static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, 589static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
590 u32 priority) { return -1; } 590 u32 priority) { return -1; }
@@ -607,6 +607,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur
607 607
608static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, 608static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
609 int level, bool line_status) { return -ENODEV; } 609 int level, bool line_status) { return -ENODEV; }
610static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
610#endif /* CONFIG_KVM_XIVE */ 611#endif /* CONFIG_KVM_XIVE */
611 612
612/* 613/*
@@ -652,6 +653,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
652 unsigned long mfrr); 653 unsigned long mfrr);
653int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); 654int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
654int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); 655int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
656void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu);
655 657
656/* 658/*
657 * Host-side operations we want to set up while running in real 659 * Host-side operations we want to set up while running in real
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 665af14850e4..6093bc8f74e5 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -104,6 +104,7 @@
104#define OP_31_XOP_LHZUX 311 104#define OP_31_XOP_LHZUX 311
105#define OP_31_XOP_MSGSNDP 142 105#define OP_31_XOP_MSGSNDP 142
106#define OP_31_XOP_MSGCLRP 174 106#define OP_31_XOP_MSGCLRP 174
107#define OP_31_XOP_TLBIE 306
107#define OP_31_XOP_MFSPR 339 108#define OP_31_XOP_MFSPR 339
108#define OP_31_XOP_LWAX 341 109#define OP_31_XOP_LWAX 341
109#define OP_31_XOP_LHAX 343 110#define OP_31_XOP_LHAX 343
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index e5b314ed054e..c90698972f42 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -415,6 +415,7 @@
415#define HFSCR_DSCR __MASK(FSCR_DSCR_LG) 415#define HFSCR_DSCR __MASK(FSCR_DSCR_LG)
416#define HFSCR_VECVSX __MASK(FSCR_VECVSX_LG) 416#define HFSCR_VECVSX __MASK(FSCR_VECVSX_LG)
417#define HFSCR_FP __MASK(FSCR_FP_LG) 417#define HFSCR_FP __MASK(FSCR_FP_LG)
418#define HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */
418#define SPRN_TAR 0x32f /* Target Address Register */ 419#define SPRN_TAR 0x32f /* Target Address Register */
419#define SPRN_LPCR 0x13E /* LPAR Control Register */ 420#define SPRN_LPCR 0x13E /* LPAR Control Register */
420#define LPCR_VPM0 ASM_CONST(0x8000000000000000) 421#define LPCR_VPM0 ASM_CONST(0x8000000000000000)
@@ -766,6 +767,7 @@
766#define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */ 767#define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */
767#define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */ 768#define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */
768#define HSRR1_DENORM 0x00100000 /* Denorm exception */ 769#define HSRR1_DENORM 0x00100000 /* Denorm exception */
770#define HSRR1_HISI_WRITE 0x00010000 /* HISI bcs couldn't update mem */
769 771
770#define SPRN_TBCTL 0x35f /* PA6T Timebase control register */ 772#define SPRN_TBCTL 0x35f /* PA6T Timebase control register */
771#define TBCTL_FREEZE 0x0000000000000000ull /* Freeze all tbs */ 773#define TBCTL_FREEZE 0x0000000000000000ull /* Freeze all tbs */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 1b32b56a03d3..8c876c166ef2 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -634,6 +634,7 @@ struct kvm_ppc_cpu_char {
634 634
635#define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe) 635#define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe)
636#define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf) 636#define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf)
637#define KVM_REG_PPC_PTCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0)
637 638
638/* Transactional Memory checkpointed state: 639/* Transactional Memory checkpointed state:
639 * This is all GPRs, all VSX regs and a subset of SPRs 640 * This is all GPRs, all VSX regs and a subset of SPRs
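The new KVM_REG_PPC_PTCR ID is accessed through the usual one-reg interface, so a VMM restoring the guest's partition-table control register (for example on the destination of a migration) might do something like the following hypothetical helper; error handling is elided:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

/* Illustrative sketch: write the guest PTCR through KVM_SET_ONE_REG. */
static int set_guest_ptcr(int vcpu_fd, uint64_t ptcr)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_PTCR,
		.addr = (uintptr_t)&ptcr,
	};

	return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}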
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 89cf15566c4e..d0abcbbdc700 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -438,7 +438,7 @@ int main(void)
438#ifdef CONFIG_PPC_BOOK3S 438#ifdef CONFIG_PPC_BOOK3S
439 OFFSET(VCPU_TAR, kvm_vcpu, arch.tar); 439 OFFSET(VCPU_TAR, kvm_vcpu, arch.tar);
440#endif 440#endif
441 OFFSET(VCPU_CR, kvm_vcpu, arch.cr); 441 OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
442 OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip); 442 OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip);
443#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 443#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
444 OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr); 444 OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr);
@@ -503,6 +503,7 @@ int main(void)
503 OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); 503 OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
504 OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty); 504 OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty);
505 OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst); 505 OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst);
506 OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested);
506 OFFSET(VCPU_CPU, kvm_vcpu, cpu); 507 OFFSET(VCPU_CPU, kvm_vcpu, cpu);
507 OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu); 508 OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu);
508#endif 509#endif
@@ -695,7 +696,7 @@ int main(void)
695#endif /* CONFIG_PPC_BOOK3S_64 */ 696#endif /* CONFIG_PPC_BOOK3S_64 */
696 697
697#else /* CONFIG_PPC_BOOK3S */ 698#else /* CONFIG_PPC_BOOK3S */
698 OFFSET(VCPU_CR, kvm_vcpu, arch.cr); 699 OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
699 OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer); 700 OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer);
700 OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link); 701 OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link);
701 OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr); 702 OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr);
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 458b928dbd84..c317080db771 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -147,8 +147,8 @@ __init_hvmode_206:
147 rldicl. r0,r3,4,63 147 rldicl. r0,r3,4,63
148 bnelr 148 bnelr
149 ld r5,CPU_SPEC_FEATURES(r4) 149 ld r5,CPU_SPEC_FEATURES(r4)
150 LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE) 150 LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST)
151 xor r5,r5,r6 151 andc r5,r5,r6
152 std r5,CPU_SPEC_FEATURES(r4) 152 std r5,CPU_SPEC_FEATURES(r4)
153 blr 153 blr
154 154
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index f872c04bb5b1..e814f40ab836 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -75,7 +75,8 @@ kvm-hv-y += \
75 book3s_hv.o \ 75 book3s_hv.o \
76 book3s_hv_interrupts.o \ 76 book3s_hv_interrupts.o \
77 book3s_64_mmu_hv.o \ 77 book3s_64_mmu_hv.o \
78 book3s_64_mmu_radix.o 78 book3s_64_mmu_radix.o \
79 book3s_hv_nested.o
79 80
80kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \ 81kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
81 book3s_hv_tm.o 82 book3s_hv_tm.o
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 87348e498c89..fd9893bc7aa1 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -78,8 +78,11 @@ void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu)
78{ 78{
79 if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) { 79 if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) {
80 ulong pc = kvmppc_get_pc(vcpu); 80 ulong pc = kvmppc_get_pc(vcpu);
81 ulong lr = kvmppc_get_lr(vcpu);
81 if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) 82 if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
82 kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK); 83 kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK);
84 if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
85 kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK);
83 vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK; 86 vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK;
84 } 87 }
85} 88}
@@ -150,7 +153,6 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
150 case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE; break; 153 case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE; break;
151 case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT; break; 154 case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT; break;
152 case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL; break; 155 case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL; break;
153 case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL; break;
154 case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT; break; 156 case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT; break;
155 case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM; break; 157 case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM; break;
156 case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL; break; 158 case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL; break;
@@ -236,18 +238,35 @@ EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec);
236void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, 238void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
237 struct kvm_interrupt *irq) 239 struct kvm_interrupt *irq)
238{ 240{
239 unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL; 241 /*
240 242 * This case (KVM_INTERRUPT_SET) should never actually arise for
241 if (irq->irq == KVM_INTERRUPT_SET_LEVEL) 243 * a pseries guest (because pseries guests expect their interrupt
242 vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL; 244 * controllers to continue asserting an external interrupt request
245 * until it is acknowledged at the interrupt controller), but is
246 * included to avoid ABI breakage and potentially for other
247 * sorts of guest.
248 *
249 * There is a subtlety here: HV KVM does not test the
250 * external_oneshot flag in the code that synthesizes
251 * external interrupts for the guest just before entering
252 * the guest. That is OK even if userspace did do a
253 * KVM_INTERRUPT_SET on a pseries guest vcpu, because the
254 * caller (kvm_vcpu_ioctl_interrupt) does a kvm_vcpu_kick()
255 * which ends up doing a smp_send_reschedule(), which will
256 * pull the guest all the way out to the host, meaning that
257 * we will call kvmppc_core_prepare_to_enter() before entering
258 * the guest again, and that will handle the external_oneshot
259 * flag correctly.
260 */
261 if (irq->irq == KVM_INTERRUPT_SET)
262 vcpu->arch.external_oneshot = 1;
243 263
244 kvmppc_book3s_queue_irqprio(vcpu, vec); 264 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
245} 265}
246 266
247void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu) 267void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
248{ 268{
249 kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); 269 kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
250 kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
251} 270}
252 271
253void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar, 272void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
@@ -278,7 +297,6 @@ static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu,
278 vec = BOOK3S_INTERRUPT_DECREMENTER; 297 vec = BOOK3S_INTERRUPT_DECREMENTER;
279 break; 298 break;
280 case BOOK3S_IRQPRIO_EXTERNAL: 299 case BOOK3S_IRQPRIO_EXTERNAL:
281 case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
282 deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; 300 deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
283 vec = BOOK3S_INTERRUPT_EXTERNAL; 301 vec = BOOK3S_INTERRUPT_EXTERNAL;
284 break; 302 break;
@@ -352,8 +370,16 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
352 case BOOK3S_IRQPRIO_DECREMENTER: 370 case BOOK3S_IRQPRIO_DECREMENTER:
353 /* DEC interrupts get cleared by mtdec */ 371 /* DEC interrupts get cleared by mtdec */
354 return false; 372 return false;
355 case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: 373 case BOOK3S_IRQPRIO_EXTERNAL:
356 /* External interrupts get cleared by userspace */ 374 /*
375 * External interrupts get cleared by userspace
376 * except when set by the KVM_INTERRUPT ioctl with
377 * KVM_INTERRUPT_SET (not KVM_INTERRUPT_SET_LEVEL).
378 */
379 if (vcpu->arch.external_oneshot) {
380 vcpu->arch.external_oneshot = 0;
381 return true;
382 }
357 return false; 383 return false;
358 } 384 }
359 385
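From userspace, the distinction described in the comments above maps onto the existing KVM_INTERRUPT ioctl: KVM_INTERRUPT_SET now behaves as a one-shot external interrupt that KVM clears after delivery, while KVM_INTERRUPT_SET_LEVEL stays asserted until KVM_INTERRUPT_UNSET. A minimal, illustrative injection helper (not part of the patch):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Illustrative only: raise an external interrupt on a vCPU fd. */
static int inject_external_irq(int vcpu_fd, int oneshot)
{
	struct kvm_interrupt irq = {
		.irq = oneshot ? KVM_INTERRUPT_SET : KVM_INTERRUPT_SET_LEVEL,
	};

	return ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
}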
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 68e14afecac8..c615617e78ac 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -268,14 +268,13 @@ int kvmppc_mmu_hv_init(void)
268{ 268{
269 unsigned long host_lpid, rsvd_lpid; 269 unsigned long host_lpid, rsvd_lpid;
270 270
271 if (!cpu_has_feature(CPU_FTR_HVMODE))
272 return -EINVAL;
273
274 if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE)) 271 if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
275 return -EINVAL; 272 return -EINVAL;
276 273
277 /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */ 274 /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
278 host_lpid = mfspr(SPRN_LPID); 275 host_lpid = 0;
276 if (cpu_has_feature(CPU_FTR_HVMODE))
277 host_lpid = mfspr(SPRN_LPID);
279 rsvd_lpid = LPID_RSVD; 278 rsvd_lpid = LPID_RSVD;
280 279
281 kvmppc_init_lpid(rsvd_lpid + 1); 280 kvmppc_init_lpid(rsvd_lpid + 1);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 933c574e1cf7..43b21e88c716 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -10,6 +10,9 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/kvm.h> 11#include <linux/kvm.h>
12#include <linux/kvm_host.h> 12#include <linux/kvm_host.h>
13#include <linux/anon_inodes.h>
14#include <linux/file.h>
15#include <linux/debugfs.h>
13 16
14#include <asm/kvm_ppc.h> 17#include <asm/kvm_ppc.h>
15#include <asm/kvm_book3s.h> 18#include <asm/kvm_book3s.h>
@@ -26,87 +29,74 @@
26 */ 29 */
27static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 }; 30static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
28 31
29int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, 32int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
30 struct kvmppc_pte *gpte, bool data, bool iswrite) 33 struct kvmppc_pte *gpte, u64 root,
34 u64 *pte_ret_p)
31{ 35{
32 struct kvm *kvm = vcpu->kvm; 36 struct kvm *kvm = vcpu->kvm;
33 u32 pid;
34 int ret, level, ps; 37 int ret, level, ps;
35 __be64 prte, rpte; 38 unsigned long rts, bits, offset, index;
36 unsigned long ptbl; 39 u64 pte, base, gpa;
37 unsigned long root, pte, index; 40 __be64 rpte;
38 unsigned long rts, bits, offset;
39 unsigned long gpa;
40 unsigned long proc_tbl_size;
41
42 /* Work out effective PID */
43 switch (eaddr >> 62) {
44 case 0:
45 pid = vcpu->arch.pid;
46 break;
47 case 3:
48 pid = 0;
49 break;
50 default:
51 return -EINVAL;
52 }
53 proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
54 if (pid * 16 >= proc_tbl_size)
55 return -EINVAL;
56
57 /* Read partition table to find root of tree for effective PID */
58 ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16);
59 ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte));
60 if (ret)
61 return ret;
62 41
63 root = be64_to_cpu(prte);
64 rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) | 42 rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
65 ((root & RTS2_MASK) >> RTS2_SHIFT); 43 ((root & RTS2_MASK) >> RTS2_SHIFT);
66 bits = root & RPDS_MASK; 44 bits = root & RPDS_MASK;
67 root = root & RPDB_MASK; 45 base = root & RPDB_MASK;
68 46
69 offset = rts + 31; 47 offset = rts + 31;
70 48
71 /* current implementations only support 52-bit space */ 49 /* Current implementations only support 52-bit space */
72 if (offset != 52) 50 if (offset != 52)
73 return -EINVAL; 51 return -EINVAL;
74 52
53 /* Walk each level of the radix tree */
75 for (level = 3; level >= 0; --level) { 54 for (level = 3; level >= 0; --level) {
55 u64 addr;
56 /* Check a valid size */
76 if (level && bits != p9_supported_radix_bits[level]) 57 if (level && bits != p9_supported_radix_bits[level])
77 return -EINVAL; 58 return -EINVAL;
78 if (level == 0 && !(bits == 5 || bits == 9)) 59 if (level == 0 && !(bits == 5 || bits == 9))
79 return -EINVAL; 60 return -EINVAL;
80 offset -= bits; 61 offset -= bits;
81 index = (eaddr >> offset) & ((1UL << bits) - 1); 62 index = (eaddr >> offset) & ((1UL << bits) - 1);
82 /* check that low bits of page table base are zero */ 63 /* Check that low bits of page table base are zero */
83 if (root & ((1UL << (bits + 3)) - 1)) 64 if (base & ((1UL << (bits + 3)) - 1))
84 return -EINVAL; 65 return -EINVAL;
85 ret = kvm_read_guest(kvm, root + index * 8, 66 /* Read the entry from guest memory */
86 &rpte, sizeof(rpte)); 67 addr = base + (index * sizeof(rpte));
87 if (ret) 68 ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
69 if (ret) {
70 if (pte_ret_p)
71 *pte_ret_p = addr;
88 return ret; 72 return ret;
73 }
89 pte = __be64_to_cpu(rpte); 74 pte = __be64_to_cpu(rpte);
90 if (!(pte & _PAGE_PRESENT)) 75 if (!(pte & _PAGE_PRESENT))
91 return -ENOENT; 76 return -ENOENT;
77 /* Check if a leaf entry */
92 if (pte & _PAGE_PTE) 78 if (pte & _PAGE_PTE)
93 break; 79 break;
94 bits = pte & 0x1f; 80 /* Get ready to walk the next level */
95 root = pte & 0x0fffffffffffff00ul; 81 base = pte & RPDB_MASK;
82 bits = pte & RPDS_MASK;
96 } 83 }
97 /* need a leaf at lowest level; 512GB pages not supported */ 84
85 /* Need a leaf at lowest level; 512GB pages not supported */
98 if (level < 0 || level == 3) 86 if (level < 0 || level == 3)
99 return -EINVAL; 87 return -EINVAL;
100 88
101 /* offset is now log base 2 of the page size */ 89 /* We found a valid leaf PTE */
90 /* Offset is now log base 2 of the page size */
102 gpa = pte & 0x01fffffffffff000ul; 91 gpa = pte & 0x01fffffffffff000ul;
103 if (gpa & ((1ul << offset) - 1)) 92 if (gpa & ((1ul << offset) - 1))
104 return -EINVAL; 93 return -EINVAL;
105 gpa += eaddr & ((1ul << offset) - 1); 94 gpa |= eaddr & ((1ul << offset) - 1);
106 for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps) 95 for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
107 if (offset == mmu_psize_defs[ps].shift) 96 if (offset == mmu_psize_defs[ps].shift)
108 break; 97 break;
109 gpte->page_size = ps; 98 gpte->page_size = ps;
99 gpte->page_shift = offset;
110 100
111 gpte->eaddr = eaddr; 101 gpte->eaddr = eaddr;
112 gpte->raddr = gpa; 102 gpte->raddr = gpa;
@@ -115,6 +105,77 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
115 gpte->may_read = !!(pte & _PAGE_READ); 105 gpte->may_read = !!(pte & _PAGE_READ);
116 gpte->may_write = !!(pte & _PAGE_WRITE); 106 gpte->may_write = !!(pte & _PAGE_WRITE);
117 gpte->may_execute = !!(pte & _PAGE_EXEC); 107 gpte->may_execute = !!(pte & _PAGE_EXEC);
108
109 gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
110
111 if (pte_ret_p)
112 *pte_ret_p = pte;
113
114 return 0;
115}
116
117/*
118 * Used to walk a partition or process table radix tree in guest memory
119 * Note: We exploit the fact that a partition table and a process
120 * table have the same layout, a partition-scoped page table and a
121 * process-scoped page table have the same layout, and the 2nd
122 * doubleword of a partition table entry has the same layout as
123 * the PTCR register.
124 */
125int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
126 struct kvmppc_pte *gpte, u64 table,
127 int table_index, u64 *pte_ret_p)
128{
129 struct kvm *kvm = vcpu->kvm;
130 int ret;
131 unsigned long size, ptbl, root;
132 struct prtb_entry entry;
133
134 if ((table & PRTS_MASK) > 24)
135 return -EINVAL;
136 size = 1ul << ((table & PRTS_MASK) + 12);
137
138 /* Is the table big enough to contain this entry? */
139 if ((table_index * sizeof(entry)) >= size)
140 return -EINVAL;
141
142 /* Read the table to find the root of the radix tree */
143 ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
144 ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
145 if (ret)
146 return ret;
147
148 /* Root is stored in the first double word */
149 root = be64_to_cpu(entry.prtb0);
150
151 return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
152}
153
154int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
155 struct kvmppc_pte *gpte, bool data, bool iswrite)
156{
157 u32 pid;
158 u64 pte;
159 int ret;
160
161 /* Work out effective PID */
162 switch (eaddr >> 62) {
163 case 0:
164 pid = vcpu->arch.pid;
165 break;
166 case 3:
167 pid = 0;
168 break;
169 default:
170 return -EINVAL;
171 }
172
173 ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
174 vcpu->kvm->arch.process_table, pid, &pte);
175 if (ret)
176 return ret;
177
178 /* Check privilege (applies only to process scoped translations) */
118 if (kvmppc_get_msr(vcpu) & MSR_PR) { 179 if (kvmppc_get_msr(vcpu) & MSR_PR) {
119 if (pte & _PAGE_PRIVILEGED) { 180 if (pte & _PAGE_PRIVILEGED) {
120 gpte->may_read = 0; 181 gpte->may_read = 0;
@@ -137,20 +198,46 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
137} 198}
138 199
139static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, 200static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
140 unsigned int pshift) 201 unsigned int pshift, unsigned int lpid)
141{ 202{
142 unsigned long psize = PAGE_SIZE; 203 unsigned long psize = PAGE_SIZE;
204 int psi;
205 long rc;
206 unsigned long rb;
143 207
144 if (pshift) 208 if (pshift)
145 psize = 1UL << pshift; 209 psize = 1UL << pshift;
210 else
211 pshift = PAGE_SHIFT;
146 212
147 addr &= ~(psize - 1); 213 addr &= ~(psize - 1);
148 radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize); 214
215 if (!kvmhv_on_pseries()) {
216 radix__flush_tlb_lpid_page(lpid, addr, psize);
217 return;
218 }
219
220 psi = shift_to_mmu_psize(pshift);
221 rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
222 rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
223 lpid, rb);
224 if (rc)
225 pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
149} 226}
150 227
151static void kvmppc_radix_flush_pwc(struct kvm *kvm) 228static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
152{ 229{
153 radix__flush_pwc_lpid(kvm->arch.lpid); 230 long rc;
231
232 if (!kvmhv_on_pseries()) {
233 radix__flush_pwc_lpid(lpid);
234 return;
235 }
236
237 rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
238 lpid, TLBIEL_INVAL_SET_LPID);
239 if (rc)
240 pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
154} 241}
155 242
156static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, 243static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
@@ -195,23 +282,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
195 kmem_cache_free(kvm_pmd_cache, pmdp); 282 kmem_cache_free(kvm_pmd_cache, pmdp);
196} 283}
197 284
198static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, 285/* Called with kvm->mmu_lock held */
199 unsigned long gpa, unsigned int shift) 286void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
287 unsigned int shift, struct kvm_memory_slot *memslot,
288 unsigned int lpid)
200 289
201{ 290{
202 unsigned long page_size = 1ul << shift;
203 unsigned long old; 291 unsigned long old;
292 unsigned long gfn = gpa >> PAGE_SHIFT;
293 unsigned long page_size = PAGE_SIZE;
294 unsigned long hpa;
204 295
205 old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift); 296 old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
206 kvmppc_radix_tlbie_page(kvm, gpa, shift); 297 kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
207 if (old & _PAGE_DIRTY) { 298
208 unsigned long gfn = gpa >> PAGE_SHIFT; 299 /* The following only applies to L1 entries */
209 struct kvm_memory_slot *memslot; 300 if (lpid != kvm->arch.lpid)
301 return;
210 302
303 if (!memslot) {
211 memslot = gfn_to_memslot(kvm, gfn); 304 memslot = gfn_to_memslot(kvm, gfn);
212 if (memslot && memslot->dirty_bitmap) 305 if (!memslot)
213 kvmppc_update_dirty_map(memslot, gfn, page_size); 306 return;
214 } 307 }
308 if (shift)
309 page_size = 1ul << shift;
310
311 gpa &= ~(page_size - 1);
312 hpa = old & PTE_RPN_MASK;
313 kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
314
315 if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
316 kvmppc_update_dirty_map(memslot, gfn, page_size);
215} 317}
216 318
217/* 319/*
@@ -224,7 +326,8 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
224 * and emit a warning if encountered, but there may already be data 326 * and emit a warning if encountered, but there may already be data
225 * corruption due to the unexpected mappings. 327 * corruption due to the unexpected mappings.
226 */ 328 */
227static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full) 329static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
330 unsigned int lpid)
228{ 331{
229 if (full) { 332 if (full) {
230 memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE); 333 memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
@@ -238,14 +341,15 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
238 WARN_ON_ONCE(1); 341 WARN_ON_ONCE(1);
239 kvmppc_unmap_pte(kvm, p, 342 kvmppc_unmap_pte(kvm, p,
240 pte_pfn(*p) << PAGE_SHIFT, 343 pte_pfn(*p) << PAGE_SHIFT,
241 PAGE_SHIFT); 344 PAGE_SHIFT, NULL, lpid);
242 } 345 }
243 } 346 }
244 347
245 kvmppc_pte_free(pte); 348 kvmppc_pte_free(pte);
246} 349}
247 350
248static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full) 351static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
352 unsigned int lpid)
249{ 353{
250 unsigned long im; 354 unsigned long im;
251 pmd_t *p = pmd; 355 pmd_t *p = pmd;
@@ -260,20 +364,21 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
260 WARN_ON_ONCE(1); 364 WARN_ON_ONCE(1);
261 kvmppc_unmap_pte(kvm, (pte_t *)p, 365 kvmppc_unmap_pte(kvm, (pte_t *)p,
262 pte_pfn(*(pte_t *)p) << PAGE_SHIFT, 366 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
263 PMD_SHIFT); 367 PMD_SHIFT, NULL, lpid);
264 } 368 }
265 } else { 369 } else {
266 pte_t *pte; 370 pte_t *pte;
267 371
268 pte = pte_offset_map(p, 0); 372 pte = pte_offset_map(p, 0);
269 kvmppc_unmap_free_pte(kvm, pte, full); 373 kvmppc_unmap_free_pte(kvm, pte, full, lpid);
270 pmd_clear(p); 374 pmd_clear(p);
271 } 375 }
272 } 376 }
273 kvmppc_pmd_free(pmd); 377 kvmppc_pmd_free(pmd);
274} 378}
275 379
276static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud) 380static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
381 unsigned int lpid)
277{ 382{
278 unsigned long iu; 383 unsigned long iu;
279 pud_t *p = pud; 384 pud_t *p = pud;
@@ -287,36 +392,40 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
287 pmd_t *pmd; 392 pmd_t *pmd;
288 393
289 pmd = pmd_offset(p, 0); 394 pmd = pmd_offset(p, 0);
290 kvmppc_unmap_free_pmd(kvm, pmd, true); 395 kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
291 pud_clear(p); 396 pud_clear(p);
292 } 397 }
293 } 398 }
294 pud_free(kvm->mm, pud); 399 pud_free(kvm->mm, pud);
295} 400}
296 401
297void kvmppc_free_radix(struct kvm *kvm) 402void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
298{ 403{
299 unsigned long ig; 404 unsigned long ig;
300 pgd_t *pgd;
301 405
302 if (!kvm->arch.pgtable)
303 return;
304 pgd = kvm->arch.pgtable;
305 for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) { 406 for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
306 pud_t *pud; 407 pud_t *pud;
307 408
308 if (!pgd_present(*pgd)) 409 if (!pgd_present(*pgd))
309 continue; 410 continue;
310 pud = pud_offset(pgd, 0); 411 pud = pud_offset(pgd, 0);
311 kvmppc_unmap_free_pud(kvm, pud); 412 kvmppc_unmap_free_pud(kvm, pud, lpid);
312 pgd_clear(pgd); 413 pgd_clear(pgd);
313 } 414 }
314 pgd_free(kvm->mm, kvm->arch.pgtable); 415}
315 kvm->arch.pgtable = NULL; 416
417void kvmppc_free_radix(struct kvm *kvm)
418{
419 if (kvm->arch.pgtable) {
420 kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
421 kvm->arch.lpid);
422 pgd_free(kvm->mm, kvm->arch.pgtable);
423 kvm->arch.pgtable = NULL;
424 }
316} 425}
317 426
318static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd, 427static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
319 unsigned long gpa) 428 unsigned long gpa, unsigned int lpid)
320{ 429{
321 pte_t *pte = pte_offset_kernel(pmd, 0); 430 pte_t *pte = pte_offset_kernel(pmd, 0);
322 431
@@ -326,13 +435,13 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
326 * flushing the PWC again. 435 * flushing the PWC again.
327 */ 436 */
328 pmd_clear(pmd); 437 pmd_clear(pmd);
329 kvmppc_radix_flush_pwc(kvm); 438 kvmppc_radix_flush_pwc(kvm, lpid);
330 439
331 kvmppc_unmap_free_pte(kvm, pte, false); 440 kvmppc_unmap_free_pte(kvm, pte, false, lpid);
332} 441}
333 442
334static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, 443static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
335 unsigned long gpa) 444 unsigned long gpa, unsigned int lpid)
336{ 445{
337 pmd_t *pmd = pmd_offset(pud, 0); 446 pmd_t *pmd = pmd_offset(pud, 0);
338 447
@@ -342,9 +451,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
342 * so can be freed without flushing the PWC again. 451 * so can be freed without flushing the PWC again.
343 */ 452 */
344 pud_clear(pud); 453 pud_clear(pud);
345 kvmppc_radix_flush_pwc(kvm); 454 kvmppc_radix_flush_pwc(kvm, lpid);
346 455
347 kvmppc_unmap_free_pmd(kvm, pmd, false); 456 kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
348} 457}
349 458
350/* 459/*
@@ -356,8 +465,10 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
356 */ 465 */
357#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED)) 466#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
358 467
359static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, 468int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
360 unsigned int level, unsigned long mmu_seq) 469 unsigned long gpa, unsigned int level,
470 unsigned long mmu_seq, unsigned int lpid,
471 unsigned long *rmapp, struct rmap_nested **n_rmap)
361{ 472{
362 pgd_t *pgd; 473 pgd_t *pgd;
363 pud_t *pud, *new_pud = NULL; 474 pud_t *pud, *new_pud = NULL;
@@ -366,7 +477,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
366 int ret; 477 int ret;
367 478
368 /* Traverse the guest's 2nd-level tree, allocate new levels needed */ 479 /* Traverse the guest's 2nd-level tree, allocate new levels needed */
369 pgd = kvm->arch.pgtable + pgd_index(gpa); 480 pgd = pgtable + pgd_index(gpa);
370 pud = NULL; 481 pud = NULL;
371 if (pgd_present(*pgd)) 482 if (pgd_present(*pgd))
372 pud = pud_offset(pgd, gpa); 483 pud = pud_offset(pgd, gpa);
@@ -423,7 +534,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
423 goto out_unlock; 534 goto out_unlock;
424 } 535 }
425 /* Valid 1GB page here already, remove it */ 536 /* Valid 1GB page here already, remove it */
426 kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT); 537 kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
538 lpid);
427 } 539 }
428 if (level == 2) { 540 if (level == 2) {
429 if (!pud_none(*pud)) { 541 if (!pud_none(*pud)) {
@@ -432,9 +544,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
432 * install a large page, so remove and free the page 544 * install a large page, so remove and free the page
433 * table page. 545 * table page.
434 */ 546 */
435 kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa); 547 kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
436 } 548 }
437 kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte); 549 kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
550 if (rmapp && n_rmap)
551 kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
438 ret = 0; 552 ret = 0;
439 goto out_unlock; 553 goto out_unlock;
440 } 554 }
@@ -458,7 +572,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
458 WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) & 572 WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
459 PTE_BITS_MUST_MATCH); 573 PTE_BITS_MUST_MATCH);
460 kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd), 574 kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
461 0, pte_val(pte), lgpa, PMD_SHIFT); 575 0, pte_val(pte), lgpa, PMD_SHIFT);
462 ret = 0; 576 ret = 0;
463 goto out_unlock; 577 goto out_unlock;
464 } 578 }
@@ -472,7 +586,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
472 goto out_unlock; 586 goto out_unlock;
473 } 587 }
474 /* Valid 2MB page here already, remove it */ 588 /* Valid 2MB page here already, remove it */
475 kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT); 589 kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
590 lpid);
476 } 591 }
477 if (level == 1) { 592 if (level == 1) {
478 if (!pmd_none(*pmd)) { 593 if (!pmd_none(*pmd)) {
@@ -481,9 +596,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
481 * install a large page, so remove and free the page 596 * install a large page, so remove and free the page
482 * table page. 597 * table page.
483 */ 598 */
484 kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa); 599 kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
485 } 600 }
486 kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); 601 kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
602 if (rmapp && n_rmap)
603 kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
487 ret = 0; 604 ret = 0;
488 goto out_unlock; 605 goto out_unlock;
489 } 606 }
@@ -508,6 +625,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
508 goto out_unlock; 625 goto out_unlock;
509 } 626 }
510 kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); 627 kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
628 if (rmapp && n_rmap)
629 kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
511 ret = 0; 630 ret = 0;
512 631
513 out_unlock: 632 out_unlock:
@@ -521,95 +640,49 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
521 return ret; 640 return ret;
522} 641}
523 642
524int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, 643bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
525 unsigned long ea, unsigned long dsisr) 644 unsigned long gpa, unsigned int lpid)
645{
646 unsigned long pgflags;
647 unsigned int shift;
648 pte_t *ptep;
649
650 /*
651 * Need to set an R or C bit in the 2nd-level tables;
652 * since we are just helping out the hardware here,
653 * it is sufficient to do what the hardware does.
654 */
655 pgflags = _PAGE_ACCESSED;
656 if (writing)
657 pgflags |= _PAGE_DIRTY;
658 /*
659 * We are walking the secondary (partition-scoped) page table here.
660 * We can do this without disabling irq because the Linux MM
661 * subsystem doesn't do THP splits and collapses on this tree.
662 */
663 ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
664 if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
665 kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
666 return true;
667 }
668 return false;
669}
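[Editor's note] kvmppc_hv_handle_set_rc() factors out the "just set the reference/change bits" fast path so that both the L1 fault handler and the nested fault handler can reuse it. The sketch below models the same decision in plain C: only a present PTE (and, for writes, a writable one) can be fixed up by setting R/C; otherwise the caller must take the full fault path. The bit names and layout are illustrative, not the real PTE format.

/* Simplified model of "set reference/change bits if the PTE allows it". */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PTE_PRESENT  0x1u
#define PTE_WRITE    0x2u
#define PTE_ACCESSED 0x4u
#define PTE_DIRTY    0x8u

/* Returns true if the fault could be resolved just by setting R/C. */
static bool handle_set_rc(uint32_t *pte, bool writing)
{
	uint32_t flags = PTE_ACCESSED;

	if (writing)
		flags |= PTE_DIRTY;
	if ((*pte & PTE_PRESENT) && (!writing || (*pte & PTE_WRITE))) {
		*pte |= flags;	/* do what the hardware would have done */
		return true;
	}
	return false;		/* caller must take the full fault path */
}

int main(void)
{
	uint32_t pte = PTE_PRESENT | PTE_WRITE;

	printf("resolved=%d pte=%#x\n", handle_set_rc(&pte, true), pte);
	return 0;
}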
670
671int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
672 unsigned long gpa,
673 struct kvm_memory_slot *memslot,
674 bool writing, bool kvm_ro,
675 pte_t *inserted_pte, unsigned int *levelp)
526{ 676{
527 struct kvm *kvm = vcpu->kvm; 677 struct kvm *kvm = vcpu->kvm;
528 unsigned long mmu_seq;
529 unsigned long gpa, gfn, hva;
530 struct kvm_memory_slot *memslot;
531 struct page *page = NULL; 678 struct page *page = NULL;
532 long ret; 679 unsigned long mmu_seq;
533 bool writing; 680 unsigned long hva, gfn = gpa >> PAGE_SHIFT;
534 bool upgrade_write = false; 681 bool upgrade_write = false;
535 bool *upgrade_p = &upgrade_write; 682 bool *upgrade_p = &upgrade_write;
536 pte_t pte, *ptep; 683 pte_t pte, *ptep;
537 unsigned long pgflags;
538 unsigned int shift, level; 684 unsigned int shift, level;
539 685 int ret;
540 /* Check for unusual errors */
541 if (dsisr & DSISR_UNSUPP_MMU) {
542 pr_err("KVM: Got unsupported MMU fault\n");
543 return -EFAULT;
544 }
545 if (dsisr & DSISR_BADACCESS) {
546 /* Reflect to the guest as DSI */
547 pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
548 kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
549 return RESUME_GUEST;
550 }
551
552 /* Translate the logical address and get the page */
553 gpa = vcpu->arch.fault_gpa & ~0xfffUL;
554 gpa &= ~0xF000000000000000ul;
555 gfn = gpa >> PAGE_SHIFT;
556 if (!(dsisr & DSISR_PRTABLE_FAULT))
557 gpa |= ea & 0xfff;
558 memslot = gfn_to_memslot(kvm, gfn);
559
560 /* No memslot means it's an emulated MMIO region */
561 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
562 if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
563 DSISR_SET_RC)) {
564 /*
565 * Bad address in guest page table tree, or other
566 * unusual error - reflect it to the guest as DSI.
567 */
568 kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
569 return RESUME_GUEST;
570 }
571 return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
572 dsisr & DSISR_ISSTORE);
573 }
574
575 writing = (dsisr & DSISR_ISSTORE) != 0;
576 if (memslot->flags & KVM_MEM_READONLY) {
577 if (writing) {
578 /* give the guest a DSI */
579 dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
580 kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
581 return RESUME_GUEST;
582 }
583 upgrade_p = NULL;
584 }
585
586 if (dsisr & DSISR_SET_RC) {
587 /*
588 * Need to set an R or C bit in the 2nd-level tables;
589 * since we are just helping out the hardware here,
590 * it is sufficient to do what the hardware does.
591 */
592 pgflags = _PAGE_ACCESSED;
593 if (writing)
594 pgflags |= _PAGE_DIRTY;
595 /*
596 * We are walking the secondary page table here. We can do this
597 * without disabling irq.
598 */
599 spin_lock(&kvm->mmu_lock);
600 ptep = __find_linux_pte(kvm->arch.pgtable,
601 gpa, NULL, &shift);
602 if (ptep && pte_present(*ptep) &&
603 (!writing || pte_write(*ptep))) {
604 kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
605 gpa, shift);
606 dsisr &= ~DSISR_SET_RC;
607 }
608 spin_unlock(&kvm->mmu_lock);
609 if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
610 DSISR_PROTFAULT | DSISR_SET_RC)))
611 return RESUME_GUEST;
612 }
613 686
614 /* used to check for invalidations in progress */ 687 /* used to check for invalidations in progress */
615 mmu_seq = kvm->mmu_notifier_seq; 688 mmu_seq = kvm->mmu_notifier_seq;
@@ -622,7 +695,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
622 * is that the page is writable. 695 * is that the page is writable.
623 */ 696 */
624 hva = gfn_to_hva_memslot(memslot, gfn); 697 hva = gfn_to_hva_memslot(memslot, gfn);
625 if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) { 698 if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
626 upgrade_write = true; 699 upgrade_write = true;
627 } else { 700 } else {
628 unsigned long pfn; 701 unsigned long pfn;
@@ -680,7 +753,12 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
680 } 753 }
681 754
682 /* Allocate space in the tree and write the PTE */ 755 /* Allocate space in the tree and write the PTE */
683 ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); 756 ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
757 mmu_seq, kvm->arch.lpid, NULL, NULL);
758 if (inserted_pte)
759 *inserted_pte = pte;
760 if (levelp)
761 *levelp = level;
684 762
685 if (page) { 763 if (page) {
686 if (!ret && (pte_val(pte) & _PAGE_WRITE)) 764 if (!ret && (pte_val(pte) & _PAGE_WRITE))
@@ -688,6 +766,82 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
688 put_page(page); 766 put_page(page);
689 } 767 }
690 768
769 return ret;
770}
771
772int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
773 unsigned long ea, unsigned long dsisr)
774{
775 struct kvm *kvm = vcpu->kvm;
776 unsigned long gpa, gfn;
777 struct kvm_memory_slot *memslot;
778 long ret;
779 bool writing = !!(dsisr & DSISR_ISSTORE);
780 bool kvm_ro = false;
781
782 /* Check for unusual errors */
783 if (dsisr & DSISR_UNSUPP_MMU) {
784 pr_err("KVM: Got unsupported MMU fault\n");
785 return -EFAULT;
786 }
787 if (dsisr & DSISR_BADACCESS) {
788 /* Reflect to the guest as DSI */
789 pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
790 kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
791 return RESUME_GUEST;
792 }
793
794 /* Translate the logical address */
795 gpa = vcpu->arch.fault_gpa & ~0xfffUL;
796 gpa &= ~0xF000000000000000ul;
797 gfn = gpa >> PAGE_SHIFT;
798 if (!(dsisr & DSISR_PRTABLE_FAULT))
799 gpa |= ea & 0xfff;
800
801 /* Get the corresponding memslot */
802 memslot = gfn_to_memslot(kvm, gfn);
803
804 /* No memslot means it's an emulated MMIO region */
805 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
806 if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
807 DSISR_SET_RC)) {
808 /*
809 * Bad address in guest page table tree, or other
810 * unusual error - reflect it to the guest as DSI.
811 */
812 kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
813 return RESUME_GUEST;
814 }
815 return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
816 }
817
818 if (memslot->flags & KVM_MEM_READONLY) {
819 if (writing) {
820 /* give the guest a DSI */
821 kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
822 DSISR_PROTFAULT);
823 return RESUME_GUEST;
824 }
825 kvm_ro = true;
826 }
827
828 /* Failed to set the reference/change bits */
829 if (dsisr & DSISR_SET_RC) {
830 spin_lock(&kvm->mmu_lock);
831 if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
832 writing, gpa, kvm->arch.lpid))
833 dsisr &= ~DSISR_SET_RC;
834 spin_unlock(&kvm->mmu_lock);
835
836 if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
837 DSISR_PROTFAULT | DSISR_SET_RC)))
838 return RESUME_GUEST;
839 }
840
841 /* Try to insert a pte */
842 ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
843 kvm_ro, NULL, NULL);
844
691 if (ret == 0 || ret == -EAGAIN) 845 if (ret == 0 || ret == -EAGAIN)
692 ret = RESUME_GUEST; 846 ret = RESUME_GUEST;
693 return ret; 847 return ret;
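[Editor's note] The page-fault handler is now a thin wrapper: it classifies the DSISR bits, handles MMIO and read-only memslots, tries the set-R/C shortcut, and only then calls kvmppc_book3s_instantiate_page() to build the PTE, which the nested fault path can also call directly. A compact model of that ordering follows; the flag values and outcomes are made up for illustration.

/*
 * Stand-alone model (illustrative constants, not the real DSISR layout)
 * of the fault-handling order above: unusual errors first, then MMIO and
 * read-only checks, then the set-R/C shortcut, then page instantiation.
 */
#include <stdbool.h>
#include <stdio.h>

#define F_BADACCESS 0x1u
#define F_ISSTORE   0x2u
#define F_SET_RC    0x4u

static bool try_set_rc(bool writing)
{
	return !writing;	/* pretend read faults can be fixed by R/C alone */
}

static const char *handle_fault(unsigned int flags, bool mmio, bool readonly)
{
	bool writing = flags & F_ISSTORE;

	if (flags & F_BADACCESS)
		return "reflect DSI to guest";
	if (mmio)
		return "emulate MMIO";
	if (readonly && writing)
		return "reflect protection fault";
	if ((flags & F_SET_RC) && try_set_rc(writing))
		return "resume guest (R/C fixed up)";
	return "instantiate page";
}

int main(void)
{
	printf("%s\n", handle_fault(F_SET_RC, false, false));
	printf("%s\n", handle_fault(F_ISSTORE, false, true));
	printf("%s\n", handle_fault(F_ISSTORE, false, false));
	return 0;
}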
@@ -700,20 +854,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
700 pte_t *ptep; 854 pte_t *ptep;
701 unsigned long gpa = gfn << PAGE_SHIFT; 855 unsigned long gpa = gfn << PAGE_SHIFT;
702 unsigned int shift; 856 unsigned int shift;
703 unsigned long old;
704 857
705 ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); 858 ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
706 if (ptep && pte_present(*ptep)) { 859 if (ptep && pte_present(*ptep))
707 old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0, 860 kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
708 gpa, shift); 861 kvm->arch.lpid);
709 kvmppc_radix_tlbie_page(kvm, gpa, shift);
710 if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
711 unsigned long psize = PAGE_SIZE;
712 if (shift)
713 psize = 1ul << shift;
714 kvmppc_update_dirty_map(memslot, gfn, psize);
715 }
716 }
717 return 0; 862 return 0;
718} 863}
719 864
@@ -768,7 +913,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
768 ret = 1 << (shift - PAGE_SHIFT); 913 ret = 1 << (shift - PAGE_SHIFT);
769 kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, 914 kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
770 gpa, shift); 915 gpa, shift);
771 kvmppc_radix_tlbie_page(kvm, gpa, shift); 916 kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
772 } 917 }
773 return ret; 918 return ret;
774} 919}
@@ -853,6 +998,215 @@ static void pmd_ctor(void *addr)
853 memset(addr, 0, RADIX_PMD_TABLE_SIZE); 998 memset(addr, 0, RADIX_PMD_TABLE_SIZE);
854} 999}
855 1000
1001struct debugfs_radix_state {
1002 struct kvm *kvm;
1003 struct mutex mutex;
1004 unsigned long gpa;
1005 int lpid;
1006 int chars_left;
1007 int buf_index;
1008 char buf[128];
1009 u8 hdr;
1010};
1011
1012static int debugfs_radix_open(struct inode *inode, struct file *file)
1013{
1014 struct kvm *kvm = inode->i_private;
1015 struct debugfs_radix_state *p;
1016
1017 p = kzalloc(sizeof(*p), GFP_KERNEL);
1018 if (!p)
1019 return -ENOMEM;
1020
1021 kvm_get_kvm(kvm);
1022 p->kvm = kvm;
1023 mutex_init(&p->mutex);
1024 file->private_data = p;
1025
1026 return nonseekable_open(inode, file);
1027}
1028
1029static int debugfs_radix_release(struct inode *inode, struct file *file)
1030{
1031 struct debugfs_radix_state *p = file->private_data;
1032
1033 kvm_put_kvm(p->kvm);
1034 kfree(p);
1035 return 0;
1036}
1037
1038static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
1039 size_t len, loff_t *ppos)
1040{
1041 struct debugfs_radix_state *p = file->private_data;
1042 ssize_t ret, r;
1043 unsigned long n;
1044 struct kvm *kvm;
1045 unsigned long gpa;
1046 pgd_t *pgt;
1047 struct kvm_nested_guest *nested;
1048 pgd_t pgd, *pgdp;
1049 pud_t pud, *pudp;
1050 pmd_t pmd, *pmdp;
1051 pte_t *ptep;
1052 int shift;
1053 unsigned long pte;
1054
1055 kvm = p->kvm;
1056 if (!kvm_is_radix(kvm))
1057 return 0;
1058
1059 ret = mutex_lock_interruptible(&p->mutex);
1060 if (ret)
1061 return ret;
1062
1063 if (p->chars_left) {
1064 n = p->chars_left;
1065 if (n > len)
1066 n = len;
1067 r = copy_to_user(buf, p->buf + p->buf_index, n);
1068 n -= r;
1069 p->chars_left -= n;
1070 p->buf_index += n;
1071 buf += n;
1072 len -= n;
1073 ret = n;
1074 if (r) {
1075 if (!n)
1076 ret = -EFAULT;
1077 goto out;
1078 }
1079 }
1080
1081 gpa = p->gpa;
1082 nested = NULL;
1083 pgt = NULL;
1084 while (len != 0 && p->lpid >= 0) {
1085 if (gpa >= RADIX_PGTABLE_RANGE) {
1086 gpa = 0;
1087 pgt = NULL;
1088 if (nested) {
1089 kvmhv_put_nested(nested);
1090 nested = NULL;
1091 }
1092 p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
1093 p->hdr = 0;
1094 if (p->lpid < 0)
1095 break;
1096 }
1097 if (!pgt) {
1098 if (p->lpid == 0) {
1099 pgt = kvm->arch.pgtable;
1100 } else {
1101 nested = kvmhv_get_nested(kvm, p->lpid, false);
1102 if (!nested) {
1103 gpa = RADIX_PGTABLE_RANGE;
1104 continue;
1105 }
1106 pgt = nested->shadow_pgtable;
1107 }
1108 }
1109 n = 0;
1110 if (!p->hdr) {
1111 if (p->lpid > 0)
1112 n = scnprintf(p->buf, sizeof(p->buf),
1113 "\nNested LPID %d: ", p->lpid);
1114 n += scnprintf(p->buf + n, sizeof(p->buf) - n,
1115 "pgdir: %lx\n", (unsigned long)pgt);
1116 p->hdr = 1;
1117 goto copy;
1118 }
1119
1120 pgdp = pgt + pgd_index(gpa);
1121 pgd = READ_ONCE(*pgdp);
1122 if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
1123 gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
1124 continue;
1125 }
1126
1127 pudp = pud_offset(&pgd, gpa);
1128 pud = READ_ONCE(*pudp);
1129 if (!(pud_val(pud) & _PAGE_PRESENT)) {
1130 gpa = (gpa & PUD_MASK) + PUD_SIZE;
1131 continue;
1132 }
1133 if (pud_val(pud) & _PAGE_PTE) {
1134 pte = pud_val(pud);
1135 shift = PUD_SHIFT;
1136 goto leaf;
1137 }
1138
1139 pmdp = pmd_offset(&pud, gpa);
1140 pmd = READ_ONCE(*pmdp);
1141 if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
1142 gpa = (gpa & PMD_MASK) + PMD_SIZE;
1143 continue;
1144 }
1145 if (pmd_val(pmd) & _PAGE_PTE) {
1146 pte = pmd_val(pmd);
1147 shift = PMD_SHIFT;
1148 goto leaf;
1149 }
1150
1151 ptep = pte_offset_kernel(&pmd, gpa);
1152 pte = pte_val(READ_ONCE(*ptep));
1153 if (!(pte & _PAGE_PRESENT)) {
1154 gpa += PAGE_SIZE;
1155 continue;
1156 }
1157 shift = PAGE_SHIFT;
1158 leaf:
1159 n = scnprintf(p->buf, sizeof(p->buf),
1160 " %lx: %lx %d\n", gpa, pte, shift);
1161 gpa += 1ul << shift;
1162 copy:
1163 p->chars_left = n;
1164 if (n > len)
1165 n = len;
1166 r = copy_to_user(buf, p->buf, n);
1167 n -= r;
1168 p->chars_left -= n;
1169 p->buf_index = n;
1170 buf += n;
1171 len -= n;
1172 ret += n;
1173 if (r) {
1174 if (!ret)
1175 ret = -EFAULT;
1176 break;
1177 }
1178 }
1179 p->gpa = gpa;
1180 if (nested)
1181 kvmhv_put_nested(nested);
1182
1183 out:
1184 mutex_unlock(&p->mutex);
1185 return ret;
1186}
1187
1188static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
1189 size_t len, loff_t *ppos)
1190{
1191 return -EACCES;
1192}
1193
1194static const struct file_operations debugfs_radix_fops = {
1195 .owner = THIS_MODULE,
1196 .open = debugfs_radix_open,
1197 .release = debugfs_radix_release,
1198 .read = debugfs_radix_read,
1199 .write = debugfs_radix_write,
1200 .llseek = generic_file_llseek,
1201};
1202
1203void kvmhv_radix_debugfs_init(struct kvm *kvm)
1204{
1205 kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
1206 kvm->arch.debugfs_dir, kvm,
1207 &debugfs_radix_fops);
1208}
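[Editor's note] The new debugfs "radix" file walks the L1 partition-scoped table and then each nested LPID's shadow table, printing one "gpa: pte shift" line per leaf mapping. Because a read() may be shorter than a formatted line, the handler keeps the unconsumed tail in p->buf and resumes from p->buf_index on the next call. The fragment below is a stand-alone model of that resumable-buffer pattern; memcpy() stands in for copy_to_user() and the state struct is invented for the example.

/*
 * Illustrative model of the "format a line, hand it out in chunks,
 * remember the leftover" pattern used by debugfs_radix_read().
 */
#include <stdio.h>
#include <string.h>

struct reader_state {
	char buf[128];
	int chars_left;
	int buf_index;
};

/* Copy at most len bytes of pending output into dst, remember the rest. */
static size_t drain(struct reader_state *st, char *dst, size_t len)
{
	size_t n = (size_t)st->chars_left < len ? (size_t)st->chars_left : len;

	memcpy(dst, st->buf + st->buf_index, n);
	st->chars_left -= (int)n;
	st->buf_index += (int)n;
	return n;
}

int main(void)
{
	struct reader_state st = { .chars_left = 0, .buf_index = 0 };
	char out[8];
	size_t n;

	st.chars_left = snprintf(st.buf, sizeof(st.buf), " %lx: %lx %d\n",
				 0x1000ul, 0xdeadbeefUL, 12);
	while ((n = drain(&st, out, sizeof(out) - 1)) > 0) {
		out[n] = '\0';
		printf("chunk: \"%s\"\n", out);
	}
	return 0;
}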
1209
856int kvmppc_radix_init(void) 1210int kvmppc_radix_init(void)
857{ 1211{
858 unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE; 1212 unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 9a3f2646ecc7..c0c64d11cc71 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -363,6 +363,40 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
363 return ret; 363 return ret;
364} 364}
365 365
366static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
367 unsigned long tce)
368{
369 unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
370 enum dma_data_direction dir = iommu_tce_direction(tce);
371 struct kvmppc_spapr_tce_iommu_table *stit;
372 unsigned long ua = 0;
373
374 /* Allow userspace to poison TCE table */
375 if (dir == DMA_NONE)
376 return H_SUCCESS;
377
378 if (iommu_tce_check_gpa(stt->page_shift, gpa))
379 return H_TOO_HARD;
380
381 if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
382 return H_TOO_HARD;
383
384 list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
385 unsigned long hpa = 0;
386 struct mm_iommu_table_group_mem_t *mem;
387 long shift = stit->tbl->it_page_shift;
388
389 mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift);
390 if (!mem)
391 return H_TOO_HARD;
392
393 if (mm_iommu_ua_to_hpa(mem, ua, shift, &hpa))
394 return H_TOO_HARD;
395 }
396
397 return H_SUCCESS;
398}
399
366static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry) 400static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
367{ 401{
368 unsigned long hpa = 0; 402 unsigned long hpa = 0;
@@ -401,7 +435,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
401 long ret; 435 long ret;
402 436
403 if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir))) 437 if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
404 return H_HARDWARE; 438 return H_TOO_HARD;
405 439
406 if (dir == DMA_NONE) 440 if (dir == DMA_NONE)
407 return H_SUCCESS; 441 return H_SUCCESS;
@@ -449,15 +483,15 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
449 return H_TOO_HARD; 483 return H_TOO_HARD;
450 484
451 if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa))) 485 if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
452 return H_HARDWARE; 486 return H_TOO_HARD;
453 487
454 if (mm_iommu_mapped_inc(mem)) 488 if (mm_iommu_mapped_inc(mem))
455 return H_CLOSED; 489 return H_TOO_HARD;
456 490
457 ret = iommu_tce_xchg(tbl, entry, &hpa, &dir); 491 ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
458 if (WARN_ON_ONCE(ret)) { 492 if (WARN_ON_ONCE(ret)) {
459 mm_iommu_mapped_dec(mem); 493 mm_iommu_mapped_dec(mem);
460 return H_HARDWARE; 494 return H_TOO_HARD;
461 } 495 }
462 496
463 if (dir != DMA_NONE) 497 if (dir != DMA_NONE)
@@ -517,8 +551,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
517 551
518 idx = srcu_read_lock(&vcpu->kvm->srcu); 552 idx = srcu_read_lock(&vcpu->kvm->srcu);
519 553
520 if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, 554 if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
521 tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) {
522 ret = H_PARAMETER; 555 ret = H_PARAMETER;
523 goto unlock_exit; 556 goto unlock_exit;
524 } 557 }
@@ -533,14 +566,10 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
533 ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, 566 ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
534 entry, ua, dir); 567 entry, ua, dir);
535 568
536 if (ret == H_SUCCESS) 569 if (ret != H_SUCCESS) {
537 continue; 570 kvmppc_clear_tce(stit->tbl, entry);
538
539 if (ret == H_TOO_HARD)
540 goto unlock_exit; 571 goto unlock_exit;
541 572 }
542 WARN_ON_ONCE(1);
543 kvmppc_clear_tce(stit->tbl, entry);
544 } 573 }
545 574
546 kvmppc_tce_put(stt, entry, tce); 575 kvmppc_tce_put(stt, entry, tce);
@@ -583,7 +612,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
583 return ret; 612 return ret;
584 613
585 idx = srcu_read_lock(&vcpu->kvm->srcu); 614 idx = srcu_read_lock(&vcpu->kvm->srcu);
586 if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { 615 if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
587 ret = H_TOO_HARD; 616 ret = H_TOO_HARD;
588 goto unlock_exit; 617 goto unlock_exit;
589 } 618 }
@@ -599,10 +628,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
599 ret = kvmppc_tce_validate(stt, tce); 628 ret = kvmppc_tce_validate(stt, tce);
600 if (ret != H_SUCCESS) 629 if (ret != H_SUCCESS)
601 goto unlock_exit; 630 goto unlock_exit;
631 }
632
633 for (i = 0; i < npages; ++i) {
634 /*
635 * This looks unsafe, because we validate, then regrab
636 * the TCE from userspace which could have been changed by
637 * another thread.
638 *
639 * But it actually is safe, because the relevant checks will be
640 * re-executed in the following code. If userspace tries to
641 * change this dodgily it will result in a messier failure mode
642 * but won't threaten the host.
643 */
644 if (get_user(tce, tces + i)) {
645 ret = H_TOO_HARD;
646 goto unlock_exit;
647 }
648 tce = be64_to_cpu(tce);
602 649
603 if (kvmppc_gpa_to_ua(vcpu->kvm, 650 if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
604 tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
605 &ua, NULL))
606 return H_PARAMETER; 651 return H_PARAMETER;
607 652
608 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { 653 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -610,14 +655,10 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
610 stit->tbl, entry + i, ua, 655 stit->tbl, entry + i, ua,
611 iommu_tce_direction(tce)); 656 iommu_tce_direction(tce));
612 657
613 if (ret == H_SUCCESS) 658 if (ret != H_SUCCESS) {
614 continue; 659 kvmppc_clear_tce(stit->tbl, entry);
615
616 if (ret == H_TOO_HARD)
617 goto unlock_exit; 660 goto unlock_exit;
618 661 }
619 WARN_ON_ONCE(1);
620 kvmppc_clear_tce(stit->tbl, entry);
621 } 662 }
622 663
623 kvmppc_tce_put(stt, entry + i, tce); 664 kvmppc_tce_put(stt, entry + i, tce);
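[Editor's note] H_PUT_TCE_INDIRECT now validates every TCE in the list before touching any IOMMU table, and only then walks the list again to perform the updates; as the in-code comment above explains, a TCE that changes between the two passes is tolerated because the mapping step re-checks it. A simplified two-pass sketch, with placeholder validate()/apply() helpers and made-up return codes, is shown below.

/*
 * Simplified validate-then-apply sketch mirroring the structure of the
 * indirect H_PUT_TCE path above.  apply() repeats the cheap checks so a
 * concurrent change between the passes degrades gracefully instead of
 * corrupting state.
 */
#include <stdio.h>

#define H_SUCCESS  0
#define H_TOO_HARD (-1)

static int validate(unsigned long tce)
{
	return (tce & 0xfffUL) ? H_TOO_HARD : H_SUCCESS;  /* must be page aligned */
}

static int apply(unsigned long tce)
{
	if (validate(tce) != H_SUCCESS)	/* re-check: list may have changed */
		return H_TOO_HARD;
	printf("mapped tce %#lx\n", tce);
	return H_SUCCESS;
}

static int put_tce_indirect(const unsigned long *tces, int npages)
{
	int i, ret;

	for (i = 0; i < npages; ++i)	/* pass 1: validate everything */
		if ((ret = validate(tces[i])) != H_SUCCESS)
			return ret;
	for (i = 0; i < npages; ++i)	/* pass 2: perform the updates */
		if ((ret = apply(tces[i])) != H_SUCCESS)
			return ret;
	return H_SUCCESS;
}

int main(void)
{
	unsigned long tces[] = { 0x1000, 0x2000, 0x3000 };

	return put_tce_indirect(tces, 3) == H_SUCCESS ? 0 : 1;
}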
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 6821ead4b4eb..ec99363fdf54 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -87,6 +87,7 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm,
87} 87}
88EXPORT_SYMBOL_GPL(kvmppc_find_table); 88EXPORT_SYMBOL_GPL(kvmppc_find_table);
89 89
90#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
90/* 91/*
91 * Validates TCE address. 92 * Validates TCE address.
92 * At the moment flags and page mask are validated. 93 * At the moment flags and page mask are validated.
@@ -94,14 +95,14 @@ EXPORT_SYMBOL_GPL(kvmppc_find_table);
94 * to the table and user space is supposed to process them), we can skip 95 * to the table and user space is supposed to process them), we can skip
95 * checking other things (such as TCE is a guest RAM address or the page 96 * checking other things (such as TCE is a guest RAM address or the page
96 * was actually allocated). 97 * was actually allocated).
97 *
98 * WARNING: This will be called in real-mode on HV KVM and virtual
99 * mode on PR KVM
100 */ 98 */
101long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) 99static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
100 unsigned long tce)
102{ 101{
103 unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE); 102 unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
104 enum dma_data_direction dir = iommu_tce_direction(tce); 103 enum dma_data_direction dir = iommu_tce_direction(tce);
104 struct kvmppc_spapr_tce_iommu_table *stit;
105 unsigned long ua = 0;
105 106
106 /* Allow userspace to poison TCE table */ 107 /* Allow userspace to poison TCE table */
107 if (dir == DMA_NONE) 108 if (dir == DMA_NONE)
@@ -110,9 +111,25 @@ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
110 if (iommu_tce_check_gpa(stt->page_shift, gpa)) 111 if (iommu_tce_check_gpa(stt->page_shift, gpa))
111 return H_PARAMETER; 112 return H_PARAMETER;
112 113
114 if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
115 return H_TOO_HARD;
116
117 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
118 unsigned long hpa = 0;
119 struct mm_iommu_table_group_mem_t *mem;
120 long shift = stit->tbl->it_page_shift;
121
122 mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift);
123 if (!mem)
124 return H_TOO_HARD;
125
126 if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa))
127 return H_TOO_HARD;
128 }
129
113 return H_SUCCESS; 130 return H_SUCCESS;
114} 131}
115EXPORT_SYMBOL_GPL(kvmppc_tce_validate); 132#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
116 133
117/* Note on the use of page_address() in real mode, 134/* Note on the use of page_address() in real mode,
118 * 135 *
@@ -164,10 +181,10 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
164} 181}
165EXPORT_SYMBOL_GPL(kvmppc_tce_put); 182EXPORT_SYMBOL_GPL(kvmppc_tce_put);
166 183
167long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, 184long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
168 unsigned long *ua, unsigned long **prmap) 185 unsigned long *ua, unsigned long **prmap)
169{ 186{
170 unsigned long gfn = gpa >> PAGE_SHIFT; 187 unsigned long gfn = tce >> PAGE_SHIFT;
171 struct kvm_memory_slot *memslot; 188 struct kvm_memory_slot *memslot;
172 189
173 memslot = search_memslots(kvm_memslots(kvm), gfn); 190 memslot = search_memslots(kvm_memslots(kvm), gfn);
@@ -175,7 +192,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
175 return -EINVAL; 192 return -EINVAL;
176 193
177 *ua = __gfn_to_hva_memslot(memslot, gfn) | 194 *ua = __gfn_to_hva_memslot(memslot, gfn) |
178 (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); 195 (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
179 196
180#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 197#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
181 if (prmap) 198 if (prmap)
@@ -184,7 +201,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
184 201
185 return 0; 202 return 0;
186} 203}
187EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); 204EXPORT_SYMBOL_GPL(kvmppc_tce_to_ua);
188 205
189#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 206#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
190static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, 207static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
@@ -300,10 +317,10 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
300 317
301 if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift, 318 if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift,
302 &hpa))) 319 &hpa)))
303 return H_HARDWARE; 320 return H_TOO_HARD;
304 321
305 if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) 322 if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
306 return H_CLOSED; 323 return H_TOO_HARD;
307 324
308 ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); 325 ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
309 if (ret) { 326 if (ret) {
@@ -368,13 +385,12 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
368 if (ret != H_SUCCESS) 385 if (ret != H_SUCCESS)
369 return ret; 386 return ret;
370 387
371 ret = kvmppc_tce_validate(stt, tce); 388 ret = kvmppc_rm_tce_validate(stt, tce);
372 if (ret != H_SUCCESS) 389 if (ret != H_SUCCESS)
373 return ret; 390 return ret;
374 391
375 dir = iommu_tce_direction(tce); 392 dir = iommu_tce_direction(tce);
376 if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, 393 if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
377 tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
378 return H_PARAMETER; 394 return H_PARAMETER;
379 395
380 entry = ioba >> stt->page_shift; 396 entry = ioba >> stt->page_shift;
@@ -387,14 +403,10 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
387 ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, 403 ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,
388 stit->tbl, entry, ua, dir); 404 stit->tbl, entry, ua, dir);
389 405
390 if (ret == H_SUCCESS) 406 if (ret != H_SUCCESS) {
391 continue; 407 kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
392
393 if (ret == H_TOO_HARD)
394 return ret; 408 return ret;
395 409 }
396 WARN_ON_ONCE_RM(1);
397 kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
398 } 410 }
399 411
400 kvmppc_tce_put(stt, entry, tce); 412 kvmppc_tce_put(stt, entry, tce);
@@ -480,7 +492,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
480 */ 492 */
481 struct mm_iommu_table_group_mem_t *mem; 493 struct mm_iommu_table_group_mem_t *mem;
482 494
483 if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) 495 if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL))
484 return H_TOO_HARD; 496 return H_TOO_HARD;
485 497
486 mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); 498 mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
@@ -496,12 +508,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
496 * We do not require memory to be preregistered in this case 508 * We do not require memory to be preregistered in this case
497 * so lock rmap and do __find_linux_pte_or_hugepte(). 509 * so lock rmap and do __find_linux_pte_or_hugepte().
498 */ 510 */
499 if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) 511 if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
500 return H_TOO_HARD; 512 return H_TOO_HARD;
501 513
502 rmap = (void *) vmalloc_to_phys(rmap); 514 rmap = (void *) vmalloc_to_phys(rmap);
503 if (WARN_ON_ONCE_RM(!rmap)) 515 if (WARN_ON_ONCE_RM(!rmap))
504 return H_HARDWARE; 516 return H_TOO_HARD;
505 517
506 /* 518 /*
507 * Synchronize with the MMU notifier callbacks in 519 * Synchronize with the MMU notifier callbacks in
@@ -521,14 +533,16 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
521 for (i = 0; i < npages; ++i) { 533 for (i = 0; i < npages; ++i) {
522 unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); 534 unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
523 535
524 ret = kvmppc_tce_validate(stt, tce); 536 ret = kvmppc_rm_tce_validate(stt, tce);
525 if (ret != H_SUCCESS) 537 if (ret != H_SUCCESS)
526 goto unlock_exit; 538 goto unlock_exit;
539 }
540
541 for (i = 0; i < npages; ++i) {
542 unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
527 543
528 ua = 0; 544 ua = 0;
529 if (kvmppc_gpa_to_ua(vcpu->kvm, 545 if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
530 tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
531 &ua, NULL))
532 return H_PARAMETER; 546 return H_PARAMETER;
533 547
534 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { 548 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -536,14 +550,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
536 stit->tbl, entry + i, ua, 550 stit->tbl, entry + i, ua,
537 iommu_tce_direction(tce)); 551 iommu_tce_direction(tce));
538 552
539 if (ret == H_SUCCESS) 553 if (ret != H_SUCCESS) {
540 continue; 554 kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl,
541 555 entry);
542 if (ret == H_TOO_HARD)
543 goto unlock_exit; 556 goto unlock_exit;
544 557 }
545 WARN_ON_ONCE_RM(1);
546 kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
547 } 558 }
548 559
549 kvmppc_tce_put(stt, entry + i, tce); 560 kvmppc_tce_put(stt, entry + i, tce);
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 36b11c5a0dbb..8c7e933e942e 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -36,7 +36,6 @@
36#define OP_31_XOP_MTSR 210 36#define OP_31_XOP_MTSR 210
37#define OP_31_XOP_MTSRIN 242 37#define OP_31_XOP_MTSRIN 242
38#define OP_31_XOP_TLBIEL 274 38#define OP_31_XOP_TLBIEL 274
39#define OP_31_XOP_TLBIE 306
40/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */ 39/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
41#define OP_31_XOP_FAKE_SC1 308 40#define OP_31_XOP_FAKE_SC1 308
42#define OP_31_XOP_SLBMTE 402 41#define OP_31_XOP_SLBMTE 402
@@ -110,7 +109,7 @@ static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu)
110 vcpu->arch.ctr_tm = vcpu->arch.regs.ctr; 109 vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
111 vcpu->arch.tar_tm = vcpu->arch.tar; 110 vcpu->arch.tar_tm = vcpu->arch.tar;
112 vcpu->arch.lr_tm = vcpu->arch.regs.link; 111 vcpu->arch.lr_tm = vcpu->arch.regs.link;
113 vcpu->arch.cr_tm = vcpu->arch.cr; 112 vcpu->arch.cr_tm = vcpu->arch.regs.ccr;
114 vcpu->arch.xer_tm = vcpu->arch.regs.xer; 113 vcpu->arch.xer_tm = vcpu->arch.regs.xer;
115 vcpu->arch.vrsave_tm = vcpu->arch.vrsave; 114 vcpu->arch.vrsave_tm = vcpu->arch.vrsave;
116} 115}
@@ -129,7 +128,7 @@ static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu)
129 vcpu->arch.regs.ctr = vcpu->arch.ctr_tm; 128 vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
130 vcpu->arch.tar = vcpu->arch.tar_tm; 129 vcpu->arch.tar = vcpu->arch.tar_tm;
131 vcpu->arch.regs.link = vcpu->arch.lr_tm; 130 vcpu->arch.regs.link = vcpu->arch.lr_tm;
132 vcpu->arch.cr = vcpu->arch.cr_tm; 131 vcpu->arch.regs.ccr = vcpu->arch.cr_tm;
133 vcpu->arch.regs.xer = vcpu->arch.xer_tm; 132 vcpu->arch.regs.xer = vcpu->arch.xer_tm;
134 vcpu->arch.vrsave = vcpu->arch.vrsave_tm; 133 vcpu->arch.vrsave = vcpu->arch.vrsave_tm;
135} 134}
@@ -141,7 +140,7 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val)
141 uint64_t texasr; 140 uint64_t texasr;
142 141
143 /* CR0 = 0 | MSR[TS] | 0 */ 142 /* CR0 = 0 | MSR[TS] | 0 */
144 vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) | 143 vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
145 (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1)) 144 (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
146 << CR0_SHIFT); 145 << CR0_SHIFT);
147 146
@@ -220,7 +219,7 @@ void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
220 tm_abort(ra_val); 219 tm_abort(ra_val);
221 220
222 /* CR0 = 0 | MSR[TS] | 0 */ 221 /* CR0 = 0 | MSR[TS] | 0 */
223 vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) | 222 vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
224 (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1)) 223 (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
225 << CR0_SHIFT); 224 << CR0_SHIFT);
226 225
@@ -494,8 +493,8 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
494 493
495 if (!(kvmppc_get_msr(vcpu) & MSR_PR)) { 494 if (!(kvmppc_get_msr(vcpu) & MSR_PR)) {
496 preempt_disable(); 495 preempt_disable();
497 vcpu->arch.cr = (CR0_TBEGIN_FAILURE | 496 vcpu->arch.regs.ccr = (CR0_TBEGIN_FAILURE |
498 (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT))); 497 (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)));
499 498
500 vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT | 499 vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT |
501 (((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT)) 500 (((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT))
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 3e3a71594e63..788bc61bd08c 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -50,6 +50,7 @@
50#include <asm/reg.h> 50#include <asm/reg.h>
51#include <asm/ppc-opcode.h> 51#include <asm/ppc-opcode.h>
52#include <asm/asm-prototypes.h> 52#include <asm/asm-prototypes.h>
53#include <asm/archrandom.h>
53#include <asm/debug.h> 54#include <asm/debug.h>
54#include <asm/disassemble.h> 55#include <asm/disassemble.h>
55#include <asm/cputable.h> 56#include <asm/cputable.h>
@@ -104,6 +105,10 @@ static bool indep_threads_mode = true;
104module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR); 105module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
105MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)"); 106MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
106 107
108static bool one_vm_per_core;
109module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
110MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)");
111
107#ifdef CONFIG_KVM_XICS 112#ifdef CONFIG_KVM_XICS
108static struct kernel_param_ops module_param_ops = { 113static struct kernel_param_ops module_param_ops = {
109 .set = param_set_int, 114 .set = param_set_int,
@@ -117,6 +122,16 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644);
117MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); 122MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
118#endif 123#endif
119 124
125/* If set, guests are allowed to create and control nested guests */
126static bool nested = true;
127module_param(nested, bool, S_IRUGO | S_IWUSR);
128MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
129
130static inline bool nesting_enabled(struct kvm *kvm)
131{
132 return kvm->arch.nested_enable && kvm_is_radix(kvm);
133}
134
120/* If set, the threads on each CPU core have to be in the same MMU mode */ 135/* If set, the threads on each CPU core have to be in the same MMU mode */
121static bool no_mixing_hpt_and_radix; 136static bool no_mixing_hpt_and_radix;
122 137
@@ -173,6 +188,10 @@ static bool kvmppc_ipi_thread(int cpu)
173{ 188{
174 unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); 189 unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
175 190
191 /* If we're a nested hypervisor, fall back to ordinary IPIs for now */
192 if (kvmhv_on_pseries())
193 return false;
194
176 /* On POWER9 we can use msgsnd to IPI any cpu */ 195 /* On POWER9 we can use msgsnd to IPI any cpu */
177 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 196 if (cpu_has_feature(CPU_FTR_ARCH_300)) {
178 msg |= get_hard_smp_processor_id(cpu); 197 msg |= get_hard_smp_processor_id(cpu);
@@ -410,8 +429,8 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
410 vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1); 429 vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
411 pr_err("sprg2 = %.16llx sprg3 = %.16llx\n", 430 pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
412 vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3); 431 vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
413 pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n", 432 pr_err("cr = %.8lx xer = %.16lx dsisr = %.8x\n",
414 vcpu->arch.cr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr); 433 vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
415 pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar); 434 pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
416 pr_err("fault dar = %.16lx dsisr = %.8x\n", 435 pr_err("fault dar = %.16lx dsisr = %.8x\n",
417 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); 436 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
@@ -730,8 +749,7 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
730 /* 749 /*
731 * Ensure that the read of vcore->dpdes comes after the read 750 * Ensure that the read of vcore->dpdes comes after the read
732 * of vcpu->doorbell_request. This barrier matches the 751 * of vcpu->doorbell_request. This barrier matches the
733 * lwsync in book3s_hv_rmhandlers.S just before the 752 * smb_wmb() in kvmppc_guest_entry_inject().
734 * fast_guest_return label.
735 */ 753 */
736 smp_rmb(); 754 smp_rmb();
737 vc = vcpu->arch.vcore; 755 vc = vcpu->arch.vcore;
@@ -912,6 +930,19 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
912 break; 930 break;
913 } 931 }
914 return RESUME_HOST; 932 return RESUME_HOST;
933 case H_SET_DABR:
934 ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
935 break;
936 case H_SET_XDABR:
937 ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
938 kvmppc_get_gpr(vcpu, 5));
939 break;
940 case H_GET_TCE:
941 ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
942 kvmppc_get_gpr(vcpu, 5));
943 if (ret == H_TOO_HARD)
944 return RESUME_HOST;
945 break;
915 case H_PUT_TCE: 946 case H_PUT_TCE:
916 ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4), 947 ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
917 kvmppc_get_gpr(vcpu, 5), 948 kvmppc_get_gpr(vcpu, 5),
@@ -935,6 +966,32 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
935 if (ret == H_TOO_HARD) 966 if (ret == H_TOO_HARD)
936 return RESUME_HOST; 967 return RESUME_HOST;
937 break; 968 break;
969 case H_RANDOM:
970 if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
971 ret = H_HARDWARE;
972 break;
973
974 case H_SET_PARTITION_TABLE:
975 ret = H_FUNCTION;
976 if (nesting_enabled(vcpu->kvm))
977 ret = kvmhv_set_partition_table(vcpu);
978 break;
979 case H_ENTER_NESTED:
980 ret = H_FUNCTION;
981 if (!nesting_enabled(vcpu->kvm))
982 break;
983 ret = kvmhv_enter_nested_guest(vcpu);
984 if (ret == H_INTERRUPT) {
985 kvmppc_set_gpr(vcpu, 3, 0);
986 return -EINTR;
987 }
988 break;
989 case H_TLB_INVALIDATE:
990 ret = H_FUNCTION;
991 if (nesting_enabled(vcpu->kvm))
992 ret = kvmhv_do_nested_tlbie(vcpu);
993 break;
994
938 default: 995 default:
939 return RESUME_HOST; 996 return RESUME_HOST;
940 } 997 }
@@ -943,6 +1000,24 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
943 return RESUME_GUEST; 1000 return RESUME_GUEST;
944} 1001}
945 1002
1003/*
1004 * Handle H_CEDE in the nested virtualization case where we haven't
1005 * called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
1006 * This has to be done early, not in kvmppc_pseries_do_hcall(), so
1007 * that the cede logic in kvmppc_run_single_vcpu() works properly.
1008 */
1009static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
1010{
1011 vcpu->arch.shregs.msr |= MSR_EE;
1012 vcpu->arch.ceded = 1;
1013 smp_mb();
1014 if (vcpu->arch.prodded) {
1015 vcpu->arch.prodded = 0;
1016 smp_mb();
1017 vcpu->arch.ceded = 0;
1018 }
1019}
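[Editor's note] kvmppc_nested_cede() has to order its stores carefully: it marks the vCPU as ceded, then, after a full barrier, checks whether another thread has already prodded it and cancels the cede if so. The tiny stand-alone model below uses C11 sequentially-consistent atomics in place of smp_mb(); it only illustrates why the "check prodded after setting ceded" ordering matters and is not kernel code.

/*
 * Model of the cede/prod handshake above.  If the prod arrives before
 * the cede, the exchange observes it and the cede is undone, so the
 * wakeup request is never lost.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool ceded;
static atomic_bool prodded;

static void do_cede(void)
{
	atomic_store(&ceded, true);		/* announce we are idle */
	if (atomic_exchange(&prodded, false))	/* did someone wake us already? */
		atomic_store(&ceded, false);	/* then don't go idle after all */
}

static void do_prod(void)
{
	atomic_store(&prodded, true);		/* request a wakeup */
	if (atomic_load(&ceded))
		printf("target already ceded: kick it\n");
}

int main(void)
{
	do_prod();	/* prod arrives first ... */
	do_cede();	/* ... so the cede is cancelled */
	printf("ceded=%d prodded=%d\n", atomic_load(&ceded), atomic_load(&prodded));
	return 0;
}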
1020
946static int kvmppc_hcall_impl_hv(unsigned long cmd) 1021static int kvmppc_hcall_impl_hv(unsigned long cmd)
947{ 1022{
948 switch (cmd) { 1023 switch (cmd) {
@@ -1085,7 +1160,6 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
1085 return RESUME_GUEST; 1160 return RESUME_GUEST;
1086} 1161}
1087 1162
1088/* Called with vcpu->arch.vcore->lock held */
1089static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, 1163static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
1090 struct task_struct *tsk) 1164 struct task_struct *tsk)
1091{ 1165{
@@ -1190,7 +1264,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
1190 break; 1264 break;
1191 case BOOK3S_INTERRUPT_H_INST_STORAGE: 1265 case BOOK3S_INTERRUPT_H_INST_STORAGE:
1192 vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); 1266 vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
1193 vcpu->arch.fault_dsisr = 0; 1267 vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
1268 DSISR_SRR1_MATCH_64S;
1269 if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
1270 vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
1194 r = RESUME_PAGE_FAULT; 1271 r = RESUME_PAGE_FAULT;
1195 break; 1272 break;
1196 /* 1273 /*
@@ -1206,10 +1283,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
1206 swab32(vcpu->arch.emul_inst) : 1283 swab32(vcpu->arch.emul_inst) :
1207 vcpu->arch.emul_inst; 1284 vcpu->arch.emul_inst;
1208 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) { 1285 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
1209 /* Need vcore unlocked to call kvmppc_get_last_inst */
1210 spin_unlock(&vcpu->arch.vcore->lock);
1211 r = kvmppc_emulate_debug_inst(run, vcpu); 1286 r = kvmppc_emulate_debug_inst(run, vcpu);
1212 spin_lock(&vcpu->arch.vcore->lock);
1213 } else { 1287 } else {
1214 kvmppc_core_queue_program(vcpu, SRR1_PROGILL); 1288 kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
1215 r = RESUME_GUEST; 1289 r = RESUME_GUEST;
@@ -1225,12 +1299,8 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
1225 case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: 1299 case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
1226 r = EMULATE_FAIL; 1300 r = EMULATE_FAIL;
1227 if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) && 1301 if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
1228 cpu_has_feature(CPU_FTR_ARCH_300)) { 1302 cpu_has_feature(CPU_FTR_ARCH_300))
1229 /* Need vcore unlocked to call kvmppc_get_last_inst */
1230 spin_unlock(&vcpu->arch.vcore->lock);
1231 r = kvmppc_emulate_doorbell_instr(vcpu); 1303 r = kvmppc_emulate_doorbell_instr(vcpu);
1232 spin_lock(&vcpu->arch.vcore->lock);
1233 }
1234 if (r == EMULATE_FAIL) { 1304 if (r == EMULATE_FAIL) {
1235 kvmppc_core_queue_program(vcpu, SRR1_PROGILL); 1305 kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
1236 r = RESUME_GUEST; 1306 r = RESUME_GUEST;
@@ -1265,6 +1335,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
1265 return r; 1335 return r;
1266} 1336}
1267 1337
1338static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
1339{
1340 int r;
1341 int srcu_idx;
1342
1343 vcpu->stat.sum_exits++;
1344
1345 /*
1346 * This can happen if an interrupt occurs in the last stages
1347 * of guest entry or the first stages of guest exit (i.e. after
1348 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
1349 * and before setting it to KVM_GUEST_MODE_HOST_HV).
1350 * That can happen due to a bug, or due to a machine check
1351 * occurring at just the wrong time.
1352 */
1353 if (vcpu->arch.shregs.msr & MSR_HV) {
1354 pr_emerg("KVM trap in HV mode while nested!\n");
1355 pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
1356 vcpu->arch.trap, kvmppc_get_pc(vcpu),
1357 vcpu->arch.shregs.msr);
1358 kvmppc_dump_regs(vcpu);
1359 return RESUME_HOST;
1360 }
1361 switch (vcpu->arch.trap) {
1362 /* We're good on these - the host merely wanted to get our attention */
1363 case BOOK3S_INTERRUPT_HV_DECREMENTER:
1364 vcpu->stat.dec_exits++;
1365 r = RESUME_GUEST;
1366 break;
1367 case BOOK3S_INTERRUPT_EXTERNAL:
1368 vcpu->stat.ext_intr_exits++;
1369 r = RESUME_HOST;
1370 break;
1371 case BOOK3S_INTERRUPT_H_DOORBELL:
1372 case BOOK3S_INTERRUPT_H_VIRT:
1373 vcpu->stat.ext_intr_exits++;
1374 r = RESUME_GUEST;
1375 break;
1376 /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
1377 case BOOK3S_INTERRUPT_HMI:
1378 case BOOK3S_INTERRUPT_PERFMON:
1379 case BOOK3S_INTERRUPT_SYSTEM_RESET:
1380 r = RESUME_GUEST;
1381 break;
1382 case BOOK3S_INTERRUPT_MACHINE_CHECK:
1383 /* Pass the machine check to the L1 guest */
1384 r = RESUME_HOST;
1385 /* Print the MCE event to host console. */
1386 machine_check_print_event_info(&vcpu->arch.mce_evt, false);
1387 break;
1388 /*
1389 * We get these next two if the guest accesses a page which it thinks
1390 * it has mapped but which is not actually present, either because
1391 * it is for an emulated I/O device or because the corresponding
1392 * host page has been paged out.
1393 */
1394 case BOOK3S_INTERRUPT_H_DATA_STORAGE:
1395 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1396 r = kvmhv_nested_page_fault(vcpu);
1397 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
1398 break;
1399 case BOOK3S_INTERRUPT_H_INST_STORAGE:
1400 vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
1401 vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
1402 DSISR_SRR1_MATCH_64S;
1403 if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
1404 vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
1405 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1406 r = kvmhv_nested_page_fault(vcpu);
1407 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
1408 break;
1409
1410#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1411 case BOOK3S_INTERRUPT_HV_SOFTPATCH:
1412 /*
1413 * This occurs for various TM-related instructions that
1414 * we need to emulate on POWER9 DD2.2. We have already
1415 * handled the cases where the guest was in real-suspend
1416 * mode and was transitioning to transactional state.
1417 */
1418 r = kvmhv_p9_tm_emulation(vcpu);
1419 break;
1420#endif
1421
1422 case BOOK3S_INTERRUPT_HV_RM_HARD:
1423 vcpu->arch.trap = 0;
1424 r = RESUME_GUEST;
1425 if (!xive_enabled())
1426 kvmppc_xics_rm_complete(vcpu, 0);
1427 break;
1428 default:
1429 r = RESUME_HOST;
1430 break;
1431 }
1432
1433 return r;
1434}
1435
1268static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, 1436static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
1269 struct kvm_sregs *sregs) 1437 struct kvm_sregs *sregs)
1270{ 1438{
@@ -1555,6 +1723,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
1555 case KVM_REG_PPC_ONLINE: 1723 case KVM_REG_PPC_ONLINE:
1556 *val = get_reg_val(id, vcpu->arch.online); 1724 *val = get_reg_val(id, vcpu->arch.online);
1557 break; 1725 break;
1726 case KVM_REG_PPC_PTCR:
1727 *val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
1728 break;
1558 default: 1729 default:
1559 r = -EINVAL; 1730 r = -EINVAL;
1560 break; 1731 break;
@@ -1786,6 +1957,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
1786 atomic_dec(&vcpu->arch.vcore->online_count); 1957 atomic_dec(&vcpu->arch.vcore->online_count);
1787 vcpu->arch.online = i; 1958 vcpu->arch.online = i;
1788 break; 1959 break;
1960 case KVM_REG_PPC_PTCR:
1961 vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
1962 break;
1789 default: 1963 default:
1790 r = -EINVAL; 1964 r = -EINVAL;
1791 break; 1965 break;
@@ -2019,15 +2193,18 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
2019 * Set the default HFSCR for the guest from the host value. 2193 * Set the default HFSCR for the guest from the host value.
2020 * This value is only used on POWER9. 2194 * This value is only used on POWER9.
2021 * On POWER9, we want to virtualize the doorbell facility, so we 2195 * On POWER9, we want to virtualize the doorbell facility, so we
2022 * turn off the HFSCR bit, which causes those instructions to trap. 2196 * don't set the HFSCR_MSGP bit, and that causes those instructions
2197 * to trap and then we emulate them.
2023 */ 2198 */
2024 vcpu->arch.hfscr = mfspr(SPRN_HFSCR); 2199 vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
2025 if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) 2200 HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP;
2201 if (cpu_has_feature(CPU_FTR_HVMODE)) {
2202 vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
2203 if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
2204 vcpu->arch.hfscr |= HFSCR_TM;
2205 }
2206 if (cpu_has_feature(CPU_FTR_TM_COMP))
2026 vcpu->arch.hfscr |= HFSCR_TM; 2207 vcpu->arch.hfscr |= HFSCR_TM;
2027 else if (!cpu_has_feature(CPU_FTR_TM_COMP))
2028 vcpu->arch.hfscr &= ~HFSCR_TM;
2029 if (cpu_has_feature(CPU_FTR_ARCH_300))
2030 vcpu->arch.hfscr &= ~HFSCR_MSGP;
2031 2208
2032 kvmppc_mmu_book3s_hv_init(vcpu); 2209 kvmppc_mmu_book3s_hv_init(vcpu);
2033 2210
@@ -2242,10 +2419,18 @@ static void kvmppc_release_hwthread(int cpu)
2242 2419
2243static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) 2420static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
2244{ 2421{
2422 struct kvm_nested_guest *nested = vcpu->arch.nested;
2423 cpumask_t *cpu_in_guest;
2245 int i; 2424 int i;
2246 2425
2247 cpu = cpu_first_thread_sibling(cpu); 2426 cpu = cpu_first_thread_sibling(cpu);
2248 cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); 2427 if (nested) {
2428 cpumask_set_cpu(cpu, &nested->need_tlb_flush);
2429 cpu_in_guest = &nested->cpu_in_guest;
2430 } else {
2431 cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
2432 cpu_in_guest = &kvm->arch.cpu_in_guest;
2433 }
2249 /* 2434 /*
2250 * Make sure setting of bit in need_tlb_flush precedes 2435 * Make sure setting of bit in need_tlb_flush precedes
2251 * testing of cpu_in_guest bits. The matching barrier on 2436 * testing of cpu_in_guest bits. The matching barrier on
@@ -2253,13 +2438,23 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
2253 */ 2438 */
2254 smp_mb(); 2439 smp_mb();
2255 for (i = 0; i < threads_per_core; ++i) 2440 for (i = 0; i < threads_per_core; ++i)
2256 if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest)) 2441 if (cpumask_test_cpu(cpu + i, cpu_in_guest))
2257 smp_call_function_single(cpu + i, do_nothing, NULL, 1); 2442 smp_call_function_single(cpu + i, do_nothing, NULL, 1);
2258} 2443}
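(The empty smp_call_function_single(do_nothing) IPI above appears to serve only to bounce any sibling thread that is currently executing guest code back out to the host; the need_tlb_flush bit set before the barrier then ensures the stale translations for this LPID are flushed before the guest next runs on that core.)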
2259 2444
2260static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) 2445static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
2261{ 2446{
2447 struct kvm_nested_guest *nested = vcpu->arch.nested;
2262 struct kvm *kvm = vcpu->kvm; 2448 struct kvm *kvm = vcpu->kvm;
2449 int prev_cpu;
2450
2451 if (!cpu_has_feature(CPU_FTR_HVMODE))
2452 return;
2453
2454 if (nested)
2455 prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
2456 else
2457 prev_cpu = vcpu->arch.prev_cpu;
2263 2458
2264 /* 2459 /*
2265 * With radix, the guest can do TLB invalidations itself, 2460 * With radix, the guest can do TLB invalidations itself,
@@ -2273,12 +2468,46 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
2273 * ran to flush the TLB. The TLB is shared between threads, 2468 * ran to flush the TLB. The TLB is shared between threads,
2274 * so we use a single bit in .need_tlb_flush for all 4 threads. 2469 * so we use a single bit in .need_tlb_flush for all 4 threads.
2275 */ 2470 */
2276 if (vcpu->arch.prev_cpu != pcpu) { 2471 if (prev_cpu != pcpu) {
2277 if (vcpu->arch.prev_cpu >= 0 && 2472 if (prev_cpu >= 0 &&
2278 cpu_first_thread_sibling(vcpu->arch.prev_cpu) != 2473 cpu_first_thread_sibling(prev_cpu) !=
2279 cpu_first_thread_sibling(pcpu)) 2474 cpu_first_thread_sibling(pcpu))
2280 radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); 2475 radix_flush_cpu(kvm, prev_cpu, vcpu);
2281 vcpu->arch.prev_cpu = pcpu; 2476 if (nested)
2477 nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
2478 else
2479 vcpu->arch.prev_cpu = pcpu;
2480 }
2481}
2482
2483static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu,
2484 struct kvm_nested_guest *nested)
2485{
2486 cpumask_t *need_tlb_flush;
2487 int lpid;
2488
2489 if (!cpu_has_feature(CPU_FTR_HVMODE))
2490 return;
2491
2492 if (cpu_has_feature(CPU_FTR_ARCH_300))
2493 pcpu &= ~0x3UL;
2494
2495 if (nested) {
2496 lpid = nested->shadow_lpid;
2497 need_tlb_flush = &nested->need_tlb_flush;
2498 } else {
2499 lpid = kvm->arch.lpid;
2500 need_tlb_flush = &kvm->arch.need_tlb_flush;
2501 }
2502
2503 mtspr(SPRN_LPID, lpid);
2504 isync();
2505 smp_mb();
2506
2507 if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
2508 radix__local_flush_tlb_lpid_guest(lpid);
2509 /* Clear the bit after the TLB flush */
2510 cpumask_clear_cpu(pcpu, need_tlb_flush);
2282 } 2511 }
2283} 2512}
2284 2513
@@ -2493,6 +2722,10 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
2493 if (!cpu_has_feature(CPU_FTR_ARCH_207S)) 2722 if (!cpu_has_feature(CPU_FTR_ARCH_207S))
2494 return false; 2723 return false;
2495 2724
2725 /* In one_vm_per_core mode, require all vcores to be from the same vm */
2726 if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
2727 return false;
2728
2496 /* Some POWER9 chips require all threads to be in the same MMU mode */ 2729 /* Some POWER9 chips require all threads to be in the same MMU mode */
2497 if (no_mixing_hpt_and_radix && 2730 if (no_mixing_hpt_and_radix &&
2498 kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) 2731 kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
@@ -2600,6 +2833,14 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
2600 spin_lock(&vc->lock); 2833 spin_lock(&vc->lock);
2601 now = get_tb(); 2834 now = get_tb();
2602 for_each_runnable_thread(i, vcpu, vc) { 2835 for_each_runnable_thread(i, vcpu, vc) {
2836 /*
2837 * It's safe to unlock the vcore in the loop here, because
2838 * for_each_runnable_thread() is safe against removal of
2839 * the vcpu, and the vcore state is VCORE_EXITING here,
2840 * so any vcpus becoming runnable will have their arch.trap
2841 * set to zero and can't actually run in the guest.
2842 */
2843 spin_unlock(&vc->lock);
2603 /* cancel pending dec exception if dec is positive */ 2844 /* cancel pending dec exception if dec is positive */
2604 if (now < vcpu->arch.dec_expires && 2845 if (now < vcpu->arch.dec_expires &&
2605 kvmppc_core_pending_dec(vcpu)) 2846 kvmppc_core_pending_dec(vcpu))
@@ -2615,6 +2856,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
2615 vcpu->arch.ret = ret; 2856 vcpu->arch.ret = ret;
2616 vcpu->arch.trap = 0; 2857 vcpu->arch.trap = 0;
2617 2858
2859 spin_lock(&vc->lock);
2618 if (is_kvmppc_resume_guest(vcpu->arch.ret)) { 2860 if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
2619 if (vcpu->arch.pending_exceptions) 2861 if (vcpu->arch.pending_exceptions)
2620 kvmppc_core_prepare_to_enter(vcpu); 2862 kvmppc_core_prepare_to_enter(vcpu);
@@ -2963,8 +3205,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2963 spin_unlock(&core_info.vc[sub]->lock); 3205 spin_unlock(&core_info.vc[sub]->lock);
2964 3206
2965 if (kvm_is_radix(vc->kvm)) { 3207 if (kvm_is_radix(vc->kvm)) {
2966 int tmp = pcpu;
2967
2968 /* 3208 /*
2969 * Do we need to flush the process scoped TLB for the LPAR? 3209 * Do we need to flush the process scoped TLB for the LPAR?
2970 * 3210 *
@@ -2975,17 +3215,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2975 * 3215 *
2976 * Hash must be flushed in realmode in order to use tlbiel. 3216 * Hash must be flushed in realmode in order to use tlbiel.
2977 */ 3217 */
2978 mtspr(SPRN_LPID, vc->kvm->arch.lpid); 3218 kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL);
2979 isync();
2980
2981 if (cpu_has_feature(CPU_FTR_ARCH_300))
2982 tmp &= ~0x3UL;
2983
2984 if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
2985 radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
2986 /* Clear the bit after the TLB flush */
2987 cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
2988 }
2989 } 3219 }
2990 3220
2991 /* 3221 /*
@@ -3080,6 +3310,300 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
3080} 3310}
3081 3311
3082/* 3312/*
3313 * Load up hypervisor-mode registers on P9.
3314 */
3315static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
3316 unsigned long lpcr)
3317{
3318 struct kvmppc_vcore *vc = vcpu->arch.vcore;
3319 s64 hdec;
3320 u64 tb, purr, spurr;
3321 int trap;
3322 unsigned long host_hfscr = mfspr(SPRN_HFSCR);
3323 unsigned long host_ciabr = mfspr(SPRN_CIABR);
3324 unsigned long host_dawr = mfspr(SPRN_DAWR);
3325 unsigned long host_dawrx = mfspr(SPRN_DAWRX);
3326 unsigned long host_psscr = mfspr(SPRN_PSSCR);
3327 unsigned long host_pidr = mfspr(SPRN_PID);
3328
3329 hdec = time_limit - mftb();
3330 if (hdec < 0)
3331 return BOOK3S_INTERRUPT_HV_DECREMENTER;
3332 mtspr(SPRN_HDEC, hdec);
3333
3334 if (vc->tb_offset) {
3335 u64 new_tb = mftb() + vc->tb_offset;
3336 mtspr(SPRN_TBU40, new_tb);
3337 tb = mftb();
3338 if ((tb & 0xffffff) < (new_tb & 0xffffff))
3339 mtspr(SPRN_TBU40, new_tb + 0x1000000);
3340 vc->tb_offset_applied = vc->tb_offset;
3341 }
3342
3343 if (vc->pcr)
3344 mtspr(SPRN_PCR, vc->pcr);
3345 mtspr(SPRN_DPDES, vc->dpdes);
3346 mtspr(SPRN_VTB, vc->vtb);
3347
3348 local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
3349 local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
3350 mtspr(SPRN_PURR, vcpu->arch.purr);
3351 mtspr(SPRN_SPURR, vcpu->arch.spurr);
3352
3353 if (cpu_has_feature(CPU_FTR_DAWR)) {
3354 mtspr(SPRN_DAWR, vcpu->arch.dawr);
3355 mtspr(SPRN_DAWRX, vcpu->arch.dawrx);
3356 }
3357 mtspr(SPRN_CIABR, vcpu->arch.ciabr);
3358 mtspr(SPRN_IC, vcpu->arch.ic);
3359 mtspr(SPRN_PID, vcpu->arch.pid);
3360
3361 mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
3362 (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
3363
3364 mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
3365
3366 mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
3367 mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
3368 mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
3369 mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
3370
3371 mtspr(SPRN_AMOR, ~0UL);
3372
3373 mtspr(SPRN_LPCR, lpcr);
3374 isync();
3375
3376 kvmppc_xive_push_vcpu(vcpu);
3377
3378 mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
3379 mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
3380
3381 trap = __kvmhv_vcpu_entry_p9(vcpu);
3382
3383 /* Advance host PURR/SPURR by the amount used by guest */
3384 purr = mfspr(SPRN_PURR);
3385 spurr = mfspr(SPRN_SPURR);
3386 mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
3387 purr - vcpu->arch.purr);
3388 mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
3389 spurr - vcpu->arch.spurr);
3390 vcpu->arch.purr = purr;
3391 vcpu->arch.spurr = spurr;
3392
3393 vcpu->arch.ic = mfspr(SPRN_IC);
3394 vcpu->arch.pid = mfspr(SPRN_PID);
3395 vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
3396
3397 vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
3398 vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
3399 vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
3400 vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
3401
3402 mtspr(SPRN_PSSCR, host_psscr);
3403 mtspr(SPRN_HFSCR, host_hfscr);
3404 mtspr(SPRN_CIABR, host_ciabr);
3405 mtspr(SPRN_DAWR, host_dawr);
3406 mtspr(SPRN_DAWRX, host_dawrx);
3407 mtspr(SPRN_PID, host_pidr);
3408
3409 /*
 3410	 * Since this is radix, do an eieio; tlbsync; ptesync sequence in
3411 * case we interrupted the guest between a tlbie and a ptesync.
3412 */
3413 asm volatile("eieio; tlbsync; ptesync");
3414
3415 mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid); /* restore host LPID */
3416 isync();
3417
3418 vc->dpdes = mfspr(SPRN_DPDES);
3419 vc->vtb = mfspr(SPRN_VTB);
3420 mtspr(SPRN_DPDES, 0);
3421 if (vc->pcr)
3422 mtspr(SPRN_PCR, 0);
3423
3424 if (vc->tb_offset_applied) {
3425 u64 new_tb = mftb() - vc->tb_offset_applied;
3426 mtspr(SPRN_TBU40, new_tb);
3427 tb = mftb();
3428 if ((tb & 0xffffff) < (new_tb & 0xffffff))
3429 mtspr(SPRN_TBU40, new_tb + 0x1000000);
3430 vc->tb_offset_applied = 0;
3431 }
3432
3433 mtspr(SPRN_HDEC, 0x7fffffff);
3434 mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
3435
3436 return trap;
3437}
3438
3439/*
3440 * Virtual-mode guest entry for POWER9 and later when the host and
3441 * guest are both using the radix MMU. The LPIDR has already been set.
3442 */
3443int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
3444 unsigned long lpcr)
3445{
3446 struct kvmppc_vcore *vc = vcpu->arch.vcore;
3447 unsigned long host_dscr = mfspr(SPRN_DSCR);
3448 unsigned long host_tidr = mfspr(SPRN_TIDR);
3449 unsigned long host_iamr = mfspr(SPRN_IAMR);
3450 s64 dec;
3451 u64 tb;
3452 int trap, save_pmu;
3453
3454 dec = mfspr(SPRN_DEC);
3455 tb = mftb();
3456 if (dec < 512)
3457 return BOOK3S_INTERRUPT_HV_DECREMENTER;
3458 local_paca->kvm_hstate.dec_expires = dec + tb;
3459 if (local_paca->kvm_hstate.dec_expires < time_limit)
3460 time_limit = local_paca->kvm_hstate.dec_expires;
3461
3462 vcpu->arch.ceded = 0;
3463
3464 kvmhv_save_host_pmu(); /* saves it to PACA kvm_hstate */
3465
3466 kvmppc_subcore_enter_guest();
3467
3468 vc->entry_exit_map = 1;
3469 vc->in_guest = 1;
3470
3471 if (vcpu->arch.vpa.pinned_addr) {
3472 struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
3473 u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
3474 lp->yield_count = cpu_to_be32(yield_count);
3475 vcpu->arch.vpa.dirty = 1;
3476 }
3477
3478 if (cpu_has_feature(CPU_FTR_TM) ||
3479 cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
3480 kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
3481
3482 kvmhv_load_guest_pmu(vcpu);
3483
3484 msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
3485 load_fp_state(&vcpu->arch.fp);
3486#ifdef CONFIG_ALTIVEC
3487 load_vr_state(&vcpu->arch.vr);
3488#endif
3489
3490 mtspr(SPRN_DSCR, vcpu->arch.dscr);
3491 mtspr(SPRN_IAMR, vcpu->arch.iamr);
3492 mtspr(SPRN_PSPB, vcpu->arch.pspb);
3493 mtspr(SPRN_FSCR, vcpu->arch.fscr);
3494 mtspr(SPRN_TAR, vcpu->arch.tar);
3495 mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
3496 mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
3497 mtspr(SPRN_BESCR, vcpu->arch.bescr);
3498 mtspr(SPRN_WORT, vcpu->arch.wort);
3499 mtspr(SPRN_TIDR, vcpu->arch.tid);
3500 mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
3501 mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
3502 mtspr(SPRN_AMR, vcpu->arch.amr);
3503 mtspr(SPRN_UAMOR, vcpu->arch.uamor);
3504
3505 if (!(vcpu->arch.ctrl & 1))
3506 mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
3507
3508 mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
3509
3510 if (kvmhv_on_pseries()) {
3511 /* call our hypervisor to load up HV regs and go */
3512 struct hv_guest_state hvregs;
3513
3514 kvmhv_save_hv_regs(vcpu, &hvregs);
3515 hvregs.lpcr = lpcr;
3516 vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
3517 hvregs.version = HV_GUEST_STATE_VERSION;
3518 if (vcpu->arch.nested) {
3519 hvregs.lpid = vcpu->arch.nested->shadow_lpid;
3520 hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
3521 } else {
3522 hvregs.lpid = vcpu->kvm->arch.lpid;
3523 hvregs.vcpu_token = vcpu->vcpu_id;
3524 }
3525 hvregs.hdec_expiry = time_limit;
3526 trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
3527 __pa(&vcpu->arch.regs));
3528 kvmhv_restore_hv_return_state(vcpu, &hvregs);
3529 vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
3530 vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
3531 vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
3532
3533 /* H_CEDE has to be handled now, not later */
3534 if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
3535 kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
3536 kvmppc_nested_cede(vcpu);
3537 trap = 0;
3538 }
3539 } else {
3540 trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
3541 }
3542
3543 vcpu->arch.slb_max = 0;
3544 dec = mfspr(SPRN_DEC);
3545 tb = mftb();
3546 vcpu->arch.dec_expires = dec + tb;
3547 vcpu->cpu = -1;
3548 vcpu->arch.thread_cpu = -1;
3549 vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
3550
3551 vcpu->arch.iamr = mfspr(SPRN_IAMR);
3552 vcpu->arch.pspb = mfspr(SPRN_PSPB);
3553 vcpu->arch.fscr = mfspr(SPRN_FSCR);
3554 vcpu->arch.tar = mfspr(SPRN_TAR);
3555 vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
3556 vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
3557 vcpu->arch.bescr = mfspr(SPRN_BESCR);
3558 vcpu->arch.wort = mfspr(SPRN_WORT);
3559 vcpu->arch.tid = mfspr(SPRN_TIDR);
3560 vcpu->arch.amr = mfspr(SPRN_AMR);
3561 vcpu->arch.uamor = mfspr(SPRN_UAMOR);
3562 vcpu->arch.dscr = mfspr(SPRN_DSCR);
3563
3564 mtspr(SPRN_PSPB, 0);
3565 mtspr(SPRN_WORT, 0);
3566 mtspr(SPRN_AMR, 0);
3567 mtspr(SPRN_UAMOR, 0);
3568 mtspr(SPRN_DSCR, host_dscr);
3569 mtspr(SPRN_TIDR, host_tidr);
3570 mtspr(SPRN_IAMR, host_iamr);
3571 mtspr(SPRN_PSPB, 0);
3572
3573 msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
3574 store_fp_state(&vcpu->arch.fp);
3575#ifdef CONFIG_ALTIVEC
3576 store_vr_state(&vcpu->arch.vr);
3577#endif
3578
3579 if (cpu_has_feature(CPU_FTR_TM) ||
3580 cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
3581 kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
3582
3583 save_pmu = 1;
3584 if (vcpu->arch.vpa.pinned_addr) {
3585 struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
3586 u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
3587 lp->yield_count = cpu_to_be32(yield_count);
3588 vcpu->arch.vpa.dirty = 1;
3589 save_pmu = lp->pmcregs_in_use;
3590 }
3591
3592 kvmhv_save_guest_pmu(vcpu, save_pmu);
3593
3594 vc->entry_exit_map = 0x101;
3595 vc->in_guest = 0;
3596
3597 mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
3598
3599 kvmhv_load_host_pmu();
3600
3601 kvmppc_subcore_exit_guest();
3602
3603 return trap;
3604}
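kvmhv_p9_guest_entry() is the single virtual-mode entry path for radix-on-radix guests: on a bare-metal (HV-mode) host it programs the hypervisor SPRs itself via kvmhv_load_hv_regs_and_go(), while when KVM itself is running as an L1 guest (kvmhv_on_pseries()) it packages the same state into a struct hv_guest_state and asks the L0 hypervisor to perform the switch with the H_ENTER_NESTED hcall.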
3605
3606/*
3083 * Wait for some other vcpu thread to execute us, and 3607 * Wait for some other vcpu thread to execute us, and
3084 * wake us up when we need to handle something in the host. 3608 * wake us up when we need to handle something in the host.
3085 */ 3609 */
@@ -3256,6 +3780,11 @@ out:
3256 trace_kvmppc_vcore_wakeup(do_sleep, block_ns); 3780 trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
3257} 3781}
3258 3782
3783/*
3784 * This never fails for a radix guest, as none of the operations it does
3785 * for a radix guest can fail or have a way to report failure.
3786 * kvmhv_run_single_vcpu() relies on this fact.
3787 */
3259static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) 3788static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
3260{ 3789{
3261 int r = 0; 3790 int r = 0;
@@ -3405,6 +3934,171 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3405 return vcpu->arch.ret; 3934 return vcpu->arch.ret;
3406} 3935}
3407 3936
3937int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
3938 struct kvm_vcpu *vcpu, u64 time_limit,
3939 unsigned long lpcr)
3940{
3941 int trap, r, pcpu;
3942 int srcu_idx;
3943 struct kvmppc_vcore *vc;
3944 struct kvm *kvm = vcpu->kvm;
3945 struct kvm_nested_guest *nested = vcpu->arch.nested;
3946
3947 trace_kvmppc_run_vcpu_enter(vcpu);
3948
3949 kvm_run->exit_reason = 0;
3950 vcpu->arch.ret = RESUME_GUEST;
3951 vcpu->arch.trap = 0;
3952
3953 vc = vcpu->arch.vcore;
3954 vcpu->arch.ceded = 0;
3955 vcpu->arch.run_task = current;
3956 vcpu->arch.kvm_run = kvm_run;
3957 vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
3958 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
3959 vcpu->arch.busy_preempt = TB_NIL;
3960 vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
3961 vc->runnable_threads[0] = vcpu;
3962 vc->n_runnable = 1;
3963 vc->runner = vcpu;
3964
3965 /* See if the MMU is ready to go */
3966 if (!kvm->arch.mmu_ready)
3967 kvmhv_setup_mmu(vcpu);
3968
3969 if (need_resched())
3970 cond_resched();
3971
3972 kvmppc_update_vpas(vcpu);
3973
3974 init_vcore_to_run(vc);
3975 vc->preempt_tb = TB_NIL;
3976
3977 preempt_disable();
3978 pcpu = smp_processor_id();
3979 vc->pcpu = pcpu;
3980 kvmppc_prepare_radix_vcpu(vcpu, pcpu);
3981
3982 local_irq_disable();
3983 hard_irq_disable();
3984 if (signal_pending(current))
3985 goto sigpend;
3986 if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
3987 goto out;
3988
3989 if (!nested) {
3990 kvmppc_core_prepare_to_enter(vcpu);
3991 if (vcpu->arch.doorbell_request) {
3992 vc->dpdes = 1;
3993 smp_wmb();
3994 vcpu->arch.doorbell_request = 0;
3995 }
3996 if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
3997 &vcpu->arch.pending_exceptions))
3998 lpcr |= LPCR_MER;
3999 } else if (vcpu->arch.pending_exceptions ||
4000 vcpu->arch.doorbell_request ||
4001 xive_interrupt_pending(vcpu)) {
4002 vcpu->arch.ret = RESUME_HOST;
4003 goto out;
4004 }
4005
4006 kvmppc_clear_host_core(pcpu);
4007
4008 local_paca->kvm_hstate.tid = 0;
4009 local_paca->kvm_hstate.napping = 0;
4010 local_paca->kvm_hstate.kvm_split_mode = NULL;
4011 kvmppc_start_thread(vcpu, vc);
4012 kvmppc_create_dtl_entry(vcpu, vc);
4013 trace_kvm_guest_enter(vcpu);
4014
4015 vc->vcore_state = VCORE_RUNNING;
4016 trace_kvmppc_run_core(vc, 0);
4017
4018 if (cpu_has_feature(CPU_FTR_HVMODE))
4019 kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested);
4020
4021 trace_hardirqs_on();
4022 guest_enter_irqoff();
4023
4024 srcu_idx = srcu_read_lock(&kvm->srcu);
4025
4026 this_cpu_disable_ftrace();
4027
4028 trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr);
4029 vcpu->arch.trap = trap;
4030
4031 this_cpu_enable_ftrace();
4032
4033 srcu_read_unlock(&kvm->srcu, srcu_idx);
4034
4035 if (cpu_has_feature(CPU_FTR_HVMODE)) {
4036 mtspr(SPRN_LPID, kvm->arch.host_lpid);
4037 isync();
4038 }
4039
4040 trace_hardirqs_off();
4041 set_irq_happened(trap);
4042
4043 kvmppc_set_host_core(pcpu);
4044
4045 local_irq_enable();
4046 guest_exit();
4047
4048 cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
4049
4050 preempt_enable();
4051
4052 /* cancel pending decrementer exception if DEC is now positive */
4053 if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
4054 kvmppc_core_dequeue_dec(vcpu);
4055
4056 trace_kvm_guest_exit(vcpu);
4057 r = RESUME_GUEST;
4058 if (trap) {
4059 if (!nested)
4060 r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
4061 else
4062 r = kvmppc_handle_nested_exit(vcpu);
4063 }
4064 vcpu->arch.ret = r;
4065
4066 if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
4067 !kvmppc_vcpu_woken(vcpu)) {
4068 kvmppc_set_timer(vcpu);
4069 while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) {
4070 if (signal_pending(current)) {
4071 vcpu->stat.signal_exits++;
4072 kvm_run->exit_reason = KVM_EXIT_INTR;
4073 vcpu->arch.ret = -EINTR;
4074 break;
4075 }
4076 spin_lock(&vc->lock);
4077 kvmppc_vcore_blocked(vc);
4078 spin_unlock(&vc->lock);
4079 }
4080 }
4081 vcpu->arch.ceded = 0;
4082
4083 vc->vcore_state = VCORE_INACTIVE;
4084 trace_kvmppc_run_core(vc, 1);
4085
4086 done:
4087 kvmppc_remove_runnable(vc, vcpu);
4088 trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
4089
4090 return vcpu->arch.ret;
4091
4092 sigpend:
4093 vcpu->stat.signal_exits++;
4094 kvm_run->exit_reason = KVM_EXIT_INTR;
4095 vcpu->arch.ret = -EINTR;
4096 out:
4097 local_irq_enable();
4098 preempt_enable();
4099 goto done;
4100}
4101
3408static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) 4102static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
3409{ 4103{
3410 int r; 4104 int r;
@@ -3480,7 +4174,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
3480 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 4174 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
3481 4175
3482 do { 4176 do {
3483 r = kvmppc_run_vcpu(run, vcpu); 4177 if (kvm->arch.threads_indep && kvm_is_radix(kvm))
4178 r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0,
4179 vcpu->arch.vcore->lpcr);
4180 else
4181 r = kvmppc_run_vcpu(run, vcpu);
3484 4182
3485 if (run->exit_reason == KVM_EXIT_PAPR_HCALL && 4183 if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
3486 !(vcpu->arch.shregs.msr & MSR_PR)) { 4184 !(vcpu->arch.shregs.msr & MSR_PR)) {
@@ -3559,6 +4257,10 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
3559 kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01); 4257 kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
3560 kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L); 4258 kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
3561 4259
4260 /* If running as a nested hypervisor, we don't support HPT guests */
4261 if (kvmhv_on_pseries())
4262 info->flags |= KVM_PPC_NO_HASH;
4263
3562 return 0; 4264 return 0;
3563} 4265}
3564 4266
@@ -3723,8 +4425,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm)
3723 __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE; 4425 __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
3724 dw1 = PATB_GR | kvm->arch.process_table; 4426 dw1 = PATB_GR | kvm->arch.process_table;
3725 } 4427 }
3726 4428 kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
3727 mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
3728} 4429}
3729 4430
3730/* 4431/*
@@ -3820,6 +4521,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
3820/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ 4521/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
3821int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) 4522int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
3822{ 4523{
4524 if (nesting_enabled(kvm))
4525 kvmhv_release_all_nested(kvm);
3823 kvmppc_free_radix(kvm); 4526 kvmppc_free_radix(kvm);
3824 kvmppc_update_lpcr(kvm, LPCR_VPM1, 4527 kvmppc_update_lpcr(kvm, LPCR_VPM1,
3825 LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); 4528 LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
@@ -3841,6 +4544,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
3841 kvmppc_free_hpt(&kvm->arch.hpt); 4544 kvmppc_free_hpt(&kvm->arch.hpt);
3842 kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR, 4545 kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
3843 LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); 4546 LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
4547 kvmppc_rmap_reset(kvm);
3844 kvm->arch.radix = 1; 4548 kvm->arch.radix = 1;
3845 return 0; 4549 return 0;
3846} 4550}
@@ -3940,6 +4644,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
3940 4644
3941 kvmppc_alloc_host_rm_ops(); 4645 kvmppc_alloc_host_rm_ops();
3942 4646
4647 kvmhv_vm_nested_init(kvm);
4648
3943 /* 4649 /*
3944 * Since we don't flush the TLB when tearing down a VM, 4650 * Since we don't flush the TLB when tearing down a VM,
3945 * and this lpid might have previously been used, 4651 * and this lpid might have previously been used,
@@ -3958,9 +4664,13 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
3958 kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); 4664 kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
3959 4665
3960 /* Init LPCR for virtual RMA mode */ 4666 /* Init LPCR for virtual RMA mode */
3961 kvm->arch.host_lpid = mfspr(SPRN_LPID); 4667 if (cpu_has_feature(CPU_FTR_HVMODE)) {
3962 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); 4668 kvm->arch.host_lpid = mfspr(SPRN_LPID);
3963 lpcr &= LPCR_PECE | LPCR_LPES; 4669 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
4670 lpcr &= LPCR_PECE | LPCR_LPES;
4671 } else {
4672 lpcr = 0;
4673 }
3964 lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | 4674 lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
3965 LPCR_VPM0 | LPCR_VPM1; 4675 LPCR_VPM0 | LPCR_VPM1;
3966 kvm->arch.vrma_slb_v = SLB_VSID_B_1T | 4676 kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
@@ -4027,8 +4737,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
4027 * On POWER9, we only need to do this if the "indep_threads_mode" 4737 * On POWER9, we only need to do this if the "indep_threads_mode"
4028 * module parameter has been set to N. 4738 * module parameter has been set to N.
4029 */ 4739 */
4030 if (cpu_has_feature(CPU_FTR_ARCH_300)) 4740 if (cpu_has_feature(CPU_FTR_ARCH_300)) {
4031 kvm->arch.threads_indep = indep_threads_mode; 4741 if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
4742 pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
4743 kvm->arch.threads_indep = true;
4744 } else {
4745 kvm->arch.threads_indep = indep_threads_mode;
4746 }
4747 }
4032 if (!kvm->arch.threads_indep) 4748 if (!kvm->arch.threads_indep)
4033 kvm_hv_vm_activated(); 4749 kvm_hv_vm_activated();
4034 4750
@@ -4051,6 +4767,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
4051 snprintf(buf, sizeof(buf), "vm%d", current->pid); 4767 snprintf(buf, sizeof(buf), "vm%d", current->pid);
4052 kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir); 4768 kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
4053 kvmppc_mmu_debugfs_init(kvm); 4769 kvmppc_mmu_debugfs_init(kvm);
4770 if (radix_enabled())
4771 kvmhv_radix_debugfs_init(kvm);
4054 4772
4055 return 0; 4773 return 0;
4056} 4774}
@@ -4073,13 +4791,21 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
4073 4791
4074 kvmppc_free_vcores(kvm); 4792 kvmppc_free_vcores(kvm);
4075 4793
4076 kvmppc_free_lpid(kvm->arch.lpid);
4077 4794
4078 if (kvm_is_radix(kvm)) 4795 if (kvm_is_radix(kvm))
4079 kvmppc_free_radix(kvm); 4796 kvmppc_free_radix(kvm);
4080 else 4797 else
4081 kvmppc_free_hpt(&kvm->arch.hpt); 4798 kvmppc_free_hpt(&kvm->arch.hpt);
4082 4799
4800 /* Perform global invalidation and return lpid to the pool */
4801 if (cpu_has_feature(CPU_FTR_ARCH_300)) {
4802 if (nesting_enabled(kvm))
4803 kvmhv_release_all_nested(kvm);
4804 kvm->arch.process_table = 0;
4805 kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
4806 }
4807 kvmppc_free_lpid(kvm->arch.lpid);
4808
4083 kvmppc_free_pimap(kvm); 4809 kvmppc_free_pimap(kvm);
4084} 4810}
4085 4811
@@ -4104,11 +4830,15 @@ static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
4104 4830
4105static int kvmppc_core_check_processor_compat_hv(void) 4831static int kvmppc_core_check_processor_compat_hv(void)
4106{ 4832{
4107 if (!cpu_has_feature(CPU_FTR_HVMODE) || 4833 if (cpu_has_feature(CPU_FTR_HVMODE) &&
4108 !cpu_has_feature(CPU_FTR_ARCH_206)) 4834 cpu_has_feature(CPU_FTR_ARCH_206))
4109 return -EIO; 4835 return 0;
4110 4836
4111 return 0; 4837 /* POWER9 in radix mode is capable of being a nested hypervisor. */
4838 if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
4839 return 0;
4840
4841 return -EIO;
4112} 4842}
4113 4843
4114#ifdef CONFIG_KVM_XICS 4844#ifdef CONFIG_KVM_XICS
@@ -4426,6 +5156,10 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
4426 if (radix && !radix_enabled()) 5156 if (radix && !radix_enabled())
4427 return -EINVAL; 5157 return -EINVAL;
4428 5158
5159 /* If we're a nested hypervisor, we currently only support radix */
5160 if (kvmhv_on_pseries() && !radix)
5161 return -EINVAL;
5162
4429 mutex_lock(&kvm->lock); 5163 mutex_lock(&kvm->lock);
4430 if (radix != kvm_is_radix(kvm)) { 5164 if (radix != kvm_is_radix(kvm)) {
4431 if (kvm->arch.mmu_ready) { 5165 if (kvm->arch.mmu_ready) {
@@ -4458,6 +5192,19 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
4458 return err; 5192 return err;
4459} 5193}
4460 5194
5195static int kvmhv_enable_nested(struct kvm *kvm)
5196{
5197 if (!nested)
5198 return -EPERM;
5199 if (!cpu_has_feature(CPU_FTR_ARCH_300))
5200 return -ENODEV;
5201
5202 /* kvm == NULL means the caller is testing if the capability exists */
5203 if (kvm)
5204 kvm->arch.nested_enable = true;
5205 return 0;
5206}
5207
4461static struct kvmppc_ops kvm_ops_hv = { 5208static struct kvmppc_ops kvm_ops_hv = {
4462 .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, 5209 .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
4463 .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, 5210 .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -4497,6 +5244,7 @@ static struct kvmppc_ops kvm_ops_hv = {
4497 .configure_mmu = kvmhv_configure_mmu, 5244 .configure_mmu = kvmhv_configure_mmu,
4498 .get_rmmu_info = kvmhv_get_rmmu_info, 5245 .get_rmmu_info = kvmhv_get_rmmu_info,
4499 .set_smt_mode = kvmhv_set_smt_mode, 5246 .set_smt_mode = kvmhv_set_smt_mode,
5247 .enable_nested = kvmhv_enable_nested,
4500}; 5248};
4501 5249
4502static int kvm_init_subcore_bitmap(void) 5250static int kvm_init_subcore_bitmap(void)
@@ -4547,6 +5295,10 @@ static int kvmppc_book3s_init_hv(void)
4547 if (r < 0) 5295 if (r < 0)
4548 return -ENODEV; 5296 return -ENODEV;
4549 5297
5298 r = kvmhv_nested_init();
5299 if (r)
5300 return r;
5301
4550 r = kvm_init_subcore_bitmap(); 5302 r = kvm_init_subcore_bitmap();
4551 if (r) 5303 if (r)
4552 return r; 5304 return r;
@@ -4557,7 +5309,8 @@ static int kvmppc_book3s_init_hv(void)
4557 * indirectly, via OPAL. 5309 * indirectly, via OPAL.
4558 */ 5310 */
4559#ifdef CONFIG_SMP 5311#ifdef CONFIG_SMP
4560 if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) { 5312 if (!xive_enabled() && !kvmhv_on_pseries() &&
5313 !local_paca->kvm_hstate.xics_phys) {
4561 struct device_node *np; 5314 struct device_node *np;
4562 5315
4563 np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc"); 5316 np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
@@ -4605,6 +5358,7 @@ static void kvmppc_book3s_exit_hv(void)
4605 if (kvmppc_radix_possible()) 5358 if (kvmppc_radix_possible())
4606 kvmppc_radix_exit(); 5359 kvmppc_radix_exit();
4607 kvmppc_hv_ops = NULL; 5360 kvmppc_hv_ops = NULL;
5361 kvmhv_nested_exit();
4608} 5362}
4609 5363
4610module_init(kvmppc_book3s_init_hv); 5364module_init(kvmppc_book3s_init_hv);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index fc6bb9630a9c..a71e2fc00a4e 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -231,6 +231,15 @@ void kvmhv_rm_send_ipi(int cpu)
231 void __iomem *xics_phys; 231 void __iomem *xics_phys;
232 unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); 232 unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
233 233
234 /* For a nested hypervisor, use the XICS via hcall */
235 if (kvmhv_on_pseries()) {
236 unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
237
238 plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu),
239 IPI_PRIORITY);
240 return;
241 }
242
234 /* On POWER9 we can use msgsnd for any destination cpu. */ 243 /* On POWER9 we can use msgsnd for any destination cpu. */
235 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 244 if (cpu_has_feature(CPU_FTR_ARCH_300)) {
236 msg |= get_hard_smp_processor_id(cpu); 245 msg |= get_hard_smp_processor_id(cpu);
@@ -460,12 +469,19 @@ static long kvmppc_read_one_intr(bool *again)
460 return 1; 469 return 1;
461 470
462 /* Now read the interrupt from the ICP */ 471 /* Now read the interrupt from the ICP */
463 xics_phys = local_paca->kvm_hstate.xics_phys; 472 if (kvmhv_on_pseries()) {
464 rc = 0; 473 unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
465 if (!xics_phys) 474
466 rc = opal_int_get_xirr(&xirr, false); 475 rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF);
467 else 476 xirr = cpu_to_be32(retbuf[0]);
468 xirr = __raw_rm_readl(xics_phys + XICS_XIRR); 477 } else {
478 xics_phys = local_paca->kvm_hstate.xics_phys;
479 rc = 0;
480 if (!xics_phys)
481 rc = opal_int_get_xirr(&xirr, false);
482 else
483 xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
484 }
469 if (rc < 0) 485 if (rc < 0)
470 return 1; 486 return 1;
471 487
@@ -494,7 +510,13 @@ static long kvmppc_read_one_intr(bool *again)
494 */ 510 */
495 if (xisr == XICS_IPI) { 511 if (xisr == XICS_IPI) {
496 rc = 0; 512 rc = 0;
497 if (xics_phys) { 513 if (kvmhv_on_pseries()) {
514 unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
515
516 plpar_hcall_raw(H_IPI, retbuf,
517 hard_smp_processor_id(), 0xff);
518 plpar_hcall_raw(H_EOI, retbuf, h_xirr);
519 } else if (xics_phys) {
498 __raw_rm_writeb(0xff, xics_phys + XICS_MFRR); 520 __raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
499 __raw_rm_writel(xirr, xics_phys + XICS_XIRR); 521 __raw_rm_writel(xirr, xics_phys + XICS_XIRR);
500 } else { 522 } else {
@@ -520,7 +542,13 @@ static long kvmppc_read_one_intr(bool *again)
520 /* We raced with the host, 542 /* We raced with the host,
521 * we need to resend that IPI, bummer 543 * we need to resend that IPI, bummer
522 */ 544 */
523 if (xics_phys) 545 if (kvmhv_on_pseries()) {
546 unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
547
548 plpar_hcall_raw(H_IPI, retbuf,
549 hard_smp_processor_id(),
550 IPI_PRIORITY);
551 } else if (xics_phys)
524 __raw_rm_writeb(IPI_PRIORITY, 552 __raw_rm_writeb(IPI_PRIORITY,
525 xics_phys + XICS_MFRR); 553 xics_phys + XICS_MFRR);
526 else 554 else
@@ -729,3 +757,51 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
729 smp_mb(); 757 smp_mb();
730 local_paca->kvm_hstate.kvm_split_mode = NULL; 758 local_paca->kvm_hstate.kvm_split_mode = NULL;
731} 759}
760
761/*
762 * Is there a PRIV_DOORBELL pending for the guest (on POWER9)?
763 * Can we inject a Decrementer or a External interrupt?
764 */
765void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
766{
767 int ext;
768 unsigned long vec = 0;
769 unsigned long lpcr;
770
771 /* Insert EXTERNAL bit into LPCR at the MER bit position */
772 ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1;
773 lpcr = mfspr(SPRN_LPCR);
774 lpcr |= ext << LPCR_MER_SH;
775 mtspr(SPRN_LPCR, lpcr);
776 isync();
777
778 if (vcpu->arch.shregs.msr & MSR_EE) {
779 if (ext) {
780 vec = BOOK3S_INTERRUPT_EXTERNAL;
781 } else {
782 long int dec = mfspr(SPRN_DEC);
783 if (!(lpcr & LPCR_LD))
784 dec = (int) dec;
785 if (dec < 0)
786 vec = BOOK3S_INTERRUPT_DECREMENTER;
787 }
788 }
789 if (vec) {
790 unsigned long msr, old_msr = vcpu->arch.shregs.msr;
791
792 kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu));
793 kvmppc_set_srr1(vcpu, old_msr);
794 kvmppc_set_pc(vcpu, vec);
795 msr = vcpu->arch.intr_msr;
796 if (MSR_TM_ACTIVE(old_msr))
797 msr |= MSR_TS_S;
798 vcpu->arch.shregs.msr = msr;
799 }
800
801 if (vcpu->arch.doorbell_request) {
802 mtspr(SPRN_DPDES, 1);
803 vcpu->arch.vcore->dpdes = 1;
804 smp_wmb();
805 vcpu->arch.doorbell_request = 0;
806 }
807}
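A note on the (int) cast above: when LPCR[LD] (large decrementer) is clear, the decrementer behaves as a 32-bit register, so the value read with mfspr has to be sign-extended from 32 bits before the "dec < 0" test; with LD set, the full 64-bit value is already correctly signed.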
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 666b91c79eb4..a6d10010d9e8 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -64,52 +64,7 @@ BEGIN_FTR_SECTION
64END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) 64END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
65 65
66 /* Save host PMU registers */ 66 /* Save host PMU registers */
67BEGIN_FTR_SECTION 67 bl kvmhv_save_host_pmu
68 /* Work around P8 PMAE bug */
69 li r3, -1
70 clrrdi r3, r3, 10
71 mfspr r8, SPRN_MMCR2
72 mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */
73 isync
74END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
75 li r3, 1
76 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
77 mfspr r7, SPRN_MMCR0 /* save MMCR0 */
78 mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */
79 mfspr r6, SPRN_MMCRA
80 /* Clear MMCRA in order to disable SDAR updates */
81 li r5, 0
82 mtspr SPRN_MMCRA, r5
83 isync
84 lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? */
85 cmpwi r5, 0
86 beq 31f /* skip if not */
87 mfspr r5, SPRN_MMCR1
88 mfspr r9, SPRN_SIAR
89 mfspr r10, SPRN_SDAR
90 std r7, HSTATE_MMCR0(r13)
91 std r5, HSTATE_MMCR1(r13)
92 std r6, HSTATE_MMCRA(r13)
93 std r9, HSTATE_SIAR(r13)
94 std r10, HSTATE_SDAR(r13)
95BEGIN_FTR_SECTION
96 mfspr r9, SPRN_SIER
97 std r8, HSTATE_MMCR2(r13)
98 std r9, HSTATE_SIER(r13)
99END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
100 mfspr r3, SPRN_PMC1
101 mfspr r5, SPRN_PMC2
102 mfspr r6, SPRN_PMC3
103 mfspr r7, SPRN_PMC4
104 mfspr r8, SPRN_PMC5
105 mfspr r9, SPRN_PMC6
106 stw r3, HSTATE_PMC1(r13)
107 stw r5, HSTATE_PMC2(r13)
108 stw r6, HSTATE_PMC3(r13)
109 stw r7, HSTATE_PMC4(r13)
110 stw r8, HSTATE_PMC5(r13)
111 stw r9, HSTATE_PMC6(r13)
11231:
113 68
114 /* 69 /*
115 * Put whatever is in the decrementer into the 70 * Put whatever is in the decrementer into the
@@ -161,3 +116,51 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
161 ld r0, PPC_LR_STKOFF(r1) 116 ld r0, PPC_LR_STKOFF(r1)
162 mtlr r0 117 mtlr r0
163 blr 118 blr
119
120_GLOBAL(kvmhv_save_host_pmu)
121BEGIN_FTR_SECTION
122 /* Work around P8 PMAE bug */
123 li r3, -1
124 clrrdi r3, r3, 10
125 mfspr r8, SPRN_MMCR2
126 mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */
127 isync
128END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
129 li r3, 1
130 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
131 mfspr r7, SPRN_MMCR0 /* save MMCR0 */
132 mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */
133 mfspr r6, SPRN_MMCRA
134 /* Clear MMCRA in order to disable SDAR updates */
135 li r5, 0
136 mtspr SPRN_MMCRA, r5
137 isync
138 lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? */
139 cmpwi r5, 0
140 beq 31f /* skip if not */
141 mfspr r5, SPRN_MMCR1
142 mfspr r9, SPRN_SIAR
143 mfspr r10, SPRN_SDAR
144 std r7, HSTATE_MMCR0(r13)
145 std r5, HSTATE_MMCR1(r13)
146 std r6, HSTATE_MMCRA(r13)
147 std r9, HSTATE_SIAR(r13)
148 std r10, HSTATE_SDAR(r13)
149BEGIN_FTR_SECTION
150 mfspr r9, SPRN_SIER
151 std r8, HSTATE_MMCR2(r13)
152 std r9, HSTATE_SIER(r13)
153END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
154 mfspr r3, SPRN_PMC1
155 mfspr r5, SPRN_PMC2
156 mfspr r6, SPRN_PMC3
157 mfspr r7, SPRN_PMC4
158 mfspr r8, SPRN_PMC5
159 mfspr r9, SPRN_PMC6
160 stw r3, HSTATE_PMC1(r13)
161 stw r5, HSTATE_PMC2(r13)
162 stw r6, HSTATE_PMC3(r13)
163 stw r7, HSTATE_PMC4(r13)
164 stw r8, HSTATE_PMC5(r13)
165 stw r9, HSTATE_PMC6(r13)
16631: blr
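This hunk moves the host-PMU save sequence verbatim out of the inline guest-entry code into a standalone kvmhv_save_host_pmu() symbol, so that the new P9 virtual-mode entry path (kvmhv_p9_guest_entry() above) can call it directly as well.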
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
new file mode 100644
index 000000000000..401d2ecbebc5
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -0,0 +1,1291 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright IBM Corporation, 2018
4 * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
5 * Paul Mackerras <paulus@ozlabs.org>
6 *
7 * Description: KVM functions specific to running nested KVM-HV guests
8 * on Book3S processors (specifically POWER9 and later).
9 */
10
11#include <linux/kernel.h>
12#include <linux/kvm_host.h>
13#include <linux/llist.h>
14
15#include <asm/kvm_ppc.h>
16#include <asm/kvm_book3s.h>
17#include <asm/mmu.h>
18#include <asm/pgtable.h>
19#include <asm/pgalloc.h>
20#include <asm/pte-walk.h>
21#include <asm/reg.h>
22
23static struct patb_entry *pseries_partition_tb;
24
25static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
26static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
27
28void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
29{
30 struct kvmppc_vcore *vc = vcpu->arch.vcore;
31
32 hr->pcr = vc->pcr;
33 hr->dpdes = vc->dpdes;
34 hr->hfscr = vcpu->arch.hfscr;
35 hr->tb_offset = vc->tb_offset;
36 hr->dawr0 = vcpu->arch.dawr;
37 hr->dawrx0 = vcpu->arch.dawrx;
38 hr->ciabr = vcpu->arch.ciabr;
39 hr->purr = vcpu->arch.purr;
40 hr->spurr = vcpu->arch.spurr;
41 hr->ic = vcpu->arch.ic;
42 hr->vtb = vc->vtb;
43 hr->srr0 = vcpu->arch.shregs.srr0;
44 hr->srr1 = vcpu->arch.shregs.srr1;
45 hr->sprg[0] = vcpu->arch.shregs.sprg0;
46 hr->sprg[1] = vcpu->arch.shregs.sprg1;
47 hr->sprg[2] = vcpu->arch.shregs.sprg2;
48 hr->sprg[3] = vcpu->arch.shregs.sprg3;
49 hr->pidr = vcpu->arch.pid;
50 hr->cfar = vcpu->arch.cfar;
51 hr->ppr = vcpu->arch.ppr;
52}
53
54static void byteswap_pt_regs(struct pt_regs *regs)
55{
56 unsigned long *addr = (unsigned long *) regs;
57
58 for (; addr < ((unsigned long *) (regs + 1)); addr++)
59 *addr = swab64(*addr);
60}
61
62static void byteswap_hv_regs(struct hv_guest_state *hr)
63{
64 hr->version = swab64(hr->version);
65 hr->lpid = swab32(hr->lpid);
66 hr->vcpu_token = swab32(hr->vcpu_token);
67 hr->lpcr = swab64(hr->lpcr);
68 hr->pcr = swab64(hr->pcr);
69 hr->amor = swab64(hr->amor);
70 hr->dpdes = swab64(hr->dpdes);
71 hr->hfscr = swab64(hr->hfscr);
72 hr->tb_offset = swab64(hr->tb_offset);
73 hr->dawr0 = swab64(hr->dawr0);
74 hr->dawrx0 = swab64(hr->dawrx0);
75 hr->ciabr = swab64(hr->ciabr);
76 hr->hdec_expiry = swab64(hr->hdec_expiry);
77 hr->purr = swab64(hr->purr);
78 hr->spurr = swab64(hr->spurr);
79 hr->ic = swab64(hr->ic);
80 hr->vtb = swab64(hr->vtb);
81 hr->hdar = swab64(hr->hdar);
82 hr->hdsisr = swab64(hr->hdsisr);
83 hr->heir = swab64(hr->heir);
84 hr->asdr = swab64(hr->asdr);
85 hr->srr0 = swab64(hr->srr0);
86 hr->srr1 = swab64(hr->srr1);
87 hr->sprg[0] = swab64(hr->sprg[0]);
88 hr->sprg[1] = swab64(hr->sprg[1]);
89 hr->sprg[2] = swab64(hr->sprg[2]);
90 hr->sprg[3] = swab64(hr->sprg[3]);
91 hr->pidr = swab64(hr->pidr);
92 hr->cfar = swab64(hr->cfar);
93 hr->ppr = swab64(hr->ppr);
94}
95
96static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
97 struct hv_guest_state *hr)
98{
99 struct kvmppc_vcore *vc = vcpu->arch.vcore;
100
101 hr->dpdes = vc->dpdes;
102 hr->hfscr = vcpu->arch.hfscr;
103 hr->purr = vcpu->arch.purr;
104 hr->spurr = vcpu->arch.spurr;
105 hr->ic = vcpu->arch.ic;
106 hr->vtb = vc->vtb;
107 hr->srr0 = vcpu->arch.shregs.srr0;
108 hr->srr1 = vcpu->arch.shregs.srr1;
109 hr->sprg[0] = vcpu->arch.shregs.sprg0;
110 hr->sprg[1] = vcpu->arch.shregs.sprg1;
111 hr->sprg[2] = vcpu->arch.shregs.sprg2;
112 hr->sprg[3] = vcpu->arch.shregs.sprg3;
113 hr->pidr = vcpu->arch.pid;
114 hr->cfar = vcpu->arch.cfar;
115 hr->ppr = vcpu->arch.ppr;
116 switch (trap) {
117 case BOOK3S_INTERRUPT_H_DATA_STORAGE:
118 hr->hdar = vcpu->arch.fault_dar;
119 hr->hdsisr = vcpu->arch.fault_dsisr;
120 hr->asdr = vcpu->arch.fault_gpa;
121 break;
122 case BOOK3S_INTERRUPT_H_INST_STORAGE:
123 hr->asdr = vcpu->arch.fault_gpa;
124 break;
125 case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
126 hr->heir = vcpu->arch.emul_inst;
127 break;
128 }
129}
130
131static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
132{
133 /*
134 * Don't let L1 enable features for L2 which we've disabled for L1,
135 * but preserve the interrupt cause field.
136 */
137 hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
138
139 /* Don't let data address watchpoint match in hypervisor state */
140 hr->dawrx0 &= ~DAWRX_HYP;
141
142 /* Don't let completed instruction address breakpt match in HV state */
143 if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
144 hr->ciabr &= ~CIABR_PRIV;
145}
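To make the hfscr masking concrete: HFSCR_INTR_CAUSE covers the interruption-cause field of HFSCR (which reports which facility triggered a facility-unavailable interrupt), so that field is passed through untouched, while any facility-enable bit that this hypervisor has left clear in vcpu->arch.hfscr for its guest stays clear in what that guest requests for its own nested guest. For example, because HFSCR_MSGP is not in the default hfscr set up in kvmppc_core_vcpu_create_hv() (see the book3s_hv.c hunk above), an L1 can never hand direct msgsndp access through to an L2 either.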
146
147static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
148{
149 struct kvmppc_vcore *vc = vcpu->arch.vcore;
150
151 vc->pcr = hr->pcr;
152 vc->dpdes = hr->dpdes;
153 vcpu->arch.hfscr = hr->hfscr;
154 vcpu->arch.dawr = hr->dawr0;
155 vcpu->arch.dawrx = hr->dawrx0;
156 vcpu->arch.ciabr = hr->ciabr;
157 vcpu->arch.purr = hr->purr;
158 vcpu->arch.spurr = hr->spurr;
159 vcpu->arch.ic = hr->ic;
160 vc->vtb = hr->vtb;
161 vcpu->arch.shregs.srr0 = hr->srr0;
162 vcpu->arch.shregs.srr1 = hr->srr1;
163 vcpu->arch.shregs.sprg0 = hr->sprg[0];
164 vcpu->arch.shregs.sprg1 = hr->sprg[1];
165 vcpu->arch.shregs.sprg2 = hr->sprg[2];
166 vcpu->arch.shregs.sprg3 = hr->sprg[3];
167 vcpu->arch.pid = hr->pidr;
168 vcpu->arch.cfar = hr->cfar;
169 vcpu->arch.ppr = hr->ppr;
170}
171
172void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
173 struct hv_guest_state *hr)
174{
175 struct kvmppc_vcore *vc = vcpu->arch.vcore;
176
177 vc->dpdes = hr->dpdes;
178 vcpu->arch.hfscr = hr->hfscr;
179 vcpu->arch.purr = hr->purr;
180 vcpu->arch.spurr = hr->spurr;
181 vcpu->arch.ic = hr->ic;
182 vc->vtb = hr->vtb;
183 vcpu->arch.fault_dar = hr->hdar;
184 vcpu->arch.fault_dsisr = hr->hdsisr;
185 vcpu->arch.fault_gpa = hr->asdr;
186 vcpu->arch.emul_inst = hr->heir;
187 vcpu->arch.shregs.srr0 = hr->srr0;
188 vcpu->arch.shregs.srr1 = hr->srr1;
189 vcpu->arch.shregs.sprg0 = hr->sprg[0];
190 vcpu->arch.shregs.sprg1 = hr->sprg[1];
191 vcpu->arch.shregs.sprg2 = hr->sprg[2];
192 vcpu->arch.shregs.sprg3 = hr->sprg[3];
193 vcpu->arch.pid = hr->pidr;
194 vcpu->arch.cfar = hr->cfar;
195 vcpu->arch.ppr = hr->ppr;
196}
197
198long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
199{
200 long int err, r;
201 struct kvm_nested_guest *l2;
202 struct pt_regs l2_regs, saved_l1_regs;
203 struct hv_guest_state l2_hv, saved_l1_hv;
204 struct kvmppc_vcore *vc = vcpu->arch.vcore;
205 u64 hv_ptr, regs_ptr;
206 u64 hdec_exp;
207 s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
208 u64 mask;
209 unsigned long lpcr;
210
211 if (vcpu->kvm->arch.l1_ptcr == 0)
212 return H_NOT_AVAILABLE;
213
214 /* copy parameters in */
215 hv_ptr = kvmppc_get_gpr(vcpu, 4);
216 err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv,
217 sizeof(struct hv_guest_state));
218 if (err)
219 return H_PARAMETER;
220 if (kvmppc_need_byteswap(vcpu))
221 byteswap_hv_regs(&l2_hv);
222 if (l2_hv.version != HV_GUEST_STATE_VERSION)
223 return H_P2;
224
225 regs_ptr = kvmppc_get_gpr(vcpu, 5);
226 err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs,
227 sizeof(struct pt_regs));
228 if (err)
229 return H_PARAMETER;
230 if (kvmppc_need_byteswap(vcpu))
231 byteswap_pt_regs(&l2_regs);
232 if (l2_hv.vcpu_token >= NR_CPUS)
233 return H_PARAMETER;
234
235 /* translate lpid */
236 l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
237 if (!l2)
238 return H_PARAMETER;
239 if (!l2->l1_gr_to_hr) {
240 mutex_lock(&l2->tlb_lock);
241 kvmhv_update_ptbl_cache(l2);
242 mutex_unlock(&l2->tlb_lock);
243 }
244
245 /* save l1 values of things */
246 vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
247 saved_l1_regs = vcpu->arch.regs;
248 kvmhv_save_hv_regs(vcpu, &saved_l1_hv);
249
250 /* convert TB values/offsets to host (L0) values */
251 hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
252 vc->tb_offset += l2_hv.tb_offset;
253
254 /* set L1 state to L2 state */
255 vcpu->arch.nested = l2;
256 vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
257 vcpu->arch.regs = l2_regs;
258 vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
259 mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
260 LPCR_LPES | LPCR_MER;
261 lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
262 sanitise_hv_regs(vcpu, &l2_hv);
263 restore_hv_regs(vcpu, &l2_hv);
264
265 vcpu->arch.ret = RESUME_GUEST;
266 vcpu->arch.trap = 0;
267 do {
268 if (mftb() >= hdec_exp) {
269 vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
270 r = RESUME_HOST;
271 break;
272 }
273 r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp,
274 lpcr);
275 } while (is_kvmppc_resume_guest(r));
276
277 /* save L2 state for return */
278 l2_regs = vcpu->arch.regs;
279 l2_regs.msr = vcpu->arch.shregs.msr;
280 delta_purr = vcpu->arch.purr - l2_hv.purr;
281 delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
282 delta_ic = vcpu->arch.ic - l2_hv.ic;
283 delta_vtb = vc->vtb - l2_hv.vtb;
284 save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
285
286 /* restore L1 state */
287 vcpu->arch.nested = NULL;
288 vcpu->arch.regs = saved_l1_regs;
289 vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK;
290 /* set L1 MSR TS field according to L2 transaction state */
291 if (l2_regs.msr & MSR_TS_MASK)
292 vcpu->arch.shregs.msr |= MSR_TS_S;
293 vc->tb_offset = saved_l1_hv.tb_offset;
294 restore_hv_regs(vcpu, &saved_l1_hv);
295 vcpu->arch.purr += delta_purr;
296 vcpu->arch.spurr += delta_spurr;
297 vcpu->arch.ic += delta_ic;
298 vc->vtb += delta_vtb;
299
300 kvmhv_put_nested(l2);
301
302 /* copy l2_hv_state and regs back to guest */
303 if (kvmppc_need_byteswap(vcpu)) {
304 byteswap_hv_regs(&l2_hv);
305 byteswap_pt_regs(&l2_regs);
306 }
307 err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv,
308 sizeof(struct hv_guest_state));
309 if (err)
310 return H_AUTHORITY;
311 err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs,
312 sizeof(struct pt_regs));
313 if (err)
314 return H_AUTHORITY;
315
316 if (r == -EINTR)
317 return H_INTERRUPT;
318
319 return vcpu->arch.trap;
320}
321
322long kvmhv_nested_init(void)
323{
324 long int ptb_order;
325 unsigned long ptcr;
326 long rc;
327
328 if (!kvmhv_on_pseries())
329 return 0;
330 if (!radix_enabled())
331 return -ENODEV;
332
333 /* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
334 ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
335 if (ptb_order < 8)
336 ptb_order = 8;
337 pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
338 GFP_KERNEL);
339 if (!pseries_partition_tb) {
 340		pr_err("kvm-hv: failed to allocate nested partition table\n");
341 return -ENOMEM;
342 }
343
344 ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
345 rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
346 if (rc != H_SUCCESS) {
347 pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
348 rc);
349 kfree(pseries_partition_tb);
350 pseries_partition_tb = NULL;
351 return -ENODEV;
352 }
353
354 return 0;
355}
356
357void kvmhv_nested_exit(void)
358{
359 /*
360 * N.B. the kvmhv_on_pseries() test is there because it enables
361 * the compiler to remove the call to plpar_hcall_norets()
362 * when CONFIG_PPC_PSERIES=n.
363 */
364 if (kvmhv_on_pseries() && pseries_partition_tb) {
365 plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
366 kfree(pseries_partition_tb);
367 pseries_partition_tb = NULL;
368 }
369}
370
371static void kvmhv_flush_lpid(unsigned int lpid)
372{
373 long rc;
374
375 if (!kvmhv_on_pseries()) {
376 radix__flush_tlb_lpid(lpid);
377 return;
378 }
379
380 rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
381 lpid, TLBIEL_INVAL_SET_LPID);
382 if (rc)
383 pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
384}
385
386void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
387{
388 if (!kvmhv_on_pseries()) {
389 mmu_partition_table_set_entry(lpid, dw0, dw1);
390 return;
391 }
392
393 pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
394 pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
395 /* L0 will do the necessary barriers */
396 kvmhv_flush_lpid(lpid);
397}
398
399static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
400{
401 unsigned long dw0;
402
403 dw0 = PATB_HR | radix__get_tree_size() |
404 __pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
405 kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
406}
407
408void kvmhv_vm_nested_init(struct kvm *kvm)
409{
410 kvm->arch.max_nested_lpid = -1;
411}
412
413/*
414 * Handle the H_SET_PARTITION_TABLE hcall.
415 * r4 = guest real address of partition table + log_2(size) - 12
416 * (formatted as for the PTCR).
417 */
418long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
419{
420 struct kvm *kvm = vcpu->kvm;
421 unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
422 int srcu_idx;
423 long ret = H_SUCCESS;
424
425 srcu_idx = srcu_read_lock(&kvm->srcu);
426 /*
427 * Limit the partition table to 4096 entries (because that's what
428 * hardware supports), and check the base address.
429 */
430 if ((ptcr & PRTS_MASK) > 12 - 8 ||
431 !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT))
432 ret = H_PARAMETER;
433 srcu_read_unlock(&kvm->srcu, srcu_idx);
434 if (ret == H_SUCCESS)
435 kvm->arch.l1_ptcr = ptcr;
436 return ret;
437}
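The arithmetic behind the PRTS check: the PRTS field encodes log2(table size in bytes) - 12, and each struct patb_entry is 16 bytes (two doublewords), so allowing PRTS up to 12 - 8 = 4 caps the table at 2^(4+12) = 64 KiB, i.e. exactly the 4096 entries the comment refers to. The same encoding is why kvmhv_get_nested() below bounds l1_lpid by 1 << (PRTS + 12 - 4).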
438
439/*
440 * Reload the partition table entry for a guest.
441 * Caller must hold gp->tlb_lock.
442 */
443static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
444{
445 int ret;
446 struct patb_entry ptbl_entry;
447 unsigned long ptbl_addr;
448 struct kvm *kvm = gp->l1_host;
449
450 ret = -EFAULT;
451 ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
452 if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8)))
453 ret = kvm_read_guest(kvm, ptbl_addr,
454 &ptbl_entry, sizeof(ptbl_entry));
455 if (ret) {
456 gp->l1_gr_to_hr = 0;
457 gp->process_table = 0;
458 } else {
459 gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
460 gp->process_table = be64_to_cpu(ptbl_entry.patb1);
461 }
462 kvmhv_set_nested_ptbl(gp);
463}
464
465struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
466{
467 struct kvm_nested_guest *gp;
468 long shadow_lpid;
469
470 gp = kzalloc(sizeof(*gp), GFP_KERNEL);
471 if (!gp)
472 return NULL;
473 gp->l1_host = kvm;
474 gp->l1_lpid = lpid;
475 mutex_init(&gp->tlb_lock);
476 gp->shadow_pgtable = pgd_alloc(kvm->mm);
477 if (!gp->shadow_pgtable)
478 goto out_free;
479 shadow_lpid = kvmppc_alloc_lpid();
480 if (shadow_lpid < 0)
481 goto out_free2;
482 gp->shadow_lpid = shadow_lpid;
483
484 memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
485
486 return gp;
487
488 out_free2:
489 pgd_free(kvm->mm, gp->shadow_pgtable);
490 out_free:
491 kfree(gp);
492 return NULL;
493}
494
495/*
496 * Free up any resources allocated for a nested guest.
497 */
498static void kvmhv_release_nested(struct kvm_nested_guest *gp)
499{
500 struct kvm *kvm = gp->l1_host;
501
502 if (gp->shadow_pgtable) {
503 /*
504 * No vcpu is using this struct and no call to
505 * kvmhv_get_nested can find this struct,
506 * so we don't need to hold kvm->mmu_lock.
507 */
508 kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
509 gp->shadow_lpid);
510 pgd_free(kvm->mm, gp->shadow_pgtable);
511 }
512 kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
513 kvmppc_free_lpid(gp->shadow_lpid);
514 kfree(gp);
515}
516
517static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
518{
519 struct kvm *kvm = gp->l1_host;
520 int lpid = gp->l1_lpid;
521 long ref;
522
523 spin_lock(&kvm->mmu_lock);
524 if (gp == kvm->arch.nested_guests[lpid]) {
525 kvm->arch.nested_guests[lpid] = NULL;
526 if (lpid == kvm->arch.max_nested_lpid) {
527 while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
528 ;
529 kvm->arch.max_nested_lpid = lpid;
530 }
531 --gp->refcnt;
532 }
533 ref = gp->refcnt;
534 spin_unlock(&kvm->mmu_lock);
535 if (ref == 0)
536 kvmhv_release_nested(gp);
537}
538
539/*
540 * Free up all nested resources allocated for this guest.
541 * This is called with no vcpus of the guest running, when
542 * switching the guest to HPT mode or when destroying the
543 * guest.
544 */
545void kvmhv_release_all_nested(struct kvm *kvm)
546{
547 int i;
548 struct kvm_nested_guest *gp;
549 struct kvm_nested_guest *freelist = NULL;
550 struct kvm_memory_slot *memslot;
551 int srcu_idx;
552
553 spin_lock(&kvm->mmu_lock);
554 for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
555 gp = kvm->arch.nested_guests[i];
556 if (!gp)
557 continue;
558 kvm->arch.nested_guests[i] = NULL;
559 if (--gp->refcnt == 0) {
560 gp->next = freelist;
561 freelist = gp;
562 }
563 }
564 kvm->arch.max_nested_lpid = -1;
565 spin_unlock(&kvm->mmu_lock);
566 while ((gp = freelist) != NULL) {
567 freelist = gp->next;
568 kvmhv_release_nested(gp);
569 }
570
571 srcu_idx = srcu_read_lock(&kvm->srcu);
572 kvm_for_each_memslot(memslot, kvm_memslots(kvm))
573 kvmhv_free_memslot_nest_rmap(memslot);
574 srcu_read_unlock(&kvm->srcu, srcu_idx);
575}
576
577/* caller must hold gp->tlb_lock */
578static void kvmhv_flush_nested(struct kvm_nested_guest *gp)
579{
580 struct kvm *kvm = gp->l1_host;
581
582 spin_lock(&kvm->mmu_lock);
583 kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid);
584 spin_unlock(&kvm->mmu_lock);
585 kvmhv_flush_lpid(gp->shadow_lpid);
586 kvmhv_update_ptbl_cache(gp);
587 if (gp->l1_gr_to_hr == 0)
588 kvmhv_remove_nested(gp);
589}
590
591struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
592 bool create)
593{
594 struct kvm_nested_guest *gp, *newgp;
595
596 if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
597 l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
598 return NULL;
599
600 spin_lock(&kvm->mmu_lock);
601 gp = kvm->arch.nested_guests[l1_lpid];
602 if (gp)
603 ++gp->refcnt;
604 spin_unlock(&kvm->mmu_lock);
605
606 if (gp || !create)
607 return gp;
608
609 newgp = kvmhv_alloc_nested(kvm, l1_lpid);
610 if (!newgp)
611 return NULL;
612 spin_lock(&kvm->mmu_lock);
613 if (kvm->arch.nested_guests[l1_lpid]) {
614 /* someone else beat us to it */
615 gp = kvm->arch.nested_guests[l1_lpid];
616 } else {
617 kvm->arch.nested_guests[l1_lpid] = newgp;
618 ++newgp->refcnt;
619 gp = newgp;
620 newgp = NULL;
621 if (l1_lpid > kvm->arch.max_nested_lpid)
622 kvm->arch.max_nested_lpid = l1_lpid;
623 }
624 ++gp->refcnt;
625 spin_unlock(&kvm->mmu_lock);
626
627 if (newgp)
628 kvmhv_release_nested(newgp);
629
630 return gp;
631}
632
633void kvmhv_put_nested(struct kvm_nested_guest *gp)
634{
635 struct kvm *kvm = gp->l1_host;
636 long ref;
637
638 spin_lock(&kvm->mmu_lock);
639 ref = --gp->refcnt;
640 spin_unlock(&kvm->mmu_lock);
641 if (ref == 0)
642 kvmhv_release_nested(gp);
643}
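A hedged sketch of the typical calling pattern for this pair of functions (it mirrors kvmhv_emulate_tlbie_tlb_addr() later in this file; the wrapper function itself is illustrative):

static int example_touch_nested(struct kvm *kvm, int lpid)
{
	struct kvm_nested_guest *gp;

	gp = kvmhv_get_nested(kvm, lpid, false);	/* takes a reference */
	if (!gp)
		return 0;				/* no such nested guest */
	mutex_lock(&gp->tlb_lock);
	/* ... operate on gp->shadow_pgtable ... */
	mutex_unlock(&gp->tlb_lock);
	kvmhv_put_nested(gp);				/* may free gp */
	return 0;
}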
644
645static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
646{
647 if (lpid > kvm->arch.max_nested_lpid)
648 return NULL;
649 return kvm->arch.nested_guests[lpid];
650}
651
652static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
653{
654 return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
655 RMAP_NESTED_GPA_MASK));
656}
657
658void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
659 struct rmap_nested **n_rmap)
660{
661 struct llist_node *entry = ((struct llist_head *) rmapp)->first;
662 struct rmap_nested *cursor;
663 u64 rmap, new_rmap = (*n_rmap)->rmap;
664
665 /* Are there any existing entries? */
666 if (!(*rmapp)) {
667 /* No -> use the rmap as a single entry */
668 *rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
669 return;
670 }
671
672 /* Do any entries match what we're trying to insert? */
673 for_each_nest_rmap_safe(cursor, entry, &rmap) {
674 if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
675 return;
676 }
677
678 /* Do we need to create a list or just add the new entry? */
679 rmap = *rmapp;
680 if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
681 *rmapp = 0UL;
682 llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
683 if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
684 (*n_rmap)->list.next = (struct llist_node *) rmap;
685
686 /* Set NULL so not freed by caller */
687 *n_rmap = NULL;
688}
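Each nested-rmap word packs the nested guest's LPID and the nested guest real address into a u64, so a single mapping can live directly in the memslot rmap word (flagged with RMAP_NESTED_IS_SINGLE_ENTRY) without allocating a list node. For illustration, the inverse of that packing, using the masks and shift from kvm_book3s_64.h in this series (the helper names are hypothetical):

static inline unsigned int n_rmap_lpid(u64 n_rmap)
{
	return (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
}

static inline unsigned long n_rmap_gpa(u64 n_rmap)
{
	return n_rmap & RMAP_NESTED_GPA_MASK;
}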
689
690static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
691 unsigned long hpa, unsigned long mask)
692{
693 struct kvm_nested_guest *gp;
694 unsigned long gpa;
695 unsigned int shift, lpid;
696 pte_t *ptep;
697
698 gpa = n_rmap & RMAP_NESTED_GPA_MASK;
699 lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
700 gp = kvmhv_find_nested(kvm, lpid);
701 if (!gp)
702 return;
703
704 /* Find and invalidate the pte */
705 ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
706 /* Don't spuriously invalidate ptes if the pfn has changed */
707 if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
708 kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
709}
710
711static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
712 unsigned long hpa, unsigned long mask)
713{
714 struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
715 struct rmap_nested *cursor;
716 unsigned long rmap;
717
718 for_each_nest_rmap_safe(cursor, entry, &rmap) {
719 kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
720 kfree(cursor);
721 }
722}
723
724/* called with kvm->mmu_lock held */
725void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
726 struct kvm_memory_slot *memslot,
727 unsigned long gpa, unsigned long hpa,
728 unsigned long nbytes)
729{
730 unsigned long gfn, end_gfn;
731 unsigned long addr_mask;
732
733 if (!memslot)
734 return;
735 gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
736 end_gfn = gfn + (nbytes >> PAGE_SHIFT);
737
738 addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
739 hpa &= addr_mask;
740
741 for (; gfn < end_gfn; gfn++) {
742 unsigned long *rmap = &memslot->arch.rmap[gfn];
743 kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
744 }
745}
746
747static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
748{
749 unsigned long page;
750
751 for (page = 0; page < free->npages; page++) {
752 unsigned long rmap, *rmapp = &free->arch.rmap[page];
753 struct rmap_nested *cursor;
754 struct llist_node *entry;
755
756 entry = llist_del_all((struct llist_head *) rmapp);
757 for_each_nest_rmap_safe(cursor, entry, &rmap)
758 kfree(cursor);
759 }
760}
761
762static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
763 struct kvm_nested_guest *gp,
764 long gpa, int *shift_ret)
765{
766 struct kvm *kvm = vcpu->kvm;
767 bool ret = false;
768 pte_t *ptep;
769 int shift;
770
771 spin_lock(&kvm->mmu_lock);
772 ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
773 if (!shift)
774 shift = PAGE_SHIFT;
775 if (ptep && pte_present(*ptep)) {
776 kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
777 ret = true;
778 }
779 spin_unlock(&kvm->mmu_lock);
780
781 if (shift_ret)
782 *shift_ret = shift;
783 return ret;
784}
785
786static inline int get_ric(unsigned int instr)
787{
788 return (instr >> 18) & 0x3;
789}
790
791static inline int get_prs(unsigned int instr)
792{
793 return (instr >> 17) & 0x1;
794}
795
796static inline int get_r(unsigned int instr)
797{
798 return (instr >> 16) & 0x1;
799}
800
801static inline int get_lpid(unsigned long r_val)
802{
803 return r_val & 0xffffffff;
804}
805
806static inline int get_is(unsigned long r_val)
807{
808 return (r_val >> 10) & 0x3;
809}
810
811static inline int get_ap(unsigned long r_val)
812{
813 return (r_val >> 5) & 0x7;
814}
815
816static inline long get_epn(unsigned long r_val)
817{
818 return r_val >> 12;
819}
820
821static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid,
822 int ap, long epn)
823{
824 struct kvm *kvm = vcpu->kvm;
825 struct kvm_nested_guest *gp;
826 long npages;
827 int shift, shadow_shift;
828 unsigned long addr;
829
830 shift = ap_to_shift(ap);
831 addr = epn << 12;
832 if (shift < 0)
833 /* Invalid ap encoding */
834 return -EINVAL;
835
836 addr &= ~((1UL << shift) - 1);
837 npages = 1UL << (shift - PAGE_SHIFT);
838
839 gp = kvmhv_get_nested(kvm, lpid, false);
840 if (!gp) /* No such guest -> nothing to do */
841 return 0;
842 mutex_lock(&gp->tlb_lock);
843
844 /* There may be more than one host page backing this single guest pte */
845 do {
846 kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift);
847
848 npages -= 1UL << (shadow_shift - PAGE_SHIFT);
849 addr += 1UL << shadow_shift;
850 } while (npages > 0);
851
852 mutex_unlock(&gp->tlb_lock);
853 kvmhv_put_nested(gp);
854 return 0;
855}
856
857static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu,
858 struct kvm_nested_guest *gp, int ric)
859{
860 struct kvm *kvm = vcpu->kvm;
861
862 mutex_lock(&gp->tlb_lock);
863 switch (ric) {
864 case 0:
865 /* Invalidate TLB */
866 spin_lock(&kvm->mmu_lock);
867 kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
868 gp->shadow_lpid);
869 kvmhv_flush_lpid(gp->shadow_lpid);
870 spin_unlock(&kvm->mmu_lock);
871 break;
872 case 1:
873 /*
874 * Invalidate PWC
875 * We don't cache this -> nothing to do
876 */
877 break;
878 case 2:
879 /* Invalidate TLB, PWC and caching of partition table entries */
880 kvmhv_flush_nested(gp);
881 break;
882 default:
883 break;
884 }
885 mutex_unlock(&gp->tlb_lock);
886}
887
888static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric)
889{
890 struct kvm *kvm = vcpu->kvm;
891 struct kvm_nested_guest *gp;
892 int i;
893
894 spin_lock(&kvm->mmu_lock);
895 for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
896 gp = kvm->arch.nested_guests[i];
897 if (gp) {
898 spin_unlock(&kvm->mmu_lock);
899 kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
900 spin_lock(&kvm->mmu_lock);
901 }
902 }
903 spin_unlock(&kvm->mmu_lock);
904}
905
906static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
907 unsigned long rsval, unsigned long rbval)
908{
909 struct kvm *kvm = vcpu->kvm;
910 struct kvm_nested_guest *gp;
911 int r, ric, prs, is, ap;
912 int lpid;
913 long epn;
914 int ret = 0;
915
916 ric = get_ric(instr);
917 prs = get_prs(instr);
918 r = get_r(instr);
919 lpid = get_lpid(rsval);
920 is = get_is(rbval);
921
922 /*
923 * These cases are invalid and are not handled:
924 * r != 1 -> Only radix supported
925 * prs == 1 -> Not HV privileged
926 * ric == 3 -> No cluster bombs for radix
927 * is == 1 -> Partition scoped translations not associated with pid
928 * (!is) && (ric == 1 || ric == 2) -> Not supported by ISA
929 */
930 if ((!r) || (prs) || (ric == 3) || (is == 1) ||
931 ((!is) && (ric == 1 || ric == 2)))
932 return -EINVAL;
933
934 switch (is) {
935 case 0:
936 /*
937 * We know ric == 0
938 * Invalidate TLB for a given target address
939 */
940 epn = get_epn(rbval);
941 ap = get_ap(rbval);
942 ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn);
943 break;
944 case 2:
945 /* Invalidate matching LPID */
946 gp = kvmhv_get_nested(kvm, lpid, false);
947 if (gp) {
948 kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
949 kvmhv_put_nested(gp);
950 }
951 break;
952 case 3:
953 /* Invalidate ALL LPIDs */
954 kvmhv_emulate_tlbie_all_lpid(vcpu, ric);
955 break;
956 default:
957 ret = -EINVAL;
958 break;
959 }
960
961 return ret;
962}
963
964/*
965 * This handles the H_TLB_INVALIDATE hcall.
966 * Parameters are (r4) tlbie instruction code, (r5) rS contents,
967 * (r6) rB contents.
968 */
969long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
970{
971 int ret;
972
973 ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4),
974 kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6));
975 if (ret)
976 return H_PARAMETER;
977 return H_SUCCESS;
978}
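This is the L0 end of the interface: an L1 hypervisor that is itself running nested reflects its partition-scoped tlbie instructions to its host through this hcall instead of executing them directly. A hedged sketch of that caller (the real one lives elsewhere in this series; the helper name is illustrative):

static long l1_reflect_tlbie(unsigned int instr, unsigned long rs_val,
			     unsigned long rb_val)
{
	return plpar_hcall_norets(H_TLB_INVALIDATE, instr, rs_val, rb_val);
}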
979
980/* Used to convert a nested guest real address to an L1 guest real address */
981static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
982 struct kvm_nested_guest *gp,
983 unsigned long n_gpa, unsigned long dsisr,
984 struct kvmppc_pte *gpte_p)
985{
986 u64 fault_addr, flags = dsisr & DSISR_ISSTORE;
987 int ret;
988
989 ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr,
990 &fault_addr);
991
992 if (ret) {
993 /* We didn't find a pte */
994 if (ret == -EINVAL) {
995 /* Unsupported mmu config */
996 flags |= DSISR_UNSUPP_MMU;
997 } else if (ret == -ENOENT) {
998 /* No translation found */
999 flags |= DSISR_NOHPTE;
1000 } else if (ret == -EFAULT) {
1001 /* Couldn't access L1 real address */
1002 flags |= DSISR_PRTABLE_FAULT;
1003 vcpu->arch.fault_gpa = fault_addr;
1004 } else {
1005 /* Unknown error */
1006 return ret;
1007 }
1008 goto forward_to_l1;
1009 } else {
1010 /* We found a pte -> check permissions */
1011 if (dsisr & DSISR_ISSTORE) {
1012 /* Can we write? */
1013 if (!gpte_p->may_write) {
1014 flags |= DSISR_PROTFAULT;
1015 goto forward_to_l1;
1016 }
1017 } else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
1018 /* Can we execute? */
1019 if (!gpte_p->may_execute) {
1020 flags |= SRR1_ISI_N_OR_G;
1021 goto forward_to_l1;
1022 }
1023 } else {
1024 /* Can we read? */
1025 if (!gpte_p->may_read && !gpte_p->may_write) {
1026 flags |= DSISR_PROTFAULT;
1027 goto forward_to_l1;
1028 }
1029 }
1030 }
1031
1032 return 0;
1033
1034forward_to_l1:
1035 vcpu->arch.fault_dsisr = flags;
1036 if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
1037 vcpu->arch.shregs.msr &= ~0x783f0000ul;
1038 vcpu->arch.shregs.msr |= flags;
1039 }
1040 return RESUME_HOST;
1041}
1042
1043static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
1044 struct kvm_nested_guest *gp,
1045 unsigned long n_gpa,
1046 struct kvmppc_pte gpte,
1047 unsigned long dsisr)
1048{
1049 struct kvm *kvm = vcpu->kvm;
1050 bool writing = !!(dsisr & DSISR_ISSTORE);
1051 u64 pgflags;
1052 bool ret;
1053
1054 /* Are the rc bits set in the L1 partition scoped pte? */
1055 pgflags = _PAGE_ACCESSED;
1056 if (writing)
1057 pgflags |= _PAGE_DIRTY;
1058 if (pgflags & ~gpte.rc)
1059 return RESUME_HOST;
1060
1061 spin_lock(&kvm->mmu_lock);
1062 /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
1063 ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing,
1064 gpte.raddr, kvm->arch.lpid);
1065 spin_unlock(&kvm->mmu_lock);
1066 if (!ret)
1067 return -EINVAL;
1068
1069	/* Set the rc bit in the pte of the shadow_pgtable for the nested guest */
1070 ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa,
1071 gp->shadow_lpid);
1072 if (!ret)
1073 return -EINVAL;
1074 return 0;
1075}
1076
1077static inline int kvmppc_radix_level_to_shift(int level)
1078{
1079 switch (level) {
1080 case 2:
1081 return PUD_SHIFT;
1082 case 1:
1083 return PMD_SHIFT;
1084 default:
1085 return PAGE_SHIFT;
1086 }
1087}
1088
1089static inline int kvmppc_radix_shift_to_level(int shift)
1090{
1091 if (shift == PUD_SHIFT)
1092 return 2;
1093 if (shift == PMD_SHIFT)
1094 return 1;
1095 if (shift == PAGE_SHIFT)
1096 return 0;
1097 WARN_ON_ONCE(1);
1098 return 0;
1099}
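/*
 * For reference: with the radix MMU, level 2 (PUD) is a 1 GiB mapping,
 * level 1 (PMD) is 2 MiB, and level 0 is the base page size (4 KiB or
 * 64 KiB depending on configuration).
 */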
1100
1101/* called with gp->tlb_lock held */
1102static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
1103 struct kvm_nested_guest *gp)
1104{
1105 struct kvm *kvm = vcpu->kvm;
1106 struct kvm_memory_slot *memslot;
1107 struct rmap_nested *n_rmap;
1108 struct kvmppc_pte gpte;
1109 pte_t pte, *pte_p;
1110 unsigned long mmu_seq;
1111 unsigned long dsisr = vcpu->arch.fault_dsisr;
1112 unsigned long ea = vcpu->arch.fault_dar;
1113 unsigned long *rmapp;
1114 unsigned long n_gpa, gpa, gfn, perm = 0UL;
1115 unsigned int shift, l1_shift, level;
1116 bool writing = !!(dsisr & DSISR_ISSTORE);
1117 bool kvm_ro = false;
1118 long int ret;
1119
1120 if (!gp->l1_gr_to_hr) {
1121 kvmhv_update_ptbl_cache(gp);
1122 if (!gp->l1_gr_to_hr)
1123 return RESUME_HOST;
1124 }
1125
1126	/* Convert the nested guest real address into an L1 guest real address */
1127
1128 n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL;
1129 if (!(dsisr & DSISR_PRTABLE_FAULT))
1130 n_gpa |= ea & 0xFFF;
1131 ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte);
1132
1133 /*
1134 * If the hardware found a translation but we don't now have a usable
1135 * translation in the l1 partition-scoped tree, remove the shadow pte
1136 * and let the guest retry.
1137 */
1138 if (ret == RESUME_HOST &&
1139 (dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G |
1140 DSISR_BAD_COPYPASTE)))
1141 goto inval;
1142 if (ret)
1143 return ret;
1144
1145 /* Failed to set the reference/change bits */
1146 if (dsisr & DSISR_SET_RC) {
1147 ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr);
1148 if (ret == RESUME_HOST)
1149 return ret;
1150 if (ret)
1151 goto inval;
1152 dsisr &= ~DSISR_SET_RC;
1153 if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
1154 DSISR_PROTFAULT)))
1155 return RESUME_GUEST;
1156 }
1157
1158 /*
1159	 * We took an HISI or HDSI while we were running a nested guest, which
1160	 * means we have no partition-scoped translation for that address, so
1161	 * we need to insert a pte for the mapping into our shadow_pgtable.
1162 */
1163
1164 l1_shift = gpte.page_shift;
1165 if (l1_shift < PAGE_SHIFT) {
1166 /* We don't support l1 using a page size smaller than our own */
1167 pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n",
1168 l1_shift, PAGE_SHIFT);
1169 return -EINVAL;
1170 }
1171 gpa = gpte.raddr;
1172 gfn = gpa >> PAGE_SHIFT;
1173
1174 /* 1. Get the corresponding host memslot */
1175
1176 memslot = gfn_to_memslot(kvm, gfn);
1177 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
1178 if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) {
1179 /* unusual error -> reflect to the guest as a DSI */
1180 kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
1181 return RESUME_GUEST;
1182 }
1183 /* passthrough of emulated MMIO case... */
1184 pr_err("emulated MMIO passthrough?\n");
1185 return -EINVAL;
1186 }
1187 if (memslot->flags & KVM_MEM_READONLY) {
1188 if (writing) {
1189 /* Give the guest a DSI */
1190 kvmppc_core_queue_data_storage(vcpu, ea,
1191 DSISR_ISSTORE | DSISR_PROTFAULT);
1192 return RESUME_GUEST;
1193 }
1194 kvm_ro = true;
1195 }
1196
1197 /* 2. Find the host pte for this L1 guest real address */
1198
1199 /* Used to check for invalidations in progress */
1200 mmu_seq = kvm->mmu_notifier_seq;
1201 smp_rmb();
1202
1203	/* See if we can find a translation in our partition-scoped tables for L1 */
1204 pte = __pte(0);
1205 spin_lock(&kvm->mmu_lock);
1206 pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
1207 if (!shift)
1208 shift = PAGE_SHIFT;
1209 if (pte_p)
1210 pte = *pte_p;
1211 spin_unlock(&kvm->mmu_lock);
1212
1213 if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) {
1214 /* No suitable pte found -> try to insert a mapping */
1215 ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot,
1216 writing, kvm_ro, &pte, &level);
1217 if (ret == -EAGAIN)
1218 return RESUME_GUEST;
1219 else if (ret)
1220 return ret;
1221 shift = kvmppc_radix_level_to_shift(level);
1222 }
1223
1224 /* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */
1225
1226	/* The permissions are the combination of the host and L1 guest ptes */
1227 perm |= gpte.may_read ? 0UL : _PAGE_READ;
1228 perm |= gpte.may_write ? 0UL : _PAGE_WRITE;
1229 perm |= gpte.may_execute ? 0UL : _PAGE_EXEC;
1230 pte = __pte(pte_val(pte) & ~perm);
1231
1232 /* What size pte can we insert? */
1233 if (shift > l1_shift) {
1234 u64 mask;
1235 unsigned int actual_shift = PAGE_SHIFT;
1236 if (PMD_SHIFT < l1_shift)
1237 actual_shift = PMD_SHIFT;
1238 mask = (1UL << shift) - (1UL << actual_shift);
1239 pte = __pte(pte_val(pte) | (gpa & mask));
1240 shift = actual_shift;
1241 }
1242 level = kvmppc_radix_shift_to_level(shift);
1243 n_gpa &= ~((1UL << shift) - 1);
1244
1245 /* 4. Insert the pte into our shadow_pgtable */
1246
1247 n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
1248 if (!n_rmap)
1249 return RESUME_GUEST; /* Let the guest try again */
1250 n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
1251 (((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
1252 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1253 ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
1254 mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
1255 if (n_rmap)
1256 kfree(n_rmap);
1257 if (ret == -EAGAIN)
1258 ret = RESUME_GUEST; /* Let the guest try again */
1259
1260 return ret;
1261
1262 inval:
1263 kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL);
1264 return RESUME_GUEST;
1265}
1266
1267long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
1268{
1269 struct kvm_nested_guest *gp = vcpu->arch.nested;
1270 long int ret;
1271
1272 mutex_lock(&gp->tlb_lock);
1273 ret = __kvmhv_nested_page_fault(vcpu, gp);
1274 mutex_unlock(&gp->tlb_lock);
1275 return ret;
1276}
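/*
 * Summary of what the fault path above constructs (illustration only):
 * the shadow pte maps n_gpa to the host real address with the
 * permissions granted by *both* the L1 partition-scoped pte and the
 * host pte, at the smaller of the two page sizes, and a nested-rmap
 * entry (lpid, n_gpa) is chained off the backing host page's memslot
 * rmap so the shadow pte can be torn down when that page goes away.
 */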
1277
1278int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid)
1279{
1280 int ret = -1;
1281
1282 spin_lock(&kvm->mmu_lock);
1283 while (++lpid <= kvm->arch.max_nested_lpid) {
1284 if (kvm->arch.nested_guests[lpid]) {
1285 ret = lpid;
1286 break;
1287 }
1288 }
1289 spin_unlock(&kvm->mmu_lock);
1290 return ret;
1291}
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index b11043b23c18..0787f12c1a1b 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void)
177 177
178 local_paca->sibling_subcore_state->in_guest[subcore_id] = 1; 178 local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
179} 179}
180EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest);
180 181
181void kvmppc_subcore_exit_guest(void) 182void kvmppc_subcore_exit_guest(void)
182{ 183{
@@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void)
187 188
188 local_paca->sibling_subcore_state->in_guest[subcore_id] = 0; 189 local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
189} 190}
191EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest);
190 192
191static bool kvmppc_tb_resync_required(void) 193static bool kvmppc_tb_resync_required(void)
192{ 194{
@@ -331,5 +333,13 @@ long kvmppc_realmode_hmi_handler(void)
331 } else { 333 } else {
332 wait_for_tb_resync(); 334 wait_for_tb_resync();
333 } 335 }
336
337 /*
338 * Reset tb_offset_applied so the guest exit code won't try
339 * to subtract the previous timebase offset from the timebase.
340 */
341 if (local_paca->kvm_hstate.kvm_vcore)
342 local_paca->kvm_hstate.kvm_vcore->tb_offset_applied = 0;
343
334 return 0; 344 return 0;
335} 345}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 758d1d23215e..b3f5786b20dc 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -136,7 +136,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
136 136
137 /* Mark the target VCPU as having an interrupt pending */ 137 /* Mark the target VCPU as having an interrupt pending */
138 vcpu->stat.queue_intr++; 138 vcpu->stat.queue_intr++;
139 set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); 139 set_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
140 140
141 /* Kick self ? Just set MER and return */ 141 /* Kick self ? Just set MER and return */
142 if (vcpu == this_vcpu) { 142 if (vcpu == this_vcpu) {
@@ -170,8 +170,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
170static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) 170static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
171{ 171{
172 /* Note: Only called on self ! */ 172 /* Note: Only called on self ! */
173 clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 173 clear_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
174 &vcpu->arch.pending_exceptions);
175 mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER); 174 mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
176} 175}
177 176
@@ -768,6 +767,14 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
768 void __iomem *xics_phys; 767 void __iomem *xics_phys;
769 int64_t rc; 768 int64_t rc;
770 769
770 if (kvmhv_on_pseries()) {
771 unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
772
773 iosync();
774 plpar_hcall_raw(H_EOI, retbuf, hwirq);
775 return;
776 }
777
771 rc = pnv_opal_pci_msi_eoi(c, hwirq); 778 rc = pnv_opal_pci_msi_eoi(c, hwirq);
772 779
773 if (rc) 780 if (rc)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 1d14046124a0..9b8d50a7cbaf 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -28,6 +28,7 @@
28#include <asm/exception-64s.h> 28#include <asm/exception-64s.h>
29#include <asm/kvm_book3s_asm.h> 29#include <asm/kvm_book3s_asm.h>
30#include <asm/book3s/64/mmu-hash.h> 30#include <asm/book3s/64/mmu-hash.h>
31#include <asm/export.h>
31#include <asm/tm.h> 32#include <asm/tm.h>
32#include <asm/opal.h> 33#include <asm/opal.h>
33#include <asm/xive-regs.h> 34#include <asm/xive-regs.h>
@@ -46,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
46#define NAPPING_NOVCPU 2 47#define NAPPING_NOVCPU 2
47 48
48/* Stack frame offsets for kvmppc_hv_entry */ 49/* Stack frame offsets for kvmppc_hv_entry */
49#define SFS 160 50#define SFS 208
50#define STACK_SLOT_TRAP (SFS-4) 51#define STACK_SLOT_TRAP (SFS-4)
52#define STACK_SLOT_SHORT_PATH (SFS-8)
51#define STACK_SLOT_TID (SFS-16) 53#define STACK_SLOT_TID (SFS-16)
52#define STACK_SLOT_PSSCR (SFS-24) 54#define STACK_SLOT_PSSCR (SFS-24)
53#define STACK_SLOT_PID (SFS-32) 55#define STACK_SLOT_PID (SFS-32)
@@ -56,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
56#define STACK_SLOT_DAWR (SFS-56) 58#define STACK_SLOT_DAWR (SFS-56)
57#define STACK_SLOT_DAWRX (SFS-64) 59#define STACK_SLOT_DAWRX (SFS-64)
58#define STACK_SLOT_HFSCR (SFS-72) 60#define STACK_SLOT_HFSCR (SFS-72)
61/* the following is used by the P9 short path */
62#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */
59 63
60/* 64/*
61 * Call kvmppc_hv_entry in real mode. 65 * Call kvmppc_hv_entry in real mode.
@@ -113,45 +117,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
113 mtspr SPRN_SPRG_VDSO_WRITE,r3 117 mtspr SPRN_SPRG_VDSO_WRITE,r3
114 118
115 /* Reload the host's PMU registers */ 119 /* Reload the host's PMU registers */
116 lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */ 120 bl kvmhv_load_host_pmu
117 cmpwi r4, 0
118 beq 23f /* skip if not */
119BEGIN_FTR_SECTION
120 ld r3, HSTATE_MMCR0(r13)
121 andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
122 cmpwi r4, MMCR0_PMAO
123 beql kvmppc_fix_pmao
124END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
125 lwz r3, HSTATE_PMC1(r13)
126 lwz r4, HSTATE_PMC2(r13)
127 lwz r5, HSTATE_PMC3(r13)
128 lwz r6, HSTATE_PMC4(r13)
129 lwz r8, HSTATE_PMC5(r13)
130 lwz r9, HSTATE_PMC6(r13)
131 mtspr SPRN_PMC1, r3
132 mtspr SPRN_PMC2, r4
133 mtspr SPRN_PMC3, r5
134 mtspr SPRN_PMC4, r6
135 mtspr SPRN_PMC5, r8
136 mtspr SPRN_PMC6, r9
137 ld r3, HSTATE_MMCR0(r13)
138 ld r4, HSTATE_MMCR1(r13)
139 ld r5, HSTATE_MMCRA(r13)
140 ld r6, HSTATE_SIAR(r13)
141 ld r7, HSTATE_SDAR(r13)
142 mtspr SPRN_MMCR1, r4
143 mtspr SPRN_MMCRA, r5
144 mtspr SPRN_SIAR, r6
145 mtspr SPRN_SDAR, r7
146BEGIN_FTR_SECTION
147 ld r8, HSTATE_MMCR2(r13)
148 ld r9, HSTATE_SIER(r13)
149 mtspr SPRN_MMCR2, r8
150 mtspr SPRN_SIER, r9
151END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
152 mtspr SPRN_MMCR0, r3
153 isync
15423:
155 121
156 /* 122 /*
157 * Reload DEC. HDEC interrupts were disabled when 123 * Reload DEC. HDEC interrupts were disabled when
@@ -796,66 +762,23 @@ BEGIN_FTR_SECTION
796 b 91f 762 b 91f
797END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) 763END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
798 /* 764 /*
799 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR 765 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
800 */ 766 */
801 mr r3, r4 767 mr r3, r4
802 ld r4, VCPU_MSR(r3) 768 ld r4, VCPU_MSR(r3)
769 li r5, 0 /* don't preserve non-vol regs */
803 bl kvmppc_restore_tm_hv 770 bl kvmppc_restore_tm_hv
771 nop
804 ld r4, HSTATE_KVM_VCPU(r13) 772 ld r4, HSTATE_KVM_VCPU(r13)
80591: 77391:
806#endif 774#endif
807 775
808 /* Load guest PMU registers */ 776 /* Load guest PMU registers; r4 = vcpu pointer here */
809 /* R4 is live here (vcpu pointer) */ 777 mr r3, r4
810 li r3, 1 778 bl kvmhv_load_guest_pmu
811 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
812 mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
813 isync
814BEGIN_FTR_SECTION
815 ld r3, VCPU_MMCR(r4)
816 andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
817 cmpwi r5, MMCR0_PMAO
818 beql kvmppc_fix_pmao
819END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
820 lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */
821 lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */
822 lwz r6, VCPU_PMC + 8(r4)
823 lwz r7, VCPU_PMC + 12(r4)
824 lwz r8, VCPU_PMC + 16(r4)
825 lwz r9, VCPU_PMC + 20(r4)
826 mtspr SPRN_PMC1, r3
827 mtspr SPRN_PMC2, r5
828 mtspr SPRN_PMC3, r6
829 mtspr SPRN_PMC4, r7
830 mtspr SPRN_PMC5, r8
831 mtspr SPRN_PMC6, r9
832 ld r3, VCPU_MMCR(r4)
833 ld r5, VCPU_MMCR + 8(r4)
834 ld r6, VCPU_MMCR + 16(r4)
835 ld r7, VCPU_SIAR(r4)
836 ld r8, VCPU_SDAR(r4)
837 mtspr SPRN_MMCR1, r5
838 mtspr SPRN_MMCRA, r6
839 mtspr SPRN_SIAR, r7
840 mtspr SPRN_SDAR, r8
841BEGIN_FTR_SECTION
842 ld r5, VCPU_MMCR + 24(r4)
843 ld r6, VCPU_SIER(r4)
844 mtspr SPRN_MMCR2, r5
845 mtspr SPRN_SIER, r6
846BEGIN_FTR_SECTION_NESTED(96)
847 lwz r7, VCPU_PMC + 24(r4)
848 lwz r8, VCPU_PMC + 28(r4)
849 ld r9, VCPU_MMCR + 32(r4)
850 mtspr SPRN_SPMC1, r7
851 mtspr SPRN_SPMC2, r8
852 mtspr SPRN_MMCRS, r9
853END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
854END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
855 mtspr SPRN_MMCR0, r3
856 isync
857 779
858 /* Load up FP, VMX and VSX registers */ 780 /* Load up FP, VMX and VSX registers */
781 ld r4, HSTATE_KVM_VCPU(r13)
859 bl kvmppc_load_fp 782 bl kvmppc_load_fp
860 783
861 ld r14, VCPU_GPR(R14)(r4) 784 ld r14, VCPU_GPR(R14)(r4)
@@ -1100,73 +1023,40 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
1100no_xive: 1023no_xive:
1101#endif /* CONFIG_KVM_XICS */ 1024#endif /* CONFIG_KVM_XICS */
1102 1025
1103deliver_guest_interrupt: 1026 li r0, 0
1104 ld r6, VCPU_CTR(r4) 1027 stw r0, STACK_SLOT_SHORT_PATH(r1)
1105 ld r7, VCPU_XER(r4)
1106
1107 mtctr r6
1108 mtxer r7
1109 1028
1110kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ 1029deliver_guest_interrupt: /* r4 = vcpu, r13 = paca */
1111 ld r10, VCPU_PC(r4) 1030 /* Check if we can deliver an external or decrementer interrupt now */
1112 ld r11, VCPU_MSR(r4) 1031 ld r0, VCPU_PENDING_EXC(r4)
1032BEGIN_FTR_SECTION
1033 /* On POWER9, also check for emulated doorbell interrupt */
1034 lbz r3, VCPU_DBELL_REQ(r4)
1035 or r0, r0, r3
1036END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
1037 cmpdi r0, 0
1038 beq 71f
1039 mr r3, r4
1040 bl kvmppc_guest_entry_inject_int
1041 ld r4, HSTATE_KVM_VCPU(r13)
104271:
1113 ld r6, VCPU_SRR0(r4) 1043 ld r6, VCPU_SRR0(r4)
1114 ld r7, VCPU_SRR1(r4) 1044 ld r7, VCPU_SRR1(r4)
1115 mtspr SPRN_SRR0, r6 1045 mtspr SPRN_SRR0, r6
1116 mtspr SPRN_SRR1, r7 1046 mtspr SPRN_SRR1, r7
1117 1047
1048fast_guest_entry_c:
1049 ld r10, VCPU_PC(r4)
1050 ld r11, VCPU_MSR(r4)
1118 /* r11 = vcpu->arch.msr & ~MSR_HV */ 1051 /* r11 = vcpu->arch.msr & ~MSR_HV */
1119 rldicl r11, r11, 63 - MSR_HV_LG, 1 1052 rldicl r11, r11, 63 - MSR_HV_LG, 1
1120 rotldi r11, r11, 1 + MSR_HV_LG 1053 rotldi r11, r11, 1 + MSR_HV_LG
1121 ori r11, r11, MSR_ME 1054 ori r11, r11, MSR_ME
1122 1055
1123 /* Check if we can deliver an external or decrementer interrupt now */ 1056 ld r6, VCPU_CTR(r4)
1124 ld r0, VCPU_PENDING_EXC(r4) 1057 ld r7, VCPU_XER(r4)
1125 rldicl r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63 1058 mtctr r6
1126 cmpdi cr1, r0, 0 1059 mtxer r7
1127 andi. r8, r11, MSR_EE
1128 mfspr r8, SPRN_LPCR
1129 /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */
1130 rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH
1131 mtspr SPRN_LPCR, r8
1132 isync
1133 beq 5f
1134 li r0, BOOK3S_INTERRUPT_EXTERNAL
1135 bne cr1, 12f
1136 mfspr r0, SPRN_DEC
1137BEGIN_FTR_SECTION
1138 /* On POWER9 check whether the guest has large decrementer enabled */
1139 andis. r8, r8, LPCR_LD@h
1140 bne 15f
1141END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
1142 extsw r0, r0
114315: cmpdi r0, 0
1144 li r0, BOOK3S_INTERRUPT_DECREMENTER
1145 bge 5f
1146
114712: mtspr SPRN_SRR0, r10
1148 mr r10,r0
1149 mtspr SPRN_SRR1, r11
1150 mr r9, r4
1151 bl kvmppc_msr_interrupt
11525:
1153BEGIN_FTR_SECTION
1154 b fast_guest_return
1155END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
1156 /* On POWER9, check for pending doorbell requests */
1157 lbz r0, VCPU_DBELL_REQ(r4)
1158 cmpwi r0, 0
1159 beq fast_guest_return
1160 ld r5, HSTATE_KVM_VCORE(r13)
1161 /* Set DPDES register so the CPU will take a doorbell interrupt */
1162 li r0, 1
1163 mtspr SPRN_DPDES, r0
1164 std r0, VCORE_DPDES(r5)
1165 /* Make sure other cpus see vcore->dpdes set before dbell req clear */
1166 lwsync
1167 /* Clear the pending doorbell request */
1168 li r0, 0
1169 stb r0, VCPU_DBELL_REQ(r4)
1170 1060
1171/* 1061/*
1172 * Required state: 1062 * Required state:
@@ -1202,7 +1092,7 @@ BEGIN_FTR_SECTION
1202END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) 1092END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
1203 1093
1204 ld r5, VCPU_LR(r4) 1094 ld r5, VCPU_LR(r4)
1205 lwz r6, VCPU_CR(r4) 1095 ld r6, VCPU_CR(r4)
1206 mtlr r5 1096 mtlr r5
1207 mtcr r6 1097 mtcr r6
1208 1098
@@ -1234,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
1234 HRFI_TO_GUEST 1124 HRFI_TO_GUEST
1235 b . 1125 b .
1236 1126
1127/*
1128 * Enter the guest on a P9 or later system where we have exactly
1129 * one vcpu per vcore and we don't need to go to real mode
1130 * (which implies that host and guest are both using radix MMU mode).
1131 * r3 = vcpu pointer
1132 * Most SPRs and all the VSRs have been loaded already.
1133 */
1134_GLOBAL(__kvmhv_vcpu_entry_p9)
1135EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
1136 mflr r0
1137 std r0, PPC_LR_STKOFF(r1)
1138 stdu r1, -SFS(r1)
1139
1140 li r0, 1
1141 stw r0, STACK_SLOT_SHORT_PATH(r1)
1142
1143 std r3, HSTATE_KVM_VCPU(r13)
1144 mfcr r4
1145 stw r4, SFS+8(r1)
1146
1147 std r1, HSTATE_HOST_R1(r13)
1148
1149 reg = 14
1150 .rept 18
1151 std reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
1152 reg = reg + 1
1153 .endr
1154
1155 reg = 14
1156 .rept 18
1157 ld reg, __VCPU_GPR(reg)(r3)
1158 reg = reg + 1
1159 .endr
1160
1161 mfmsr r10
1162 std r10, HSTATE_HOST_MSR(r13)
1163
1164 mr r4, r3
1165 b fast_guest_entry_c
1166guest_exit_short_path:
1167
1168 li r0, KVM_GUEST_MODE_NONE
1169 stb r0, HSTATE_IN_GUEST(r13)
1170
1171 reg = 14
1172 .rept 18
1173 std reg, __VCPU_GPR(reg)(r9)
1174 reg = reg + 1
1175 .endr
1176
1177 reg = 14
1178 .rept 18
1179 ld reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
1180 reg = reg + 1
1181 .endr
1182
1183 lwz r4, SFS+8(r1)
1184 mtcr r4
1185
1186 mr r3, r12 /* trap number */
1187
1188 addi r1, r1, SFS
1189 ld r0, PPC_LR_STKOFF(r1)
1190 mtlr r0
1191
1192 /* If we are in real mode, do a rfid to get back to the caller */
1193 mfmsr r4
1194 andi. r5, r4, MSR_IR
1195 bnelr
1196 rldicl r5, r4, 64 - MSR_TS_S_LG, 62 /* extract TS field */
1197 mtspr SPRN_SRR0, r0
1198 ld r10, HSTATE_HOST_MSR(r13)
1199 rldimi r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG
1200 mtspr SPRN_SRR1, r10
1201 RFI_TO_KERNEL
1202 b .
1203
1237secondary_too_late: 1204secondary_too_late:
1238 li r12, 0 1205 li r12, 0
1239 stw r12, STACK_SLOT_TRAP(r1) 1206 stw r12, STACK_SLOT_TRAP(r1)
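The __kvmhv_vcpu_entry_p9 entry point added in the hunk above is invoked from C on the streamlined POWER9 path; a hedged sketch of the call site (the actual caller is in book3s_hv.c, with the prototype assumed to come from asm-prototypes.h in this series):

	int trap;

	/* vcpu SPRs/FP/VSX state has already been loaded by the C caller */
	trap = __kvmhv_vcpu_entry_p9(vcpu);
	/* on return, trap holds the exit reason (the r12 value saved above) */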
@@ -1313,7 +1280,7 @@ kvmppc_interrupt_hv:
1313 std r3, VCPU_GPR(R12)(r9) 1280 std r3, VCPU_GPR(R12)(r9)
1314 /* CR is in the high half of r12 */ 1281 /* CR is in the high half of r12 */
1315 srdi r4, r12, 32 1282 srdi r4, r12, 32
1316 stw r4, VCPU_CR(r9) 1283 std r4, VCPU_CR(r9)
1317BEGIN_FTR_SECTION 1284BEGIN_FTR_SECTION
1318 ld r3, HSTATE_CFAR(r13) 1285 ld r3, HSTATE_CFAR(r13)
1319 std r3, VCPU_CFAR(r9) 1286 std r3, VCPU_CFAR(r9)
@@ -1387,18 +1354,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
1387 std r3, VCPU_CTR(r9) 1354 std r3, VCPU_CTR(r9)
1388 std r4, VCPU_XER(r9) 1355 std r4, VCPU_XER(r9)
1389 1356
1390#ifdef CONFIG_PPC_TRANSACTIONAL_MEM 1357 /* Save more register state */
1391 /* For softpatch interrupt, go off and do TM instruction emulation */ 1358 mfdar r3
1392 cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH 1359 mfdsisr r4
1393 beq kvmppc_tm_emul 1360 std r3, VCPU_DAR(r9)
1394#endif 1361 stw r4, VCPU_DSISR(r9)
1395 1362
1396 /* If this is a page table miss then see if it's theirs or ours */ 1363 /* If this is a page table miss then see if it's theirs or ours */
1397 cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE 1364 cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
1398 beq kvmppc_hdsi 1365 beq kvmppc_hdsi
1366 std r3, VCPU_FAULT_DAR(r9)
1367 stw r4, VCPU_FAULT_DSISR(r9)
1399 cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE 1368 cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE
1400 beq kvmppc_hisi 1369 beq kvmppc_hisi
1401 1370
1371#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1372 /* For softpatch interrupt, go off and do TM instruction emulation */
1373 cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
1374 beq kvmppc_tm_emul
1375#endif
1376
1402 /* See if this is a leftover HDEC interrupt */ 1377 /* See if this is a leftover HDEC interrupt */
1403 cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER 1378 cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER
1404 bne 2f 1379 bne 2f
@@ -1418,10 +1393,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
1418BEGIN_FTR_SECTION 1393BEGIN_FTR_SECTION
1419 PPC_MSGSYNC 1394 PPC_MSGSYNC
1420 lwsync 1395 lwsync
1396 /* always exit if we're running a nested guest */
1397 ld r0, VCPU_NESTED(r9)
1398 cmpdi r0, 0
1399 bne guest_exit_cont
1421END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 1400END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
1422 lbz r0, HSTATE_HOST_IPI(r13) 1401 lbz r0, HSTATE_HOST_IPI(r13)
1423 cmpwi r0, 0 1402 cmpwi r0, 0
1424 beq 4f 1403 beq maybe_reenter_guest
1425 b guest_exit_cont 1404 b guest_exit_cont
14263: 14053:
1427 /* If it's a hypervisor facility unavailable interrupt, save HFSCR */ 1406 /* If it's a hypervisor facility unavailable interrupt, save HFSCR */
@@ -1433,82 +1412,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
143314: 141214:
1434 /* External interrupt ? */ 1413 /* External interrupt ? */
1435 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 1414 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
1436 bne+ guest_exit_cont 1415 beq kvmppc_guest_external
1437
1438 /* External interrupt, first check for host_ipi. If this is
1439 * set, we know the host wants us out so let's do it now
1440 */
1441 bl kvmppc_read_intr
1442
1443 /*
1444 * Restore the active volatile registers after returning from
1445 * a C function.
1446 */
1447 ld r9, HSTATE_KVM_VCPU(r13)
1448 li r12, BOOK3S_INTERRUPT_EXTERNAL
1449
1450 /*
1451 * kvmppc_read_intr return codes:
1452 *
1453 * Exit to host (r3 > 0)
1454 * 1 An interrupt is pending that needs to be handled by the host
1455 * Exit guest and return to host by branching to guest_exit_cont
1456 *
1457 * 2 Passthrough that needs completion in the host
1458 * Exit guest and return to host by branching to guest_exit_cont
1459 * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
1460 * to indicate to the host to complete handling the interrupt
1461 *
1462 * Before returning to guest, we check if any CPU is heading out
1463 * to the host and if so, we head out also. If no CPUs are heading
1464 * check return values <= 0.
1465 *
1466 * Return to guest (r3 <= 0)
1467 * 0 No external interrupt is pending
1468 * -1 A guest wakeup IPI (which has now been cleared)
1469 * In either case, we return to guest to deliver any pending
1470 * guest interrupts.
1471 *
1472 * -2 A PCI passthrough external interrupt was handled
1473 * (interrupt was delivered directly to guest)
1474 * Return to guest to deliver any pending guest interrupts.
1475 */
1476
1477 cmpdi r3, 1
1478 ble 1f
1479
1480 /* Return code = 2 */
1481 li r12, BOOK3S_INTERRUPT_HV_RM_HARD
1482 stw r12, VCPU_TRAP(r9)
1483 b guest_exit_cont
1484
14851: /* Return code <= 1 */
1486 cmpdi r3, 0
1487 bgt guest_exit_cont
1488
1489 /* Return code <= 0 */
14904: ld r5, HSTATE_KVM_VCORE(r13)
1491 lwz r0, VCORE_ENTRY_EXIT(r5)
1492 cmpwi r0, 0x100
1493 mr r4, r9
1494 blt deliver_guest_interrupt
1495
1496guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
1497 /* Save more register state */
1498 mfdar r6
1499 mfdsisr r7
1500 std r6, VCPU_DAR(r9)
1501 stw r7, VCPU_DSISR(r9)
1502 /* don't overwrite fault_dar/fault_dsisr if HDSI */
1503 cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
1504 beq mc_cont
1505 std r6, VCPU_FAULT_DAR(r9)
1506 stw r7, VCPU_FAULT_DSISR(r9)
1507
1508 /* See if it is a machine check */ 1416 /* See if it is a machine check */
1509 cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK 1417 cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK
1510 beq machine_check_realmode 1418 beq machine_check_realmode
1511mc_cont: 1419 /* Or a hypervisor maintenance interrupt */
1420 cmpwi r12, BOOK3S_INTERRUPT_HMI
1421 beq hmi_realmode
1422
1423guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
1424
1512#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1425#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
1513 addi r3, r9, VCPU_TB_RMEXIT 1426 addi r3, r9, VCPU_TB_RMEXIT
1514 mr r4, r9 1427 mr r4, r9
@@ -1552,6 +1465,11 @@ mc_cont:
15521: 14651:
1553#endif /* CONFIG_KVM_XICS */ 1466#endif /* CONFIG_KVM_XICS */
1554 1467
1468 /* If we came in through the P9 short path, go back out to C now */
1469 lwz r0, STACK_SLOT_SHORT_PATH(r1)
1470 cmpwi r0, 0
1471 bne guest_exit_short_path
1472
1555 /* For hash guest, read the guest SLB and save it away */ 1473 /* For hash guest, read the guest SLB and save it away */
1556 ld r5, VCPU_KVM(r9) 1474 ld r5, VCPU_KVM(r9)
1557 lbz r0, KVM_RADIX(r5) 1475 lbz r0, KVM_RADIX(r5)
@@ -1780,11 +1698,13 @@ BEGIN_FTR_SECTION
1780 b 91f 1698 b 91f
1781END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) 1699END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
1782 /* 1700 /*
1783 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR 1701 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
1784 */ 1702 */
1785 mr r3, r9 1703 mr r3, r9
1786 ld r4, VCPU_MSR(r3) 1704 ld r4, VCPU_MSR(r3)
1705 li r5, 0 /* don't preserve non-vol regs */
1787 bl kvmppc_save_tm_hv 1706 bl kvmppc_save_tm_hv
1707 nop
1788 ld r9, HSTATE_KVM_VCPU(r13) 1708 ld r9, HSTATE_KVM_VCPU(r13)
178991: 170991:
1790#endif 1710#endif
@@ -1802,83 +1722,12 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
180225: 172225:
1803 /* Save PMU registers if requested */ 1723 /* Save PMU registers if requested */
1804 /* r8 and cr0.eq are live here */ 1724 /* r8 and cr0.eq are live here */
1805BEGIN_FTR_SECTION 1725 mr r3, r9
1806 /* 1726 li r4, 1
1807 * POWER8 seems to have a hardware bug where setting
1808 * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
1809 * when some counters are already negative doesn't seem
1810 * to cause a performance monitor alert (and hence interrupt).
1811 * The effect of this is that when saving the PMU state,
1812 * if there is no PMU alert pending when we read MMCR0
1813 * before freezing the counters, but one becomes pending
1814 * before we read the counters, we lose it.
1815 * To work around this, we need a way to freeze the counters
1816 * before reading MMCR0. Normally, freezing the counters
1817 * is done by writing MMCR0 (to set MMCR0[FC]) which
1818 * unavoidably writes MMCR0[PMA0] as well. On POWER8,
1819 * we can also freeze the counters using MMCR2, by writing
1820 * 1s to all the counter freeze condition bits (there are
1821 * 9 bits each for 6 counters).
1822 */
1823 li r3, -1 /* set all freeze bits */
1824 clrrdi r3, r3, 10
1825 mfspr r10, SPRN_MMCR2
1826 mtspr SPRN_MMCR2, r3
1827 isync
1828END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
1829 li r3, 1
1830 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
1831 mfspr r4, SPRN_MMCR0 /* save MMCR0 */
1832 mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
1833 mfspr r6, SPRN_MMCRA
1834 /* Clear MMCRA in order to disable SDAR updates */
1835 li r7, 0
1836 mtspr SPRN_MMCRA, r7
1837 isync
1838 beq 21f /* if no VPA, save PMU stuff anyway */ 1727 beq 21f /* if no VPA, save PMU stuff anyway */
1839 lbz r7, LPPACA_PMCINUSE(r8) 1728 lbz r4, LPPACA_PMCINUSE(r8)
1840 cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */ 172921: bl kvmhv_save_guest_pmu
1841 bne 21f 1730 ld r9, HSTATE_KVM_VCPU(r13)
1842 std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */
1843 b 22f
184421: mfspr r5, SPRN_MMCR1
1845 mfspr r7, SPRN_SIAR
1846 mfspr r8, SPRN_SDAR
1847 std r4, VCPU_MMCR(r9)
1848 std r5, VCPU_MMCR + 8(r9)
1849 std r6, VCPU_MMCR + 16(r9)
1850BEGIN_FTR_SECTION
1851 std r10, VCPU_MMCR + 24(r9)
1852END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
1853 std r7, VCPU_SIAR(r9)
1854 std r8, VCPU_SDAR(r9)
1855 mfspr r3, SPRN_PMC1
1856 mfspr r4, SPRN_PMC2
1857 mfspr r5, SPRN_PMC3
1858 mfspr r6, SPRN_PMC4
1859 mfspr r7, SPRN_PMC5
1860 mfspr r8, SPRN_PMC6
1861 stw r3, VCPU_PMC(r9)
1862 stw r4, VCPU_PMC + 4(r9)
1863 stw r5, VCPU_PMC + 8(r9)
1864 stw r6, VCPU_PMC + 12(r9)
1865 stw r7, VCPU_PMC + 16(r9)
1866 stw r8, VCPU_PMC + 20(r9)
1867BEGIN_FTR_SECTION
1868 mfspr r5, SPRN_SIER
1869 std r5, VCPU_SIER(r9)
1870BEGIN_FTR_SECTION_NESTED(96)
1871 mfspr r6, SPRN_SPMC1
1872 mfspr r7, SPRN_SPMC2
1873 mfspr r8, SPRN_MMCRS
1874 stw r6, VCPU_PMC + 24(r9)
1875 stw r7, VCPU_PMC + 28(r9)
1876 std r8, VCPU_MMCR + 32(r9)
1877 lis r4, 0x8000
1878 mtspr SPRN_MMCRS, r4
1879END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
1880END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
188122:
1882 1731
1883 /* Restore host values of some registers */ 1732 /* Restore host values of some registers */
1884BEGIN_FTR_SECTION 1733BEGIN_FTR_SECTION
@@ -2010,24 +1859,6 @@ BEGIN_FTR_SECTION
2010 mtspr SPRN_DPDES, r8 1859 mtspr SPRN_DPDES, r8
2011END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 1860END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
2012 1861
2013 /* If HMI, call kvmppc_realmode_hmi_handler() */
2014 lwz r12, STACK_SLOT_TRAP(r1)
2015 cmpwi r12, BOOK3S_INTERRUPT_HMI
2016 bne 27f
2017 bl kvmppc_realmode_hmi_handler
2018 nop
2019 cmpdi r3, 0
2020 /*
2021 * At this point kvmppc_realmode_hmi_handler may have resync-ed
2022 * the TB, and if it has, we must not subtract the guest timebase
2023 * offset from the timebase. So, skip it.
2024 *
2025 * Also, do not call kvmppc_subcore_exit_guest() because it has
2026 * been invoked as part of kvmppc_realmode_hmi_handler().
2027 */
2028 beq 30f
2029
203027:
2031 /* Subtract timebase offset from timebase */ 1862 /* Subtract timebase offset from timebase */
2032 ld r8, VCORE_TB_OFFSET_APPL(r5) 1863 ld r8, VCORE_TB_OFFSET_APPL(r5)
2033 cmpdi r8,0 1864 cmpdi r8,0
@@ -2045,7 +1876,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
2045 addis r8,r8,0x100 /* if so, increment upper 40 bits */ 1876 addis r8,r8,0x100 /* if so, increment upper 40 bits */
2046 mtspr SPRN_TBU40,r8 1877 mtspr SPRN_TBU40,r8
2047 1878
204817: bl kvmppc_subcore_exit_guest 187917:
1880 /*
1881 * If this is an HMI, we called kvmppc_realmode_hmi_handler
1882 * above, which may or may not have already called
1883 * kvmppc_subcore_exit_guest. Fortunately, all that
1884 * kvmppc_subcore_exit_guest does is clear a flag, so calling
1885 * it again here is benign even if kvmppc_realmode_hmi_handler
1886 * has already called it.
1887 */
1888 bl kvmppc_subcore_exit_guest
2049 nop 1889 nop
205030: ld r5,HSTATE_KVM_VCORE(r13) 189030: ld r5,HSTATE_KVM_VCORE(r13)
2051 ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ 1891 ld r4,VCORE_KVM(r5) /* pointer to struct kvm */
@@ -2099,6 +1939,67 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
2099 mtlr r0 1939 mtlr r0
2100 blr 1940 blr
2101 1941
1942kvmppc_guest_external:
1943 /* External interrupt, first check for host_ipi. If this is
1944 * set, we know the host wants us out so let's do it now
1945 */
1946 bl kvmppc_read_intr
1947
1948 /*
1949 * Restore the active volatile registers after returning from
1950 * a C function.
1951 */
1952 ld r9, HSTATE_KVM_VCPU(r13)
1953 li r12, BOOK3S_INTERRUPT_EXTERNAL
1954
1955 /*
1956 * kvmppc_read_intr return codes:
1957 *
1958 * Exit to host (r3 > 0)
1959 * 1 An interrupt is pending that needs to be handled by the host
1960 * Exit guest and return to host by branching to guest_exit_cont
1961 *
1962 * 2 Passthrough that needs completion in the host
1963 * Exit guest and return to host by branching to guest_exit_cont
1964 * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
1965 * to indicate to the host to complete handling the interrupt
1966 *
1967 * Before returning to guest, we check if any CPU is heading out
1968 * to the host and if so, we head out also. If no CPUs are heading
1969 * check return values <= 0.
1970 *
1971 * Return to guest (r3 <= 0)
1972 * 0 No external interrupt is pending
1973 * -1 A guest wakeup IPI (which has now been cleared)
1974 * In either case, we return to guest to deliver any pending
1975 * guest interrupts.
1976 *
1977 * -2 A PCI passthrough external interrupt was handled
1978 * (interrupt was delivered directly to guest)
1979 * Return to guest to deliver any pending guest interrupts.
1980 */
1981
1982 cmpdi r3, 1
1983 ble 1f
1984
1985 /* Return code = 2 */
1986 li r12, BOOK3S_INTERRUPT_HV_RM_HARD
1987 stw r12, VCPU_TRAP(r9)
1988 b guest_exit_cont
1989
19901: /* Return code <= 1 */
1991 cmpdi r3, 0
1992 bgt guest_exit_cont
1993
1994 /* Return code <= 0 */
1995maybe_reenter_guest:
1996 ld r5, HSTATE_KVM_VCORE(r13)
1997 lwz r0, VCORE_ENTRY_EXIT(r5)
1998 cmpwi r0, 0x100
1999 mr r4, r9
2000 blt deliver_guest_interrupt
2001 b guest_exit_cont
2002
2102#ifdef CONFIG_PPC_TRANSACTIONAL_MEM 2003#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2103/* 2004/*
2104 * Softpatch interrupt for transactional memory emulation cases 2005 * Softpatch interrupt for transactional memory emulation cases
@@ -2302,6 +2203,10 @@ hcall_try_real_mode:
2302 andi. r0,r11,MSR_PR 2203 andi. r0,r11,MSR_PR
2303 /* sc 1 from userspace - reflect to guest syscall */ 2204 /* sc 1 from userspace - reflect to guest syscall */
2304 bne sc_1_fast_return 2205 bne sc_1_fast_return
2206 /* sc 1 from nested guest - give it to L1 to handle */
2207 ld r0, VCPU_NESTED(r9)
2208 cmpdi r0, 0
2209 bne guest_exit_cont
2305 clrrdi r3,r3,2 2210 clrrdi r3,r3,2
2306 cmpldi r3,hcall_real_table_end - hcall_real_table 2211 cmpldi r3,hcall_real_table_end - hcall_real_table
2307 bge guest_exit_cont 2212 bge guest_exit_cont
@@ -2561,6 +2466,7 @@ hcall_real_table:
2561hcall_real_table_end: 2466hcall_real_table_end:
2562 2467
2563_GLOBAL(kvmppc_h_set_xdabr) 2468_GLOBAL(kvmppc_h_set_xdabr)
2469EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr)
2564 andi. r0, r5, DABRX_USER | DABRX_KERNEL 2470 andi. r0, r5, DABRX_USER | DABRX_KERNEL
2565 beq 6f 2471 beq 6f
2566 li r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI 2472 li r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI
@@ -2570,6 +2476,7 @@ _GLOBAL(kvmppc_h_set_xdabr)
2570 blr 2476 blr
2571 2477
2572_GLOBAL(kvmppc_h_set_dabr) 2478_GLOBAL(kvmppc_h_set_dabr)
2479EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr)
2573 li r5, DABRX_USER | DABRX_KERNEL 2480 li r5, DABRX_USER | DABRX_KERNEL
25743: 24813:
2575BEGIN_FTR_SECTION 2482BEGIN_FTR_SECTION
@@ -2682,11 +2589,13 @@ BEGIN_FTR_SECTION
2682 b 91f 2589 b 91f
2683END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) 2590END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
2684 /* 2591 /*
2685 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR 2592 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
2686 */ 2593 */
2687 ld r3, HSTATE_KVM_VCPU(r13) 2594 ld r3, HSTATE_KVM_VCPU(r13)
2688 ld r4, VCPU_MSR(r3) 2595 ld r4, VCPU_MSR(r3)
2596 li r5, 0 /* don't preserve non-vol regs */
2689 bl kvmppc_save_tm_hv 2597 bl kvmppc_save_tm_hv
2598 nop
269091: 259991:
2691#endif 2600#endif
2692 2601
@@ -2802,11 +2711,13 @@ BEGIN_FTR_SECTION
2802 b 91f 2711 b 91f
2803END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) 2712END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
2804 /* 2713 /*
2805 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR 2714 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
2806 */ 2715 */
2807 mr r3, r4 2716 mr r3, r4
2808 ld r4, VCPU_MSR(r3) 2717 ld r4, VCPU_MSR(r3)
2718 li r5, 0 /* don't preserve non-vol regs */
2809 bl kvmppc_restore_tm_hv 2719 bl kvmppc_restore_tm_hv
2720 nop
2810 ld r4, HSTATE_KVM_VCPU(r13) 2721 ld r4, HSTATE_KVM_VCPU(r13)
281191: 272291:
2812#endif 2723#endif
@@ -2874,13 +2785,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
2874 mr r9, r4 2785 mr r9, r4
2875 cmpdi r3, 0 2786 cmpdi r3, 0
2876 bgt guest_exit_cont 2787 bgt guest_exit_cont
2877 2788 b maybe_reenter_guest
2878 /* see if any other thread is already exiting */
2879 lwz r0,VCORE_ENTRY_EXIT(r5)
2880 cmpwi r0,0x100
2881 bge guest_exit_cont
2882
2883 b kvmppc_cede_reentry /* if not go back to guest */
2884 2789
2885 /* cede when already previously prodded case */ 2790 /* cede when already previously prodded case */
2886kvm_cede_prodded: 2791kvm_cede_prodded:
@@ -2947,12 +2852,12 @@ machine_check_realmode:
2947 */ 2852 */
2948 ld r11, VCPU_MSR(r9) 2853 ld r11, VCPU_MSR(r9)
2949 rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ 2854 rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
2950 bne mc_cont /* if so, exit to host */ 2855 bne guest_exit_cont /* if so, exit to host */
2951 /* Check if guest is capable of handling NMI exit */ 2856 /* Check if guest is capable of handling NMI exit */
2952 ld r10, VCPU_KVM(r9) 2857 ld r10, VCPU_KVM(r9)
2953 lbz r10, KVM_FWNMI(r10) 2858 lbz r10, KVM_FWNMI(r10)
2954 cmpdi r10, 1 /* FWNMI capable? */ 2859 cmpdi r10, 1 /* FWNMI capable? */
2955 beq mc_cont /* if so, exit with KVM_EXIT_NMI. */ 2860 beq guest_exit_cont /* if so, exit with KVM_EXIT_NMI. */
2956 2861
2957 /* if not, fall through for backward compatibility. */ 2862 /* if not, fall through for backward compatibility. */
2958 andi. r10, r11, MSR_RI /* check for unrecoverable exception */ 2863 andi. r10, r11, MSR_RI /* check for unrecoverable exception */
@@ -2966,6 +2871,21 @@ machine_check_realmode:
29662: b fast_interrupt_c_return 28712: b fast_interrupt_c_return
2967 2872
2968/* 2873/*
2874 * Call C code to handle a HMI in real mode.
2875 * Only the primary thread does the call, secondary threads are handled
2876 * by calling hmi_exception_realmode() after kvmppc_hv_entry returns.
2877 * r9 points to the vcpu on entry
2878 */
2879hmi_realmode:
2880 lbz r0, HSTATE_PTID(r13)
2881 cmpwi r0, 0
2882 bne guest_exit_cont
2883 bl kvmppc_realmode_hmi_handler
2884 ld r9, HSTATE_KVM_VCPU(r13)
2885 li r12, BOOK3S_INTERRUPT_HMI
2886 b guest_exit_cont
2887
2888/*
2969 * Check the reason we woke from nap, and take appropriate action. 2889 * Check the reason we woke from nap, and take appropriate action.
2970 * Returns (in r3): 2890 * Returns (in r3):
2971 * 0 if nothing needs to be done 2891 * 0 if nothing needs to be done
@@ -3130,10 +3050,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
3130 * Save transactional state and TM-related registers. 3050 * Save transactional state and TM-related registers.
3131 * Called with r3 pointing to the vcpu struct and r4 containing 3051 * Called with r3 pointing to the vcpu struct and r4 containing
3132 * the guest MSR value. 3052 * the guest MSR value.
3133 * This can modify all checkpointed registers, but 3053 * r5 is non-zero iff non-volatile register state needs to be maintained.
3054 * If r5 == 0, this can modify all checkpointed registers, but
3134 * restores r1 and r2 before exit. 3055 * restores r1 and r2 before exit.
3135 */ 3056 */
3136kvmppc_save_tm_hv: 3057_GLOBAL_TOC(kvmppc_save_tm_hv)
3058EXPORT_SYMBOL_GPL(kvmppc_save_tm_hv)
3137 /* See if we need to handle fake suspend mode */ 3059 /* See if we need to handle fake suspend mode */
3138BEGIN_FTR_SECTION 3060BEGIN_FTR_SECTION
3139 b __kvmppc_save_tm 3061 b __kvmppc_save_tm
@@ -3161,12 +3083,6 @@ BEGIN_FTR_SECTION
3161END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) 3083END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
3162 nop 3084 nop
3163 3085
3164 std r1, HSTATE_HOST_R1(r13)
3165
3166 /* Clear the MSR RI since r1, r13 may be foobar. */
3167 li r5, 0
3168 mtmsrd r5, 1
3169
3170 /* We have to treclaim here because that's the only way to do S->N */ 3086 /* We have to treclaim here because that's the only way to do S->N */
3171 li r3, TM_CAUSE_KVM_RESCHED 3087 li r3, TM_CAUSE_KVM_RESCHED
3172 TRECLAIM(R3) 3088 TRECLAIM(R3)
@@ -3175,22 +3091,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
3175 * We were in fake suspend, so we are not going to save the 3091 * We were in fake suspend, so we are not going to save the
3176 * register state as the guest checkpointed state (since 3092 * register state as the guest checkpointed state (since
3177 * we already have it), therefore we can now use any volatile GPR. 3093 * we already have it), therefore we can now use any volatile GPR.
3094 * In fact treclaim in fake suspend state doesn't modify
3095 * any registers.
3178 */ 3096 */
3179 /* Reload PACA pointer, stack pointer and TOC. */
3180 GET_PACA(r13)
3181 ld r1, HSTATE_HOST_R1(r13)
3182 ld r2, PACATOC(r13)
3183 3097
3184 /* Set MSR RI now we have r1 and r13 back. */ 3098BEGIN_FTR_SECTION
3185 li r5, MSR_RI
3186 mtmsrd r5, 1
3187
3188 HMT_MEDIUM
3189 ld r6, HSTATE_DSCR(r13)
3190 mtspr SPRN_DSCR, r6
3191BEGIN_FTR_SECTION_NESTED(96)
3192 bl pnv_power9_force_smt4_release 3099 bl pnv_power9_force_smt4_release
3193END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96) 3100END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
3194 nop 3101 nop
3195 3102
31964: 31034:
@@ -3216,10 +3123,12 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
3216 * Restore transactional state and TM-related registers. 3123 * Restore transactional state and TM-related registers.
3217 * Called with r3 pointing to the vcpu struct 3124 * Called with r3 pointing to the vcpu struct
3218 * and r4 containing the guest MSR value. 3125 * and r4 containing the guest MSR value.
3126 * r5 is non-zero iff non-volatile register state needs to be maintained.
3219 * This potentially modifies all checkpointed registers. 3127 * This potentially modifies all checkpointed registers.
3220 * It restores r1 and r2 from the PACA. 3128 * It restores r1 and r2 from the PACA.
3221 */ 3129 */
3222kvmppc_restore_tm_hv: 3130_GLOBAL_TOC(kvmppc_restore_tm_hv)
3131EXPORT_SYMBOL_GPL(kvmppc_restore_tm_hv)
3223 /* 3132 /*
3224 * If we are doing TM emulation for the guest on a POWER9 DD2, 3133 * If we are doing TM emulation for the guest on a POWER9 DD2,
3225 * then we don't actually do a trechkpt -- we either set up 3134 * then we don't actually do a trechkpt -- we either set up
@@ -3424,6 +3333,194 @@ kvmppc_msr_interrupt:
3424 blr 3333 blr
3425 3334
3426/* 3335/*
3336 * Load up guest PMU state. R3 points to the vcpu struct.
3337 */
3338_GLOBAL(kvmhv_load_guest_pmu)
3339EXPORT_SYMBOL_GPL(kvmhv_load_guest_pmu)
3340 mr r4, r3
3341 mflr r0
3342 li r3, 1
3343 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
3344 mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
3345 isync
3346BEGIN_FTR_SECTION
3347 ld r3, VCPU_MMCR(r4)
3348 andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
3349 cmpwi r5, MMCR0_PMAO
3350 beql kvmppc_fix_pmao
3351END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
3352 lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */
3353 lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */
3354 lwz r6, VCPU_PMC + 8(r4)
3355 lwz r7, VCPU_PMC + 12(r4)
3356 lwz r8, VCPU_PMC + 16(r4)
3357 lwz r9, VCPU_PMC + 20(r4)
3358 mtspr SPRN_PMC1, r3
3359 mtspr SPRN_PMC2, r5
3360 mtspr SPRN_PMC3, r6
3361 mtspr SPRN_PMC4, r7
3362 mtspr SPRN_PMC5, r8
3363 mtspr SPRN_PMC6, r9
3364 ld r3, VCPU_MMCR(r4)
3365 ld r5, VCPU_MMCR + 8(r4)
3366 ld r6, VCPU_MMCR + 16(r4)
3367 ld r7, VCPU_SIAR(r4)
3368 ld r8, VCPU_SDAR(r4)
3369 mtspr SPRN_MMCR1, r5
3370 mtspr SPRN_MMCRA, r6
3371 mtspr SPRN_SIAR, r7
3372 mtspr SPRN_SDAR, r8
3373BEGIN_FTR_SECTION
3374 ld r5, VCPU_MMCR + 24(r4)
3375 ld r6, VCPU_SIER(r4)
3376 mtspr SPRN_MMCR2, r5
3377 mtspr SPRN_SIER, r6
3378BEGIN_FTR_SECTION_NESTED(96)
3379 lwz r7, VCPU_PMC + 24(r4)
3380 lwz r8, VCPU_PMC + 28(r4)
3381 ld r9, VCPU_MMCR + 32(r4)
3382 mtspr SPRN_SPMC1, r7
3383 mtspr SPRN_SPMC2, r8
3384 mtspr SPRN_MMCRS, r9
3385END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
3386END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
3387 mtspr SPRN_MMCR0, r3
3388 isync
3389 mtlr r0
3390 blr
3391
3392/*
3393 * Reload host PMU state saved in the PACA by kvmhv_save_host_pmu.
3394 */
3395_GLOBAL(kvmhv_load_host_pmu)
3396EXPORT_SYMBOL_GPL(kvmhv_load_host_pmu)
3397 mflr r0
3398 lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
3399 cmpwi r4, 0
3400 beq 23f /* skip if not */
3401BEGIN_FTR_SECTION
3402 ld r3, HSTATE_MMCR0(r13)
3403 andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
3404 cmpwi r4, MMCR0_PMAO
3405 beql kvmppc_fix_pmao
3406END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
3407 lwz r3, HSTATE_PMC1(r13)
3408 lwz r4, HSTATE_PMC2(r13)
3409 lwz r5, HSTATE_PMC3(r13)
3410 lwz r6, HSTATE_PMC4(r13)
3411 lwz r8, HSTATE_PMC5(r13)
3412 lwz r9, HSTATE_PMC6(r13)
3413 mtspr SPRN_PMC1, r3
3414 mtspr SPRN_PMC2, r4
3415 mtspr SPRN_PMC3, r5
3416 mtspr SPRN_PMC4, r6
3417 mtspr SPRN_PMC5, r8
3418 mtspr SPRN_PMC6, r9
3419 ld r3, HSTATE_MMCR0(r13)
3420 ld r4, HSTATE_MMCR1(r13)
3421 ld r5, HSTATE_MMCRA(r13)
3422 ld r6, HSTATE_SIAR(r13)
3423 ld r7, HSTATE_SDAR(r13)
3424 mtspr SPRN_MMCR1, r4
3425 mtspr SPRN_MMCRA, r5
3426 mtspr SPRN_SIAR, r6
3427 mtspr SPRN_SDAR, r7
3428BEGIN_FTR_SECTION
3429 ld r8, HSTATE_MMCR2(r13)
3430 ld r9, HSTATE_SIER(r13)
3431 mtspr SPRN_MMCR2, r8
3432 mtspr SPRN_SIER, r9
3433END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
3434 mtspr SPRN_MMCR0, r3
3435 isync
3436 mtlr r0
343723: blr
3438
3439/*
3440 * Save guest PMU state into the vcpu struct.
3441 * r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA)
3442 */
3443_GLOBAL(kvmhv_save_guest_pmu)
3444EXPORT_SYMBOL_GPL(kvmhv_save_guest_pmu)
3445 mr r9, r3
3446 mr r8, r4
3447BEGIN_FTR_SECTION
3448 /*
3449 * POWER8 seems to have a hardware bug where setting
3450 * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
3451 * when some counters are already negative doesn't seem
3452 * to cause a performance monitor alert (and hence interrupt).
3453 * The effect of this is that when saving the PMU state,
3454 * if there is no PMU alert pending when we read MMCR0
3455 * before freezing the counters, but one becomes pending
3456 * before we read the counters, we lose it.
3457 * To work around this, we need a way to freeze the counters
3458 * before reading MMCR0. Normally, freezing the counters
3459 * is done by writing MMCR0 (to set MMCR0[FC]) which
3460 * unavoidably writes MMCR0[PMA0] as well. On POWER8,
3461 * we can also freeze the counters using MMCR2, by writing
3462 * 1s to all the counter freeze condition bits (there are
3463 * 9 bits each for 6 counters).
3464 */
3465 li r3, -1 /* set all freeze bits */
3466 clrrdi r3, r3, 10
3467 mfspr r10, SPRN_MMCR2
3468 mtspr SPRN_MMCR2, r3
3469 isync
3470END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
3471 li r3, 1
3472 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
3473 mfspr r4, SPRN_MMCR0 /* save MMCR0 */
3474 mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
3475 mfspr r6, SPRN_MMCRA
3476 /* Clear MMCRA in order to disable SDAR updates */
3477 li r7, 0
3478 mtspr SPRN_MMCRA, r7
3479 isync
3480 cmpwi r8, 0 /* did they ask for PMU stuff to be saved? */
3481 bne 21f
3482 std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */
3483 b 22f
348421: mfspr r5, SPRN_MMCR1
3485 mfspr r7, SPRN_SIAR
3486 mfspr r8, SPRN_SDAR
3487 std r4, VCPU_MMCR(r9)
3488 std r5, VCPU_MMCR + 8(r9)
3489 std r6, VCPU_MMCR + 16(r9)
3490BEGIN_FTR_SECTION
3491 std r10, VCPU_MMCR + 24(r9)
3492END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
3493 std r7, VCPU_SIAR(r9)
3494 std r8, VCPU_SDAR(r9)
3495 mfspr r3, SPRN_PMC1
3496 mfspr r4, SPRN_PMC2
3497 mfspr r5, SPRN_PMC3
3498 mfspr r6, SPRN_PMC4
3499 mfspr r7, SPRN_PMC5
3500 mfspr r8, SPRN_PMC6
3501 stw r3, VCPU_PMC(r9)
3502 stw r4, VCPU_PMC + 4(r9)
3503 stw r5, VCPU_PMC + 8(r9)
3504 stw r6, VCPU_PMC + 12(r9)
3505 stw r7, VCPU_PMC + 16(r9)
3506 stw r8, VCPU_PMC + 20(r9)
3507BEGIN_FTR_SECTION
3508 mfspr r5, SPRN_SIER
3509 std r5, VCPU_SIER(r9)
3510BEGIN_FTR_SECTION_NESTED(96)
3511 mfspr r6, SPRN_SPMC1
3512 mfspr r7, SPRN_SPMC2
3513 mfspr r8, SPRN_MMCRS
3514 stw r6, VCPU_PMC + 24(r9)
3515 stw r7, VCPU_PMC + 28(r9)
3516 std r8, VCPU_MMCR + 32(r9)
3517 lis r4, 0x8000
3518 mtspr SPRN_MMCRS, r4
3519END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
3520END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
352122: blr
3522
3523/*
3427 * This works around a hardware bug on POWER8E processors, where 3524 * This works around a hardware bug on POWER8E processors, where
3428 * writing a 1 to the MMCR0[PMAO] bit doesn't generate a 3525 * writing a 1 to the MMCR0[PMAO] bit doesn't generate a
3429 * performance monitor interrupt. Instead, when we need to have 3526 * performance monitor interrupt. Instead, when we need to have
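For reference, the MMCR2-based freeze that kvmhv_save_guest_pmu sets up above with "li r3, -1; clrrdi r3, r3, 10" builds a mask with the upper 54 bits set, i.e. the nine freeze-condition bits for each of the six PMCs described in the comment in the patch. A rough C sketch of that mask value follows; the helper name is made up and is not part of the patch:

static inline unsigned long mmcr2_freeze_all_counters(void)
{
	/* bits 63..10 set, bits 9..0 clear -- the same value the asm above
	 * produces with "li r3, -1" followed by "clrrdi r3, r3, 10" */
	return ~0UL << 10;
}

Writing this value to SPRN_MMCR2 freezes the counters without touching MMCR0[PMAO], which is the point of the POWER8 workaround described in the comment above.
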
diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c
index 008285058f9b..888e2609e3f1 100644
--- a/arch/powerpc/kvm/book3s_hv_tm.c
+++ b/arch/powerpc/kvm/book3s_hv_tm.c
@@ -130,7 +130,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
130 return RESUME_GUEST; 130 return RESUME_GUEST;
131 } 131 }
132 /* Set CR0 to indicate previous transactional state */ 132 /* Set CR0 to indicate previous transactional state */
133 vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 133 vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
134 (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); 134 (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
135 /* L=1 => tresume, L=0 => tsuspend */ 135 /* L=1 => tresume, L=0 => tsuspend */
136 if (instr & (1 << 21)) { 136 if (instr & (1 << 21)) {
@@ -174,7 +174,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
174 copy_from_checkpoint(vcpu); 174 copy_from_checkpoint(vcpu);
175 175
176 /* Set CR0 to indicate previous transactional state */ 176 /* Set CR0 to indicate previous transactional state */
177 vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 177 vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
178 (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); 178 (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
179 vcpu->arch.shregs.msr &= ~MSR_TS_MASK; 179 vcpu->arch.shregs.msr &= ~MSR_TS_MASK;
180 return RESUME_GUEST; 180 return RESUME_GUEST;
@@ -204,7 +204,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
204 copy_to_checkpoint(vcpu); 204 copy_to_checkpoint(vcpu);
205 205
206 /* Set CR0 to indicate previous transactional state */ 206 /* Set CR0 to indicate previous transactional state */
207 vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 207 vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
208 (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); 208 (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
209 vcpu->arch.shregs.msr = msr | MSR_TS_S; 209 vcpu->arch.shregs.msr = msr | MSR_TS_S;
210 return RESUME_GUEST; 210 return RESUME_GUEST;
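Each of the emulation cases above rebuilds CR0 from the guest's previous MSR[TS] value; the expression is identical in all three hunks. Pulled out as a standalone C sketch (the helper name is hypothetical, the expression itself is copied from the patch):

static inline unsigned long ccr_set_cr0_from_ts(unsigned long ccr, unsigned long msr)
{
	/* keep CR1..CR7 (the low 28 bits of the CCR image) and place the
	 * previous MSR[TS] value into the CR0 field (bits 31..28) */
	return (ccr & 0x0fffffff) |
	       (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
}

The only change the patch makes here is that the CCR image now lives in vcpu->arch.regs.ccr instead of vcpu->arch.cr.
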
diff --git a/arch/powerpc/kvm/book3s_hv_tm_builtin.c b/arch/powerpc/kvm/book3s_hv_tm_builtin.c
index b2c7c6fca4f9..3cf5863bc06e 100644
--- a/arch/powerpc/kvm/book3s_hv_tm_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_tm_builtin.c
@@ -89,7 +89,8 @@ int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu)
89 if (instr & (1 << 21)) 89 if (instr & (1 << 21))
90 vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T; 90 vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T;
91 /* Set CR0 to 0b0010 */ 91 /* Set CR0 to 0b0010 */
92 vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0x20000000; 92 vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
93 0x20000000;
93 return 1; 94 return 1;
94 } 95 }
95 96
@@ -105,5 +106,5 @@ void kvmhv_emulate_tm_rollback(struct kvm_vcpu *vcpu)
105 vcpu->arch.shregs.msr &= ~MSR_TS_MASK; /* go to N state */ 106 vcpu->arch.shregs.msr &= ~MSR_TS_MASK; /* go to N state */
106 vcpu->arch.regs.nip = vcpu->arch.tfhar; 107 vcpu->arch.regs.nip = vcpu->arch.tfhar;
107 copy_from_checkpoint(vcpu); 108 copy_from_checkpoint(vcpu);
108 vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0xa0000000; 109 vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | 0xa0000000;
109} 110}
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 614ebb4261f7..4efd65d9e828 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -167,7 +167,7 @@ void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu)
167 svcpu->gpr[11] = vcpu->arch.regs.gpr[11]; 167 svcpu->gpr[11] = vcpu->arch.regs.gpr[11];
168 svcpu->gpr[12] = vcpu->arch.regs.gpr[12]; 168 svcpu->gpr[12] = vcpu->arch.regs.gpr[12];
169 svcpu->gpr[13] = vcpu->arch.regs.gpr[13]; 169 svcpu->gpr[13] = vcpu->arch.regs.gpr[13];
170 svcpu->cr = vcpu->arch.cr; 170 svcpu->cr = vcpu->arch.regs.ccr;
171 svcpu->xer = vcpu->arch.regs.xer; 171 svcpu->xer = vcpu->arch.regs.xer;
172 svcpu->ctr = vcpu->arch.regs.ctr; 172 svcpu->ctr = vcpu->arch.regs.ctr;
173 svcpu->lr = vcpu->arch.regs.link; 173 svcpu->lr = vcpu->arch.regs.link;
@@ -249,7 +249,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
249 vcpu->arch.regs.gpr[11] = svcpu->gpr[11]; 249 vcpu->arch.regs.gpr[11] = svcpu->gpr[11];
250 vcpu->arch.regs.gpr[12] = svcpu->gpr[12]; 250 vcpu->arch.regs.gpr[12] = svcpu->gpr[12];
251 vcpu->arch.regs.gpr[13] = svcpu->gpr[13]; 251 vcpu->arch.regs.gpr[13] = svcpu->gpr[13];
252 vcpu->arch.cr = svcpu->cr; 252 vcpu->arch.regs.ccr = svcpu->cr;
253 vcpu->arch.regs.xer = svcpu->xer; 253 vcpu->arch.regs.xer = svcpu->xer;
254 vcpu->arch.regs.ctr = svcpu->ctr; 254 vcpu->arch.regs.ctr = svcpu->ctr;
255 vcpu->arch.regs.link = svcpu->lr; 255 vcpu->arch.regs.link = svcpu->lr;
@@ -1246,7 +1246,6 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
1246 r = RESUME_GUEST; 1246 r = RESUME_GUEST;
1247 break; 1247 break;
1248 case BOOK3S_INTERRUPT_EXTERNAL: 1248 case BOOK3S_INTERRUPT_EXTERNAL:
1249 case BOOK3S_INTERRUPT_EXTERNAL_LEVEL:
1250 case BOOK3S_INTERRUPT_EXTERNAL_HV: 1249 case BOOK3S_INTERRUPT_EXTERNAL_HV:
1251 case BOOK3S_INTERRUPT_H_VIRT: 1250 case BOOK3S_INTERRUPT_H_VIRT:
1252 vcpu->stat.ext_intr_exits++; 1251 vcpu->stat.ext_intr_exits++;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index b8356cdc0c04..b0b2bfc2ff51 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -310,7 +310,7 @@ static inline bool icp_try_update(struct kvmppc_icp *icp,
310 */ 310 */
311 if (new.out_ee) { 311 if (new.out_ee) {
312 kvmppc_book3s_queue_irqprio(icp->vcpu, 312 kvmppc_book3s_queue_irqprio(icp->vcpu,
313 BOOK3S_INTERRUPT_EXTERNAL_LEVEL); 313 BOOK3S_INTERRUPT_EXTERNAL);
314 if (!change_self) 314 if (!change_self)
315 kvmppc_fast_vcpu_kick(icp->vcpu); 315 kvmppc_fast_vcpu_kick(icp->vcpu);
316 } 316 }
@@ -593,8 +593,7 @@ static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
593 u32 xirr; 593 u32 xirr;
594 594
595 /* First, remove EE from the processor */ 595 /* First, remove EE from the processor */
596 kvmppc_book3s_dequeue_irqprio(icp->vcpu, 596 kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
597 BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
598 597
599 /* 598 /*
600 * ICP State: Accept_Interrupt 599 * ICP State: Accept_Interrupt
@@ -754,8 +753,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
754 * We can remove EE from the current processor, the update 753 * We can remove EE from the current processor, the update
755 * transaction will set it again if needed 754 * transaction will set it again if needed
756 */ 755 */
757 kvmppc_book3s_dequeue_irqprio(icp->vcpu, 756 kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
758 BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
759 757
760 do { 758 do {
761 old_state = new_state = READ_ONCE(icp->state); 759 old_state = new_state = READ_ONCE(icp->state);
@@ -1167,8 +1165,7 @@ int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
1167 * Deassert the CPU interrupt request. 1165 * Deassert the CPU interrupt request.
1168 * icp_try_update will reassert it if necessary. 1166 * icp_try_update will reassert it if necessary.
1169 */ 1167 */
1170 kvmppc_book3s_dequeue_irqprio(icp->vcpu, 1168 kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
1171 BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
1172 1169
1173 /* 1170 /*
1174 * Note that if we displace an interrupt from old_state.xisr, 1171 * Note that if we displace an interrupt from old_state.xisr,
@@ -1393,7 +1390,8 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
1393 } 1390 }
1394 1391
1395#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 1392#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
1396 if (cpu_has_feature(CPU_FTR_ARCH_206)) { 1393 if (cpu_has_feature(CPU_FTR_ARCH_206) &&
1394 cpu_has_feature(CPU_FTR_HVMODE)) {
1397 /* Enable real mode support */ 1395 /* Enable real mode support */
1398 xics->real_mode = ENABLE_REALMODE; 1396 xics->real_mode = ENABLE_REALMODE;
1399 xics->real_mode_dbg = DEBUG_REALMODE; 1397 xics->real_mode_dbg = DEBUG_REALMODE;
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 30c2eb766954..ad4a370703d3 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -62,6 +62,69 @@
62#define XIVE_Q_GAP 2 62#define XIVE_Q_GAP 2
63 63
64/* 64/*
65 * Push a vcpu's context to the XIVE on guest entry.
66 * This assumes we are in virtual mode (MMU on)
67 */
68void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
69{
70 void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
71 u64 pq;
72
73 if (!tima)
74 return;
75 eieio();
76 __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
77 __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
78 vcpu->arch.xive_pushed = 1;
79 eieio();
80
81 /*
82 * We clear the irq_pending flag. There is a small chance of a
83 * race vs. the escalation interrupt happening on another
84 * processor setting it again, but the only consequence is to
85 * cause a spurious wakeup on the next H_CEDE, which is not an
86 * issue.
87 */
88 vcpu->arch.irq_pending = 0;
89
90 /*
91 * In single escalation mode, if the escalation interrupt is
92 * on, we mask it.
93 */
94 if (vcpu->arch.xive_esc_on) {
95 pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
96 XIVE_ESB_SET_PQ_01));
97 mb();
98
99 /*
100 * We have a possible subtle race here: The escalation
101 * interrupt might have fired and be on its way to the
102 * host queue while we mask it, and if we unmask it
103 * early enough (re-cede right away), there is a
104 * theorical possibility that it fires again, thus
105 * landing in the target queue more than once which is
106 * a big no-no.
107 *
108 * Fortunately, solving this is rather easy. If the
109 * above load setting PQ to 01 returns a previous
110 * value where P is set, then we know the escalation
111 * interrupt is somewhere on its way to the host. In
112 * that case we simply don't clear the xive_esc_on
113 * flag below. It will be eventually cleared by the
114 * handler for the escalation interrupt.
115 *
116 * Then, when doing a cede, we check that flag again
117 * before re-enabling the escalation interrupt, and if
118 * set, we abort the cede.
119 */
120 if (!(pq & XIVE_ESB_VAL_P))
121 /* Now P is 0, we can clear the flag */
122 vcpu->arch.xive_esc_on = 0;
123 }
124}
125EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
126
127/*
65 * This is a simple trigger for a generic XIVE IRQ. This must 128 * This is a simple trigger for a generic XIVE IRQ. This must
66 * only be called for interrupts that support a trigger page 129 * only be called for interrupts that support a trigger page
67 */ 130 */
diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
index 4171ede8722b..033363d6e764 100644
--- a/arch/powerpc/kvm/book3s_xive_template.c
+++ b/arch/powerpc/kvm/book3s_xive_template.c
@@ -280,14 +280,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu)
280 /* First collect pending bits from HW */ 280 /* First collect pending bits from HW */
281 GLUE(X_PFX,ack_pending)(xc); 281 GLUE(X_PFX,ack_pending)(xc);
282 282
283 /*
284 * Cleanup the old-style bits if needed (they may have been
285 * set by pull or an escalation interrupts).
286 */
287 if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions))
288 clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
289 &vcpu->arch.pending_exceptions);
290
291 pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n", 283 pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
292 xc->pending, xc->hw_cppr, xc->cppr); 284 xc->pending, xc->hw_cppr, xc->cppr);
293 285
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 81bd8a07aa51..051af7d97327 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -182,7 +182,7 @@
182 */ 182 */
183 PPC_LL r4, PACACURRENT(r13) 183 PPC_LL r4, PACACURRENT(r13)
184 PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4) 184 PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4)
185 stw r10, VCPU_CR(r4) 185 PPC_STL r10, VCPU_CR(r4)
186 PPC_STL r11, VCPU_GPR(R4)(r4) 186 PPC_STL r11, VCPU_GPR(R4)(r4)
187 PPC_STL r5, VCPU_GPR(R5)(r4) 187 PPC_STL r5, VCPU_GPR(R5)(r4)
188 PPC_STL r6, VCPU_GPR(R6)(r4) 188 PPC_STL r6, VCPU_GPR(R6)(r4)
@@ -292,7 +292,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
292 PPC_STL r4, VCPU_GPR(R4)(r11) 292 PPC_STL r4, VCPU_GPR(R4)(r11)
293 PPC_LL r4, THREAD_NORMSAVE(0)(r10) 293 PPC_LL r4, THREAD_NORMSAVE(0)(r10)
294 PPC_STL r5, VCPU_GPR(R5)(r11) 294 PPC_STL r5, VCPU_GPR(R5)(r11)
295 stw r13, VCPU_CR(r11) 295 PPC_STL r13, VCPU_CR(r11)
296 mfspr r5, \srr0 296 mfspr r5, \srr0
297 PPC_STL r3, VCPU_GPR(R10)(r11) 297 PPC_STL r3, VCPU_GPR(R10)(r11)
298 PPC_LL r3, THREAD_NORMSAVE(2)(r10) 298 PPC_LL r3, THREAD_NORMSAVE(2)(r10)
@@ -319,7 +319,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
319 PPC_STL r4, VCPU_GPR(R4)(r11) 319 PPC_STL r4, VCPU_GPR(R4)(r11)
320 PPC_LL r4, GPR9(r8) 320 PPC_LL r4, GPR9(r8)
321 PPC_STL r5, VCPU_GPR(R5)(r11) 321 PPC_STL r5, VCPU_GPR(R5)(r11)
322 stw r9, VCPU_CR(r11) 322 PPC_STL r9, VCPU_CR(r11)
323 mfspr r5, \srr0 323 mfspr r5, \srr0
324 PPC_STL r3, VCPU_GPR(R8)(r11) 324 PPC_STL r3, VCPU_GPR(R8)(r11)
325 PPC_LL r3, GPR10(r8) 325 PPC_LL r3, GPR10(r8)
@@ -643,7 +643,7 @@ lightweight_exit:
643 PPC_LL r3, VCPU_LR(r4) 643 PPC_LL r3, VCPU_LR(r4)
644 PPC_LL r5, VCPU_XER(r4) 644 PPC_LL r5, VCPU_XER(r4)
645 PPC_LL r6, VCPU_CTR(r4) 645 PPC_LL r6, VCPU_CTR(r4)
646 lwz r7, VCPU_CR(r4) 646 PPC_LL r7, VCPU_CR(r4)
647 PPC_LL r8, VCPU_PC(r4) 647 PPC_LL r8, VCPU_PC(r4)
648 PPC_LD(r9, VCPU_SHARED_MSR, r11) 648 PPC_LD(r9, VCPU_SHARED_MSR, r11)
649 PPC_LL r0, VCPU_GPR(R0)(r4) 649 PPC_LL r0, VCPU_GPR(R0)(r4)
diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c
index 75dce1ef3bc8..f91b1309a0a8 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -117,7 +117,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
117 117
118 emulated = EMULATE_FAIL; 118 emulated = EMULATE_FAIL;
119 vcpu->arch.regs.msr = vcpu->arch.shared->msr; 119 vcpu->arch.regs.msr = vcpu->arch.shared->msr;
120 vcpu->arch.regs.ccr = vcpu->arch.cr;
121 if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) { 120 if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) {
122 int type = op.type & INSTR_TYPE_MASK; 121 int type = op.type & INSTR_TYPE_MASK;
123 int size = GETSIZE(op.type); 122 int size = GETSIZE(op.type);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index eba5756d5b41..2869a299c4ed 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -594,7 +594,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
594 r = !!(hv_enabled && radix_enabled()); 594 r = !!(hv_enabled && radix_enabled());
595 break; 595 break;
596 case KVM_CAP_PPC_MMU_HASH_V3: 596 case KVM_CAP_PPC_MMU_HASH_V3:
597 r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300)); 597 r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
598 cpu_has_feature(CPU_FTR_HVMODE));
599 break;
600 case KVM_CAP_PPC_NESTED_HV:
601 r = !!(hv_enabled && kvmppc_hv_ops->enable_nested &&
602 !kvmppc_hv_ops->enable_nested(NULL));
598 break; 603 break;
599#endif 604#endif
600 case KVM_CAP_SYNC_MMU: 605 case KVM_CAP_SYNC_MMU:
@@ -2114,6 +2119,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
2114 r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags); 2119 r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags);
2115 break; 2120 break;
2116 } 2121 }
2122
2123 case KVM_CAP_PPC_NESTED_HV:
2124 r = -EINVAL;
2125 if (!is_kvmppc_hv_enabled(kvm) ||
2126 !kvm->arch.kvm_ops->enable_nested)
2127 break;
2128 r = kvm->arch.kvm_ops->enable_nested(kvm);
2129 break;
2117#endif 2130#endif
2118 default: 2131 default:
2119 r = -EINVAL; 2132 r = -EINVAL;
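The two powerpc.c hunks above expose nested HV as a capability that userspace must both query and explicitly enable on the VM. A minimal sketch of how a VMM might do that, assuming vm_fd is an open KVM VM file descriptor and leaving out error handling:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int enable_nested_hv(int vm_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_NESTED_HV };

	/* non-zero iff this kernel/host combination can run nested HV guests */
	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_NESTED_HV) <= 0)
		return -1;

	/* vm ioctl; routed to kvm->arch.kvm_ops->enable_nested() per the hunk above */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

Per the check_extension hunk, the capability only reads as available when HV KVM is in use and enable_nested(NULL) reports that nesting can be supported on this host.
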
diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S
index 90e330f21356..0531a1492fdf 100644
--- a/arch/powerpc/kvm/tm.S
+++ b/arch/powerpc/kvm/tm.S
@@ -28,17 +28,25 @@
28 * Save transactional state and TM-related registers. 28 * Save transactional state and TM-related registers.
29 * Called with: 29 * Called with:
30 * - r3 pointing to the vcpu struct 30 * - r3 pointing to the vcpu struct
31 * - r4 points to the MSR with current TS bits: 31 * - r4 containing the MSR with current TS bits:
32 * (For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR). 32 * (For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR).
33 * This can modify all checkpointed registers, but 33 * - r5 containing a flag indicating that non-volatile registers
34 * restores r1, r2 before exit. 34 * must be preserved.
35 * If r5 == 0, this can modify all checkpointed registers, but
36 * restores r1, r2 before exit. If r5 != 0, this restores the
37 * MSR TM/FP/VEC/VSX bits to their state on entry.
35 */ 38 */
36_GLOBAL(__kvmppc_save_tm) 39_GLOBAL(__kvmppc_save_tm)
37 mflr r0 40 mflr r0
38 std r0, PPC_LR_STKOFF(r1) 41 std r0, PPC_LR_STKOFF(r1)
42 stdu r1, -SWITCH_FRAME_SIZE(r1)
43
44 mr r9, r3
45 cmpdi cr7, r5, 0
39 46
40 /* Turn on TM. */ 47 /* Turn on TM. */
41 mfmsr r8 48 mfmsr r8
49 mr r10, r8
42 li r0, 1 50 li r0, 1
43 rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG 51 rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
44 ori r8, r8, MSR_FP 52 ori r8, r8, MSR_FP
@@ -51,6 +59,27 @@ _GLOBAL(__kvmppc_save_tm)
51 std r1, HSTATE_SCRATCH2(r13) 59 std r1, HSTATE_SCRATCH2(r13)
52 std r3, HSTATE_SCRATCH1(r13) 60 std r3, HSTATE_SCRATCH1(r13)
53 61
62 /* Save CR on the stack - even if r5 == 0 we need to get cr7 back. */
63 mfcr r6
64 SAVE_GPR(6, r1)
65
66 /* Save DSCR so we can restore it to avoid running with user value */
67 mfspr r7, SPRN_DSCR
68 SAVE_GPR(7, r1)
69
70 /*
71 * We are going to do treclaim., which will modify all checkpointed
72 * registers. Save the non-volatile registers on the stack if
73 * preservation of non-volatile state has been requested.
74 */
75 beq cr7, 3f
76 SAVE_NVGPRS(r1)
77
78 /* MSR[TS] will be 0 (non-transactional) once we do treclaim. */
79 li r0, 0
80 rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
81 SAVE_GPR(10, r1) /* final MSR value */
823:
54#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 83#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
55BEGIN_FTR_SECTION 84BEGIN_FTR_SECTION
56 /* Emulation of the treclaim instruction needs TEXASR before treclaim */ 85 /* Emulation of the treclaim instruction needs TEXASR before treclaim */
@@ -74,22 +103,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
74 std r9, PACATMSCRATCH(r13) 103 std r9, PACATMSCRATCH(r13)
75 ld r9, HSTATE_SCRATCH1(r13) 104 ld r9, HSTATE_SCRATCH1(r13)
76 105
77 /* Get a few more GPRs free. */ 106 /* Save away PPR soon so we don't run with user value. */
78 std r29, VCPU_GPRS_TM(29)(r9) 107 std r0, VCPU_GPRS_TM(0)(r9)
79 std r30, VCPU_GPRS_TM(30)(r9) 108 mfspr r0, SPRN_PPR
80 std r31, VCPU_GPRS_TM(31)(r9)
81
82 /* Save away PPR and DSCR soon so don't run with user values. */
83 mfspr r31, SPRN_PPR
84 HMT_MEDIUM 109 HMT_MEDIUM
85 mfspr r30, SPRN_DSCR
86#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
87 ld r29, HSTATE_DSCR(r13)
88 mtspr SPRN_DSCR, r29
89#endif
90 110
91 /* Save all but r9, r13 & r29-r31 */ 111 /* Reload stack pointer. */
92 reg = 0 112 std r1, VCPU_GPRS_TM(1)(r9)
113 ld r1, HSTATE_SCRATCH2(r13)
114
115 /* Set MSR RI now we have r1 and r13 back. */
116 std r2, VCPU_GPRS_TM(2)(r9)
117 li r2, MSR_RI
118 mtmsrd r2, 1
119
120 /* Reload TOC pointer. */
121 ld r2, PACATOC(r13)
122
123 /* Save all but r0-r2, r9 & r13 */
124 reg = 3
93 .rept 29 125 .rept 29
94 .if (reg != 9) && (reg != 13) 126 .if (reg != 9) && (reg != 13)
95 std reg, VCPU_GPRS_TM(reg)(r9) 127 std reg, VCPU_GPRS_TM(reg)(r9)
@@ -103,33 +135,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
103 ld r4, PACATMSCRATCH(r13) 135 ld r4, PACATMSCRATCH(r13)
104 std r4, VCPU_GPRS_TM(9)(r9) 136 std r4, VCPU_GPRS_TM(9)(r9)
105 137
106 /* Reload stack pointer and TOC. */ 138 /* Restore host DSCR and CR values, after saving guest values */
107 ld r1, HSTATE_SCRATCH2(r13) 139 mfcr r6
108 ld r2, PACATOC(r13) 140 mfspr r7, SPRN_DSCR
109 141 stw r6, VCPU_CR_TM(r9)
110 /* Set MSR RI now we have r1 and r13 back. */ 142 std r7, VCPU_DSCR_TM(r9)
111 li r5, MSR_RI 143 REST_GPR(6, r1)
112 mtmsrd r5, 1 144 REST_GPR(7, r1)
145 mtcr r6
146 mtspr SPRN_DSCR, r7
113 147
114 /* Save away checkpinted SPRs. */ 148 /* Save away checkpointed SPRs. */
115 std r31, VCPU_PPR_TM(r9) 149 std r0, VCPU_PPR_TM(r9)
116 std r30, VCPU_DSCR_TM(r9)
117 mflr r5 150 mflr r5
118 mfcr r6
119 mfctr r7 151 mfctr r7
120 mfspr r8, SPRN_AMR 152 mfspr r8, SPRN_AMR
121 mfspr r10, SPRN_TAR 153 mfspr r10, SPRN_TAR
122 mfxer r11 154 mfxer r11
123 std r5, VCPU_LR_TM(r9) 155 std r5, VCPU_LR_TM(r9)
124 stw r6, VCPU_CR_TM(r9)
125 std r7, VCPU_CTR_TM(r9) 156 std r7, VCPU_CTR_TM(r9)
126 std r8, VCPU_AMR_TM(r9) 157 std r8, VCPU_AMR_TM(r9)
127 std r10, VCPU_TAR_TM(r9) 158 std r10, VCPU_TAR_TM(r9)
128 std r11, VCPU_XER_TM(r9) 159 std r11, VCPU_XER_TM(r9)
129 160
130 /* Restore r12 as trap number. */
131 lwz r12, VCPU_TRAP(r9)
132
133 /* Save FP/VSX. */ 161 /* Save FP/VSX. */
134 addi r3, r9, VCPU_FPRS_TM 162 addi r3, r9, VCPU_FPRS_TM
135 bl store_fp_state 163 bl store_fp_state
@@ -137,6 +165,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
137 bl store_vr_state 165 bl store_vr_state
138 mfspr r6, SPRN_VRSAVE 166 mfspr r6, SPRN_VRSAVE
139 stw r6, VCPU_VRSAVE_TM(r9) 167 stw r6, VCPU_VRSAVE_TM(r9)
168
169 /* Restore non-volatile registers if requested to */
170 beq cr7, 1f
171 REST_NVGPRS(r1)
172 REST_GPR(10, r1)
1401: 1731:
141 /* 174 /*
142 * We need to save these SPRs after the treclaim so that the software 175 * We need to save these SPRs after the treclaim so that the software
@@ -146,12 +179,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
146 */ 179 */
147 mfspr r7, SPRN_TEXASR 180 mfspr r7, SPRN_TEXASR
148 std r7, VCPU_TEXASR(r9) 181 std r7, VCPU_TEXASR(r9)
14911:
150 mfspr r5, SPRN_TFHAR 182 mfspr r5, SPRN_TFHAR
151 mfspr r6, SPRN_TFIAR 183 mfspr r6, SPRN_TFIAR
152 std r5, VCPU_TFHAR(r9) 184 std r5, VCPU_TFHAR(r9)
153 std r6, VCPU_TFIAR(r9) 185 std r6, VCPU_TFIAR(r9)
154 186
187 /* Restore MSR state if requested */
188 beq cr7, 2f
189 mtmsrd r10, 0
1902:
191 addi r1, r1, SWITCH_FRAME_SIZE
155 ld r0, PPC_LR_STKOFF(r1) 192 ld r0, PPC_LR_STKOFF(r1)
156 mtlr r0 193 mtlr r0
157 blr 194 blr
@@ -161,49 +198,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
161 * be invoked from C function by PR KVM only. 198 * be invoked from C function by PR KVM only.
162 */ 199 */
163_GLOBAL(_kvmppc_save_tm_pr) 200_GLOBAL(_kvmppc_save_tm_pr)
164 mflr r5 201 mflr r0
165 std r5, PPC_LR_STKOFF(r1) 202 std r0, PPC_LR_STKOFF(r1)
166 stdu r1, -SWITCH_FRAME_SIZE(r1) 203 stdu r1, -PPC_MIN_STKFRM(r1)
167 SAVE_NVGPRS(r1)
168
169 /* save MSR since TM/math bits might be impacted
170 * by __kvmppc_save_tm().
171 */
172 mfmsr r5
173 SAVE_GPR(5, r1)
174
175 /* also save DSCR/CR/TAR so that it can be recovered later */
176 mfspr r6, SPRN_DSCR
177 SAVE_GPR(6, r1)
178
179 mfcr r7
180 stw r7, _CCR(r1)
181 204
182 mfspr r8, SPRN_TAR 205 mfspr r8, SPRN_TAR
183 SAVE_GPR(8, r1) 206 std r8, PPC_MIN_STKFRM-8(r1)
184 207
208 li r5, 1 /* preserve non-volatile registers */
185 bl __kvmppc_save_tm 209 bl __kvmppc_save_tm
186 210
187 REST_GPR(8, r1) 211 ld r8, PPC_MIN_STKFRM-8(r1)
188 mtspr SPRN_TAR, r8 212 mtspr SPRN_TAR, r8
189 213
190 ld r7, _CCR(r1) 214 addi r1, r1, PPC_MIN_STKFRM
191 mtcr r7 215 ld r0, PPC_LR_STKOFF(r1)
192 216 mtlr r0
193 REST_GPR(6, r1)
194 mtspr SPRN_DSCR, r6
195
196 /* need preserve current MSR's MSR_TS bits */
197 REST_GPR(5, r1)
198 mfmsr r6
199 rldicl r6, r6, 64 - MSR_TS_S_LG, 62
200 rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
201 mtmsrd r5
202
203 REST_NVGPRS(r1)
204 addi r1, r1, SWITCH_FRAME_SIZE
205 ld r5, PPC_LR_STKOFF(r1)
206 mtlr r5
207 blr 217 blr
208 218
209EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr); 219EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
@@ -215,15 +225,21 @@ EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
215 * - r4 is the guest MSR with desired TS bits: 225 * - r4 is the guest MSR with desired TS bits:
216 * For HV KVM, it is VCPU_MSR 226 * For HV KVM, it is VCPU_MSR
217 * For PR KVM, it is provided by caller 227 * For PR KVM, it is provided by caller
218 * This potentially modifies all checkpointed registers. 228 * - r5 containing a flag indicating that non-volatile registers
219 * It restores r1, r2 from the PACA. 229 * must be preserved.
230 * If r5 == 0, this potentially modifies all checkpointed registers, but
231 * restores r1, r2 from the PACA before exit.
232 * If r5 != 0, this restores the MSR TM/FP/VEC/VSX bits to their state on entry.
220 */ 233 */
221_GLOBAL(__kvmppc_restore_tm) 234_GLOBAL(__kvmppc_restore_tm)
222 mflr r0 235 mflr r0
223 std r0, PPC_LR_STKOFF(r1) 236 std r0, PPC_LR_STKOFF(r1)
224 237
238 cmpdi cr7, r5, 0
239
225 /* Turn on TM/FP/VSX/VMX so we can restore them. */ 240 /* Turn on TM/FP/VSX/VMX so we can restore them. */
226 mfmsr r5 241 mfmsr r5
242 mr r10, r5
227 li r6, MSR_TM >> 32 243 li r6, MSR_TM >> 32
228 sldi r6, r6, 32 244 sldi r6, r6, 32
229 or r5, r5, r6 245 or r5, r5, r6
@@ -244,8 +260,7 @@ _GLOBAL(__kvmppc_restore_tm)
244 260
245 mr r5, r4 261 mr r5, r4
246 rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 262 rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
247 beqlr /* TM not active in guest */ 263 beq 9f /* TM not active in guest */
248 std r1, HSTATE_SCRATCH2(r13)
249 264
250 /* Make sure the failure summary is set, otherwise we'll program check 265 /* Make sure the failure summary is set, otherwise we'll program check
251 * when we trechkpt. It's possible that this might have been not set 266 * when we trechkpt. It's possible that this might have been not set
@@ -256,6 +271,26 @@ _GLOBAL(__kvmppc_restore_tm)
256 mtspr SPRN_TEXASR, r7 271 mtspr SPRN_TEXASR, r7
257 272
258 /* 273 /*
274 * Make a stack frame and save non-volatile registers if requested.
275 */
276 stdu r1, -SWITCH_FRAME_SIZE(r1)
277 std r1, HSTATE_SCRATCH2(r13)
278
279 mfcr r6
280 mfspr r7, SPRN_DSCR
281 SAVE_GPR(2, r1)
282 SAVE_GPR(6, r1)
283 SAVE_GPR(7, r1)
284
285 beq cr7, 4f
286 SAVE_NVGPRS(r1)
287
288 /* MSR[TS] will be 1 (suspended) once we do trechkpt */
289 li r0, 1
290 rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
291 SAVE_GPR(10, r1) /* final MSR value */
2924:
293 /*
259 * We need to load up the checkpointed state for the guest. 294 * We need to load up the checkpointed state for the guest.
260 * We need to do this early as it will blow away any GPRs, VSRs and 295 * We need to do this early as it will blow away any GPRs, VSRs and
261 * some SPRs. 296 * some SPRs.
@@ -291,8 +326,6 @@ _GLOBAL(__kvmppc_restore_tm)
291 ld r29, VCPU_DSCR_TM(r3) 326 ld r29, VCPU_DSCR_TM(r3)
292 ld r30, VCPU_PPR_TM(r3) 327 ld r30, VCPU_PPR_TM(r3)
293 328
294 std r2, PACATMSCRATCH(r13) /* Save TOC */
295
296 /* Clear the MSR RI since r1, r13 are all going to be foobar. */ 329 /* Clear the MSR RI since r1, r13 are all going to be foobar. */
297 li r5, 0 330 li r5, 0
298 mtmsrd r5, 1 331 mtmsrd r5, 1
@@ -318,18 +351,31 @@ _GLOBAL(__kvmppc_restore_tm)
318 /* Now let's get back the state we need. */ 351 /* Now let's get back the state we need. */
319 HMT_MEDIUM 352 HMT_MEDIUM
320 GET_PACA(r13) 353 GET_PACA(r13)
321#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
322 ld r29, HSTATE_DSCR(r13)
323 mtspr SPRN_DSCR, r29
324#endif
325 ld r1, HSTATE_SCRATCH2(r13) 354 ld r1, HSTATE_SCRATCH2(r13)
326 ld r2, PACATMSCRATCH(r13) 355 REST_GPR(7, r1)
356 mtspr SPRN_DSCR, r7
327 357
328 /* Set the MSR RI since we have our registers back. */ 358 /* Set the MSR RI since we have our registers back. */
329 li r5, MSR_RI 359 li r5, MSR_RI
330 mtmsrd r5, 1 360 mtmsrd r5, 1
361
362 /* Restore TOC pointer and CR */
363 REST_GPR(2, r1)
364 REST_GPR(6, r1)
365 mtcr r6
366
367 /* Restore non-volatile registers if requested to. */
368 beq cr7, 5f
369 REST_GPR(10, r1)
370 REST_NVGPRS(r1)
371
3725: addi r1, r1, SWITCH_FRAME_SIZE
331 ld r0, PPC_LR_STKOFF(r1) 373 ld r0, PPC_LR_STKOFF(r1)
332 mtlr r0 374 mtlr r0
375
3769: /* Restore MSR bits if requested */
377 beqlr cr7
378 mtmsrd r10, 0
333 blr 379 blr
334 380
335/* 381/*
@@ -337,47 +383,23 @@ _GLOBAL(__kvmppc_restore_tm)
337 * can be invoked from C function by PR KVM only. 383 * can be invoked from C function by PR KVM only.
338 */ 384 */
339_GLOBAL(_kvmppc_restore_tm_pr) 385_GLOBAL(_kvmppc_restore_tm_pr)
340 mflr r5 386 mflr r0
341 std r5, PPC_LR_STKOFF(r1) 387 std r0, PPC_LR_STKOFF(r1)
342 stdu r1, -SWITCH_FRAME_SIZE(r1) 388 stdu r1, -PPC_MIN_STKFRM(r1)
343 SAVE_NVGPRS(r1)
344
345 /* save MSR to avoid TM/math bits change */
346 mfmsr r5
347 SAVE_GPR(5, r1)
348
349 /* also save DSCR/CR/TAR so that it can be recovered later */
350 mfspr r6, SPRN_DSCR
351 SAVE_GPR(6, r1)
352
353 mfcr r7
354 stw r7, _CCR(r1)
355 389
390 /* save TAR so that it can be recovered later */
356 mfspr r8, SPRN_TAR 391 mfspr r8, SPRN_TAR
357 SAVE_GPR(8, r1) 392 std r8, PPC_MIN_STKFRM-8(r1)
358 393
394 li r5, 1
359 bl __kvmppc_restore_tm 395 bl __kvmppc_restore_tm
360 396
361 REST_GPR(8, r1) 397 ld r8, PPC_MIN_STKFRM-8(r1)
362 mtspr SPRN_TAR, r8 398 mtspr SPRN_TAR, r8
363 399
364 ld r7, _CCR(r1) 400 addi r1, r1, PPC_MIN_STKFRM
365 mtcr r7 401 ld r0, PPC_LR_STKOFF(r1)
366 402 mtlr r0
367 REST_GPR(6, r1)
368 mtspr SPRN_DSCR, r6
369
370 /* need preserve current MSR's MSR_TS bits */
371 REST_GPR(5, r1)
372 mfmsr r6
373 rldicl r6, r6, 64 - MSR_TS_S_LG, 62
374 rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
375 mtmsrd r5
376
377 REST_NVGPRS(r1)
378 addi r1, r1, SWITCH_FRAME_SIZE
379 ld r5, PPC_LR_STKOFF(r1)
380 mtlr r5
381 blr 403 blr
382 404
383EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr); 405EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr);
diff --git a/arch/powerpc/kvm/trace_book3s.h b/arch/powerpc/kvm/trace_book3s.h
index f3b23759e017..372a82fa2de3 100644
--- a/arch/powerpc/kvm/trace_book3s.h
+++ b/arch/powerpc/kvm/trace_book3s.h
@@ -14,7 +14,6 @@
14 {0x400, "INST_STORAGE"}, \ 14 {0x400, "INST_STORAGE"}, \
15 {0x480, "INST_SEGMENT"}, \ 15 {0x480, "INST_SEGMENT"}, \
16 {0x500, "EXTERNAL"}, \ 16 {0x500, "EXTERNAL"}, \
17 {0x501, "EXTERNAL_LEVEL"}, \
18 {0x502, "EXTERNAL_HV"}, \ 17 {0x502, "EXTERNAL_HV"}, \
19 {0x600, "ALIGNMENT"}, \ 18 {0x600, "ALIGNMENT"}, \
20 {0x700, "PROGRAM"}, \ 19 {0x700, "PROGRAM"}, \
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index fef3e1eb3a19..4c4dfc473800 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -833,6 +833,15 @@ EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
833/* 833/*
834 * Flush partition scoped translations from LPID (=LPIDR) 834 * Flush partition scoped translations from LPID (=LPIDR)
835 */ 835 */
836void radix__flush_tlb_lpid(unsigned int lpid)
837{
838 _tlbie_lpid(lpid, RIC_FLUSH_ALL);
839}
840EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
841
842/*
843 * Flush partition scoped translations from LPID (=LPIDR)
844 */
836void radix__local_flush_tlb_lpid(unsigned int lpid) 845void radix__local_flush_tlb_lpid(unsigned int lpid)
837{ 846{
838 _tlbiel_lpid(lpid, RIC_FLUSH_ALL); 847 _tlbiel_lpid(lpid, RIC_FLUSH_ALL);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 251be353f950..7f2ff3a76995 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -719,6 +719,7 @@ struct kvm_ppc_one_seg_page_size {
719 719
720#define KVM_PPC_PAGE_SIZES_REAL 0x00000001 720#define KVM_PPC_PAGE_SIZES_REAL 0x00000001
721#define KVM_PPC_1T_SEGMENTS 0x00000002 721#define KVM_PPC_1T_SEGMENTS 0x00000002
722#define KVM_PPC_NO_HASH 0x00000004
722 723
723struct kvm_ppc_smmu_info { 724struct kvm_ppc_smmu_info {
724 __u64 flags; 725 __u64 flags;
@@ -953,6 +954,7 @@ struct kvm_ppc_resize_hpt {
953#define KVM_CAP_NESTED_STATE 157 954#define KVM_CAP_NESTED_STATE 157
954#define KVM_CAP_ARM_INJECT_SERROR_ESR 158 955#define KVM_CAP_ARM_INJECT_SERROR_ESR 158
955#define KVM_CAP_MSR_PLATFORM_INFO 159 956#define KVM_CAP_MSR_PLATFORM_INFO 159
957#define KVM_CAP_PPC_NESTED_HV 160
956 958
957#ifdef KVM_CAP_IRQ_ROUTING 959#ifdef KVM_CAP_IRQ_ROUTING
958 960
diff --git a/tools/perf/arch/powerpc/util/book3s_hv_exits.h b/tools/perf/arch/powerpc/util/book3s_hv_exits.h
index 853b95d1e139..2011376c7ab5 100644
--- a/tools/perf/arch/powerpc/util/book3s_hv_exits.h
+++ b/tools/perf/arch/powerpc/util/book3s_hv_exits.h
@@ -15,7 +15,6 @@
15 {0x400, "INST_STORAGE"}, \ 15 {0x400, "INST_STORAGE"}, \
16 {0x480, "INST_SEGMENT"}, \ 16 {0x480, "INST_SEGMENT"}, \
17 {0x500, "EXTERNAL"}, \ 17 {0x500, "EXTERNAL"}, \
18 {0x501, "EXTERNAL_LEVEL"}, \
19 {0x502, "EXTERNAL_HV"}, \ 18 {0x502, "EXTERNAL_HV"}, \
20 {0x600, "ALIGNMENT"}, \ 19 {0x600, "ALIGNMENT"}, \
21 {0x700, "PROGRAM"}, \ 20 {0x700, "PROGRAM"}, \