author	Paul Mackerras <paulus@ozlabs.org>	2017-01-31 03:21:26 -0500
committer	Paul Mackerras <paulus@ozlabs.org>	2017-01-31 03:21:26 -0500
commit	167c76e05591c2b656c0f329282f453dd46f4ea5 (patch)
tree	25a0af78d28cbec1decab6ea31360882a31426d1
parent	fcd4f3c6d150357a02af8526e69bfebb82dd5d46 (diff)
parent	8cf4ecc0ca9bd9bdc9b4ca0a99f7445a1e74afed (diff)

Merge remote-tracking branch 'remotes/powerpc/topic/ppc-kvm' into kvm-ppc-next

This merges in the POWER9 radix MMU host and guest support, which was
put into a topic branch because it touches both powerpc and KVM code.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
 Documentation/virtual/kvm/api.txt          |  83
 arch/powerpc/include/asm/book3s/64/mmu.h   |  18
 arch/powerpc/include/asm/exception-64s.h   |  75
 arch/powerpc/include/asm/head-64.h         |   2
 arch/powerpc/include/asm/hvcall.h          |  11
 arch/powerpc/include/asm/kvm_book3s.h      |  26
 arch/powerpc/include/asm/kvm_book3s_64.h   |   6
 arch/powerpc/include/asm/kvm_host.h        |   6
 arch/powerpc/include/asm/kvm_ppc.h         |   2
 arch/powerpc/include/asm/prom.h            |  17
 arch/powerpc/include/asm/reg.h             |   4
 arch/powerpc/include/uapi/asm/kvm.h        |  20
 arch/powerpc/kernel/asm-offsets.c          |   2
 arch/powerpc/kernel/exceptions-64s.S       |  61
 arch/powerpc/kernel/prom_init.c            |  18
 arch/powerpc/kvm/Makefile                  |   3
 arch/powerpc/kvm/book3s.c                  |   1
 arch/powerpc/kvm/book3s_64_mmu_hv.c        | 110
 arch/powerpc/kvm/book3s_64_mmu_radix.c     | 716
 arch/powerpc/kvm/book3s_hv.c               | 205
 arch/powerpc/kvm/book3s_hv_builtin.c       |  38
 arch/powerpc/kvm/book3s_hv_rm_mmu.c        |  25
 arch/powerpc/kvm/book3s_hv_rm_xics.c       |   8
 arch/powerpc/kvm/book3s_hv_rmhandlers.S    | 154
 arch/powerpc/kvm/book3s_segment.S          |  32
 arch/powerpc/kvm/powerpc.c                 |  32
 arch/powerpc/mm/init-common.c              |   3
 arch/powerpc/mm/init_64.c                  |  35
 arch/powerpc/mm/pgtable-radix.c            |   2
 arch/powerpc/mm/pgtable_64.c               |  16
 arch/powerpc/platforms/pseries/firmware.c  |   2
 arch/powerpc/platforms/pseries/lpar.c      |  29
 include/uapi/linux/kvm.h                   |   6
 33 files changed, 1585 insertions(+), 183 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 03145b7cafaa..4470671b0c26 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3201,6 +3201,71 @@ struct kvm_reinject_control {
 pit_reinject = 0 (!reinject mode) is recommended, unless running an old
 operating system that uses the PIT for timing (e.g. Linux 2.4.x).
 
+4.99 KVM_PPC_CONFIGURE_V3_MMU
+
+Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_ppc_mmuv3_cfg (in)
+Returns: 0 on success,
+         -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read,
+         -EINVAL if the configuration is invalid
+
+This ioctl controls whether the guest will use radix or HPT (hashed
+page table) translation, and sets the pointer to the process table for
+the guest.
+
+struct kvm_ppc_mmuv3_cfg {
+	__u64	flags;
+	__u64	process_table;
+};
+
+There are two bits that can be set in flags: KVM_PPC_MMUV3_RADIX and
+KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest
+to use radix tree translation, and if clear, to use HPT translation.
+KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest
+to be able to use the global TLB and SLB invalidation instructions;
+if clear, the guest may not use these instructions.
+
+The process_table field specifies the address and size of the guest
+process table, which is in the guest's space. This field is formatted
+as the second doubleword of the partition table entry, as defined in
+the Power ISA V3.00, Book III section 5.7.6.1.
+
+4.100 KVM_PPC_GET_RMMU_INFO
+
+Capability: KVM_CAP_PPC_RADIX_MMU
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_ppc_rmmu_info (out)
+Returns: 0 on success,
+         -EFAULT if struct kvm_ppc_rmmu_info cannot be written,
+         -EINVAL if no useful information can be returned
+
+This ioctl returns a structure containing two things: (a) a list
+containing supported radix tree geometries, and (b) a list that maps
+page sizes to put in the "AP" (actual page size) field for the tlbie
+(TLB invalidate entry) instruction.
+
+struct kvm_ppc_rmmu_info {
+	struct kvm_ppc_radix_geom {
+		__u8	page_shift;
+		__u8	level_bits[4];
+		__u8	pad[3];
+	}	geometries[8];
+	__u32	ap_encodings[8];
+};
+
+The geometries[] field gives up to 8 supported geometries for the
+radix page table, in terms of the log base 2 of the smallest page
+size, and the number of bits indexed at each level of the tree, from
+the PTE level up to the PGD level in that order. Any unused entries
+will have 0 in the page_shift field.
+
+The ap_encodings field gives the supported page sizes and their AP
+field encodings, encoded with the AP value in the top 3 bits and the
+log base 2 of the page size in the bottom 6 bits.
+
 5. The kvm_run structure
 ------------------------
 
@@ -3942,3 +4007,21 @@ In order to use SynIC, it has to be activated by setting this
 capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this
 will disable the use of APIC hardware virtualization even if supported
 by the CPU, as it's incompatible with SynIC auto-EOI behavior.
+
+8.3 KVM_CAP_PPC_RADIX_MMU
+
+Architectures: ppc
+
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that the kernel can support guests using the
+radix MMU defined in Power ISA V3.00 (as implemented in the POWER9
+processor).
+
+8.4 KVM_CAP_PPC_HASH_MMU_V3
+
+Architectures: ppc
+
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that the kernel can support guests using the
+hashed page table MMU defined in Power ISA V3.00 (as implemented in
+the POWER9 processor), including in-memory segment tables.
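
As a usage sketch of the two ioctls documented above: a minimal caller
might look like the following, assuming a vm_fd obtained from
KVM_CREATE_VM; the helper name and error handling are illustrative,
not part of this patch.

	/* Hypothetical caller of KVM_PPC_CONFIGURE_V3_MMU (sketch only). */
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int configure_radix(int vm_fd, __u64 process_table)
	{
		struct kvm_ppc_mmuv3_cfg cfg = {
			.flags = KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE,
			/* patb1-format doubleword: process table base and size */
			.process_table = process_table,
		};

		if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_RADIX_MMU) <= 0)
			return -1;	/* host cannot run radix guests */
		return ioctl(vm_fd, KVM_PPC_CONFIGURE_V3_MMU, &cfg);
	}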
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 8afb0e00f7d9..d73e9dfa5237 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -44,10 +44,20 @@ struct patb_entry {
 };
 extern struct patb_entry *partition_tb;
 
+/* Bits in patb0 field */
 #define PATB_HR		(1UL << 63)
-#define PATB_GR		(1UL << 63)
 #define RPDB_MASK	0x0ffffffffffff00fUL
 #define RPDB_SHIFT	(1UL << 8)
+#define RTS1_SHIFT	61		/* top 2 bits of radix tree size */
+#define RTS1_MASK	(3UL << RTS1_SHIFT)
+#define RTS2_SHIFT	5		/* bottom 3 bits of radix tree size */
+#define RTS2_MASK	(7UL << RTS2_SHIFT)
+#define RPDS_MASK	0x1f		/* root page dir. size field */
+
+/* Bits in patb1 field */
+#define PATB_GR		(1UL << 63)	/* guest uses radix; must match HR */
+#define PRTS_MASK	0x1f		/* process table size field */
+
 /*
  * Limit process table to PAGE_SIZE table. This
  * also limit the max pid we can support.
@@ -138,5 +148,11 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 extern int (*register_process_table)(unsigned long base, unsigned long page_size,
				     unsigned long tbl_size);
 
+#ifdef CONFIG_PPC_PSERIES
+extern void radix_init_pseries(void);
+#else
+static inline void radix_init_pseries(void) { };
+#endif
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */
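
The new patb0 bit definitions combine as in this sketch, which mirrors
the decode done by kvmppc_mmu_radix_xlate() later in this merge;
decode_patb0() itself is a hypothetical helper, not part of the patch.

	/* Hypothetical decode of a patb0 doubleword using the new masks. */
	static void decode_patb0(unsigned long patb0)
	{
		/* RTS is split: 2 high bits in RTS1, 3 low bits in RTS2 */
		unsigned long rts = ((patb0 & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
				    ((patb0 & RTS2_MASK) >> RTS2_SHIFT);
		unsigned long root = patb0 & RPDB_MASK;	/* root page dir base */
		unsigned long rpds = patb0 & RPDS_MASK;	/* bits indexed at top level */

		/* address space size is 2^(rts + 31); a 52-bit space => rts = 21 */
		pr_info("radix: %lu-bit space, root @ 0x%lx, %lu top-level bits\n",
			rts + 31, root, rpds);
	}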
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 9a3eee661297..8fa09fa500f0 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -97,6 +97,15 @@
 	ld	reg,PACAKBASE(r13);					\
 	ori	reg,reg,(ABS_ADDR(label))@l;
 
+/*
+ * Branches from unrelocated code (e.g., interrupts) to labels outside
+ * head-y require >64K offsets.
+ */
+#define __LOAD_FAR_HANDLER(reg, label)					\
+	ld	reg,PACAKBASE(r13);					\
+	ori	reg,reg,(ABS_ADDR(label))@l;				\
+	addis	reg,reg,(ABS_ADDR(label))@h;
+
 /* Exception register prefixes */
 #define EXC_HV	H
 #define EXC_STD
@@ -227,13 +236,41 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	mtctr	reg;							\
 	bctr
 
+/*
+ * KVM requires __LOAD_FAR_HANDLER.
+ *
+ * __BRANCH_TO_KVM_EXIT branches are also a special case because they
+ * explicitly use r9 then reload it from PACA before branching. Hence
+ * the double-underscore.
+ */
+#define __BRANCH_TO_KVM_EXIT(area, label)				\
+	mfctr	r9;							\
+	std	r9,HSTATE_SCRATCH1(r13);				\
+	__LOAD_FAR_HANDLER(r9, label);					\
+	mtctr	r9;							\
+	ld	r9,area+EX_R9(r13);					\
+	bctr
+
+#define BRANCH_TO_KVM(reg, label)					\
+	__LOAD_FAR_HANDLER(reg, label);					\
+	mtctr	reg;							\
+	bctr
+
 #else
 #define BRANCH_TO_COMMON(reg, label)					\
 	b	label
 
+#define BRANCH_TO_KVM(reg, label)					\
+	b	label
+
+#define __BRANCH_TO_KVM_EXIT(area, label)				\
+	ld	r9,area+EX_R9(r13);					\
+	b	label
+
 #endif
 
-#define __KVM_HANDLER_PROLOG(area, n)					\
+
+#define __KVM_HANDLER(area, h, n)					\
 	BEGIN_FTR_SECTION_NESTED(947)					\
 	ld	r10,area+EX_CFAR(r13);					\
 	std	r10,HSTATE_CFAR(r13);					\
@@ -243,30 +280,28 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	std	r10,HSTATE_PPR(r13);					\
 END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948);		\
 	ld	r10,area+EX_R10(r13);					\
-	stw	r9,HSTATE_SCRATCH1(r13);				\
-	ld	r9,area+EX_R9(r13);					\
 	std	r12,HSTATE_SCRATCH0(r13);				\
-
-#define __KVM_HANDLER(area, h, n)					\
-	__KVM_HANDLER_PROLOG(area, n)					\
-	li	r12,n;							\
-	b	kvmppc_interrupt
+	sldi	r12,r9,32;						\
+	ori	r12,r12,(n);						\
+	/* This reloads r9 before branching to kvmppc_interrupt */	\
+	__BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt)
 
 #define __KVM_HANDLER_SKIP(area, h, n)					\
 	cmpwi	r10,KVM_GUEST_MODE_SKIP;				\
-	ld	r10,area+EX_R10(r13);					\
 	beq	89f;							\
-	stw	r9,HSTATE_SCRATCH1(r13);				\
 	BEGIN_FTR_SECTION_NESTED(948)					\
-	ld	r9,area+EX_PPR(r13);					\
-	std	r9,HSTATE_PPR(r13);					\
+	ld	r10,area+EX_PPR(r13);					\
+	std	r10,HSTATE_PPR(r13);					\
 	END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948);	\
-	ld	r9,area+EX_R9(r13);					\
+	ld	r10,area+EX_R10(r13);					\
 	std	r12,HSTATE_SCRATCH0(r13);				\
-	li	r12,n;							\
-	b	kvmppc_interrupt;					\
+	sldi	r12,r9,32;						\
+	ori	r12,r12,(n);						\
+	/* This reloads r9 before branching to kvmppc_interrupt */	\
+	__BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt);			\
 89:	mtocrf	0x80,r9;						\
 	ld	r9,area+EX_R9(r13);					\
+	ld	r10,area+EX_R10(r13);					\
 	b	kvmppc_skip_##h##interrupt
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
@@ -393,12 +428,12 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_STD)
 
 #define STD_RELON_EXCEPTION_HV(loc, vec, label)			\
-	/* No guest interrupts come through here */			\
 	SET_SCRATCH0(r13);		/* save r13 */			\
-	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label, EXC_HV, NOTEST, vec);
+	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label,		\
+				       EXC_HV, KVMTEST_HV, vec);
 
 #define STD_RELON_EXCEPTION_HV_OOL(vec, label)				\
-	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec);			\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, vec);		\
 	EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV)
 
 /* This associate vector numbers with bits in paca->irq_happened */
@@ -475,10 +510,10 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 #define MASKABLE_RELON_EXCEPTION_HV(loc, vec, label)			\
 	_MASKABLE_RELON_EXCEPTION_PSERIES(vec, label,			\
-					  EXC_HV, SOFTEN_NOTEST_HV)
+					  EXC_HV, SOFTEN_TEST_HV)
 
 #define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label)			\
-	EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_NOTEST_HV, vec);		\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec);		\
 	EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV)
 
 /*
diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h
index fca7033839a9..9bd81619d090 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -218,7 +218,7 @@ name:
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 #define TRAMP_KVM_BEGIN(name)						\
-	TRAMP_REAL_BEGIN(name)
+	TRAMP_VIRT_BEGIN(name)
 #else
 #define TRAMP_KVM_BEGIN(name)
 #endif
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 77ff1ba99d1f..54d11b3a6bf7 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -276,6 +276,7 @@
 #define H_GET_MPP_X		0x314
 #define H_SET_MODE		0x31C
 #define H_CLEAR_HPT		0x358
+#define H_REGISTER_PROC_TBL	0x37C
 #define H_SIGNAL_SYS_RESET	0x380
 #define MAX_HCALL_OPCODE	H_SIGNAL_SYS_RESET
 
@@ -313,6 +314,16 @@
 #define H_SIGNAL_SYS_RESET_ALL_OTHERS		-2
 /* >= 0 values are CPU number */
 
+/* Flag values used in H_REGISTER_PROC_TBL hcall */
+#define PROC_TABLE_OP_MASK	0x18
+#define PROC_TABLE_DEREG	0x10
+#define PROC_TABLE_NEW		0x18
+#define PROC_TABLE_TYPE_MASK	0x06
+#define PROC_TABLE_HPT_SLB	0x00
+#define PROC_TABLE_HPT_PT	0x02
+#define PROC_TABLE_RADIX	0x04
+#define PROC_TABLE_GTSE		0x01
+
 #ifndef __ASSEMBLY__
 
 /**
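
The H_REGISTER_PROC_TBL flag values compose as in this sketch of a
radix process table registration; the argument order follows the
plpar_hcall_norets() pattern used by the pseries code elsewhere in
this merge, and the helper itself is an assumption, not part of the
patch.

	/* Sketch: register a radix process table with the hypervisor. */
	static long register_radix_proc_table(unsigned long base,
					      unsigned long page_size,
					      unsigned long table_size)
	{
		unsigned long flags = PROC_TABLE_NEW | PROC_TABLE_RADIX |
				      PROC_TABLE_GTSE;

		return plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
					  page_size, table_size);
	}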
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 5cf306ae0ac3..2bf35017ffc0 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -170,6 +170,8 @@ extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run,
 			unsigned long status);
 extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr,
 			unsigned long slb_v, unsigned long valid);
+extern int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			unsigned long gpa, gva_t ea, int is_store);
 
 extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
@@ -182,6 +184,25 @@ extern void kvmppc_mmu_hpte_sysexit(void);
 extern int kvmppc_mmu_hv_init(void);
 extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
 
+extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
+			struct kvm_vcpu *vcpu,
+			unsigned long ea, unsigned long dsisr);
+extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+			struct kvmppc_pte *gpte, bool data, bool iswrite);
+extern int kvmppc_init_vm_radix(struct kvm *kvm);
+extern void kvmppc_free_radix(struct kvm *kvm);
+extern int kvmppc_radix_init(void);
+extern void kvmppc_radix_exit(void);
+extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			unsigned long gfn);
+extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			unsigned long gfn);
+extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			unsigned long gfn);
+extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
+			struct kvm_memory_slot *memslot, unsigned long *map);
+extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
+
 /* XXX remove this export when load_last_inst() is generic */
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
@@ -211,8 +232,11 @@ extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 			unsigned long pte_index, unsigned long avpn,
 			unsigned long *hpret);
-extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
+extern long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
 			struct kvm_memory_slot *memslot, unsigned long *map);
+extern void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
+			struct kvm_memory_slot *memslot,
+			unsigned long *map);
 extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr,
 			unsigned long mask);
 extern void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 848292176908..0db010cc4e65 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -36,6 +36,12 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
 #endif
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+
+static inline bool kvm_is_radix(struct kvm *kvm)
+{
+	return kvm->arch.radix;
+}
+
 #define KVM_DEFAULT_HPT_ORDER	24	/* 16MB HPT by default */
 #endif
 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e59b172666cd..b2dbeac3f450 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -263,7 +263,11 @@ struct kvm_arch {
 	unsigned long hpt_mask;
 	atomic_t hpte_mod_interest;
 	cpumask_t need_tlb_flush;
+	cpumask_t cpu_in_guest;
 	int hpt_cma_alloc;
+	u8 radix;
+	pgd_t *pgtable;
+	u64 process_table;
 	struct dentry *debugfs_dir;
 	struct dentry *htab_dentry;
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
@@ -603,6 +607,7 @@ struct kvm_vcpu_arch {
 	ulong fault_dar;
 	u32 fault_dsisr;
 	unsigned long intr_msr;
+	ulong fault_gpa;	/* guest real address of page fault (POWER9) */
 #endif
 
 #ifdef CONFIG_BOOKE
@@ -657,6 +662,7 @@ struct kvm_vcpu_arch {
 	int state;
 	int ptid;
 	int thread_cpu;
+	int prev_cpu;
 	bool timer_running;
 	wait_queue_head_t cpu_run;
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2da67bf1f2ec..48c760f89590 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -291,6 +291,8 @@ struct kvmppc_ops {
 				     struct irq_bypass_producer *);
 	void (*irq_bypass_del_producer)(struct irq_bypass_consumer *,
 					struct irq_bypass_producer *);
+	int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg);
+	int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 5e57705b4759..8af2546ea593 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -121,6 +121,8 @@ struct of_drconf_cell {
 #define OV1_PPC_2_06		0x02	/* set if we support PowerPC 2.06 */
 #define OV1_PPC_2_07		0x01	/* set if we support PowerPC 2.07 */
 
+#define OV1_PPC_3_00		0x80	/* set if we support PowerPC 3.00 */
+
 /* Option vector 2: Open Firmware options supported */
 #define OV2_REAL_MODE		0x20	/* set if we want OF in real mode */
 
@@ -151,10 +153,17 @@ struct of_drconf_cell {
 #define OV5_XCMO		0x0440	/* Page Coalescing */
 #define OV5_TYPE1_AFFINITY	0x0580	/* Type 1 NUMA affinity */
 #define OV5_PRRN		0x0540	/* Platform Resource Reassignment */
-#define OV5_PFO_HW_RNG		0x0E80	/* PFO Random Number Generator */
-#define OV5_PFO_HW_842		0x0E40	/* PFO Compression Accelerator */
-#define OV5_PFO_HW_ENCR		0x0E20	/* PFO Encryption Accelerator */
-#define OV5_SUB_PROCESSORS	0x0F01	/* 1,2,or 4 Sub-Processors supported */
+#define OV5_PFO_HW_RNG		0x1180	/* PFO Random Number Generator */
+#define OV5_PFO_HW_842		0x1140	/* PFO Compression Accelerator */
+#define OV5_PFO_HW_ENCR		0x1120	/* PFO Encryption Accelerator */
+#define OV5_SUB_PROCESSORS	0x1501	/* 1,2,or 4 Sub-Processors supported */
+#define OV5_XIVE_EXPLOIT	0x1701	/* XIVE exploitation supported */
+#define OV5_MMU_RADIX_300	0x1880	/* ISA v3.00 radix MMU supported */
+#define OV5_MMU_HASH_300	0x1840	/* ISA v3.00 hash MMU supported */
+#define OV5_MMU_SEGM_RADIX	0x1820	/* radix mode (no segmentation) */
+#define OV5_MMU_PROC_TBL	0x1810	/* hcall selects SLB or proc table */
+#define OV5_MMU_SLB		0x1800	/* always use SLB */
+#define OV5_MMU_GTSE		0x1808	/* Guest translation shootdown */
 
 /* Option Vector 6: IBM PAPR hints */
 #define OV6_LINUX		0x02	/* Linux is our OS */
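
The OV5 values above pack a byte index into the upper bits and a bit
mask into the low byte, which is what the OV5_FEAT() use in the
prom_init.c hunk below relies on; ov5_is_set() here is a hypothetical
illustration of that split, not part of the patch.

	/* Hypothetical test of an OV5 feature bit against a vector-5 buffer. */
	static int ov5_is_set(const unsigned char *vec5, unsigned int feat)
	{
		return vec5[feat >> 8] & (feat & 0xff);	/* byte index, bit mask */
	}

	/* e.g. OV5_MMU_RADIX_300 (0x1880) tests bit 0x80 of byte 0x18 */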
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 0d4531aa2052..aa44a83ad3ec 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -274,10 +274,14 @@
 #define SPRN_DSISR	0x012	/* Data Storage Interrupt Status Register */
 #define   DSISR_NOHPTE		0x40000000	/* no translation found */
 #define   DSISR_PROTFAULT	0x08000000	/* protection fault */
+#define   DSISR_BADACCESS	0x04000000	/* bad access to CI or G */
 #define   DSISR_ISSTORE		0x02000000	/* access was a store */
 #define   DSISR_DABRMATCH	0x00400000	/* hit data breakpoint */
 #define   DSISR_NOSEGMENT	0x00200000	/* SLB miss */
 #define   DSISR_KEYFAULT	0x00200000	/* Key fault */
+#define   DSISR_UNSUPP_MMU	0x00080000	/* Unsupported MMU config */
+#define   DSISR_SET_RC		0x00040000	/* Failed setting of R/C bits */
+#define   DSISR_PGDIRFAULT	0x00020000	/* Fault on page directory */
 #define SPRN_TBRL	0x10C	/* Time Base Read Lower Register (user, R/O) */
 #define SPRN_TBRU	0x10D	/* Time Base Read Upper Register (user, R/O) */
 #define SPRN_CIR	0x11B	/* Chip Information Register (hyper, R/0) */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index e3db3a50127b..4edbe4bb0e8b 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -413,6 +413,26 @@ struct kvm_get_htab_header {
 	__u16	n_invalid;
 };
 
+/* For KVM_PPC_CONFIGURE_V3_MMU */
+struct kvm_ppc_mmuv3_cfg {
+	__u64	flags;
+	__u64	process_table;	/* second doubleword of partition table entry */
+};
+
+/* Flag values for KVM_PPC_CONFIGURE_V3_MMU */
+#define KVM_PPC_MMUV3_RADIX	1	/* 1 = radix mode, 0 = HPT */
+#define KVM_PPC_MMUV3_GTSE	2	/* global translation shootdown enb. */
+
+/* For KVM_PPC_GET_RMMU_INFO */
+struct kvm_ppc_rmmu_info {
+	struct kvm_ppc_radix_geom {
+		__u8	page_shift;
+		__u8	level_bits[4];
+		__u8	pad[3];
+	}	geometries[8];
+	__u32	ap_encodings[8];
+};
+
 /* Per-vcpu XICS interrupt controller state */
 #define KVM_REG_PPC_ICP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
 
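
The uapi structures above can be decoded as in the following sketch,
which assumes a vm_fd and treats zero entries as list terminators, as
the api.txt text describes; dump_rmmu_info() is hypothetical.

	/* Hypothetical decode of KVM_PPC_GET_RMMU_INFO output (sketch only). */
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static void dump_rmmu_info(int vm_fd)
	{
		struct kvm_ppc_rmmu_info info;
		int i;

		if (ioctl(vm_fd, KVM_PPC_GET_RMMU_INFO, &info) < 0)
			return;
		for (i = 0; i < 8 && info.geometries[i].page_shift; ++i)
			printf("geometry %d: page shift %u, level bits %u/%u/%u/%u\n",
			       i, info.geometries[i].page_shift,
			       info.geometries[i].level_bits[0],
			       info.geometries[i].level_bits[1],
			       info.geometries[i].level_bits[2],
			       info.geometries[i].level_bits[3]);
		for (i = 0; i < 8 && info.ap_encodings[i]; ++i)
			printf("AP %u -> page size 2^%u\n",
			       info.ap_encodings[i] >> 29,	/* top 3 bits */
			       info.ap_encodings[i] & 0x3f);	/* low 6 bits */
	}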
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 0601e6a7297c..3afa0ad9837f 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -498,6 +498,7 @@ int main(void)
 	DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
 	DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls));
 	DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
+	DEFINE(KVM_RADIX, offsetof(struct kvm, arch.radix));
 	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
 	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
@@ -537,6 +538,7 @@ int main(void)
 	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
 	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+	DEFINE(VCPU_FAULT_GPA, offsetof(struct kvm_vcpu, arch.fault_gpa));
 	DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index d39d6118c6e9..34a04a5fa468 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -142,7 +142,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
 	cmpwi	r0,0
 	beq	1f
-	b	kvm_start_guest
+	BRANCH_TO_KVM(r10, kvm_start_guest)
 1:
 #endif
 
@@ -717,13 +717,9 @@ hardware_interrupt_hv:
 	BEGIN_FTR_SECTION
 		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common,
 					    EXC_HV, SOFTEN_TEST_HV)
-do_kvm_H0x500:
-	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
 	FTR_SECTION_ELSE
 		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common,
 					    EXC_STD, SOFTEN_TEST_PR)
-do_kvm_0x500:
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 EXC_REAL_END(hardware_interrupt, 0x500, 0x600)
 
@@ -737,6 +733,8 @@ hardware_interrupt_relon_hv:
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x4600)
 
+TRAMP_KVM(PACA_EXGEN, 0x500)
+TRAMP_KVM_HV(PACA_EXGEN, 0x500)
 EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ)
 
 
@@ -832,6 +830,31 @@ EXC_VIRT(trap_0b, 0x4b00, 0x4c00, 0xb00)
 TRAMP_KVM(PACA_EXGEN, 0xb00)
 EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
 
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	/*
+	 * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems
+	 * that support it) before changing to HMT_MEDIUM. That allows the KVM
+	 * code to save that value into the guest state (it is the guest's PPR
+	 * value). Otherwise just change to HMT_MEDIUM as userspace has
+	 * already saved the PPR.
+	 */
+#define SYSCALL_KVMTEST							\
+	SET_SCRATCH0(r13);						\
+	GET_PACA(r13);							\
+	std	r9,PACA_EXGEN+EX_R9(r13);				\
+	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR);			\
+	HMT_MEDIUM;							\
+	std	r10,PACA_EXGEN+EX_R10(r13);				\
+	OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR);	\
+	mfcr	r9;							\
+	KVMTEST_PR(0xc00);						\
+	GET_SCRATCH0(r13)
+
+#else
+#define SYSCALL_KVMTEST							\
+	HMT_MEDIUM
+#endif
+
 #define LOAD_SYSCALL_HANDLER(reg)					\
 	__LOAD_HANDLER(reg, system_call_common)
 
@@ -885,34 +908,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
 #endif
 
 EXC_REAL_BEGIN(system_call, 0xc00, 0xd00)
-	/*
-	 * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems
-	 * that support it) before changing to HMT_MEDIUM. That allows the KVM
-	 * code to save that value into the guest state (it is the guest's PPR
-	 * value). Otherwise just change to HMT_MEDIUM as userspace has
-	 * already saved the PPR.
-	 */
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-	SET_SCRATCH0(r13)
-	GET_PACA(r13)
-	std	r9,PACA_EXGEN+EX_R9(r13)
-	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR);
-	HMT_MEDIUM;
-	std	r10,PACA_EXGEN+EX_R10(r13)
-	OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR);
-	mfcr	r9
-	KVMTEST_PR(0xc00)
-	GET_SCRATCH0(r13)
-#else
-	HMT_MEDIUM;
-#endif
+	SYSCALL_KVMTEST
 	SYSCALL_PSERIES_1
 	SYSCALL_PSERIES_2_RFID
 	SYSCALL_PSERIES_3
 EXC_REAL_END(system_call, 0xc00, 0xd00)
 
 EXC_VIRT_BEGIN(system_call, 0x4c00, 0x4d00)
-	HMT_MEDIUM
+	SYSCALL_KVMTEST
 	SYSCALL_PSERIES_1
 	SYSCALL_PSERIES_2_DIRECT
 	SYSCALL_PSERIES_3
@@ -927,7 +930,7 @@ TRAMP_KVM(PACA_EXGEN, 0xd00)
 EXC_COMMON(single_step_common, 0xd00, single_step_exception)
 
 EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0xe20)
-EXC_VIRT_NONE(0x4e00, 0x4e20)
+EXC_VIRT_OOL_HV(h_data_storage, 0x4e00, 0x4e20, 0xe00)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00)
 EXC_COMMON_BEGIN(h_data_storage_common)
 	mfspr	r10,SPRN_HDAR
@@ -943,7 +946,7 @@ EXC_COMMON_BEGIN(h_data_storage_common)
 
 
 EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0xe40)
-EXC_VIRT_NONE(0x4e20, 0x4e40)
+EXC_VIRT_OOL_HV(h_instr_storage, 0x4e20, 0x4e40, 0xe20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe20)
 EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception)
 
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index ec47a939cbdd..358d43f8f84f 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -649,6 +649,7 @@ static void __init early_cmdline_parse(void)
 struct option_vector1 {
 	u8 byte1;
 	u8 arch_versions;
+	u8 arch_versions3;
 } __packed;
 
 struct option_vector2 {
@@ -691,6 +692,9 @@ struct option_vector5 {
 	u8 reserved2;
 	__be16 reserved3;
 	u8 subprocessors;
+	u8 byte22;
+	u8 intarch;
+	u8 mmu;
 } __packed;
 
 struct option_vector6 {
@@ -700,7 +704,7 @@ struct option_vector6 {
 } __packed;
 
 struct ibm_arch_vec {
-	struct { u32 mask, val; } pvrs[10];
+	struct { u32 mask, val; } pvrs[12];
 
 	u8 num_vectors;
 
@@ -750,6 +754,14 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
 		.val  = cpu_to_be32(0x004d0000),
 	},
 	{
+		.mask = cpu_to_be32(0xffff0000), /* POWER9 */
+		.val  = cpu_to_be32(0x004e0000),
+	},
+	{
+		.mask = cpu_to_be32(0xffffffff), /* all 3.00-compliant */
+		.val  = cpu_to_be32(0x0f000005),
+	},
+	{
 		.mask = cpu_to_be32(0xffffffff), /* all 2.07-compliant */
 		.val  = cpu_to_be32(0x0f000004),
 	},
@@ -774,6 +786,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
 		.byte1 = 0,
 		.arch_versions = OV1_PPC_2_00 | OV1_PPC_2_01 | OV1_PPC_2_02 | OV1_PPC_2_03 |
 				 OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06 | OV1_PPC_2_07,
+		.arch_versions3 = OV1_PPC_3_00,
 	},
 
 	.vec2_len = VECTOR_LENGTH(sizeof(struct option_vector2)),
@@ -836,6 +849,9 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
 		.reserved2 = 0,
 		.reserved3 = 0,
 		.subprocessors = 1,
+		.intarch = 0,
+		.mmu = OV5_FEAT(OV5_MMU_RADIX_300) | OV5_FEAT(OV5_MMU_HASH_300) |
+			OV5_FEAT(OV5_MMU_PROC_TBL) | OV5_FEAT(OV5_MMU_GTSE),
 	},
 
 	/* option vector 6: IBM PAPR hints */
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 7dd89b79d038..b87ccde2137a 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -70,7 +70,8 @@ endif
 kvm-hv-y += \
 	book3s_hv.o \
 	book3s_hv_interrupts.o \
-	book3s_64_mmu_hv.o
+	book3s_64_mmu_hv.o \
+	book3s_64_mmu_radix.o
 
 kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
 	book3s_hv_rm_xics.o
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 019f008775b9..b6b5c185bd92 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -239,6 +239,7 @@ void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
 	kvmppc_set_dsisr(vcpu, flags);
 	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);
 }
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_data_storage);	/* used by kvm_hv */
 
 void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags)
 {
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index b795dd1ac2ef..9df3d940acec 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -119,6 +119,9 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
 	long err = -EBUSY;
 	long order;
 
+	if (kvm_is_radix(kvm))
+		return -EINVAL;
+
 	mutex_lock(&kvm->lock);
 	if (kvm->arch.hpte_setup_done) {
 		kvm->arch.hpte_setup_done = 0;
@@ -152,12 +155,11 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
 
 void kvmppc_free_hpt(struct kvm *kvm)
 {
-	kvmppc_free_lpid(kvm->arch.lpid);
 	vfree(kvm->arch.revmap);
 	if (kvm->arch.hpt_cma_alloc)
 		kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
 				1 << (kvm->arch.hpt_order - PAGE_SHIFT));
-	else
+	else if (kvm->arch.hpt_virt)
 		free_pages(kvm->arch.hpt_virt,
 			   kvm->arch.hpt_order - PAGE_SHIFT);
 }
@@ -392,8 +394,8 @@ static int instruction_is_store(unsigned int instr)
 	return (instr & mask) != 0;
 }
 
-static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
-				  unsigned long gpa, gva_t ea, int is_store)
+int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			   unsigned long gpa, gva_t ea, int is_store)
 {
 	u32 last_inst;
 
@@ -458,6 +460,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	unsigned long rcbits;
 	long mmio_update;
 
+	if (kvm_is_radix(kvm))
+		return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);
+
 	/*
 	 * Real-mode code has already searched the HPT and found the
 	 * entry we're interested in. Lock the entry and check that
@@ -695,12 +700,13 @@ static void kvmppc_rmap_reset(struct kvm *kvm)
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
 
+typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			      unsigned long gfn);
+
 static int kvm_handle_hva_range(struct kvm *kvm,
 				unsigned long start,
 				unsigned long end,
-				int (*handler)(struct kvm *kvm,
-					       unsigned long *rmapp,
-					       unsigned long gfn))
+				hva_handler_fn handler)
 {
 	int ret;
 	int retval = 0;
@@ -725,9 +731,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
 		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
 		for (; gfn < gfn_end; ++gfn) {
-			gfn_t gfn_offset = gfn - memslot->base_gfn;
-
-			ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
+			ret = handler(kvm, memslot, gfn);
 			retval |= ret;
 		}
 	}
@@ -736,20 +740,21 @@ static int kvm_handle_hva_range(struct kvm *kvm,
 }
 
 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
-					 unsigned long gfn))
+			  hva_handler_fn handler)
 {
 	return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			   unsigned long gfn)
 {
 	struct revmap_entry *rev = kvm->arch.revmap;
 	unsigned long h, i, j;
 	__be64 *hptep;
 	unsigned long ptel, psize, rcbits;
+	unsigned long *rmapp;
 
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
 	for (;;) {
 		lock_rmap(rmapp);
 		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
@@ -810,26 +815,36 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
 {
-	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
+	kvm_handle_hva(kvm, hva, handler);
 	return 0;
 }
 
 int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
 {
-	kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
+	kvm_handle_hva_range(kvm, start, end, handler);
 	return 0;
 }
 
 void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 				  struct kvm_memory_slot *memslot)
 {
-	unsigned long *rmapp;
 	unsigned long gfn;
 	unsigned long n;
+	unsigned long *rmapp;
 
-	rmapp = memslot->arch.rmap;
 	gfn = memslot->base_gfn;
-	for (n = memslot->npages; n; --n) {
+	rmapp = memslot->arch.rmap;
+	for (n = memslot->npages; n; --n, ++gfn) {
+		if (kvm_is_radix(kvm)) {
+			kvm_unmap_radix(kvm, memslot, gfn);
+			continue;
+		}
 		/*
 		 * Testing the present bit without locking is OK because
 		 * the memslot has been marked invalid already, and hence
@@ -837,20 +852,21 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 		 * thus the present bit can't go from 0 to 1.
 		 */
 		if (*rmapp & KVMPPC_RMAP_PRESENT)
-			kvm_unmap_rmapp(kvm, rmapp, gfn);
+			kvm_unmap_rmapp(kvm, memslot, gfn);
 		++rmapp;
-		++gfn;
 	}
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			 unsigned long gfn)
 {
 	struct revmap_entry *rev = kvm->arch.revmap;
 	unsigned long head, i, j;
 	__be64 *hptep;
 	int ret = 0;
+	unsigned long *rmapp;
 
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
  retry:
 	lock_rmap(rmapp);
 	if (*rmapp & KVMPPC_RMAP_REFERENCED) {
@@ -898,17 +914,22 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
 {
-	return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
+	return kvm_handle_hva_range(kvm, start, end, handler);
 }
 
-static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			      unsigned long gfn)
 {
 	struct revmap_entry *rev = kvm->arch.revmap;
 	unsigned long head, i, j;
 	unsigned long *hp;
 	int ret = 1;
+	unsigned long *rmapp;
 
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
 	if (*rmapp & KVMPPC_RMAP_REFERENCED)
 		return 1;
 
@@ -934,12 +955,18 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
 {
-	return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
+	return kvm_handle_hva(kvm, hva, handler);
 }
 
 void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
-	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
+	kvm_handle_hva(kvm, hva, handler);
 }
 
 static int vcpus_running(struct kvm *kvm)
@@ -1040,7 +1067,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 	return npages_dirty;
 }
 
-static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
+void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
 			      struct kvm_memory_slot *memslot,
 			      unsigned long *map)
 {
@@ -1058,12 +1085,11 @@ static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
 		__set_bit_le(gfn - memslot->base_gfn, map);
 }
 
-long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
-			     unsigned long *map)
+long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
+			struct kvm_memory_slot *memslot, unsigned long *map)
 {
 	unsigned long i, j;
 	unsigned long *rmapp;
-	struct kvm_vcpu *vcpu;
 
 	preempt_disable();
 	rmapp = memslot->arch.rmap;
@@ -1079,15 +1105,6 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			__set_bit_le(j, map);
 		++rmapp;
 	}
-
-	/* Harvest dirty bits from VPA and DTL updates */
-	/* Note: we never modify the SLB shadow buffer areas */
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		spin_lock(&vcpu->arch.vpa_update_lock);
-		harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map);
-		harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map);
-		spin_unlock(&vcpu->arch.vpa_update_lock);
-	}
 	preempt_enable();
 	return 0;
 }
@@ -1142,10 +1159,14 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
 	srcu_idx = srcu_read_lock(&kvm->srcu);
 	memslot = gfn_to_memslot(kvm, gfn);
 	if (memslot) {
-		rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
-		lock_rmap(rmap);
-		*rmap |= KVMPPC_RMAP_CHANGED;
-		unlock_rmap(rmap);
+		if (!kvm_is_radix(kvm)) {
+			rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+			lock_rmap(rmap);
+			*rmap |= KVMPPC_RMAP_CHANGED;
+			unlock_rmap(rmap);
+		} else if (memslot->dirty_bitmap) {
+			mark_page_dirty(kvm, gfn);
+		}
 	}
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
@@ -1675,7 +1696,10 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.slb_nr = 32;		/* POWER7/POWER8 */
 
-	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+	if (kvm_is_radix(vcpu->kvm))
+		mmu->xlate = kvmppc_mmu_radix_xlate;
+	else
+		mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
 	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
 
 	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
new file mode 100644
index 000000000000..4344651f408c
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -0,0 +1,716 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
7 */
8
9#include <linux/types.h>
10#include <linux/string.h>
11#include <linux/kvm.h>
12#include <linux/kvm_host.h>
13
14#include <asm/kvm_ppc.h>
15#include <asm/kvm_book3s.h>
16#include <asm/page.h>
17#include <asm/mmu.h>
18#include <asm/pgtable.h>
19#include <asm/pgalloc.h>
20
21/*
22 * Supported radix tree geometry.
23 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
24 * for a page size of 64k or 4k.
25 */
26static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
27
28int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
29 struct kvmppc_pte *gpte, bool data, bool iswrite)
30{
31 struct kvm *kvm = vcpu->kvm;
32 u32 pid;
33 int ret, level, ps;
34 __be64 prte, rpte;
35 unsigned long root, pte, index;
36 unsigned long rts, bits, offset;
37 unsigned long gpa;
38 unsigned long proc_tbl_size;
39
40 /* Work out effective PID */
41 switch (eaddr >> 62) {
42 case 0:
43 pid = vcpu->arch.pid;
44 break;
45 case 3:
46 pid = 0;
47 break;
48 default:
49 return -EINVAL;
50 }
51 proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
52 if (pid * 16 >= proc_tbl_size)
53 return -EINVAL;
54
55 /* Read partition table to find root of tree for effective PID */
56 ret = kvm_read_guest(kvm, kvm->arch.process_table + pid * 16,
57 &prte, sizeof(prte));
58 if (ret)
59 return ret;
60
61 root = be64_to_cpu(prte);
62 rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
63 ((root & RTS2_MASK) >> RTS2_SHIFT);
64 bits = root & RPDS_MASK;
65 root = root & RPDB_MASK;
66
67 /* P9 DD1 interprets RTS (radix tree size) differently */
68 offset = rts + 31;
69 if (cpu_has_feature(CPU_FTR_POWER9_DD1))
70 offset -= 3;
71
72 /* current implementations only support 52-bit space */
73 if (offset != 52)
74 return -EINVAL;
75
76 for (level = 3; level >= 0; --level) {
77 if (level && bits != p9_supported_radix_bits[level])
78 return -EINVAL;
79 if (level == 0 && !(bits == 5 || bits == 9))
80 return -EINVAL;
81 offset -= bits;
82 index = (eaddr >> offset) & ((1UL << bits) - 1);
83 /* check that low bits of page table base are zero */
84 if (root & ((1UL << (bits + 3)) - 1))
85 return -EINVAL;
86 ret = kvm_read_guest(kvm, root + index * 8,
87 &rpte, sizeof(rpte));
88 if (ret)
89 return ret;
90 pte = __be64_to_cpu(rpte);
91 if (!(pte & _PAGE_PRESENT))
92 return -ENOENT;
93 if (pte & _PAGE_PTE)
94 break;
95 bits = pte & 0x1f;
96 root = pte & 0x0fffffffffffff00ul;
97 }
98 /* need a leaf at lowest level; 512GB pages not supported */
99 if (level < 0 || level == 3)
100 return -EINVAL;
101
102 /* offset is now log base 2 of the page size */
103 gpa = pte & 0x01fffffffffff000ul;
104 if (gpa & ((1ul << offset) - 1))
105 return -EINVAL;
106 gpa += eaddr & ((1ul << offset) - 1);
107 for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
108 if (offset == mmu_psize_defs[ps].shift)
109 break;
110 gpte->page_size = ps;
111
112 gpte->eaddr = eaddr;
113 gpte->raddr = gpa;
114
115 /* Work out permissions */
116 gpte->may_read = !!(pte & _PAGE_READ);
117 gpte->may_write = !!(pte & _PAGE_WRITE);
118 gpte->may_execute = !!(pte & _PAGE_EXEC);
119 if (kvmppc_get_msr(vcpu) & MSR_PR) {
120 if (pte & _PAGE_PRIVILEGED) {
121 gpte->may_read = 0;
122 gpte->may_write = 0;
123 gpte->may_execute = 0;
124 }
125 } else {
126 if (!(pte & _PAGE_PRIVILEGED)) {
127 /* Check AMR/IAMR to see if strict mode is in force */
128 if (vcpu->arch.amr & (1ul << 62))
129 gpte->may_read = 0;
130 if (vcpu->arch.amr & (1ul << 63))
131 gpte->may_write = 0;
132 if (vcpu->arch.iamr & (1ul << 62))
133 gpte->may_execute = 0;
134 }
135 }
136
137 return 0;
138}
139
140#ifdef CONFIG_PPC_64K_PAGES
141#define MMU_BASE_PSIZE MMU_PAGE_64K
142#else
143#define MMU_BASE_PSIZE MMU_PAGE_4K
144#endif
145
146static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
147 unsigned int pshift)
148{
149 int psize = MMU_BASE_PSIZE;
150
151 if (pshift >= PMD_SHIFT)
152 psize = MMU_PAGE_2M;
153 addr &= ~0xfffUL;
154 addr |= mmu_psize_defs[psize].ap << 5;
155 asm volatile("ptesync": : :"memory");
156 asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1)
157 : : "r" (addr), "r" (kvm->arch.lpid) : "memory");
158 asm volatile("ptesync": : :"memory");
159}
160
161unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
162 unsigned long clr, unsigned long set,
163 unsigned long addr, unsigned int shift)
164{
165 unsigned long old = 0;
166
167 if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) &&
168 pte_present(*ptep)) {
169 /* have to invalidate it first */
170 old = __radix_pte_update(ptep, _PAGE_PRESENT, 0);
171 kvmppc_radix_tlbie_page(kvm, addr, shift);
172 set |= _PAGE_PRESENT;
173 old &= _PAGE_PRESENT;
174 }
175 return __radix_pte_update(ptep, clr, set) | old;
176}
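
The DD1 path above is a break-before-make sequence: clear the valid bit, flush the TLB, then install the new bits. __radix_pte_update() itself lives in the arch headers; as a rough userspace model of its clear-and-set semantics (an assumption about the interface, not its actual implementation):

#include <stdatomic.h>
#include <stdint.h>

/* Userspace model only: atomically clear 'clr' and set 'set' bits in a
 * PTE word, returning the prior value; 'old' is refreshed on CAS failure. */
static uint64_t pte_update_model(_Atomic uint64_t *pte, uint64_t clr,
				 uint64_t set)
{
	uint64_t old = atomic_load(pte);

	while (!atomic_compare_exchange_weak(pte, &old, (old & ~clr) | set))
		;
	return old;
}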
177
178void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
179 pte_t *ptep, pte_t pte)
180{
181 radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
182}
183
184static struct kmem_cache *kvm_pte_cache;
185
186static pte_t *kvmppc_pte_alloc(void)
187{
188 return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
189}
190
191static void kvmppc_pte_free(pte_t *ptep)
192{
193 kmem_cache_free(kvm_pte_cache, ptep);
194}
195
196static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
197 unsigned int level, unsigned long mmu_seq)
198{
199 pgd_t *pgd;
200 pud_t *pud, *new_pud = NULL;
201 pmd_t *pmd, *new_pmd = NULL;
202 pte_t *ptep, *new_ptep = NULL;
203 unsigned long old;
204 int ret;
205
206 /* Traverse the guest's 2nd-level tree, allocate new levels needed */
207 pgd = kvm->arch.pgtable + pgd_index(gpa);
208 pud = NULL;
209 if (pgd_present(*pgd))
210 pud = pud_offset(pgd, gpa);
211 else
212 new_pud = pud_alloc_one(kvm->mm, gpa);
213
214 pmd = NULL;
215 if (pud && pud_present(*pud))
216 pmd = pmd_offset(pud, gpa);
217 else
218 new_pmd = pmd_alloc_one(kvm->mm, gpa);
219
220 if (level == 0 && !(pmd && pmd_present(*pmd)))
221 new_ptep = kvmppc_pte_alloc();
222
223 /* Check if we might have been invalidated; let the guest retry if so */
224 spin_lock(&kvm->mmu_lock);
225 ret = -EAGAIN;
226 if (mmu_notifier_retry(kvm, mmu_seq))
227 goto out_unlock;
228
229 /* Now traverse again under the lock and change the tree */
230 ret = -ENOMEM;
231 if (pgd_none(*pgd)) {
232 if (!new_pud)
233 goto out_unlock;
234 pgd_populate(kvm->mm, pgd, new_pud);
235 new_pud = NULL;
236 }
237 pud = pud_offset(pgd, gpa);
238 if (pud_none(*pud)) {
239 if (!new_pmd)
240 goto out_unlock;
241 pud_populate(kvm->mm, pud, new_pmd);
242 new_pmd = NULL;
243 }
244 pmd = pmd_offset(pud, gpa);
245 if (pmd_large(*pmd)) {
246 /* Someone else has instantiated a large page here; retry */
247 ret = -EAGAIN;
248 goto out_unlock;
249 }
250 if (level == 1 && !pmd_none(*pmd)) {
251 /*
252 * There's a page table page here, but we wanted
253 * to install a large page. Tell the caller and let
254 * it try installing a normal page if it wants.
255 */
256 ret = -EBUSY;
257 goto out_unlock;
258 }
259 if (level == 0) {
260 if (pmd_none(*pmd)) {
261 if (!new_ptep)
262 goto out_unlock;
263 pmd_populate(kvm->mm, pmd, new_ptep);
264 new_ptep = NULL;
265 }
266 ptep = pte_offset_kernel(pmd, gpa);
267 if (pte_present(*ptep)) {
268 /* PTE was previously valid, so invalidate it */
269 old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
270 0, gpa, 0);
271 kvmppc_radix_tlbie_page(kvm, gpa, 0);
272 if (old & _PAGE_DIRTY)
273 mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
274 }
275 kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
276 } else {
277 kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
278 }
279 ret = 0;
280
281 out_unlock:
282 spin_unlock(&kvm->mmu_lock);
283 if (new_pud)
284 pud_free(kvm->mm, new_pud);
285 if (new_pmd)
286 pmd_free(kvm->mm, new_pmd);
287 if (new_ptep)
288 kvmppc_pte_free(new_ptep);
289 return ret;
290}
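
The shape of the function above - allocate outside the lock, revalidate under it, return -EAGAIN on a lost race - is the usual mmu_notifier retry protocol. A hedged userspace model of the same shape, with all names hypothetical:

#include <pthread.h>
#include <stdbool.h>

struct vm_model {
	pthread_mutex_t lock;		/* stands in for kvm->mmu_lock */
	unsigned long notifier_seq;	/* bumped by invalidations */
};

/* Recheck the sequence under the lock; publish only if nothing raced. */
static bool try_commit(struct vm_model *vm, unsigned long seq)
{
	bool ok;

	pthread_mutex_lock(&vm->lock);
	ok = (seq == vm->notifier_seq);
	/* ... write the new PTE here when ok ... */
	pthread_mutex_unlock(&vm->lock);
	return ok;
}

static void install(struct vm_model *vm)
{
	unsigned long seq;

	do {
		seq = vm->notifier_seq;	/* like kvm->mmu_notifier_seq */
		/* allocate intermediate levels here, outside the lock */
	} while (!try_commit(vm, seq));
}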
291
292int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
293 unsigned long ea, unsigned long dsisr)
294{
295 struct kvm *kvm = vcpu->kvm;
296 unsigned long mmu_seq, pte_size;
297 unsigned long gpa, gfn, hva, pfn;
298 struct kvm_memory_slot *memslot;
299 struct page *page = NULL, *pages[1];
300 long ret, npages, ok;
301 unsigned int writing;
302 struct vm_area_struct *vma;
303 unsigned long flags;
304 pte_t pte, *ptep;
305 unsigned long pgflags;
306 unsigned int shift, level;
307
308 /* Check for unusual errors */
309 if (dsisr & DSISR_UNSUPP_MMU) {
310 pr_err("KVM: Got unsupported MMU fault\n");
311 return -EFAULT;
312 }
313 if (dsisr & DSISR_BADACCESS) {
314 /* Reflect to the guest as DSI */
315 pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
316 kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
317 return RESUME_GUEST;
318 }
319
320 /* Translate the logical address and get the page */
321 gpa = vcpu->arch.fault_gpa & ~0xfffUL;
322 gpa &= ~0xF000000000000000ul;
323 gfn = gpa >> PAGE_SHIFT;
324 if (!(dsisr & DSISR_PGDIRFAULT))
325 gpa |= ea & 0xfff;
326 memslot = gfn_to_memslot(kvm, gfn);
327
328 /* No memslot means it's an emulated MMIO region */
329 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
330 if (dsisr & (DSISR_PGDIRFAULT | DSISR_BADACCESS |
331 DSISR_SET_RC)) {
332 /*
333 * Bad address in guest page table tree, or other
334 * unusual error - reflect it to the guest as DSI.
335 */
336 kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
337 return RESUME_GUEST;
338 }
339 return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
340 dsisr & DSISR_ISSTORE);
341 }
342
343 /* used to check for invalidations in progress */
344 mmu_seq = kvm->mmu_notifier_seq;
345 smp_rmb();
346
347 writing = (dsisr & DSISR_ISSTORE) != 0;
348 hva = gfn_to_hva_memslot(memslot, gfn);
349 if (dsisr & DSISR_SET_RC) {
350 /*
351 * Need to set an R or C bit in the 2nd-level tables;
352 * if the relevant bits aren't already set in the linux
353 * page tables, fall through to do the gup_fast to
354 * set them in the linux page tables too.
355 */
356 ok = 0;
357 pgflags = _PAGE_ACCESSED;
358 if (writing)
359 pgflags |= _PAGE_DIRTY;
360 local_irq_save(flags);
361 ptep = __find_linux_pte_or_hugepte(current->mm->pgd, hva,
362 NULL, NULL);
363 if (ptep) {
364 pte = READ_ONCE(*ptep);
365 if (pte_present(pte) &&
366 (pte_val(pte) & pgflags) == pgflags)
367 ok = 1;
368 }
369 local_irq_restore(flags);
370 if (ok) {
371 spin_lock(&kvm->mmu_lock);
372 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
373 spin_unlock(&kvm->mmu_lock);
374 return RESUME_GUEST;
375 }
376 ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable,
377 gpa, NULL, &shift);
378 if (ptep && pte_present(*ptep)) {
379 kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
380 gpa, shift);
381 spin_unlock(&kvm->mmu_lock);
382 return RESUME_GUEST;
383 }
384 spin_unlock(&kvm->mmu_lock);
385 }
386 }
387
388 ret = -EFAULT;
389 pfn = 0;
390 pte_size = PAGE_SIZE;
391 pgflags = _PAGE_READ | _PAGE_EXEC;
392 level = 0;
393 npages = get_user_pages_fast(hva, 1, writing, pages);
394 if (npages < 1) {
395 /* Check if it's an I/O mapping */
396 down_read(&current->mm->mmap_sem);
397 vma = find_vma(current->mm, hva);
398 if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
399 (vma->vm_flags & VM_PFNMAP)) {
400 pfn = vma->vm_pgoff +
401 ((hva - vma->vm_start) >> PAGE_SHIFT);
402 pgflags = pgprot_val(vma->vm_page_prot);
403 }
404 up_read(&current->mm->mmap_sem);
405 if (!pfn)
406 return -EFAULT;
407 } else {
408 page = pages[0];
409 pfn = page_to_pfn(page);
410 if (PageHuge(page)) {
411 page = compound_head(page);
412 pte_size <<= compound_order(page);
413 /* See if we can insert a 2MB large-page PTE here */
414 if (pte_size >= PMD_SIZE &&
415 (gpa & (PMD_SIZE - PAGE_SIZE)) ==
416 (hva & (PMD_SIZE - PAGE_SIZE))) {
417 level = 1;
418 pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
419 }
420 }
421 /* See if we can provide write access */
422 if (writing) {
423 /*
424 * We assume gup_fast has set dirty on the host PTE.
425 */
426 pgflags |= _PAGE_WRITE;
427 } else {
428 local_irq_save(flags);
429 ptep = __find_linux_pte_or_hugepte(current->mm->pgd,
430 hva, NULL, NULL);
431 if (ptep && pte_write(*ptep) && pte_dirty(*ptep))
432 pgflags |= _PAGE_WRITE;
433 local_irq_restore(flags);
434 }
435 }
436
437 /*
438 * Compute the PTE value that we need to insert.
439 */
440 pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED;
441 if (pgflags & _PAGE_WRITE)
442 pgflags |= _PAGE_DIRTY;
443 pte = pfn_pte(pfn, __pgprot(pgflags));
444
445 /* Allocate space in the tree and write the PTE */
446 ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
447 if (ret == -EBUSY) {
448 /*
449 * There's already a PMD where we wanted to install a large page;
450 * for now, fall back to installing a small page.
451 */
452 level = 0;
453 pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1);
454 pte = pfn_pte(pfn, __pgprot(pgflags));
455 ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
456 }
457 if (ret == 0 || ret == -EAGAIN)
458 ret = RESUME_GUEST;
459
460 if (page) {
461 /*
462 * We drop pages[0] here, not page, because page might
463 * have been set to the head page of a compound, but
464 * we have to drop the reference on the correct tail
465 * page to match the get inside gup().
466 */
467 put_page(pages[0]);
468 }
469 return ret;
470}
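
On the large-page test above: a 2MB guest mapping only works when gpa and hva select the same 4k page within a 2MB region, i.e. they are congruent modulo 2MB at 4k granularity. A small illustrative check of that condition:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_2M	(2u << 20)

static bool fits_2mb(uint64_t gpa, uint64_t hva)
{
	/* compare the sub-2MB offsets at 4k granularity */
	return (gpa & (SZ_2M - 4096)) == (hva & (SZ_2M - 4096));
}

int main(void)
{
	printf("%d %d\n",
	       fits_2mb(0x40200000ull, 0x7fff40200000ull),	/* 1: congruent */
	       fits_2mb(0x40200000ull, 0x7fff40201000ull));	/* 0: 4k skew  */
	return 0;
}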
471
472static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot,
473 unsigned long gfn, unsigned int order)
474{
475 unsigned long i, limit;
476 unsigned long *dp;
477
478 if (!memslot->dirty_bitmap)
479 return;
480 limit = 1ul << order;
481 if (limit < BITS_PER_LONG) {
482 for (i = 0; i < limit; ++i)
483 mark_page_dirty(kvm, gfn + i);
484 return;
485 }
486 dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn) / BITS_PER_LONG;
487 limit /= BITS_PER_LONG;
488 for (i = 0; i < limit; ++i)
489 *dp++ = ~0ul;
490}
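
The word arithmetic here is worth spelling out: the bit index within the dirty bitmap is (gfn - base_gfn), so the containing word is that index divided by BITS_PER_LONG, and an aligned run of at least 64 pages can be marked a whole word at a time. A standalone model of the same fill:

#include <stdint.h>
#include <string.h>

#define BITS_PER_LONG	64

/* Mark 2^order pages dirty starting at page index 'idx'; idx is assumed
 * 2^order aligned, since huge pages are naturally aligned in the guest. */
static void mark_dirty_model(unsigned long *bitmap, unsigned long idx,
			     unsigned int order)
{
	unsigned long i, limit = 1ul << order;

	if (limit < BITS_PER_LONG) {
		for (i = 0; i < limit; ++i)
			bitmap[(idx + i) / BITS_PER_LONG] |=
				1ul << ((idx + i) % BITS_PER_LONG);
		return;
	}
	/* whole words: the word index is the page index / BITS_PER_LONG */
	memset(&bitmap[idx / BITS_PER_LONG], 0xff,
	       (limit / BITS_PER_LONG) * sizeof(unsigned long));
}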
491
492/* Called with kvm->lock held */
493int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
494 unsigned long gfn)
495{
496 pte_t *ptep;
497 unsigned long gpa = gfn << PAGE_SHIFT;
498 unsigned int shift;
499 unsigned long old;
500
501 ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
502 NULL, &shift);
503 if (ptep && pte_present(*ptep)) {
504 old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
505 gpa, shift);
506 kvmppc_radix_tlbie_page(kvm, gpa, shift);
507 if (old & _PAGE_DIRTY) {
508 if (!shift)
509 mark_page_dirty(kvm, gfn);
510 else
511 mark_pages_dirty(kvm, memslot,
512 gfn, shift - PAGE_SHIFT);
513 }
514 }
515 return 0;
516}
517
518/* Called with kvm->lock held */
519int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
520 unsigned long gfn)
521{
522 pte_t *ptep;
523 unsigned long gpa = gfn << PAGE_SHIFT;
524 unsigned int shift;
525 int ref = 0;
526
527 ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
528 NULL, &shift);
529 if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
530 kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
531 gpa, shift);
532 /* XXX need to flush tlb here? */
533 ref = 1;
534 }
535 return ref;
536}
537
538/* Called with kvm->lock held */
539int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
540 unsigned long gfn)
541{
542 pte_t *ptep;
543 unsigned long gpa = gfn << PAGE_SHIFT;
544 unsigned int shift;
545 int ref = 0;
546
547 ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
548 NULL, &shift);
549 if (ptep && pte_present(*ptep) && pte_young(*ptep))
550 ref = 1;
551 return ref;
552}
553
554/* Returns the number of PAGE_SIZE pages that are dirty */
555static int kvm_radix_test_clear_dirty(struct kvm *kvm,
556 struct kvm_memory_slot *memslot, int pagenum)
557{
558 unsigned long gfn = memslot->base_gfn + pagenum;
559 unsigned long gpa = gfn << PAGE_SHIFT;
560 pte_t *ptep;
561 unsigned int shift;
562 int ret = 0;
563
564 ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
565 NULL, &shift);
566 if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
567 ret = 1;
568 if (shift)
569 ret = 1 << (shift - PAGE_SHIFT);
570 kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
571 gpa, shift);
572 kvmppc_radix_tlbie_page(kvm, gpa, shift);
573 }
574 return ret;
575}
576
577long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
578 struct kvm_memory_slot *memslot, unsigned long *map)
579{
580 unsigned long i, j;
581 unsigned long n, *p;
582 int npages;
583
584 /*
585 * Radix accumulates dirty bits in the first half of the
586 * memslot's dirty_bitmap area, for when pages are paged
587 * out or modified by the host directly. Pick up these
588 * bits and add them to the map.
589 */
590 n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long);
591 p = memslot->dirty_bitmap;
592 for (i = 0; i < n; ++i)
593 map[i] |= xchg(&p[i], 0);
594
595 for (i = 0; i < memslot->npages; i = j) {
596 npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
597
598 /*
599 * Note that if npages > 0 then i must be a multiple of npages,
600 * since huge pages are only used to back the guest at guest
601 * real addresses that are a multiple of their size.
602 * Since we have at most one PTE covering any given guest
603 * real address, if npages > 1 we can skip to i + npages.
604 */
605 j = i + 1;
606 if (npages)
607 for (j = i; npages; ++j, --npages)
608 __set_bit_le(j, map);
609 }
610 return 0;
611}
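
The xchg() in the accumulate loop is what makes the harvest safe against concurrent setters: each word's bits are transferred to the caller's map exactly once. A minimal userspace model of that transfer-and-clear:

#include <stdatomic.h>

/* Each dirty word moves into 'map' exactly once, even while other
 * threads keep setting bits in 'accum'. */
static void harvest_model(unsigned long *map, _Atomic unsigned long *accum,
			  unsigned long nwords)
{
	unsigned long i;

	for (i = 0; i < nwords; ++i)
		map[i] |= atomic_exchange(&accum[i], 0);
}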
612
613static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
614 int psize, int *indexp)
615{
616 if (!mmu_psize_defs[psize].shift)
617 return;
618 info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
619 (mmu_psize_defs[psize].ap << 29);
620 ++(*indexp);
621}
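
Each encoding word packs the log2 page size into the low bits and the AP (actual page size) field into the top bits, which is what the ioctl consumer unpacks on the other side. An illustrative round-trip (the AP value shown is an example, not authoritative):

#include <stdint.h>
#include <stdio.h>

static uint32_t pack_ap(unsigned int shift, unsigned int ap)
{
	return shift | (ap << 29);	/* shift in low bits, AP at bit 29 */
}

int main(void)
{
	uint32_t enc = pack_ap(16, 5);	/* e.g. 64k page; AP value assumed */

	printf("shift=%u ap=%u\n", enc & 0x3f, enc >> 29);
	return 0;
}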
622
623int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
624{
625 int i;
626
627 if (!radix_enabled())
628 return -EINVAL;
629 memset(info, 0, sizeof(*info));
630
631 /* 4k page size */
632 info->geometries[0].page_shift = 12;
633 info->geometries[0].level_bits[0] = 9;
634 for (i = 1; i < 4; ++i)
635 info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
636 /* 64k page size */
637 info->geometries[1].page_shift = 16;
638 for (i = 0; i < 4; ++i)
639 info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
640
641 i = 0;
642 add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
643 add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
644 add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
645 add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
646
647 return 0;
648}
649
650int kvmppc_init_vm_radix(struct kvm *kvm)
651{
652 kvm->arch.pgtable = pgd_alloc(kvm->mm);
653 if (!kvm->arch.pgtable)
654 return -ENOMEM;
655 return 0;
656}
657
658void kvmppc_free_radix(struct kvm *kvm)
659{
660 unsigned long ig, iu, im;
661 pte_t *pte;
662 pmd_t *pmd;
663 pud_t *pud;
664 pgd_t *pgd;
665
666 if (!kvm->arch.pgtable)
667 return;
668 pgd = kvm->arch.pgtable;
669 for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
670 if (!pgd_present(*pgd))
671 continue;
672 pud = pud_offset(pgd, 0);
673 for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) {
674 if (!pud_present(*pud))
675 continue;
676 pmd = pmd_offset(pud, 0);
677 for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) {
678 if (pmd_huge(*pmd)) {
679 pmd_clear(pmd);
680 continue;
681 }
682 if (!pmd_present(*pmd))
683 continue;
684 pte = pte_offset_map(pmd, 0);
685 memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
686 kvmppc_pte_free(pte);
687 pmd_clear(pmd);
688 }
689 pmd_free(kvm->mm, pmd_offset(pud, 0));
690 pud_clear(pud);
691 }
692 pud_free(kvm->mm, pud_offset(pgd, 0));
693 pgd_clear(pgd);
694 }
695 pgd_free(kvm->mm, kvm->arch.pgtable);
696}
697
698static void pte_ctor(void *addr)
699{
700 memset(addr, 0, PTE_TABLE_SIZE);
701}
702
703int kvmppc_radix_init(void)
704{
705 unsigned long size = sizeof(void *) << PTE_INDEX_SIZE;
706
707 kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
708 if (!kvm_pte_cache)
709 return -ENOMEM;
710 return 0;
711}
712
713void kvmppc_radix_exit(void)
714{
715 kmem_cache_destroy(kvm_pte_cache);
716}
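
A note on the cache geometry above: with size passed as both the object size and the alignment, each lowest-level table comes back naturally aligned, which is what lets the walker reject page-table bases with nonzero low bits. Assuming a 9-bit lowest level, the size works out as:

#include <stdio.h>

int main(void)
{
	unsigned int pte_index_size = 9;	/* assumed lowest-level bits */
	unsigned long size = sizeof(void *) << pte_index_size;

	printf("%lu\n", size);	/* 4096: 512 entries of 8 bytes each */
	return 0;
}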
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 856cc9d38efd..bdf281cc88c0 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1132,7 +1132,7 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
1132 /* 1132 /*
1133 * Userspace can only modify DPFD (default prefetch depth), 1133 * Userspace can only modify DPFD (default prefetch depth),
1134 * ILE (interrupt little-endian) and TC (translation control). 1134 * ILE (interrupt little-endian) and TC (translation control).
1135 * On POWER8 userspace can also modify AIL (alt. interrupt loc.) 1135 * On POWER8 and POWER9 userspace can also modify AIL (alt. interrupt loc.).
1136 */ 1136 */
1137 mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; 1137 mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
1138 if (cpu_has_feature(CPU_FTR_ARCH_207S)) 1138 if (cpu_has_feature(CPU_FTR_ARCH_207S))
@@ -1818,6 +1818,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
1818 vcpu->arch.vcore = vcore; 1818 vcpu->arch.vcore = vcore;
1819 vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid; 1819 vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
1820 vcpu->arch.thread_cpu = -1; 1820 vcpu->arch.thread_cpu = -1;
1821 vcpu->arch.prev_cpu = -1;
1821 1822
1822 vcpu->arch.cpu_type = KVM_CPU_3S_64; 1823 vcpu->arch.cpu_type = KVM_CPU_3S_64;
1823 kvmppc_sanity_check(vcpu); 1824 kvmppc_sanity_check(vcpu);
@@ -1947,11 +1948,33 @@ static void kvmppc_release_hwthread(int cpu)
1947 tpaca->kvm_hstate.kvm_split_mode = NULL; 1948 tpaca->kvm_hstate.kvm_split_mode = NULL;
1948} 1949}
1949 1950
1951static void do_nothing(void *x)
1952{
1953}
1954
1955static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
1956{
1957 int i;
1958
1959 cpu = cpu_first_thread_sibling(cpu);
1960 cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
1961 /*
1962 * Make sure setting of bit in need_tlb_flush precedes
1963 * testing of cpu_in_guest bits. The matching barrier on
1964 * the other side is the first smp_mb() in kvmppc_run_core().
1965 */
1966 smp_mb();
1967 for (i = 0; i < threads_per_core; ++i)
1968 if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
1969 smp_call_function_single(cpu + i, do_nothing, NULL, 1);
1970}
1971
1950static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) 1972static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
1951{ 1973{
1952 int cpu; 1974 int cpu;
1953 struct paca_struct *tpaca; 1975 struct paca_struct *tpaca;
1954 struct kvmppc_vcore *mvc = vc->master_vcore; 1976 struct kvmppc_vcore *mvc = vc->master_vcore;
1977 struct kvm *kvm = vc->kvm;
1955 1978
1956 cpu = vc->pcpu; 1979 cpu = vc->pcpu;
1957 if (vcpu) { 1980 if (vcpu) {
@@ -1962,6 +1985,27 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
1962 cpu += vcpu->arch.ptid; 1985 cpu += vcpu->arch.ptid;
1963 vcpu->cpu = mvc->pcpu; 1986 vcpu->cpu = mvc->pcpu;
1964 vcpu->arch.thread_cpu = cpu; 1987 vcpu->arch.thread_cpu = cpu;
1988
1989 /*
1990 * With radix, the guest can do TLB invalidations itself,
1991 * and it could choose to use the local form (tlbiel) if
1992 * it is invalidating a translation that has only ever been
1993 * used on one vcpu. However, that doesn't mean it has
1994 * only ever been used on one physical cpu, since vcpus
1995 * can move around between pcpus. To cope with this, when
1996 * a vcpu moves from one pcpu to another, we need to tell
1997 * any vcpus running on the same core as this vcpu previously
1998 * ran on to flush the TLB. The TLB is shared between threads,
1999 * so we use a single bit in .need_tlb_flush for all 4 threads.
2000 */
2001 if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) {
2002 if (vcpu->arch.prev_cpu >= 0 &&
2003 cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
2004 cpu_first_thread_sibling(cpu))
2005 radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
2006 vcpu->arch.prev_cpu = cpu;
2007 }
2008 cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
1965 } 2009 }
1966 tpaca = &paca[cpu]; 2010 tpaca = &paca[cpu];
1967 tpaca->kvm_hstate.kvm_vcpu = vcpu; 2011 tpaca->kvm_hstate.kvm_vcpu = vcpu;
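
The prev_cpu logic in the hunk above relies on one need_tlb_flush bit standing for a whole core. A toy model of the cpu_first_thread_sibling() keying (SMT4 assumed for illustration):

#include <stdio.h>

#define THREADS_PER_CORE	4	/* SMT4, for illustration */

/* All threads of a core share one flush bit, keyed by the core's
 * first thread. */
static int core_flush_bit(int cpu)
{
	return cpu & ~(THREADS_PER_CORE - 1);
}

int main(void)
{
	printf("%d %d %d\n", core_flush_bit(5), core_flush_bit(7),
	       core_flush_bit(8));	/* 4 4 8 */
	return 0;
}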
@@ -2549,6 +2593,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2549 kvmppc_release_hwthread(pcpu + i); 2593 kvmppc_release_hwthread(pcpu + i);
2550 if (sip && sip->napped[i]) 2594 if (sip && sip->napped[i])
2551 kvmppc_ipi_thread(pcpu + i); 2595 kvmppc_ipi_thread(pcpu + i);
2596 cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
2552 } 2597 }
2553 2598
2554 kvmppc_set_host_core(pcpu); 2599 kvmppc_set_host_core(pcpu);
@@ -2875,7 +2920,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
2875 smp_mb(); 2920 smp_mb();
2876 2921
2877 /* On the first time here, set up HTAB and VRMA */ 2922 /* On the first time here, set up HTAB and VRMA */
2878 if (!vcpu->kvm->arch.hpte_setup_done) { 2923 if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) {
2879 r = kvmppc_hv_setup_htab_rma(vcpu); 2924 r = kvmppc_hv_setup_htab_rma(vcpu);
2880 if (r) 2925 if (r)
2881 goto out; 2926 goto out;
@@ -2937,6 +2982,13 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
2937{ 2982{
2938 struct kvm_ppc_one_seg_page_size *sps; 2983 struct kvm_ppc_one_seg_page_size *sps;
2939 2984
2985 /*
2986 * Since we don't yet support HPT guests on a radix host,
2987 * return an error if the host uses radix.
2988 */
2989 if (radix_enabled())
2990 return -EINVAL;
2991
2940 info->flags = KVM_PPC_PAGE_SIZES_REAL; 2992 info->flags = KVM_PPC_PAGE_SIZES_REAL;
2941 if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) 2993 if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
2942 info->flags |= KVM_PPC_1T_SEGMENTS; 2994 info->flags |= KVM_PPC_1T_SEGMENTS;
@@ -2959,8 +3011,10 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
2959{ 3011{
2960 struct kvm_memslots *slots; 3012 struct kvm_memslots *slots;
2961 struct kvm_memory_slot *memslot; 3013 struct kvm_memory_slot *memslot;
2962 int r; 3014 int i, r;
2963 unsigned long n; 3015 unsigned long n;
3016 unsigned long *buf;
3017 struct kvm_vcpu *vcpu;
2964 3018
2965 mutex_lock(&kvm->slots_lock); 3019 mutex_lock(&kvm->slots_lock);
2966 3020
@@ -2974,15 +3028,32 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
2974 if (!memslot->dirty_bitmap) 3028 if (!memslot->dirty_bitmap)
2975 goto out; 3029 goto out;
2976 3030
3031 /*
3032 * Use second half of bitmap area because radix accumulates
3033 * bits in the first half.
3034 */
2977 n = kvm_dirty_bitmap_bytes(memslot); 3035 n = kvm_dirty_bitmap_bytes(memslot);
2978 memset(memslot->dirty_bitmap, 0, n); 3036 buf = memslot->dirty_bitmap + n / sizeof(long);
3037 memset(buf, 0, n);
2979 3038
2980 r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap); 3039 if (kvm_is_radix(kvm))
3040 r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
3041 else
3042 r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
2981 if (r) 3043 if (r)
2982 goto out; 3044 goto out;
2983 3045
3046 /* Harvest dirty bits from VPA and DTL updates */
3047 /* Note: we never modify the SLB shadow buffer areas */
3048 kvm_for_each_vcpu(i, vcpu, kvm) {
3049 spin_lock(&vcpu->arch.vpa_update_lock);
3050 kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
3051 kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
3052 spin_unlock(&vcpu->arch.vpa_update_lock);
3053 }
3054
2984 r = -EFAULT; 3055 r = -EFAULT;
2985 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 3056 if (copy_to_user(log->dirty_bitmap, buf, n))
2986 goto out; 3057 goto out;
2987 3058
2988 r = 0; 3059 r = 0;
@@ -3003,6 +3074,15 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
3003static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, 3074static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
3004 unsigned long npages) 3075 unsigned long npages)
3005{ 3076{
3077 /*
3078 * For now, if radix_enabled() then we only support radix guests,
3079 * and in that case we don't need the rmap array.
3080 */
3081 if (radix_enabled()) {
3082 slot->arch.rmap = NULL;
3083 return 0;
3084 }
3085
3006 slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); 3086 slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
3007 if (!slot->arch.rmap) 3087 if (!slot->arch.rmap)
3008 return -ENOMEM; 3088 return -ENOMEM;
@@ -3035,7 +3115,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
3035 if (npages) 3115 if (npages)
3036 atomic64_inc(&kvm->arch.mmio_update); 3116 atomic64_inc(&kvm->arch.mmio_update);
3037 3117
3038 if (npages && old->npages) { 3118 if (npages && old->npages && !kvm_is_radix(kvm)) {
3039 /* 3119 /*
3040 * If modifying a memslot, reset all the rmap dirty bits. 3120 * If modifying a memslot, reset all the rmap dirty bits.
3041 * If this is a new memslot, we don't need to do anything 3121 * If this is a new memslot, we don't need to do anything
@@ -3044,7 +3124,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
3044 */ 3124 */
3045 slots = kvm_memslots(kvm); 3125 slots = kvm_memslots(kvm);
3046 memslot = id_to_memslot(slots, mem->slot); 3126 memslot = id_to_memslot(slots, mem->slot);
3047 kvmppc_hv_get_dirty_log(kvm, memslot, NULL); 3127 kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
3048 } 3128 }
3049} 3129}
3050 3130
@@ -3083,14 +3163,20 @@ static void kvmppc_setup_partition_table(struct kvm *kvm)
3083{ 3163{
3084 unsigned long dw0, dw1; 3164 unsigned long dw0, dw1;
3085 3165
3086 /* PS field - page size for VRMA */ 3166 if (!kvm_is_radix(kvm)) {
3087 dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) | 3167 /* PS field - page size for VRMA */
3088 ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); 3168 dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
3089 /* HTABSIZE and HTABORG fields */ 3169 ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
3090 dw0 |= kvm->arch.sdr1; 3170 /* HTABSIZE and HTABORG fields */
3171 dw0 |= kvm->arch.sdr1;
3091 3172
3092 /* Second dword has GR=0; other fields are unused since UPRT=0 */ 3173 /* Second dword as set by userspace */
3093 dw1 = 0; 3174 dw1 = kvm->arch.process_table;
3175 } else {
3176 dw0 = PATB_HR | radix__get_tree_size() |
3177 __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
3178 dw1 = PATB_GR | kvm->arch.process_table;
3179 }
3094 3180
3095 mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); 3181 mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
3096} 3182}
@@ -3260,6 +3346,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
3260{ 3346{
3261 unsigned long lpcr, lpid; 3347 unsigned long lpcr, lpid;
3262 char buf[32]; 3348 char buf[32];
3349 int ret;
3263 3350
3264 /* Allocate the guest's logical partition ID */ 3351 /* Allocate the guest's logical partition ID */
3265 3352
@@ -3307,13 +3394,30 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
3307 lpcr |= LPCR_HVICE; 3394 lpcr |= LPCR_HVICE;
3308 } 3395 }
3309 3396
3397 /*
3398 * For now, if the host uses radix, the guest must be radix.
3399 */
3400 if (radix_enabled()) {
3401 kvm->arch.radix = 1;
3402 lpcr &= ~LPCR_VPM1;
3403 lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
3404 ret = kvmppc_init_vm_radix(kvm);
3405 if (ret) {
3406 kvmppc_free_lpid(kvm->arch.lpid);
3407 return ret;
3408 }
3409 kvmppc_setup_partition_table(kvm);
3410 }
3411
3310 kvm->arch.lpcr = lpcr; 3412 kvm->arch.lpcr = lpcr;
3311 3413
3312 /* 3414 /*
3313 * Work out how many sets the TLB has, for the use of 3415 * Work out how many sets the TLB has, for the use of
3314 * the TLB invalidation loop in book3s_hv_rmhandlers.S. 3416 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
3315 */ 3417 */
3316 if (cpu_has_feature(CPU_FTR_ARCH_300)) 3418 if (kvm_is_radix(kvm))
3419 kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */
3420 else if (cpu_has_feature(CPU_FTR_ARCH_300))
3317 kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ 3421 kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */
3318 else if (cpu_has_feature(CPU_FTR_ARCH_207S)) 3422 else if (cpu_has_feature(CPU_FTR_ARCH_207S))
3319 kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */ 3423 kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */
@@ -3323,8 +3427,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
3323 /* 3427 /*
3324 * Track that we now have a HV mode VM active. This blocks secondary 3428 * Track that we now have a HV mode VM active. This blocks secondary
3325 * CPU threads from coming online. 3429 * CPU threads from coming online.
3430 * On POWER9, we only need to do this for HPT guests on a radix
3431 * host, which is not yet supported.
3326 */ 3432 */
3327 kvm_hv_vm_activated(); 3433 if (!cpu_has_feature(CPU_FTR_ARCH_300))
3434 kvm_hv_vm_activated();
3328 3435
3329 /* 3436 /*
3330 * Create a debugfs directory for the VM 3437 * Create a debugfs directory for the VM
@@ -3350,11 +3457,17 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
3350{ 3457{
3351 debugfs_remove_recursive(kvm->arch.debugfs_dir); 3458 debugfs_remove_recursive(kvm->arch.debugfs_dir);
3352 3459
3353 kvm_hv_vm_deactivated(); 3460 if (!cpu_has_feature(CPU_FTR_ARCH_300))
3461 kvm_hv_vm_deactivated();
3354 3462
3355 kvmppc_free_vcores(kvm); 3463 kvmppc_free_vcores(kvm);
3356 3464
3357 kvmppc_free_hpt(kvm); 3465 kvmppc_free_lpid(kvm->arch.lpid);
3466
3467 if (kvm_is_radix(kvm))
3468 kvmppc_free_radix(kvm);
3469 else
3470 kvmppc_free_hpt(kvm);
3358 3471
3359 kvmppc_free_pimap(kvm); 3472 kvmppc_free_pimap(kvm);
3360} 3473}
@@ -3383,11 +3496,6 @@ static int kvmppc_core_check_processor_compat_hv(void)
3383 if (!cpu_has_feature(CPU_FTR_HVMODE) || 3496 if (!cpu_has_feature(CPU_FTR_HVMODE) ||
3384 !cpu_has_feature(CPU_FTR_ARCH_206)) 3497 !cpu_has_feature(CPU_FTR_ARCH_206))
3385 return -EIO; 3498 return -EIO;
3386 /*
3387 * Disable KVM for Power9 in radix mode.
3388 */
3389 if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
3390 return -EIO;
3391 3499
3392 return 0; 3500 return 0;
3393} 3501}
@@ -3655,6 +3763,41 @@ static void init_default_hcalls(void)
3655 } 3763 }
3656} 3764}
3657 3765
3766static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
3767{
3768 unsigned long lpcr;
3769 int radix;
3770
3771 /* If not on a POWER9, reject it */
3772 if (!cpu_has_feature(CPU_FTR_ARCH_300))
3773 return -ENODEV;
3774
3775 /* If any unknown flags set, reject it */
3776 if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
3777 return -EINVAL;
3778
3779 /* We can't change a guest to/from radix yet */
3780 radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
3781 if (radix != kvm_is_radix(kvm))
3782 return -EINVAL;
3783
3784 /* GR (guest radix) bit in process_table field must match */
3785 if (!!(cfg->process_table & PATB_GR) != radix)
3786 return -EINVAL;
3787
3788 /* Process table size field must be reasonable, i.e. <= 24 */
3789 if ((cfg->process_table & PRTS_MASK) > 24)
3790 return -EINVAL;
3791
3792 kvm->arch.process_table = cfg->process_table;
3793 kvmppc_setup_partition_table(kvm);
3794
3795 lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
3796 kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
3797
3798 return 0;
3799}
3800
3658static struct kvmppc_ops kvm_ops_hv = { 3801static struct kvmppc_ops kvm_ops_hv = {
3659 .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, 3802 .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
3660 .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, 3803 .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -3692,6 +3835,8 @@ static struct kvmppc_ops kvm_ops_hv = {
3692 .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv, 3835 .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
3693 .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv, 3836 .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
3694#endif 3837#endif
3838 .configure_mmu = kvmhv_configure_mmu,
3839 .get_rmmu_info = kvmhv_get_rmmu_info,
3695}; 3840};
3696 3841
3697static int kvm_init_subcore_bitmap(void) 3842static int kvm_init_subcore_bitmap(void)
@@ -3726,6 +3871,11 @@ static int kvm_init_subcore_bitmap(void)
3726 return 0; 3871 return 0;
3727} 3872}
3728 3873
3874static int kvmppc_radix_possible(void)
3875{
3876 return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
3877}
3878
3729static int kvmppc_book3s_init_hv(void) 3879static int kvmppc_book3s_init_hv(void)
3730{ 3880{
3731 int r; 3881 int r;
@@ -3765,12 +3915,19 @@ static int kvmppc_book3s_init_hv(void)
3765 init_vcore_lists(); 3915 init_vcore_lists();
3766 3916
3767 r = kvmppc_mmu_hv_init(); 3917 r = kvmppc_mmu_hv_init();
3918 if (r)
3919 return r;
3920
3921 if (kvmppc_radix_possible())
3922 r = kvmppc_radix_init();
3768 return r; 3923 return r;
3769} 3924}
3770 3925
3771static void kvmppc_book3s_exit_hv(void) 3926static void kvmppc_book3s_exit_hv(void)
3772{ 3927{
3773 kvmppc_free_host_rm_ops(); 3928 kvmppc_free_host_rm_ops();
3929 if (kvmppc_radix_possible())
3930 kvmppc_radix_exit();
3774 kvmppc_hv_ops = NULL; 3931 kvmppc_hv_ops = NULL;
3775} 3932}
3776 3933
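
From userspace, kvmhv_configure_mmu() above is reached via the KVM_PPC_CONFIGURE_V3_MMU vm ioctl. A hedged sketch of a VMM requesting radix mode, assuming uapi headers from this series; the process-table base and the low PRTS size field are illustrative, and the GR bit is assumed to be the top bit of the doubleword (PATB_GR):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* PATB_GR assumed to be the top bit of the process-table doubleword. */
#define MY_PATB_GR	(1ull << 63)

static int configure_radix(int vm_fd, uint64_t prtb_base)
{
	struct kvm_ppc_mmuv3_cfg cfg = {
		.flags = KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE,
		/* PRTS field (table size, must be <= 24) is illustrative */
		.process_table = MY_PATB_GR | prtb_base | 4,
	};

	return ioctl(vm_fd, KVM_PPC_CONFIGURE_V3_MMU, &cfg);
}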
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 5bb24be0b346..fe08fea54b70 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -29,6 +29,11 @@
29#include <asm/opal.h> 29#include <asm/opal.h>
30#include <asm/smp.h> 30#include <asm/smp.h>
31 31
32static bool in_realmode(void)
33{
34 return !(mfmsr() & MSR_IR);
35}
36
32#define KVM_CMA_CHUNK_ORDER 18 37#define KVM_CMA_CHUNK_ORDER 18
33 38
34/* 39/*
@@ -200,7 +205,6 @@ static inline void rm_writeb(unsigned long paddr, u8 val)
200 205
201/* 206/*
202 * Send an interrupt or message to another CPU. 207 * Send an interrupt or message to another CPU.
203 * This can only be called in real mode.
204 * The caller needs to include any barrier needed to order writes 208 * The caller needs to include any barrier needed to order writes
205 * to memory vs. the IPI/message. 209 * to memory vs. the IPI/message.
206 */ 210 */
@@ -226,7 +230,9 @@ void kvmhv_rm_send_ipi(int cpu)
226 230
227 /* Else poke the target with an IPI */ 231 /* Else poke the target with an IPI */
228 xics_phys = paca[cpu].kvm_hstate.xics_phys; 232 xics_phys = paca[cpu].kvm_hstate.xics_phys;
229 if (xics_phys) 233 if (!in_realmode())
234 opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
235 else if (xics_phys)
230 rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); 236 rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
231 else 237 else
232 opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu), 238 opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu),
@@ -412,14 +418,15 @@ static long kvmppc_read_one_intr(bool *again)
412 418
413 /* Now read the interrupt from the ICP */ 419 /* Now read the interrupt from the ICP */
414 xics_phys = local_paca->kvm_hstate.xics_phys; 420 xics_phys = local_paca->kvm_hstate.xics_phys;
415 if (!xics_phys) { 421 rc = 0;
416 /* Use OPAL to read the XIRR */ 422 if (!in_realmode())
423 rc = opal_int_get_xirr(&xirr, false);
424 else if (!xics_phys)
417 rc = opal_rm_int_get_xirr(&xirr, false); 425 rc = opal_rm_int_get_xirr(&xirr, false);
418 if (rc < 0) 426 else
419 return 1;
420 } else {
421 xirr = _lwzcix(xics_phys + XICS_XIRR); 427 xirr = _lwzcix(xics_phys + XICS_XIRR);
422 } 428 if (rc < 0)
429 return 1;
423 430
424 /* 431 /*
425 * Save XIRR for later. Since we get control in reverse endian 432 * Save XIRR for later. Since we get control in reverse endian
@@ -445,15 +452,19 @@ static long kvmppc_read_one_intr(bool *again)
445 * If it is an IPI, clear the MFRR and EOI it. 452 * If it is an IPI, clear the MFRR and EOI it.
446 */ 453 */
447 if (xisr == XICS_IPI) { 454 if (xisr == XICS_IPI) {
448 if (xics_phys) { 455 rc = 0;
456 if (!in_realmode()) {
457 opal_int_set_mfrr(hard_smp_processor_id(), 0xff);
458 rc = opal_int_eoi(h_xirr);
459 } else if (xics_phys) {
449 _stbcix(xics_phys + XICS_MFRR, 0xff); 460 _stbcix(xics_phys + XICS_MFRR, 0xff);
450 _stwcix(xics_phys + XICS_XIRR, xirr); 461 _stwcix(xics_phys + XICS_XIRR, xirr);
451 } else { 462 } else {
452 opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff); 463 opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff);
453 rc = opal_rm_int_eoi(h_xirr); 464 rc = opal_rm_int_eoi(h_xirr);
454 /* If rc > 0, there is another interrupt pending */
455 *again = rc > 0;
456 } 465 }
466 /* If rc > 0, there is another interrupt pending */
467 *again = rc > 0;
457 468
458 /* 469 /*
459 * Need to ensure side effects of above stores 470 * Need to ensure side effects of above stores
@@ -471,7 +482,10 @@ static long kvmppc_read_one_intr(bool *again)
471 /* We raced with the host, 482 /* We raced with the host,
472 * we need to resend that IPI, bummer 483 * we need to resend that IPI, bummer
473 */ 484 */
474 if (xics_phys) 485 if (!in_realmode())
486 opal_int_set_mfrr(hard_smp_processor_id(),
487 IPI_PRIORITY);
488 else if (xics_phys)
475 _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); 489 _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
476 else 490 else
477 opal_rm_int_set_mfrr(hard_smp_processor_id(), 491 opal_rm_int_set_mfrr(hard_smp_processor_id(),
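
The three-way dispatch in the hunks above keys off in_realmode(): with MSR[IR] clear we are fetching untranslated and must stick to the real-mode accessors. A trivial model of the test (the MSR_IR bit value here is illustrative):

#include <stdbool.h>
#include <stdint.h>

#define MSR_IR_MODEL	0x20ull	/* instruction relocation; illustrative */

static bool in_realmode_model(uint64_t msr)
{
	return !(msr & MSR_IR_MODEL);	/* IR off => real-mode accessors */
}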
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 9ef3c4be952f..b095afcd4309 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -43,6 +43,7 @@ static void *real_vmalloc_addr(void *x)
43static int global_invalidates(struct kvm *kvm, unsigned long flags) 43static int global_invalidates(struct kvm *kvm, unsigned long flags)
44{ 44{
45 int global; 45 int global;
46 int cpu;
46 47
47 /* 48 /*
48 * If there is only one vcore, and it's currently running, 49 * If there is only one vcore, and it's currently running,
@@ -60,8 +61,14 @@ static int global_invalidates(struct kvm *kvm, unsigned long flags)
60 /* any other core might now have stale TLB entries... */ 61 /* any other core might now have stale TLB entries... */
61 smp_wmb(); 62 smp_wmb();
62 cpumask_setall(&kvm->arch.need_tlb_flush); 63 cpumask_setall(&kvm->arch.need_tlb_flush);
63 cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu, 64 cpu = local_paca->kvm_hstate.kvm_vcore->pcpu;
64 &kvm->arch.need_tlb_flush); 65 /*
66 * On POWER9, threads are independent but the TLB is shared,
67 * so use the bit for the first thread to represent the core.
68 */
69 if (cpu_has_feature(CPU_FTR_ARCH_300))
70 cpu = cpu_first_thread_sibling(cpu);
71 cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush);
65 } 72 }
66 73
67 return global; 74 return global;
@@ -182,6 +189,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
182 unsigned long mmu_seq; 189 unsigned long mmu_seq;
183 unsigned long rcbits, irq_flags = 0; 190 unsigned long rcbits, irq_flags = 0;
184 191
192 if (kvm_is_radix(kvm))
193 return H_FUNCTION;
185 psize = hpte_page_size(pteh, ptel); 194 psize = hpte_page_size(pteh, ptel);
186 if (!psize) 195 if (!psize)
187 return H_PARAMETER; 196 return H_PARAMETER;
@@ -458,6 +467,8 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
458 struct revmap_entry *rev; 467 struct revmap_entry *rev;
459 u64 pte, orig_pte, pte_r; 468 u64 pte, orig_pte, pte_r;
460 469
470 if (kvm_is_radix(kvm))
471 return H_FUNCTION;
461 if (pte_index >= kvm->arch.hpt_npte) 472 if (pte_index >= kvm->arch.hpt_npte)
462 return H_PARAMETER; 473 return H_PARAMETER;
463 hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); 474 hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
@@ -529,6 +540,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
529 struct revmap_entry *rev, *revs[4]; 540 struct revmap_entry *rev, *revs[4];
530 u64 hp0, hp1; 541 u64 hp0, hp1;
531 542
543 if (kvm_is_radix(kvm))
544 return H_FUNCTION;
532 global = global_invalidates(kvm, 0); 545 global = global_invalidates(kvm, 0);
533 for (i = 0; i < 4 && ret == H_SUCCESS; ) { 546 for (i = 0; i < 4 && ret == H_SUCCESS; ) {
534 n = 0; 547 n = 0;
@@ -642,6 +655,8 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
642 unsigned long v, r, rb, mask, bits; 655 unsigned long v, r, rb, mask, bits;
643 u64 pte_v, pte_r; 656 u64 pte_v, pte_r;
644 657
658 if (kvm_is_radix(kvm))
659 return H_FUNCTION;
645 if (pte_index >= kvm->arch.hpt_npte) 660 if (pte_index >= kvm->arch.hpt_npte)
646 return H_PARAMETER; 661 return H_PARAMETER;
647 662
@@ -711,6 +726,8 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
711 int i, n = 1; 726 int i, n = 1;
712 struct revmap_entry *rev = NULL; 727 struct revmap_entry *rev = NULL;
713 728
729 if (kvm_is_radix(kvm))
730 return H_FUNCTION;
714 if (pte_index >= kvm->arch.hpt_npte) 731 if (pte_index >= kvm->arch.hpt_npte)
715 return H_PARAMETER; 732 return H_PARAMETER;
716 if (flags & H_READ_4) { 733 if (flags & H_READ_4) {
@@ -750,6 +767,8 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
750 unsigned long *rmap; 767 unsigned long *rmap;
751 long ret = H_NOT_FOUND; 768 long ret = H_NOT_FOUND;
752 769
770 if (kvm_is_radix(kvm))
771 return H_FUNCTION;
753 if (pte_index >= kvm->arch.hpt_npte) 772 if (pte_index >= kvm->arch.hpt_npte)
754 return H_PARAMETER; 773 return H_PARAMETER;
755 774
@@ -796,6 +815,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
796 unsigned long *rmap; 815 unsigned long *rmap;
797 long ret = H_NOT_FOUND; 816 long ret = H_NOT_FOUND;
798 817
818 if (kvm_is_radix(kvm))
819 return H_FUNCTION;
799 if (pte_index >= kvm->arch.hpt_npte) 820 if (pte_index >= kvm->arch.hpt_npte)
800 return H_PARAMETER; 821 return H_PARAMETER;
801 822
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 44cfdd281fa1..0b2e388f4cdf 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -62,11 +62,9 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu)
62 hcpu = hcore << threads_shift; 62 hcpu = hcore << threads_shift;
63 kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; 63 kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
64 smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); 64 smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
65 if (paca[hcpu].kvm_hstate.xics_phys) 65 kvmppc_set_host_ipi(hcpu, 1);
66 icp_native_cause_ipi_rm(hcpu); 66 smp_mb();
67 else 67 kvmhv_rm_send_ipi(hcpu);
68 opal_rm_int_set_mfrr(get_hard_smp_processor_id(hcpu),
69 IPI_PRIORITY);
70} 68}
71#else 69#else
72static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { } 70static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { }
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9338a818e05c..47414a6fe2dd 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -148,6 +148,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
148 addi r1, r1, 112 148 addi r1, r1, 112
149 ld r7, HSTATE_HOST_MSR(r13) 149 ld r7, HSTATE_HOST_MSR(r13)
150 150
151 /*
152 * If we came back from the guest via a relocation-on interrupt,
153 * we will be in virtual mode at this point, which makes it a
154 * little easier to get back to the caller.
155 */
156 mfmsr r0
157 andi. r0, r0, MSR_IR /* in real mode? */
158 bne .Lvirt_return
159
151 cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK 160 cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
152 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 161 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
153 beq 11f 162 beq 11f
@@ -181,6 +190,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
181 mtspr SPRN_HSRR1, r7 190 mtspr SPRN_HSRR1, r7
182 ba 0xe80 191 ba 0xe80
183 192
193 /* Virtual-mode return - can't get here for HMI or machine check */
194.Lvirt_return:
195 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
196 beq 16f
197 cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
198 beq 17f
199 andi. r0, r7, MSR_EE /* were interrupts hard-enabled? */
200 beq 18f
201 mtmsrd r7, 1 /* if so then re-enable them */
20218: mtlr r8
203 blr
204
20516: mtspr SPRN_HSRR0, r8 /* jump to reloc-on external vector */
206 mtspr SPRN_HSRR1, r7
207 b exc_virt_0x4500_hardware_interrupt
208
20917: mtspr SPRN_HSRR0, r8
210 mtspr SPRN_HSRR1, r7
211 b exc_virt_0x4e80_h_doorbell
212
184kvmppc_primary_no_guest: 213kvmppc_primary_no_guest:
185 /* We handle this much like a ceded vcpu */ 214 /* We handle this much like a ceded vcpu */
186 /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ 215 /* put the HDEC into the DEC, since HDEC interrupts don't wake us */
@@ -518,6 +547,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
518/* Stack frame offsets */ 547/* Stack frame offsets */
519#define STACK_SLOT_TID (112-16) 548#define STACK_SLOT_TID (112-16)
520#define STACK_SLOT_PSSCR (112-24) 549#define STACK_SLOT_PSSCR (112-24)
550#define STACK_SLOT_PID (112-32)
521 551
522.global kvmppc_hv_entry 552.global kvmppc_hv_entry
523kvmppc_hv_entry: 553kvmppc_hv_entry:
@@ -530,6 +560,7 @@ kvmppc_hv_entry:
530 * R1 = host R1 560 * R1 = host R1
531 * R2 = TOC 561 * R2 = TOC
532 * all other volatile GPRS = free 562 * all other volatile GPRS = free
563 * Does not preserve non-volatile GPRs or CR fields
533 */ 564 */
534 mflr r0 565 mflr r0
535 std r0, PPC_LR_STKOFF(r1) 566 std r0, PPC_LR_STKOFF(r1)
@@ -549,32 +580,38 @@ kvmppc_hv_entry:
549 bl kvmhv_start_timing 580 bl kvmhv_start_timing
5501: 5811:
551#endif 582#endif
552 /* Clear out SLB */ 583
584 /* Use cr7 as an indication of radix mode */
585 ld r5, HSTATE_KVM_VCORE(r13)
586 ld r9, VCORE_KVM(r5) /* pointer to struct kvm */
587 lbz r0, KVM_RADIX(r9)
588 cmpwi cr7, r0, 0
589
590 /* Clear out SLB if hash */
591 bne cr7, 2f
553 li r6,0 592 li r6,0
554 slbmte r6,r6 593 slbmte r6,r6
555 slbia 594 slbia
556 ptesync 595 ptesync
557 5962:
558 /* 597 /*
559 * POWER7/POWER8 host -> guest partition switch code. 598 * POWER7/POWER8 host -> guest partition switch code.
560 * We don't have to lock against concurrent tlbies, 599 * We don't have to lock against concurrent tlbies,
561 * but we do have to coordinate across hardware threads. 600 * but we do have to coordinate across hardware threads.
562 */ 601 */
563 /* Set bit in entry map iff exit map is zero. */ 602 /* Set bit in entry map iff exit map is zero. */
564 ld r5, HSTATE_KVM_VCORE(r13)
565 li r7, 1 603 li r7, 1
566 lbz r6, HSTATE_PTID(r13) 604 lbz r6, HSTATE_PTID(r13)
567 sld r7, r7, r6 605 sld r7, r7, r6
568 addi r9, r5, VCORE_ENTRY_EXIT 606 addi r8, r5, VCORE_ENTRY_EXIT
56921: lwarx r3, 0, r9 60721: lwarx r3, 0, r8
570 cmpwi r3, 0x100 /* any threads starting to exit? */ 608 cmpwi r3, 0x100 /* any threads starting to exit? */
571 bge secondary_too_late /* if so we're too late to the party */ 609 bge secondary_too_late /* if so we're too late to the party */
572 or r3, r3, r7 610 or r3, r3, r7
573 stwcx. r3, 0, r9 611 stwcx. r3, 0, r8
574 bne 21b 612 bne 21b
575 613
576 /* Primary thread switches to guest partition. */ 614 /* Primary thread switches to guest partition. */
577 ld r9,VCORE_KVM(r5) /* pointer to struct kvm */
578 cmpwi r6,0 615 cmpwi r6,0
579 bne 10f 616 bne 10f
580 lwz r7,KVM_LPID(r9) 617 lwz r7,KVM_LPID(r9)
@@ -590,30 +627,44 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
590 627
591 /* See if we need to flush the TLB */ 628 /* See if we need to flush the TLB */
592 lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ 629 lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */
630BEGIN_FTR_SECTION
631 /*
632 * On POWER9, individual threads can come in here, but the
633 * TLB is shared between the 4 threads in a core, hence
634 * invalidating on one thread invalidates for all.
635 * Thus we make all 4 threads use the same bit here.
636 */
637 clrrdi r6,r6,2
638END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
593 clrldi r7,r6,64-6 /* extract bit number (6 bits) */ 639 clrldi r7,r6,64-6 /* extract bit number (6 bits) */
594 srdi r6,r6,6 /* doubleword number */ 640 srdi r6,r6,6 /* doubleword number */
595 sldi r6,r6,3 /* address offset */ 641 sldi r6,r6,3 /* address offset */
596 add r6,r6,r9 642 add r6,r6,r9
597 addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */ 643 addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */
598 li r0,1 644 li r8,1
599 sld r0,r0,r7 645 sld r8,r8,r7
600 ld r7,0(r6) 646 ld r7,0(r6)
601 and. r7,r7,r0 647 and. r7,r7,r8
602 beq 22f 648 beq 22f
60323: ldarx r7,0,r6 /* if set, clear the bit */
604 andc r7,r7,r0
605 stdcx. r7,0,r6
606 bne 23b
607 /* Flush the TLB of any entries for this LPID */ 649 /* Flush the TLB of any entries for this LPID */
608 lwz r6,KVM_TLB_SETS(r9) 650 lwz r0,KVM_TLB_SETS(r9)
609 li r0,0 /* RS for P9 version of tlbiel */ 651 mtctr r0
610 mtctr r6
611 li r7,0x800 /* IS field = 0b10 */ 652 li r7,0x800 /* IS field = 0b10 */
612 ptesync 653 ptesync
61328: tlbiel r7 654 li r0,0 /* RS for P9 version of tlbiel */
655 bne cr7, 29f
65628: tlbiel r7 /* On P9, rs=0, RIC=0, PRS=0, R=0 */
614 addi r7,r7,0x1000 657 addi r7,r7,0x1000
615 bdnz 28b 658 bdnz 28b
616 ptesync 659 b 30f
66029: PPC_TLBIEL(7,0,2,1,1) /* for radix, RIC=2, PRS=1, R=1 */
661 addi r7,r7,0x1000
662 bdnz 29b
66330: ptesync
66423: ldarx r7,0,r6 /* clear the bit after TLB flushed */
665 andc r7,r7,r8
666 stdcx. r7,0,r6
667 bne 23b
617 668
618 /* Add timebase offset onto timebase */ 669 /* Add timebase offset onto timebase */
61922: ld r8,VCORE_TB_OFFSET(r5) 67022: ld r8,VCORE_TB_OFFSET(r5)
@@ -658,7 +709,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
658 beq kvmppc_primary_no_guest 709 beq kvmppc_primary_no_guest
659kvmppc_got_guest: 710kvmppc_got_guest:
660 711
661 /* Load up guest SLB entries */ 712 /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
662 lwz r5,VCPU_SLB_MAX(r4) 713 lwz r5,VCPU_SLB_MAX(r4)
663 cmpwi r5,0 714 cmpwi r5,0
664 beq 9f 715 beq 9f
@@ -696,8 +747,10 @@ kvmppc_got_guest:
696BEGIN_FTR_SECTION 747BEGIN_FTR_SECTION
697 mfspr r5, SPRN_TIDR 748 mfspr r5, SPRN_TIDR
698 mfspr r6, SPRN_PSSCR 749 mfspr r6, SPRN_PSSCR
750 mfspr r7, SPRN_PID
699 std r5, STACK_SLOT_TID(r1) 751 std r5, STACK_SLOT_TID(r1)
700 std r6, STACK_SLOT_PSSCR(r1) 752 std r6, STACK_SLOT_PSSCR(r1)
753 std r7, STACK_SLOT_PID(r1)
701END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 754END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
702 755
703BEGIN_FTR_SECTION 756BEGIN_FTR_SECTION
@@ -824,6 +877,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
824 mtspr SPRN_PID, r7 877 mtspr SPRN_PID, r7
825 mtspr SPRN_WORT, r8 878 mtspr SPRN_WORT, r8
826BEGIN_FTR_SECTION 879BEGIN_FTR_SECTION
880 PPC_INVALIDATE_ERAT
881END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
882BEGIN_FTR_SECTION
827 /* POWER8-only registers */ 883 /* POWER8-only registers */
828 ld r5, VCPU_TCSCR(r4) 884 ld r5, VCPU_TCSCR(r4)
829 ld r6, VCPU_ACOP(r4) 885 ld r6, VCPU_ACOP(r4)
@@ -1057,13 +1113,13 @@ hdec_soon:
1057kvmppc_interrupt_hv: 1113kvmppc_interrupt_hv:
1058 /* 1114 /*
1059 * Register contents: 1115 * Register contents:
1060 * R12 = interrupt vector 1116 * R12 = (guest CR << 32) | interrupt vector
1061 * R13 = PACA 1117 * R13 = PACA
1062 * guest CR, R12 saved in shadow VCPU SCRATCH1/0 1118 * guest R12 saved in shadow VCPU SCRATCH0
1119 * guest CTR saved in shadow VCPU SCRATCH1 if RELOCATABLE
1063 * guest R13 saved in SPRN_SCRATCH0 1120 * guest R13 saved in SPRN_SCRATCH0
1064 */ 1121 */
1065 std r9, HSTATE_SCRATCH2(r13) 1122 std r9, HSTATE_SCRATCH2(r13)
1066
1067 lbz r9, HSTATE_IN_GUEST(r13) 1123 lbz r9, HSTATE_IN_GUEST(r13)
1068 cmpwi r9, KVM_GUEST_MODE_HOST_HV 1124 cmpwi r9, KVM_GUEST_MODE_HOST_HV
1069 beq kvmppc_bad_host_intr 1125 beq kvmppc_bad_host_intr
@@ -1094,8 +1150,9 @@ kvmppc_interrupt_hv:
1094 std r10, VCPU_GPR(R10)(r9) 1150 std r10, VCPU_GPR(R10)(r9)
1095 std r11, VCPU_GPR(R11)(r9) 1151 std r11, VCPU_GPR(R11)(r9)
1096 ld r3, HSTATE_SCRATCH0(r13) 1152 ld r3, HSTATE_SCRATCH0(r13)
1097 lwz r4, HSTATE_SCRATCH1(r13)
1098 std r3, VCPU_GPR(R12)(r9) 1153 std r3, VCPU_GPR(R12)(r9)
1154 /* CR is in the high half of r12 */
1155 srdi r4, r12, 32
1099 stw r4, VCPU_CR(r9) 1156 stw r4, VCPU_CR(r9)
1100BEGIN_FTR_SECTION 1157BEGIN_FTR_SECTION
1101 ld r3, HSTATE_CFAR(r13) 1158 ld r3, HSTATE_CFAR(r13)
@@ -1114,6 +1171,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
1114 mfspr r11, SPRN_SRR1 1171 mfspr r11, SPRN_SRR1
1115 std r10, VCPU_SRR0(r9) 1172 std r10, VCPU_SRR0(r9)
1116 std r11, VCPU_SRR1(r9) 1173 std r11, VCPU_SRR1(r9)
1174 /* trap is in the low half of r12, clear CR from the high half */
1175 clrldi r12, r12, 32
1117 andi. r0, r12, 2 /* need to read HSRR0/1? */ 1176 andi. r0, r12, 2 /* need to read HSRR0/1? */
1118 beq 1f 1177 beq 1f
1119 mfspr r10, SPRN_HSRR0 1178 mfspr r10, SPRN_HSRR0
@@ -1149,7 +1208,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
114911: stw r3,VCPU_HEIR(r9) 120811: stw r3,VCPU_HEIR(r9)
1150 1209
1151 /* these are volatile across C function calls */ 1210 /* these are volatile across C function calls */
1211#ifdef CONFIG_RELOCATABLE
1212 ld r3, HSTATE_SCRATCH1(r13)
1213 mtctr r3
1214#else
1152 mfctr r3 1215 mfctr r3
1216#endif
1153 mfxer r4 1217 mfxer r4
1154 std r3, VCPU_CTR(r9) 1218 std r3, VCPU_CTR(r9)
1155 std r4, VCPU_XER(r9) 1219 std r4, VCPU_XER(r9)
@@ -1285,11 +1349,15 @@ mc_cont:
1285 mtspr SPRN_CTRLT,r6 1349 mtspr SPRN_CTRLT,r6
12864: 13504:
1287 /* Read the guest SLB and save it away */ 1351 /* Read the guest SLB and save it away */
1352 ld r5, VCPU_KVM(r9)
1353 lbz r0, KVM_RADIX(r5)
1354 cmpwi r0, 0
1355 li r5, 0
1356 bne 3f /* for radix, save 0 entries */
1288 lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ 1357 lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */
1289 mtctr r0 1358 mtctr r0
1290 li r6,0 1359 li r6,0
1291 addi r7,r9,VCPU_SLB 1360 addi r7,r9,VCPU_SLB
1292 li r5,0
12931: slbmfee r8,r6 13611: slbmfee r8,r6
1294 andis. r0,r8,SLB_ESID_V@h 1362 andis. r0,r8,SLB_ESID_V@h
 	beq	2f
@@ -1301,7 +1369,7 @@ mc_cont:
 	addi	r5,r5,1
 2:	addi	r6,r6,1
 	bdnz	1b
-	stw	r5,VCPU_SLB_MAX(r9)
+3:	stw	r5,VCPU_SLB_MAX(r9)
 
 	/*
 	 * Save the guest PURR/SPURR
@@ -1550,9 +1618,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 BEGIN_FTR_SECTION
 	ld	r5, STACK_SLOT_TID(r1)
 	ld	r6, STACK_SLOT_PSSCR(r1)
+	ld	r7, STACK_SLOT_PID(r1)
 	mtspr	SPRN_TIDR, r5
 	mtspr	SPRN_PSSCR, r6
+	mtspr	SPRN_PID, r7
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+BEGIN_FTR_SECTION
+	PPC_INVALIDATE_ERAT
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
 
 	/*
 	 * POWER7/POWER8 guest -> host partition switch code.
@@ -1663,6 +1736,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	isync
 
 	/* load host SLB entries */
+BEGIN_MMU_FTR_SECTION
+	b	0f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
 	ld	r8,PACA_SLBSHADOWPTR(r13)
 
 	.rept	SLB_NUM_BOLTED
@@ -1675,7 +1751,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	slbmte	r6,r5
 1:	addi	r8,r8,16
 	.endr
-
+0:
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
 	/* Finish timing, if we have a vcpu */
 	ld	r4, HSTATE_KVM_VCPU(r13)
@@ -1702,11 +1778,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  * reflect the HDSI to the guest as a DSI.
  */
 kvmppc_hdsi:
+	ld	r3, VCPU_KVM(r9)
+	lbz	r0, KVM_RADIX(r3)
+	cmpwi	r0, 0
 	mfspr	r4, SPRN_HDAR
 	mfspr	r6, SPRN_HDSISR
+	bne	.Lradix_hdsi		/* on radix, just save DAR/DSISR/ASDR */
 	/* HPTE not found fault or protection fault? */
 	andis.	r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h
 	beq	1f			/* if not, send it to the guest */
+BEGIN_FTR_SECTION
+	mfspr	r5, SPRN_ASDR		/* on POWER9, use ASDR to get VSID */
+	b	4f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	andi.	r0, r11, MSR_DR		/* data relocation enabled? */
 	beq	3f
 	clrrdi	r0, r4, 28
@@ -1776,13 +1860,29 @@ fast_interrupt_c_return:
 	stb	r0, HSTATE_IN_GUEST(r13)
 	b	guest_exit_cont
 
+.Lradix_hdsi:
+	std	r4, VCPU_FAULT_DAR(r9)
+	stw	r6, VCPU_FAULT_DSISR(r9)
+.Lradix_hisi:
+	mfspr	r5, SPRN_ASDR
+	std	r5, VCPU_FAULT_GPA(r9)
+	b	guest_exit_cont
+
 /*
  * Similarly for an HISI, reflect it to the guest as an ISI unless
  * it is an HPTE not found fault for a page that we have paged out.
  */
 kvmppc_hisi:
+	ld	r3, VCPU_KVM(r9)
+	lbz	r0, KVM_RADIX(r3)
+	cmpwi	r0, 0
+	bne	.Lradix_hisi		/* for radix, just save ASDR */
 	andis.	r0, r11, SRR1_ISI_NOPT@h
 	beq	1f
+BEGIN_FTR_SECTION
+	mfspr	r5, SPRN_ASDR		/* on POWER9, use ASDR to get VSID */
+	b	4f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	andi.	r0, r11, MSR_IR		/* instruction relocation enabled? */
 	beq	3f
 	clrrdi	r0, r10, 28
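
The new radix checks in kvmppc_hdsi/kvmppc_hisi replace the whole HPT search
with a straight record-and-exit. A standalone C sketch of that path, with
illustrative names (record_radix_hdsi and struct fault_info are not from the
patch; the fields correspond to the VCPU_FAULT_DAR/DSISR/GPA saves above):

#include <stdint.h>

/* Sketch: what the .Lradix_hdsi path above records before exiting. */
struct fault_info {
	uint64_t dar;	/* faulting effective address (HDAR) */
	uint32_t dsisr;	/* fault reason bits (HDSISR) */
	uint64_t gpa;	/* guest real address (ASDR) */
};

static void record_radix_hdsi(uint64_t hdar, uint32_t hdsisr, uint64_t asdr,
			      struct fault_info *f)
{
	/* On a radix guest there is no HPT to search in real mode: save
	 * the fault details and let the host walk the guest radix tree. */
	f->dar = hdar;
	f->dsisr = hdsisr;
	f->gpa = asdr;
}
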
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index ca8f174289bb..2a2b96d53999 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -167,20 +167,38 @@ kvmppc_handler_trampoline_enter_end:
  *                                                                           *
  *****************************************************************************/
 
-.global kvmppc_handler_trampoline_exit
-kvmppc_handler_trampoline_exit:
-
 .global kvmppc_interrupt_pr
 kvmppc_interrupt_pr:
+	/* 64-bit entry. Register usage at this point:
+	 *
+	 * SPRG_SCRATCH0   = guest R13
+	 * R12             = (guest CR << 32) | exit handler id
+	 * R13             = PACA
+	 * HSTATE.SCRATCH0 = guest R12
+	 * HSTATE.SCRATCH1 = guest CTR if RELOCATABLE
+	 */
+#ifdef CONFIG_PPC64
+	/* Match 32-bit entry */
+#ifdef CONFIG_RELOCATABLE
+	std	r9, HSTATE_SCRATCH2(r13)
+	ld	r9, HSTATE_SCRATCH1(r13)
+	mtctr	r9
+	ld	r9, HSTATE_SCRATCH2(r13)
+#endif
+	rotldi	r12, r12, 32		   /* Flip R12 halves for stw */
+	stw	r12, HSTATE_SCRATCH1(r13)  /* CR is now in the low half */
+	srdi	r12, r12, 32		   /* shift trap into low half */
+#endif
 
+.global kvmppc_handler_trampoline_exit
+kvmppc_handler_trampoline_exit:
 	/* Register usage at this point:
 	 *
 	 * SPRG_SCRATCH0   = guest R13
 	 * R12             = exit handler id
 	 * R13             = shadow vcpu (32-bit) or PACA (64-bit)
 	 * HSTATE.SCRATCH0 = guest R12
 	 * HSTATE.SCRATCH1 = guest CR
-	 *
 	 */
 
 	/* Save registers */
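
The rotldi/stw/srdi triple above unpacks a 64-bit R12 carrying the guest CR in
its high half and the exit handler id (trap) in its low half. The same
arithmetic in standalone C, with hypothetical names:

#include <stdint.h>

/* Sketch of the R12 unpacking done by the 64-bit entry code above. */
static void unpack_r12(uint64_t r12, uint32_t *cr, uint32_t *trap)
{
	uint64_t rot = (r12 << 32) | (r12 >> 32);	/* rotldi r12,r12,32 */

	*cr = (uint32_t)rot;		/* stw: CR sits in the low word */
	*trap = (uint32_t)(rot >> 32);	/* srdi: trap back in the low half */
}
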
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index cd892dec7cb6..40a5b2d75ed1 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -565,6 +565,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_PPC_HWRNG:
 		r = kvmppc_hwrng_present();
 		break;
+	case KVM_CAP_PPC_MMU_RADIX:
+		r = !!(hv_enabled && radix_enabled());
+		break;
+	case KVM_CAP_PPC_MMU_HASH_V3:
+		r = !!(hv_enabled && !radix_enabled() &&
+		       cpu_has_feature(CPU_FTR_ARCH_300));
+		break;
 #endif
 	case KVM_CAP_SYNC_MMU:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
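
Userspace can probe the two new capabilities with the standard
KVM_CHECK_EXTENSION ioctl; a minimal sketch, assuming kernel headers that
already carry the new KVM_CAP_* values (error handling elided):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	/* Both report 0 unless HV KVM is usable; at most one can be 1,
	 * since the host runs either radix or hash, not both. */
	int radix = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_MMU_RADIX);
	int hash3 = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_MMU_HASH_V3);

	printf("radix MMU: %d, hash v3 MMU: %d\n", radix, hash3);
	return 0;
}
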
@@ -1468,6 +1475,31 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
 		break;
 	}
+	case KVM_PPC_CONFIGURE_V3_MMU: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_ppc_mmuv3_cfg cfg;
+
+		r = -EINVAL;
+		if (!kvm->arch.kvm_ops->configure_mmu)
+			goto out;
+		r = -EFAULT;
+		if (copy_from_user(&cfg, argp, sizeof(cfg)))
+			goto out;
+		r = kvm->arch.kvm_ops->configure_mmu(kvm, &cfg);
+		break;
+	}
+	case KVM_PPC_GET_RMMU_INFO: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_ppc_rmmu_info info;
+
+		r = -EINVAL;
+		if (!kvm->arch.kvm_ops->get_rmmu_info)
+			goto out;
+		r = kvm->arch.kvm_ops->get_rmmu_info(kvm, &info);
+		if (r >= 0 && copy_to_user(argp, &info, sizeof(info)))
+			r = -EFAULT;
+		break;
+	}
 	default: {
 		struct kvm *kvm = filp->private_data;
 		r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg);
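
From userspace, the two vm ioctls combine as below. A hedged sketch: the
process-table value is a placeholder chosen by the VMM, and its encoding
follows the ISA v3.00 PTCR layout rather than anything this sketch defines:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: put a VM into radix mode and register its process table.
 * vm_fd is an open KVM VM fd; proc_table is a placeholder value. */
static int configure_radix(int vm_fd, __u64 proc_table)
{
	struct kvm_ppc_rmmu_info info;
	struct kvm_ppc_mmuv3_cfg cfg;

	/* Optional: query the supported radix geometries first. */
	if (ioctl(vm_fd, KVM_PPC_GET_RMMU_INFO, &info) < 0)
		return -1;

	memset(&cfg, 0, sizeof(cfg));
	cfg.flags = KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE;
	cfg.process_table = proc_table;
	return ioctl(vm_fd, KVM_PPC_CONFIGURE_V3_MMU, &cfg);
}
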
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index a175cd82ae8c..2be5dc242832 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -41,6 +41,7 @@ static void pmd_ctor(void *addr)
 }
 
 struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+EXPORT_SYMBOL_GPL(pgtable_cache);	/* used by kvm_hv module */
 
 /*
  * Create a kmem_cache() for pagetables.  This is not used for PTE
@@ -82,7 +83,7 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
 	pgtable_cache[shift - 1] = new;
 	pr_debug("Allocated pgtable cache for order %d\n", shift);
 }
-
+EXPORT_SYMBOL_GPL(pgtable_cache_add);	/* used by kvm_hv module */
 
 void pgtable_cache_init(void)
 {
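
These exports exist so that kvm_hv, when built as a module, can allocate guest
radix page tables from the same per-order caches the host MMU code uses. A
rough sketch of the module-side pattern, assuming the existing PGT_CACHE()
accessor (the helper name and shift handling here are illustrative):

#include <linux/slab.h>
#include <asm/pgalloc.h>

/* Sketch only: allocate a page-table page of the given index size. */
static void *alloc_pgtable_page(unsigned int index_size)
{
	/* Create the cache for this order if it doesn't exist yet. */
	pgtable_cache_add(index_size, NULL);
	if (!PGT_CACHE(index_size))
		return NULL;
	return kmem_cache_zalloc(PGT_CACHE(index_size), GFP_KERNEL);
}
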
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 93abf8a9813d..10c9a545a646 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -42,6 +42,8 @@
 #include <linux/memblock.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -344,12 +346,45 @@ static int __init parse_disable_radix(char *p)
 }
 early_param("disable_radix", parse_disable_radix);
 
+/*
+ * If we're running under a hypervisor, we need to check the contents of
+ * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do
+ * radix.  If not, we clear the radix feature bit so we fall back to hash.
+ */
+static void early_check_vec5(void)
+{
+	unsigned long root, chosen;
+	int size;
+	const u8 *vec5;
+
+	root = of_get_flat_dt_root();
+	chosen = of_get_flat_dt_subnode_by_name(root, "chosen");
+	if (chosen == -FDT_ERR_NOTFOUND)
+		return;
+	vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size);
+	if (!vec5)
+		return;
+	if (size <= OV5_INDX(OV5_MMU_RADIX_300) ||
+	    !(vec5[OV5_INDX(OV5_MMU_RADIX_300)] & OV5_FEAT(OV5_MMU_RADIX_300)))
+		/* Hypervisor doesn't support radix */
+		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+}
+
 void __init mmu_early_init_devtree(void)
 {
 	/* Disable radix mode based on kernel command line. */
 	if (disable_radix)
 		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
 
+	/*
+	 * Check /chosen/ibm,architecture-vec-5 if running as a guest.
+	 * When running bare-metal, we can use radix if we like
+	 * even though the ibm,architecture-vec-5 property created by
+	 * skiboot doesn't have the necessary bits set.
+	 */
+	if (early_radix_enabled() && !(mfmsr() & MSR_HV))
+		early_check_vec5();
+
 	if (early_radix_enabled())
 		radix__early_init_devtree();
 	else
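
The vec5 test is plain byte/bit indexing into the ibm,architecture-vec-5
property. Assuming the usual OV5_* encoding (byte index in the high bits,
feature bit mask in the low byte; the helper name is illustrative), it
reduces to:

/* Standalone sketch of the option-vector-5 feature check above. */
static int vec5_has_feature(const unsigned char *vec5, int size,
			    unsigned int ov5)
{
	int index = ov5 >> 8;			/* OV5_INDX() */
	unsigned char mask = ov5 & 0xff;	/* OV5_FEAT() */

	if (size <= index)
		return 0;	/* property too short: treat as unsupported */
	return (vec5[index] & mask) != 0;
}
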
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index cfa53ccc8baf..94323c4ececc 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -401,6 +401,8 @@ void __init radix__early_init_mmu(void)
 		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 		radix_init_partition_table();
 		radix_init_amor();
+	} else {
+		radix_init_pseries();
 	}
 
 	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 8bca7f58afc4..d6b5e5cde412 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -454,13 +454,23 @@ void __init mmu_partition_table_init(void)
 void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
 				   unsigned long dw1)
 {
+	unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
+
 	partition_tb[lpid].patb0 = cpu_to_be64(dw0);
 	partition_tb[lpid].patb1 = cpu_to_be64(dw1);
 
-	/* Global flush of TLBs and partition table caches for this lpid */
+	/*
+	 * Global flush of TLBs and partition table caches for this lpid.
+	 * The type of flush (hash or radix) depends on what the previous
+	 * use of this partition ID was, not the new use.
+	 */
 	asm volatile("ptesync" : : : "memory");
-	asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
-		     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+	if (old & PATB_HR)
+		asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
+			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+	else
+		asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
 	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
 }
 EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
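
The fifth operand of PPC_TLBIE_5 selects radix (1) versus hash (0)
invalidation, and PATB_HR is the host-radix bit in the first doubleword of a
partition-table entry, so the flush has to match the translation mode the
stale entries were created under. A standalone restatement, where
flush_tlb_lpid() is a hypothetical stand-in for the inline asm:

#include <stdint.h>

#define PATB_HR	(1UL << 63)	/* host-radix bit in patb0 */

/* Hypothetical stand-in for the PPC_TLBIE_5 inline asm above. */
extern void flush_tlb_lpid(unsigned int lpid, int radix);

static void flush_for_old_entry(unsigned int lpid, uint64_t old_patb0)
{
	/* Stale TLB/caching entries were created under the *old* mode,
	 * so the R operand of tlbie must match that mode, not the new. */
	flush_tlb_lpid(lpid, (old_patb0 & PATB_HR) ? 1 : 0);
}
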
diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
index ea7f09bd73b1..7d67623203b8 100644
--- a/arch/powerpc/platforms/pseries/firmware.c
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -126,7 +126,7 @@ static void __init fw_vec5_feature_init(const char *vec5, unsigned long len)
 		index = OV5_INDX(vec5_fw_features_table[i].feature);
 		feat = OV5_FEAT(vec5_fw_features_table[i].feature);
 
-		if (vec5[index] & feat)
+		if (index < len && (vec5[index] & feat))
 			powerpc_firmware_features |=
 				vec5_fw_features_table[i].val;
 	}
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 5dc1c3c6e716..0587655aea69 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -609,6 +609,29 @@ static int __init disable_bulk_remove(char *str)
 
 __setup("bulk_remove=", disable_bulk_remove);
 
+/* Actually only used for radix, so far */
+static int pseries_lpar_register_process_table(unsigned long base,
+			unsigned long page_size, unsigned long table_size)
+{
+	long rc;
+	unsigned long flags = PROC_TABLE_NEW;
+
+	if (radix_enabled())
+		flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE;
+	for (;;) {
+		rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
+					page_size, table_size);
+		if (!H_IS_LONG_BUSY(rc))
+			break;
+		mdelay(get_longbusy_msecs(rc));
+	}
+	if (rc != H_SUCCESS) {
+		pr_err("Failed to register process table (rc=%ld)\n", rc);
+		BUG();
+	}
+	return rc;
+}
+
 void __init hpte_init_pseries(void)
 {
 	mmu_hash_ops.hpte_invalidate	 = pSeries_lpar_hpte_invalidate;
@@ -622,6 +645,12 @@ void __init hpte_init_pseries(void)
 	mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
 }
 
+void radix_init_pseries(void)
+{
+	pr_info("Using radix MMU under hypervisor\n");
+	register_process_table = pseries_lpar_register_process_table;
+}
+
 #ifdef CONFIG_PPC_SMLPAR
 #define CMO_FREE_HINT_DEFAULT	1
 static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;
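
The retry loop in pseries_lpar_register_process_table() follows the standard
PAPR long-busy protocol: instead of completing, the hypervisor may return an
H_LONG_BUSY_* hint, and the caller delays for the hinted interval and retries.
A hedged kernel-style sketch of the general pattern (do_hcall is a stand-in
for any plpar_hcall_norets() invocation):

#include <linux/delay.h>
#include <asm/hvcall.h>

/* Sketch: generic PAPR long-busy retry, as used by the hcall above. */
static long hcall_retry_longbusy(long (*do_hcall)(void *arg), void *arg)
{
	long rc;

	for (;;) {
		rc = do_hcall(arg);
		if (!H_IS_LONG_BUSY(rc))
			return rc;	/* H_SUCCESS or a real error */
		/* The return code encodes how long to back off. */
		mdelay(get_longbusy_msecs(rc));
	}
}
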
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index cac48eda1075..e0035808c814 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -871,6 +871,8 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_USER_INSTR0 130
 #define KVM_CAP_MSI_DEVID 131
 #define KVM_CAP_PPC_HTM 132
+#define KVM_CAP_PPC_MMU_RADIX 134
+#define KVM_CAP_PPC_MMU_HASH_V3 135
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1187,6 +1189,10 @@ struct kvm_s390_ucas_mapping {
 #define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
 /* Available with KVM_CAP_PPC_RTAS */
 #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xac, struct kvm_rtas_token_args)
+/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */
+#define KVM_PPC_CONFIGURE_V3_MMU  _IOW(KVMIO,  0xaf, struct kvm_ppc_mmuv3_cfg)
+/* Available with KVM_CAP_PPC_RADIX_MMU */
+#define KVM_PPC_GET_RMMU_INFO	  _IOW(KVMIO,  0xb0, struct kvm_ppc_rmmu_info)
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)