author     Paolo Bonzini <pbonzini@redhat.com>    2019-05-15 17:38:42 -0400
committer  Paolo Bonzini <pbonzini@redhat.com>    2019-05-15 17:39:38 -0400
commit     59c5c58c5b93285753d5c1de34d2e00039c27bc0 (patch)
tree       19ac0493a5eb3bef477cb04f8117dad12b6bddb9
parent     f93f7ede087f2edcc18e4b02310df5749a6b5a61 (diff)
parent     4894fbcce856635c9ab79f44e50826e86bb92110 (diff)
Merge tag 'kvm-ppc-next-5.2-2' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc into HEAD
PPC KVM update for 5.2

* Support for guests to access the new POWER9 XIVE interrupt controller
  hardware directly, reducing interrupt latency and overhead for guests.
* In-kernel implementation of the H_PAGE_INIT hypercall.
* Reduce memory usage of sparsely-populated IOMMU tables.
* Several bug fixes.

Second PPC KVM update for 5.2

* Fix a bug, fix a spelling mistake, remove some useless code.
-rw-r--r--  Documentation/powerpc/DAWR-POWER9.txt      |   32
-rw-r--r--  Documentation/virtual/kvm/api.txt          |   10
-rw-r--r--  Documentation/virtual/kvm/devices/xive.txt |  197
-rw-r--r--  arch/powerpc/include/asm/hw_breakpoint.h   |    8
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h        |   11
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h         |   41
-rw-r--r--  arch/powerpc/include/asm/opal-api.h        |    7
-rw-r--r--  arch/powerpc/include/asm/opal.h            |    7
-rw-r--r--  arch/powerpc/include/asm/xive.h            |   17
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm.h        |   46
-rw-r--r--  arch/powerpc/kernel/hw_breakpoint.c        |   62
-rw-r--r--  arch/powerpc/kernel/process.c              |    9
-rw-r--r--  arch/powerpc/kernel/ptrace.c               |    3
-rw-r--r--  arch/powerpc/kvm/Makefile                  |    2
-rw-r--r--  arch/powerpc/kvm/book3s.c                  |   42
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio.c           |  102
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio_hv.c        |  105
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c               |  159
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c       |   57
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c        |  144
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S    |  109
-rw-r--r--  arch/powerpc/kvm/book3s_xive.c             |  250
-rw-r--r--  arch/powerpc/kvm/book3s_xive.h             |   37
-rw-r--r--  arch/powerpc/kvm/book3s_xive_native.c      | 1249
-rw-r--r--  arch/powerpc/kvm/book3s_xive_template.c    |   78
-rw-r--r--  arch/powerpc/kvm/powerpc.c                 |   37
-rw-r--r--  arch/powerpc/platforms/powernv/opal-call.c |    3
-rw-r--r--  arch/powerpc/sysdev/xive/native.c          |  110
-rw-r--r--  include/linux/kvm_host.h                   |   10
-rw-r--r--  include/uapi/linux/kvm.h                   |    3
-rw-r--r--  virt/kvm/kvm_main.c                        |   18
31 files changed, 2670 insertions, 295 deletions
diff --git a/Documentation/powerpc/DAWR-POWER9.txt b/Documentation/powerpc/DAWR-POWER9.txt
index 2feaa6619658..bdec03650941 100644
--- a/Documentation/powerpc/DAWR-POWER9.txt
+++ b/Documentation/powerpc/DAWR-POWER9.txt
@@ -56,3 +56,35 @@ POWER9. Loads and stores to the watchpoint locations will not be
 trapped in GDB. The watchpoint is remembered, so if the guest is
 migrated back to the POWER8 host, it will start working again.
 
+Force enabling the DAWR
+=============================
+Kernels (since ~v5.2) have an option to force enable the DAWR via:
+
+  echo Y > /sys/kernel/debug/powerpc/dawr_enable_dangerous
+
+This enables the DAWR even on POWER9.
+
+This is a dangerous setting, USE AT YOUR OWN RISK.
+
+Some users may not care about a bad user crashing their box
+(i.e. single user/desktop systems) and really want the DAWR. This
+allows them to force enable DAWR.
+
+This flag can also be used to disable DAWR access. Once this is
+cleared, all DAWR access should be cleared immediately and your
+machine is once again safe from crashing.
+
+Userspace may get confused by toggling this. If DAWR is force
+enabled/disabled between getting the number of breakpoints (via
+PTRACE_GETHWDBGINFO) and setting the breakpoint, userspace will get an
+inconsistent view of what's available. Similarly for guests.
+
+For the DAWR to be enabled in a KVM guest, the DAWR needs to be force
+enabled in the host AND the guest. For this reason, this won't work on
+POWERVM as it doesn't allow the HCALL to work. Writes of 'Y' to the
+dawr_enable_dangerous file will fail if the hypervisor doesn't support
+writing the DAWR.
+
+To double check the DAWR is working, run this kernel selftest:
+  tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
+Any errors/failures/skips mean something is wrong.
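
As a minimal illustration of the PTRACE_GETHWDBGINFO check mentioned above (not part of this patch), a tracer can ask the kernel which breakpoint features are advertised for a traced child. The sketch assumes a powerpc host where PPC_PTRACE_GETHWDBGINFO, struct ppc_debug_info and PPC_DEBUG_FEATURE_DATA_BP_DAWR are provided by <asm/ptrace.h>, as in the ptrace-hwbreak.c selftest:

/*
 * Illustrative sketch only (not part of this patch): query the hardware
 * breakpoint features the kernel advertises for a traced child.
 */
#include <sys/ptrace.h>
#include <sys/types.h>
#include <asm/ptrace.h>

static int dawr_advertised(pid_t traced_child)
{
        struct ppc_debug_info info;

        /* Fills 'info' with the debug capabilities of the tracee */
        if (ptrace(PPC_PTRACE_GETHWDBGINFO, traced_child, NULL, &info) == -1)
                return -1;

        /* Set when the DAWR is usable, e.g. after the force enable above */
        return !!(info.features & PPC_DEBUG_FEATURE_DATA_BP_DAWR);
}
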
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 47a5eb00bc53..8ffd9beb931b 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1967,6 +1967,7 @@ registers, find a list below:
   PPC   | KVM_REG_PPC_TLB3PS            | 32
   PPC   | KVM_REG_PPC_EPTCFG            | 32
   PPC   | KVM_REG_PPC_ICP_STATE         | 64
+  PPC   | KVM_REG_PPC_VP_STATE          | 128
   PPC   | KVM_REG_PPC_TB_OFFSET         | 64
   PPC   | KVM_REG_PPC_SPMC1             | 32
   PPC   | KVM_REG_PPC_SPMC2             | 32
@@ -4487,6 +4488,15 @@ struct kvm_sync_regs {
         struct kvm_vcpu_events events;
 };
 
+6.75 KVM_CAP_PPC_IRQ_XIVE
+
+Architectures: ppc
+Target: vcpu
+Parameters: args[0] is the XIVE device fd
+            args[1] is the XIVE CPU number (server ID) for this vcpu
+
+This capability connects the vcpu to an in-kernel XIVE device.
+
 7. Capabilities that can be enabled on VMs
 ------------------------------------------
 
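
As a rough userspace sketch of the KVM_CAP_PPC_IRQ_XIVE flow documented above (not part of this patch; vm_fd, vcpu_fd and server are placeholder values), the VMM creates the in-kernel XIVE device on the VM fd and then connects a vcpu to it by enabling the capability with the device fd and the vcpu's server ID:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int connect_vcpu_to_xive(int vm_fd, int vcpu_fd, unsigned long server)
{
        struct kvm_create_device cd = { .type = KVM_DEV_TYPE_XIVE };
        struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_IRQ_XIVE };

        /* cd.fd is filled in by the kernel with the new device fd */
        if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
                return -1;

        cap.args[0] = cd.fd;    /* args[0]: XIVE device fd */
        cap.args[1] = server;   /* args[1]: server ID for this vcpu */
        return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}

In practice the VMM creates the single XIVE device once per VM and repeats the KVM_ENABLE_CAP step for every vcpu.
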
diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
new file mode 100644
index 000000000000..9a24a4525253
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/xive.txt
@@ -0,0 +1,197 @@
+POWER9 eXternal Interrupt Virtualization Engine (XIVE Gen1)
+==========================================================
+
+Device types supported:
+  KVM_DEV_TYPE_XIVE     POWER9 XIVE Interrupt Controller generation 1
+
+This device acts as a VM interrupt controller. It provides the KVM
+interface to configure the interrupt sources of a VM in the underlying
+POWER9 XIVE interrupt controller.
+
+Only one XIVE instance may be instantiated. A guest XIVE device
+requires a POWER9 host and the guest OS should have support for the
+XIVE native exploitation interrupt mode. If not, it should run using
+the legacy interrupt mode, referred to as XICS (POWER7/8).
+
+* Device Mappings
+
+  The KVM device exposes different MMIO ranges of the XIVE HW which
+  are required for interrupt management. These are exposed to the
+  guest in VMAs populated with a custom VM fault handler.
+
+  1. Thread Interrupt Management Area (TIMA)
+
+  Each thread has an associated Thread Interrupt Management context
+  composed of a set of registers. These registers let the thread
+  handle priority management and interrupt acknowledgment. The most
+  important are:
+
+      - Interrupt Pending Buffer     (IPB)
+      - Current Processor Priority   (CPPR)
+      - Notification Source Register (NSR)
+
+  They are exposed to software in four different pages, each providing
+  a view with a different privilege. The first page is for the
+  physical thread context and the second for the hypervisor. Only the
+  third (operating system) and the fourth (user level) are exposed to
+  the guest.
+
+  2. Event State Buffer (ESB)
+
+  Each source is associated with an Event State Buffer (ESB) with
+  an even/odd pair of pages which provides commands to
+  manage the source: to trigger it, to EOI it, or to turn it off, for
+  instance.
+
+  3. Device pass-through
+
+  When a device is passed-through into the guest, the source
+  interrupts are from a different HW controller (PHB4) and the ESB
+  pages exposed to the guest should accommodate this change.
+
+  The passthru_irq helpers, kvmppc_xive_set_mapped() and
+  kvmppc_xive_clr_mapped(), are called when the device HW irqs are
+  mapped into or unmapped from the guest IRQ number space. The KVM
+  device extends these helpers to clear the ESB pages of the guest IRQ
+  number being mapped and then lets the VM fault handler repopulate.
+  The handler will insert the ESB page corresponding to the HW
+  interrupt of the device being passed-through or the initial IPI ESB
+  page if the device has been removed.
+
+  The ESB remapping is fully transparent to the guest and the OS
+  device driver. All handling is done within VFIO and the above
+  helpers in KVM-PPC.
+
+* Groups:
+
+  1. KVM_DEV_XIVE_GRP_CTRL
+  Provides global controls on the device
+  Attributes:
+    1.1 KVM_DEV_XIVE_RESET (write only)
+    Resets the interrupt controller configuration for sources and event
+    queues. To be used by kexec and kdump.
+    Errors: none
+
+    1.2 KVM_DEV_XIVE_EQ_SYNC (write only)
+    Syncs all the sources and queues and marks the EQ pages dirty. This
+    is to make sure that a consistent memory state is captured when
+    migrating the VM.
+    Errors: none
+
+  2. KVM_DEV_XIVE_GRP_SOURCE (write only)
+  Initializes a new source in the XIVE device and masks it.
+  Attributes:
+    Interrupt source number  (64-bit)
+    The kvm_device_attr.addr points to a __u64 value:
+    bits:     | 63   ....  2 |   1   |   0
+    values:   |    unused    | level | type
+    - type:  0:MSI 1:LSI
+    - level: assertion level in case of an LSI.
+    Errors:
+      -E2BIG:  Interrupt source number is out of range
+      -ENOMEM: Could not create a new source block
+      -EFAULT: Invalid user pointer for attr->addr.
+      -ENXIO:  Could not allocate underlying HW interrupt
+
+  3. KVM_DEV_XIVE_GRP_SOURCE_CONFIG (write only)
+  Configures source targeting
+  Attributes:
+    Interrupt source number  (64-bit)
+    The kvm_device_attr.addr points to a __u64 value:
+    bits:     | 63   ....   33 |  32  | 31 .. 3 |  2 .. 0
+    values:   |      eisn      | mask |  server | priority
+    - priority: 0-7 interrupt priority level
+    - server: CPU number chosen to handle the interrupt
+    - mask: mask flag (unused)
+    - eisn: Effective Interrupt Source Number
+    Errors:
+      -ENOENT: Unknown source number
+      -EINVAL: Not initialized source number
+      -EINVAL: Invalid priority
+      -EINVAL: Invalid CPU number.
+      -EFAULT: Invalid user pointer for attr->addr.
+      -ENXIO:  CPU event queues not configured or configuration of the
+               underlying HW interrupt failed
+      -EBUSY:  No CPU available to serve interrupt
+
+  4. KVM_DEV_XIVE_GRP_EQ_CONFIG (read-write)
+  Configures an event queue of a CPU
+  Attributes:
+    EQ descriptor identifier (64-bit)
+    The EQ descriptor identifier is a tuple (server, priority):
+    bits:     | 63   ....  32 | 31 .. 3 |  2 .. 0
+    values:   |    unused     |  server | priority
+    The kvm_device_attr.addr points to:
+      struct kvm_ppc_xive_eq {
+              __u32 flags;
+              __u32 qshift;
+              __u64 qaddr;
+              __u32 qtoggle;
+              __u32 qindex;
+              __u8  pad[40];
+      };
+      - flags: queue flags
+        KVM_XIVE_EQ_ALWAYS_NOTIFY (required)
+          forces notification without using the coalescing mechanism
+          provided by the XIVE END ESBs.
+      - qshift: queue size (power of 2)
+      - qaddr: real address of queue
+      - qtoggle: current queue toggle bit
+      - qindex: current queue index
+      - pad: reserved for future use
+    Errors:
+      -ENOENT: Invalid CPU number
+      -EINVAL: Invalid priority
+      -EINVAL: Invalid flags
+      -EINVAL: Invalid queue size
+      -EINVAL: Invalid queue address
+      -EFAULT: Invalid user pointer for attr->addr.
+      -EIO:    Configuration of the underlying HW failed
+
+  5. KVM_DEV_XIVE_GRP_SOURCE_SYNC (write only)
+  Synchronizes the source to flush event notifications
+  Attributes:
+    Interrupt source number  (64-bit)
+    Errors:
+      -ENOENT: Unknown source number
+      -EINVAL: Not initialized source number
+
+* VCPU state
+
+  The XIVE IC maintains VP interrupt state in an internal structure
+  called the NVT. When a VP is not dispatched on a HW processor
+  thread, this structure can be updated by HW if the VP is the target
+  of an event notification.
+
+  It is important for migration to capture the cached IPB from the NVT
+  as it synthesizes the priorities of the pending interrupts. We
+  capture a bit more to report debug information.
+
+  KVM_REG_PPC_VP_STATE (2 * 64bits)
+  bits:     |  63  ....  32  |  31  ....  0  |
+  values:   |   TIMA word0   |   TIMA word1  |
+  bits:     | 127       ..........       64  |
+  values:   |            unused              |
+
+* Migration:
+
+  Saving the state of a VM using the XIVE native exploitation mode
+  should follow a specific sequence. When the VM is stopped:
+
+  1. Mask all sources (PQ=01) to stop the flow of events.
+
+  2. Sync the XIVE device with the KVM control KVM_DEV_XIVE_EQ_SYNC to
+  flush any in-flight event notification and to stabilize the EQs. At
+  this stage, the EQ pages are marked dirty to make sure they are
+  transferred in the migration sequence.
+
+  3. Capture the state of the source targeting, the EQs configuration
+  and the state of thread interrupt context registers.
+
+  Restore is similar:
+
+  1. Restore the EQ configuration, as targeting depends on it.
+  2. Restore targeting
+  3. Restore the thread interrupt contexts
+  4. Restore the source states
+  5. Let the vCPU run
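
A minimal userspace sketch of the KVM_DEV_XIVE_GRP_SOURCE and KVM_DEV_XIVE_GRP_SOURCE_CONFIG controls described in the file above, driven through the standard KVM_SET_DEVICE_ATTR ioctl (not part of this patch; xive_fd, irq, server, priority and eisn are placeholder values, and the bit packing follows the layouts documented here, also exposed as KVM_XIVE_SOURCE_* macros in the uapi header further down):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>          /* pulls in the powerpc <asm/kvm.h> additions */

static int xive_setup_source(int xive_fd, uint64_t irq, uint64_t server,
                             uint64_t priority, uint64_t eisn)
{
        uint64_t val = 0;       /* type = MSI, no assertion level */
        struct kvm_device_attr attr = {
                .group = KVM_DEV_XIVE_GRP_SOURCE,
                .attr  = irq,   /* 64-bit interrupt source number */
                .addr  = (uint64_t)(uintptr_t)&val,
        };

        /* Create the source; it starts out masked */
        if (ioctl(xive_fd, KVM_SET_DEVICE_ATTR, &attr) < 0)
                return -1;

        /* Target it: | eisn | mask | server | priority | */
        val = (priority << KVM_XIVE_SOURCE_PRIORITY_SHIFT) |
              (server   << KVM_XIVE_SOURCE_SERVER_SHIFT) |
              (eisn     << KVM_XIVE_SOURCE_EISN_SHIFT);
        attr.group = KVM_DEV_XIVE_GRP_SOURCE_CONFIG;
        return ioctl(xive_fd, KVM_SET_DEVICE_ATTR, &attr);
}

The same pattern, with KVM_GET_DEVICE_ATTR, applies to the read-write groups such as KVM_DEV_XIVE_GRP_EQ_CONFIG.
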
diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h
index ece4dc89c90b..0fe8c1e46bbc 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -90,10 +90,18 @@ static inline void hw_breakpoint_disable(void)
90extern void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs); 90extern void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs);
91int hw_breakpoint_handler(struct die_args *args); 91int hw_breakpoint_handler(struct die_args *args);
92 92
93extern int set_dawr(struct arch_hw_breakpoint *brk);
94extern bool dawr_force_enable;
95static inline bool dawr_enabled(void)
96{
97 return dawr_force_enable;
98}
99
93#else /* CONFIG_HAVE_HW_BREAKPOINT */ 100#else /* CONFIG_HAVE_HW_BREAKPOINT */
94static inline void hw_breakpoint_disable(void) { } 101static inline void hw_breakpoint_disable(void) { }
95static inline void thread_change_pc(struct task_struct *tsk, 102static inline void thread_change_pc(struct task_struct *tsk,
96 struct pt_regs *regs) { } 103 struct pt_regs *regs) { }
104static inline bool dawr_enabled(void) { return false; }
97#endif /* CONFIG_HAVE_HW_BREAKPOINT */ 105#endif /* CONFIG_HAVE_HW_BREAKPOINT */
98#endif /* __KERNEL__ */ 106#endif /* __KERNEL__ */
99#endif /* _PPC_BOOK3S_64_HW_BREAKPOINT_H */ 107#endif /* _PPC_BOOK3S_64_HW_BREAKPOINT_H */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e6b5bb012ccb..013c76a0a03e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -201,6 +201,8 @@ struct kvmppc_spapr_tce_iommu_table {
201 struct kref kref; 201 struct kref kref;
202}; 202};
203 203
204#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
205
204struct kvmppc_spapr_tce_table { 206struct kvmppc_spapr_tce_table {
205 struct list_head list; 207 struct list_head list;
206 struct kvm *kvm; 208 struct kvm *kvm;
@@ -210,6 +212,7 @@ struct kvmppc_spapr_tce_table {
210 u64 offset; /* in pages */ 212 u64 offset; /* in pages */
211 u64 size; /* window size in pages */ 213 u64 size; /* window size in pages */
212 struct list_head iommu_tables; 214 struct list_head iommu_tables;
215 struct mutex alloc_lock;
213 struct page *pages[0]; 216 struct page *pages[0];
214}; 217};
215 218
@@ -222,6 +225,7 @@ extern struct kvm_device_ops kvm_xics_ops;
222struct kvmppc_xive; 225struct kvmppc_xive;
223struct kvmppc_xive_vcpu; 226struct kvmppc_xive_vcpu;
224extern struct kvm_device_ops kvm_xive_ops; 227extern struct kvm_device_ops kvm_xive_ops;
228extern struct kvm_device_ops kvm_xive_native_ops;
225 229
226struct kvmppc_passthru_irqmap; 230struct kvmppc_passthru_irqmap;
227 231
@@ -312,7 +316,11 @@ struct kvm_arch {
312#endif 316#endif
313#ifdef CONFIG_KVM_XICS 317#ifdef CONFIG_KVM_XICS
314 struct kvmppc_xics *xics; 318 struct kvmppc_xics *xics;
315 struct kvmppc_xive *xive; 319 struct kvmppc_xive *xive; /* Current XIVE device in use */
320 struct {
321 struct kvmppc_xive *native;
322 struct kvmppc_xive *xics_on_xive;
323 } xive_devices;
316 struct kvmppc_passthru_irqmap *pimap; 324 struct kvmppc_passthru_irqmap *pimap;
317#endif 325#endif
318 struct kvmppc_ops *kvm_ops; 326 struct kvmppc_ops *kvm_ops;
@@ -449,6 +457,7 @@ struct kvmppc_passthru_irqmap {
449#define KVMPPC_IRQ_DEFAULT 0 457#define KVMPPC_IRQ_DEFAULT 0
450#define KVMPPC_IRQ_MPIC 1 458#define KVMPPC_IRQ_MPIC 1
451#define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */ 459#define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */
460#define KVMPPC_IRQ_XIVE 3 /* XIVE native exploitation mode */
452 461
453#define MMIO_HPTE_CACHE_SIZE 4 462#define MMIO_HPTE_CACHE_SIZE 4
454 463
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index ac22b28ae78d..bc892380e6cd 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -197,10 +197,6 @@ extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
197 (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \ 197 (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \
198 (stt)->size, (ioba), (npages)) ? \ 198 (stt)->size, (ioba), (npages)) ? \
199 H_PARAMETER : H_SUCCESS) 199 H_PARAMETER : H_SUCCESS)
200extern long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
201 unsigned long *ua, unsigned long **prmap);
202extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
203 unsigned long idx, unsigned long tce);
204extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, 200extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
205 unsigned long ioba, unsigned long tce); 201 unsigned long ioba, unsigned long tce);
206extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, 202extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
@@ -273,6 +269,7 @@ union kvmppc_one_reg {
273 u64 addr; 269 u64 addr;
274 u64 length; 270 u64 length;
275 } vpaval; 271 } vpaval;
272 u64 xive_timaval[2];
276}; 273};
277 274
278struct kvmppc_ops { 275struct kvmppc_ops {
@@ -480,6 +477,9 @@ extern void kvm_hv_vm_activated(void);
480extern void kvm_hv_vm_deactivated(void); 477extern void kvm_hv_vm_deactivated(void);
481extern bool kvm_hv_mode_active(void); 478extern bool kvm_hv_mode_active(void);
482 479
480extern void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu,
481 struct kvm_nested_guest *nested);
482
483#else 483#else
484static inline void __init kvm_cma_reserve(void) 484static inline void __init kvm_cma_reserve(void)
485{} 485{}
@@ -594,6 +594,22 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
594extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, 594extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
595 int level, bool line_status); 595 int level, bool line_status);
596extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); 596extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
597
598static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
599{
600 return vcpu->arch.irq_type == KVMPPC_IRQ_XIVE;
601}
602
603extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
604 struct kvm_vcpu *vcpu, u32 cpu);
605extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu);
606extern void kvmppc_xive_native_init_module(void);
607extern void kvmppc_xive_native_exit_module(void);
608extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
609 union kvmppc_one_reg *val);
610extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
611 union kvmppc_one_reg *val);
612
597#else 613#else
598static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, 614static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
599 u32 priority) { return -1; } 615 u32 priority) { return -1; }
@@ -617,6 +633,21 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur
617static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, 633static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
618 int level, bool line_status) { return -ENODEV; } 634 int level, bool line_status) { return -ENODEV; }
619static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } 635static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
636
637static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
638 { return 0; }
639static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
640 struct kvm_vcpu *vcpu, u32 cpu) { return -EBUSY; }
641static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { }
642static inline void kvmppc_xive_native_init_module(void) { }
643static inline void kvmppc_xive_native_exit_module(void) { }
644static inline int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
645 union kvmppc_one_reg *val)
646{ return 0; }
647static inline int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
648 union kvmppc_one_reg *val)
649{ return -ENOENT; }
650
620#endif /* CONFIG_KVM_XIVE */ 651#endif /* CONFIG_KVM_XIVE */
621 652
622#if defined(CONFIG_PPC_POWERNV) && defined(CONFIG_KVM_BOOK3S_64_HANDLER) 653#if defined(CONFIG_PPC_POWERNV) && defined(CONFIG_KVM_BOOK3S_64_HANDLER)
@@ -665,6 +696,8 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
665 unsigned long pte_index); 696 unsigned long pte_index);
666long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, 697long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
667 unsigned long pte_index); 698 unsigned long pte_index);
699long kvmppc_rm_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
700 unsigned long dest, unsigned long src);
668long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, 701long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
669 unsigned long slb_v, unsigned int status, bool data); 702 unsigned long slb_v, unsigned int status, bool data);
670unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu); 703unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 870fb7b239ea..e1d118ac61dc 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -186,8 +186,8 @@
186#define OPAL_XIVE_FREE_IRQ 140 186#define OPAL_XIVE_FREE_IRQ 140
187#define OPAL_XIVE_SYNC 141 187#define OPAL_XIVE_SYNC 141
188#define OPAL_XIVE_DUMP 142 188#define OPAL_XIVE_DUMP 142
189#define OPAL_XIVE_RESERVED3 143 189#define OPAL_XIVE_GET_QUEUE_STATE 143
190#define OPAL_XIVE_RESERVED4 144 190#define OPAL_XIVE_SET_QUEUE_STATE 144
191#define OPAL_SIGNAL_SYSTEM_RESET 145 191#define OPAL_SIGNAL_SYSTEM_RESET 145
192#define OPAL_NPU_INIT_CONTEXT 146 192#define OPAL_NPU_INIT_CONTEXT 146
193#define OPAL_NPU_DESTROY_CONTEXT 147 193#define OPAL_NPU_DESTROY_CONTEXT 147
@@ -210,7 +210,8 @@
210#define OPAL_PCI_GET_PBCQ_TUNNEL_BAR 164 210#define OPAL_PCI_GET_PBCQ_TUNNEL_BAR 164
211#define OPAL_PCI_SET_PBCQ_TUNNEL_BAR 165 211#define OPAL_PCI_SET_PBCQ_TUNNEL_BAR 165
212#define OPAL_NX_COPROC_INIT 167 212#define OPAL_NX_COPROC_INIT 167
213#define OPAL_LAST 167 213#define OPAL_XIVE_GET_VP_STATE 170
214#define OPAL_LAST 170
214 215
215#define QUIESCE_HOLD 1 /* Spin all calls at entry */ 216#define QUIESCE_HOLD 1 /* Spin all calls at entry */
216#define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ 217#define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index a55b01c90bb1..4e978d4dea5c 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -279,6 +279,13 @@ int64_t opal_xive_allocate_irq(uint32_t chip_id);
279int64_t opal_xive_free_irq(uint32_t girq); 279int64_t opal_xive_free_irq(uint32_t girq);
280int64_t opal_xive_sync(uint32_t type, uint32_t id); 280int64_t opal_xive_sync(uint32_t type, uint32_t id);
281int64_t opal_xive_dump(uint32_t type, uint32_t id); 281int64_t opal_xive_dump(uint32_t type, uint32_t id);
282int64_t opal_xive_get_queue_state(uint64_t vp, uint32_t prio,
283 __be32 *out_qtoggle,
284 __be32 *out_qindex);
285int64_t opal_xive_set_queue_state(uint64_t vp, uint32_t prio,
286 uint32_t qtoggle,
287 uint32_t qindex);
288int64_t opal_xive_get_vp_state(uint64_t vp, __be64 *out_w01);
282int64_t opal_pci_set_p2p(uint64_t phb_init, uint64_t phb_target, 289int64_t opal_pci_set_p2p(uint64_t phb_init, uint64_t phb_target,
283 uint64_t desc, uint16_t pe_number); 290 uint64_t desc, uint16_t pe_number);
284 291
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index 3c704f5dd3ae..eaf76f57023a 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -23,6 +23,7 @@
23 * same offset regardless of where the code is executing 23 * same offset regardless of where the code is executing
24 */ 24 */
25extern void __iomem *xive_tima; 25extern void __iomem *xive_tima;
26extern unsigned long xive_tima_os;
26 27
27/* 28/*
28 * Offset in the TM area of our current execution level (provided by 29 * Offset in the TM area of our current execution level (provided by
@@ -73,6 +74,8 @@ struct xive_q {
73 u32 esc_irq; 74 u32 esc_irq;
74 atomic_t count; 75 atomic_t count;
75 atomic_t pending_count; 76 atomic_t pending_count;
77 u64 guest_qaddr;
78 u32 guest_qshift;
76}; 79};
77 80
78/* Global enable flags for the XIVE support */ 81/* Global enable flags for the XIVE support */
@@ -109,12 +112,26 @@ extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
109extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); 112extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
110 113
111extern void xive_native_sync_source(u32 hw_irq); 114extern void xive_native_sync_source(u32 hw_irq);
115extern void xive_native_sync_queue(u32 hw_irq);
112extern bool is_xive_irq(struct irq_chip *chip); 116extern bool is_xive_irq(struct irq_chip *chip);
113extern int xive_native_enable_vp(u32 vp_id, bool single_escalation); 117extern int xive_native_enable_vp(u32 vp_id, bool single_escalation);
114extern int xive_native_disable_vp(u32 vp_id); 118extern int xive_native_disable_vp(u32 vp_id);
115extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); 119extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id);
116extern bool xive_native_has_single_escalation(void); 120extern bool xive_native_has_single_escalation(void);
117 121
122extern int xive_native_get_queue_info(u32 vp_id, uint32_t prio,
123 u64 *out_qpage,
124 u64 *out_qsize,
125 u64 *out_qeoi_page,
126 u32 *out_escalate_irq,
127 u64 *out_qflags);
128
129extern int xive_native_get_queue_state(u32 vp_id, uint32_t prio, u32 *qtoggle,
130 u32 *qindex);
131extern int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle,
132 u32 qindex);
133extern int xive_native_get_vp_state(u32 vp_id, u64 *out_state);
134
118#else 135#else
119 136
120static inline bool xive_enabled(void) { return false; } 137static inline bool xive_enabled(void) { return false; }
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 26ca425f4c2c..b0f72dea8b11 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -482,6 +482,8 @@ struct kvm_ppc_cpu_char {
482#define KVM_REG_PPC_ICP_PPRI_SHIFT 16 /* pending irq priority */ 482#define KVM_REG_PPC_ICP_PPRI_SHIFT 16 /* pending irq priority */
483#define KVM_REG_PPC_ICP_PPRI_MASK 0xff 483#define KVM_REG_PPC_ICP_PPRI_MASK 0xff
484 484
485#define KVM_REG_PPC_VP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x8d)
486
485/* Device control API: PPC-specific devices */ 487/* Device control API: PPC-specific devices */
486#define KVM_DEV_MPIC_GRP_MISC 1 488#define KVM_DEV_MPIC_GRP_MISC 1
487#define KVM_DEV_MPIC_BASE_ADDR 0 /* 64-bit */ 489#define KVM_DEV_MPIC_BASE_ADDR 0 /* 64-bit */
@@ -677,4 +679,48 @@ struct kvm_ppc_cpu_char {
677#define KVM_XICS_PRESENTED (1ULL << 43) 679#define KVM_XICS_PRESENTED (1ULL << 43)
678#define KVM_XICS_QUEUED (1ULL << 44) 680#define KVM_XICS_QUEUED (1ULL << 44)
679 681
682/* POWER9 XIVE Native Interrupt Controller */
683#define KVM_DEV_XIVE_GRP_CTRL 1
684#define KVM_DEV_XIVE_RESET 1
685#define KVM_DEV_XIVE_EQ_SYNC 2
686#define KVM_DEV_XIVE_GRP_SOURCE 2 /* 64-bit source identifier */
687#define KVM_DEV_XIVE_GRP_SOURCE_CONFIG 3 /* 64-bit source identifier */
688#define KVM_DEV_XIVE_GRP_EQ_CONFIG 4 /* 64-bit EQ identifier */
689#define KVM_DEV_XIVE_GRP_SOURCE_SYNC 5 /* 64-bit source identifier */
690
691/* Layout of 64-bit XIVE source attribute values */
692#define KVM_XIVE_LEVEL_SENSITIVE (1ULL << 0)
693#define KVM_XIVE_LEVEL_ASSERTED (1ULL << 1)
694
695/* Layout of 64-bit XIVE source configuration attribute values */
696#define KVM_XIVE_SOURCE_PRIORITY_SHIFT 0
697#define KVM_XIVE_SOURCE_PRIORITY_MASK 0x7
698#define KVM_XIVE_SOURCE_SERVER_SHIFT 3
699#define KVM_XIVE_SOURCE_SERVER_MASK 0xfffffff8ULL
700#define KVM_XIVE_SOURCE_MASKED_SHIFT 32
701#define KVM_XIVE_SOURCE_MASKED_MASK 0x100000000ULL
702#define KVM_XIVE_SOURCE_EISN_SHIFT 33
703#define KVM_XIVE_SOURCE_EISN_MASK 0xfffffffe00000000ULL
704
705/* Layout of 64-bit EQ identifier */
706#define KVM_XIVE_EQ_PRIORITY_SHIFT 0
707#define KVM_XIVE_EQ_PRIORITY_MASK 0x7
708#define KVM_XIVE_EQ_SERVER_SHIFT 3
709#define KVM_XIVE_EQ_SERVER_MASK 0xfffffff8ULL
710
711/* Layout of EQ configuration values (64 bytes) */
712struct kvm_ppc_xive_eq {
713 __u32 flags;
714 __u32 qshift;
715 __u64 qaddr;
716 __u32 qtoggle;
717 __u32 qindex;
718 __u8 pad[40];
719};
720
721#define KVM_XIVE_EQ_ALWAYS_NOTIFY 0x00000001
722
723#define KVM_XIVE_TIMA_PAGE_OFFSET 0
724#define KVM_XIVE_ESB_PAGE_OFFSET 4
725
680#endif /* __LINUX_KVM_POWERPC_H */ 726#endif /* __LINUX_KVM_POWERPC_H */
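
A short sketch using the uapi additions above to configure an event queue through KVM_DEV_XIVE_GRP_EQ_CONFIG (not part of this patch; xive_fd, server, priority, qaddr and qshift are placeholder values chosen by the VMM):

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>  /* pulls in the powerpc <asm/kvm.h> additions above */

static int xive_set_eq(int xive_fd, uint32_t server, uint8_t priority,
                       uint64_t qaddr, uint32_t qshift)
{
        struct kvm_ppc_xive_eq eq;
        struct kvm_device_attr attr = {
                .group = KVM_DEV_XIVE_GRP_EQ_CONFIG,
                /* EQ descriptor identifier: | unused | server | priority | */
                .attr  = ((uint64_t)server << KVM_XIVE_EQ_SERVER_SHIFT) |
                         (priority & KVM_XIVE_EQ_PRIORITY_MASK),
                .addr  = (uint64_t)(uintptr_t)&eq,
        };

        memset(&eq, 0, sizeof(eq));
        eq.flags   = KVM_XIVE_EQ_ALWAYS_NOTIFY; /* required, see xive.txt */
        eq.qshift  = qshift;                    /* queue size is a power of 2 */
        eq.qaddr   = qaddr;                     /* guest real address of the queue */
        eq.qtoggle = 0;
        eq.qindex  = 0;

        return ioctl(xive_fd, KVM_SET_DEVICE_ATTR, &attr);
}

A VMM would typically call this for each (server, priority) pair it enables, and read the same attribute back with KVM_GET_DEVICE_ATTR when saving state for migration.
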
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index fec8a6773119..da307dd93ee3 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -29,11 +29,15 @@
29#include <linux/kernel.h> 29#include <linux/kernel.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/smp.h> 31#include <linux/smp.h>
32#include <linux/debugfs.h>
33#include <linux/init.h>
32 34
33#include <asm/hw_breakpoint.h> 35#include <asm/hw_breakpoint.h>
34#include <asm/processor.h> 36#include <asm/processor.h>
35#include <asm/sstep.h> 37#include <asm/sstep.h>
36#include <asm/debug.h> 38#include <asm/debug.h>
39#include <asm/debugfs.h>
40#include <asm/hvcall.h>
37#include <linux/uaccess.h> 41#include <linux/uaccess.h>
38 42
39/* 43/*
@@ -174,7 +178,7 @@ int hw_breakpoint_arch_parse(struct perf_event *bp,
174 if (!ppc_breakpoint_available()) 178 if (!ppc_breakpoint_available())
175 return -ENODEV; 179 return -ENODEV;
176 length_max = 8; /* DABR */ 180 length_max = 8; /* DABR */
177 if (cpu_has_feature(CPU_FTR_DAWR)) { 181 if (dawr_enabled()) {
178 length_max = 512 ; /* 64 doublewords */ 182 length_max = 512 ; /* 64 doublewords */
179 /* DAWR region can't cross 512 boundary */ 183 /* DAWR region can't cross 512 boundary */
180 if ((attr->bp_addr >> 9) != 184 if ((attr->bp_addr >> 9) !=
@@ -376,3 +380,59 @@ void hw_breakpoint_pmu_read(struct perf_event *bp)
376{ 380{
377 /* TODO */ 381 /* TODO */
378} 382}
383
384bool dawr_force_enable;
385EXPORT_SYMBOL_GPL(dawr_force_enable);
386
387static ssize_t dawr_write_file_bool(struct file *file,
388 const char __user *user_buf,
389 size_t count, loff_t *ppos)
390{
391 struct arch_hw_breakpoint null_brk = {0, 0, 0};
392 size_t rc;
393
394 /* Send error to user if the hypervisor won't allow us to write DAWR */
395 if ((!dawr_force_enable) &&
396 (firmware_has_feature(FW_FEATURE_LPAR)) &&
397 (set_dawr(&null_brk) != H_SUCCESS))
398 return -1;
399
400 rc = debugfs_write_file_bool(file, user_buf, count, ppos);
401 if (rc)
402 return rc;
403
404 /* If we are clearing, make sure all CPUs have the DAWR cleared */
405 if (!dawr_force_enable)
406 smp_call_function((smp_call_func_t)set_dawr, &null_brk, 0);
407
408 return rc;
409}
410
411static const struct file_operations dawr_enable_fops = {
412 .read = debugfs_read_file_bool,
413 .write = dawr_write_file_bool,
414 .open = simple_open,
415 .llseek = default_llseek,
416};
417
418static int __init dawr_force_setup(void)
419{
420 dawr_force_enable = false;
421
422 if (cpu_has_feature(CPU_FTR_DAWR)) {
423 /* Don't setup sysfs file for user control on P8 */
424 dawr_force_enable = true;
425 return 0;
426 }
427
428 if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) {
429 /* Turn DAWR off by default, but allow admin to turn it on */
430 dawr_force_enable = false;
431 debugfs_create_file_unsafe("dawr_enable_dangerous", 0600,
432 powerpc_debugfs_root,
433 &dawr_force_enable,
434 &dawr_enable_fops);
435 }
436 return 0;
437}
438arch_initcall(dawr_force_setup);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index dd9e0d5386ee..225705aac814 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -67,6 +67,7 @@
67#include <asm/cpu_has_feature.h> 67#include <asm/cpu_has_feature.h>
68#include <asm/asm-prototypes.h> 68#include <asm/asm-prototypes.h>
69#include <asm/stacktrace.h> 69#include <asm/stacktrace.h>
70#include <asm/hw_breakpoint.h>
70 71
71#include <linux/kprobes.h> 72#include <linux/kprobes.h>
72#include <linux/kdebug.h> 73#include <linux/kdebug.h>
@@ -784,7 +785,7 @@ static inline int set_dabr(struct arch_hw_breakpoint *brk)
784 return __set_dabr(dabr, dabrx); 785 return __set_dabr(dabr, dabrx);
785} 786}
786 787
787static inline int set_dawr(struct arch_hw_breakpoint *brk) 788int set_dawr(struct arch_hw_breakpoint *brk)
788{ 789{
789 unsigned long dawr, dawrx, mrd; 790 unsigned long dawr, dawrx, mrd;
790 791
@@ -816,7 +817,7 @@ void __set_breakpoint(struct arch_hw_breakpoint *brk)
816{ 817{
817 memcpy(this_cpu_ptr(&current_brk), brk, sizeof(*brk)); 818 memcpy(this_cpu_ptr(&current_brk), brk, sizeof(*brk));
818 819
819 if (cpu_has_feature(CPU_FTR_DAWR)) 820 if (dawr_enabled())
820 // Power8 or later 821 // Power8 or later
821 set_dawr(brk); 822 set_dawr(brk);
822 else if (!cpu_has_feature(CPU_FTR_ARCH_207S)) 823 else if (!cpu_has_feature(CPU_FTR_ARCH_207S))
@@ -830,8 +831,8 @@ void __set_breakpoint(struct arch_hw_breakpoint *brk)
830/* Check if we have DAWR or DABR hardware */ 831/* Check if we have DAWR or DABR hardware */
831bool ppc_breakpoint_available(void) 832bool ppc_breakpoint_available(void)
832{ 833{
833 if (cpu_has_feature(CPU_FTR_DAWR)) 834 if (dawr_enabled())
834 return true; /* POWER8 DAWR */ 835 return true; /* POWER8 DAWR or POWER9 forced DAWR */
835 if (cpu_has_feature(CPU_FTR_ARCH_207S)) 836 if (cpu_has_feature(CPU_FTR_ARCH_207S))
836 return false; /* POWER9 with DAWR disabled */ 837 return false; /* POWER9 with DAWR disabled */
837 /* DABR: Everything but POWER8 and POWER9 */ 838 /* DABR: Everything but POWER8 and POWER9 */
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index d9ac7d94656e..684b0b315c32 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -43,6 +43,7 @@
43#include <asm/tm.h> 43#include <asm/tm.h>
44#include <asm/asm-prototypes.h> 44#include <asm/asm-prototypes.h>
45#include <asm/debug.h> 45#include <asm/debug.h>
46#include <asm/hw_breakpoint.h>
46 47
47#define CREATE_TRACE_POINTS 48#define CREATE_TRACE_POINTS
48#include <trace/events/syscalls.h> 49#include <trace/events/syscalls.h>
@@ -3088,7 +3089,7 @@ long arch_ptrace(struct task_struct *child, long request,
3088 dbginfo.sizeof_condition = 0; 3089 dbginfo.sizeof_condition = 0;
3089#ifdef CONFIG_HAVE_HW_BREAKPOINT 3090#ifdef CONFIG_HAVE_HW_BREAKPOINT
3090 dbginfo.features = PPC_DEBUG_FEATURE_DATA_BP_RANGE; 3091 dbginfo.features = PPC_DEBUG_FEATURE_DATA_BP_RANGE;
3091 if (cpu_has_feature(CPU_FTR_DAWR)) 3092 if (dawr_enabled())
3092 dbginfo.features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR; 3093 dbginfo.features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR;
3093#else 3094#else
3094 dbginfo.features = 0; 3095 dbginfo.features = 0;
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 3223aec88b2c..4c67cc79de7c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -94,7 +94,7 @@ endif
94kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ 94kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
95 book3s_xics.o 95 book3s_xics.o
96 96
97kvm-book3s_64-objs-$(CONFIG_KVM_XIVE) += book3s_xive.o 97kvm-book3s_64-objs-$(CONFIG_KVM_XIVE) += book3s_xive.o book3s_xive_native.o
98kvm-book3s_64-objs-$(CONFIG_SPAPR_TCE_IOMMU) += book3s_64_vio.o 98kvm-book3s_64-objs-$(CONFIG_SPAPR_TCE_IOMMU) += book3s_64_vio.o
99 99
100kvm-book3s_64-module-objs := \ 100kvm-book3s_64-module-objs := \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 10c5579d20ce..61a212d0daf0 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -651,6 +651,18 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
651 *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); 651 *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
652 break; 652 break;
653#endif /* CONFIG_KVM_XICS */ 653#endif /* CONFIG_KVM_XICS */
654#ifdef CONFIG_KVM_XIVE
655 case KVM_REG_PPC_VP_STATE:
656 if (!vcpu->arch.xive_vcpu) {
657 r = -ENXIO;
658 break;
659 }
660 if (xive_enabled())
661 r = kvmppc_xive_native_get_vp(vcpu, val);
662 else
663 r = -ENXIO;
664 break;
665#endif /* CONFIG_KVM_XIVE */
654 case KVM_REG_PPC_FSCR: 666 case KVM_REG_PPC_FSCR:
655 *val = get_reg_val(id, vcpu->arch.fscr); 667 *val = get_reg_val(id, vcpu->arch.fscr);
656 break; 668 break;
@@ -724,6 +736,18 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
724 r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val)); 736 r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
725 break; 737 break;
726#endif /* CONFIG_KVM_XICS */ 738#endif /* CONFIG_KVM_XICS */
739#ifdef CONFIG_KVM_XIVE
740 case KVM_REG_PPC_VP_STATE:
741 if (!vcpu->arch.xive_vcpu) {
742 r = -ENXIO;
743 break;
744 }
745 if (xive_enabled())
746 r = kvmppc_xive_native_set_vp(vcpu, val);
747 else
748 r = -ENXIO;
749 break;
750#endif /* CONFIG_KVM_XIVE */
727 case KVM_REG_PPC_FSCR: 751 case KVM_REG_PPC_FSCR:
728 vcpu->arch.fscr = set_reg_val(id, *val); 752 vcpu->arch.fscr = set_reg_val(id, *val);
729 break; 753 break;
@@ -891,6 +915,17 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
891 kvmppc_rtas_tokens_free(kvm); 915 kvmppc_rtas_tokens_free(kvm);
892 WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); 916 WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
893#endif 917#endif
918
919#ifdef CONFIG_KVM_XICS
920 /*
921 * Free the XIVE devices which are not directly freed by the
922 * device 'release' method
923 */
924 kfree(kvm->arch.xive_devices.native);
925 kvm->arch.xive_devices.native = NULL;
926 kfree(kvm->arch.xive_devices.xics_on_xive);
927 kvm->arch.xive_devices.xics_on_xive = NULL;
928#endif /* CONFIG_KVM_XICS */
894} 929}
895 930
896int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu) 931int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu)
@@ -1050,6 +1085,9 @@ static int kvmppc_book3s_init(void)
1050 if (xics_on_xive()) { 1085 if (xics_on_xive()) {
1051 kvmppc_xive_init_module(); 1086 kvmppc_xive_init_module();
1052 kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); 1087 kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
1088 kvmppc_xive_native_init_module();
1089 kvm_register_device_ops(&kvm_xive_native_ops,
1090 KVM_DEV_TYPE_XIVE);
1053 } else 1091 } else
1054#endif 1092#endif
1055 kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS); 1093 kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS);
@@ -1060,8 +1098,10 @@ static int kvmppc_book3s_init(void)
1060static void kvmppc_book3s_exit(void) 1098static void kvmppc_book3s_exit(void)
1061{ 1099{
1062#ifdef CONFIG_KVM_XICS 1100#ifdef CONFIG_KVM_XICS
1063 if (xics_on_xive()) 1101 if (xics_on_xive()) {
1064 kvmppc_xive_exit_module(); 1102 kvmppc_xive_exit_module();
1103 kvmppc_xive_native_exit_module();
1104 }
1065#endif 1105#endif
1066#ifdef CONFIG_KVM_BOOK3S_32_HANDLER 1106#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
1067 kvmppc_book3s_exit_pr(); 1107 kvmppc_book3s_exit_pr();
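
The KVM_REG_PPC_VP_STATE handlers added above are reached through the generic ONE_REG interface. An illustrative userspace sketch of saving and restoring the 128-bit VP state of one vcpu (not part of this patch; vcpu_fd is a placeholder, and KVM_REG_PPC_VP_STATE comes from the asm/kvm.h change earlier in this series):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Save the two 64-bit halves of the XIVE VP state */
static int vp_state_save(int vcpu_fd, uint64_t state[2])
{
        struct kvm_one_reg reg = {
                .id   = KVM_REG_PPC_VP_STATE,
                .addr = (uint64_t)(uintptr_t)state,     /* 16-byte buffer */
        };

        return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}

/* Restore the VP state on the destination before the vcpu runs */
static int vp_state_restore(int vcpu_fd, const uint64_t state[2])
{
        struct kvm_one_reg reg = {
                .id   = KVM_REG_PPC_VP_STATE,
                .addr = (uint64_t)(uintptr_t)state,
        };

        return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}

During the migration sequence described in xive.txt, the saved pair is transferred and written back on the destination before the vCPUs are resumed.
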
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index f02b04973710..66270e07449a 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -228,11 +228,33 @@ static void release_spapr_tce_table(struct rcu_head *head)
228 unsigned long i, npages = kvmppc_tce_pages(stt->size); 228 unsigned long i, npages = kvmppc_tce_pages(stt->size);
229 229
230 for (i = 0; i < npages; i++) 230 for (i = 0; i < npages; i++)
231 __free_page(stt->pages[i]); 231 if (stt->pages[i])
232 __free_page(stt->pages[i]);
232 233
233 kfree(stt); 234 kfree(stt);
234} 235}
235 236
237static struct page *kvm_spapr_get_tce_page(struct kvmppc_spapr_tce_table *stt,
238 unsigned long sttpage)
239{
240 struct page *page = stt->pages[sttpage];
241
242 if (page)
243 return page;
244
245 mutex_lock(&stt->alloc_lock);
246 page = stt->pages[sttpage];
247 if (!page) {
248 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
249 WARN_ON_ONCE(!page);
250 if (page)
251 stt->pages[sttpage] = page;
252 }
253 mutex_unlock(&stt->alloc_lock);
254
255 return page;
256}
257
236static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf) 258static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
237{ 259{
238 struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data; 260 struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data;
@@ -241,7 +263,10 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
241 if (vmf->pgoff >= kvmppc_tce_pages(stt->size)) 263 if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
242 return VM_FAULT_SIGBUS; 264 return VM_FAULT_SIGBUS;
243 265
244 page = stt->pages[vmf->pgoff]; 266 page = kvm_spapr_get_tce_page(stt, vmf->pgoff);
267 if (!page)
268 return VM_FAULT_OOM;
269
245 get_page(page); 270 get_page(page);
246 vmf->page = page; 271 vmf->page = page;
247 return 0; 272 return 0;
@@ -296,7 +321,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
296 struct kvmppc_spapr_tce_table *siter; 321 struct kvmppc_spapr_tce_table *siter;
297 unsigned long npages, size = args->size; 322 unsigned long npages, size = args->size;
298 int ret = -ENOMEM; 323 int ret = -ENOMEM;
299 int i;
300 324
301 if (!args->size || args->page_shift < 12 || args->page_shift > 34 || 325 if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
302 (args->offset + args->size > (ULLONG_MAX >> args->page_shift))) 326 (args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
@@ -318,14 +342,9 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
318 stt->offset = args->offset; 342 stt->offset = args->offset;
319 stt->size = size; 343 stt->size = size;
320 stt->kvm = kvm; 344 stt->kvm = kvm;
345 mutex_init(&stt->alloc_lock);
321 INIT_LIST_HEAD_RCU(&stt->iommu_tables); 346 INIT_LIST_HEAD_RCU(&stt->iommu_tables);
322 347
323 for (i = 0; i < npages; i++) {
324 stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
325 if (!stt->pages[i])
326 goto fail;
327 }
328
329 mutex_lock(&kvm->lock); 348 mutex_lock(&kvm->lock);
330 349
331 /* Check this LIOBN hasn't been previously allocated */ 350 /* Check this LIOBN hasn't been previously allocated */
@@ -352,17 +371,28 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
352 if (ret >= 0) 371 if (ret >= 0)
353 return ret; 372 return ret;
354 373
355 fail:
356 for (i = 0; i < npages; i++)
357 if (stt->pages[i])
358 __free_page(stt->pages[i]);
359
360 kfree(stt); 374 kfree(stt);
361 fail_acct: 375 fail_acct:
362 kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); 376 kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
363 return ret; 377 return ret;
364} 378}
365 379
380static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
381 unsigned long *ua)
382{
383 unsigned long gfn = tce >> PAGE_SHIFT;
384 struct kvm_memory_slot *memslot;
385
386 memslot = search_memslots(kvm_memslots(kvm), gfn);
387 if (!memslot)
388 return -EINVAL;
389
390 *ua = __gfn_to_hva_memslot(memslot, gfn) |
391 (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
392
393 return 0;
394}
395
366static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, 396static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
367 unsigned long tce) 397 unsigned long tce)
368{ 398{
@@ -378,7 +408,7 @@ static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
378 if (iommu_tce_check_gpa(stt->page_shift, gpa)) 408 if (iommu_tce_check_gpa(stt->page_shift, gpa))
379 return H_TOO_HARD; 409 return H_TOO_HARD;
380 410
381 if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL)) 411 if (kvmppc_tce_to_ua(stt->kvm, tce, &ua))
382 return H_TOO_HARD; 412 return H_TOO_HARD;
383 413
384 list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { 414 list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
@@ -397,6 +427,36 @@ static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
397 return H_SUCCESS; 427 return H_SUCCESS;
398} 428}
399 429
430/*
431 * Handles TCE requests for emulated devices.
432 * Puts guest TCE values to the table and expects user space to convert them.
433 * Cannot fail so kvmppc_tce_validate must be called before it.
434 */
435static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
436 unsigned long idx, unsigned long tce)
437{
438 struct page *page;
439 u64 *tbl;
440 unsigned long sttpage;
441
442 idx -= stt->offset;
443 sttpage = idx / TCES_PER_PAGE;
444 page = stt->pages[sttpage];
445
446 if (!page) {
447 /* We allow any TCE, not just with read|write permissions */
448 if (!tce)
449 return;
450
451 page = kvm_spapr_get_tce_page(stt, sttpage);
452 if (!page)
453 return;
454 }
455 tbl = page_to_virt(page);
456
457 tbl[idx % TCES_PER_PAGE] = tce;
458}
459
400static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl, 460static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl,
401 unsigned long entry) 461 unsigned long entry)
402{ 462{
@@ -543,15 +603,15 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
543 if (ret != H_SUCCESS) 603 if (ret != H_SUCCESS)
544 return ret; 604 return ret;
545 605
606 idx = srcu_read_lock(&vcpu->kvm->srcu);
607
546 ret = kvmppc_tce_validate(stt, tce); 608 ret = kvmppc_tce_validate(stt, tce);
547 if (ret != H_SUCCESS) 609 if (ret != H_SUCCESS)
548 return ret; 610 goto unlock_exit;
549 611
550 dir = iommu_tce_direction(tce); 612 dir = iommu_tce_direction(tce);
551 613
552 idx = srcu_read_lock(&vcpu->kvm->srcu); 614 if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
553
554 if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
555 ret = H_PARAMETER; 615 ret = H_PARAMETER;
556 goto unlock_exit; 616 goto unlock_exit;
557 } 617 }
@@ -612,7 +672,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
612 return ret; 672 return ret;
613 673
614 idx = srcu_read_lock(&vcpu->kvm->srcu); 674 idx = srcu_read_lock(&vcpu->kvm->srcu);
615 if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { 675 if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua)) {
616 ret = H_TOO_HARD; 676 ret = H_TOO_HARD;
617 goto unlock_exit; 677 goto unlock_exit;
618 } 678 }
@@ -647,7 +707,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
647 } 707 }
648 tce = be64_to_cpu(tce); 708 tce = be64_to_cpu(tce);
649 709
650 if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) 710 if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua))
651 return H_PARAMETER; 711 return H_PARAMETER;
652 712
653 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { 713 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 2206bc729b9a..484b47fa3960 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -66,8 +66,6 @@
66 66
67#endif 67#endif
68 68
69#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
70
71/* 69/*
72 * Finds a TCE table descriptor by LIOBN. 70 * Finds a TCE table descriptor by LIOBN.
73 * 71 *
@@ -88,6 +86,25 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm,
88EXPORT_SYMBOL_GPL(kvmppc_find_table); 86EXPORT_SYMBOL_GPL(kvmppc_find_table);
89 87
90#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 88#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
89static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long tce,
90 unsigned long *ua, unsigned long **prmap)
91{
92 unsigned long gfn = tce >> PAGE_SHIFT;
93 struct kvm_memory_slot *memslot;
94
95 memslot = search_memslots(kvm_memslots_raw(kvm), gfn);
96 if (!memslot)
97 return -EINVAL;
98
99 *ua = __gfn_to_hva_memslot(memslot, gfn) |
100 (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
101
102 if (prmap)
103 *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
104
105 return 0;
106}
107
91/* 108/*
92 * Validates TCE address. 109 * Validates TCE address.
93 * At the moment flags and page mask are validated. 110 * At the moment flags and page mask are validated.
@@ -111,7 +128,7 @@ static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
111 if (iommu_tce_check_gpa(stt->page_shift, gpa)) 128 if (iommu_tce_check_gpa(stt->page_shift, gpa))
112 return H_PARAMETER; 129 return H_PARAMETER;
113 130
114 if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL)) 131 if (kvmppc_rm_tce_to_ua(stt->kvm, tce, &ua, NULL))
115 return H_TOO_HARD; 132 return H_TOO_HARD;
116 133
117 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { 134 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -129,7 +146,6 @@ static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
129 146
130 return H_SUCCESS; 147 return H_SUCCESS;
131} 148}
132#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
133 149
134/* Note on the use of page_address() in real mode, 150/* Note on the use of page_address() in real mode,
135 * 151 *
@@ -161,13 +177,9 @@ static u64 *kvmppc_page_address(struct page *page)
161/* 177/*
162 * Handles TCE requests for emulated devices. 178 * Handles TCE requests for emulated devices.
163 * Puts guest TCE values to the table and expects user space to convert them. 179 * Puts guest TCE values to the table and expects user space to convert them.
164 * Called in both real and virtual modes. 180 * Cannot fail so kvmppc_rm_tce_validate must be called before it.
165 * Cannot fail so kvmppc_tce_validate must be called before it.
166 *
167 * WARNING: This will be called in real-mode on HV KVM and virtual
168 * mode on PR KVM
169 */ 181 */
170void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, 182static void kvmppc_rm_tce_put(struct kvmppc_spapr_tce_table *stt,
171 unsigned long idx, unsigned long tce) 183 unsigned long idx, unsigned long tce)
172{ 184{
173 struct page *page; 185 struct page *page;
@@ -175,35 +187,48 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
175 187
176 idx -= stt->offset; 188 idx -= stt->offset;
177 page = stt->pages[idx / TCES_PER_PAGE]; 189 page = stt->pages[idx / TCES_PER_PAGE];
190 /*
191 * page must not be NULL in real mode,
192 * kvmppc_rm_ioba_validate() must have taken care of this.
193 */
194 WARN_ON_ONCE_RM(!page);
178 tbl = kvmppc_page_address(page); 195 tbl = kvmppc_page_address(page);
179 196
180 tbl[idx % TCES_PER_PAGE] = tce; 197 tbl[idx % TCES_PER_PAGE] = tce;
181} 198}
182EXPORT_SYMBOL_GPL(kvmppc_tce_put);
183 199
184long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, 200/*
185 unsigned long *ua, unsigned long **prmap) 201 * TCEs pages are allocated in kvmppc_rm_tce_put() which won't be able to do so
202 * in real mode.
203 * Check if kvmppc_rm_tce_put() can succeed in real mode, i.e. a TCEs page is
204 * allocated or not required (when clearing a tce entry).
205 */
206static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
207 unsigned long ioba, unsigned long npages, bool clearing)
186{ 208{
187 unsigned long gfn = tce >> PAGE_SHIFT; 209 unsigned long i, idx, sttpage, sttpages;
188 struct kvm_memory_slot *memslot; 210 unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);
189 211
190 memslot = search_memslots(kvm_memslots(kvm), gfn); 212 if (ret)
191 if (!memslot) 213 return ret;
192 return -EINVAL; 214 /*
193 215 * clearing==true says kvmppc_rm_tce_put won't be allocating pages
194 *ua = __gfn_to_hva_memslot(memslot, gfn) | 216 * for empty tces.
195 (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); 217 */
218 if (clearing)
219 return H_SUCCESS;
196 220
197#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 221 idx = (ioba >> stt->page_shift) - stt->offset;
198 if (prmap) 222 sttpage = idx / TCES_PER_PAGE;
199 *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; 223 sttpages = _ALIGN_UP(idx % TCES_PER_PAGE + npages, TCES_PER_PAGE) /
200#endif 224 TCES_PER_PAGE;
225 for (i = sttpage; i < sttpage + sttpages; ++i)
226 if (!stt->pages[i])
227 return H_TOO_HARD;
201 228
202 return 0; 229 return H_SUCCESS;
203} 230}
204EXPORT_SYMBOL_GPL(kvmppc_tce_to_ua);
205 231
206#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
207static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, 232static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
208 unsigned long entry, unsigned long *hpa, 233 unsigned long entry, unsigned long *hpa,
209 enum dma_data_direction *direction) 234 enum dma_data_direction *direction)
@@ -381,7 +406,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
381 if (!stt) 406 if (!stt)
382 return H_TOO_HARD; 407 return H_TOO_HARD;
383 408
384 ret = kvmppc_ioba_validate(stt, ioba, 1); 409 ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0);
385 if (ret != H_SUCCESS) 410 if (ret != H_SUCCESS)
386 return ret; 411 return ret;
387 412
@@ -390,7 +415,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
390 return ret; 415 return ret;
391 416
392 dir = iommu_tce_direction(tce); 417 dir = iommu_tce_direction(tce);
393 if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) 418 if ((dir != DMA_NONE) && kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
394 return H_PARAMETER; 419 return H_PARAMETER;
395 420
396 entry = ioba >> stt->page_shift; 421 entry = ioba >> stt->page_shift;
@@ -409,7 +434,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
409 } 434 }
410 } 435 }
411 436
412 kvmppc_tce_put(stt, entry, tce); 437 kvmppc_rm_tce_put(stt, entry, tce);
413 438
414 return H_SUCCESS; 439 return H_SUCCESS;
415} 440}
@@ -480,7 +505,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
480 if (tce_list & (SZ_4K - 1)) 505 if (tce_list & (SZ_4K - 1))
481 return H_PARAMETER; 506 return H_PARAMETER;
482 507
483 ret = kvmppc_ioba_validate(stt, ioba, npages); 508 ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false);
484 if (ret != H_SUCCESS) 509 if (ret != H_SUCCESS)
485 return ret; 510 return ret;
486 511
@@ -492,7 +517,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
492 */ 517 */
493 struct mm_iommu_table_group_mem_t *mem; 518 struct mm_iommu_table_group_mem_t *mem;
494 519
495 if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) 520 if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL))
496 return H_TOO_HARD; 521 return H_TOO_HARD;
497 522
498 mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); 523 mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
@@ -508,7 +533,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
508 * We do not require memory to be preregistered in this case 533 * We do not require memory to be preregistered in this case
509 * so lock rmap and do __find_linux_pte_or_hugepte(). 534 * so lock rmap and do __find_linux_pte_or_hugepte().
510 */ 535 */
511 if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) 536 if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
512 return H_TOO_HARD; 537 return H_TOO_HARD;
513 538
514 rmap = (void *) vmalloc_to_phys(rmap); 539 rmap = (void *) vmalloc_to_phys(rmap);
@@ -542,7 +567,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
542 unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); 567 unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
543 568
544 ua = 0; 569 ua = 0;
545 if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) 570 if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
546 return H_PARAMETER; 571 return H_PARAMETER;
547 572
548 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { 573 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -557,7 +582,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
557 } 582 }
558 } 583 }
559 584
560 kvmppc_tce_put(stt, entry + i, tce); 585 kvmppc_rm_tce_put(stt, entry + i, tce);
561 } 586 }
562 587
563unlock_exit: 588unlock_exit:
@@ -583,7 +608,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
583 if (!stt) 608 if (!stt)
584 return H_TOO_HARD; 609 return H_TOO_HARD;
585 610
586 ret = kvmppc_ioba_validate(stt, ioba, npages); 611 ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0);
587 if (ret != H_SUCCESS) 612 if (ret != H_SUCCESS)
588 return ret; 613 return ret;
589 614
@@ -610,7 +635,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
610 } 635 }
611 636
612 for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) 637 for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
613 kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); 638 kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value);
614 639
615 return H_SUCCESS; 640 return H_SUCCESS;
616} 641}
@@ -635,6 +660,10 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
635 660
636 idx = (ioba >> stt->page_shift) - stt->offset; 661 idx = (ioba >> stt->page_shift) - stt->offset;
637 page = stt->pages[idx / TCES_PER_PAGE]; 662 page = stt->pages[idx / TCES_PER_PAGE];
663 if (!page) {
664 vcpu->arch.regs.gpr[4] = 0;
665 return H_SUCCESS;
666 }
638 tbl = (u64 *)page_address(page); 667 tbl = (u64 *)page_address(page);
639 668
640 vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE]; 669 vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE];
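
The hunk above adds a NULL check because TCE table pages are now allocated on demand; an unbacked page simply reads back as an empty TCE. A minimal sketch of that lookup, using simplified stand-in types rather than the kernel's struct kvmppc_spapr_tce_table:

/* Sketch of the sparse TCE read guarded above; the struct and the
 * TCES_PER_PAGE value are simplified stand-ins, not the kernel's. */
#include <stddef.h>
#include <stdint.h>

#define TCES_PER_PAGE 512                /* assumes 4K pages, 8-byte TCEs */

struct tce_table {
        unsigned int page_shift;         /* IOMMU page size for this table */
        unsigned long offset;            /* first IOBA page covered */
        uint64_t **pages;                /* backing pages, allocated lazily */
};

/* Read the TCE for an I/O bus address; unallocated pages read back as 0. */
static uint64_t tce_read(const struct tce_table *stt, uint64_t ioba)
{
        size_t idx = (ioba >> stt->page_shift) - stt->offset;
        uint64_t *page = stt->pages[idx / TCES_PER_PAGE];

        if (!page)                       /* page never written: entry is 0 */
                return 0;
        return page[idx % TCES_PER_PAGE];
}
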
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 06964350b97a..d5fc624e0655 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -74,6 +74,7 @@
74#include <asm/opal.h> 74#include <asm/opal.h>
75#include <asm/xics.h> 75#include <asm/xics.h>
76#include <asm/xive.h> 76#include <asm/xive.h>
77#include <asm/hw_breakpoint.h>
77 78
78#include "book3s.h" 79#include "book3s.h"
79 80
@@ -749,7 +750,7 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
749 /* 750 /*
750 * Ensure that the read of vcore->dpdes comes after the read 751 * Ensure that the read of vcore->dpdes comes after the read
751 * of vcpu->doorbell_request. This barrier matches the 752 * of vcpu->doorbell_request. This barrier matches the
752 * smb_wmb() in kvmppc_guest_entry_inject(). 753 * smp_wmb() in kvmppc_guest_entry_inject().
753 */ 754 */
754 smp_rmb(); 755 smp_rmb();
755 vc = vcpu->arch.vcore; 756 vc = vcpu->arch.vcore;
@@ -801,6 +802,80 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
801 } 802 }
802} 803}
803 804
805/* Copy guest memory in place - must reside within a single memslot */
806static int kvmppc_copy_guest(struct kvm *kvm, gpa_t to, gpa_t from,
807 unsigned long len)
808{
809 struct kvm_memory_slot *to_memslot = NULL;
810 struct kvm_memory_slot *from_memslot = NULL;
811 unsigned long to_addr, from_addr;
812 int r;
813
814 /* Get HPA for from address */
815 from_memslot = gfn_to_memslot(kvm, from >> PAGE_SHIFT);
816 if (!from_memslot)
817 return -EFAULT;
818 if ((from + len) >= ((from_memslot->base_gfn + from_memslot->npages)
819 << PAGE_SHIFT))
820 return -EINVAL;
821 from_addr = gfn_to_hva_memslot(from_memslot, from >> PAGE_SHIFT);
822 if (kvm_is_error_hva(from_addr))
823 return -EFAULT;
824 from_addr |= (from & (PAGE_SIZE - 1));
825
826 /* Get HPA for to address */
827 to_memslot = gfn_to_memslot(kvm, to >> PAGE_SHIFT);
828 if (!to_memslot)
829 return -EFAULT;
830 if ((to + len) >= ((to_memslot->base_gfn + to_memslot->npages)
831 << PAGE_SHIFT))
832 return -EINVAL;
833 to_addr = gfn_to_hva_memslot(to_memslot, to >> PAGE_SHIFT);
834 if (kvm_is_error_hva(to_addr))
835 return -EFAULT;
836 to_addr |= (to & (PAGE_SIZE - 1));
837
838 /* Perform copy */
839 r = raw_copy_in_user((void __user *)to_addr, (void __user *)from_addr,
840 len);
841 if (r)
842 return -EFAULT;
843 mark_page_dirty(kvm, to >> PAGE_SHIFT);
844 return 0;
845}
846
847static long kvmppc_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
848 unsigned long dest, unsigned long src)
849{
850 u64 pg_sz = SZ_4K; /* 4K page size */
851 u64 pg_mask = SZ_4K - 1;
852 int ret;
853
854 /* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
855 if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
856 H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
857 return H_PARAMETER;
858
859 /* dest (and src if copy_page flag set) must be page aligned */
860 if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
861 return H_PARAMETER;
862
863 /* zero and/or copy the page as determined by the flags */
864 if (flags & H_COPY_PAGE) {
865 ret = kvmppc_copy_guest(vcpu->kvm, dest, src, pg_sz);
866 if (ret < 0)
867 return H_PARAMETER;
868 } else if (flags & H_ZERO_PAGE) {
869 ret = kvm_clear_guest(vcpu->kvm, dest, pg_sz);
870 if (ret < 0)
871 return H_PARAMETER;
872 }
873
874 /* We can ignore the remaining flags */
875
876 return H_SUCCESS;
877}
878
804static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target) 879static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
805{ 880{
806 struct kvmppc_vcore *vcore = target->arch.vcore; 881 struct kvmppc_vcore *vcore = target->arch.vcore;
@@ -1003,6 +1078,11 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
1003 if (nesting_enabled(vcpu->kvm)) 1078 if (nesting_enabled(vcpu->kvm))
1004 ret = kvmhv_copy_tofrom_guest_nested(vcpu); 1079 ret = kvmhv_copy_tofrom_guest_nested(vcpu);
1005 break; 1080 break;
1081 case H_PAGE_INIT:
1082 ret = kvmppc_h_page_init(vcpu, kvmppc_get_gpr(vcpu, 4),
1083 kvmppc_get_gpr(vcpu, 5),
1084 kvmppc_get_gpr(vcpu, 6));
1085 break;
1006 default: 1086 default:
1007 return RESUME_HOST; 1087 return RESUME_HOST;
1008 } 1088 }
@@ -1047,6 +1127,7 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
1047 case H_IPOLL: 1127 case H_IPOLL:
1048 case H_XIRR_X: 1128 case H_XIRR_X:
1049#endif 1129#endif
1130 case H_PAGE_INIT:
1050 return 1; 1131 return 1;
1051 } 1132 }
1052 1133
@@ -2504,37 +2585,6 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
2504 } 2585 }
2505} 2586}
2506 2587
2507static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu,
2508 struct kvm_nested_guest *nested)
2509{
2510 cpumask_t *need_tlb_flush;
2511 int lpid;
2512
2513 if (!cpu_has_feature(CPU_FTR_HVMODE))
2514 return;
2515
2516 if (cpu_has_feature(CPU_FTR_ARCH_300))
2517 pcpu &= ~0x3UL;
2518
2519 if (nested) {
2520 lpid = nested->shadow_lpid;
2521 need_tlb_flush = &nested->need_tlb_flush;
2522 } else {
2523 lpid = kvm->arch.lpid;
2524 need_tlb_flush = &kvm->arch.need_tlb_flush;
2525 }
2526
2527 mtspr(SPRN_LPID, lpid);
2528 isync();
2529 smp_mb();
2530
2531 if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
2532 radix__local_flush_tlb_lpid_guest(lpid);
2533 /* Clear the bit after the TLB flush */
2534 cpumask_clear_cpu(pcpu, need_tlb_flush);
2535 }
2536}
2537
2538static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) 2588static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
2539{ 2589{
2540 int cpu; 2590 int cpu;
@@ -3228,19 +3278,11 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
3228 for (sub = 0; sub < core_info.n_subcores; ++sub) 3278 for (sub = 0; sub < core_info.n_subcores; ++sub)
3229 spin_unlock(&core_info.vc[sub]->lock); 3279 spin_unlock(&core_info.vc[sub]->lock);
3230 3280
3231 if (kvm_is_radix(vc->kvm)) { 3281 guest_enter_irqoff();
3232 /* 3282
3233 * Do we need to flush the process scoped TLB for the LPAR? 3283 srcu_idx = srcu_read_lock(&vc->kvm->srcu);
3234 * 3284
3235 * On POWER9, individual threads can come in here, but the 3285 this_cpu_disable_ftrace();
3236 * TLB is shared between the 4 threads in a core, hence
3237 * invalidating on one thread invalidates for all.
3238 * Thus we make all 4 threads use the same bit here.
3239 *
3240 * Hash must be flushed in realmode in order to use tlbiel.
3241 */
3242 kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL);
3243 }
3244 3286
3245 /* 3287 /*
3246 * Interrupts will be enabled once we get into the guest, 3288 * Interrupts will be enabled once we get into the guest,
@@ -3248,19 +3290,14 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
3248 */ 3290 */
3249 trace_hardirqs_on(); 3291 trace_hardirqs_on();
3250 3292
3251 guest_enter_irqoff();
3252
3253 srcu_idx = srcu_read_lock(&vc->kvm->srcu);
3254
3255 this_cpu_disable_ftrace();
3256
3257 trap = __kvmppc_vcore_entry(); 3293 trap = __kvmppc_vcore_entry();
3258 3294
3295 trace_hardirqs_off();
3296
3259 this_cpu_enable_ftrace(); 3297 this_cpu_enable_ftrace();
3260 3298
3261 srcu_read_unlock(&vc->kvm->srcu, srcu_idx); 3299 srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
3262 3300
3263 trace_hardirqs_off();
3264 set_irq_happened(trap); 3301 set_irq_happened(trap);
3265 3302
3266 spin_lock(&vc->lock); 3303 spin_lock(&vc->lock);
@@ -3374,7 +3411,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
3374 mtspr(SPRN_PURR, vcpu->arch.purr); 3411 mtspr(SPRN_PURR, vcpu->arch.purr);
3375 mtspr(SPRN_SPURR, vcpu->arch.spurr); 3412 mtspr(SPRN_SPURR, vcpu->arch.spurr);
3376 3413
3377 if (cpu_has_feature(CPU_FTR_DAWR)) { 3414 if (dawr_enabled()) {
3378 mtspr(SPRN_DAWR, vcpu->arch.dawr); 3415 mtspr(SPRN_DAWR, vcpu->arch.dawr);
3379 mtspr(SPRN_DAWRX, vcpu->arch.dawrx); 3416 mtspr(SPRN_DAWRX, vcpu->arch.dawrx);
3380 } 3417 }
@@ -3423,7 +3460,9 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
3423 vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2); 3460 vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
3424 vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3); 3461 vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
3425 3462
3426 mtspr(SPRN_PSSCR, host_psscr); 3463 /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
3464 mtspr(SPRN_PSSCR, host_psscr |
3465 (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
3427 mtspr(SPRN_HFSCR, host_hfscr); 3466 mtspr(SPRN_HFSCR, host_hfscr);
3428 mtspr(SPRN_CIABR, host_ciabr); 3467 mtspr(SPRN_CIABR, host_ciabr);
3429 mtspr(SPRN_DAWR, host_dawr); 3468 mtspr(SPRN_DAWR, host_dawr);
@@ -3511,6 +3550,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
3511#ifdef CONFIG_ALTIVEC 3550#ifdef CONFIG_ALTIVEC
3512 load_vr_state(&vcpu->arch.vr); 3551 load_vr_state(&vcpu->arch.vr);
3513#endif 3552#endif
3553 mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
3514 3554
3515 mtspr(SPRN_DSCR, vcpu->arch.dscr); 3555 mtspr(SPRN_DSCR, vcpu->arch.dscr);
3516 mtspr(SPRN_IAMR, vcpu->arch.iamr); 3556 mtspr(SPRN_IAMR, vcpu->arch.iamr);
@@ -3602,6 +3642,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
3602#ifdef CONFIG_ALTIVEC 3642#ifdef CONFIG_ALTIVEC
3603 store_vr_state(&vcpu->arch.vr); 3643 store_vr_state(&vcpu->arch.vr);
3604#endif 3644#endif
3645 vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
3605 3646
3606 if (cpu_has_feature(CPU_FTR_TM) || 3647 if (cpu_has_feature(CPU_FTR_TM) ||
3607 cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) 3648 cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
@@ -3967,7 +4008,7 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
3967 unsigned long lpcr) 4008 unsigned long lpcr)
3968{ 4009{
3969 int trap, r, pcpu; 4010 int trap, r, pcpu;
3970 int srcu_idx; 4011 int srcu_idx, lpid;
3971 struct kvmppc_vcore *vc; 4012 struct kvmppc_vcore *vc;
3972 struct kvm *kvm = vcpu->kvm; 4013 struct kvm *kvm = vcpu->kvm;
3973 struct kvm_nested_guest *nested = vcpu->arch.nested; 4014 struct kvm_nested_guest *nested = vcpu->arch.nested;
@@ -4043,8 +4084,12 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
4043 vc->vcore_state = VCORE_RUNNING; 4084 vc->vcore_state = VCORE_RUNNING;
4044 trace_kvmppc_run_core(vc, 0); 4085 trace_kvmppc_run_core(vc, 0);
4045 4086
4046 if (cpu_has_feature(CPU_FTR_HVMODE)) 4087 if (cpu_has_feature(CPU_FTR_HVMODE)) {
4047 kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested); 4088 lpid = nested ? nested->shadow_lpid : kvm->arch.lpid;
4089 mtspr(SPRN_LPID, lpid);
4090 isync();
4091 kvmppc_check_need_tlb_flush(kvm, pcpu, nested);
4092 }
4048 4093
4049 trace_hardirqs_on(); 4094 trace_hardirqs_on();
4050 guest_enter_irqoff(); 4095 guest_enter_irqoff();
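
The H_PAGE_INIT handler added above accepts H_ZERO_PAGE and/or H_COPY_PAGE on 4K-aligned guest real addresses and ignores the remaining CMO flags. A hedged guest-side sketch of how the hypercall would be issued; the helpers are hypothetical, while plpar_hcall_norets() is the usual pseries hcall wrapper:

#include <linux/sizes.h>
#include <asm/hvcall.h>

/* Hypothetical guest helpers: zero or copy one 4K page via H_PAGE_INIT. */
static long guest_zero_page_4k(unsigned long dest_ra)
{
        if (dest_ra & (SZ_4K - 1))
                return H_PARAMETER;     /* the handler rejects unaligned dest */
        return plpar_hcall_norets(H_PAGE_INIT, H_ZERO_PAGE, dest_ra, 0);
}

static long guest_copy_page_4k(unsigned long dest_ra, unsigned long src_ra)
{
        if ((dest_ra | src_ra) & (SZ_4K - 1))
                return H_PARAMETER;     /* both addresses must be 4K aligned */
        return plpar_hcall_norets(H_PAGE_INIT, H_COPY_PAGE, dest_ra, src_ra);
}
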
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index b0cf22477e87..6035d24f1d1d 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -805,3 +805,60 @@ void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
805 vcpu->arch.doorbell_request = 0; 805 vcpu->arch.doorbell_request = 0;
806 } 806 }
807} 807}
808
809static void flush_guest_tlb(struct kvm *kvm)
810{
811 unsigned long rb, set;
812
813 rb = PPC_BIT(52); /* IS = 2 */
814 if (kvm_is_radix(kvm)) {
815 /* R=1 PRS=1 RIC=2 */
816 asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
817 : : "r" (rb), "i" (1), "i" (1), "i" (2),
818 "r" (0) : "memory");
819 for (set = 1; set < kvm->arch.tlb_sets; ++set) {
820 rb += PPC_BIT(51); /* increment set number */
821 /* R=1 PRS=1 RIC=0 */
822 asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
823 : : "r" (rb), "i" (1), "i" (1), "i" (0),
824 "r" (0) : "memory");
825 }
826 } else {
827 for (set = 0; set < kvm->arch.tlb_sets; ++set) {
828 /* R=0 PRS=0 RIC=0 */
829 asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
830 : : "r" (rb), "i" (0), "i" (0), "i" (0),
831 "r" (0) : "memory");
832 rb += PPC_BIT(51); /* increment set number */
833 }
834 }
835 asm volatile("ptesync": : :"memory");
836}
837
838void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu,
839 struct kvm_nested_guest *nested)
840{
841 cpumask_t *need_tlb_flush;
842
843 /*
844 * On POWER9, individual threads can come in here, but the
845 * TLB is shared between the 4 threads in a core, hence
846 * invalidating on one thread invalidates for all.
847 * Thus we make all 4 threads use the same bit.
848 */
849 if (cpu_has_feature(CPU_FTR_ARCH_300))
850 pcpu = cpu_first_thread_sibling(pcpu);
851
852 if (nested)
853 need_tlb_flush = &nested->need_tlb_flush;
854 else
855 need_tlb_flush = &kvm->arch.need_tlb_flush;
856
857 if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
858 flush_guest_tlb(kvm);
859
860 /* Clear the bit after the TLB flush */
861 cpumask_clear_cpu(pcpu, need_tlb_flush);
862 }
863}
864EXPORT_SYMBOL_GPL(kvmppc_check_need_tlb_flush);
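
kvmppc_check_need_tlb_flush() above keys the flush bookkeeping to one cpumask bit per core, because on POWER9 the four threads of a core share a TLB. An illustrative sketch of that test-flush-clear sequence, assuming four threads per core and hypothetical names:

#include <linux/bitops.h>

#define THREADS_PER_CORE 4              /* assumption: POWER9 SMT4 core */

/* All threads of a core map to the bit of the core's first thread. */
static inline int first_thread_sibling(int cpu)
{
        return cpu & ~(THREADS_PER_CORE - 1);
}

/* On guest entry: flush once per core, then clear that core's bit. */
static void check_need_tlb_flush(unsigned long *need_tlb_flush, int pcpu,
                                 void (*flush_lpid)(void))
{
        int bit = first_thread_sibling(pcpu);

        if (test_bit(bit, need_tlb_flush)) {
                flush_lpid();                   /* tlbiel over every TLB set */
                clear_bit(bit, need_tlb_flush); /* only after the flush */
        }
}
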
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 3b3791ed74a6..8431ad1e8391 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -13,6 +13,7 @@
13#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/log2.h> 15#include <linux/log2.h>
16#include <linux/sizes.h>
16 17
17#include <asm/trace.h> 18#include <asm/trace.h>
18#include <asm/kvm_ppc.h> 19#include <asm/kvm_ppc.h>
@@ -867,6 +868,149 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
867 return ret; 868 return ret;
868} 869}
869 870
871static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long gpa,
872 int writing, unsigned long *hpa,
873 struct kvm_memory_slot **memslot_p)
874{
875 struct kvm *kvm = vcpu->kvm;
876 struct kvm_memory_slot *memslot;
877 unsigned long gfn, hva, pa, psize = PAGE_SHIFT;
878 unsigned int shift;
879 pte_t *ptep, pte;
880
881 /* Find the memslot for this address */
882 gfn = gpa >> PAGE_SHIFT;
883 memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
884 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
885 return H_PARAMETER;
886
887 /* Translate to host virtual address */
888 hva = __gfn_to_hva_memslot(memslot, gfn);
889
890 /* Try to find the host pte for that virtual address */
891 ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
892 if (!ptep)
893 return H_TOO_HARD;
894 pte = kvmppc_read_update_linux_pte(ptep, writing);
895 if (!pte_present(pte))
896 return H_TOO_HARD;
897
898 /* Convert to a physical address */
899 if (shift)
900 psize = 1UL << shift;
901 pa = pte_pfn(pte) << PAGE_SHIFT;
902 pa |= hva & (psize - 1);
903 pa |= gpa & ~PAGE_MASK;
904
905 if (hpa)
906 *hpa = pa;
907 if (memslot_p)
908 *memslot_p = memslot;
909
910 return H_SUCCESS;
911}
912
913static long kvmppc_do_h_page_init_zero(struct kvm_vcpu *vcpu,
914 unsigned long dest)
915{
916 struct kvm_memory_slot *memslot;
917 struct kvm *kvm = vcpu->kvm;
918 unsigned long pa, mmu_seq;
919 long ret = H_SUCCESS;
920 int i;
921
922 /* Used later to detect if we might have been invalidated */
923 mmu_seq = kvm->mmu_notifier_seq;
924 smp_rmb();
925
926 ret = kvmppc_get_hpa(vcpu, dest, 1, &pa, &memslot);
927 if (ret != H_SUCCESS)
928 return ret;
929
930 /* Check if we've been invalidated */
931 raw_spin_lock(&kvm->mmu_lock.rlock);
932 if (mmu_notifier_retry(kvm, mmu_seq)) {
933 ret = H_TOO_HARD;
934 goto out_unlock;
935 }
936
937 /* Zero the page */
938 for (i = 0; i < SZ_4K; i += L1_CACHE_BYTES, pa += L1_CACHE_BYTES)
939 dcbz((void *)pa);
940 kvmppc_update_dirty_map(memslot, dest >> PAGE_SHIFT, PAGE_SIZE);
941
942out_unlock:
943 raw_spin_unlock(&kvm->mmu_lock.rlock);
944 return ret;
945}
946
947static long kvmppc_do_h_page_init_copy(struct kvm_vcpu *vcpu,
948 unsigned long dest, unsigned long src)
949{
950 unsigned long dest_pa, src_pa, mmu_seq;
951 struct kvm_memory_slot *dest_memslot;
952 struct kvm *kvm = vcpu->kvm;
953 long ret = H_SUCCESS;
954
955 /* Used later to detect if we might have been invalidated */
956 mmu_seq = kvm->mmu_notifier_seq;
957 smp_rmb();
958
959 ret = kvmppc_get_hpa(vcpu, dest, 1, &dest_pa, &dest_memslot);
960 if (ret != H_SUCCESS)
961 return ret;
962 ret = kvmppc_get_hpa(vcpu, src, 0, &src_pa, NULL);
963 if (ret != H_SUCCESS)
964 return ret;
965
966 /* Check if we've been invalidated */
967 raw_spin_lock(&kvm->mmu_lock.rlock);
968 if (mmu_notifier_retry(kvm, mmu_seq)) {
969 ret = H_TOO_HARD;
970 goto out_unlock;
971 }
972
973 /* Copy the page */
974 memcpy((void *)dest_pa, (void *)src_pa, SZ_4K);
975
976 kvmppc_update_dirty_map(dest_memslot, dest >> PAGE_SHIFT, PAGE_SIZE);
977
978out_unlock:
979 raw_spin_unlock(&kvm->mmu_lock.rlock);
980 return ret;
981}
982
983long kvmppc_rm_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
984 unsigned long dest, unsigned long src)
985{
986 struct kvm *kvm = vcpu->kvm;
987 u64 pg_mask = SZ_4K - 1; /* 4K page size */
988 long ret = H_SUCCESS;
989
990 /* Don't handle radix mode here, go up to the virtual mode handler */
991 if (kvm_is_radix(kvm))
992 return H_TOO_HARD;
993
994 /* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
995 if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
996 H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
997 return H_PARAMETER;
998
999 /* dest (and src if copy_page flag set) must be page aligned */
1000 if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
1001 return H_PARAMETER;
1002
1003 /* zero and/or copy the page as determined by the flags */
1004 if (flags & H_COPY_PAGE)
1005 ret = kvmppc_do_h_page_init_copy(vcpu, dest, src);
1006 else if (flags & H_ZERO_PAGE)
1007 ret = kvmppc_do_h_page_init_zero(vcpu, dest);
1008
1009 /* We can ignore the other flags */
1010
1011 return ret;
1012}
1013
870void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, 1014void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
871 unsigned long pte_index) 1015 unsigned long pte_index)
872{ 1016{
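
Both real-mode helpers above share the same invalidation-retry shape: snapshot mmu_notifier_seq, translate, then recheck under the MMU lock and punt to the virtual-mode handler with H_TOO_HARD if a notifier fired in between. A condensed skeleton of that pattern; the translate and touch callbacks are placeholders standing in for kvmppc_get_hpa() and the dcbz/memcpy step, not kernel APIs:

static long page_init_with_retry(struct kvm *kvm, unsigned long gpa,
                                 long (*translate)(struct kvm *kvm,
                                                   unsigned long gpa,
                                                   unsigned long *hpa),
                                 long (*touch)(unsigned long hpa))
{
        unsigned long mmu_seq, hpa;
        long ret;

        mmu_seq = kvm->mmu_notifier_seq;        /* snapshot before translating */
        smp_rmb();

        ret = translate(kvm, gpa, &hpa);
        if (ret != H_SUCCESS)
                return ret;

        raw_spin_lock(&kvm->mmu_lock.rlock);
        if (mmu_notifier_retry(kvm, mmu_seq)) {
                /* an invalidation raced with the translation: bail out and
                 * let the virtual-mode handler redo the work */
                ret = H_TOO_HARD;
                goto out_unlock;
        }
        ret = touch(hpa);                       /* zero or copy the 4K page */
out_unlock:
        raw_spin_unlock(&kvm->mmu_lock.rlock);
        return ret;
}
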
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 3a5e719ef032..ad7bee97de77 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -581,11 +581,8 @@ kvmppc_hv_entry:
5811: 5811:
582#endif 582#endif
583 583
584 /* Use cr7 as an indication of radix mode */
585 ld r5, HSTATE_KVM_VCORE(r13) 584 ld r5, HSTATE_KVM_VCORE(r13)
586 ld r9, VCORE_KVM(r5) /* pointer to struct kvm */ 585 ld r9, VCORE_KVM(r5) /* pointer to struct kvm */
587 lbz r0, KVM_RADIX(r9)
588 cmpwi cr7, r0, 0
589 586
590 /* 587 /*
591 * POWER7/POWER8 host -> guest partition switch code. 588 * POWER7/POWER8 host -> guest partition switch code.
@@ -608,9 +605,6 @@ kvmppc_hv_entry:
608 cmpwi r6,0 605 cmpwi r6,0
609 bne 10f 606 bne 10f
610 607
611 /* Radix has already switched LPID and flushed core TLB */
612 bne cr7, 22f
613
614 lwz r7,KVM_LPID(r9) 608 lwz r7,KVM_LPID(r9)
615BEGIN_FTR_SECTION 609BEGIN_FTR_SECTION
616 ld r6,KVM_SDR1(r9) 610 ld r6,KVM_SDR1(r9)
@@ -622,41 +616,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
622 mtspr SPRN_LPID,r7 616 mtspr SPRN_LPID,r7
623 isync 617 isync
624 618
625 /* See if we need to flush the TLB. Hash has to be done in RM */ 619 /* See if we need to flush the TLB. */
626 lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ 620 mr r3, r9 /* kvm pointer */
627BEGIN_FTR_SECTION 621 lhz r4, PACAPACAINDEX(r13) /* physical cpu number */
628 /* 622 li r5, 0 /* nested vcpu pointer */
629 * On POWER9, individual threads can come in here, but the 623 bl kvmppc_check_need_tlb_flush
630 * TLB is shared between the 4 threads in a core, hence 624 nop
631 * invalidating on one thread invalidates for all. 625 ld r5, HSTATE_KVM_VCORE(r13)
632 * Thus we make all 4 threads use the same bit here.
633 */
634 clrrdi r6,r6,2
635END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
636 clrldi r7,r6,64-6 /* extract bit number (6 bits) */
637 srdi r6,r6,6 /* doubleword number */
638 sldi r6,r6,3 /* address offset */
639 add r6,r6,r9
640 addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */
641 li r8,1
642 sld r8,r8,r7
643 ld r7,0(r6)
644 and. r7,r7,r8
645 beq 22f
646 /* Flush the TLB of any entries for this LPID */
647 lwz r0,KVM_TLB_SETS(r9)
648 mtctr r0
649 li r7,0x800 /* IS field = 0b10 */
650 ptesync
651 li r0,0 /* RS for P9 version of tlbiel */
65228: tlbiel r7 /* On P9, rs=0, RIC=0, PRS=0, R=0 */
653 addi r7,r7,0x1000
654 bdnz 28b
655 ptesync
65623: ldarx r7,0,r6 /* clear the bit after TLB flushed */
657 andc r7,r7,r8
658 stdcx. r7,0,r6
659 bne 23b
660 626
661 /* Add timebase offset onto timebase */ 627 /* Add timebase offset onto timebase */
66222: ld r8,VCORE_TB_OFFSET(r5) 62822: ld r8,VCORE_TB_OFFSET(r5)
@@ -822,18 +788,21 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
822 mtspr SPRN_IAMR, r5 788 mtspr SPRN_IAMR, r5
823 mtspr SPRN_PSPB, r6 789 mtspr SPRN_PSPB, r6
824 mtspr SPRN_FSCR, r7 790 mtspr SPRN_FSCR, r7
825 ld r5, VCPU_DAWR(r4)
826 ld r6, VCPU_DAWRX(r4)
827 ld r7, VCPU_CIABR(r4)
828 ld r8, VCPU_TAR(r4)
829 /* 791 /*
830 * Handle broken DAWR case by not writing it. This means we 792 * Handle broken DAWR case by not writing it. This means we
831 * can still store the DAWR register for migration. 793 * can still store the DAWR register for migration.
832 */ 794 */
833BEGIN_FTR_SECTION 795 LOAD_REG_ADDR(r5, dawr_force_enable)
796 lbz r5, 0(r5)
797 cmpdi r5, 0
798 beq 1f
799 ld r5, VCPU_DAWR(r4)
800 ld r6, VCPU_DAWRX(r4)
834 mtspr SPRN_DAWR, r5 801 mtspr SPRN_DAWR, r5
835 mtspr SPRN_DAWRX, r6 802 mtspr SPRN_DAWRX, r6
836END_FTR_SECTION_IFSET(CPU_FTR_DAWR) 8031:
804 ld r7, VCPU_CIABR(r4)
805 ld r8, VCPU_TAR(r4)
837 mtspr SPRN_CIABR, r7 806 mtspr SPRN_CIABR, r7
838 mtspr SPRN_TAR, r8 807 mtspr SPRN_TAR, r8
839 ld r5, VCPU_IC(r4) 808 ld r5, VCPU_IC(r4)
@@ -969,17 +938,27 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
969 938
970#ifdef CONFIG_KVM_XICS 939#ifdef CONFIG_KVM_XICS
971 /* We are entering the guest on that thread, push VCPU to XIVE */ 940 /* We are entering the guest on that thread, push VCPU to XIVE */
972 ld r10, HSTATE_XIVE_TIMA_PHYS(r13)
973 cmpldi cr0, r10, 0
974 beq no_xive
975 ld r11, VCPU_XIVE_SAVED_STATE(r4) 941 ld r11, VCPU_XIVE_SAVED_STATE(r4)
976 li r9, TM_QW1_OS 942 li r9, TM_QW1_OS
943 lwz r8, VCPU_XIVE_CAM_WORD(r4)
944 li r7, TM_QW1_OS + TM_WORD2
945 mfmsr r0
946 andi. r0, r0, MSR_DR /* in real mode? */
947 beq 2f
948 ld r10, HSTATE_XIVE_TIMA_VIRT(r13)
949 cmpldi cr1, r10, 0
950 beq cr1, no_xive
951 eieio
952 stdx r11,r9,r10
953 stwx r8,r7,r10
954 b 3f
9552: ld r10, HSTATE_XIVE_TIMA_PHYS(r13)
956 cmpldi cr1, r10, 0
957 beq cr1, no_xive
977 eieio 958 eieio
978 stdcix r11,r9,r10 959 stdcix r11,r9,r10
979 lwz r11, VCPU_XIVE_CAM_WORD(r4) 960 stwcix r8,r7,r10
980 li r9, TM_QW1_OS + TM_WORD2 9613: li r9, 1
981 stwcix r11,r9,r10
982 li r9, 1
983 stb r9, VCPU_XIVE_PUSHED(r4) 962 stb r9, VCPU_XIVE_PUSHED(r4)
984 eieio 963 eieio
985 964
@@ -998,12 +977,16 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
998 * on, we mask it. 977 * on, we mask it.
999 */ 978 */
1000 lbz r0, VCPU_XIVE_ESC_ON(r4) 979 lbz r0, VCPU_XIVE_ESC_ON(r4)
1001 cmpwi r0,0 980 cmpwi cr1, r0,0
1002 beq 1f 981 beq cr1, 1f
1003 ld r10, VCPU_XIVE_ESC_RADDR(r4)
1004 li r9, XIVE_ESB_SET_PQ_01 982 li r9, XIVE_ESB_SET_PQ_01
983 beq 4f /* in real mode? */
984 ld r10, VCPU_XIVE_ESC_VADDR(r4)
985 ldx r0, r10, r9
986 b 5f
9874: ld r10, VCPU_XIVE_ESC_RADDR(r4)
1005 ldcix r0, r10, r9 988 ldcix r0, r10, r9
1006 sync 9895: sync
1007 990
1008 /* We have a possible subtle race here: The escalation interrupt might 991 /* We have a possible subtle race here: The escalation interrupt might
1009 * have fired and be on its way to the host queue while we mask it, 992 * have fired and be on its way to the host queue while we mask it,
@@ -2281,7 +2264,7 @@ hcall_real_table:
2281#endif 2264#endif
2282 .long 0 /* 0x24 - H_SET_SPRG0 */ 2265 .long 0 /* 0x24 - H_SET_SPRG0 */
2283 .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table 2266 .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table
2284 .long 0 /* 0x2c */ 2267 .long DOTSYM(kvmppc_rm_h_page_init) - hcall_real_table
2285 .long 0 /* 0x30 */ 2268 .long 0 /* 0x30 */
2286 .long 0 /* 0x34 */ 2269 .long 0 /* 0x34 */
2287 .long 0 /* 0x38 */ 2270 .long 0 /* 0x38 */
@@ -2513,11 +2496,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
2513 blr 2496 blr
2514 2497
25152: 24982:
2516BEGIN_FTR_SECTION 2499 LOAD_REG_ADDR(r11, dawr_force_enable)
2517 /* POWER9 with disabled DAWR */ 2500 lbz r11, 0(r11)
2501 cmpdi r11, 0
2518 li r3, H_HARDWARE 2502 li r3, H_HARDWARE
2519 blr 2503 beqlr
2520END_FTR_SECTION_IFCLR(CPU_FTR_DAWR)
2521 /* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */ 2504 /* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */
2522 rlwimi r5, r4, 5, DAWRX_DR | DAWRX_DW 2505 rlwimi r5, r4, 5, DAWRX_DR | DAWRX_DW
2523 rlwimi r5, r4, 2, DAWRX_WT 2506 rlwimi r5, r4, 2, DAWRX_WT
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index f78d002f0fe0..4953957333b7 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -166,7 +166,8 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
166 return IRQ_HANDLED; 166 return IRQ_HANDLED;
167} 167}
168 168
169static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) 169int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
170 bool single_escalation)
170{ 171{
171 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 172 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
172 struct xive_q *q = &xc->queues[prio]; 173 struct xive_q *q = &xc->queues[prio];
@@ -185,7 +186,7 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
185 return -EIO; 186 return -EIO;
186 } 187 }
187 188
188 if (xc->xive->single_escalation) 189 if (single_escalation)
189 name = kasprintf(GFP_KERNEL, "kvm-%d-%d", 190 name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
190 vcpu->kvm->arch.lpid, xc->server_num); 191 vcpu->kvm->arch.lpid, xc->server_num);
191 else 192 else
@@ -217,7 +218,7 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
217 * interrupt, thus leaving it effectively masked after 218 * interrupt, thus leaving it effectively masked after
218 * it fires once. 219 * it fires once.
219 */ 220 */
220 if (xc->xive->single_escalation) { 221 if (single_escalation) {
221 struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]); 222 struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
222 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 223 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
223 224
@@ -291,7 +292,8 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio)
291 continue; 292 continue;
292 rc = xive_provision_queue(vcpu, prio); 293 rc = xive_provision_queue(vcpu, prio);
293 if (rc == 0 && !xive->single_escalation) 294 if (rc == 0 && !xive->single_escalation)
294 xive_attach_escalation(vcpu, prio); 295 kvmppc_xive_attach_escalation(vcpu, prio,
296 xive->single_escalation);
295 if (rc) 297 if (rc)
296 return rc; 298 return rc;
297 } 299 }
@@ -342,7 +344,7 @@ static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio)
342 return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY; 344 return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY;
343} 345}
344 346
345static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio) 347int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
346{ 348{
347 struct kvm_vcpu *vcpu; 349 struct kvm_vcpu *vcpu;
348 int i, rc; 350 int i, rc;
@@ -380,11 +382,6 @@ static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
380 return -EBUSY; 382 return -EBUSY;
381} 383}
382 384
383static u32 xive_vp(struct kvmppc_xive *xive, u32 server)
384{
385 return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server);
386}
387
388static u8 xive_lock_and_mask(struct kvmppc_xive *xive, 385static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
389 struct kvmppc_xive_src_block *sb, 386 struct kvmppc_xive_src_block *sb,
390 struct kvmppc_xive_irq_state *state) 387 struct kvmppc_xive_irq_state *state)
@@ -430,8 +427,8 @@ static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
430 */ 427 */
431 if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { 428 if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
432 xive_native_configure_irq(hw_num, 429 xive_native_configure_irq(hw_num,
433 xive_vp(xive, state->act_server), 430 kvmppc_xive_vp(xive, state->act_server),
434 MASKED, state->number); 431 MASKED, state->number);
435 /* set old_p so we can track if an H_EOI was done */ 432 /* set old_p so we can track if an H_EOI was done */
436 state->old_p = true; 433 state->old_p = true;
437 state->old_q = false; 434 state->old_q = false;
@@ -486,8 +483,8 @@ static void xive_finish_unmask(struct kvmppc_xive *xive,
486 */ 483 */
487 if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { 484 if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
488 xive_native_configure_irq(hw_num, 485 xive_native_configure_irq(hw_num,
489 xive_vp(xive, state->act_server), 486 kvmppc_xive_vp(xive, state->act_server),
490 state->act_priority, state->number); 487 state->act_priority, state->number);
491 /* If an EOI is needed, do it here */ 488 /* If an EOI is needed, do it here */
492 if (!state->old_p) 489 if (!state->old_p)
493 xive_vm_source_eoi(hw_num, xd); 490 xive_vm_source_eoi(hw_num, xd);
@@ -535,7 +532,7 @@ static int xive_target_interrupt(struct kvm *kvm,
535 * priority. The count for that new target will have 532 * priority. The count for that new target will have
536 * already been incremented. 533 * already been incremented.
537 */ 534 */
538 rc = xive_select_target(kvm, &server, prio); 535 rc = kvmppc_xive_select_target(kvm, &server, prio);
539 536
540 /* 537 /*
541 * We failed to find a target ? Not much we can do 538 * We failed to find a target ? Not much we can do
@@ -563,7 +560,7 @@ static int xive_target_interrupt(struct kvm *kvm,
563 kvmppc_xive_select_irq(state, &hw_num, NULL); 560 kvmppc_xive_select_irq(state, &hw_num, NULL);
564 561
565 return xive_native_configure_irq(hw_num, 562 return xive_native_configure_irq(hw_num,
566 xive_vp(xive, server), 563 kvmppc_xive_vp(xive, server),
567 prio, state->number); 564 prio, state->number);
568} 565}
569 566
@@ -849,7 +846,8 @@ int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
849 846
850 /* 847 /*
851 * We can't update the state of a "pushed" VCPU, but that 848 * We can't update the state of a "pushed" VCPU, but that
852 * shouldn't happen. 849 * shouldn't happen because the vcpu->mutex makes running a
850 * vcpu mutually exclusive with doing one_reg get/set on it.
853 */ 851 */
854 if (WARN_ON(vcpu->arch.xive_pushed)) 852 if (WARN_ON(vcpu->arch.xive_pushed))
855 return -EIO; 853 return -EIO;
@@ -940,6 +938,13 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
940 /* Turn the IPI hard off */ 938 /* Turn the IPI hard off */
941 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); 939 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
942 940
941 /*
942 * Reset ESB guest mapping. Needed when ESB pages are exposed
943 * to the guest in XIVE native mode
944 */
945 if (xive->ops && xive->ops->reset_mapped)
946 xive->ops->reset_mapped(kvm, guest_irq);
947
943 /* Grab info about irq */ 948 /* Grab info about irq */
944 state->pt_number = hw_irq; 949 state->pt_number = hw_irq;
945 state->pt_data = irq_data_get_irq_handler_data(host_data); 950 state->pt_data = irq_data_get_irq_handler_data(host_data);
@@ -951,7 +956,7 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
951 * which is fine for a never started interrupt. 956 * which is fine for a never started interrupt.
952 */ 957 */
953 xive_native_configure_irq(hw_irq, 958 xive_native_configure_irq(hw_irq,
954 xive_vp(xive, state->act_server), 959 kvmppc_xive_vp(xive, state->act_server),
955 state->act_priority, state->number); 960 state->act_priority, state->number);
956 961
957 /* 962 /*
@@ -1025,9 +1030,17 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
1025 state->pt_number = 0; 1030 state->pt_number = 0;
1026 state->pt_data = NULL; 1031 state->pt_data = NULL;
1027 1032
1033 /*
1034 * Reset ESB guest mapping. Needed when ESB pages are exposed
1035 * to the guest in XIVE native mode
1036 */
1037 if (xive->ops && xive->ops->reset_mapped) {
1038 xive->ops->reset_mapped(kvm, guest_irq);
1039 }
1040
1028 /* Reconfigure the IPI */ 1041 /* Reconfigure the IPI */
1029 xive_native_configure_irq(state->ipi_number, 1042 xive_native_configure_irq(state->ipi_number,
1030 xive_vp(xive, state->act_server), 1043 kvmppc_xive_vp(xive, state->act_server),
1031 state->act_priority, state->number); 1044 state->act_priority, state->number);
1032 1045
1033 /* 1046 /*
@@ -1049,7 +1062,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
1049} 1062}
1050EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped); 1063EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped);
1051 1064
1052static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) 1065void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
1053{ 1066{
1054 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 1067 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1055 struct kvm *kvm = vcpu->kvm; 1068 struct kvm *kvm = vcpu->kvm;
@@ -1083,14 +1096,35 @@ static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
1083 arch_spin_unlock(&sb->lock); 1096 arch_spin_unlock(&sb->lock);
1084 } 1097 }
1085 } 1098 }
1099
1100 /* Disable vcpu's escalation interrupt */
1101 if (vcpu->arch.xive_esc_on) {
1102 __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
1103 XIVE_ESB_SET_PQ_01));
1104 vcpu->arch.xive_esc_on = false;
1105 }
1106
1107 /*
1108 * Clear pointers to escalation interrupt ESB.
1109 * This is safe because the vcpu->mutex is held, preventing
1110 * any other CPU from concurrently executing a KVM_RUN ioctl.
1111 */
1112 vcpu->arch.xive_esc_vaddr = 0;
1113 vcpu->arch.xive_esc_raddr = 0;
1086} 1114}
1087 1115
1088void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) 1116void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
1089{ 1117{
1090 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 1118 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1091 struct kvmppc_xive *xive = xc->xive; 1119 struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
1092 int i; 1120 int i;
1093 1121
1122 if (!kvmppc_xics_enabled(vcpu))
1123 return;
1124
1125 if (!xc)
1126 return;
1127
1094 pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); 1128 pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
1095 1129
1096 /* Ensure no interrupt is still routed to that VP */ 1130 /* Ensure no interrupt is still routed to that VP */
@@ -1129,6 +1163,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
1129 } 1163 }
1130 /* Free the VP */ 1164 /* Free the VP */
1131 kfree(xc); 1165 kfree(xc);
1166
1167 /* Cleanup the vcpu */
1168 vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
1169 vcpu->arch.xive_vcpu = NULL;
1132} 1170}
1133 1171
1134int kvmppc_xive_connect_vcpu(struct kvm_device *dev, 1172int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
@@ -1146,7 +1184,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
1146 } 1184 }
1147 if (xive->kvm != vcpu->kvm) 1185 if (xive->kvm != vcpu->kvm)
1148 return -EPERM; 1186 return -EPERM;
1149 if (vcpu->arch.irq_type) 1187 if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
1150 return -EBUSY; 1188 return -EBUSY;
1151 if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { 1189 if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
1152 pr_devel("Duplicate !\n"); 1190 pr_devel("Duplicate !\n");
@@ -1166,7 +1204,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
1166 xc->xive = xive; 1204 xc->xive = xive;
1167 xc->vcpu = vcpu; 1205 xc->vcpu = vcpu;
1168 xc->server_num = cpu; 1206 xc->server_num = cpu;
1169 xc->vp_id = xive_vp(xive, cpu); 1207 xc->vp_id = kvmppc_xive_vp(xive, cpu);
1170 xc->mfrr = 0xff; 1208 xc->mfrr = 0xff;
1171 xc->valid = true; 1209 xc->valid = true;
1172 1210
@@ -1219,7 +1257,8 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
1219 if (xive->qmap & (1 << i)) { 1257 if (xive->qmap & (1 << i)) {
1220 r = xive_provision_queue(vcpu, i); 1258 r = xive_provision_queue(vcpu, i);
1221 if (r == 0 && !xive->single_escalation) 1259 if (r == 0 && !xive->single_escalation)
1222 xive_attach_escalation(vcpu, i); 1260 kvmppc_xive_attach_escalation(
1261 vcpu, i, xive->single_escalation);
1223 if (r) 1262 if (r)
1224 goto bail; 1263 goto bail;
1225 } else { 1264 } else {
@@ -1234,7 +1273,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
1234 } 1273 }
1235 1274
1236 /* If not done above, attach priority 0 escalation */ 1275 /* If not done above, attach priority 0 escalation */
1237 r = xive_attach_escalation(vcpu, 0); 1276 r = kvmppc_xive_attach_escalation(vcpu, 0, xive->single_escalation);
1238 if (r) 1277 if (r)
1239 goto bail; 1278 goto bail;
1240 1279
@@ -1485,8 +1524,8 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
1485 return 0; 1524 return 0;
1486} 1525}
1487 1526
1488static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive, 1527struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
1489 int irq) 1528 struct kvmppc_xive *xive, int irq)
1490{ 1529{
1491 struct kvm *kvm = xive->kvm; 1530 struct kvm *kvm = xive->kvm;
1492 struct kvmppc_xive_src_block *sb; 1531 struct kvmppc_xive_src_block *sb;
@@ -1509,6 +1548,7 @@ static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *x
1509 1548
1510 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 1549 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
1511 sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i; 1550 sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i;
1551 sb->irq_state[i].eisn = 0;
1512 sb->irq_state[i].guest_priority = MASKED; 1552 sb->irq_state[i].guest_priority = MASKED;
1513 sb->irq_state[i].saved_priority = MASKED; 1553 sb->irq_state[i].saved_priority = MASKED;
1514 sb->irq_state[i].act_priority = MASKED; 1554 sb->irq_state[i].act_priority = MASKED;
@@ -1565,7 +1605,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
1565 sb = kvmppc_xive_find_source(xive, irq, &idx); 1605 sb = kvmppc_xive_find_source(xive, irq, &idx);
1566 if (!sb) { 1606 if (!sb) {
1567 pr_devel("No source, creating source block...\n"); 1607 pr_devel("No source, creating source block...\n");
1568 sb = xive_create_src_block(xive, irq); 1608 sb = kvmppc_xive_create_src_block(xive, irq);
1569 if (!sb) { 1609 if (!sb) {
1570 pr_devel("Failed to create block...\n"); 1610 pr_devel("Failed to create block...\n");
1571 return -ENOMEM; 1611 return -ENOMEM;
@@ -1789,7 +1829,7 @@ static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
1789 xive_cleanup_irq_data(xd); 1829 xive_cleanup_irq_data(xd);
1790} 1830}
1791 1831
1792static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) 1832void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
1793{ 1833{
1794 int i; 1834 int i;
1795 1835
@@ -1810,16 +1850,55 @@ static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
1810 } 1850 }
1811} 1851}
1812 1852
1813static void kvmppc_xive_free(struct kvm_device *dev) 1853/*
1854 * Called when device fd is closed. kvm->lock is held.
1855 */
1856static void kvmppc_xive_release(struct kvm_device *dev)
1814{ 1857{
1815 struct kvmppc_xive *xive = dev->private; 1858 struct kvmppc_xive *xive = dev->private;
1816 struct kvm *kvm = xive->kvm; 1859 struct kvm *kvm = xive->kvm;
1860 struct kvm_vcpu *vcpu;
1817 int i; 1861 int i;
1862 int was_ready;
1863
1864 pr_devel("Releasing xive device\n");
1818 1865
1819 debugfs_remove(xive->dentry); 1866 debugfs_remove(xive->dentry);
1820 1867
1821 if (kvm) 1868 /*
1822 kvm->arch.xive = NULL; 1869 * Clearing mmu_ready temporarily while holding kvm->lock
1870 * is a way of ensuring that no vcpus can enter the guest
1871 * until we drop kvm->lock. Doing kick_all_cpus_sync()
1872 * ensures that any vcpu executing inside the guest has
1873 * exited the guest. Once kick_all_cpus_sync() has finished,
1874 * we know that no vcpu can be executing the XIVE push or
1875 * pull code, or executing a XICS hcall.
1876 *
1877 * Since this is the device release function, we know that
1878 * userspace does not have any open fd referring to the
 1879	 * device. Therefore none of the device attribute set/get
 1880	 * functions can be executing concurrently, and likewise the
 1881	 * connect_vcpu and set/clr_mapped functions cannot be
 1882	 * executing.
1883 */
1884 was_ready = kvm->arch.mmu_ready;
1885 kvm->arch.mmu_ready = 0;
1886 kick_all_cpus_sync();
1887
1888 /*
1889 * We should clean up the vCPU interrupt presenters first.
1890 */
1891 kvm_for_each_vcpu(i, vcpu, kvm) {
1892 /*
1893 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
1894 * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently.
1895 */
1896 mutex_lock(&vcpu->mutex);
1897 kvmppc_xive_cleanup_vcpu(vcpu);
1898 mutex_unlock(&vcpu->mutex);
1899 }
1900
1901 kvm->arch.xive = NULL;
1823 1902
1824 /* Mask and free interrupts */ 1903 /* Mask and free interrupts */
1825 for (i = 0; i <= xive->max_sbid; i++) { 1904 for (i = 0; i <= xive->max_sbid; i++) {
@@ -1832,11 +1911,47 @@ static void kvmppc_xive_free(struct kvm_device *dev)
1832 if (xive->vp_base != XIVE_INVALID_VP) 1911 if (xive->vp_base != XIVE_INVALID_VP)
1833 xive_native_free_vp_block(xive->vp_base); 1912 xive_native_free_vp_block(xive->vp_base);
1834 1913
1914 kvm->arch.mmu_ready = was_ready;
1915
1916 /*
1917 * A reference of the kvmppc_xive pointer is now kept under
1918 * the xive_devices struct of the machine for reuse. It is
1919 * freed when the VM is destroyed for now until we fix all the
1920 * execution paths.
1921 */
1835 1922
1836 kfree(xive);
1837 kfree(dev); 1923 kfree(dev);
1838} 1924}
1839 1925
1926/*
1927 * When the guest chooses the interrupt mode (XICS legacy or XIVE
 1928 * native), the VM will switch KVM devices. The previous device will
1929 * be "released" before the new one is created.
1930 *
1931 * Until we are sure all execution paths are well protected, provide a
1932 * fail safe (transitional) method for device destruction, in which
1933 * the XIVE device pointer is recycled and not directly freed.
1934 */
1935struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
1936{
1937 struct kvmppc_xive **kvm_xive_device = type == KVM_DEV_TYPE_XIVE ?
1938 &kvm->arch.xive_devices.native :
1939 &kvm->arch.xive_devices.xics_on_xive;
1940 struct kvmppc_xive *xive = *kvm_xive_device;
1941
1942 if (!xive) {
1943 xive = kzalloc(sizeof(*xive), GFP_KERNEL);
1944 *kvm_xive_device = xive;
1945 } else {
1946 memset(xive, 0, sizeof(*xive));
1947 }
1948
1949 return xive;
1950}
1951
1952/*
1953 * Create a XICS device with XIVE backend. kvm->lock is held.
1954 */
1840static int kvmppc_xive_create(struct kvm_device *dev, u32 type) 1955static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
1841{ 1956{
1842 struct kvmppc_xive *xive; 1957 struct kvmppc_xive *xive;
@@ -1845,7 +1960,7 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
1845 1960
1846 pr_devel("Creating xive for partition\n"); 1961 pr_devel("Creating xive for partition\n");
1847 1962
1848 xive = kzalloc(sizeof(*xive), GFP_KERNEL); 1963 xive = kvmppc_xive_get_device(kvm, type);
1849 if (!xive) 1964 if (!xive)
1850 return -ENOMEM; 1965 return -ENOMEM;
1851 1966
@@ -1883,6 +1998,43 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
1883 return 0; 1998 return 0;
1884} 1999}
1885 2000
2001int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
2002{
2003 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
2004 unsigned int i;
2005
2006 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
2007 struct xive_q *q = &xc->queues[i];
2008 u32 i0, i1, idx;
2009
2010 if (!q->qpage && !xc->esc_virq[i])
2011 continue;
2012
2013 seq_printf(m, " [q%d]: ", i);
2014
2015 if (q->qpage) {
2016 idx = q->idx;
2017 i0 = be32_to_cpup(q->qpage + idx);
2018 idx = (idx + 1) & q->msk;
2019 i1 = be32_to_cpup(q->qpage + idx);
2020 seq_printf(m, "T=%d %08x %08x...\n", q->toggle,
2021 i0, i1);
2022 }
2023 if (xc->esc_virq[i]) {
2024 struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
2025 struct xive_irq_data *xd =
2026 irq_data_get_irq_handler_data(d);
2027 u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
2028
2029 seq_printf(m, "E:%c%c I(%d:%llx:%llx)",
2030 (pq & XIVE_ESB_VAL_P) ? 'P' : 'p',
2031 (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q',
2032 xc->esc_virq[i], pq, xd->eoi_page);
2033 seq_puts(m, "\n");
2034 }
2035 }
2036 return 0;
2037}
1886 2038
1887static int xive_debug_show(struct seq_file *m, void *private) 2039static int xive_debug_show(struct seq_file *m, void *private)
1888{ 2040{
@@ -1908,7 +2060,6 @@ static int xive_debug_show(struct seq_file *m, void *private)
1908 2060
1909 kvm_for_each_vcpu(i, vcpu, kvm) { 2061 kvm_for_each_vcpu(i, vcpu, kvm) {
1910 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 2062 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1911 unsigned int i;
1912 2063
1913 if (!xc) 2064 if (!xc)
1914 continue; 2065 continue;
@@ -1918,33 +2069,8 @@ static int xive_debug_show(struct seq_file *m, void *private)
1918 xc->server_num, xc->cppr, xc->hw_cppr, 2069 xc->server_num, xc->cppr, xc->hw_cppr,
1919 xc->mfrr, xc->pending, 2070 xc->mfrr, xc->pending,
1920 xc->stat_rm_h_xirr, xc->stat_vm_h_xirr); 2071 xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
1921 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
1922 struct xive_q *q = &xc->queues[i];
1923 u32 i0, i1, idx;
1924
1925 if (!q->qpage && !xc->esc_virq[i])
1926 continue;
1927 2072
1928 seq_printf(m, " [q%d]: ", i); 2073 kvmppc_xive_debug_show_queues(m, vcpu);
1929
1930 if (q->qpage) {
1931 idx = q->idx;
1932 i0 = be32_to_cpup(q->qpage + idx);
1933 idx = (idx + 1) & q->msk;
1934 i1 = be32_to_cpup(q->qpage + idx);
1935 seq_printf(m, "T=%d %08x %08x... \n", q->toggle, i0, i1);
1936 }
1937 if (xc->esc_virq[i]) {
1938 struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
1939 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
1940 u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
1941 seq_printf(m, "E:%c%c I(%d:%llx:%llx)",
1942 (pq & XIVE_ESB_VAL_P) ? 'P' : 'p',
1943 (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q',
1944 xc->esc_virq[i], pq, xd->eoi_page);
1945 seq_printf(m, "\n");
1946 }
1947 }
1948 2074
1949 t_rm_h_xirr += xc->stat_rm_h_xirr; 2075 t_rm_h_xirr += xc->stat_rm_h_xirr;
1950 t_rm_h_ipoll += xc->stat_rm_h_ipoll; 2076 t_rm_h_ipoll += xc->stat_rm_h_ipoll;
@@ -1999,7 +2125,7 @@ struct kvm_device_ops kvm_xive_ops = {
1999 .name = "kvm-xive", 2125 .name = "kvm-xive",
2000 .create = kvmppc_xive_create, 2126 .create = kvmppc_xive_create,
2001 .init = kvmppc_xive_init, 2127 .init = kvmppc_xive_init,
2002 .destroy = kvmppc_xive_free, 2128 .release = kvmppc_xive_release,
2003 .set_attr = xive_set_attr, 2129 .set_attr = xive_set_attr,
2004 .get_attr = xive_get_attr, 2130 .get_attr = xive_get_attr,
2005 .has_attr = xive_has_attr, 2131 .has_attr = xive_has_attr,
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index a08ae6fd4c51..426146332984 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -13,6 +13,13 @@
13#include "book3s_xics.h" 13#include "book3s_xics.h"
14 14
15/* 15/*
16 * The XIVE Interrupt source numbers are within the range 0 to
17 * KVMPPC_XICS_NR_IRQS.
18 */
19#define KVMPPC_XIVE_FIRST_IRQ 0
20#define KVMPPC_XIVE_NR_IRQS KVMPPC_XICS_NR_IRQS
21
22/*
16 * State for one guest irq source. 23 * State for one guest irq source.
17 * 24 *
18 * For each guest source we allocate a HW interrupt in the XIVE 25 * For each guest source we allocate a HW interrupt in the XIVE
@@ -54,6 +61,9 @@ struct kvmppc_xive_irq_state {
54 bool saved_p; 61 bool saved_p;
55 bool saved_q; 62 bool saved_q;
56 u8 saved_scan_prio; 63 u8 saved_scan_prio;
64
65 /* Xive native */
66 u32 eisn; /* Guest Effective IRQ number */
57}; 67};
58 68
59/* Select the "right" interrupt (IPI vs. passthrough) */ 69/* Select the "right" interrupt (IPI vs. passthrough) */
@@ -84,6 +94,11 @@ struct kvmppc_xive_src_block {
84 struct kvmppc_xive_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; 94 struct kvmppc_xive_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
85}; 95};
86 96
97struct kvmppc_xive;
98
99struct kvmppc_xive_ops {
100 int (*reset_mapped)(struct kvm *kvm, unsigned long guest_irq);
101};
87 102
88struct kvmppc_xive { 103struct kvmppc_xive {
89 struct kvm *kvm; 104 struct kvm *kvm;
@@ -122,6 +137,10 @@ struct kvmppc_xive {
122 137
123 /* Flags */ 138 /* Flags */
124 u8 single_escalation; 139 u8 single_escalation;
140
141 struct kvmppc_xive_ops *ops;
142 struct address_space *mapping;
143 struct mutex mapping_lock;
125}; 144};
126 145
127#define KVMPPC_XIVE_Q_COUNT 8 146#define KVMPPC_XIVE_Q_COUNT 8
@@ -198,6 +217,11 @@ static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmpp
198 return xive->src_blocks[bid]; 217 return xive->src_blocks[bid];
199} 218}
200 219
220static inline u32 kvmppc_xive_vp(struct kvmppc_xive *xive, u32 server)
221{
222 return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server);
223}
224
201/* 225/*
202 * Mapping between guest priorities and host priorities 226 * Mapping between guest priorities and host priorities
 203 * is as follows. 227
@@ -248,5 +272,18 @@ extern int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
248extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr); 272extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
249extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr); 273extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
250 274
275/*
276 * Common Xive routines for XICS-over-XIVE and XIVE native
277 */
278void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu);
279int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu);
280struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
281 struct kvmppc_xive *xive, int irq);
282void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb);
283int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio);
284int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
285 bool single_escalation);
286struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type);
287
251#endif /* CONFIG_KVM_XICS */ 288#endif /* CONFIG_KVM_XICS */
252#endif /* _KVM_PPC_BOOK3S_XICS_H */ 289#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
new file mode 100644
index 000000000000..6a8e698c4b6e
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -0,0 +1,1249 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2017-2019, IBM Corporation.
4 */
5
6#define pr_fmt(fmt) "xive-kvm: " fmt
7
8#include <linux/kernel.h>
9#include <linux/kvm_host.h>
10#include <linux/err.h>
11#include <linux/gfp.h>
12#include <linux/spinlock.h>
13#include <linux/delay.h>
14#include <linux/file.h>
15#include <asm/uaccess.h>
16#include <asm/kvm_book3s.h>
17#include <asm/kvm_ppc.h>
18#include <asm/hvcall.h>
19#include <asm/xive.h>
20#include <asm/xive-regs.h>
21#include <asm/debug.h>
22#include <asm/debugfs.h>
23#include <asm/opal.h>
24
25#include <linux/debugfs.h>
26#include <linux/seq_file.h>
27
28#include "book3s_xive.h"
29
30static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
31{
32 u64 val;
33
34 if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
35 offset |= offset << 4;
36
37 val = in_be64(xd->eoi_mmio + offset);
38 return (u8)val;
39}
40
41static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
42{
43 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
44 struct xive_q *q = &xc->queues[prio];
45
46 xive_native_disable_queue(xc->vp_id, q, prio);
47 if (q->qpage) {
48 put_page(virt_to_page(q->qpage));
49 q->qpage = NULL;
50 }
51}
52
53void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
54{
55 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
56 int i;
57
58 if (!kvmppc_xive_enabled(vcpu))
59 return;
60
61 if (!xc)
62 return;
63
64 pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);
65
66 /* Ensure no interrupt is still routed to that VP */
67 xc->valid = false;
68 kvmppc_xive_disable_vcpu_interrupts(vcpu);
69
70 /* Disable the VP */
71 xive_native_disable_vp(xc->vp_id);
72
73 /* Free the queues & associated interrupts */
74 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
75 /* Free the escalation irq */
76 if (xc->esc_virq[i]) {
77 free_irq(xc->esc_virq[i], vcpu);
78 irq_dispose_mapping(xc->esc_virq[i]);
79 kfree(xc->esc_virq_names[i]);
80 xc->esc_virq[i] = 0;
81 }
82
83 /* Free the queue */
84 kvmppc_xive_native_cleanup_queue(vcpu, i);
85 }
86
87 /* Free the VP */
88 kfree(xc);
89
90 /* Cleanup the vcpu */
91 vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
92 vcpu->arch.xive_vcpu = NULL;
93}
94
95int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
96 struct kvm_vcpu *vcpu, u32 server_num)
97{
98 struct kvmppc_xive *xive = dev->private;
99 struct kvmppc_xive_vcpu *xc = NULL;
100 int rc;
101
102 pr_devel("native_connect_vcpu(server=%d)\n", server_num);
103
104 if (dev->ops != &kvm_xive_native_ops) {
105 pr_devel("Wrong ops !\n");
106 return -EPERM;
107 }
108 if (xive->kvm != vcpu->kvm)
109 return -EPERM;
110 if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
111 return -EBUSY;
112 if (server_num >= KVM_MAX_VCPUS) {
113 pr_devel("Out of bounds !\n");
114 return -EINVAL;
115 }
116
117 mutex_lock(&vcpu->kvm->lock);
118
119 if (kvmppc_xive_find_server(vcpu->kvm, server_num)) {
120 pr_devel("Duplicate !\n");
121 rc = -EEXIST;
122 goto bail;
123 }
124
125 xc = kzalloc(sizeof(*xc), GFP_KERNEL);
126 if (!xc) {
127 rc = -ENOMEM;
128 goto bail;
129 }
130
131 vcpu->arch.xive_vcpu = xc;
132 xc->xive = xive;
133 xc->vcpu = vcpu;
134 xc->server_num = server_num;
135
136 xc->vp_id = kvmppc_xive_vp(xive, server_num);
137 xc->valid = true;
138 vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;
139
140 rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
141 if (rc) {
142 pr_err("Failed to get VP info from OPAL: %d\n", rc);
143 goto bail;
144 }
145
146 /*
147 * Enable the VP first as the single escalation mode will
148 * affect escalation interrupts numbering
149 */
150 rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
151 if (rc) {
152 pr_err("Failed to enable VP in OPAL: %d\n", rc);
153 goto bail;
154 }
155
156 /* Configure VCPU fields for use by assembly push/pull */
157 vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
158 vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
159
160 /* TODO: reset all queues to a clean state ? */
161bail:
162 mutex_unlock(&vcpu->kvm->lock);
163 if (rc)
164 kvmppc_xive_native_cleanup_vcpu(vcpu);
165
166 return rc;
167}
168
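
For context, a hedged userspace sketch of creating this device and attaching a vcpu to it. It assumes the KVM_DEV_TYPE_XIVE device type and the KVM_CAP_PPC_IRQ_XIVE vcpu capability added alongside this file; the helper itself is illustrative, with error handling reduced to a minimum.

/* Hypothetical userspace sketch: create the XIVE native device and attach
 * one vcpu to it as server 0.  Assumes vm_fd and vcpu_fd already exist. */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int xive_native_attach_vcpu(int vm_fd, int vcpu_fd)
{
        struct kvm_create_device cd = { .type = KVM_DEV_TYPE_XIVE };
        struct kvm_enable_cap cap;

        if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
                return -1;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_PPC_IRQ_XIVE;
        cap.args[0] = cd.fd;            /* the XIVE device fd */
        cap.args[1] = 0;                /* server number for this vcpu */
        if (ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap) < 0)
                return -1;

        return cd.fd;
}
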
169/*
170 * Device passthrough support
171 */
172static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
173{
174 struct kvmppc_xive *xive = kvm->arch.xive;
175
176 if (irq >= KVMPPC_XIVE_NR_IRQS)
177 return -EINVAL;
178
179 /*
180 * Clear the ESB pages of the IRQ number being mapped (or
 181	 * unmapped) into the guest and let the VM fault handler
182 * repopulate with the appropriate ESB pages (device or IC)
183 */
184 pr_debug("clearing esb pages for girq 0x%lx\n", irq);
185 mutex_lock(&xive->mapping_lock);
186 if (xive->mapping)
187 unmap_mapping_range(xive->mapping,
188 irq * (2ull << PAGE_SHIFT),
189 2ull << PAGE_SHIFT, 1);
190 mutex_unlock(&xive->mapping_lock);
191 return 0;
192}
193
194static struct kvmppc_xive_ops kvmppc_xive_native_ops = {
195 .reset_mapped = kvmppc_xive_native_reset_mapped,
196};
197
198static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
199{
200 struct vm_area_struct *vma = vmf->vma;
201 struct kvm_device *dev = vma->vm_file->private_data;
202 struct kvmppc_xive *xive = dev->private;
203 struct kvmppc_xive_src_block *sb;
204 struct kvmppc_xive_irq_state *state;
205 struct xive_irq_data *xd;
206 u32 hw_num;
207 u16 src;
208 u64 page;
209 unsigned long irq;
210 u64 page_offset;
211
212 /*
213	 * Linux/KVM uses a two-page ESB setting, one page for trigger and
214	 * one page for EOI
215 */
216 page_offset = vmf->pgoff - vma->vm_pgoff;
217 irq = page_offset / 2;
218
219 sb = kvmppc_xive_find_source(xive, irq, &src);
220 if (!sb) {
221 pr_devel("%s: source %lx not found !\n", __func__, irq);
222 return VM_FAULT_SIGBUS;
223 }
224
225 state = &sb->irq_state[src];
226 kvmppc_xive_select_irq(state, &hw_num, &xd);
227
228 arch_spin_lock(&sb->lock);
229
230 /*
231 * first/even page is for trigger
232 * second/odd page is for EOI and management.
233 */
234 page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
235 arch_spin_unlock(&sb->lock);
236
237 if (WARN_ON(!page)) {
238 pr_err("%s: accessing invalid ESB page for source %lx !\n",
239 __func__, irq);
240 return VM_FAULT_SIGBUS;
241 }
242
243 vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
244 return VM_FAULT_NOPAGE;
245}
246
247static const struct vm_operations_struct xive_native_esb_vmops = {
248 .fault = xive_native_esb_fault,
249};
250
251static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
252{
253 struct vm_area_struct *vma = vmf->vma;
254
255 switch (vmf->pgoff - vma->vm_pgoff) {
256 case 0: /* HW - forbid access */
257 case 1: /* HV - forbid access */
258 return VM_FAULT_SIGBUS;
259 case 2: /* OS */
260 vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
261 return VM_FAULT_NOPAGE;
262 case 3: /* USER - TODO */
263 default:
264 return VM_FAULT_SIGBUS;
265 }
266}
267
268static const struct vm_operations_struct xive_native_tima_vmops = {
269 .fault = xive_native_tima_fault,
270};
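
For reference, a hedged userspace sketch (not part of the patch) of how the TIMA window handled above might be mapped. Only page index 2 (the OS ring) is ever inserted by the fault handler; touching the HW, HV or USER pages raises SIGBUS. The offset arithmetic follows the vm_pgoff check in kvmppc_xive_native_mmap() below; KVM_XIVE_TIMA_PAGE_OFFSET comes from the powerpc uapi header added by this series, the helper name is illustrative:

#include <sys/mman.h>
#include <unistd.h>
#include <linux/kvm.h>

/* Map the 4-page TIMA window of the XIVE device fd and return the OS page. */
static void *xive_map_tima_os(int xive_fd)
{
	long psize = sysconf(_SC_PAGESIZE);
	void *tima;

	tima = mmap(NULL, 4 * psize, PROT_READ | PROT_WRITE, MAP_SHARED,
		    xive_fd, KVM_XIVE_TIMA_PAGE_OFFSET * psize);
	if (tima == MAP_FAILED)
		return NULL;

	return (char *)tima + 2 * psize;	/* page 2 is the OS ring */
}
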
271
272static int kvmppc_xive_native_mmap(struct kvm_device *dev,
273 struct vm_area_struct *vma)
274{
275 struct kvmppc_xive *xive = dev->private;
276
277 /* We only allow mappings at fixed offset for now */
278 if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
279 if (vma_pages(vma) > 4)
280 return -EINVAL;
281 vma->vm_ops = &xive_native_tima_vmops;
282 } else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
283 if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
284 return -EINVAL;
285 vma->vm_ops = &xive_native_esb_vmops;
286 } else {
287 return -EINVAL;
288 }
289
290 vma->vm_flags |= VM_IO | VM_PFNMAP;
291 vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
292
293 /*
294 * Grab the KVM device file address_space to be able to clear
295	 * the ESB page mappings when a device is passed through into
296 * the guest.
297 */
298 xive->mapping = vma->vm_file->f_mapping;
299 return 0;
300}
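
Similarly, a hedged sketch of mapping the ESB space, assuming the two-pages-per-interrupt layout handled by xive_native_esb_fault() above (even page = trigger, odd page = EOI/management). Since only fixed-offset mappings are accepted, the region is mapped from KVM_XIVE_ESB_PAGE_OFFSET and covers at most KVMPPC_XIVE_NR_IRQS interrupts; the helper name is illustrative:

#include <sys/mman.h>
#include <unistd.h>
#include <linux/kvm.h>

/*
 * Map the ESB region; the trigger and EOI pages of a guest irq are then
 * found at esb + (2 * irq) * psize and esb + (2 * irq + 1) * psize.
 * Returns MAP_FAILED on error.
 */
static void *xive_map_esb(int xive_fd, unsigned long nr_irqs)
{
	long psize = sysconf(_SC_PAGESIZE);

	return mmap(NULL, nr_irqs * 2 * psize, PROT_READ | PROT_WRITE,
		    MAP_SHARED, xive_fd, KVM_XIVE_ESB_PAGE_OFFSET * psize);
}
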
301
302static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
303 u64 addr)
304{
305 struct kvmppc_xive_src_block *sb;
306 struct kvmppc_xive_irq_state *state;
307 u64 __user *ubufp = (u64 __user *) addr;
308 u64 val;
309 u16 idx;
310 int rc;
311
312 pr_devel("%s irq=0x%lx\n", __func__, irq);
313
314 if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
315 return -E2BIG;
316
317 sb = kvmppc_xive_find_source(xive, irq, &idx);
318 if (!sb) {
319 pr_debug("No source, creating source block...\n");
320 sb = kvmppc_xive_create_src_block(xive, irq);
321 if (!sb) {
322 pr_err("Failed to create block...\n");
323 return -ENOMEM;
324 }
325 }
326 state = &sb->irq_state[idx];
327
328 if (get_user(val, ubufp)) {
329 pr_err("fault getting user info !\n");
330 return -EFAULT;
331 }
332
333 arch_spin_lock(&sb->lock);
334
335 /*
336 * If the source doesn't already have an IPI, allocate
337 * one and get the corresponding data
338 */
339 if (!state->ipi_number) {
340 state->ipi_number = xive_native_alloc_irq();
341 if (state->ipi_number == 0) {
342 pr_err("Failed to allocate IRQ !\n");
343 rc = -ENXIO;
344 goto unlock;
345 }
346 xive_native_populate_irq_data(state->ipi_number,
347 &state->ipi_data);
348 pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
349 state->ipi_number, irq);
350 }
351
352 /* Restore LSI state */
353 if (val & KVM_XIVE_LEVEL_SENSITIVE) {
354 state->lsi = true;
355 if (val & KVM_XIVE_LEVEL_ASSERTED)
356 state->asserted = true;
357 pr_devel(" LSI ! Asserted=%d\n", state->asserted);
358 }
359
360 /* Mask IRQ to start with */
361 state->act_server = 0;
362 state->act_priority = MASKED;
363 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
364 xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
365
366 /* Increment the number of valid sources and mark this one valid */
367 if (!state->valid)
368 xive->src_count++;
369 state->valid = true;
370
371 rc = 0;
372
373unlock:
374 arch_spin_unlock(&sb->lock);
375
376 return rc;
377}
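
A sketch of the corresponding userspace call: KVM_DEV_XIVE_GRP_SOURCE takes the guest IRQ number in attr and a pointer to a 64-bit word that may carry the KVM_XIVE_LEVEL_SENSITIVE / KVM_XIVE_LEVEL_ASSERTED bits parsed above. The helper name is hypothetical, error handling elided:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Create (or restore) a guest interrupt source on the XIVE device. */
static int xive_create_source(int xive_fd, uint64_t irq, int lsi, int asserted)
{
	uint64_t state = 0;
	struct kvm_device_attr attr = {
		.group = KVM_DEV_XIVE_GRP_SOURCE,
		.attr  = irq,
		.addr  = (uint64_t)(uintptr_t)&state,
	};

	if (lsi)
		state |= KVM_XIVE_LEVEL_SENSITIVE;
	if (asserted)
		state |= KVM_XIVE_LEVEL_ASSERTED;

	return ioctl(xive_fd, KVM_SET_DEVICE_ATTR, &attr);
}
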
378
379static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
380 struct kvmppc_xive_src_block *sb,
381 struct kvmppc_xive_irq_state *state,
382 u32 server, u8 priority, bool masked,
383 u32 eisn)
384{
385 struct kvm *kvm = xive->kvm;
386 u32 hw_num;
387 int rc = 0;
388
389 arch_spin_lock(&sb->lock);
390
391 if (state->act_server == server && state->act_priority == priority &&
392 state->eisn == eisn)
393 goto unlock;
394
395 pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
396 priority, server, masked, state->act_server,
397 state->act_priority);
398
399 kvmppc_xive_select_irq(state, &hw_num, NULL);
400
401 if (priority != MASKED && !masked) {
402 rc = kvmppc_xive_select_target(kvm, &server, priority);
403 if (rc)
404 goto unlock;
405
406 state->act_priority = priority;
407 state->act_server = server;
408 state->eisn = eisn;
409
410 rc = xive_native_configure_irq(hw_num,
411 kvmppc_xive_vp(xive, server),
412 priority, eisn);
413 } else {
414 state->act_priority = MASKED;
415 state->act_server = 0;
416 state->eisn = 0;
417
418 rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
419 }
420
421unlock:
422 arch_spin_unlock(&sb->lock);
423 return rc;
424}
425
426static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
427 long irq, u64 addr)
428{
429 struct kvmppc_xive_src_block *sb;
430 struct kvmppc_xive_irq_state *state;
431 u64 __user *ubufp = (u64 __user *) addr;
432 u16 src;
433 u64 kvm_cfg;
434 u32 server;
435 u8 priority;
436 bool masked;
437 u32 eisn;
438
439 sb = kvmppc_xive_find_source(xive, irq, &src);
440 if (!sb)
441 return -ENOENT;
442
443 state = &sb->irq_state[src];
444
445 if (!state->valid)
446 return -EINVAL;
447
448 if (get_user(kvm_cfg, ubufp))
449 return -EFAULT;
450
451 pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);
452
453 priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
454 KVM_XIVE_SOURCE_PRIORITY_SHIFT;
455 server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
456 KVM_XIVE_SOURCE_SERVER_SHIFT;
457 masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
458 KVM_XIVE_SOURCE_MASKED_SHIFT;
459 eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
460 KVM_XIVE_SOURCE_EISN_SHIFT;
461
462 if (priority != xive_prio_from_guest(priority)) {
463 pr_err("invalid priority for queue %d for VCPU %d\n",
464 priority, server);
465 return -EINVAL;
466 }
467
468 return kvmppc_xive_native_update_source_config(xive, sb, state, server,
469 priority, masked, eisn);
470}
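
The matching userspace side packs the server/priority/EISN tuple into the 64-bit word that this handler demangles. A hedged sketch using the KVM_XIVE_SOURCE_* shifts from the uapi header added by this series (the masked bit is left clear here, i.e. the source is routed):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Route a guest irq to (server, priority) with the given EISN. */
static int xive_set_source_config(int xive_fd, uint64_t irq,
				  uint64_t server, uint64_t priority,
				  uint64_t eisn)
{
	uint64_t cfg = (priority << KVM_XIVE_SOURCE_PRIORITY_SHIFT) |
		       (server << KVM_XIVE_SOURCE_SERVER_SHIFT) |
		       (eisn << KVM_XIVE_SOURCE_EISN_SHIFT);
	struct kvm_device_attr attr = {
		.group = KVM_DEV_XIVE_GRP_SOURCE_CONFIG,
		.attr  = irq,
		.addr  = (uint64_t)(uintptr_t)&cfg,
	};

	return ioctl(xive_fd, KVM_SET_DEVICE_ATTR, &attr);
}
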
471
472static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
473 long irq, u64 addr)
474{
475 struct kvmppc_xive_src_block *sb;
476 struct kvmppc_xive_irq_state *state;
477 struct xive_irq_data *xd;
478 u32 hw_num;
479 u16 src;
480 int rc = 0;
481
482 pr_devel("%s irq=0x%lx", __func__, irq);
483
484 sb = kvmppc_xive_find_source(xive, irq, &src);
485 if (!sb)
486 return -ENOENT;
487
488 state = &sb->irq_state[src];
489
490 rc = -EINVAL;
491
492 arch_spin_lock(&sb->lock);
493
494 if (state->valid) {
495 kvmppc_xive_select_irq(state, &hw_num, &xd);
496 xive_native_sync_source(hw_num);
497 rc = 0;
498 }
499
500 arch_spin_unlock(&sb->lock);
501 return rc;
502}
503
504static int xive_native_validate_queue_size(u32 qshift)
505{
506 /*
507 * We only support 64K pages for the moment. This is also
508 * advertised in the DT property "ibm,xive-eq-sizes"
509 */
510 switch (qshift) {
511 case 0: /* EQ reset */
512 case 16:
513 return 0;
514 case 12:
515 case 21:
516 case 24:
517 default:
518 return -EINVAL;
519 }
520}
521
522static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
523 long eq_idx, u64 addr)
524{
525 struct kvm *kvm = xive->kvm;
526 struct kvm_vcpu *vcpu;
527 struct kvmppc_xive_vcpu *xc;
528 void __user *ubufp = (void __user *) addr;
529 u32 server;
530 u8 priority;
531 struct kvm_ppc_xive_eq kvm_eq;
532 int rc;
533 __be32 *qaddr = 0;
534 struct page *page;
535 struct xive_q *q;
536 gfn_t gfn;
537 unsigned long page_size;
538
539 /*
540 * Demangle priority/server tuple from the EQ identifier
541 */
542 priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
543 KVM_XIVE_EQ_PRIORITY_SHIFT;
544 server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
545 KVM_XIVE_EQ_SERVER_SHIFT;
546
547 if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
548 return -EFAULT;
549
550 vcpu = kvmppc_xive_find_server(kvm, server);
551 if (!vcpu) {
552 pr_err("Can't find server %d\n", server);
553 return -ENOENT;
554 }
555 xc = vcpu->arch.xive_vcpu;
556
557 if (priority != xive_prio_from_guest(priority)) {
558 pr_err("Trying to restore invalid queue %d for VCPU %d\n",
559 priority, server);
560 return -EINVAL;
561 }
562 q = &xc->queues[priority];
563
564 pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
565 __func__, server, priority, kvm_eq.flags,
566 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
567
568 /*
569	 * sPAPR specifies an "Unconditional Notify (n) flag" for the
570 * H_INT_SET_QUEUE_CONFIG hcall which forces notification
571 * without using the coalescing mechanisms provided by the
572 * XIVE END ESBs. This is required on KVM as notification
573 * using the END ESBs is not supported.
574 */
575 if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
576 pr_err("invalid flags %d\n", kvm_eq.flags);
577 return -EINVAL;
578 }
579
580 rc = xive_native_validate_queue_size(kvm_eq.qshift);
581 if (rc) {
582 pr_err("invalid queue size %d\n", kvm_eq.qshift);
583 return rc;
584 }
585
586 /* reset queue and disable queueing */
587 if (!kvm_eq.qshift) {
588 q->guest_qaddr = 0;
589 q->guest_qshift = 0;
590
591 rc = xive_native_configure_queue(xc->vp_id, q, priority,
592 NULL, 0, true);
593 if (rc) {
594 pr_err("Failed to reset queue %d for VCPU %d: %d\n",
595 priority, xc->server_num, rc);
596 return rc;
597 }
598
599 if (q->qpage) {
600 put_page(virt_to_page(q->qpage));
601 q->qpage = NULL;
602 }
603
604 return 0;
605 }
606
607 if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
608 pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
609 1ull << kvm_eq.qshift);
610 return -EINVAL;
611 }
612
613 gfn = gpa_to_gfn(kvm_eq.qaddr);
614 page = gfn_to_page(kvm, gfn);
615 if (is_error_page(page)) {
616 pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
617 return -EINVAL;
618 }
619
620 page_size = kvm_host_page_size(kvm, gfn);
621 if (1ull << kvm_eq.qshift > page_size) {
622 pr_warn("Incompatible host page size %lx!\n", page_size);
623 return -EINVAL;
624 }
625
626 qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
627
628 /*
629	 * Back up the guest address of the queue page so that the EQ
630	 * page can be marked dirty for migration.
631 */
632 q->guest_qaddr = kvm_eq.qaddr;
633 q->guest_qshift = kvm_eq.qshift;
634
635 /*
636 * Unconditional Notification is forced by default at the
637 * OPAL level because the use of END ESBs is not supported by
638 * Linux.
639 */
640 rc = xive_native_configure_queue(xc->vp_id, q, priority,
641 (__be32 *) qaddr, kvm_eq.qshift, true);
642 if (rc) {
643 pr_err("Failed to configure queue %d for VCPU %d: %d\n",
644 priority, xc->server_num, rc);
645 put_page(page);
646 return rc;
647 }
648
649 /*
650 * Only restore the queue state when needed. When doing the
651 * H_INT_SET_SOURCE_CONFIG hcall, it should not.
652 */
653 if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
654 rc = xive_native_set_queue_state(xc->vp_id, priority,
655 kvm_eq.qtoggle,
656 kvm_eq.qindex);
657 if (rc)
658 goto error;
659 }
660
661 rc = kvmppc_xive_attach_escalation(vcpu, priority,
662 xive->single_escalation);
663error:
664 if (rc)
665 kvmppc_xive_native_cleanup_queue(vcpu, priority);
666 return rc;
667}
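
A sketch of the userspace call path into this handler: the EQ identifier packs the server and priority, and struct kvm_ppc_xive_eq (from the uapi header added by this series) carries the flags, queue shift and guest address. A 64kB queue (qshift = 16) with the mandatory KVM_XIVE_EQ_ALWAYS_NOTIFY flag is assumed; the helper name is illustrative:

#include <string.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Provision a 64kB event queue for (server, priority) at guest address qaddr. */
static int xive_set_queue(int xive_fd, uint32_t server, uint8_t priority,
			  uint64_t qaddr)
{
	struct kvm_ppc_xive_eq eq;
	uint64_t eq_idx = ((uint64_t)priority << KVM_XIVE_EQ_PRIORITY_SHIFT) |
			  ((uint64_t)server << KVM_XIVE_EQ_SERVER_SHIFT);
	struct kvm_device_attr attr = {
		.group = KVM_DEV_XIVE_GRP_EQ_CONFIG,
		.attr  = eq_idx,
		.addr  = (uint64_t)(uintptr_t)&eq,
	};

	memset(&eq, 0, sizeof(eq));
	eq.flags  = KVM_XIVE_EQ_ALWAYS_NOTIFY;
	eq.qshift = 16;			/* only 64kB queues are accepted */
	eq.qaddr  = qaddr;		/* must be 64kB-aligned guest RAM */

	return ioctl(xive_fd, KVM_SET_DEVICE_ATTR, &attr);
}

Passing qshift = 0 instead resets the queue, as handled at the top of the function above; the same layout is read back with KVM_GET_DEVICE_ATTR through kvmppc_xive_native_get_queue_config() below.
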
668
669static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
670 long eq_idx, u64 addr)
671{
672 struct kvm *kvm = xive->kvm;
673 struct kvm_vcpu *vcpu;
674 struct kvmppc_xive_vcpu *xc;
675 struct xive_q *q;
676 void __user *ubufp = (u64 __user *) addr;
677 u32 server;
678 u8 priority;
679 struct kvm_ppc_xive_eq kvm_eq;
680 u64 qaddr;
681 u64 qshift;
682 u64 qeoi_page;
683 u32 escalate_irq;
684 u64 qflags;
685 int rc;
686
687 /*
688 * Demangle priority/server tuple from the EQ identifier
689 */
690 priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
691 KVM_XIVE_EQ_PRIORITY_SHIFT;
692 server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
693 KVM_XIVE_EQ_SERVER_SHIFT;
694
695 vcpu = kvmppc_xive_find_server(kvm, server);
696 if (!vcpu) {
697 pr_err("Can't find server %d\n", server);
698 return -ENOENT;
699 }
700 xc = vcpu->arch.xive_vcpu;
701
702 if (priority != xive_prio_from_guest(priority)) {
703 pr_err("invalid priority for queue %d for VCPU %d\n",
704 priority, server);
705 return -EINVAL;
706 }
707 q = &xc->queues[priority];
708
709 memset(&kvm_eq, 0, sizeof(kvm_eq));
710
711 if (!q->qpage)
712 return 0;
713
714 rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
715 &qeoi_page, &escalate_irq, &qflags);
716 if (rc)
717 return rc;
718
719 kvm_eq.flags = 0;
720 if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
721 kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;
722
723 kvm_eq.qshift = q->guest_qshift;
724 kvm_eq.qaddr = q->guest_qaddr;
725
726 rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
727 &kvm_eq.qindex);
728 if (rc)
729 return rc;
730
731 pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
732 __func__, server, priority, kvm_eq.flags,
733 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
734
735 if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
736 return -EFAULT;
737
738 return 0;
739}
740
741static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
742{
743 int i;
744
745 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
746 struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
747
748 if (!state->valid)
749 continue;
750
751 if (state->act_priority == MASKED)
752 continue;
753
754 state->eisn = 0;
755 state->act_server = 0;
756 state->act_priority = MASKED;
757 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
758 xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
759 if (state->pt_number) {
760 xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
761 xive_native_configure_irq(state->pt_number,
762 0, MASKED, 0);
763 }
764 }
765}
766
767static int kvmppc_xive_reset(struct kvmppc_xive *xive)
768{
769 struct kvm *kvm = xive->kvm;
770 struct kvm_vcpu *vcpu;
771 unsigned int i;
772
773 pr_devel("%s\n", __func__);
774
775 mutex_lock(&kvm->lock);
776
777 kvm_for_each_vcpu(i, vcpu, kvm) {
778 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
779 unsigned int prio;
780
781 if (!xc)
782 continue;
783
784 kvmppc_xive_disable_vcpu_interrupts(vcpu);
785
786 for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
787
788 /* Single escalation, no queue 7 */
789 if (prio == 7 && xive->single_escalation)
790 break;
791
792 if (xc->esc_virq[prio]) {
793 free_irq(xc->esc_virq[prio], vcpu);
794 irq_dispose_mapping(xc->esc_virq[prio]);
795 kfree(xc->esc_virq_names[prio]);
796 xc->esc_virq[prio] = 0;
797 }
798
799 kvmppc_xive_native_cleanup_queue(vcpu, prio);
800 }
801 }
802
803 for (i = 0; i <= xive->max_sbid; i++) {
804 struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
805
806 if (sb) {
807 arch_spin_lock(&sb->lock);
808 kvmppc_xive_reset_sources(sb);
809 arch_spin_unlock(&sb->lock);
810 }
811 }
812
813 mutex_unlock(&kvm->lock);
814
815 return 0;
816}
817
818static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
819{
820 int j;
821
822 for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
823 struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
824 struct xive_irq_data *xd;
825 u32 hw_num;
826
827 if (!state->valid)
828 continue;
829
830 /*
831 * The struct kvmppc_xive_irq_state reflects the state
832 * of the EAS configuration and not the state of the
833	 * source. The source is masked by setting the PQ bits to
834	 * '-Q', which is what is done before calling
835 * the KVM_DEV_XIVE_EQ_SYNC control.
836 *
837 * If a source EAS is configured, OPAL syncs the XIVE
838 * IC of the source and the XIVE IC of the previous
839 * target if any.
840 *
841 * So it should be fine ignoring MASKED sources as
842 * they have been synced already.
843 */
844 if (state->act_priority == MASKED)
845 continue;
846
847 kvmppc_xive_select_irq(state, &hw_num, &xd);
848 xive_native_sync_source(hw_num);
849 xive_native_sync_queue(hw_num);
850 }
851}
852
853static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
854{
855 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
856 unsigned int prio;
857
858 if (!xc)
859 return -ENOENT;
860
861 for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
862 struct xive_q *q = &xc->queues[prio];
863
864 if (!q->qpage)
865 continue;
866
867 /* Mark EQ page dirty for migration */
868 mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
869 }
870 return 0;
871}
872
873static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
874{
875 struct kvm *kvm = xive->kvm;
876 struct kvm_vcpu *vcpu;
877 unsigned int i;
878
879 pr_devel("%s\n", __func__);
880
881 mutex_lock(&kvm->lock);
882 for (i = 0; i <= xive->max_sbid; i++) {
883 struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
884
885 if (sb) {
886 arch_spin_lock(&sb->lock);
887 kvmppc_xive_native_sync_sources(sb);
888 arch_spin_unlock(&sb->lock);
889 }
890 }
891
892 kvm_for_each_vcpu(i, vcpu, kvm) {
893 kvmppc_xive_native_vcpu_eq_sync(vcpu);
894 }
895 mutex_unlock(&kvm->lock);
896
897 return 0;
898}
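
Userspace reaches the reset and EQ sync operations above through the control group of the device, as dispatched in kvmppc_xive_native_set_attr() below. A minimal hedged sketch (helper name illustrative); on a migration source, the EQ sync is issued after the sources have been masked, so the EQ pages get marked dirty before RAM is transferred:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Issue a control operation: KVM_DEV_XIVE_RESET or KVM_DEV_XIVE_EQ_SYNC. */
static int xive_ctrl(int xive_fd, uint64_t op)
{
	struct kvm_device_attr attr = {
		.group = KVM_DEV_XIVE_GRP_CTRL,
		.attr  = op,
	};

	return ioctl(xive_fd, KVM_SET_DEVICE_ATTR, &attr);
}
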
899
900static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
901 struct kvm_device_attr *attr)
902{
903 struct kvmppc_xive *xive = dev->private;
904
905 switch (attr->group) {
906 case KVM_DEV_XIVE_GRP_CTRL:
907 switch (attr->attr) {
908 case KVM_DEV_XIVE_RESET:
909 return kvmppc_xive_reset(xive);
910 case KVM_DEV_XIVE_EQ_SYNC:
911 return kvmppc_xive_native_eq_sync(xive);
912 }
913 break;
914 case KVM_DEV_XIVE_GRP_SOURCE:
915 return kvmppc_xive_native_set_source(xive, attr->attr,
916 attr->addr);
917 case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
918 return kvmppc_xive_native_set_source_config(xive, attr->attr,
919 attr->addr);
920 case KVM_DEV_XIVE_GRP_EQ_CONFIG:
921 return kvmppc_xive_native_set_queue_config(xive, attr->attr,
922 attr->addr);
923 case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
924 return kvmppc_xive_native_sync_source(xive, attr->attr,
925 attr->addr);
926 }
927 return -ENXIO;
928}
929
930static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
931 struct kvm_device_attr *attr)
932{
933 struct kvmppc_xive *xive = dev->private;
934
935 switch (attr->group) {
936 case KVM_DEV_XIVE_GRP_EQ_CONFIG:
937 return kvmppc_xive_native_get_queue_config(xive, attr->attr,
938 attr->addr);
939 }
940 return -ENXIO;
941}
942
943static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
944 struct kvm_device_attr *attr)
945{
946 switch (attr->group) {
947 case KVM_DEV_XIVE_GRP_CTRL:
948 switch (attr->attr) {
949 case KVM_DEV_XIVE_RESET:
950 case KVM_DEV_XIVE_EQ_SYNC:
951 return 0;
952 }
953 break;
954 case KVM_DEV_XIVE_GRP_SOURCE:
955 case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
956 case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
957 if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
958 attr->attr < KVMPPC_XIVE_NR_IRQS)
959 return 0;
960 break;
961 case KVM_DEV_XIVE_GRP_EQ_CONFIG:
962 return 0;
963 }
964 return -ENXIO;
965}
966
967/*
968 * Called when device fd is closed
969 */
970static void kvmppc_xive_native_release(struct kvm_device *dev)
971{
972 struct kvmppc_xive *xive = dev->private;
973 struct kvm *kvm = xive->kvm;
974 struct kvm_vcpu *vcpu;
975 int i;
976 int was_ready;
977
978 debugfs_remove(xive->dentry);
979
980 pr_devel("Releasing xive native device\n");
981
982 /*
983 * Clearing mmu_ready temporarily while holding kvm->lock
984 * is a way of ensuring that no vcpus can enter the guest
985 * until we drop kvm->lock. Doing kick_all_cpus_sync()
986 * ensures that any vcpu executing inside the guest has
987 * exited the guest. Once kick_all_cpus_sync() has finished,
988 * we know that no vcpu can be executing the XIVE push or
989 * pull code or accessing the XIVE MMIO regions.
990 *
991 * Since this is the device release function, we know that
992 * userspace does not have any open fd or mmap referring to
993	 * the device. Therefore none of the device attribute
994	 * set/get, mmap, or page fault functions can be running
995	 * concurrently, and similarly, the connect_vcpu and
996	 * set/clr_mapped functions cannot be running at this
997	 * point.
998 */
999 was_ready = kvm->arch.mmu_ready;
1000 kvm->arch.mmu_ready = 0;
1001 kick_all_cpus_sync();
1002
1003 /*
1004 * We should clean up the vCPU interrupt presenters first.
1005 */
1006 kvm_for_each_vcpu(i, vcpu, kvm) {
1007 /*
1008 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
1009	 * (i.e. kvmppc_xive_native_[gs]et_vp) can be in progress.
1010 */
1011 mutex_lock(&vcpu->mutex);
1012 kvmppc_xive_native_cleanup_vcpu(vcpu);
1013 mutex_unlock(&vcpu->mutex);
1014 }
1015
1016 kvm->arch.xive = NULL;
1017
1018 for (i = 0; i <= xive->max_sbid; i++) {
1019 if (xive->src_blocks[i])
1020 kvmppc_xive_free_sources(xive->src_blocks[i]);
1021 kfree(xive->src_blocks[i]);
1022 xive->src_blocks[i] = NULL;
1023 }
1024
1025 if (xive->vp_base != XIVE_INVALID_VP)
1026 xive_native_free_vp_block(xive->vp_base);
1027
1028 kvm->arch.mmu_ready = was_ready;
1029
1030 /*
1031	 * A reference to the kvmppc_xive pointer is now kept under
1032	 * the xive_devices struct of the machine for reuse. For now,
1033	 * it is only freed when the VM is destroyed, until all the
1034	 * execution paths have been fixed.
1035 */
1036
1037 kfree(dev);
1038}
1039
1040/*
1041 * Create a XIVE device. kvm->lock is held.
1042 */
1043static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
1044{
1045 struct kvmppc_xive *xive;
1046 struct kvm *kvm = dev->kvm;
1047 int ret = 0;
1048
1049 pr_devel("Creating xive native device\n");
1050
1051 if (kvm->arch.xive)
1052 return -EEXIST;
1053
1054 xive = kvmppc_xive_get_device(kvm, type);
1055 if (!xive)
1056 return -ENOMEM;
1057
1058 dev->private = xive;
1059 xive->dev = dev;
1060 xive->kvm = kvm;
1061 kvm->arch.xive = xive;
1062 mutex_init(&xive->mapping_lock);
1063
1064 /*
1065 * Allocate a bunch of VPs. KVM_MAX_VCPUS is a large value for
1066 * a default. Getting the max number of CPUs the VM was
1067 * configured with would improve our usage of the XIVE VP space.
1068 */
1069 xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS);
1070 pr_devel("VP_Base=%x\n", xive->vp_base);
1071
1072 if (xive->vp_base == XIVE_INVALID_VP)
1073 ret = -ENXIO;
1074
1075 xive->single_escalation = xive_native_has_single_escalation();
1076 xive->ops = &kvmppc_xive_native_ops;
1077
1078 if (ret)
1079 kfree(xive);
1080
1081 return ret;
1082}
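
A sketch of device creation from userspace, using the KVM_DEV_TYPE_XIVE type added to the uapi later in this patch (the helper name is illustrative):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Create the XIVE native in-kernel device and return its fd, or -1 on error. */
static int xive_create_device(int vm_fd)
{
	struct kvm_create_device cd = {
		.type = KVM_DEV_TYPE_XIVE,
	};

	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
		return -1;

	return cd.fd;
}
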
1083
1084/*
1085 * Interrupt Pending Buffer (IPB) offset
1086 */
1087#define TM_IPB_SHIFT 40
1088#define TM_IPB_MASK (((u64) 0xFF) << TM_IPB_SHIFT)
1089
1090int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
1091{
1092 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1093 u64 opal_state;
1094 int rc;
1095
1096 if (!kvmppc_xive_enabled(vcpu))
1097 return -EPERM;
1098
1099 if (!xc)
1100 return -ENOENT;
1101
1102 /* Thread context registers. We only care about IPB and CPPR */
1103 val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;
1104
1105 /* Get the VP state from OPAL */
1106 rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
1107 if (rc)
1108 return rc;
1109
1110 /*
1111	 * Capture the backup of the IPB register held in the NVT
1112	 * structure and merge it into our KVM VP state.
1113 */
1114 val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);
1115
1116	pr_devel("%s NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
1117 __func__,
1118 vcpu->arch.xive_saved_state.nsr,
1119 vcpu->arch.xive_saved_state.cppr,
1120 vcpu->arch.xive_saved_state.ipb,
1121 vcpu->arch.xive_saved_state.pipr,
1122 vcpu->arch.xive_saved_state.w01,
1123 (u32) vcpu->arch.xive_cam_word, opal_state);
1124
1125 return 0;
1126}
1127
1128int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
1129{
1130 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1131 struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
1132
1133 pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
1134 val->xive_timaval[0], val->xive_timaval[1]);
1135
1136 if (!kvmppc_xive_enabled(vcpu))
1137 return -EPERM;
1138
1139 if (!xc || !xive)
1140 return -ENOENT;
1141
1142 /* We can't update the state of a "pushed" VCPU */
1143 if (WARN_ON(vcpu->arch.xive_pushed))
1144 return -EBUSY;
1145
1146 /*
1147 * Restore the thread context registers. IPB and CPPR should
1148 * be the only ones that matter.
1149 */
1150 vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];
1151
1152 /*
1153 * There is no need to restore the XIVE internal state (IPB
1154 * stored in the NVT) as the IPB register was merged in KVM VP
1155 * state when captured.
1156 */
1157 return 0;
1158}
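
These two helpers are wired to a vcpu one_reg for the 128-bit VP state that this series adds to the powerpc uapi; the register name KVM_REG_PPC_VP_STATE is assumed here and not shown in this hunk. A hedged sketch of saving it during migration:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Save the 2 x 64-bit XIVE VP state of a vcpu (register name assumed). */
static int xive_get_vp_state(int vcpu_fd, uint64_t state[2])
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_VP_STATE,
		.addr = (uint64_t)(uintptr_t)state,
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}
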
1159
1160static int xive_native_debug_show(struct seq_file *m, void *private)
1161{
1162 struct kvmppc_xive *xive = m->private;
1163 struct kvm *kvm = xive->kvm;
1164 struct kvm_vcpu *vcpu;
1165 unsigned int i;
1166
1167 if (!kvm)
1168 return 0;
1169
1170 seq_puts(m, "=========\nVCPU state\n=========\n");
1171
1172 kvm_for_each_vcpu(i, vcpu, kvm) {
1173 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1174
1175 if (!xc)
1176 continue;
1177
1178		seq_printf(m, "cpu server %#x NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x\n",
1179 xc->server_num,
1180 vcpu->arch.xive_saved_state.nsr,
1181 vcpu->arch.xive_saved_state.cppr,
1182 vcpu->arch.xive_saved_state.ipb,
1183 vcpu->arch.xive_saved_state.pipr,
1184 vcpu->arch.xive_saved_state.w01,
1185 (u32) vcpu->arch.xive_cam_word);
1186
1187 kvmppc_xive_debug_show_queues(m, vcpu);
1188 }
1189
1190 return 0;
1191}
1192
1193static int xive_native_debug_open(struct inode *inode, struct file *file)
1194{
1195 return single_open(file, xive_native_debug_show, inode->i_private);
1196}
1197
1198static const struct file_operations xive_native_debug_fops = {
1199 .open = xive_native_debug_open,
1200 .read = seq_read,
1201 .llseek = seq_lseek,
1202 .release = single_release,
1203};
1204
1205static void xive_native_debugfs_init(struct kvmppc_xive *xive)
1206{
1207 char *name;
1208
1209 name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
1210 if (!name) {
1211 pr_err("%s: no memory for name\n", __func__);
1212 return;
1213 }
1214
1215 xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root,
1216 xive, &xive_native_debug_fops);
1217
1218 pr_debug("%s: created %s\n", __func__, name);
1219 kfree(name);
1220}
1221
1222static void kvmppc_xive_native_init(struct kvm_device *dev)
1223{
1224 struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;
1225
1226 /* Register some debug interfaces */
1227 xive_native_debugfs_init(xive);
1228}
1229
1230struct kvm_device_ops kvm_xive_native_ops = {
1231 .name = "kvm-xive-native",
1232 .create = kvmppc_xive_native_create,
1233 .init = kvmppc_xive_native_init,
1234 .release = kvmppc_xive_native_release,
1235 .set_attr = kvmppc_xive_native_set_attr,
1236 .get_attr = kvmppc_xive_native_get_attr,
1237 .has_attr = kvmppc_xive_native_has_attr,
1238 .mmap = kvmppc_xive_native_mmap,
1239};
1240
1241void kvmppc_xive_native_init_module(void)
1242{
1243 ;
1244}
1245
1246void kvmppc_xive_native_exit_module(void)
1247{
1248 ;
1249}
diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
index 033363d6e764..0737acfd17f1 100644
--- a/arch/powerpc/kvm/book3s_xive_template.c
+++ b/arch/powerpc/kvm/book3s_xive_template.c
@@ -130,24 +130,14 @@ static u32 GLUE(X_PFX,scan_interrupts)(struct kvmppc_xive_vcpu *xc,
130 */ 130 */
131 prio = ffs(pending) - 1; 131 prio = ffs(pending) - 1;
132 132
133 /*
134 * If the most favoured prio we found pending is less
135 * favored (or equal) than a pending IPI, we return
136 * the IPI instead.
137 *
138 * Note: If pending was 0 and mfrr is 0xff, we will
139 * not spurriously take an IPI because mfrr cannot
140 * then be smaller than cppr.
141 */
142 if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
143 prio = xc->mfrr;
144 hirq = XICS_IPI;
145 break;
146 }
147
148 /* Don't scan past the guest cppr */ 133 /* Don't scan past the guest cppr */
149 if (prio >= xc->cppr || prio > 7) 134 if (prio >= xc->cppr || prio > 7) {
135 if (xc->mfrr < xc->cppr) {
136 prio = xc->mfrr;
137 hirq = XICS_IPI;
138 }
150 break; 139 break;
140 }
151 141
152 /* Grab queue and pointers */ 142 /* Grab queue and pointers */
153 q = &xc->queues[prio]; 143 q = &xc->queues[prio];
@@ -184,9 +174,12 @@ skip_ipi:
184 * been set and another occurrence of the IPI will trigger. 174 * been set and another occurrence of the IPI will trigger.
185 */ 175 */
186 if (hirq == XICS_IPI || (prio == 0 && !qpage)) { 176 if (hirq == XICS_IPI || (prio == 0 && !qpage)) {
187 if (scan_type == scan_fetch) 177 if (scan_type == scan_fetch) {
188 GLUE(X_PFX,source_eoi)(xc->vp_ipi, 178 GLUE(X_PFX,source_eoi)(xc->vp_ipi,
189 &xc->vp_ipi_data); 179 &xc->vp_ipi_data);
180 q->idx = idx;
181 q->toggle = toggle;
182 }
190 /* Loop back on same queue with updated idx/toggle */ 183 /* Loop back on same queue with updated idx/toggle */
191#ifdef XIVE_RUNTIME_CHECKS 184#ifdef XIVE_RUNTIME_CHECKS
192 WARN_ON(hirq && hirq != XICS_IPI); 185 WARN_ON(hirq && hirq != XICS_IPI);
@@ -199,32 +192,41 @@ skip_ipi:
199 if (hirq == XICS_DUMMY) 192 if (hirq == XICS_DUMMY)
200 goto skip_ipi; 193 goto skip_ipi;
201 194
202 /* If fetching, update queue pointers */ 195 /* Clear the pending bit if the queue is now empty */
203 if (scan_type == scan_fetch) { 196 if (!hirq) {
204 q->idx = idx; 197 pending &= ~(1 << prio);
205 q->toggle = toggle;
206 }
207
208 /* Something found, stop searching */
209 if (hirq)
210 break;
211
212 /* Clear the pending bit on the now empty queue */
213 pending &= ~(1 << prio);
214 198
215 /* 199 /*
216 * Check if the queue count needs adjusting due to 200 * Check if the queue count needs adjusting due to
217 * interrupts being moved away. 201 * interrupts being moved away.
218 */ 202 */
219 if (atomic_read(&q->pending_count)) { 203 if (atomic_read(&q->pending_count)) {
220 int p = atomic_xchg(&q->pending_count, 0); 204 int p = atomic_xchg(&q->pending_count, 0);
221 if (p) { 205 if (p) {
222#ifdef XIVE_RUNTIME_CHECKS 206#ifdef XIVE_RUNTIME_CHECKS
223 WARN_ON(p > atomic_read(&q->count)); 207 WARN_ON(p > atomic_read(&q->count));
224#endif 208#endif
225 atomic_sub(p, &q->count); 209 atomic_sub(p, &q->count);
210 }
226 } 211 }
227 } 212 }
213
214 /*
215 * If the most favoured prio we found pending is less
216 * favored (or equal) than a pending IPI, we return
217 * the IPI instead.
218 */
219 if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
220 prio = xc->mfrr;
221 hirq = XICS_IPI;
222 break;
223 }
224
225 /* If fetching, update queue pointers */
226 if (scan_type == scan_fetch) {
227 q->idx = idx;
228 q->toggle = toggle;
229 }
228 } 230 }
229 231
230 /* If we are just taking a "peek", do nothing else */ 232 /* If we are just taking a "peek", do nothing else */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 92910b7c5bcc..3393b166817a 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -570,6 +570,16 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
570 case KVM_CAP_PPC_GET_CPU_CHAR: 570 case KVM_CAP_PPC_GET_CPU_CHAR:
571 r = 1; 571 r = 1;
572 break; 572 break;
573#ifdef CONFIG_KVM_XIVE
574 case KVM_CAP_PPC_IRQ_XIVE:
575 /*
576 * We need XIVE to be enabled on the platform (implies
577 * a POWER9 processor) and the PowerNV platform, as
578 * nested is not yet supported.
579 */
580 r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE);
581 break;
582#endif
573 583
574 case KVM_CAP_PPC_ALLOC_HTAB: 584 case KVM_CAP_PPC_ALLOC_HTAB:
575 r = hv_enabled; 585 r = hv_enabled;
@@ -750,6 +760,9 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
750 else 760 else
751 kvmppc_xics_free_icp(vcpu); 761 kvmppc_xics_free_icp(vcpu);
752 break; 762 break;
763 case KVMPPC_IRQ_XIVE:
764 kvmppc_xive_native_cleanup_vcpu(vcpu);
765 break;
753 } 766 }
754 767
755 kvmppc_core_vcpu_free(vcpu); 768 kvmppc_core_vcpu_free(vcpu);
@@ -1938,6 +1951,30 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
1938 break; 1951 break;
1939 } 1952 }
1940#endif /* CONFIG_KVM_XICS */ 1953#endif /* CONFIG_KVM_XICS */
1954#ifdef CONFIG_KVM_XIVE
1955 case KVM_CAP_PPC_IRQ_XIVE: {
1956 struct fd f;
1957 struct kvm_device *dev;
1958
1959 r = -EBADF;
1960 f = fdget(cap->args[0]);
1961 if (!f.file)
1962 break;
1963
1964 r = -ENXIO;
1965 if (!xive_enabled())
1966 break;
1967
1968 r = -EPERM;
1969 dev = kvm_device_from_filp(f.file);
1970 if (dev)
1971 r = kvmppc_xive_native_connect_vcpu(dev, vcpu,
1972 cap->args[1]);
1973
1974 fdput(f);
1975 break;
1976 }
1977#endif /* CONFIG_KVM_XIVE */
1941#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 1978#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
1942 case KVM_CAP_PPC_FWNMI: 1979 case KVM_CAP_PPC_FWNMI:
1943 r = -EINVAL; 1980 r = -EINVAL;
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
index daad8c45c8e7..7472244e7f30 100644
--- a/arch/powerpc/platforms/powernv/opal-call.c
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -260,6 +260,9 @@ OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
260OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); 260OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
261OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); 261OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC);
262OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); 262OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP);
263OPAL_CALL(opal_xive_get_queue_state, OPAL_XIVE_GET_QUEUE_STATE);
264OPAL_CALL(opal_xive_set_queue_state, OPAL_XIVE_SET_QUEUE_STATE);
265OPAL_CALL(opal_xive_get_vp_state, OPAL_XIVE_GET_VP_STATE);
263OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); 266OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET);
264OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); 267OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT);
265OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); 268OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT);
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index 1ca127d052a6..7782201e5fe8 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -437,6 +437,12 @@ void xive_native_sync_source(u32 hw_irq)
437} 437}
438EXPORT_SYMBOL_GPL(xive_native_sync_source); 438EXPORT_SYMBOL_GPL(xive_native_sync_source);
439 439
440void xive_native_sync_queue(u32 hw_irq)
441{
442 opal_xive_sync(XIVE_SYNC_QUEUE, hw_irq);
443}
444EXPORT_SYMBOL_GPL(xive_native_sync_queue);
445
440static const struct xive_ops xive_native_ops = { 446static const struct xive_ops xive_native_ops = {
441 .populate_irq_data = xive_native_populate_irq_data, 447 .populate_irq_data = xive_native_populate_irq_data,
442 .configure_irq = xive_native_configure_irq, 448 .configure_irq = xive_native_configure_irq,
@@ -515,6 +521,9 @@ u32 xive_native_default_eq_shift(void)
515} 521}
516EXPORT_SYMBOL_GPL(xive_native_default_eq_shift); 522EXPORT_SYMBOL_GPL(xive_native_default_eq_shift);
517 523
524unsigned long xive_tima_os;
525EXPORT_SYMBOL_GPL(xive_tima_os);
526
518bool __init xive_native_init(void) 527bool __init xive_native_init(void)
519{ 528{
520 struct device_node *np; 529 struct device_node *np;
@@ -567,6 +576,14 @@ bool __init xive_native_init(void)
567 for_each_possible_cpu(cpu) 576 for_each_possible_cpu(cpu)
568 kvmppc_set_xive_tima(cpu, r.start, tima); 577 kvmppc_set_xive_tima(cpu, r.start, tima);
569 578
579 /* Resource 2 is OS window */
580 if (of_address_to_resource(np, 2, &r)) {
581 pr_err("Failed to get thread mgmnt area resource\n");
582 return false;
583 }
584
585 xive_tima_os = r.start;
586
570 /* Grab size of provisionning pages */ 587 /* Grab size of provisionning pages */
571 xive_parse_provisioning(np); 588 xive_parse_provisioning(np);
572 589
@@ -711,3 +728,96 @@ bool xive_native_has_single_escalation(void)
711 return xive_has_single_esc; 728 return xive_has_single_esc;
712} 729}
713EXPORT_SYMBOL_GPL(xive_native_has_single_escalation); 730EXPORT_SYMBOL_GPL(xive_native_has_single_escalation);
731
732int xive_native_get_queue_info(u32 vp_id, u32 prio,
733 u64 *out_qpage,
734 u64 *out_qsize,
735 u64 *out_qeoi_page,
736 u32 *out_escalate_irq,
737 u64 *out_qflags)
738{
739 __be64 qpage;
740 __be64 qsize;
741 __be64 qeoi_page;
742 __be32 escalate_irq;
743 __be64 qflags;
744 s64 rc;
745
746 rc = opal_xive_get_queue_info(vp_id, prio, &qpage, &qsize,
747 &qeoi_page, &escalate_irq, &qflags);
748 if (rc) {
749 pr_err("OPAL failed to get queue info for VCPU %d/%d : %lld\n",
750 vp_id, prio, rc);
751 return -EIO;
752 }
753
754 if (out_qpage)
755 *out_qpage = be64_to_cpu(qpage);
756 if (out_qsize)
757 *out_qsize = be32_to_cpu(qsize);
758 if (out_qeoi_page)
759 *out_qeoi_page = be64_to_cpu(qeoi_page);
760 if (out_escalate_irq)
761 *out_escalate_irq = be32_to_cpu(escalate_irq);
762 if (out_qflags)
763 *out_qflags = be64_to_cpu(qflags);
764
765 return 0;
766}
767EXPORT_SYMBOL_GPL(xive_native_get_queue_info);
768
769int xive_native_get_queue_state(u32 vp_id, u32 prio, u32 *qtoggle, u32 *qindex)
770{
771 __be32 opal_qtoggle;
772 __be32 opal_qindex;
773 s64 rc;
774
775 rc = opal_xive_get_queue_state(vp_id, prio, &opal_qtoggle,
776 &opal_qindex);
777 if (rc) {
778 pr_err("OPAL failed to get queue state for VCPU %d/%d : %lld\n",
779 vp_id, prio, rc);
780 return -EIO;
781 }
782
783 if (qtoggle)
784 *qtoggle = be32_to_cpu(opal_qtoggle);
785 if (qindex)
786 *qindex = be32_to_cpu(opal_qindex);
787
788 return 0;
789}
790EXPORT_SYMBOL_GPL(xive_native_get_queue_state);
791
792int xive_native_set_queue_state(u32 vp_id, u32 prio, u32 qtoggle, u32 qindex)
793{
794 s64 rc;
795
796 rc = opal_xive_set_queue_state(vp_id, prio, qtoggle, qindex);
797 if (rc) {
798 pr_err("OPAL failed to set queue state for VCPU %d/%d : %lld\n",
799 vp_id, prio, rc);
800 return -EIO;
801 }
802
803 return 0;
804}
805EXPORT_SYMBOL_GPL(xive_native_set_queue_state);
806
807int xive_native_get_vp_state(u32 vp_id, u64 *out_state)
808{
809 __be64 state;
810 s64 rc;
811
812 rc = opal_xive_get_vp_state(vp_id, &state);
813 if (rc) {
814 pr_err("OPAL failed to get vp state for VCPU %d : %lld\n",
815 vp_id, rc);
816 return -EIO;
817 }
818
819 if (out_state)
820 *out_state = be64_to_cpu(state);
821 return 0;
822}
823EXPORT_SYMBOL_GPL(xive_native_get_vp_state);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6f665c16e31d..79fa4426509c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1270,11 +1270,21 @@ struct kvm_device_ops {
1270 */ 1270 */
1271 void (*destroy)(struct kvm_device *dev); 1271 void (*destroy)(struct kvm_device *dev);
1272 1272
1273 /*
1274 * Release is an alternative method to free the device. It is
1275 * called when the device file descriptor is closed. Once
1276 * release is called, the destroy method will not be called
1277 * anymore as the device is removed from the device list of
1278 * the VM. kvm->lock is held.
1279 */
1280 void (*release)(struct kvm_device *dev);
1281
1273 int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); 1282 int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
1274 int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); 1283 int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
1275 int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); 1284 int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
1276 long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, 1285 long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
1277 unsigned long arg); 1286 unsigned long arg);
1287 int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma);
1278}; 1288};
1279 1289
1280void kvm_device_get(struct kvm_device *dev); 1290void kvm_device_get(struct kvm_device *dev);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d673734c46cb..d727adf07801 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -989,6 +989,7 @@ struct kvm_ppc_resize_hpt {
989#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 /* Obsolete */ 989#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 /* Obsolete */
990#define KVM_CAP_HYPERV_CPUID 167 990#define KVM_CAP_HYPERV_CPUID 167
991#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 168 991#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 168
992#define KVM_CAP_PPC_IRQ_XIVE 169
992 993
993#ifdef KVM_CAP_IRQ_ROUTING 994#ifdef KVM_CAP_IRQ_ROUTING
994 995
@@ -1212,6 +1213,8 @@ enum kvm_device_type {
1212#define KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_V3 1213#define KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_V3
1213 KVM_DEV_TYPE_ARM_VGIC_ITS, 1214 KVM_DEV_TYPE_ARM_VGIC_ITS,
1214#define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS 1215#define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS
1216 KVM_DEV_TYPE_XIVE,
1217#define KVM_DEV_TYPE_XIVE KVM_DEV_TYPE_XIVE
1215 KVM_DEV_TYPE_MAX, 1218 KVM_DEV_TYPE_MAX,
1216}; 1219};
1217 1220
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f4e02cd8fa43..d22b1f4bfa56 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2950,6 +2950,16 @@ out:
2950} 2950}
2951#endif 2951#endif
2952 2952
2953static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
2954{
2955 struct kvm_device *dev = filp->private_data;
2956
2957 if (dev->ops->mmap)
2958 return dev->ops->mmap(dev, vma);
2959
2960 return -ENODEV;
2961}
2962
2953static int kvm_device_ioctl_attr(struct kvm_device *dev, 2963static int kvm_device_ioctl_attr(struct kvm_device *dev,
2954 int (*accessor)(struct kvm_device *dev, 2964 int (*accessor)(struct kvm_device *dev,
2955 struct kvm_device_attr *attr), 2965 struct kvm_device_attr *attr),
@@ -2994,6 +3004,13 @@ static int kvm_device_release(struct inode *inode, struct file *filp)
2994 struct kvm_device *dev = filp->private_data; 3004 struct kvm_device *dev = filp->private_data;
2995 struct kvm *kvm = dev->kvm; 3005 struct kvm *kvm = dev->kvm;
2996 3006
3007 if (dev->ops->release) {
3008 mutex_lock(&kvm->lock);
3009 list_del(&dev->vm_node);
3010 dev->ops->release(dev);
3011 mutex_unlock(&kvm->lock);
3012 }
3013
2997 kvm_put_kvm(kvm); 3014 kvm_put_kvm(kvm);
2998 return 0; 3015 return 0;
2999} 3016}
@@ -3002,6 +3019,7 @@ static const struct file_operations kvm_device_fops = {
3002 .unlocked_ioctl = kvm_device_ioctl, 3019 .unlocked_ioctl = kvm_device_ioctl,
3003 .release = kvm_device_release, 3020 .release = kvm_device_release,
3004 KVM_COMPAT(kvm_device_ioctl), 3021 KVM_COMPAT(kvm_device_ioctl),
3022 .mmap = kvm_device_mmap,
3005}; 3023};
3006 3024
3007struct kvm_device *kvm_device_from_filp(struct file *filp) 3025struct kvm_device *kvm_device_from_filp(struct file *filp)