author     Linus Torvalds <torvalds@linux-foundation.org>  2014-10-08 05:27:39 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-10-08 05:27:39 -0400
commit     e4e65676f272adb63655a2ca95207e8212d282f1 (patch)
tree       3679a3e6897d698ee949642660281e7f74e2852b /virt/kvm
parent     f89f4a06a59f30dec64b2afc4111426fc01e9e12 (diff)
parent     f439ed27f8b8b90d243ae15acb193d37f96eebe0 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
 "Fixes and features for 3.18.

  Apart from the usual cleanups, here is the summary of new features:

   - s390 moves closer towards host large page support

   - PowerPC has improved support for debugging (both inside the guest
     and via gdbstub) and support for e6500 processors

   - ARM/ARM64 support read-only memory (which is necessary to put
     firmware in emulated NOR flash)

   - x86 has the usual emulator fixes and nested virtualization
     improvements (including improved Windows support on Intel and
     Jailhouse hypervisor support on AMD), adaptive PLE which helps
     overcommitting of huge guests.  Also included are some patches that
     make KVM more friendly to memory hot-unplug, and fixes for rare
     caching bugs.

  Two patches have trivial mm/ parts that were acked by Rik and Andrew.

  Note: I will soon switch to a subkey for signing purposes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (157 commits)
  kvm: do not handle APIC access page if in-kernel irqchip is not in use
  KVM: s390: count vcpu wakeups in stat.halt_wakeup
  KVM: s390/facilities: allow TOD-CLOCK steering facility bit
  KVM: PPC: BOOK3S: HV: CMA: Reserve cma region only in hypervisor mode
  arm/arm64: KVM: Report correct FSC for unsupported fault types
  arm/arm64: KVM: Fix VTTBR_BADDR_MASK and pgd alloc
  kvm: Fix kvm_get_page_retry_io __gup retval check
  arm/arm64: KVM: Fix set_clear_sgi_pend_reg offset
  kvm: x86: Unpin and remove kvm_arch->apic_access_page
  kvm: vmx: Implement set_apic_access_page_addr
  kvm: x86: Add request bit to reload APIC access page address
  kvm: Add arch specific mmu notifier for page invalidation
  kvm: Rename make_all_cpus_request() to kvm_make_all_cpus_request() and make it non-static
  kvm: Fix page ageing bugs
  kvm/x86/mmu: Pass gfn and level to rmapp callback.
  x86: kvm: use alternatives for VMCALL vs. VMMCALL if kernel text is read-only
  kvm: x86: use macros to compute bank MSRs
  KVM: x86: Remove debug assertion of non-PAE reserved bits
  kvm: don't take vcpu mutex for obviously invalid vcpu ioctls
  kvm: Faults which trigger IO release the mmap_sem
  ...
Diffstat (limited to 'virt/kvm')
-rw-r--r--   virt/kvm/arm/vgic.c    744
-rw-r--r--   virt/kvm/async_pf.c      4
-rw-r--r--   virt/kvm/eventfd.c       4
-rw-r--r--   virt/kvm/ioapic.c       46
-rw-r--r--   virt/kvm/ioapic.h        2
-rw-r--r--   virt/kvm/kvm_main.c    192
-rw-r--r--   virt/kvm/vfio.c         22
-rw-r--r--   virt/kvm/vfio.h         13
8 files changed, 779 insertions, 248 deletions
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 73eba793b17f..862967852d5a 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -36,21 +36,22 @@
  * How the whole thing works (courtesy of Christoffer Dall):
  *
  * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if
- *   something is pending
- * - VGIC pending interrupts are stored on the vgic.irq_state vgic
- *   bitmap (this bitmap is updated by both user land ioctls and guest
- *   mmio ops, and other in-kernel peripherals such as the
- *   arch. timers) and indicate the 'wire' state.
+ *   something is pending on the CPU interface.
+ * - Interrupts that are pending on the distributor are stored on the
+ *   vgic.irq_pending vgic bitmap (this bitmap is updated by both user land
+ *   ioctls and guest mmio ops, and other in-kernel peripherals such as the
+ *   arch. timers).
  * - Every time the bitmap changes, the irq_pending_on_cpu oracle is
  *   recalculated
  * - To calculate the oracle, we need info for each cpu from
  *   compute_pending_for_cpu, which considers:
- *   - PPI: dist->irq_state & dist->irq_enable
- *   - SPI: dist->irq_state & dist->irq_enable & dist->irq_spi_target
- *   - irq_spi_target is a 'formatted' version of the GICD_ICFGR
+ *   - PPI: dist->irq_pending & dist->irq_enable
+ *   - SPI: dist->irq_pending & dist->irq_enable & dist->irq_spi_target
+ *   - irq_spi_target is a 'formatted' version of the GICD_ITARGETSRn
  *     registers, stored on each vcpu. We only keep one bit of
  *     information per interrupt, making sure that only one vcpu can
  *     accept the interrupt.
+ * - If any of the above state changes, we must recalculate the oracle.
  * - The same is true when injecting an interrupt, except that we only
  *   consider a single interrupt at a time. The irq_spi_cpu array
  *   contains the target CPU for each SPI.
@@ -60,13 +61,18 @@
  *   the 'line' again. This is achieved as such:
  *
  * - When a level interrupt is moved onto a vcpu, the corresponding
- *   bit in irq_active is set. As long as this bit is set, the line
+ *   bit in irq_queued is set. As long as this bit is set, the line
  *   will be ignored for further interrupts. The interrupt is injected
  *   into the vcpu with the GICH_LR_EOI bit set (generate a
  *   maintenance interrupt on EOI).
  * - When the interrupt is EOIed, the maintenance interrupt fires,
- *   and clears the corresponding bit in irq_active. This allow the
+ *   and clears the corresponding bit in irq_queued. This allows the
  *   interrupt line to be sampled again.
+ * - Note that level-triggered interrupts can also be set to pending from
+ *   writes to GICD_ISPENDRn and lowering the external input line does not
+ *   cause the interrupt to become inactive in such a situation.
+ *   Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become
+ *   inactive as long as the external input line is held high.
  */
 
 #define VGIC_ADDR_UNDEF		(-1)
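As an aside (not part of the patch): the oracle described in the comment above reduces to a per-vcpu AND of the pending, enable and, for SPIs, target bitmaps. A minimal userspace-style sketch follows, with names and sizes that are simplified assumptions rather than the kernel's real structures; compute_pending_for_cpu() further down in this diff is the actual implementation.

    #include <stdbool.h>
    #include <stdint.h>

    #define NR_SHARED_WORDS 3                      /* e.g. 96 SPIs, 32 per word */

    struct vcpu_view {
            uint32_t pend_priv, en_priv;           /* per-vcpu SGI/PPI state */
            uint32_t pend_spi[NR_SHARED_WORDS];    /* distributor SPI state */
            uint32_t en_spi[NR_SHARED_WORDS];
            uint32_t target_spi[NR_SHARED_WORDS];  /* SPIs routed to this vcpu */
    };

    /* Would this vcpu's bit in irq_pending_on_cpu be set? */
    static bool compute_pending_for_cpu_model(const struct vcpu_view *v)
    {
            int i;

            if (v->pend_priv & v->en_priv)         /* PPI/SGI: pending & enable */
                    return true;

            for (i = 0; i < NR_SHARED_WORDS; i++)  /* SPI: also gated by target */
                    if (v->pend_spi[i] & v->en_spi[i] & v->target_spi[i])
                            return true;

            return false;
    }
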
@@ -89,6 +95,7 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
89static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu); 95static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu);
90static void vgic_update_state(struct kvm *kvm); 96static void vgic_update_state(struct kvm *kvm);
91static void vgic_kick_vcpus(struct kvm *kvm); 97static void vgic_kick_vcpus(struct kvm *kvm);
98static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi);
92static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg); 99static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg);
93static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr); 100static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
94static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc); 101static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
@@ -99,10 +106,8 @@ static const struct vgic_ops *vgic_ops;
 static const struct vgic_params *vgic;
 
 /*
- * struct vgic_bitmap contains unions that provide two views of
- * the same data. In one case it is an array of registers of
- * u32's, and in the other case it is a bitmap of unsigned
- * longs.
+ * struct vgic_bitmap contains a bitmap made of unsigned longs, but
+ * extracts u32s out of them.
  *
  * This does not work on 64-bit BE systems, because the bitmap access
  * will store two consecutive 32-bit words with the higher-addressed
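The u32-register-view-over-unsigned-longs trick, including the pairwise swap that REG_OFFSET_SWIZZLE performs on 64-bit big-endian, can be sketched outside the kernel roughly as below; ASSUME_BIG_ENDIAN_64 is a stand-in macro for illustration only, not a real kernel or libc symbol.

    #include <stdint.h>

    /*
     * On a 64-bit big-endian host the high-order u32 half of each unsigned
     * long sits at the lower address, so the two register indices that share
     * a long have to be swapped; xor-ing the index with 1 does that pairwise
     * swap and is a no-op on little-endian.
     */
    #ifdef ASSUME_BIG_ENDIAN_64
    #define REG_OFFSET_SWIZZLE 1
    #else
    #define REG_OFFSET_SWIZZLE 0
    #endif

    /* Pointer to the idx-th 32-bit "register" inside a bitmap of longs. */
    static uint32_t *bitmap_u32_reg(unsigned long *bitmap, unsigned int idx)
    {
            return (uint32_t *)bitmap + (idx ^ REG_OFFSET_SWIZZLE);
    }
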
@@ -118,23 +123,45 @@ static const struct vgic_params *vgic;
118#define REG_OFFSET_SWIZZLE 0 123#define REG_OFFSET_SWIZZLE 0
119#endif 124#endif
120 125
126static int vgic_init_bitmap(struct vgic_bitmap *b, int nr_cpus, int nr_irqs)
127{
128 int nr_longs;
129
130 nr_longs = nr_cpus + BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS);
131
132 b->private = kzalloc(sizeof(unsigned long) * nr_longs, GFP_KERNEL);
133 if (!b->private)
134 return -ENOMEM;
135
136 b->shared = b->private + nr_cpus;
137
138 return 0;
139}
140
141static void vgic_free_bitmap(struct vgic_bitmap *b)
142{
143 kfree(b->private);
144 b->private = NULL;
145 b->shared = NULL;
146}
147
121static u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, 148static u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x,
122 int cpuid, u32 offset) 149 int cpuid, u32 offset)
123{ 150{
124 offset >>= 2; 151 offset >>= 2;
125 if (!offset) 152 if (!offset)
126 return x->percpu[cpuid].reg + (offset ^ REG_OFFSET_SWIZZLE); 153 return (u32 *)(x->private + cpuid) + REG_OFFSET_SWIZZLE;
127 else 154 else
128 return x->shared.reg + ((offset - 1) ^ REG_OFFSET_SWIZZLE); 155 return (u32 *)(x->shared) + ((offset - 1) ^ REG_OFFSET_SWIZZLE);
129} 156}
130 157
131static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x, 158static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x,
132 int cpuid, int irq) 159 int cpuid, int irq)
133{ 160{
134 if (irq < VGIC_NR_PRIVATE_IRQS) 161 if (irq < VGIC_NR_PRIVATE_IRQS)
135 return test_bit(irq, x->percpu[cpuid].reg_ul); 162 return test_bit(irq, x->private + cpuid);
136 163
137 return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared.reg_ul); 164 return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared);
138} 165}
139 166
140static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, 167static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
@@ -143,9 +170,9 @@ static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
143 unsigned long *reg; 170 unsigned long *reg;
144 171
145 if (irq < VGIC_NR_PRIVATE_IRQS) { 172 if (irq < VGIC_NR_PRIVATE_IRQS) {
146 reg = x->percpu[cpuid].reg_ul; 173 reg = x->private + cpuid;
147 } else { 174 } else {
148 reg = x->shared.reg_ul; 175 reg = x->shared;
149 irq -= VGIC_NR_PRIVATE_IRQS; 176 irq -= VGIC_NR_PRIVATE_IRQS;
150 } 177 }
151 178
@@ -157,24 +184,49 @@ static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
157 184
158static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid) 185static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid)
159{ 186{
160 if (unlikely(cpuid >= VGIC_MAX_CPUS)) 187 return x->private + cpuid;
161 return NULL;
162 return x->percpu[cpuid].reg_ul;
163} 188}
164 189
165static unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x) 190static unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x)
166{ 191{
167 return x->shared.reg_ul; 192 return x->shared;
193}
194
195static int vgic_init_bytemap(struct vgic_bytemap *x, int nr_cpus, int nr_irqs)
196{
197 int size;
198
199 size = nr_cpus * VGIC_NR_PRIVATE_IRQS;
200 size += nr_irqs - VGIC_NR_PRIVATE_IRQS;
201
202 x->private = kzalloc(size, GFP_KERNEL);
203 if (!x->private)
204 return -ENOMEM;
205
206 x->shared = x->private + nr_cpus * VGIC_NR_PRIVATE_IRQS / sizeof(u32);
207 return 0;
208}
209
210static void vgic_free_bytemap(struct vgic_bytemap *b)
211{
212 kfree(b->private);
213 b->private = NULL;
214 b->shared = NULL;
168} 215}
169 216
170static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset) 217static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset)
171{ 218{
172 offset >>= 2; 219 u32 *reg;
173 BUG_ON(offset > (VGIC_NR_IRQS / 4)); 220
174 if (offset < 8) 221 if (offset < VGIC_NR_PRIVATE_IRQS) {
175 return x->percpu[cpuid] + offset; 222 reg = x->private;
176 else 223 offset += cpuid * VGIC_NR_PRIVATE_IRQS;
177 return x->shared + offset - 8; 224 } else {
225 reg = x->shared;
226 offset -= VGIC_NR_PRIVATE_IRQS;
227 }
228
229 return reg + (offset / sizeof(u32));
178} 230}
179 231
180#define VGIC_CFG_LEVEL 0 232#define VGIC_CFG_LEVEL 0
@@ -196,46 +248,81 @@ static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq)
196 return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq); 248 return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq);
197} 249}
198 250
199static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq) 251static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq)
252{
253 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
254
255 return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq);
256}
257
258static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq)
259{
260 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
261
262 vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 1);
263}
264
265static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq)
266{
267 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
268
269 vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0);
270}
271
272static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq)
200{ 273{
201 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 274 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
202 275
203 return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq); 276 return vgic_bitmap_get_irq_val(&dist->irq_level, vcpu->vcpu_id, irq);
204} 277}
205 278
206static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq) 279static void vgic_dist_irq_set_level(struct kvm_vcpu *vcpu, int irq)
207{ 280{
208 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 281 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
209 282
210 vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1); 283 vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 1);
211} 284}
212 285
213static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq) 286static void vgic_dist_irq_clear_level(struct kvm_vcpu *vcpu, int irq)
214{ 287{
215 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 288 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
216 289
217 vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0); 290 vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 0);
291}
292
293static int vgic_dist_irq_soft_pend(struct kvm_vcpu *vcpu, int irq)
294{
295 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
296
297 return vgic_bitmap_get_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq);
298}
299
300static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq)
301{
302 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
303
304 vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0);
218} 305}
219 306
220static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) 307static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
221{ 308{
222 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 309 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
223 310
224 return vgic_bitmap_get_irq_val(&dist->irq_state, vcpu->vcpu_id, irq); 311 return vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq);
225} 312}
226 313
227static void vgic_dist_irq_set(struct kvm_vcpu *vcpu, int irq) 314static void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq)
228{ 315{
229 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 316 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
230 317
231 vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 1); 318 vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 1);
232} 319}
233 320
234static void vgic_dist_irq_clear(struct kvm_vcpu *vcpu, int irq) 321static void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq)
235{ 322{
236 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 323 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
237 324
238 vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 0); 325 vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 0);
239} 326}
240 327
241static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq) 328static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq)
@@ -256,6 +343,11 @@ static void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
256 vcpu->arch.vgic_cpu.pending_shared); 343 vcpu->arch.vgic_cpu.pending_shared);
257} 344}
258 345
346static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
347{
348 return vgic_irq_is_edge(vcpu, irq) || !vgic_irq_is_queued(vcpu, irq);
349}
350
259static u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask) 351static u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask)
260{ 352{
261 return le32_to_cpu(*((u32 *)mmio->data)) & mask; 353 return le32_to_cpu(*((u32 *)mmio->data)) & mask;
@@ -347,7 +439,7 @@ static bool handle_mmio_misc(struct kvm_vcpu *vcpu,
347 439
348 case 4: /* GICD_TYPER */ 440 case 4: /* GICD_TYPER */
349 reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; 441 reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
350 reg |= (VGIC_NR_IRQS >> 5) - 1; 442 reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1;
351 vgic_reg_access(mmio, &reg, word_offset, 443 vgic_reg_access(mmio, &reg, word_offset,
352 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); 444 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
353 break; 445 break;
@@ -409,11 +501,33 @@ static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu,
409 struct kvm_exit_mmio *mmio, 501 struct kvm_exit_mmio *mmio,
410 phys_addr_t offset) 502 phys_addr_t offset)
411{ 503{
412 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, 504 u32 *reg, orig;
413 vcpu->vcpu_id, offset); 505 u32 level_mask;
506 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
507
508 reg = vgic_bitmap_get_reg(&dist->irq_cfg, vcpu->vcpu_id, offset);
509 level_mask = (~(*reg));
510
511 /* Mark both level and edge triggered irqs as pending */
512 reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu->vcpu_id, offset);
513 orig = *reg;
414 vgic_reg_access(mmio, reg, offset, 514 vgic_reg_access(mmio, reg, offset,
415 ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); 515 ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
516
416 if (mmio->is_write) { 517 if (mmio->is_write) {
518 /* Set the soft-pending flag only for level-triggered irqs */
519 reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
520 vcpu->vcpu_id, offset);
521 vgic_reg_access(mmio, reg, offset,
522 ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
523 *reg &= level_mask;
524
525 /* Ignore writes to SGIs */
526 if (offset < 2) {
527 *reg &= ~0xffff;
528 *reg |= orig & 0xffff;
529 }
530
417 vgic_update_state(vcpu->kvm); 531 vgic_update_state(vcpu->kvm);
418 return true; 532 return true;
419 } 533 }
@@ -425,11 +539,34 @@ static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
425 struct kvm_exit_mmio *mmio, 539 struct kvm_exit_mmio *mmio,
426 phys_addr_t offset) 540 phys_addr_t offset)
427{ 541{
428 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, 542 u32 *level_active;
429 vcpu->vcpu_id, offset); 543 u32 *reg, orig;
544 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
545
546 reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu->vcpu_id, offset);
547 orig = *reg;
430 vgic_reg_access(mmio, reg, offset, 548 vgic_reg_access(mmio, reg, offset,
431 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); 549 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
432 if (mmio->is_write) { 550 if (mmio->is_write) {
551 /* Re-set level triggered level-active interrupts */
552 level_active = vgic_bitmap_get_reg(&dist->irq_level,
553 vcpu->vcpu_id, offset);
554 reg = vgic_bitmap_get_reg(&dist->irq_pending,
555 vcpu->vcpu_id, offset);
556 *reg |= *level_active;
557
558 /* Ignore writes to SGIs */
559 if (offset < 2) {
560 *reg &= ~0xffff;
561 *reg |= orig & 0xffff;
562 }
563
564 /* Clear soft-pending flags */
565 reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
566 vcpu->vcpu_id, offset);
567 vgic_reg_access(mmio, reg, offset,
568 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
569
433 vgic_update_state(vcpu->kvm); 570 vgic_update_state(vcpu->kvm);
434 return true; 571 return true;
435 } 572 }
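Taken together, the two handlers above implement the rule that a level-triggered interrupt stays pending while either the external line is high or a soft-pend was latched through GICD_ISPENDRn. A toy model of just that rule, as assumption-level illustrative code rather than the kernel's bitmap machinery:

    #include <stdbool.h>

    struct lvl_irq {
            bool line;       /* external input level */
            bool soft_pend;  /* latched by a write to GICD_ISPENDRn */
    };

    static bool lvl_irq_pending(const struct lvl_irq *s)
    {
            return s->line || s->soft_pend;
    }

    static void write_ispendr(struct lvl_irq *s) { s->soft_pend = true; }

    /* GICD_ICPENDRn only clears the latched state; the irq remains pending
     * as long as the line itself is still held high. */
    static void write_icpendr(struct lvl_irq *s) { s->soft_pend = false; }

    /* Lowering the line leaves the irq pending if a soft-pend is latched. */
    static void set_line(struct lvl_irq *s, bool level) { s->line = level; }
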
@@ -651,9 +788,9 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
651 * is fine, then we are only setting a few bits that were 788 * is fine, then we are only setting a few bits that were
652 * already set. 789 * already set.
653 */ 790 */
654 vgic_dist_irq_set(vcpu, lr.irq); 791 vgic_dist_irq_set_pending(vcpu, lr.irq);
655 if (lr.irq < VGIC_NR_SGIS) 792 if (lr.irq < VGIC_NR_SGIS)
656 dist->irq_sgi_sources[vcpu_id][lr.irq] |= 1 << lr.source; 793 *vgic_get_sgi_sources(dist, vcpu_id, lr.irq) |= 1 << lr.source;
657 lr.state &= ~LR_STATE_PENDING; 794 lr.state &= ~LR_STATE_PENDING;
658 vgic_set_lr(vcpu, i, lr); 795 vgic_set_lr(vcpu, i, lr);
659 796
@@ -662,8 +799,10 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
662 * active), then the LR does not hold any useful info and can 799 * active), then the LR does not hold any useful info and can
663 * be marked as free for other use. 800 * be marked as free for other use.
664 */ 801 */
665 if (!(lr.state & LR_STATE_MASK)) 802 if (!(lr.state & LR_STATE_MASK)) {
666 vgic_retire_lr(i, lr.irq, vcpu); 803 vgic_retire_lr(i, lr.irq, vcpu);
804 vgic_irq_clear_queued(vcpu, lr.irq);
805 }
667 806
668 /* Finally update the VGIC state. */ 807 /* Finally update the VGIC state. */
669 vgic_update_state(vcpu->kvm); 808 vgic_update_state(vcpu->kvm);
@@ -677,7 +816,7 @@ static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
677{ 816{
678 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 817 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
679 int sgi; 818 int sgi;
680 int min_sgi = (offset & ~0x3) * 4; 819 int min_sgi = (offset & ~0x3);
681 int max_sgi = min_sgi + 3; 820 int max_sgi = min_sgi + 3;
682 int vcpu_id = vcpu->vcpu_id; 821 int vcpu_id = vcpu->vcpu_id;
683 u32 reg = 0; 822 u32 reg = 0;
@@ -685,7 +824,7 @@ static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
685 /* Copy source SGIs from distributor side */ 824 /* Copy source SGIs from distributor side */
686 for (sgi = min_sgi; sgi <= max_sgi; sgi++) { 825 for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
687 int shift = 8 * (sgi - min_sgi); 826 int shift = 8 * (sgi - min_sgi);
688 reg |= (u32)dist->irq_sgi_sources[vcpu_id][sgi] << shift; 827 reg |= ((u32)*vgic_get_sgi_sources(dist, vcpu_id, sgi)) << shift;
689 } 828 }
690 829
691 mmio_data_write(mmio, ~0, reg); 830 mmio_data_write(mmio, ~0, reg);
@@ -698,7 +837,7 @@ static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
698{ 837{
699 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 838 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
700 int sgi; 839 int sgi;
701 int min_sgi = (offset & ~0x3) * 4; 840 int min_sgi = (offset & ~0x3);
702 int max_sgi = min_sgi + 3; 841 int max_sgi = min_sgi + 3;
703 int vcpu_id = vcpu->vcpu_id; 842 int vcpu_id = vcpu->vcpu_id;
704 u32 reg; 843 u32 reg;
@@ -709,14 +848,15 @@ static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
709 /* Clear pending SGIs on the distributor */ 848 /* Clear pending SGIs on the distributor */
710 for (sgi = min_sgi; sgi <= max_sgi; sgi++) { 849 for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
711 u8 mask = reg >> (8 * (sgi - min_sgi)); 850 u8 mask = reg >> (8 * (sgi - min_sgi));
851 u8 *src = vgic_get_sgi_sources(dist, vcpu_id, sgi);
712 if (set) { 852 if (set) {
713 if ((dist->irq_sgi_sources[vcpu_id][sgi] & mask) != mask) 853 if ((*src & mask) != mask)
714 updated = true; 854 updated = true;
715 dist->irq_sgi_sources[vcpu_id][sgi] |= mask; 855 *src |= mask;
716 } else { 856 } else {
717 if (dist->irq_sgi_sources[vcpu_id][sgi] & mask) 857 if (*src & mask)
718 updated = true; 858 updated = true;
719 dist->irq_sgi_sources[vcpu_id][sgi] &= ~mask; 859 *src &= ~mask;
720 } 860 }
721 } 861 }
722 862
@@ -755,6 +895,7 @@ static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu,
755struct mmio_range { 895struct mmio_range {
756 phys_addr_t base; 896 phys_addr_t base;
757 unsigned long len; 897 unsigned long len;
898 int bits_per_irq;
758 bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, 899 bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
759 phys_addr_t offset); 900 phys_addr_t offset);
760}; 901};
@@ -763,56 +904,67 @@ static const struct mmio_range vgic_dist_ranges[] = {
763 { 904 {
764 .base = GIC_DIST_CTRL, 905 .base = GIC_DIST_CTRL,
765 .len = 12, 906 .len = 12,
907 .bits_per_irq = 0,
766 .handle_mmio = handle_mmio_misc, 908 .handle_mmio = handle_mmio_misc,
767 }, 909 },
768 { 910 {
769 .base = GIC_DIST_IGROUP, 911 .base = GIC_DIST_IGROUP,
770 .len = VGIC_NR_IRQS / 8, 912 .len = VGIC_MAX_IRQS / 8,
913 .bits_per_irq = 1,
771 .handle_mmio = handle_mmio_raz_wi, 914 .handle_mmio = handle_mmio_raz_wi,
772 }, 915 },
773 { 916 {
774 .base = GIC_DIST_ENABLE_SET, 917 .base = GIC_DIST_ENABLE_SET,
775 .len = VGIC_NR_IRQS / 8, 918 .len = VGIC_MAX_IRQS / 8,
919 .bits_per_irq = 1,
776 .handle_mmio = handle_mmio_set_enable_reg, 920 .handle_mmio = handle_mmio_set_enable_reg,
777 }, 921 },
778 { 922 {
779 .base = GIC_DIST_ENABLE_CLEAR, 923 .base = GIC_DIST_ENABLE_CLEAR,
780 .len = VGIC_NR_IRQS / 8, 924 .len = VGIC_MAX_IRQS / 8,
925 .bits_per_irq = 1,
781 .handle_mmio = handle_mmio_clear_enable_reg, 926 .handle_mmio = handle_mmio_clear_enable_reg,
782 }, 927 },
783 { 928 {
784 .base = GIC_DIST_PENDING_SET, 929 .base = GIC_DIST_PENDING_SET,
785 .len = VGIC_NR_IRQS / 8, 930 .len = VGIC_MAX_IRQS / 8,
931 .bits_per_irq = 1,
786 .handle_mmio = handle_mmio_set_pending_reg, 932 .handle_mmio = handle_mmio_set_pending_reg,
787 }, 933 },
788 { 934 {
789 .base = GIC_DIST_PENDING_CLEAR, 935 .base = GIC_DIST_PENDING_CLEAR,
790 .len = VGIC_NR_IRQS / 8, 936 .len = VGIC_MAX_IRQS / 8,
937 .bits_per_irq = 1,
791 .handle_mmio = handle_mmio_clear_pending_reg, 938 .handle_mmio = handle_mmio_clear_pending_reg,
792 }, 939 },
793 { 940 {
794 .base = GIC_DIST_ACTIVE_SET, 941 .base = GIC_DIST_ACTIVE_SET,
795 .len = VGIC_NR_IRQS / 8, 942 .len = VGIC_MAX_IRQS / 8,
943 .bits_per_irq = 1,
796 .handle_mmio = handle_mmio_raz_wi, 944 .handle_mmio = handle_mmio_raz_wi,
797 }, 945 },
798 { 946 {
799 .base = GIC_DIST_ACTIVE_CLEAR, 947 .base = GIC_DIST_ACTIVE_CLEAR,
800 .len = VGIC_NR_IRQS / 8, 948 .len = VGIC_MAX_IRQS / 8,
949 .bits_per_irq = 1,
801 .handle_mmio = handle_mmio_raz_wi, 950 .handle_mmio = handle_mmio_raz_wi,
802 }, 951 },
803 { 952 {
804 .base = GIC_DIST_PRI, 953 .base = GIC_DIST_PRI,
805 .len = VGIC_NR_IRQS, 954 .len = VGIC_MAX_IRQS,
955 .bits_per_irq = 8,
806 .handle_mmio = handle_mmio_priority_reg, 956 .handle_mmio = handle_mmio_priority_reg,
807 }, 957 },
808 { 958 {
809 .base = GIC_DIST_TARGET, 959 .base = GIC_DIST_TARGET,
810 .len = VGIC_NR_IRQS, 960 .len = VGIC_MAX_IRQS,
961 .bits_per_irq = 8,
811 .handle_mmio = handle_mmio_target_reg, 962 .handle_mmio = handle_mmio_target_reg,
812 }, 963 },
813 { 964 {
814 .base = GIC_DIST_CONFIG, 965 .base = GIC_DIST_CONFIG,
815 .len = VGIC_NR_IRQS / 4, 966 .len = VGIC_MAX_IRQS / 4,
967 .bits_per_irq = 2,
816 .handle_mmio = handle_mmio_cfg_reg, 968 .handle_mmio = handle_mmio_cfg_reg,
817 }, 969 },
818 { 970 {
@@ -850,6 +1002,22 @@ struct mmio_range *find_matching_range(const struct mmio_range *ranges,
850 return NULL; 1002 return NULL;
851} 1003}
852 1004
1005static bool vgic_validate_access(const struct vgic_dist *dist,
1006 const struct mmio_range *range,
1007 unsigned long offset)
1008{
1009 int irq;
1010
1011 if (!range->bits_per_irq)
1012 return true; /* Not an irq-based access */
1013
1014 irq = offset * 8 / range->bits_per_irq;
1015 if (irq >= dist->nr_irqs)
1016 return false;
1017
1018 return true;
1019}
1020
853/** 1021/**
854 * vgic_handle_mmio - handle an in-kernel MMIO access 1022 * vgic_handle_mmio - handle an in-kernel MMIO access
855 * @vcpu: pointer to the vcpu performing the access 1023 * @vcpu: pointer to the vcpu performing the access
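What the new vgic_validate_access() check buys: with nr_irqs now configurable, guest accesses to register words beyond the configured range become RAZ/WI instead of indexing past the (smaller) allocated bitmaps. A rough standalone restatement with illustrative numbers, not the kernel code:

    #include <stdbool.h>
    #include <stdio.h>

    /* An access at byte 'offset' into a block using 'bits_per_irq' bits per
     * interrupt first touches irq = offset * 8 / bits_per_irq; anything at
     * or past nr_irqs is out of range. */
    static bool access_in_range(unsigned long offset, int bits_per_irq,
                                int nr_irqs)
    {
            if (!bits_per_irq)
                    return true;    /* not an irq-indexed register block */
            return (offset * 8 / bits_per_irq) < (unsigned long)nr_irqs;
    }

    int main(void)
    {
            /* GICD_ISENABLERn: 1 bit per irq.  With 128 IRQs configured,
             * byte offset 0x10 starts at irq 128 -> rejected (prints 0). */
            printf("%d\n", access_in_range(0x10, 1, 128));
            /* GICD_IPRIORITYRn: 8 bits per irq.  Offset 0x7c is irq 124,
             * still in range (prints 1). */
            printf("%d\n", access_in_range(0x7c, 8, 128));
            return 0;
    }
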
@@ -889,7 +1057,13 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
889 1057
890 spin_lock(&vcpu->kvm->arch.vgic.lock); 1058 spin_lock(&vcpu->kvm->arch.vgic.lock);
891 offset = mmio->phys_addr - range->base - base; 1059 offset = mmio->phys_addr - range->base - base;
892 updated_state = range->handle_mmio(vcpu, mmio, offset); 1060 if (vgic_validate_access(dist, range, offset)) {
1061 updated_state = range->handle_mmio(vcpu, mmio, offset);
1062 } else {
1063 vgic_reg_access(mmio, NULL, offset,
1064 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
1065 updated_state = false;
1066 }
893 spin_unlock(&vcpu->kvm->arch.vgic.lock); 1067 spin_unlock(&vcpu->kvm->arch.vgic.lock);
894 kvm_prepare_mmio(run, mmio); 1068 kvm_prepare_mmio(run, mmio);
895 kvm_handle_mmio_return(vcpu, run); 1069 kvm_handle_mmio_return(vcpu, run);
@@ -900,6 +1074,11 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
900 return true; 1074 return true;
901} 1075}
902 1076
1077static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi)
1078{
1079 return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi;
1080}
1081
903static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) 1082static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
904{ 1083{
905 struct kvm *kvm = vcpu->kvm; 1084 struct kvm *kvm = vcpu->kvm;
@@ -932,8 +1111,8 @@ static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
932 kvm_for_each_vcpu(c, vcpu, kvm) { 1111 kvm_for_each_vcpu(c, vcpu, kvm) {
933 if (target_cpus & 1) { 1112 if (target_cpus & 1) {
934 /* Flag the SGI as pending */ 1113 /* Flag the SGI as pending */
935 vgic_dist_irq_set(vcpu, sgi); 1114 vgic_dist_irq_set_pending(vcpu, sgi);
936 dist->irq_sgi_sources[c][sgi] |= 1 << vcpu_id; 1115 *vgic_get_sgi_sources(dist, c, sgi) |= 1 << vcpu_id;
937 kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c); 1116 kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c);
938 } 1117 }
939 1118
@@ -941,32 +1120,38 @@ static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
941 } 1120 }
942} 1121}
943 1122
1123static int vgic_nr_shared_irqs(struct vgic_dist *dist)
1124{
1125 return dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
1126}
1127
944static int compute_pending_for_cpu(struct kvm_vcpu *vcpu) 1128static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
945{ 1129{
946 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 1130 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
947 unsigned long *pending, *enabled, *pend_percpu, *pend_shared; 1131 unsigned long *pending, *enabled, *pend_percpu, *pend_shared;
948 unsigned long pending_private, pending_shared; 1132 unsigned long pending_private, pending_shared;
1133 int nr_shared = vgic_nr_shared_irqs(dist);
949 int vcpu_id; 1134 int vcpu_id;
950 1135
951 vcpu_id = vcpu->vcpu_id; 1136 vcpu_id = vcpu->vcpu_id;
952 pend_percpu = vcpu->arch.vgic_cpu.pending_percpu; 1137 pend_percpu = vcpu->arch.vgic_cpu.pending_percpu;
953 pend_shared = vcpu->arch.vgic_cpu.pending_shared; 1138 pend_shared = vcpu->arch.vgic_cpu.pending_shared;
954 1139
955 pending = vgic_bitmap_get_cpu_map(&dist->irq_state, vcpu_id); 1140 pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id);
956 enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id); 1141 enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
957 bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS); 1142 bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS);
958 1143
959 pending = vgic_bitmap_get_shared_map(&dist->irq_state); 1144 pending = vgic_bitmap_get_shared_map(&dist->irq_pending);
960 enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled); 1145 enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
961 bitmap_and(pend_shared, pending, enabled, VGIC_NR_SHARED_IRQS); 1146 bitmap_and(pend_shared, pending, enabled, nr_shared);
962 bitmap_and(pend_shared, pend_shared, 1147 bitmap_and(pend_shared, pend_shared,
963 vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]), 1148 vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
964 VGIC_NR_SHARED_IRQS); 1149 nr_shared);
965 1150
966 pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS); 1151 pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS);
967 pending_shared = find_first_bit(pend_shared, VGIC_NR_SHARED_IRQS); 1152 pending_shared = find_first_bit(pend_shared, nr_shared);
968 return (pending_private < VGIC_NR_PRIVATE_IRQS || 1153 return (pending_private < VGIC_NR_PRIVATE_IRQS ||
969 pending_shared < VGIC_NR_SHARED_IRQS); 1154 pending_shared < vgic_nr_shared_irqs(dist));
970} 1155}
971 1156
972/* 1157/*
@@ -980,14 +1165,14 @@ static void vgic_update_state(struct kvm *kvm)
980 int c; 1165 int c;
981 1166
982 if (!dist->enabled) { 1167 if (!dist->enabled) {
983 set_bit(0, &dist->irq_pending_on_cpu); 1168 set_bit(0, dist->irq_pending_on_cpu);
984 return; 1169 return;
985 } 1170 }
986 1171
987 kvm_for_each_vcpu(c, vcpu, kvm) { 1172 kvm_for_each_vcpu(c, vcpu, kvm) {
988 if (compute_pending_for_cpu(vcpu)) { 1173 if (compute_pending_for_cpu(vcpu)) {
989 pr_debug("CPU%d has pending interrupts\n", c); 1174 pr_debug("CPU%d has pending interrupts\n", c);
990 set_bit(c, &dist->irq_pending_on_cpu); 1175 set_bit(c, dist->irq_pending_on_cpu);
991 } 1176 }
992 } 1177 }
993} 1178}
@@ -1079,8 +1264,8 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
1079 1264
1080 if (!vgic_irq_is_enabled(vcpu, vlr.irq)) { 1265 if (!vgic_irq_is_enabled(vcpu, vlr.irq)) {
1081 vgic_retire_lr(lr, vlr.irq, vcpu); 1266 vgic_retire_lr(lr, vlr.irq, vcpu);
1082 if (vgic_irq_is_active(vcpu, vlr.irq)) 1267 if (vgic_irq_is_queued(vcpu, vlr.irq))
1083 vgic_irq_clear_active(vcpu, vlr.irq); 1268 vgic_irq_clear_queued(vcpu, vlr.irq);
1084 } 1269 }
1085 } 1270 }
1086} 1271}
@@ -1092,13 +1277,14 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
1092static bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) 1277static bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
1093{ 1278{
1094 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 1279 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1280 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1095 struct vgic_lr vlr; 1281 struct vgic_lr vlr;
1096 int lr; 1282 int lr;
1097 1283
1098 /* Sanitize the input... */ 1284 /* Sanitize the input... */
1099 BUG_ON(sgi_source_id & ~7); 1285 BUG_ON(sgi_source_id & ~7);
1100 BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS); 1286 BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS);
1101 BUG_ON(irq >= VGIC_NR_IRQS); 1287 BUG_ON(irq >= dist->nr_irqs);
1102 1288
1103 kvm_debug("Queue IRQ%d\n", irq); 1289 kvm_debug("Queue IRQ%d\n", irq);
1104 1290
@@ -1144,14 +1330,14 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq)
1144 int vcpu_id = vcpu->vcpu_id; 1330 int vcpu_id = vcpu->vcpu_id;
1145 int c; 1331 int c;
1146 1332
1147 sources = dist->irq_sgi_sources[vcpu_id][irq]; 1333 sources = *vgic_get_sgi_sources(dist, vcpu_id, irq);
1148 1334
1149 for_each_set_bit(c, &sources, VGIC_MAX_CPUS) { 1335 for_each_set_bit(c, &sources, dist->nr_cpus) {
1150 if (vgic_queue_irq(vcpu, c, irq)) 1336 if (vgic_queue_irq(vcpu, c, irq))
1151 clear_bit(c, &sources); 1337 clear_bit(c, &sources);
1152 } 1338 }
1153 1339
1154 dist->irq_sgi_sources[vcpu_id][irq] = sources; 1340 *vgic_get_sgi_sources(dist, vcpu_id, irq) = sources;
1155 1341
1156 /* 1342 /*
1157 * If the sources bitmap has been cleared it means that we 1343 * If the sources bitmap has been cleared it means that we
@@ -1160,7 +1346,7 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq)
1160 * our emulated gic and can get rid of them. 1346 * our emulated gic and can get rid of them.
1161 */ 1347 */
1162 if (!sources) { 1348 if (!sources) {
1163 vgic_dist_irq_clear(vcpu, irq); 1349 vgic_dist_irq_clear_pending(vcpu, irq);
1164 vgic_cpu_irq_clear(vcpu, irq); 1350 vgic_cpu_irq_clear(vcpu, irq);
1165 return true; 1351 return true;
1166 } 1352 }
@@ -1170,15 +1356,15 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq)
1170 1356
1171static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq) 1357static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq)
1172{ 1358{
1173 if (vgic_irq_is_active(vcpu, irq)) 1359 if (!vgic_can_sample_irq(vcpu, irq))
1174 return true; /* level interrupt, already queued */ 1360 return true; /* level interrupt, already queued */
1175 1361
1176 if (vgic_queue_irq(vcpu, 0, irq)) { 1362 if (vgic_queue_irq(vcpu, 0, irq)) {
1177 if (vgic_irq_is_edge(vcpu, irq)) { 1363 if (vgic_irq_is_edge(vcpu, irq)) {
1178 vgic_dist_irq_clear(vcpu, irq); 1364 vgic_dist_irq_clear_pending(vcpu, irq);
1179 vgic_cpu_irq_clear(vcpu, irq); 1365 vgic_cpu_irq_clear(vcpu, irq);
1180 } else { 1366 } else {
1181 vgic_irq_set_active(vcpu, irq); 1367 vgic_irq_set_queued(vcpu, irq);
1182 } 1368 }
1183 1369
1184 return true; 1370 return true;
@@ -1223,7 +1409,7 @@ static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
1223 } 1409 }
1224 1410
1225 /* SPIs */ 1411 /* SPIs */
1226 for_each_set_bit(i, vgic_cpu->pending_shared, VGIC_NR_SHARED_IRQS) { 1412 for_each_set_bit(i, vgic_cpu->pending_shared, vgic_nr_shared_irqs(dist)) {
1227 if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS)) 1413 if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS))
1228 overflow = 1; 1414 overflow = 1;
1229 } 1415 }
@@ -1239,7 +1425,7 @@ epilog:
1239 * us. Claim we don't have anything pending. We'll 1425 * us. Claim we don't have anything pending. We'll
1240 * adjust that if needed while exiting. 1426 * adjust that if needed while exiting.
1241 */ 1427 */
1242 clear_bit(vcpu_id, &dist->irq_pending_on_cpu); 1428 clear_bit(vcpu_id, dist->irq_pending_on_cpu);
1243 } 1429 }
1244} 1430}
1245 1431
@@ -1261,17 +1447,32 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
1261 1447
1262 for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) { 1448 for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) {
1263 struct vgic_lr vlr = vgic_get_lr(vcpu, lr); 1449 struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
1450 WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
1264 1451
1265 vgic_irq_clear_active(vcpu, vlr.irq); 1452 vgic_irq_clear_queued(vcpu, vlr.irq);
1266 WARN_ON(vlr.state & LR_STATE_MASK); 1453 WARN_ON(vlr.state & LR_STATE_MASK);
1267 vlr.state = 0; 1454 vlr.state = 0;
1268 vgic_set_lr(vcpu, lr, vlr); 1455 vgic_set_lr(vcpu, lr, vlr);
1269 1456
1457 /*
1458 * If the IRQ was EOIed it was also ACKed and we
1459 * therefore assume we can clear the soft pending
1460 * state (should it have been set) for this interrupt.
1461 *
1462 * Note: if the IRQ soft pending state was set after
1463 * the IRQ was acked, it actually shouldn't be
1464 * cleared, but we have no way of knowing that unless
1465 * we start trapping ACKs when the soft-pending state
1466 * is set.
1467 */
1468 vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
1469
1270 /* Any additional pending interrupt? */ 1470 /* Any additional pending interrupt? */
1271 if (vgic_dist_irq_is_pending(vcpu, vlr.irq)) { 1471 if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
1272 vgic_cpu_irq_set(vcpu, vlr.irq); 1472 vgic_cpu_irq_set(vcpu, vlr.irq);
1273 level_pending = true; 1473 level_pending = true;
1274 } else { 1474 } else {
1475 vgic_dist_irq_clear_pending(vcpu, vlr.irq);
1275 vgic_cpu_irq_clear(vcpu, vlr.irq); 1476 vgic_cpu_irq_clear(vcpu, vlr.irq);
1276 } 1477 }
1277 1478
@@ -1315,14 +1516,14 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
1315 1516
1316 vlr = vgic_get_lr(vcpu, lr); 1517 vlr = vgic_get_lr(vcpu, lr);
1317 1518
1318 BUG_ON(vlr.irq >= VGIC_NR_IRQS); 1519 BUG_ON(vlr.irq >= dist->nr_irqs);
1319 vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY; 1520 vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
1320 } 1521 }
1321 1522
1322 /* Check if we still have something up our sleeve... */ 1523 /* Check if we still have something up our sleeve... */
1323 pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr); 1524 pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr);
1324 if (level_pending || pending < vgic->nr_lr) 1525 if (level_pending || pending < vgic->nr_lr)
1325 set_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu); 1526 set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
1326} 1527}
1327 1528
1328void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) 1529void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
@@ -1356,7 +1557,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
1356 if (!irqchip_in_kernel(vcpu->kvm)) 1557 if (!irqchip_in_kernel(vcpu->kvm))
1357 return 0; 1558 return 0;
1358 1559
1359 return test_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu); 1560 return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
1360} 1561}
1361 1562
1362static void vgic_kick_vcpus(struct kvm *kvm) 1563static void vgic_kick_vcpus(struct kvm *kvm)
@@ -1376,34 +1577,36 @@ static void vgic_kick_vcpus(struct kvm *kvm)
1376 1577
1377static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level) 1578static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
1378{ 1579{
1379 int is_edge = vgic_irq_is_edge(vcpu, irq); 1580 int edge_triggered = vgic_irq_is_edge(vcpu, irq);
1380 int state = vgic_dist_irq_is_pending(vcpu, irq);
1381 1581
1382 /* 1582 /*
1383 * Only inject an interrupt if: 1583 * Only inject an interrupt if:
1384 * - edge triggered and we have a rising edge 1584 * - edge triggered and we have a rising edge
1385 * - level triggered and we change level 1585 * - level triggered and we change level
1386 */ 1586 */
1387 if (is_edge) 1587 if (edge_triggered) {
1588 int state = vgic_dist_irq_is_pending(vcpu, irq);
1388 return level > state; 1589 return level > state;
1389 else 1590 } else {
1591 int state = vgic_dist_irq_get_level(vcpu, irq);
1390 return level != state; 1592 return level != state;
1593 }
1391} 1594}
1392 1595
1393static bool vgic_update_irq_state(struct kvm *kvm, int cpuid, 1596static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
1394 unsigned int irq_num, bool level) 1597 unsigned int irq_num, bool level)
1395{ 1598{
1396 struct vgic_dist *dist = &kvm->arch.vgic; 1599 struct vgic_dist *dist = &kvm->arch.vgic;
1397 struct kvm_vcpu *vcpu; 1600 struct kvm_vcpu *vcpu;
1398 int is_edge, is_level; 1601 int edge_triggered, level_triggered;
1399 int enabled; 1602 int enabled;
1400 bool ret = true; 1603 bool ret = true;
1401 1604
1402 spin_lock(&dist->lock); 1605 spin_lock(&dist->lock);
1403 1606
1404 vcpu = kvm_get_vcpu(kvm, cpuid); 1607 vcpu = kvm_get_vcpu(kvm, cpuid);
1405 is_edge = vgic_irq_is_edge(vcpu, irq_num); 1608 edge_triggered = vgic_irq_is_edge(vcpu, irq_num);
1406 is_level = !is_edge; 1609 level_triggered = !edge_triggered;
1407 1610
1408 if (!vgic_validate_injection(vcpu, irq_num, level)) { 1611 if (!vgic_validate_injection(vcpu, irq_num, level)) {
1409 ret = false; 1612 ret = false;
@@ -1417,10 +1620,19 @@ static bool vgic_update_irq_state(struct kvm *kvm, int cpuid,
1417 1620
1418 kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid); 1621 kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid);
1419 1622
1420 if (level) 1623 if (level) {
1421 vgic_dist_irq_set(vcpu, irq_num); 1624 if (level_triggered)
1422 else 1625 vgic_dist_irq_set_level(vcpu, irq_num);
1423 vgic_dist_irq_clear(vcpu, irq_num); 1626 vgic_dist_irq_set_pending(vcpu, irq_num);
1627 } else {
1628 if (level_triggered) {
1629 vgic_dist_irq_clear_level(vcpu, irq_num);
1630 if (!vgic_dist_irq_soft_pend(vcpu, irq_num))
1631 vgic_dist_irq_clear_pending(vcpu, irq_num);
1632 } else {
1633 vgic_dist_irq_clear_pending(vcpu, irq_num);
1634 }
1635 }
1424 1636
1425 enabled = vgic_irq_is_enabled(vcpu, irq_num); 1637 enabled = vgic_irq_is_enabled(vcpu, irq_num);
1426 1638
@@ -1429,7 +1641,7 @@ static bool vgic_update_irq_state(struct kvm *kvm, int cpuid,
1429 goto out; 1641 goto out;
1430 } 1642 }
1431 1643
1432 if (is_level && vgic_irq_is_active(vcpu, irq_num)) { 1644 if (!vgic_can_sample_irq(vcpu, irq_num)) {
1433 /* 1645 /*
1434 * Level interrupt in progress, will be picked up 1646 * Level interrupt in progress, will be picked up
1435 * when EOId. 1647 * when EOId.
@@ -1440,7 +1652,7 @@ static bool vgic_update_irq_state(struct kvm *kvm, int cpuid,
1440 1652
1441 if (level) { 1653 if (level) {
1442 vgic_cpu_irq_set(vcpu, irq_num); 1654 vgic_cpu_irq_set(vcpu, irq_num);
1443 set_bit(cpuid, &dist->irq_pending_on_cpu); 1655 set_bit(cpuid, dist->irq_pending_on_cpu);
1444 } 1656 }
1445 1657
1446out: 1658out:
@@ -1466,7 +1678,8 @@ out:
1466int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, 1678int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
1467 bool level) 1679 bool level)
1468{ 1680{
1469 if (vgic_update_irq_state(kvm, cpuid, irq_num, level)) 1681 if (likely(vgic_initialized(kvm)) &&
1682 vgic_update_irq_pending(kvm, cpuid, irq_num, level))
1470 vgic_kick_vcpus(kvm); 1683 vgic_kick_vcpus(kvm);
1471 1684
1472 return 0; 1685 return 0;
@@ -1483,6 +1696,32 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data)
1483 return IRQ_HANDLED; 1696 return IRQ_HANDLED;
1484} 1697}
1485 1698
1699void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
1700{
1701 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1702
1703 kfree(vgic_cpu->pending_shared);
1704 kfree(vgic_cpu->vgic_irq_lr_map);
1705 vgic_cpu->pending_shared = NULL;
1706 vgic_cpu->vgic_irq_lr_map = NULL;
1707}
1708
1709static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
1710{
1711 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1712
1713 int sz = (nr_irqs - VGIC_NR_PRIVATE_IRQS) / 8;
1714 vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
1715 vgic_cpu->vgic_irq_lr_map = kzalloc(nr_irqs, GFP_KERNEL);
1716
1717 if (!vgic_cpu->pending_shared || !vgic_cpu->vgic_irq_lr_map) {
1718 kvm_vgic_vcpu_destroy(vcpu);
1719 return -ENOMEM;
1720 }
1721
1722 return 0;
1723}
1724
1486/** 1725/**
1487 * kvm_vgic_vcpu_init - Initialize per-vcpu VGIC state 1726 * kvm_vgic_vcpu_init - Initialize per-vcpu VGIC state
1488 * @vcpu: pointer to the vcpu struct 1727 * @vcpu: pointer to the vcpu struct
@@ -1490,16 +1729,13 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data)
1490 * Initialize the vgic_cpu struct and vgic_dist struct fields pertaining to 1729 * Initialize the vgic_cpu struct and vgic_dist struct fields pertaining to
1491 * this vcpu and enable the VGIC for this VCPU 1730 * this vcpu and enable the VGIC for this VCPU
1492 */ 1731 */
1493int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) 1732static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
1494{ 1733{
1495 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 1734 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1496 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 1735 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1497 int i; 1736 int i;
1498 1737
1499 if (vcpu->vcpu_id >= VGIC_MAX_CPUS) 1738 for (i = 0; i < dist->nr_irqs; i++) {
1500 return -EBUSY;
1501
1502 for (i = 0; i < VGIC_NR_IRQS; i++) {
1503 if (i < VGIC_NR_PPIS) 1739 if (i < VGIC_NR_PPIS)
1504 vgic_bitmap_set_irq_val(&dist->irq_enabled, 1740 vgic_bitmap_set_irq_val(&dist->irq_enabled,
1505 vcpu->vcpu_id, i, 1); 1741 vcpu->vcpu_id, i, 1);
@@ -1518,84 +1754,112 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
1518 vgic_cpu->nr_lr = vgic->nr_lr; 1754 vgic_cpu->nr_lr = vgic->nr_lr;
1519 1755
1520 vgic_enable(vcpu); 1756 vgic_enable(vcpu);
1521
1522 return 0;
1523} 1757}
1524 1758
1525static void vgic_init_maintenance_interrupt(void *info) 1759void kvm_vgic_destroy(struct kvm *kvm)
1526{ 1760{
1527 enable_percpu_irq(vgic->maint_irq, 0); 1761 struct vgic_dist *dist = &kvm->arch.vgic;
1762 struct kvm_vcpu *vcpu;
1763 int i;
1764
1765 kvm_for_each_vcpu(i, vcpu, kvm)
1766 kvm_vgic_vcpu_destroy(vcpu);
1767
1768 vgic_free_bitmap(&dist->irq_enabled);
1769 vgic_free_bitmap(&dist->irq_level);
1770 vgic_free_bitmap(&dist->irq_pending);
1771 vgic_free_bitmap(&dist->irq_soft_pend);
1772 vgic_free_bitmap(&dist->irq_queued);
1773 vgic_free_bitmap(&dist->irq_cfg);
1774 vgic_free_bytemap(&dist->irq_priority);
1775 if (dist->irq_spi_target) {
1776 for (i = 0; i < dist->nr_cpus; i++)
1777 vgic_free_bitmap(&dist->irq_spi_target[i]);
1778 }
1779 kfree(dist->irq_sgi_sources);
1780 kfree(dist->irq_spi_cpu);
1781 kfree(dist->irq_spi_target);
1782 kfree(dist->irq_pending_on_cpu);
1783 dist->irq_sgi_sources = NULL;
1784 dist->irq_spi_cpu = NULL;
1785 dist->irq_spi_target = NULL;
1786 dist->irq_pending_on_cpu = NULL;
1528} 1787}
1529 1788
1530static int vgic_cpu_notify(struct notifier_block *self, 1789/*
1531 unsigned long action, void *cpu) 1790 * Allocate and initialize the various data structures. Must be called
1791 * with kvm->lock held!
1792 */
1793static int vgic_init_maps(struct kvm *kvm)
1532{ 1794{
1533 switch (action) { 1795 struct vgic_dist *dist = &kvm->arch.vgic;
1534 case CPU_STARTING: 1796 struct kvm_vcpu *vcpu;
1535 case CPU_STARTING_FROZEN: 1797 int nr_cpus, nr_irqs;
1536 vgic_init_maintenance_interrupt(NULL); 1798 int ret, i;
1537 break;
1538 case CPU_DYING:
1539 case CPU_DYING_FROZEN:
1540 disable_percpu_irq(vgic->maint_irq);
1541 break;
1542 }
1543 1799
1544 return NOTIFY_OK; 1800 if (dist->nr_cpus) /* Already allocated */
1545} 1801 return 0;
1546 1802
1547static struct notifier_block vgic_cpu_nb = { 1803 nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus);
1548 .notifier_call = vgic_cpu_notify, 1804 if (!nr_cpus) /* No vcpus? Can't be good... */
1549}; 1805 return -EINVAL;
1550 1806
1551static const struct of_device_id vgic_ids[] = { 1807 /*
1552 { .compatible = "arm,cortex-a15-gic", .data = vgic_v2_probe, }, 1808 * If nobody configured the number of interrupts, use the
1553 { .compatible = "arm,gic-v3", .data = vgic_v3_probe, }, 1809 * legacy one.
1554 {}, 1810 */
1555}; 1811 if (!dist->nr_irqs)
1812 dist->nr_irqs = VGIC_NR_IRQS_LEGACY;
1556 1813
1557int kvm_vgic_hyp_init(void) 1814 nr_irqs = dist->nr_irqs;
1558{
1559 const struct of_device_id *matched_id;
1560 int (*vgic_probe)(struct device_node *,const struct vgic_ops **,
1561 const struct vgic_params **);
1562 struct device_node *vgic_node;
1563 int ret;
1564 1815
1565 vgic_node = of_find_matching_node_and_match(NULL, 1816 ret = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs);
1566 vgic_ids, &matched_id); 1817 ret |= vgic_init_bitmap(&dist->irq_level, nr_cpus, nr_irqs);
1567 if (!vgic_node) { 1818 ret |= vgic_init_bitmap(&dist->irq_pending, nr_cpus, nr_irqs);
1568 kvm_err("error: no compatible GIC node found\n"); 1819 ret |= vgic_init_bitmap(&dist->irq_soft_pend, nr_cpus, nr_irqs);
1569 return -ENODEV; 1820 ret |= vgic_init_bitmap(&dist->irq_queued, nr_cpus, nr_irqs);
1570 } 1821 ret |= vgic_init_bitmap(&dist->irq_cfg, nr_cpus, nr_irqs);
1822 ret |= vgic_init_bytemap(&dist->irq_priority, nr_cpus, nr_irqs);
1571 1823
1572 vgic_probe = matched_id->data;
1573 ret = vgic_probe(vgic_node, &vgic_ops, &vgic);
1574 if (ret) 1824 if (ret)
1575 return ret; 1825 goto out;
1576 1826
1577 ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler, 1827 dist->irq_sgi_sources = kzalloc(nr_cpus * VGIC_NR_SGIS, GFP_KERNEL);
1578 "vgic", kvm_get_running_vcpus()); 1828 dist->irq_spi_cpu = kzalloc(nr_irqs - VGIC_NR_PRIVATE_IRQS, GFP_KERNEL);
1579 if (ret) { 1829 dist->irq_spi_target = kzalloc(sizeof(*dist->irq_spi_target) * nr_cpus,
1580 kvm_err("Cannot register interrupt %d\n", vgic->maint_irq); 1830 GFP_KERNEL);
1581 return ret; 1831 dist->irq_pending_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
1832 GFP_KERNEL);
1833 if (!dist->irq_sgi_sources ||
1834 !dist->irq_spi_cpu ||
1835 !dist->irq_spi_target ||
1836 !dist->irq_pending_on_cpu) {
1837 ret = -ENOMEM;
1838 goto out;
1582 } 1839 }
1583 1840
1584 ret = __register_cpu_notifier(&vgic_cpu_nb); 1841 for (i = 0; i < nr_cpus; i++)
1585 if (ret) { 1842 ret |= vgic_init_bitmap(&dist->irq_spi_target[i],
1586 kvm_err("Cannot register vgic CPU notifier\n"); 1843 nr_cpus, nr_irqs);
1587 goto out_free_irq;
1588 }
1589 1844
1590 /* Callback into for arch code for setup */ 1845 if (ret)
1591 vgic_arch_setup(vgic); 1846 goto out;
1592 1847
1593 on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1); 1848 kvm_for_each_vcpu(i, vcpu, kvm) {
1849 ret = vgic_vcpu_init_maps(vcpu, nr_irqs);
1850 if (ret) {
1851 kvm_err("VGIC: Failed to allocate vcpu memory\n");
1852 break;
1853 }
1854 }
1594 1855
1595 return 0; 1856 for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i += 4)
1857 vgic_set_target_reg(kvm, 0, i);
1858
1859out:
1860 if (ret)
1861 kvm_vgic_destroy(kvm);
1596 1862
1597out_free_irq:
1598 free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus());
1599 return ret; 1863 return ret;
1600} 1864}
1601 1865
@@ -1610,6 +1874,7 @@ out_free_irq:
1610 */ 1874 */
1611int kvm_vgic_init(struct kvm *kvm) 1875int kvm_vgic_init(struct kvm *kvm)
1612{ 1876{
1877 struct kvm_vcpu *vcpu;
1613 int ret = 0, i; 1878 int ret = 0, i;
1614 1879
1615 if (!irqchip_in_kernel(kvm)) 1880 if (!irqchip_in_kernel(kvm))
@@ -1627,6 +1892,12 @@ int kvm_vgic_init(struct kvm *kvm)
1627 goto out; 1892 goto out;
1628 } 1893 }
1629 1894
1895 ret = vgic_init_maps(kvm);
1896 if (ret) {
1897 kvm_err("Unable to allocate maps\n");
1898 goto out;
1899 }
1900
1630 ret = kvm_phys_addr_ioremap(kvm, kvm->arch.vgic.vgic_cpu_base, 1901 ret = kvm_phys_addr_ioremap(kvm, kvm->arch.vgic.vgic_cpu_base,
1631 vgic->vcpu_base, KVM_VGIC_V2_CPU_SIZE); 1902 vgic->vcpu_base, KVM_VGIC_V2_CPU_SIZE);
1632 if (ret) { 1903 if (ret) {
@@ -1634,11 +1905,13 @@ int kvm_vgic_init(struct kvm *kvm)
1634 goto out; 1905 goto out;
1635 } 1906 }
1636 1907
1637 for (i = VGIC_NR_PRIVATE_IRQS; i < VGIC_NR_IRQS; i += 4) 1908 kvm_for_each_vcpu(i, vcpu, kvm)
1638 vgic_set_target_reg(kvm, 0, i); 1909 kvm_vgic_vcpu_init(vcpu);
1639 1910
1640 kvm->arch.vgic.ready = true; 1911 kvm->arch.vgic.ready = true;
1641out: 1912out:
1913 if (ret)
1914 kvm_vgic_destroy(kvm);
1642 mutex_unlock(&kvm->lock); 1915 mutex_unlock(&kvm->lock);
1643 return ret; 1916 return ret;
1644} 1917}
@@ -1690,7 +1963,7 @@ out:
1690 return ret; 1963 return ret;
1691} 1964}
1692 1965
1693static bool vgic_ioaddr_overlap(struct kvm *kvm) 1966static int vgic_ioaddr_overlap(struct kvm *kvm)
1694{ 1967{
1695 phys_addr_t dist = kvm->arch.vgic.vgic_dist_base; 1968 phys_addr_t dist = kvm->arch.vgic.vgic_dist_base;
1696 phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base; 1969 phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base;
@@ -1879,6 +2152,10 @@ static int vgic_attr_regs_access(struct kvm_device *dev,
1879 2152
1880 mutex_lock(&dev->kvm->lock); 2153 mutex_lock(&dev->kvm->lock);
1881 2154
2155 ret = vgic_init_maps(dev->kvm);
2156 if (ret)
2157 goto out;
2158
1882 if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) { 2159 if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
1883 ret = -EINVAL; 2160 ret = -EINVAL;
1884 goto out; 2161 goto out;
@@ -1976,6 +2253,36 @@ static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1976 2253
1977 return vgic_attr_regs_access(dev, attr, &reg, true); 2254 return vgic_attr_regs_access(dev, attr, &reg, true);
1978 } 2255 }
2256 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
2257 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
2258 u32 val;
2259 int ret = 0;
2260
2261 if (get_user(val, uaddr))
2262 return -EFAULT;
2263
2264 /*
2265 * We require:
2266 * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
2267 * - at most 1024 interrupts
2268 * - a multiple of 32 interrupts
2269 */
2270 if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
2271 val > VGIC_MAX_IRQS ||
2272 (val & 31))
2273 return -EINVAL;
2274
2275 mutex_lock(&dev->kvm->lock);
2276
2277 if (vgic_initialized(dev->kvm) || dev->kvm->arch.vgic.nr_irqs)
2278 ret = -EBUSY;
2279 else
2280 dev->kvm->arch.vgic.nr_irqs = val;
2281
2282 mutex_unlock(&dev->kvm->lock);
2283
2284 return ret;
2285 }
1979 2286
1980 } 2287 }
1981 2288
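For context, a hedged sketch of how userspace might drive the new KVM_DEV_ARM_VGIC_GRP_NR_IRQS group, assuming the ARM uapi headers that accompany this series; vgic_fd is assumed to be the fd returned by KVM_CREATE_DEVICE for KVM_DEV_TYPE_ARM_VGIC_V2, and per the check above the attribute must be set before the vgic is initialized:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Program the number of IRQs: at least 64 (16 SGIs + 16 PPIs + 32 SPIs),
     * at most 1024, and a multiple of 32. */
    static int vgic_set_nr_irqs(int vgic_fd, uint32_t nr_irqs)
    {
            struct kvm_device_attr attr = {
                    .group = KVM_DEV_ARM_VGIC_GRP_NR_IRQS,
                    .attr  = 0,
                    .addr  = (uint64_t)(unsigned long)&nr_irqs,
            };

            return ioctl(vgic_fd, KVM_SET_DEVICE_ATTR, &attr);
    }
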
@@ -2012,6 +2319,11 @@ static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2012 r = put_user(reg, uaddr); 2319 r = put_user(reg, uaddr);
2013 break; 2320 break;
2014 } 2321 }
2322 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
2323 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
2324 r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr);
2325 break;
2326 }
2015 2327
2016 } 2328 }
2017 2329
@@ -2048,6 +2360,8 @@ static int vgic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2048 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: 2360 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
2049 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; 2361 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
2050 return vgic_has_attr_regs(vgic_cpu_ranges, offset); 2362 return vgic_has_attr_regs(vgic_cpu_ranges, offset);
2363 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
2364 return 0;
2051 } 2365 }
2052 return -ENXIO; 2366 return -ENXIO;
2053} 2367}
@@ -2062,7 +2376,7 @@ static int vgic_create(struct kvm_device *dev, u32 type)
2062 return kvm_vgic_create(dev->kvm); 2376 return kvm_vgic_create(dev->kvm);
2063} 2377}
2064 2378
2065struct kvm_device_ops kvm_arm_vgic_v2_ops = { 2379static struct kvm_device_ops kvm_arm_vgic_v2_ops = {
2066 .name = "kvm-arm-vgic", 2380 .name = "kvm-arm-vgic",
2067 .create = vgic_create, 2381 .create = vgic_create,
2068 .destroy = vgic_destroy, 2382 .destroy = vgic_destroy,
@@ -2070,3 +2384,81 @@ struct kvm_device_ops kvm_arm_vgic_v2_ops = {
2070 .get_attr = vgic_get_attr, 2384 .get_attr = vgic_get_attr,
2071 .has_attr = vgic_has_attr, 2385 .has_attr = vgic_has_attr,
2072}; 2386};
2387
2388static void vgic_init_maintenance_interrupt(void *info)
2389{
2390 enable_percpu_irq(vgic->maint_irq, 0);
2391}
2392
2393static int vgic_cpu_notify(struct notifier_block *self,
2394 unsigned long action, void *cpu)
2395{
2396 switch (action) {
2397 case CPU_STARTING:
2398 case CPU_STARTING_FROZEN:
2399 vgic_init_maintenance_interrupt(NULL);
2400 break;
2401 case CPU_DYING:
2402 case CPU_DYING_FROZEN:
2403 disable_percpu_irq(vgic->maint_irq);
2404 break;
2405 }
2406
2407 return NOTIFY_OK;
2408}
2409
2410static struct notifier_block vgic_cpu_nb = {
2411 .notifier_call = vgic_cpu_notify,
2412};
2413
2414static const struct of_device_id vgic_ids[] = {
2415 { .compatible = "arm,cortex-a15-gic", .data = vgic_v2_probe, },
2416 { .compatible = "arm,gic-v3", .data = vgic_v3_probe, },
2417 {},
2418};
2419
2420int kvm_vgic_hyp_init(void)
2421{
2422 const struct of_device_id *matched_id;
2423 const int (*vgic_probe)(struct device_node *,const struct vgic_ops **,
2424 const struct vgic_params **);
2425 struct device_node *vgic_node;
2426 int ret;
2427
2428 vgic_node = of_find_matching_node_and_match(NULL,
2429 vgic_ids, &matched_id);
2430 if (!vgic_node) {
2431 kvm_err("error: no compatible GIC node found\n");
2432 return -ENODEV;
2433 }
2434
2435 vgic_probe = matched_id->data;
2436 ret = vgic_probe(vgic_node, &vgic_ops, &vgic);
2437 if (ret)
2438 return ret;
2439
2440 ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler,
2441 "vgic", kvm_get_running_vcpus());
2442 if (ret) {
2443 kvm_err("Cannot register interrupt %d\n", vgic->maint_irq);
2444 return ret;
2445 }
2446
2447 ret = __register_cpu_notifier(&vgic_cpu_nb);
2448 if (ret) {
2449 kvm_err("Cannot register vgic CPU notifier\n");
2450 goto out_free_irq;
2451 }
2452
 2453 /* Callback into the arch code for setup */
2454 vgic_arch_setup(vgic);
2455
2456 on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1);
2457
2458 return kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
2459 KVM_DEV_TYPE_ARM_VGIC_V2);
2460
2461out_free_irq:
2462 free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus());
2463 return ret;
2464}
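
For context (not part of the patch): with kvm_arm_vgic_v2_ops now registered via kvm_register_device_ops() instead of being referenced directly by the core, the user-visible flow is unchanged: the VMM still instantiates the in-kernel GIC through the generic device API. A hedged sketch, where create_vgic_v2() is an illustrative helper and vm_fd is assumed to be a KVM VM file descriptor:

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    /* Returns the device fd (usable with KVM_SET_DEVICE_ATTR, e.g. for
     * the NR_IRQS attribute above), or -1 on error. */
    static int create_vgic_v2(int vm_fd)
    {
            struct kvm_create_device cd = {
                    .type  = KVM_DEV_TYPE_ARM_VGIC_V2,
                    .flags = 0,
            };

            if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
                    return -1;

            return cd.fd;
    }
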
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index d6a3d0993d88..5ff7f7f2689a 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -80,9 +80,7 @@ static void async_pf_execute(struct work_struct *work)
80 80
81 might_sleep(); 81 might_sleep();
82 82
83 down_read(&mm->mmap_sem); 83 kvm_get_user_page_io(NULL, mm, addr, 1, NULL);
84 get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
85 up_read(&mm->mmap_sem);
86 kvm_async_page_present_sync(vcpu, apf); 84 kvm_async_page_present_sync(vcpu, apf);
87 85
88 spin_lock(&vcpu->async_pf.lock); 86 spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 3c5981c87c3f..b0fb390943c6 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -36,7 +36,9 @@
36#include <linux/seqlock.h> 36#include <linux/seqlock.h>
37#include <trace/events/kvm.h> 37#include <trace/events/kvm.h>
38 38
39#include "irq.h" 39#ifdef __KVM_HAVE_IOAPIC
40#include "ioapic.h"
41#endif
40#include "iodev.h" 42#include "iodev.h"
41 43
42#ifdef CONFIG_HAVE_KVM_IRQFD 44#ifdef CONFIG_HAVE_KVM_IRQFD
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index e8ce34c9db32..0ba4057d271b 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -405,6 +405,26 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
405 spin_unlock(&ioapic->lock); 405 spin_unlock(&ioapic->lock);
406} 406}
407 407
408static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
409{
410 int i;
411 struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
412 eoi_inject.work);
413 spin_lock(&ioapic->lock);
414 for (i = 0; i < IOAPIC_NUM_PINS; i++) {
415 union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
416
417 if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
418 continue;
419
420 if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
421 ioapic_service(ioapic, i, false);
422 }
423 spin_unlock(&ioapic->lock);
424}
425
426#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
427
408static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, 428static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
409 struct kvm_ioapic *ioapic, int vector, int trigger_mode) 429 struct kvm_ioapic *ioapic, int vector, int trigger_mode)
410{ 430{
@@ -435,8 +455,26 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
435 455
436 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); 456 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
437 ent->fields.remote_irr = 0; 457 ent->fields.remote_irr = 0;
438 if (ioapic->irr & (1 << i)) 458 if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
439 ioapic_service(ioapic, i, false); 459 ++ioapic->irq_eoi[i];
460 if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
461 /*
462 * Real hardware does not deliver the interrupt
463 * immediately during eoi broadcast, and this
464 * lets a buggy guest make slow progress
465 * even if it does not correctly handle a
466 * level-triggered interrupt. Emulate this
467 * behavior if we detect an interrupt storm.
468 */
469 schedule_delayed_work(&ioapic->eoi_inject, HZ / 100);
470 ioapic->irq_eoi[i] = 0;
471 trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
472 } else {
473 ioapic_service(ioapic, i, false);
474 }
475 } else {
476 ioapic->irq_eoi[i] = 0;
477 }
440 } 478 }
441} 479}
442 480
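
The irq_eoi counter introduced above is a plain rate limiter: a level-triggered pin that keeps re-asserting across successive EOIs is serviced immediately, but after IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT back-to-back hits the re-injection is deferred to delayed work (HZ/100) so a guest stuck in an interrupt storm can still make progress. A stand-alone sketch of the same decision logic, using illustrative names (pin_throttle, eoi_reinject_now) rather than the kernel's:

    #include <stdbool.h>
    #include <stdint.h>

    #define SUCCESSIVE_IRQ_MAX_COUNT 10000  /* mirrors IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT */

    struct pin_throttle {
            uint32_t successive_eois;       /* plays the role of ioapic->irq_eoi[i] */
    };

    /* Called when an EOI arrives for a level-triggered pin.
     * true  -> service (re-inject) the interrupt right away;
     * false -> nothing is pending, or the storm threshold was hit and the
     *          caller should schedule a delayed re-injection instead (the
     *          kernel uses schedule_delayed_work(..., HZ / 100)). */
    static bool eoi_reinject_now(struct pin_throttle *pin, bool irq_still_pending)
    {
            if (!irq_still_pending) {
                    pin->successive_eois = 0;
                    return false;
            }

            if (++pin->successive_eois == SUCCESSIVE_IRQ_MAX_COUNT) {
                    pin->successive_eois = 0;
                    return false;           /* back off: delayed re-injection */
            }

            return true;
    }
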
@@ -565,12 +603,14 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
565{ 603{
566 int i; 604 int i;
567 605
606 cancel_delayed_work_sync(&ioapic->eoi_inject);
568 for (i = 0; i < IOAPIC_NUM_PINS; i++) 607 for (i = 0; i < IOAPIC_NUM_PINS; i++)
569 ioapic->redirtbl[i].fields.mask = 1; 608 ioapic->redirtbl[i].fields.mask = 1;
570 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; 609 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
571 ioapic->ioregsel = 0; 610 ioapic->ioregsel = 0;
572 ioapic->irr = 0; 611 ioapic->irr = 0;
573 ioapic->id = 0; 612 ioapic->id = 0;
613 memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
574 rtc_irq_eoi_tracking_reset(ioapic); 614 rtc_irq_eoi_tracking_reset(ioapic);
575 update_handled_vectors(ioapic); 615 update_handled_vectors(ioapic);
576} 616}
@@ -589,6 +629,7 @@ int kvm_ioapic_init(struct kvm *kvm)
589 if (!ioapic) 629 if (!ioapic)
590 return -ENOMEM; 630 return -ENOMEM;
591 spin_lock_init(&ioapic->lock); 631 spin_lock_init(&ioapic->lock);
632 INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
592 kvm->arch.vioapic = ioapic; 633 kvm->arch.vioapic = ioapic;
593 kvm_ioapic_reset(ioapic); 634 kvm_ioapic_reset(ioapic);
594 kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); 635 kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
@@ -609,6 +650,7 @@ void kvm_ioapic_destroy(struct kvm *kvm)
609{ 650{
610 struct kvm_ioapic *ioapic = kvm->arch.vioapic; 651 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
611 652
653 cancel_delayed_work_sync(&ioapic->eoi_inject);
612 if (ioapic) { 654 if (ioapic) {
613 kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); 655 kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
614 kvm->arch.vioapic = NULL; 656 kvm->arch.vioapic = NULL;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 90d43e95dcf8..e23b70634f1e 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -59,6 +59,8 @@ struct kvm_ioapic {
59 spinlock_t lock; 59 spinlock_t lock;
60 DECLARE_BITMAP(handled_vectors, 256); 60 DECLARE_BITMAP(handled_vectors, 256);
61 struct rtc_status rtc_status; 61 struct rtc_status rtc_status;
62 struct delayed_work eoi_inject;
63 u32 irq_eoi[IOAPIC_NUM_PINS];
62}; 64};
63 65
64#ifdef DEBUG 66#ifdef DEBUG
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 95519bc959ed..384eaa7b02fa 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -52,11 +52,13 @@
52 52
53#include <asm/processor.h> 53#include <asm/processor.h>
54#include <asm/io.h> 54#include <asm/io.h>
55#include <asm/ioctl.h>
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/pgtable.h> 57#include <asm/pgtable.h>
57 58
58#include "coalesced_mmio.h" 59#include "coalesced_mmio.h"
59#include "async_pf.h" 60#include "async_pf.h"
61#include "vfio.h"
60 62
61#define CREATE_TRACE_POINTS 63#define CREATE_TRACE_POINTS
62#include <trace/events/kvm.h> 64#include <trace/events/kvm.h>
@@ -95,8 +97,6 @@ static int hardware_enable_all(void);
95static void hardware_disable_all(void); 97static void hardware_disable_all(void);
96 98
97static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 99static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
98static void update_memslots(struct kvm_memslots *slots,
99 struct kvm_memory_slot *new, u64 last_generation);
100 100
101static void kvm_release_pfn_dirty(pfn_t pfn); 101static void kvm_release_pfn_dirty(pfn_t pfn);
102static void mark_page_dirty_in_slot(struct kvm *kvm, 102static void mark_page_dirty_in_slot(struct kvm *kvm,
@@ -129,7 +129,8 @@ int vcpu_load(struct kvm_vcpu *vcpu)
129 struct pid *oldpid = vcpu->pid; 129 struct pid *oldpid = vcpu->pid;
130 struct pid *newpid = get_task_pid(current, PIDTYPE_PID); 130 struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
131 rcu_assign_pointer(vcpu->pid, newpid); 131 rcu_assign_pointer(vcpu->pid, newpid);
132 synchronize_rcu(); 132 if (oldpid)
133 synchronize_rcu();
133 put_pid(oldpid); 134 put_pid(oldpid);
134 } 135 }
135 cpu = get_cpu(); 136 cpu = get_cpu();
@@ -152,7 +153,7 @@ static void ack_flush(void *_completed)
152{ 153{
153} 154}
154 155
155static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 156bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
156{ 157{
157 int i, cpu, me; 158 int i, cpu, me;
158 cpumask_var_t cpus; 159 cpumask_var_t cpus;
@@ -189,7 +190,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
189 long dirty_count = kvm->tlbs_dirty; 190 long dirty_count = kvm->tlbs_dirty;
190 191
191 smp_mb(); 192 smp_mb();
192 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 193 if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
193 ++kvm->stat.remote_tlb_flush; 194 ++kvm->stat.remote_tlb_flush;
194 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 195 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
195} 196}
@@ -197,17 +198,17 @@ EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
197 198
198void kvm_reload_remote_mmus(struct kvm *kvm) 199void kvm_reload_remote_mmus(struct kvm *kvm)
199{ 200{
200 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 201 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
201} 202}
202 203
203void kvm_make_mclock_inprogress_request(struct kvm *kvm) 204void kvm_make_mclock_inprogress_request(struct kvm *kvm)
204{ 205{
205 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); 206 kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
206} 207}
207 208
208void kvm_make_scan_ioapic_request(struct kvm *kvm) 209void kvm_make_scan_ioapic_request(struct kvm *kvm)
209{ 210{
210 make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); 211 kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
211} 212}
212 213
213int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 214int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
@@ -295,6 +296,9 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
295 kvm_flush_remote_tlbs(kvm); 296 kvm_flush_remote_tlbs(kvm);
296 297
297 spin_unlock(&kvm->mmu_lock); 298 spin_unlock(&kvm->mmu_lock);
299
300 kvm_arch_mmu_notifier_invalidate_page(kvm, address);
301
298 srcu_read_unlock(&kvm->srcu, idx); 302 srcu_read_unlock(&kvm->srcu, idx);
299} 303}
300 304
@@ -368,7 +372,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
368 372
369static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 373static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
370 struct mm_struct *mm, 374 struct mm_struct *mm,
371 unsigned long address) 375 unsigned long start,
376 unsigned long end)
372{ 377{
373 struct kvm *kvm = mmu_notifier_to_kvm(mn); 378 struct kvm *kvm = mmu_notifier_to_kvm(mn);
374 int young, idx; 379 int young, idx;
@@ -376,7 +381,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
376 idx = srcu_read_lock(&kvm->srcu); 381 idx = srcu_read_lock(&kvm->srcu);
377 spin_lock(&kvm->mmu_lock); 382 spin_lock(&kvm->mmu_lock);
378 383
379 young = kvm_age_hva(kvm, address); 384 young = kvm_age_hva(kvm, start, end);
380 if (young) 385 if (young)
381 kvm_flush_remote_tlbs(kvm); 386 kvm_flush_remote_tlbs(kvm);
382 387
@@ -476,6 +481,13 @@ static struct kvm *kvm_create_vm(unsigned long type)
476 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 481 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
477 if (!kvm->memslots) 482 if (!kvm->memslots)
478 goto out_err_no_srcu; 483 goto out_err_no_srcu;
484
485 /*
486 * Init kvm generation close to the maximum to easily test the
 487 * code that handles generation number wrap-around.
488 */
489 kvm->memslots->generation = -150;
490
479 kvm_init_memslots_id(kvm); 491 kvm_init_memslots_id(kvm);
480 if (init_srcu_struct(&kvm->srcu)) 492 if (init_srcu_struct(&kvm->srcu))
481 goto out_err_no_srcu; 493 goto out_err_no_srcu;
@@ -687,8 +699,7 @@ static void sort_memslots(struct kvm_memslots *slots)
687} 699}
688 700
689static void update_memslots(struct kvm_memslots *slots, 701static void update_memslots(struct kvm_memslots *slots,
690 struct kvm_memory_slot *new, 702 struct kvm_memory_slot *new)
691 u64 last_generation)
692{ 703{
693 if (new) { 704 if (new) {
694 int id = new->id; 705 int id = new->id;
@@ -699,15 +710,13 @@ static void update_memslots(struct kvm_memslots *slots,
699 if (new->npages != npages) 710 if (new->npages != npages)
700 sort_memslots(slots); 711 sort_memslots(slots);
701 } 712 }
702
703 slots->generation = last_generation + 1;
704} 713}
705 714
706static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) 715static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
707{ 716{
708 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 717 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
709 718
710#ifdef KVM_CAP_READONLY_MEM 719#ifdef __KVM_HAVE_READONLY_MEM
711 valid_flags |= KVM_MEM_READONLY; 720 valid_flags |= KVM_MEM_READONLY;
712#endif 721#endif
713 722
@@ -722,10 +731,24 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
722{ 731{
723 struct kvm_memslots *old_memslots = kvm->memslots; 732 struct kvm_memslots *old_memslots = kvm->memslots;
724 733
725 update_memslots(slots, new, kvm->memslots->generation); 734 /*
735 * Set the low bit in the generation, which disables SPTE caching
736 * until the end of synchronize_srcu_expedited.
737 */
738 WARN_ON(old_memslots->generation & 1);
739 slots->generation = old_memslots->generation + 1;
740
741 update_memslots(slots, new);
726 rcu_assign_pointer(kvm->memslots, slots); 742 rcu_assign_pointer(kvm->memslots, slots);
727 synchronize_srcu_expedited(&kvm->srcu); 743 synchronize_srcu_expedited(&kvm->srcu);
728 744
745 /*
746 * Increment the new memslot generation a second time. This prevents
747 * vm exits that race with memslot updates from caching a memslot
748 * generation that will (potentially) be valid forever.
749 */
750 slots->generation++;
751
729 kvm_arch_memslots_updated(kvm); 752 kvm_arch_memslots_updated(kvm);
730 753
731 return old_memslots; 754 return old_memslots;
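
The two generation bumps around synchronize_srcu_expedited() give readers a cheap "update in progress" signal: the generation is odd for the whole window in which old and new memslots can both be observed, and ends up even and strictly larger once the update is fully visible, so nothing cached under the transient value can ever match again. A small sketch of the reader-side check this enables; mmio_cache and mmio_cache_valid are illustrative names, not the kernel's:

    #include <stdbool.h>
    #include <stdint.h>

    struct mmio_cache {
            uint64_t generation;    /* generation captured when the entry was filled */
            /* ... cached gfn/pfn translation ... */
    };

    /* A cached translation is only usable if it was filled under the
     * current generation and that generation is even, i.e. no memslot
     * update was in flight at the time. */
    static bool mmio_cache_valid(const struct mmio_cache *c, uint64_t cur_generation)
    {
            if (cur_generation & 1)         /* update in progress: do not trust caches */
                    return false;

            return c->generation == cur_generation;
    }
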
@@ -776,7 +799,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
776 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 799 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
777 npages = mem->memory_size >> PAGE_SHIFT; 800 npages = mem->memory_size >> PAGE_SHIFT;
778 801
779 r = -EINVAL;
780 if (npages > KVM_MEM_MAX_NR_PAGES) 802 if (npages > KVM_MEM_MAX_NR_PAGES)
781 goto out; 803 goto out;
782 804
@@ -790,7 +812,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
790 new.npages = npages; 812 new.npages = npages;
791 new.flags = mem->flags; 813 new.flags = mem->flags;
792 814
793 r = -EINVAL;
794 if (npages) { 815 if (npages) {
795 if (!old.npages) 816 if (!old.npages)
796 change = KVM_MR_CREATE; 817 change = KVM_MR_CREATE;
@@ -846,7 +867,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
846 } 867 }
847 868
848 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { 869 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
849 r = -ENOMEM;
850 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), 870 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
851 GFP_KERNEL); 871 GFP_KERNEL);
852 if (!slots) 872 if (!slots)
@@ -1075,9 +1095,9 @@ EXPORT_SYMBOL_GPL(gfn_to_hva);
1075 * If writable is set to false, the hva returned by this function is only 1095 * If writable is set to false, the hva returned by this function is only
1076 * allowed to be read. 1096 * allowed to be read.
1077 */ 1097 */
1078unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) 1098unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
1099 gfn_t gfn, bool *writable)
1079{ 1100{
1080 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
1081 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); 1101 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
1082 1102
1083 if (!kvm_is_error_hva(hva) && writable) 1103 if (!kvm_is_error_hva(hva) && writable)
@@ -1086,6 +1106,13 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
1086 return hva; 1106 return hva;
1087} 1107}
1088 1108
1109unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
1110{
1111 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
1112
1113 return gfn_to_hva_memslot_prot(slot, gfn, writable);
1114}
1115
1089static int kvm_read_hva(void *data, void __user *hva, int len) 1116static int kvm_read_hva(void *data, void __user *hva, int len)
1090{ 1117{
1091 return __copy_from_user(data, hva, len); 1118 return __copy_from_user(data, hva, len);
@@ -1107,6 +1134,43 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
1107 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); 1134 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
1108} 1135}
1109 1136
1137int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
1138 unsigned long addr, bool write_fault,
1139 struct page **pagep)
1140{
1141 int npages;
1142 int locked = 1;
1143 int flags = FOLL_TOUCH | FOLL_HWPOISON |
1144 (pagep ? FOLL_GET : 0) |
1145 (write_fault ? FOLL_WRITE : 0);
1146
1147 /*
1148 * If retrying the fault, we get here *not* having allowed the filemap
1149 * to wait on the page lock. We should now allow waiting on the IO with
1150 * the mmap semaphore released.
1151 */
1152 down_read(&mm->mmap_sem);
1153 npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
1154 &locked);
1155 if (!locked) {
1156 VM_BUG_ON(npages);
1157
1158 if (!pagep)
1159 return 0;
1160
1161 /*
1162 * The previous call has now waited on the IO. Now we can
 1163 * retry and complete. Pass TRIED to ensure we do not
 1164 * reschedule async IO (see e.g. filemap_fault).
1165 */
1166 down_read(&mm->mmap_sem);
1167 npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
1168 pagep, NULL, NULL);
1169 }
1170 up_read(&mm->mmap_sem);
1171 return npages;
1172}
1173
1110static inline int check_user_page_hwpoison(unsigned long addr) 1174static inline int check_user_page_hwpoison(unsigned long addr)
1111{ 1175{
1112 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; 1176 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
@@ -1169,9 +1233,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1169 npages = get_user_page_nowait(current, current->mm, 1233 npages = get_user_page_nowait(current, current->mm,
1170 addr, write_fault, page); 1234 addr, write_fault, page);
1171 up_read(&current->mm->mmap_sem); 1235 up_read(&current->mm->mmap_sem);
1172 } else 1236 } else {
1173 npages = get_user_pages_fast(addr, 1, write_fault, 1237 /*
1174 page); 1238 * By now we have tried gup_fast, and possibly async_pf, and we
1239 * are certainly not atomic. Time to retry the gup, allowing
1240 * mmap semaphore to be relinquished in the case of IO.
1241 */
1242 npages = kvm_get_user_page_io(current, current->mm, addr,
1243 write_fault, page);
1244 }
1175 if (npages != 1) 1245 if (npages != 1)
1176 return npages; 1246 return npages;
1177 1247
@@ -1768,8 +1838,7 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
1768 bool eligible; 1838 bool eligible;
1769 1839
1770 eligible = !vcpu->spin_loop.in_spin_loop || 1840 eligible = !vcpu->spin_loop.in_spin_loop ||
1771 (vcpu->spin_loop.in_spin_loop && 1841 vcpu->spin_loop.dy_eligible;
1772 vcpu->spin_loop.dy_eligible);
1773 1842
1774 if (vcpu->spin_loop.in_spin_loop) 1843 if (vcpu->spin_loop.in_spin_loop)
1775 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 1844 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
@@ -1975,6 +2044,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
1975 if (vcpu->kvm->mm != current->mm) 2044 if (vcpu->kvm->mm != current->mm)
1976 return -EIO; 2045 return -EIO;
1977 2046
2047 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
2048 return -EINVAL;
2049
1978#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) 2050#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
1979 /* 2051 /*
1980 * Special cases: vcpu ioctls that are asynchronous to vcpu execution, 2052 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
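
The new _IOC_TYPE() test bounces ioctls whose command number was not encoded with the KVMIO magic before the vcpu mutex is taken, so a stray ioctl aimed at the wrong file descriptor now fails fast with -EINVAL. A tiny user-space illustration of what the check inspects, assuming only the standard uapi headers:

    #include <linux/kvm.h>
    #include <stdio.h>

    int main(void)
    {
            /* Every KVM ioctl is built with the KVMIO (0xAE) magic, so
             * _IOC_TYPE() of a genuine command matches KVMIO, while e.g.
             * a terminal ioctl issued against a vcpu fd would not. */
            printf("_IOC_TYPE(KVM_RUN) = %#x, KVMIO = %#x\n",
                   (unsigned int)_IOC_TYPE(KVM_RUN), (unsigned int)KVMIO);
            return 0;
    }
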
@@ -2259,6 +2331,29 @@ struct kvm_device *kvm_device_from_filp(struct file *filp)
2259 return filp->private_data; 2331 return filp->private_data;
2260} 2332}
2261 2333
2334static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
2335#ifdef CONFIG_KVM_MPIC
2336 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
2337 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
2338#endif
2339
2340#ifdef CONFIG_KVM_XICS
2341 [KVM_DEV_TYPE_XICS] = &kvm_xics_ops,
2342#endif
2343};
2344
2345int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
2346{
2347 if (type >= ARRAY_SIZE(kvm_device_ops_table))
2348 return -ENOSPC;
2349
2350 if (kvm_device_ops_table[type] != NULL)
2351 return -EEXIST;
2352
2353 kvm_device_ops_table[type] = ops;
2354 return 0;
2355}
2356
2262static int kvm_ioctl_create_device(struct kvm *kvm, 2357static int kvm_ioctl_create_device(struct kvm *kvm,
2263 struct kvm_create_device *cd) 2358 struct kvm_create_device *cd)
2264{ 2359{
@@ -2267,36 +2362,12 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
2267 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 2362 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
2268 int ret; 2363 int ret;
2269 2364
2270 switch (cd->type) { 2365 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
2271#ifdef CONFIG_KVM_MPIC 2366 return -ENODEV;
2272 case KVM_DEV_TYPE_FSL_MPIC_20: 2367
2273 case KVM_DEV_TYPE_FSL_MPIC_42: 2368 ops = kvm_device_ops_table[cd->type];
2274 ops = &kvm_mpic_ops; 2369 if (ops == NULL)
2275 break;
2276#endif
2277#ifdef CONFIG_KVM_XICS
2278 case KVM_DEV_TYPE_XICS:
2279 ops = &kvm_xics_ops;
2280 break;
2281#endif
2282#ifdef CONFIG_KVM_VFIO
2283 case KVM_DEV_TYPE_VFIO:
2284 ops = &kvm_vfio_ops;
2285 break;
2286#endif
2287#ifdef CONFIG_KVM_ARM_VGIC
2288 case KVM_DEV_TYPE_ARM_VGIC_V2:
2289 ops = &kvm_arm_vgic_v2_ops;
2290 break;
2291#endif
2292#ifdef CONFIG_S390
2293 case KVM_DEV_TYPE_FLIC:
2294 ops = &kvm_flic_ops;
2295 break;
2296#endif
2297 default:
2298 return -ENODEV; 2370 return -ENODEV;
2299 }
2300 2371
2301 if (test) 2372 if (test)
2302 return 0; 2373 return 0;
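
Replacing the switch with a table makes device-type handling data-driven: built-in types are listed statically, other backends (the vGIC above, VFIO below) claim their slot at init time with kvm_register_device_ops(), and KVM_CREATE_DEVICE reduces to a bounds check plus a lookup. A condensed user-space sketch of that register/lookup pairing, with simplified, illustrative types (device_ops, register_device_ops, lookup_device_ops):

    #include <errno.h>
    #include <stddef.h>

    struct device_ops {
            const char *name;
            /* create/destroy/set_attr/... callbacks elided */
    };

    #define DEV_TYPE_MAX 16
    static struct device_ops *ops_table[DEV_TYPE_MAX];

    /* Same shape as kvm_register_device_ops(): refuse out-of-range or
     * already-claimed slots, otherwise remember the ops for create time. */
    static int register_device_ops(struct device_ops *ops, unsigned int type)
    {
            if (type >= DEV_TYPE_MAX)
                    return -ENOSPC;
            if (ops_table[type] != NULL)
                    return -EEXIST;

            ops_table[type] = ops;
            return 0;
    }

    /* What KVM_CREATE_DEVICE now does: NULL here maps to -ENODEV. */
    static struct device_ops *lookup_device_ops(unsigned int type)
    {
            if (type >= DEV_TYPE_MAX)
                    return NULL;

            return ops_table[type];
    }
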
@@ -2611,7 +2682,6 @@ static long kvm_dev_ioctl(struct file *filp,
2611 2682
2612 switch (ioctl) { 2683 switch (ioctl) {
2613 case KVM_GET_API_VERSION: 2684 case KVM_GET_API_VERSION:
2614 r = -EINVAL;
2615 if (arg) 2685 if (arg)
2616 goto out; 2686 goto out;
2617 r = KVM_API_VERSION; 2687 r = KVM_API_VERSION;
@@ -2623,7 +2693,6 @@ static long kvm_dev_ioctl(struct file *filp,
2623 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 2693 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
2624 break; 2694 break;
2625 case KVM_GET_VCPU_MMAP_SIZE: 2695 case KVM_GET_VCPU_MMAP_SIZE:
2626 r = -EINVAL;
2627 if (arg) 2696 if (arg)
2628 goto out; 2697 goto out;
2629 r = PAGE_SIZE; /* struct kvm_run */ 2698 r = PAGE_SIZE; /* struct kvm_run */
@@ -2668,7 +2737,7 @@ static void hardware_enable_nolock(void *junk)
2668 2737
2669 cpumask_set_cpu(cpu, cpus_hardware_enabled); 2738 cpumask_set_cpu(cpu, cpus_hardware_enabled);
2670 2739
2671 r = kvm_arch_hardware_enable(NULL); 2740 r = kvm_arch_hardware_enable();
2672 2741
2673 if (r) { 2742 if (r) {
2674 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2743 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
@@ -2693,7 +2762,7 @@ static void hardware_disable_nolock(void *junk)
2693 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2762 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
2694 return; 2763 return;
2695 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2764 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2696 kvm_arch_hardware_disable(NULL); 2765 kvm_arch_hardware_disable();
2697} 2766}
2698 2767
2699static void hardware_disable(void) 2768static void hardware_disable(void)
@@ -3123,6 +3192,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
3123 if (vcpu->preempted) 3192 if (vcpu->preempted)
3124 vcpu->preempted = false; 3193 vcpu->preempted = false;
3125 3194
3195 kvm_arch_sched_in(vcpu, cpu);
3196
3126 kvm_arch_vcpu_load(vcpu, cpu); 3197 kvm_arch_vcpu_load(vcpu, cpu);
3127} 3198}
3128 3199
@@ -3214,6 +3285,9 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
3214 goto out_undebugfs; 3285 goto out_undebugfs;
3215 } 3286 }
3216 3287
3288 r = kvm_vfio_ops_init();
3289 WARN_ON(r);
3290
3217 return 0; 3291 return 0;
3218 3292
3219out_undebugfs: 3293out_undebugfs:
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index ba1a93f935c7..281e7cf2b8e5 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -18,6 +18,7 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/uaccess.h> 19#include <linux/uaccess.h>
20#include <linux/vfio.h> 20#include <linux/vfio.h>
21#include "vfio.h"
21 22
22struct kvm_vfio_group { 23struct kvm_vfio_group {
23 struct list_head node; 24 struct list_head node;
@@ -246,6 +247,16 @@ static void kvm_vfio_destroy(struct kvm_device *dev)
246 kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */ 247 kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */
247} 248}
248 249
250static int kvm_vfio_create(struct kvm_device *dev, u32 type);
251
252static struct kvm_device_ops kvm_vfio_ops = {
253 .name = "kvm-vfio",
254 .create = kvm_vfio_create,
255 .destroy = kvm_vfio_destroy,
256 .set_attr = kvm_vfio_set_attr,
257 .has_attr = kvm_vfio_has_attr,
258};
259
249static int kvm_vfio_create(struct kvm_device *dev, u32 type) 260static int kvm_vfio_create(struct kvm_device *dev, u32 type)
250{ 261{
251 struct kvm_device *tmp; 262 struct kvm_device *tmp;
@@ -268,10 +279,7 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type)
268 return 0; 279 return 0;
269} 280}
270 281
271struct kvm_device_ops kvm_vfio_ops = { 282int kvm_vfio_ops_init(void)
272 .name = "kvm-vfio", 283{
273 .create = kvm_vfio_create, 284 return kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO);
274 .destroy = kvm_vfio_destroy, 285}
275 .set_attr = kvm_vfio_set_attr,
276 .has_attr = kvm_vfio_has_attr,
277};
diff --git a/virt/kvm/vfio.h b/virt/kvm/vfio.h
new file mode 100644
index 000000000000..92eac75d6b62
--- /dev/null
+++ b/virt/kvm/vfio.h
@@ -0,0 +1,13 @@
1#ifndef __KVM_VFIO_H
2#define __KVM_VFIO_H
3
4#ifdef CONFIG_KVM_VFIO
5int kvm_vfio_ops_init(void);
6#else
7static inline int kvm_vfio_ops_init(void)
8{
9 return 0;
10}
11#endif
12
13#endif