aboutsummaryrefslogtreecommitdiffstats
path: root/virt/kvm
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2012-10-04 12:30:33 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-10-04 12:30:33 -0400
commit ecefbd94b834fa32559d854646d777c56749ef1c (patch)
tree ca8958900ad9e208a8e5fb7704f1b66dc76131b4 /virt/kvm
parent ce57e981f2b996aaca2031003b3f866368307766 (diff)
parent 3d11df7abbff013b811d5615320580cd5d9d7d31 (diff)
Merge tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Avi Kivity: "Highlights of the changes for this release include support for vfio level triggered interrupts, improved big real mode support on older Intels, a streamlined guest page table walker, guest APIC speedups, PIO optimizations, better overcommit handling, and read-only memory." * tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (138 commits) KVM: s390: Fix vcpu_load handling in interrupt code KVM: x86: Fix guest debug across vcpu INIT reset KVM: Add resampling irqfds for level triggered interrupts KVM: optimize apic interrupt delivery KVM: MMU: Eliminate pointless temporary 'ac' KVM: MMU: Avoid access/dirty update loop if all is well KVM: MMU: Eliminate eperm temporary KVM: MMU: Optimize is_last_gpte() KVM: MMU: Simplify walk_addr_generic() loop KVM: MMU: Optimize pte permission checks KVM: MMU: Update accessed and dirty bits after guest pagetable walk KVM: MMU: Move gpte_access() out of paging_tmpl.h KVM: MMU: Optimize gpte_access() slightly KVM: MMU: Push clean gpte write protection out of gpte_access() KVM: clarify kvmclock documentation KVM: make processes waiting on vcpu mutex killable KVM: SVM: Make use of asm.h KVM: VMX: Make use of asm.h KVM: VMX: Make lto-friendly KVM: x86: lapic: Clean up find_highest_vector() and count_vectors() ... Conflicts: arch/s390/include/asm/processor.h arch/x86/kvm/i8259.c
Diffstat (limited to 'virt/kvm')
-rw-r--r-- virt/kvm/Kconfig 3
-rw-r--r-- virt/kvm/async_pf.c 11
-rw-r--r-- virt/kvm/eventfd.c 150
-rw-r--r-- virt/kvm/ioapic.c 37
-rw-r--r-- virt/kvm/iommu.c 16
-rw-r--r-- virt/kvm/irq_comm.c 17
-rw-r--r-- virt/kvm/kvm_main.c 541
7 files changed, 525 insertions, 250 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 28694f4a9139..d01b24b72c61 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -21,3 +21,6 @@ config KVM_ASYNC_PF
21 21
22config HAVE_KVM_MSI 22config HAVE_KVM_MSI
23 bool 23 bool
24
25config HAVE_KVM_CPU_RELAX_INTERCEPT
26 bool
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 74268b4c2ee1..ea475cd03511 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -111,8 +111,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
111 list_entry(vcpu->async_pf.done.next, 111 list_entry(vcpu->async_pf.done.next,
112 typeof(*work), link); 112 typeof(*work), link);
113 list_del(&work->link); 113 list_del(&work->link);
114 if (work->page) 114 if (!is_error_page(work->page))
115 put_page(work->page); 115 kvm_release_page_clean(work->page);
116 kmem_cache_free(async_pf_cache, work); 116 kmem_cache_free(async_pf_cache, work);
117 } 117 }
118 spin_unlock(&vcpu->async_pf.lock); 118 spin_unlock(&vcpu->async_pf.lock);
@@ -138,8 +138,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
138 138
139 list_del(&work->queue); 139 list_del(&work->queue);
140 vcpu->async_pf.queued--; 140 vcpu->async_pf.queued--;
141 if (work->page) 141 if (!is_error_page(work->page))
142 put_page(work->page); 142 kvm_release_page_clean(work->page);
143 kmem_cache_free(async_pf_cache, work); 143 kmem_cache_free(async_pf_cache, work);
144 } 144 }
145} 145}
@@ -203,8 +203,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
203 if (!work) 203 if (!work)
204 return -ENOMEM; 204 return -ENOMEM;
205 205
206 work->page = bad_page; 206 work->page = KVM_ERR_PTR_BAD_PAGE;
207 get_page(bad_page);
208 INIT_LIST_HEAD(&work->queue); /* for list_del to work */ 207 INIT_LIST_HEAD(&work->queue); /* for list_del to work */
209 208
210 spin_lock(&vcpu->async_pf.lock); 209 spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 67a35e90384c..9718e98d6d2a 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -43,6 +43,31 @@
43 * -------------------------------------------------------------------- 43 * --------------------------------------------------------------------
44 */ 44 */
45 45
46/*
47 * Resampling irqfds are a special variety of irqfds used to emulate
48 * level triggered interrupts. The interrupt is asserted on eventfd
49 * trigger. On acknowledgement through the irq ack notifier, the
50 * interrupt is de-asserted and userspace is notified through the
51 * resamplefd. All resamplers on the same gsi are de-asserted
52 * together, so we don't need to track the state of each individual
53 * user. We can also therefore share the same irq source ID.
54 */
55struct _irqfd_resampler {
56 struct kvm *kvm;
57 /*
58 * List of resampling struct _irqfd objects sharing this gsi.
59 * RCU list modified under kvm->irqfds.resampler_lock
60 */
61 struct list_head list;
62 struct kvm_irq_ack_notifier notifier;
63 /*
64 * Entry in list of kvm->irqfds.resampler_list. Use for sharing
65 * resamplers among irqfds on the same gsi.
66 * Accessed and modified under kvm->irqfds.resampler_lock
67 */
68 struct list_head link;
69};
70
46struct _irqfd { 71struct _irqfd {
47 /* Used for MSI fast-path */ 72 /* Used for MSI fast-path */
48 struct kvm *kvm; 73 struct kvm *kvm;
@@ -52,6 +77,12 @@ struct _irqfd {
52 /* Used for level IRQ fast-path */ 77 /* Used for level IRQ fast-path */
53 int gsi; 78 int gsi;
54 struct work_struct inject; 79 struct work_struct inject;
80 /* The resampler used by this irqfd (resampler-only) */
81 struct _irqfd_resampler *resampler;
82 /* Eventfd notified on resample (resampler-only) */
83 struct eventfd_ctx *resamplefd;
84 /* Entry in list of irqfds for a resampler (resampler-only) */
85 struct list_head resampler_link;
55 /* Used for setup/shutdown */ 86 /* Used for setup/shutdown */
56 struct eventfd_ctx *eventfd; 87 struct eventfd_ctx *eventfd;
57 struct list_head list; 88 struct list_head list;
@@ -67,8 +98,58 @@ irqfd_inject(struct work_struct *work)
67 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); 98 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
68 struct kvm *kvm = irqfd->kvm; 99 struct kvm *kvm = irqfd->kvm;
69 100
70 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); 101 if (!irqfd->resampler) {
71 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); 102 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
103 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
104 } else
105 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
106 irqfd->gsi, 1);
107}
108
109/*
110 * Since resampler irqfds share an IRQ source ID, we de-assert once
111 * then notify all of the resampler irqfds using this GSI. We can't
112 * do multiple de-asserts or we risk racing with incoming re-asserts.
113 */
114static void
115irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
116{
117 struct _irqfd_resampler *resampler;
118 struct _irqfd *irqfd;
119
120 resampler = container_of(kian, struct _irqfd_resampler, notifier);
121
122 kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
123 resampler->notifier.gsi, 0);
124
125 rcu_read_lock();
126
127 list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
128 eventfd_signal(irqfd->resamplefd, 1);
129
130 rcu_read_unlock();
131}
132
133static void
134irqfd_resampler_shutdown(struct _irqfd *irqfd)
135{
136 struct _irqfd_resampler *resampler = irqfd->resampler;
137 struct kvm *kvm = resampler->kvm;
138
139 mutex_lock(&kvm->irqfds.resampler_lock);
140
141 list_del_rcu(&irqfd->resampler_link);
142 synchronize_rcu();
143
144 if (list_empty(&resampler->list)) {
145 list_del(&resampler->link);
146 kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
147 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
148 resampler->notifier.gsi, 0);
149 kfree(resampler);
150 }
151
152 mutex_unlock(&kvm->irqfds.resampler_lock);
72} 153}
73 154
74/* 155/*
@@ -92,6 +173,11 @@ irqfd_shutdown(struct work_struct *work)
92 */ 173 */
93 flush_work(&irqfd->inject); 174 flush_work(&irqfd->inject);
94 175
176 if (irqfd->resampler) {
177 irqfd_resampler_shutdown(irqfd);
178 eventfd_ctx_put(irqfd->resamplefd);
179 }
180
95 /* 181 /*
96 * It is now safe to release the object's resources 182 * It is now safe to release the object's resources
97 */ 183 */
@@ -203,7 +289,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
203 struct kvm_irq_routing_table *irq_rt; 289 struct kvm_irq_routing_table *irq_rt;
204 struct _irqfd *irqfd, *tmp; 290 struct _irqfd *irqfd, *tmp;
205 struct file *file = NULL; 291 struct file *file = NULL;
206 struct eventfd_ctx *eventfd = NULL; 292 struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
207 int ret; 293 int ret;
208 unsigned int events; 294 unsigned int events;
209 295
@@ -231,6 +317,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
231 317
232 irqfd->eventfd = eventfd; 318 irqfd->eventfd = eventfd;
233 319
320 if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
321 struct _irqfd_resampler *resampler;
322
323 resamplefd = eventfd_ctx_fdget(args->resamplefd);
324 if (IS_ERR(resamplefd)) {
325 ret = PTR_ERR(resamplefd);
326 goto fail;
327 }
328
329 irqfd->resamplefd = resamplefd;
330 INIT_LIST_HEAD(&irqfd->resampler_link);
331
332 mutex_lock(&kvm->irqfds.resampler_lock);
333
334 list_for_each_entry(resampler,
335 &kvm->irqfds.resampler_list, list) {
336 if (resampler->notifier.gsi == irqfd->gsi) {
337 irqfd->resampler = resampler;
338 break;
339 }
340 }
341
342 if (!irqfd->resampler) {
343 resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
344 if (!resampler) {
345 ret = -ENOMEM;
346 mutex_unlock(&kvm->irqfds.resampler_lock);
347 goto fail;
348 }
349
350 resampler->kvm = kvm;
351 INIT_LIST_HEAD(&resampler->list);
352 resampler->notifier.gsi = irqfd->gsi;
353 resampler->notifier.irq_acked = irqfd_resampler_ack;
354 INIT_LIST_HEAD(&resampler->link);
355
356 list_add(&resampler->link, &kvm->irqfds.resampler_list);
357 kvm_register_irq_ack_notifier(kvm,
358 &resampler->notifier);
359 irqfd->resampler = resampler;
360 }
361
362 list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
363 synchronize_rcu();
364
365 mutex_unlock(&kvm->irqfds.resampler_lock);
366 }
367
234 /* 368 /*
235 * Install our own custom wake-up handling so we are notified via 369 * Install our own custom wake-up handling so we are notified via
236 * a callback whenever someone signals the underlying eventfd 370 * a callback whenever someone signals the underlying eventfd
@@ -276,6 +410,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
276 return 0; 410 return 0;
277 411
278fail: 412fail:
413 if (irqfd->resampler)
414 irqfd_resampler_shutdown(irqfd);
415
416 if (resamplefd && !IS_ERR(resamplefd))
417 eventfd_ctx_put(resamplefd);
418
279 if (eventfd && !IS_ERR(eventfd)) 419 if (eventfd && !IS_ERR(eventfd))
280 eventfd_ctx_put(eventfd); 420 eventfd_ctx_put(eventfd);
281 421
@@ -291,6 +431,8 @@ kvm_eventfd_init(struct kvm *kvm)
291{ 431{
292 spin_lock_init(&kvm->irqfds.lock); 432 spin_lock_init(&kvm->irqfds.lock);
293 INIT_LIST_HEAD(&kvm->irqfds.items); 433 INIT_LIST_HEAD(&kvm->irqfds.items);
434 INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
435 mutex_init(&kvm->irqfds.resampler_lock);
294 INIT_LIST_HEAD(&kvm->ioeventfds); 436 INIT_LIST_HEAD(&kvm->ioeventfds);
295} 437}
296 438
@@ -340,7 +482,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
340int 482int
341kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) 483kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
342{ 484{
343 if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN) 485 if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
344 return -EINVAL; 486 return -EINVAL;
345 487
346 if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) 488 if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index ef61d529a6c4..cfb7e4d52dc2 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -197,28 +197,29 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
197 u32 old_irr; 197 u32 old_irr;
198 u32 mask = 1 << irq; 198 u32 mask = 1 << irq;
199 union kvm_ioapic_redirect_entry entry; 199 union kvm_ioapic_redirect_entry entry;
200 int ret = 1; 200 int ret, irq_level;
201
202 BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
201 203
202 spin_lock(&ioapic->lock); 204 spin_lock(&ioapic->lock);
203 old_irr = ioapic->irr; 205 old_irr = ioapic->irr;
204 if (irq >= 0 && irq < IOAPIC_NUM_PINS) { 206 irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
205 int irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], 207 irq_source_id, level);
206 irq_source_id, level); 208 entry = ioapic->redirtbl[irq];
207 entry = ioapic->redirtbl[irq]; 209 irq_level ^= entry.fields.polarity;
208 irq_level ^= entry.fields.polarity; 210 if (!irq_level) {
209 if (!irq_level) 211 ioapic->irr &= ~mask;
210 ioapic->irr &= ~mask; 212 ret = 1;
211 else { 213 } else {
212 int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); 214 int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
213 ioapic->irr |= mask; 215 ioapic->irr |= mask;
214 if ((edge && old_irr != ioapic->irr) || 216 if ((edge && old_irr != ioapic->irr) ||
215 (!edge && !entry.fields.remote_irr)) 217 (!edge && !entry.fields.remote_irr))
216 ret = ioapic_service(ioapic, irq); 218 ret = ioapic_service(ioapic, irq);
217 else 219 else
218 ret = 0; /* report coalesced interrupt */ 220 ret = 0; /* report coalesced interrupt */
219 }
220 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
221 } 221 }
222 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
222 spin_unlock(&ioapic->lock); 223 spin_unlock(&ioapic->lock);
223 224
224 return ret; 225 return ret;
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index e9fff9830bf0..037cb6730e68 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -42,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
42static void kvm_iommu_put_pages(struct kvm *kvm, 42static void kvm_iommu_put_pages(struct kvm *kvm,
43 gfn_t base_gfn, unsigned long npages); 43 gfn_t base_gfn, unsigned long npages);
44 44
45static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, 45static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
46 gfn_t gfn, unsigned long size) 46 unsigned long size)
47{ 47{
48 gfn_t end_gfn; 48 gfn_t end_gfn;
49 pfn_t pfn; 49 pfn_t pfn;
50 50
51 pfn = gfn_to_pfn_memslot(kvm, slot, gfn); 51 pfn = gfn_to_pfn_memslot(slot, gfn);
52 end_gfn = gfn + (size >> PAGE_SHIFT); 52 end_gfn = gfn + (size >> PAGE_SHIFT);
53 gfn += 1; 53 gfn += 1;
54 54
@@ -56,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
56 return pfn; 56 return pfn;
57 57
58 while (gfn < end_gfn) 58 while (gfn < end_gfn)
59 gfn_to_pfn_memslot(kvm, slot, gfn++); 59 gfn_to_pfn_memslot(slot, gfn++);
60 60
61 return pfn; 61 return pfn;
62} 62}
@@ -105,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
105 * Pin all pages we are about to map in memory. This is 105 * Pin all pages we are about to map in memory. This is
106 * important because we unmap and unpin in 4kb steps later. 106 * important because we unmap and unpin in 4kb steps later.
107 */ 107 */
108 pfn = kvm_pin_pages(kvm, slot, gfn, page_size); 108 pfn = kvm_pin_pages(slot, gfn, page_size);
109 if (is_error_pfn(pfn)) { 109 if (is_error_pfn(pfn)) {
110 gfn += 1; 110 gfn += 1;
111 continue; 111 continue;
@@ -300,6 +300,12 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
300 300
301 /* Get physical address */ 301 /* Get physical address */
302 phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); 302 phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
303
304 if (!phys) {
305 gfn++;
306 continue;
307 }
308
303 pfn = phys >> PAGE_SHIFT; 309 pfn = phys >> PAGE_SHIFT;
304 310
305 /* Unmap address from IO address space */ 311 /* Unmap address from IO address space */
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 83402d74a767..2eb58af7ee99 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -68,8 +68,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
68 struct kvm_vcpu *vcpu, *lowest = NULL; 68 struct kvm_vcpu *vcpu, *lowest = NULL;
69 69
70 if (irq->dest_mode == 0 && irq->dest_id == 0xff && 70 if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
71 kvm_is_dm_lowest_prio(irq)) 71 kvm_is_dm_lowest_prio(irq)) {
72 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); 72 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
73 irq->delivery_mode = APIC_DM_FIXED;
74 }
75
76 if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
77 return r;
73 78
74 kvm_for_each_vcpu(i, vcpu, kvm) { 79 kvm_for_each_vcpu(i, vcpu, kvm) {
75 if (!kvm_apic_present(vcpu)) 80 if (!kvm_apic_present(vcpu))
@@ -223,6 +228,9 @@ int kvm_request_irq_source_id(struct kvm *kvm)
223 } 228 }
224 229
225 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 230 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
231#ifdef CONFIG_X86
232 ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
233#endif
226 set_bit(irq_source_id, bitmap); 234 set_bit(irq_source_id, bitmap);
227unlock: 235unlock:
228 mutex_unlock(&kvm->irq_lock); 236 mutex_unlock(&kvm->irq_lock);
@@ -233,6 +241,9 @@ unlock:
233void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) 241void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
234{ 242{
235 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 243 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
244#ifdef CONFIG_X86
245 ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
246#endif
236 247
237 mutex_lock(&kvm->irq_lock); 248 mutex_lock(&kvm->irq_lock);
238 if (irq_source_id < 0 || 249 if (irq_source_id < 0 ||
@@ -321,11 +332,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
321 switch (ue->u.irqchip.irqchip) { 332 switch (ue->u.irqchip.irqchip) {
322 case KVM_IRQCHIP_PIC_MASTER: 333 case KVM_IRQCHIP_PIC_MASTER:
323 e->set = kvm_set_pic_irq; 334 e->set = kvm_set_pic_irq;
324 max_pin = 16; 335 max_pin = PIC_NUM_PINS;
325 break; 336 break;
326 case KVM_IRQCHIP_PIC_SLAVE: 337 case KVM_IRQCHIP_PIC_SLAVE:
327 e->set = kvm_set_pic_irq; 338 e->set = kvm_set_pic_irq;
328 max_pin = 16; 339 max_pin = PIC_NUM_PINS;
329 delta = 8; 340 delta = 8;
330 break; 341 break;
331 case KVM_IRQCHIP_IOAPIC: 342 case KVM_IRQCHIP_IOAPIC:
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d617f69131d7..c353b4599cec 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -100,13 +100,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
100 100
101static bool largepages_enabled = true; 101static bool largepages_enabled = true;
102 102
103static struct page *hwpoison_page; 103bool kvm_is_mmio_pfn(pfn_t pfn)
104static pfn_t hwpoison_pfn;
105
106struct page *fault_page;
107pfn_t fault_pfn;
108
109inline int kvm_is_mmio_pfn(pfn_t pfn)
110{ 104{
111 if (pfn_valid(pfn)) { 105 if (pfn_valid(pfn)) {
112 int reserved; 106 int reserved;
@@ -137,11 +131,12 @@ inline int kvm_is_mmio_pfn(pfn_t pfn)
137/* 131/*
138 * Switches to specified vcpu, until a matching vcpu_put() 132 * Switches to specified vcpu, until a matching vcpu_put()
139 */ 133 */
140void vcpu_load(struct kvm_vcpu *vcpu) 134int vcpu_load(struct kvm_vcpu *vcpu)
141{ 135{
142 int cpu; 136 int cpu;
143 137
144 mutex_lock(&vcpu->mutex); 138 if (mutex_lock_killable(&vcpu->mutex))
139 return -EINTR;
145 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { 140 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
146 /* The thread running this VCPU changed. */ 141 /* The thread running this VCPU changed. */
147 struct pid *oldpid = vcpu->pid; 142 struct pid *oldpid = vcpu->pid;
@@ -154,6 +149,7 @@ void vcpu_load(struct kvm_vcpu *vcpu)
154 preempt_notifier_register(&vcpu->preempt_notifier); 149 preempt_notifier_register(&vcpu->preempt_notifier);
155 kvm_arch_vcpu_load(vcpu, cpu); 150 kvm_arch_vcpu_load(vcpu, cpu);
156 put_cpu(); 151 put_cpu();
152 return 0;
157} 153}
158 154
159void vcpu_put(struct kvm_vcpu *vcpu) 155void vcpu_put(struct kvm_vcpu *vcpu)
@@ -236,6 +232,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
236 } 232 }
237 vcpu->run = page_address(page); 233 vcpu->run = page_address(page);
238 234
235 kvm_vcpu_set_in_spin_loop(vcpu, false);
236 kvm_vcpu_set_dy_eligible(vcpu, false);
237
239 r = kvm_arch_vcpu_init(vcpu); 238 r = kvm_arch_vcpu_init(vcpu);
240 if (r < 0) 239 if (r < 0)
241 goto fail_free_run; 240 goto fail_free_run;
@@ -332,8 +331,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
332 * count is also read inside the mmu_lock critical section. 331 * count is also read inside the mmu_lock critical section.
333 */ 332 */
334 kvm->mmu_notifier_count++; 333 kvm->mmu_notifier_count++;
335 for (; start < end; start += PAGE_SIZE) 334 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
336 need_tlb_flush |= kvm_unmap_hva(kvm, start);
337 need_tlb_flush |= kvm->tlbs_dirty; 335 need_tlb_flush |= kvm->tlbs_dirty;
338 /* we've to flush the tlb before the pages can be freed */ 336 /* we've to flush the tlb before the pages can be freed */
339 if (need_tlb_flush) 337 if (need_tlb_flush)
@@ -412,7 +410,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
412 int idx; 410 int idx;
413 411
414 idx = srcu_read_lock(&kvm->srcu); 412 idx = srcu_read_lock(&kvm->srcu);
415 kvm_arch_flush_shadow(kvm); 413 kvm_arch_flush_shadow_all(kvm);
416 srcu_read_unlock(&kvm->srcu, idx); 414 srcu_read_unlock(&kvm->srcu, idx);
417} 415}
418 416
@@ -551,16 +549,12 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
551static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 549static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
552 struct kvm_memory_slot *dont) 550 struct kvm_memory_slot *dont)
553{ 551{
554 if (!dont || free->rmap != dont->rmap)
555 vfree(free->rmap);
556
557 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 552 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
558 kvm_destroy_dirty_bitmap(free); 553 kvm_destroy_dirty_bitmap(free);
559 554
560 kvm_arch_free_memslot(free, dont); 555 kvm_arch_free_memslot(free, dont);
561 556
562 free->npages = 0; 557 free->npages = 0;
563 free->rmap = NULL;
564} 558}
565 559
566void kvm_free_physmem(struct kvm *kvm) 560void kvm_free_physmem(struct kvm *kvm)
@@ -590,7 +584,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
590#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 584#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
591 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 585 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
592#else 586#else
593 kvm_arch_flush_shadow(kvm); 587 kvm_arch_flush_shadow_all(kvm);
594#endif 588#endif
595 kvm_arch_destroy_vm(kvm); 589 kvm_arch_destroy_vm(kvm);
596 kvm_free_physmem(kvm); 590 kvm_free_physmem(kvm);
@@ -686,6 +680,20 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
686 slots->generation++; 680 slots->generation++;
687} 681}
688 682
683static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
684{
685 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
686
687#ifdef KVM_CAP_READONLY_MEM
688 valid_flags |= KVM_MEM_READONLY;
689#endif
690
691 if (mem->flags & ~valid_flags)
692 return -EINVAL;
693
694 return 0;
695}
696
689/* 697/*
690 * Allocate some memory and give it an address in the guest physical address 698 * Allocate some memory and give it an address in the guest physical address
691 * space. 699 * space.
@@ -706,6 +714,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
706 struct kvm_memory_slot old, new; 714 struct kvm_memory_slot old, new;
707 struct kvm_memslots *slots, *old_memslots; 715 struct kvm_memslots *slots, *old_memslots;
708 716
717 r = check_memory_region_flags(mem);
718 if (r)
719 goto out;
720
709 r = -EINVAL; 721 r = -EINVAL;
710 /* General sanity checks */ 722 /* General sanity checks */
711 if (mem->memory_size & (PAGE_SIZE - 1)) 723 if (mem->memory_size & (PAGE_SIZE - 1))
@@ -769,11 +781,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
769 if (npages && !old.npages) { 781 if (npages && !old.npages) {
770 new.user_alloc = user_alloc; 782 new.user_alloc = user_alloc;
771 new.userspace_addr = mem->userspace_addr; 783 new.userspace_addr = mem->userspace_addr;
772#ifndef CONFIG_S390 784
773 new.rmap = vzalloc(npages * sizeof(*new.rmap));
774 if (!new.rmap)
775 goto out_free;
776#endif /* not defined CONFIG_S390 */
777 if (kvm_arch_create_memslot(&new, npages)) 785 if (kvm_arch_create_memslot(&new, npages))
778 goto out_free; 786 goto out_free;
779 } 787 }
@@ -785,7 +793,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
785 /* destroy any largepage mappings for dirty tracking */ 793 /* destroy any largepage mappings for dirty tracking */
786 } 794 }
787 795
788 if (!npages) { 796 if (!npages || base_gfn != old.base_gfn) {
789 struct kvm_memory_slot *slot; 797 struct kvm_memory_slot *slot;
790 798
791 r = -ENOMEM; 799 r = -ENOMEM;
@@ -801,14 +809,14 @@ int __kvm_set_memory_region(struct kvm *kvm,
801 old_memslots = kvm->memslots; 809 old_memslots = kvm->memslots;
802 rcu_assign_pointer(kvm->memslots, slots); 810 rcu_assign_pointer(kvm->memslots, slots);
803 synchronize_srcu_expedited(&kvm->srcu); 811 synchronize_srcu_expedited(&kvm->srcu);
804 /* From this point no new shadow pages pointing to a deleted 812 /* From this point no new shadow pages pointing to a deleted,
805 * memslot will be created. 813 * or moved, memslot will be created.
806 * 814 *
807 * validation of sp->gfn happens in: 815 * validation of sp->gfn happens in:
808 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 816 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
809 * - kvm_is_visible_gfn (mmu_check_roots) 817 * - kvm_is_visible_gfn (mmu_check_roots)
810 */ 818 */
811 kvm_arch_flush_shadow(kvm); 819 kvm_arch_flush_shadow_memslot(kvm, slot);
812 kfree(old_memslots); 820 kfree(old_memslots);
813 } 821 }
814 822
@@ -832,7 +840,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
832 840
833 /* actual memory is freed via old in kvm_free_physmem_slot below */ 841 /* actual memory is freed via old in kvm_free_physmem_slot below */
834 if (!npages) { 842 if (!npages) {
835 new.rmap = NULL;
836 new.dirty_bitmap = NULL; 843 new.dirty_bitmap = NULL;
837 memset(&new.arch, 0, sizeof(new.arch)); 844 memset(&new.arch, 0, sizeof(new.arch));
838 } 845 }
@@ -844,13 +851,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
844 851
845 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); 852 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
846 853
847 /*
848 * If the new memory slot is created, we need to clear all
849 * mmio sptes.
850 */
851 if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
852 kvm_arch_flush_shadow(kvm);
853
854 kvm_free_physmem_slot(&old, &new); 854 kvm_free_physmem_slot(&old, &new);
855 kfree(old_memslots); 855 kfree(old_memslots);
856 856
@@ -932,53 +932,6 @@ void kvm_disable_largepages(void)
932} 932}
933EXPORT_SYMBOL_GPL(kvm_disable_largepages); 933EXPORT_SYMBOL_GPL(kvm_disable_largepages);
934 934
935int is_error_page(struct page *page)
936{
937 return page == bad_page || page == hwpoison_page || page == fault_page;
938}
939EXPORT_SYMBOL_GPL(is_error_page);
940
941int is_error_pfn(pfn_t pfn)
942{
943 return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
944}
945EXPORT_SYMBOL_GPL(is_error_pfn);
946
947int is_hwpoison_pfn(pfn_t pfn)
948{
949 return pfn == hwpoison_pfn;
950}
951EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
952
953int is_fault_pfn(pfn_t pfn)
954{
955 return pfn == fault_pfn;
956}
957EXPORT_SYMBOL_GPL(is_fault_pfn);
958
959int is_noslot_pfn(pfn_t pfn)
960{
961 return pfn == bad_pfn;
962}
963EXPORT_SYMBOL_GPL(is_noslot_pfn);
964
965int is_invalid_pfn(pfn_t pfn)
966{
967 return pfn == hwpoison_pfn || pfn == fault_pfn;
968}
969EXPORT_SYMBOL_GPL(is_invalid_pfn);
970
971static inline unsigned long bad_hva(void)
972{
973 return PAGE_OFFSET;
974}
975
976int kvm_is_error_hva(unsigned long addr)
977{
978 return addr == bad_hva();
979}
980EXPORT_SYMBOL_GPL(kvm_is_error_hva);
981
982struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 935struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
983{ 936{
984 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 937 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -1021,28 +974,62 @@ out:
1021 return size; 974 return size;
1022} 975}
1023 976
1024static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 977static bool memslot_is_readonly(struct kvm_memory_slot *slot)
1025 gfn_t *nr_pages) 978{
979 return slot->flags & KVM_MEM_READONLY;
980}
981
982static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
983 gfn_t *nr_pages, bool write)
1026{ 984{
1027 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 985 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1028 return bad_hva(); 986 return KVM_HVA_ERR_BAD;
987
988 if (memslot_is_readonly(slot) && write)
989 return KVM_HVA_ERR_RO_BAD;
1029 990
1030 if (nr_pages) 991 if (nr_pages)
1031 *nr_pages = slot->npages - (gfn - slot->base_gfn); 992 *nr_pages = slot->npages - (gfn - slot->base_gfn);
1032 993
1033 return gfn_to_hva_memslot(slot, gfn); 994 return __gfn_to_hva_memslot(slot, gfn);
1034} 995}
1035 996
997static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
998 gfn_t *nr_pages)
999{
1000 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
1001}
1002
1003unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
1004 gfn_t gfn)
1005{
1006 return gfn_to_hva_many(slot, gfn, NULL);
1007}
1008EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
1009
1036unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1010unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1037{ 1011{
1038 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 1012 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1039} 1013}
1040EXPORT_SYMBOL_GPL(gfn_to_hva); 1014EXPORT_SYMBOL_GPL(gfn_to_hva);
1041 1015
1042static pfn_t get_fault_pfn(void) 1016/*
1017 * The hva returned by this function is only allowed to be read.
1018 * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
1019 */
1020static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
1021{
1022 return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
1023}
1024
1025static int kvm_read_hva(void *data, void __user *hva, int len)
1043{ 1026{
1044 get_page(fault_page); 1027 return __copy_from_user(data, hva, len);
1045 return fault_pfn; 1028}
1029
1030static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
1031{
1032 return __copy_from_user_inatomic(data, hva, len);
1046} 1033}
1047 1034
1048int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, 1035int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
@@ -1065,108 +1052,186 @@ static inline int check_user_page_hwpoison(unsigned long addr)
1065 return rc == -EHWPOISON; 1052 return rc == -EHWPOISON;
1066} 1053}
1067 1054
1068static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, 1055/*
1069 bool *async, bool write_fault, bool *writable) 1056 * The atomic path to get the writable pfn which will be stored in @pfn,
1057 * true indicates success, otherwise false is returned.
1058 */
1059static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
1060 bool write_fault, bool *writable, pfn_t *pfn)
1070{ 1061{
1071 struct page *page[1]; 1062 struct page *page[1];
1072 int npages = 0; 1063 int npages;
1073 pfn_t pfn;
1074 1064
1075 /* we can do it either atomically or asynchronously, not both */ 1065 if (!(async || atomic))
1076 BUG_ON(atomic && async); 1066 return false;
1077 1067
1078 BUG_ON(!write_fault && !writable); 1068 /*
1069 * Fast pin a writable pfn only if it is a write fault request
1070 * or the caller allows to map a writable pfn for a read fault
1071 * request.
1072 */
1073 if (!(write_fault || writable))
1074 return false;
1079 1075
1080 if (writable) 1076 npages = __get_user_pages_fast(addr, 1, 1, page);
1081 *writable = true; 1077 if (npages == 1) {
1078 *pfn = page_to_pfn(page[0]);
1082 1079
1083 if (atomic || async) 1080 if (writable)
1084 npages = __get_user_pages_fast(addr, 1, 1, page); 1081 *writable = true;
1082 return true;
1083 }
1085 1084
1086 if (unlikely(npages != 1) && !atomic) { 1085 return false;
1087 might_sleep(); 1086}
1088 1087
1089 if (writable) 1088/*
1090 *writable = write_fault; 1089 * The slow path to get the pfn of the specified host virtual address,
1090 * 1 indicates success, -errno is returned if error is detected.
1091 */
1092static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1093 bool *writable, pfn_t *pfn)
1094{
1095 struct page *page[1];
1096 int npages = 0;
1091 1097
1092 if (async) { 1098 might_sleep();
1093 down_read(&current->mm->mmap_sem); 1099
1094 npages = get_user_page_nowait(current, current->mm, 1100 if (writable)
1095 addr, write_fault, page); 1101 *writable = write_fault;
1096 up_read(&current->mm->mmap_sem); 1102
1097 } else 1103 if (async) {
1098 npages = get_user_pages_fast(addr, 1, write_fault, 1104 down_read(&current->mm->mmap_sem);
1099 page); 1105 npages = get_user_page_nowait(current, current->mm,
1100 1106 addr, write_fault, page);
1101 /* map read fault as writable if possible */ 1107 up_read(&current->mm->mmap_sem);
1102 if (unlikely(!write_fault) && npages == 1) { 1108 } else
1103 struct page *wpage[1]; 1109 npages = get_user_pages_fast(addr, 1, write_fault,
1104 1110 page);
1105 npages = __get_user_pages_fast(addr, 1, 1, wpage); 1111 if (npages != 1)
1106 if (npages == 1) { 1112 return npages;
1107 *writable = true; 1113
1108 put_page(page[0]); 1114 /* map read fault as writable if possible */
1109 page[0] = wpage[0]; 1115 if (unlikely(!write_fault) && writable) {
1110 } 1116 struct page *wpage[1];
1111 npages = 1; 1117
1118 npages = __get_user_pages_fast(addr, 1, 1, wpage);
1119 if (npages == 1) {
1120 *writable = true;
1121 put_page(page[0]);
1122 page[0] = wpage[0];
1112 } 1123 }
1124
1125 npages = 1;
1113 } 1126 }
1127 *pfn = page_to_pfn(page[0]);
1128 return npages;
1129}
1114 1130
1115 if (unlikely(npages != 1)) { 1131static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
1116 struct vm_area_struct *vma; 1132{
1133 if (unlikely(!(vma->vm_flags & VM_READ)))
1134 return false;
1117 1135
1118 if (atomic) 1136 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
1119 return get_fault_pfn(); 1137 return false;
1120 1138
1121 down_read(&current->mm->mmap_sem); 1139 return true;
1122 if (npages == -EHWPOISON || 1140}
1123 (!async && check_user_page_hwpoison(addr))) {
1124 up_read(&current->mm->mmap_sem);
1125 get_page(hwpoison_page);
1126 return page_to_pfn(hwpoison_page);
1127 }
1128 1141
1129 vma = find_vma_intersection(current->mm, addr, addr+1); 1142/*
1130 1143 * Pin guest page in memory and return its pfn.
1131 if (vma == NULL) 1144 * @addr: host virtual address which maps memory to the guest
1132 pfn = get_fault_pfn(); 1145 * @atomic: whether this function is forbidden from sleeping
1133 else if ((vma->vm_flags & VM_PFNMAP)) { 1146 * @async: whether this function needs to wait for IO to complete if the
1134 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + 1147 * host page is not in memory
1135 vma->vm_pgoff; 1148 * @write_fault: whether we should get a writable host page
1136 BUG_ON(!kvm_is_mmio_pfn(pfn)); 1149 * @writable: whether it allows to map a writable host page for !@write_fault
1137 } else { 1150 *
1138 if (async && (vma->vm_flags & VM_WRITE)) 1151 * The function will map a writable host page for these two cases:
1139 *async = true; 1152 * 1): @write_fault = true
1140 pfn = get_fault_pfn(); 1153 * 2): @write_fault = false && @writable, @writable will tell the caller
1141 } 1154 * whether the mapping is writable.
1142 up_read(&current->mm->mmap_sem); 1155 */
1143 } else 1156static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1144 pfn = page_to_pfn(page[0]); 1157 bool write_fault, bool *writable)
1158{
1159 struct vm_area_struct *vma;
1160 pfn_t pfn = 0;
1161 int npages;
1162
1163 /* we can do it either atomically or asynchronously, not both */
1164 BUG_ON(atomic && async);
1145 1165
1166 if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
1167 return pfn;
1168
1169 if (atomic)
1170 return KVM_PFN_ERR_FAULT;
1171
1172 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
1173 if (npages == 1)
1174 return pfn;
1175
1176 down_read(&current->mm->mmap_sem);
1177 if (npages == -EHWPOISON ||
1178 (!async && check_user_page_hwpoison(addr))) {
1179 pfn = KVM_PFN_ERR_HWPOISON;
1180 goto exit;
1181 }
1182
1183 vma = find_vma_intersection(current->mm, addr, addr + 1);
1184
1185 if (vma == NULL)
1186 pfn = KVM_PFN_ERR_FAULT;
1187 else if ((vma->vm_flags & VM_PFNMAP)) {
1188 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1189 vma->vm_pgoff;
1190 BUG_ON(!kvm_is_mmio_pfn(pfn));
1191 } else {
1192 if (async && vma_is_valid(vma, write_fault))
1193 *async = true;
1194 pfn = KVM_PFN_ERR_FAULT;
1195 }
1196exit:
1197 up_read(&current->mm->mmap_sem);
1146 return pfn; 1198 return pfn;
1147} 1199}
1148 1200
1149pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) 1201static pfn_t
1202__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
1203 bool *async, bool write_fault, bool *writable)
1150{ 1204{
1151 return hva_to_pfn(kvm, addr, true, NULL, true, NULL); 1205 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
1206
1207 if (addr == KVM_HVA_ERR_RO_BAD)
1208 return KVM_PFN_ERR_RO_FAULT;
1209
1210 if (kvm_is_error_hva(addr))
1211 return KVM_PFN_ERR_BAD;
1212
1213 /* Do not map writable pfn in the readonly memslot. */
1214 if (writable && memslot_is_readonly(slot)) {
1215 *writable = false;
1216 writable = NULL;
1217 }
1218
1219 return hva_to_pfn(addr, atomic, async, write_fault,
1220 writable);
1152} 1221}
1153EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
1154 1222
1155static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, 1223static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1156 bool write_fault, bool *writable) 1224 bool write_fault, bool *writable)
1157{ 1225{
1158 unsigned long addr; 1226 struct kvm_memory_slot *slot;
1159 1227
1160 if (async) 1228 if (async)
1161 *async = false; 1229 *async = false;
1162 1230
1163 addr = gfn_to_hva(kvm, gfn); 1231 slot = gfn_to_memslot(kvm, gfn);
1164 if (kvm_is_error_hva(addr)) {
1165 get_page(bad_page);
1166 return page_to_pfn(bad_page);
1167 }
1168 1232
1169 return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); 1233 return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
1234 writable);
1170} 1235}
1171 1236
1172pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1237pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1195,12 +1260,16 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1195} 1260}
1196EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1261EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1197 1262
1198pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 1263pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
1199 struct kvm_memory_slot *slot, gfn_t gfn) 1264{
1265 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
1266}
1267
1268pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
1200{ 1269{
1201 unsigned long addr = gfn_to_hva_memslot(slot, gfn); 1270 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
1202 return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
1203} 1271}
1272EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
1204 1273
1205int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 1274int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1206 int nr_pages) 1275 int nr_pages)
@@ -1219,30 +1288,42 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1219} 1288}
1220EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1289EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1221 1290
1291static struct page *kvm_pfn_to_page(pfn_t pfn)
1292{
1293 if (is_error_pfn(pfn))
1294 return KVM_ERR_PTR_BAD_PAGE;
1295
1296 if (kvm_is_mmio_pfn(pfn)) {
1297 WARN_ON(1);
1298 return KVM_ERR_PTR_BAD_PAGE;
1299 }
1300
1301 return pfn_to_page(pfn);
1302}
1303
1222struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1304struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1223{ 1305{
1224 pfn_t pfn; 1306 pfn_t pfn;
1225 1307
1226 pfn = gfn_to_pfn(kvm, gfn); 1308 pfn = gfn_to_pfn(kvm, gfn);
1227 if (!kvm_is_mmio_pfn(pfn))
1228 return pfn_to_page(pfn);
1229
1230 WARN_ON(kvm_is_mmio_pfn(pfn));
1231 1309
1232 get_page(bad_page); 1310 return kvm_pfn_to_page(pfn);
1233 return bad_page;
1234} 1311}
1235 1312
1236EXPORT_SYMBOL_GPL(gfn_to_page); 1313EXPORT_SYMBOL_GPL(gfn_to_page);
1237 1314
1238void kvm_release_page_clean(struct page *page) 1315void kvm_release_page_clean(struct page *page)
1239{ 1316{
1317 WARN_ON(is_error_page(page));
1318
1240 kvm_release_pfn_clean(page_to_pfn(page)); 1319 kvm_release_pfn_clean(page_to_pfn(page));
1241} 1320}
1242EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1321EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1243 1322
1244void kvm_release_pfn_clean(pfn_t pfn) 1323void kvm_release_pfn_clean(pfn_t pfn)
1245{ 1324{
1325 WARN_ON(is_error_pfn(pfn));
1326
1246 if (!kvm_is_mmio_pfn(pfn)) 1327 if (!kvm_is_mmio_pfn(pfn))
1247 put_page(pfn_to_page(pfn)); 1328 put_page(pfn_to_page(pfn));
1248} 1329}
@@ -1250,6 +1331,8 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1250 1331
1251void kvm_release_page_dirty(struct page *page) 1332void kvm_release_page_dirty(struct page *page)
1252{ 1333{
1334 WARN_ON(is_error_page(page));
1335
1253 kvm_release_pfn_dirty(page_to_pfn(page)); 1336 kvm_release_pfn_dirty(page_to_pfn(page));
1254} 1337}
1255EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1338EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
@@ -1305,10 +1388,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1305 int r; 1388 int r;
1306 unsigned long addr; 1389 unsigned long addr;
1307 1390
1308 addr = gfn_to_hva(kvm, gfn); 1391 addr = gfn_to_hva_read(kvm, gfn);
1309 if (kvm_is_error_hva(addr)) 1392 if (kvm_is_error_hva(addr))
1310 return -EFAULT; 1393 return -EFAULT;
1311 r = __copy_from_user(data, (void __user *)addr + offset, len); 1394 r = kvm_read_hva(data, (void __user *)addr + offset, len);
1312 if (r) 1395 if (r)
1313 return -EFAULT; 1396 return -EFAULT;
1314 return 0; 1397 return 0;
@@ -1343,11 +1426,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1343 gfn_t gfn = gpa >> PAGE_SHIFT; 1426 gfn_t gfn = gpa >> PAGE_SHIFT;
1344 int offset = offset_in_page(gpa); 1427 int offset = offset_in_page(gpa);
1345 1428
1346 addr = gfn_to_hva(kvm, gfn); 1429 addr = gfn_to_hva_read(kvm, gfn);
1347 if (kvm_is_error_hva(addr)) 1430 if (kvm_is_error_hva(addr))
1348 return -EFAULT; 1431 return -EFAULT;
1349 pagefault_disable(); 1432 pagefault_disable();
1350 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1433 r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
1351 pagefault_enable(); 1434 pagefault_enable();
1352 if (r) 1435 if (r)
1353 return -EFAULT; 1436 return -EFAULT;
@@ -1580,6 +1663,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
1580} 1663}
1581EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 1664EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
1582 1665
1666#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
1667/*
1668 * Helper that checks whether a VCPU is eligible for directed yield.
1669 * Most eligible candidate to yield is decided by following heuristics:
1670 *
1671 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
1672 * (preempted lock holder), indicated by @in_spin_loop.
1673 * Set at the beginning and cleared at the end of interception/PLE handler.
1674 *
1675 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
1676 * chance last time (mostly it has become eligible now since we have probably
1677 * yielded to lockholder in last iteration. This is done by toggling
1678 * @dy_eligible each time a VCPU is checked for eligibility.)
1679 *
1680 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
1681 * to preempted lock-holder could result in wrong VCPU selection and CPU
1682 * burning. Giving priority for a potential lock-holder increases lock
1683 * progress.
1684 *
1685 * Since algorithm is based on heuristics, accessing another VCPU data without
1686 * locking does not harm. It may result in trying to yield to same VCPU, fail
1687 * and continue with next VCPU and so on.
1688 */
1689bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
1690{
1691 bool eligible;
1692
1693 eligible = !vcpu->spin_loop.in_spin_loop ||
1694 (vcpu->spin_loop.in_spin_loop &&
1695 vcpu->spin_loop.dy_eligible);
1696
1697 if (vcpu->spin_loop.in_spin_loop)
1698 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
1699
1700 return eligible;
1701}
1702#endif
1583void kvm_vcpu_on_spin(struct kvm_vcpu *me) 1703void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1584{ 1704{
1585 struct kvm *kvm = me->kvm; 1705 struct kvm *kvm = me->kvm;
@@ -1589,6 +1709,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1589 int pass; 1709 int pass;
1590 int i; 1710 int i;
1591 1711
1712 kvm_vcpu_set_in_spin_loop(me, true);
1592 /* 1713 /*
1593 * We boost the priority of a VCPU that is runnable but not 1714 * We boost the priority of a VCPU that is runnable but not
1594 * currently running, because it got preempted by something 1715 * currently running, because it got preempted by something
@@ -1607,6 +1728,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1607 continue; 1728 continue;
1608 if (waitqueue_active(&vcpu->wq)) 1729 if (waitqueue_active(&vcpu->wq))
1609 continue; 1730 continue;
1731 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
1732 continue;
1610 if (kvm_vcpu_yield_to(vcpu)) { 1733 if (kvm_vcpu_yield_to(vcpu)) {
1611 kvm->last_boosted_vcpu = i; 1734 kvm->last_boosted_vcpu = i;
1612 yielded = 1; 1735 yielded = 1;
@@ -1614,6 +1737,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1614 } 1737 }
1615 } 1738 }
1616 } 1739 }
1740 kvm_vcpu_set_in_spin_loop(me, false);
1741
1742 /* Ensure vcpu is not eligible during next spinloop */
1743 kvm_vcpu_set_dy_eligible(me, false);
1617} 1744}
1618EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 1745EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1619 1746
@@ -1766,7 +1893,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
1766#endif 1893#endif
1767 1894
1768 1895
1769 vcpu_load(vcpu); 1896 r = vcpu_load(vcpu);
1897 if (r)
1898 return r;
1770 switch (ioctl) { 1899 switch (ioctl) {
1771 case KVM_RUN: 1900 case KVM_RUN:
1772 r = -EINVAL; 1901 r = -EINVAL;
@@ -2094,6 +2223,29 @@ static long kvm_vm_ioctl(struct file *filp,
2094 break; 2223 break;
2095 } 2224 }
2096#endif 2225#endif
2226#ifdef __KVM_HAVE_IRQ_LINE
2227 case KVM_IRQ_LINE_STATUS:
2228 case KVM_IRQ_LINE: {
2229 struct kvm_irq_level irq_event;
2230
2231 r = -EFAULT;
2232 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2233 goto out;
2234
2235 r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
2236 if (r)
2237 goto out;
2238
2239 r = -EFAULT;
2240 if (ioctl == KVM_IRQ_LINE_STATUS) {
2241 if (copy_to_user(argp, &irq_event, sizeof irq_event))
2242 goto out;
2243 }
2244
2245 r = 0;
2246 break;
2247 }
2248#endif
2097 default: 2249 default:
2098 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2250 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2099 if (r == -ENOTTY) 2251 if (r == -ENOTTY)
@@ -2698,9 +2850,6 @@ static struct syscore_ops kvm_syscore_ops = {
2698 .resume = kvm_resume, 2850 .resume = kvm_resume,
2699}; 2851};
2700 2852
2701struct page *bad_page;
2702pfn_t bad_pfn;
2703
2704static inline 2853static inline
2705struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 2854struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2706{ 2855{
@@ -2732,33 +2881,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2732 if (r) 2881 if (r)
2733 goto out_fail; 2882 goto out_fail;
2734 2883
2735 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2736
2737 if (bad_page == NULL) {
2738 r = -ENOMEM;
2739 goto out;
2740 }
2741
2742 bad_pfn = page_to_pfn(bad_page);
2743
2744 hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2745
2746 if (hwpoison_page == NULL) {
2747 r = -ENOMEM;
2748 goto out_free_0;
2749 }
2750
2751 hwpoison_pfn = page_to_pfn(hwpoison_page);
2752
2753 fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2754
2755 if (fault_page == NULL) {
2756 r = -ENOMEM;
2757 goto out_free_0;
2758 }
2759
2760 fault_pfn = page_to_pfn(fault_page);
2761
2762 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 2884 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2763 r = -ENOMEM; 2885 r = -ENOMEM;
2764 goto out_free_0; 2886 goto out_free_0;
@@ -2833,12 +2955,6 @@ out_free_1:
2833out_free_0a: 2955out_free_0a:
2834 free_cpumask_var(cpus_hardware_enabled); 2956 free_cpumask_var(cpus_hardware_enabled);
2835out_free_0: 2957out_free_0:
2836 if (fault_page)
2837 __free_page(fault_page);
2838 if (hwpoison_page)
2839 __free_page(hwpoison_page);
2840 __free_page(bad_page);
2841out:
2842 kvm_arch_exit(); 2958 kvm_arch_exit();
2843out_fail: 2959out_fail:
2844 return r; 2960 return r;
@@ -2858,8 +2974,5 @@ void kvm_exit(void)
2858 kvm_arch_hardware_unsetup(); 2974 kvm_arch_hardware_unsetup();
2859 kvm_arch_exit(); 2975 kvm_arch_exit();
2860 free_cpumask_var(cpus_hardware_enabled); 2976 free_cpumask_var(cpus_hardware_enabled);
2861 __free_page(fault_page);
2862 __free_page(hwpoison_page);
2863 __free_page(bad_page);
2864} 2977}
2865EXPORT_SYMBOL_GPL(kvm_exit); 2978EXPORT_SYMBOL_GPL(kvm_exit);