diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-04 12:30:33 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-04 12:30:33 -0400 |
| commit | ecefbd94b834fa32559d854646d777c56749ef1c (patch) | |
| tree | ca8958900ad9e208a8e5fb7704f1b66dc76131b4 /virt | |
| parent | ce57e981f2b996aaca2031003b3f866368307766 (diff) | |
| parent | 3d11df7abbff013b811d5615320580cd5d9d7d31 (diff) | |
Merge tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Avi Kivity:
"Highlights of the changes for this release include support for vfio
level triggered interrupts, improved big real mode support on older
Intels, a streamlined guest page table walker, guest APIC speedups,
PIO optimizations, better overcommit handling, and read-only memory."
* tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (138 commits)
KVM: s390: Fix vcpu_load handling in interrupt code
KVM: x86: Fix guest debug across vcpu INIT reset
KVM: Add resampling irqfds for level triggered interrupts
KVM: optimize apic interrupt delivery
KVM: MMU: Eliminate pointless temporary 'ac'
KVM: MMU: Avoid access/dirty update loop if all is well
KVM: MMU: Eliminate eperm temporary
KVM: MMU: Optimize is_last_gpte()
KVM: MMU: Simplify walk_addr_generic() loop
KVM: MMU: Optimize pte permission checks
KVM: MMU: Update accessed and dirty bits after guest pagetable walk
KVM: MMU: Move gpte_access() out of paging_tmpl.h
KVM: MMU: Optimize gpte_access() slightly
KVM: MMU: Push clean gpte write protection out of gpte_access()
KVM: clarify kvmclock documentation
KVM: make processes waiting on vcpu mutex killable
KVM: SVM: Make use of asm.h
KVM: VMX: Make use of asm.h
KVM: VMX: Make lto-friendly
KVM: x86: lapic: Clean up find_highest_vector() and count_vectors()
...
Conflicts:
arch/s390/include/asm/processor.h
arch/x86/kvm/i8259.c
Diffstat (limited to 'virt')
| -rw-r--r-- | virt/kvm/Kconfig | 3 | ||||
| -rw-r--r-- | virt/kvm/async_pf.c | 11 | ||||
| -rw-r--r-- | virt/kvm/eventfd.c | 150 | ||||
| -rw-r--r-- | virt/kvm/ioapic.c | 37 | ||||
| -rw-r--r-- | virt/kvm/iommu.c | 16 | ||||
| -rw-r--r-- | virt/kvm/irq_comm.c | 17 | ||||
| -rw-r--r-- | virt/kvm/kvm_main.c | 541 |
7 files changed, 525 insertions, 250 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 28694f4a913..d01b24b72c6 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig | |||
| @@ -21,3 +21,6 @@ config KVM_ASYNC_PF | |||
| 21 | 21 | ||
| 22 | config HAVE_KVM_MSI | 22 | config HAVE_KVM_MSI |
| 23 | bool | 23 | bool |
| 24 | |||
| 25 | config HAVE_KVM_CPU_RELAX_INTERCEPT | ||
| 26 | bool | ||
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 74268b4c2ee..ea475cd0351 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c | |||
| @@ -111,8 +111,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) | |||
| 111 | list_entry(vcpu->async_pf.done.next, | 111 | list_entry(vcpu->async_pf.done.next, |
| 112 | typeof(*work), link); | 112 | typeof(*work), link); |
| 113 | list_del(&work->link); | 113 | list_del(&work->link); |
| 114 | if (work->page) | 114 | if (!is_error_page(work->page)) |
| 115 | put_page(work->page); | 115 | kvm_release_page_clean(work->page); |
| 116 | kmem_cache_free(async_pf_cache, work); | 116 | kmem_cache_free(async_pf_cache, work); |
| 117 | } | 117 | } |
| 118 | spin_unlock(&vcpu->async_pf.lock); | 118 | spin_unlock(&vcpu->async_pf.lock); |
| @@ -138,8 +138,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) | |||
| 138 | 138 | ||
| 139 | list_del(&work->queue); | 139 | list_del(&work->queue); |
| 140 | vcpu->async_pf.queued--; | 140 | vcpu->async_pf.queued--; |
| 141 | if (work->page) | 141 | if (!is_error_page(work->page)) |
| 142 | put_page(work->page); | 142 | kvm_release_page_clean(work->page); |
| 143 | kmem_cache_free(async_pf_cache, work); | 143 | kmem_cache_free(async_pf_cache, work); |
| 144 | } | 144 | } |
| 145 | } | 145 | } |
| @@ -203,8 +203,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) | |||
| 203 | if (!work) | 203 | if (!work) |
| 204 | return -ENOMEM; | 204 | return -ENOMEM; |
| 205 | 205 | ||
| 206 | work->page = bad_page; | 206 | work->page = KVM_ERR_PTR_BAD_PAGE; |
| 207 | get_page(bad_page); | ||
| 208 | INIT_LIST_HEAD(&work->queue); /* for list_del to work */ | 207 | INIT_LIST_HEAD(&work->queue); /* for list_del to work */ |
| 209 | 208 | ||
| 210 | spin_lock(&vcpu->async_pf.lock); | 209 | spin_lock(&vcpu->async_pf.lock); |
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 67a35e90384..9718e98d6d2 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c | |||
| @@ -43,6 +43,31 @@ | |||
| 43 | * -------------------------------------------------------------------- | 43 | * -------------------------------------------------------------------- |
| 44 | */ | 44 | */ |
| 45 | 45 | ||
| 46 | /* | ||
| 47 | * Resampling irqfds are a special variety of irqfds used to emulate | ||
| 48 | * level triggered interrupts. The interrupt is asserted on eventfd | ||
| 49 | * trigger. On acknowledgement through the irq ack notifier, the | ||
| 50 | * interrupt is de-asserted and userspace is notified through the | ||
| 51 | * resamplefd. All resamplers on the same gsi are de-asserted | ||
| 52 | * together, so we don't need to track the state of each individual | ||
| 53 | * user. We can also therefore share the same irq source ID. | ||
| 54 | */ | ||
| 55 | struct _irqfd_resampler { | ||
| 56 | struct kvm *kvm; | ||
| 57 | /* | ||
| 58 | * List of resampling struct _irqfd objects sharing this gsi. | ||
| 59 | * RCU list modified under kvm->irqfds.resampler_lock | ||
| 60 | */ | ||
| 61 | struct list_head list; | ||
| 62 | struct kvm_irq_ack_notifier notifier; | ||
| 63 | /* | ||
| 64 | * Entry in list of kvm->irqfd.resampler_list. Use for sharing | ||
| 65 | * resamplers among irqfds on the same gsi. | ||
| 66 | * Accessed and modified under kvm->irqfds.resampler_lock | ||
| 67 | */ | ||
| 68 | struct list_head link; | ||
| 69 | }; | ||
| 70 | |||
| 46 | struct _irqfd { | 71 | struct _irqfd { |
| 47 | /* Used for MSI fast-path */ | 72 | /* Used for MSI fast-path */ |
| 48 | struct kvm *kvm; | 73 | struct kvm *kvm; |
| @@ -52,6 +77,12 @@ struct _irqfd { | |||
| 52 | /* Used for level IRQ fast-path */ | 77 | /* Used for level IRQ fast-path */ |
| 53 | int gsi; | 78 | int gsi; |
| 54 | struct work_struct inject; | 79 | struct work_struct inject; |
| 80 | /* The resampler used by this irqfd (resampler-only) */ | ||
| 81 | struct _irqfd_resampler *resampler; | ||
| 82 | /* Eventfd notified on resample (resampler-only) */ | ||
| 83 | struct eventfd_ctx *resamplefd; | ||
| 84 | /* Entry in list of irqfds for a resampler (resampler-only) */ | ||
| 85 | struct list_head resampler_link; | ||
| 55 | /* Used for setup/shutdown */ | 86 | /* Used for setup/shutdown */ |
| 56 | struct eventfd_ctx *eventfd; | 87 | struct eventfd_ctx *eventfd; |
| 57 | struct list_head list; | 88 | struct list_head list; |
| @@ -67,8 +98,58 @@ irqfd_inject(struct work_struct *work) | |||
| 67 | struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); | 98 | struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); |
| 68 | struct kvm *kvm = irqfd->kvm; | 99 | struct kvm *kvm = irqfd->kvm; |
| 69 | 100 | ||
| 70 | kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); | 101 | if (!irqfd->resampler) { |
| 71 | kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); | 102 | kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); |
| 103 | kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); | ||
| 104 | } else | ||
| 105 | kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, | ||
| 106 | irqfd->gsi, 1); | ||
| 107 | } | ||
| 108 | |||
| 109 | /* | ||
| 110 | * Since resampler irqfds share an IRQ source ID, we de-assert once | ||
| 111 | * then notify all of the resampler irqfds using this GSI. We can't | ||
| 112 | * do multiple de-asserts or we risk racing with incoming re-asserts. | ||
| 113 | */ | ||
| 114 | static void | ||
| 115 | irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) | ||
| 116 | { | ||
| 117 | struct _irqfd_resampler *resampler; | ||
| 118 | struct _irqfd *irqfd; | ||
| 119 | |||
| 120 | resampler = container_of(kian, struct _irqfd_resampler, notifier); | ||
| 121 | |||
| 122 | kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, | ||
| 123 | resampler->notifier.gsi, 0); | ||
| 124 | |||
| 125 | rcu_read_lock(); | ||
| 126 | |||
| 127 | list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link) | ||
| 128 | eventfd_signal(irqfd->resamplefd, 1); | ||
| 129 | |||
| 130 | rcu_read_unlock(); | ||
| 131 | } | ||
| 132 | |||
| 133 | static void | ||
| 134 | irqfd_resampler_shutdown(struct _irqfd *irqfd) | ||
| 135 | { | ||
| 136 | struct _irqfd_resampler *resampler = irqfd->resampler; | ||
| 137 | struct kvm *kvm = resampler->kvm; | ||
| 138 | |||
| 139 | mutex_lock(&kvm->irqfds.resampler_lock); | ||
| 140 | |||
| 141 | list_del_rcu(&irqfd->resampler_link); | ||
| 142 | synchronize_rcu(); | ||
| 143 | |||
| 144 | if (list_empty(&resampler->list)) { | ||
| 145 | list_del(&resampler->link); | ||
| 146 | kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); | ||
| 147 | kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, | ||
| 148 | resampler->notifier.gsi, 0); | ||
| 149 | kfree(resampler); | ||
| 150 | } | ||
| 151 | |||
| 152 | mutex_unlock(&kvm->irqfds.resampler_lock); | ||
| 72 | } | 153 | } |
| 73 | 154 | ||
| 74 | /* | 155 | /* |
| @@ -92,6 +173,11 @@ irqfd_shutdown(struct work_struct *work) | |||
| 92 | */ | 173 | */ |
| 93 | flush_work(&irqfd->inject); | 174 | flush_work(&irqfd->inject); |
| 94 | 175 | ||
| 176 | if (irqfd->resampler) { | ||
| 177 | irqfd_resampler_shutdown(irqfd); | ||
| 178 | eventfd_ctx_put(irqfd->resamplefd); | ||
| 179 | } | ||
| 180 | |||
| 95 | /* | 181 | /* |
| 96 | * It is now safe to release the object's resources | 182 | * It is now safe to release the object's resources |
| 97 | */ | 183 | */ |
| @@ -203,7 +289,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) | |||
| 203 | struct kvm_irq_routing_table *irq_rt; | 289 | struct kvm_irq_routing_table *irq_rt; |
| 204 | struct _irqfd *irqfd, *tmp; | 290 | struct _irqfd *irqfd, *tmp; |
| 205 | struct file *file = NULL; | 291 | struct file *file = NULL; |
| 206 | struct eventfd_ctx *eventfd = NULL; | 292 | struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; |
| 207 | int ret; | 293 | int ret; |
| 208 | unsigned int events; | 294 | unsigned int events; |
| 209 | 295 | ||
| @@ -231,6 +317,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) | |||
| 231 | 317 | ||
| 232 | irqfd->eventfd = eventfd; | 318 | irqfd->eventfd = eventfd; |
| 233 | 319 | ||
| 320 | if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) { | ||
| 321 | struct _irqfd_resampler *resampler; | ||
| 322 | |||
| 323 | resamplefd = eventfd_ctx_fdget(args->resamplefd); | ||
| 324 | if (IS_ERR(resamplefd)) { | ||
| 325 | ret = PTR_ERR(resamplefd); | ||
| 326 | goto fail; | ||
| 327 | } | ||
| 328 | |||
| 329 | irqfd->resamplefd = resamplefd; | ||
| 330 | INIT_LIST_HEAD(&irqfd->resampler_link); | ||
| 331 | |||
| 332 | mutex_lock(&kvm->irqfds.resampler_lock); | ||
| 333 | |||
| 334 | list_for_each_entry(resampler, | ||
| 335 | &kvm->irqfds.resampler_list, list) { | ||
| 336 | if (resampler->notifier.gsi == irqfd->gsi) { | ||
| 337 | irqfd->resampler = resampler; | ||
| 338 | break; | ||
| 339 | } | ||
| 340 | } | ||
| 341 | |||
| 342 | if (!irqfd->resampler) { | ||
| 343 | resampler = kzalloc(sizeof(*resampler), GFP_KERNEL); | ||
| 344 | if (!resampler) { | ||
| 345 | ret = -ENOMEM; | ||
| 346 | mutex_unlock(&kvm->irqfds.resampler_lock); | ||
| 347 | goto fail; | ||
| 348 | } | ||
| 349 | |||
| 350 | resampler->kvm = kvm; | ||
| 351 | INIT_LIST_HEAD(&resampler->list); | ||
| 352 | resampler->notifier.gsi = irqfd->gsi; | ||
| 353 | resampler->notifier.irq_acked = irqfd_resampler_ack; | ||
| 354 | INIT_LIST_HEAD(&resampler->link); | ||
| 355 | |||
| 356 | list_add(&resampler->link, &kvm->irqfds.resampler_list); | ||
| 357 | kvm_register_irq_ack_notifier(kvm, | ||
| 358 | &resampler->notifier); | ||
| 359 | irqfd->resampler = resampler; | ||
| 360 | } | ||
| 361 | |||
| 362 | list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); | ||
| 363 | synchronize_rcu(); | ||
| 364 | |||
| 365 | mutex_unlock(&kvm->irqfds.resampler_lock); | ||
| 366 | } | ||
| 367 | |||
| 234 | /* | 368 | /* |
| 235 | * Install our own custom wake-up handling so we are notified via | 369 | * Install our own custom wake-up handling so we are notified via |
| 236 | * a callback whenever someone signals the underlying eventfd | 370 | * a callback whenever someone signals the underlying eventfd |
| @@ -276,6 +410,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) | |||
| 276 | return 0; | 410 | return 0; |
| 277 | 411 | ||
| 278 | fail: | 412 | fail: |
| 413 | if (irqfd->resampler) | ||
| 414 | irqfd_resampler_shutdown(irqfd); | ||
| 415 | |||
| 416 | if (resamplefd && !IS_ERR(resamplefd)) | ||
| 417 | eventfd_ctx_put(resamplefd); | ||
| 418 | |||
| 279 | if (eventfd && !IS_ERR(eventfd)) | 419 | if (eventfd && !IS_ERR(eventfd)) |
| 280 | eventfd_ctx_put(eventfd); | 420 | eventfd_ctx_put(eventfd); |
| 281 | 421 | ||
| @@ -291,6 +431,8 @@ kvm_eventfd_init(struct kvm *kvm) | |||
| 291 | { | 431 | { |
| 292 | spin_lock_init(&kvm->irqfds.lock); | 432 | spin_lock_init(&kvm->irqfds.lock); |
| 293 | INIT_LIST_HEAD(&kvm->irqfds.items); | 433 | INIT_LIST_HEAD(&kvm->irqfds.items); |
| 434 | INIT_LIST_HEAD(&kvm->irqfds.resampler_list); | ||
| 435 | mutex_init(&kvm->irqfds.resampler_lock); | ||
| 294 | INIT_LIST_HEAD(&kvm->ioeventfds); | 436 | INIT_LIST_HEAD(&kvm->ioeventfds); |
| 295 | } | 437 | } |
| 296 | 438 | ||
| @@ -340,7 +482,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) | |||
| 340 | int | 482 | int |
| 341 | kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) | 483 | kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) |
| 342 | { | 484 | { |
| 343 | if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN) | 485 | if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) |
| 344 | return -EINVAL; | 486 | return -EINVAL; |
| 345 | 487 | ||
| 346 | if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) | 488 | if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) |
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index ef61d529a6c..cfb7e4d52dc 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c | |||
| @@ -197,28 +197,29 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, | |||
| 197 | u32 old_irr; | 197 | u32 old_irr; |
| 198 | u32 mask = 1 << irq; | 198 | u32 mask = 1 << irq; |
| 199 | union kvm_ioapic_redirect_entry entry; | 199 | union kvm_ioapic_redirect_entry entry; |
| 200 | int ret = 1; | 200 | int ret, irq_level; |
| 201 | |||
| 202 | BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS); | ||
| 201 | 203 | ||
| 202 | spin_lock(&ioapic->lock); | 204 | spin_lock(&ioapic->lock); |
| 203 | old_irr = ioapic->irr; | 205 | old_irr = ioapic->irr; |
| 204 | if (irq >= 0 && irq < IOAPIC_NUM_PINS) { | 206 | irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], |
| 205 | int irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], | 207 | irq_source_id, level); |
| 206 | irq_source_id, level); | 208 | entry = ioapic->redirtbl[irq]; |
| 207 | entry = ioapic->redirtbl[irq]; | 209 | irq_level ^= entry.fields.polarity; |
| 208 | irq_level ^= entry.fields.polarity; | 210 | if (!irq_level) { |
| 209 | if (!irq_level) | 211 | ioapic->irr &= ~mask; |
| 210 | ioapic->irr &= ~mask; | 212 | ret = 1; |
| 211 | else { | 213 | } else { |
| 212 | int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); | 214 | int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); |
| 213 | ioapic->irr |= mask; | 215 | ioapic->irr |= mask; |
| 214 | if ((edge && old_irr != ioapic->irr) || | 216 | if ((edge && old_irr != ioapic->irr) || |
| 215 | (!edge && !entry.fields.remote_irr)) | 217 | (!edge && !entry.fields.remote_irr)) |
| 216 | ret = ioapic_service(ioapic, irq); | 218 | ret = ioapic_service(ioapic, irq); |
| 217 | else | 219 | else |
| 218 | ret = 0; /* report coalesced interrupt */ | 220 | ret = 0; /* report coalesced interrupt */ |
| 219 | } | ||
| 220 | trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); | ||
| 221 | } | 221 | } |
| 222 | trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); | ||
| 222 | spin_unlock(&ioapic->lock); | 223 | spin_unlock(&ioapic->lock); |
| 223 | 224 | ||
| 224 | return ret; | 225 | return ret; |
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index e9fff9830bf..037cb6730e6 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c | |||
| @@ -42,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm); | |||
| 42 | static void kvm_iommu_put_pages(struct kvm *kvm, | 42 | static void kvm_iommu_put_pages(struct kvm *kvm, |
| 43 | gfn_t base_gfn, unsigned long npages); | 43 | gfn_t base_gfn, unsigned long npages); |
| 44 | 44 | ||
| 45 | static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, | 45 | static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, |
| 46 | gfn_t gfn, unsigned long size) | 46 | unsigned long size) |
| 47 | { | 47 | { |
| 48 | gfn_t end_gfn; | 48 | gfn_t end_gfn; |
| 49 | pfn_t pfn; | 49 | pfn_t pfn; |
| 50 | 50 | ||
| 51 | pfn = gfn_to_pfn_memslot(kvm, slot, gfn); | 51 | pfn = gfn_to_pfn_memslot(slot, gfn); |
| 52 | end_gfn = gfn + (size >> PAGE_SHIFT); | 52 | end_gfn = gfn + (size >> PAGE_SHIFT); |
| 53 | gfn += 1; | 53 | gfn += 1; |
| 54 | 54 | ||
| @@ -56,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, | |||
| 56 | return pfn; | 56 | return pfn; |
| 57 | 57 | ||
| 58 | while (gfn < end_gfn) | 58 | while (gfn < end_gfn) |
| 59 | gfn_to_pfn_memslot(kvm, slot, gfn++); | 59 | gfn_to_pfn_memslot(slot, gfn++); |
| 60 | 60 | ||
| 61 | return pfn; | 61 | return pfn; |
| 62 | } | 62 | } |
| @@ -105,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) | |||
| 105 | * Pin all pages we are about to map in memory. This is | 105 | * Pin all pages we are about to map in memory. This is |
| 106 | * important because we unmap and unpin in 4kb steps later. | 106 | * important because we unmap and unpin in 4kb steps later. |
| 107 | */ | 107 | */ |
| 108 | pfn = kvm_pin_pages(kvm, slot, gfn, page_size); | 108 | pfn = kvm_pin_pages(slot, gfn, page_size); |
| 109 | if (is_error_pfn(pfn)) { | 109 | if (is_error_pfn(pfn)) { |
| 110 | gfn += 1; | 110 | gfn += 1; |
| 111 | continue; | 111 | continue; |
| @@ -300,6 +300,12 @@ static void kvm_iommu_put_pages(struct kvm *kvm, | |||
| 300 | 300 | ||
| 301 | /* Get physical address */ | 301 | /* Get physical address */ |
| 302 | phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); | 302 | phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); |
| 303 | |||
| 304 | if (!phys) { | ||
| 305 | gfn++; | ||
| 306 | continue; | ||
| 307 | } | ||
| 308 | |||
| 303 | pfn = phys >> PAGE_SHIFT; | 309 | pfn = phys >> PAGE_SHIFT; |
| 304 | 310 | ||
| 305 | /* Unmap address from IO address space */ | 311 | /* Unmap address from IO address space */ |
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 83402d74a76..2eb58af7ee9 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c | |||
| @@ -68,8 +68,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, | |||
| 68 | struct kvm_vcpu *vcpu, *lowest = NULL; | 68 | struct kvm_vcpu *vcpu, *lowest = NULL; |
| 69 | 69 | ||
| 70 | if (irq->dest_mode == 0 && irq->dest_id == 0xff && | 70 | if (irq->dest_mode == 0 && irq->dest_id == 0xff && |
| 71 | kvm_is_dm_lowest_prio(irq)) | 71 | kvm_is_dm_lowest_prio(irq)) { |
| 72 | printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); | 72 | printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); |
| 73 | irq->delivery_mode = APIC_DM_FIXED; | ||
| 74 | } | ||
| 75 | |||
| 76 | if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r)) | ||
| 77 | return r; | ||
| 73 | 78 | ||
| 74 | kvm_for_each_vcpu(i, vcpu, kvm) { | 79 | kvm_for_each_vcpu(i, vcpu, kvm) { |
| 75 | if (!kvm_apic_present(vcpu)) | 80 | if (!kvm_apic_present(vcpu)) |
| @@ -223,6 +228,9 @@ int kvm_request_irq_source_id(struct kvm *kvm) | |||
| 223 | } | 228 | } |
| 224 | 229 | ||
| 225 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); | 230 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); |
| 231 | #ifdef CONFIG_X86 | ||
| 232 | ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); | ||
| 233 | #endif | ||
| 226 | set_bit(irq_source_id, bitmap); | 234 | set_bit(irq_source_id, bitmap); |
| 227 | unlock: | 235 | unlock: |
| 228 | mutex_unlock(&kvm->irq_lock); | 236 | mutex_unlock(&kvm->irq_lock); |
| @@ -233,6 +241,9 @@ unlock: | |||
| 233 | void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) | 241 | void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) |
| 234 | { | 242 | { |
| 235 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); | 243 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); |
| 244 | #ifdef CONFIG_X86 | ||
| 245 | ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); | ||
| 246 | #endif | ||
| 236 | 247 | ||
| 237 | mutex_lock(&kvm->irq_lock); | 248 | mutex_lock(&kvm->irq_lock); |
| 238 | if (irq_source_id < 0 || | 249 | if (irq_source_id < 0 || |
| @@ -321,11 +332,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, | |||
| 321 | switch (ue->u.irqchip.irqchip) { | 332 | switch (ue->u.irqchip.irqchip) { |
| 322 | case KVM_IRQCHIP_PIC_MASTER: | 333 | case KVM_IRQCHIP_PIC_MASTER: |
| 323 | e->set = kvm_set_pic_irq; | 334 | e->set = kvm_set_pic_irq; |
| 324 | max_pin = 16; | 335 | max_pin = PIC_NUM_PINS; |
| 325 | break; | 336 | break; |
| 326 | case KVM_IRQCHIP_PIC_SLAVE: | 337 | case KVM_IRQCHIP_PIC_SLAVE: |
| 327 | e->set = kvm_set_pic_irq; | 338 | e->set = kvm_set_pic_irq; |
| 328 | max_pin = 16; | 339 | max_pin = PIC_NUM_PINS; |
| 329 | delta = 8; | 340 | delta = 8; |
| 330 | break; | 341 | break; |
| 331 | case KVM_IRQCHIP_IOAPIC: | 342 | case KVM_IRQCHIP_IOAPIC: |
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d617f69131d..c353b4599ce 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
| @@ -100,13 +100,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting); | |||
| 100 | 100 | ||
| 101 | static bool largepages_enabled = true; | 101 | static bool largepages_enabled = true; |
| 102 | 102 | ||
| 103 | static struct page *hwpoison_page; | 103 | bool kvm_is_mmio_pfn(pfn_t pfn) |
| 104 | static pfn_t hwpoison_pfn; | ||
| 105 | |||
| 106 | struct page *fault_page; | ||
| 107 | pfn_t fault_pfn; | ||
| 108 | |||
| 109 | inline int kvm_is_mmio_pfn(pfn_t pfn) | ||
| 110 | { | 104 | { |
| 111 | if (pfn_valid(pfn)) { | 105 | if (pfn_valid(pfn)) { |
| 112 | int reserved; | 106 | int reserved; |
| @@ -137,11 +131,12 @@ inline int kvm_is_mmio_pfn(pfn_t pfn) | |||
| 137 | /* | 131 | /* |
| 138 | * Switches to specified vcpu, until a matching vcpu_put() | 132 | * Switches to specified vcpu, until a matching vcpu_put() |
| 139 | */ | 133 | */ |
| 140 | void vcpu_load(struct kvm_vcpu *vcpu) | 134 | int vcpu_load(struct kvm_vcpu *vcpu) |
| 141 | { | 135 | { |
| 142 | int cpu; | 136 | int cpu; |
| 143 | 137 | ||
| 144 | mutex_lock(&vcpu->mutex); | 138 | if (mutex_lock_killable(&vcpu->mutex)) |
| 139 | return -EINTR; | ||
| 145 | if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { | 140 | if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { |
| 146 | /* The thread running this VCPU changed. */ | 141 | /* The thread running this VCPU changed. */ |
| 147 | struct pid *oldpid = vcpu->pid; | 142 | struct pid *oldpid = vcpu->pid; |
| @@ -154,6 +149,7 @@ void vcpu_load(struct kvm_vcpu *vcpu) | |||
| 154 | preempt_notifier_register(&vcpu->preempt_notifier); | 149 | preempt_notifier_register(&vcpu->preempt_notifier); |
| 155 | kvm_arch_vcpu_load(vcpu, cpu); | 150 | kvm_arch_vcpu_load(vcpu, cpu); |
| 156 | put_cpu(); | 151 | put_cpu(); |
| 152 | return 0; | ||
| 157 | } | 153 | } |
| 158 | 154 | ||
| 159 | void vcpu_put(struct kvm_vcpu *vcpu) | 155 | void vcpu_put(struct kvm_vcpu *vcpu) |
| @@ -236,6 +232,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) | |||
| 236 | } | 232 | } |
| 237 | vcpu->run = page_address(page); | 233 | vcpu->run = page_address(page); |
| 238 | 234 | ||
| 235 | kvm_vcpu_set_in_spin_loop(vcpu, false); | ||
| 236 | kvm_vcpu_set_dy_eligible(vcpu, false); | ||
| 237 | |||
| 239 | r = kvm_arch_vcpu_init(vcpu); | 238 | r = kvm_arch_vcpu_init(vcpu); |
| 240 | if (r < 0) | 239 | if (r < 0) |
| 241 | goto fail_free_run; | 240 | goto fail_free_run; |
| @@ -332,8 +331,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, | |||
| 332 | * count is also read inside the mmu_lock critical section. | 331 | * count is also read inside the mmu_lock critical section. |
| 333 | */ | 332 | */ |
| 334 | kvm->mmu_notifier_count++; | 333 | kvm->mmu_notifier_count++; |
| 335 | for (; start < end; start += PAGE_SIZE) | 334 | need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); |
| 336 | need_tlb_flush |= kvm_unmap_hva(kvm, start); | ||
| 337 | need_tlb_flush |= kvm->tlbs_dirty; | 335 | need_tlb_flush |= kvm->tlbs_dirty; |
| 338 | /* we've to flush the tlb before the pages can be freed */ | 336 | /* we've to flush the tlb before the pages can be freed */ |
| 339 | if (need_tlb_flush) | 337 | if (need_tlb_flush) |
| @@ -412,7 +410,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, | |||
| 412 | int idx; | 410 | int idx; |
| 413 | 411 | ||
| 414 | idx = srcu_read_lock(&kvm->srcu); | 412 | idx = srcu_read_lock(&kvm->srcu); |
| 415 | kvm_arch_flush_shadow(kvm); | 413 | kvm_arch_flush_shadow_all(kvm); |
| 416 | srcu_read_unlock(&kvm->srcu, idx); | 414 | srcu_read_unlock(&kvm->srcu, idx); |
| 417 | } | 415 | } |
| 418 | 416 | ||
| @@ -551,16 +549,12 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) | |||
| 551 | static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | 549 | static void kvm_free_physmem_slot(struct kvm_memory_slot *free, |
| 552 | struct kvm_memory_slot *dont) | 550 | struct kvm_memory_slot *dont) |
| 553 | { | 551 | { |
| 554 | if (!dont || free->rmap != dont->rmap) | ||
| 555 | vfree(free->rmap); | ||
| 556 | |||
| 557 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) | 552 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) |
| 558 | kvm_destroy_dirty_bitmap(free); | 553 | kvm_destroy_dirty_bitmap(free); |
| 559 | 554 | ||
| 560 | kvm_arch_free_memslot(free, dont); | 555 | kvm_arch_free_memslot(free, dont); |
| 561 | 556 | ||
| 562 | free->npages = 0; | 557 | free->npages = 0; |
| 563 | free->rmap = NULL; | ||
| 564 | } | 558 | } |
| 565 | 559 | ||
| 566 | void kvm_free_physmem(struct kvm *kvm) | 560 | void kvm_free_physmem(struct kvm *kvm) |
| @@ -590,7 +584,7 @@ static void kvm_destroy_vm(struct kvm *kvm) | |||
| 590 | #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) | 584 | #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) |
| 591 | mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); | 585 | mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); |
| 592 | #else | 586 | #else |
| 593 | kvm_arch_flush_shadow(kvm); | 587 | kvm_arch_flush_shadow_all(kvm); |
| 594 | #endif | 588 | #endif |
| 595 | kvm_arch_destroy_vm(kvm); | 589 | kvm_arch_destroy_vm(kvm); |
| 596 | kvm_free_physmem(kvm); | 590 | kvm_free_physmem(kvm); |
| @@ -686,6 +680,20 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new) | |||
| 686 | slots->generation++; | 680 | slots->generation++; |
| 687 | } | 681 | } |
| 688 | 682 | ||
| 683 | static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) | ||
| 684 | { | ||
| 685 | u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; | ||
| 686 | |||
| 687 | #ifdef KVM_CAP_READONLY_MEM | ||
| 688 | valid_flags |= KVM_MEM_READONLY; | ||
| 689 | #endif | ||
| 690 | |||
| 691 | if (mem->flags & ~valid_flags) | ||
| 692 | return -EINVAL; | ||
| 693 | |||
| 694 | return 0; | ||
| 695 | } | ||
| 696 | |||
| 689 | /* | 697 | /* |
| 690 | * Allocate some memory and give it an address in the guest physical address | 698 | * Allocate some memory and give it an address in the guest physical address |
| 691 | * space. | 699 | * space. |
| @@ -706,6 +714,10 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
| 706 | struct kvm_memory_slot old, new; | 714 | struct kvm_memory_slot old, new; |
| 707 | struct kvm_memslots *slots, *old_memslots; | 715 | struct kvm_memslots *slots, *old_memslots; |
| 708 | 716 | ||
| 717 | r = check_memory_region_flags(mem); | ||
| 718 | if (r) | ||
| 719 | goto out; | ||
| 720 | |||
| 709 | r = -EINVAL; | 721 | r = -EINVAL; |
| 710 | /* General sanity checks */ | 722 | /* General sanity checks */ |
| 711 | if (mem->memory_size & (PAGE_SIZE - 1)) | 723 | if (mem->memory_size & (PAGE_SIZE - 1)) |
| @@ -769,11 +781,7 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
| 769 | if (npages && !old.npages) { | 781 | if (npages && !old.npages) { |
| 770 | new.user_alloc = user_alloc; | 782 | new.user_alloc = user_alloc; |
| 771 | new.userspace_addr = mem->userspace_addr; | 783 | new.userspace_addr = mem->userspace_addr; |
| 772 | #ifndef CONFIG_S390 | 784 | |
| 773 | new.rmap = vzalloc(npages * sizeof(*new.rmap)); | ||
| 774 | if (!new.rmap) | ||
| 775 | goto out_free; | ||
| 776 | #endif /* not defined CONFIG_S390 */ | ||
| 777 | if (kvm_arch_create_memslot(&new, npages)) | 785 | if (kvm_arch_create_memslot(&new, npages)) |
| 778 | goto out_free; | 786 | goto out_free; |
| 779 | } | 787 | } |
| @@ -785,7 +793,7 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
| 785 | /* destroy any largepage mappings for dirty tracking */ | 793 | /* destroy any largepage mappings for dirty tracking */ |
| 786 | } | 794 | } |
| 787 | 795 | ||
| 788 | if (!npages) { | 796 | if (!npages || base_gfn != old.base_gfn) { |
| 789 | struct kvm_memory_slot *slot; | 797 | struct kvm_memory_slot *slot; |
| 790 | 798 | ||
| 791 | r = -ENOMEM; | 799 | r = -ENOMEM; |
| @@ -801,14 +809,14 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
| 801 | old_memslots = kvm->memslots; | 809 | old_memslots = kvm->memslots; |
| 802 | rcu_assign_pointer(kvm->memslots, slots); | 810 | rcu_assign_pointer(kvm->memslots, slots); |
| 803 | synchronize_srcu_expedited(&kvm->srcu); | 811 | synchronize_srcu_expedited(&kvm->srcu); |
| 804 | /* From this point no new shadow pages pointing to a deleted | 812 | /* From this point no new shadow pages pointing to a deleted, |
| 805 | * memslot will be created. | 813 | * or moved, memslot will be created. |
| 806 | * | 814 | * |
| 807 | * validation of sp->gfn happens in: | 815 | * validation of sp->gfn happens in: |
| 808 | * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) | 816 | * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) |
| 809 | * - kvm_is_visible_gfn (mmu_check_roots) | 817 | * - kvm_is_visible_gfn (mmu_check_roots) |
| 810 | */ | 818 | */ |
| 811 | kvm_arch_flush_shadow(kvm); | 819 | kvm_arch_flush_shadow_memslot(kvm, slot); |
| 812 | kfree(old_memslots); | 820 | kfree(old_memslots); |
| 813 | } | 821 | } |
| 814 | 822 | ||
| @@ -832,7 +840,6 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
| 832 | 840 | ||
| 833 | /* actual memory is freed via old in kvm_free_physmem_slot below */ | 841 | /* actual memory is freed via old in kvm_free_physmem_slot below */ |
| 834 | if (!npages) { | 842 | if (!npages) { |
| 835 | new.rmap = NULL; | ||
| 836 | new.dirty_bitmap = NULL; | 843 | new.dirty_bitmap = NULL; |
| 837 | memset(&new.arch, 0, sizeof(new.arch)); | 844 | memset(&new.arch, 0, sizeof(new.arch)); |
| 838 | } | 845 | } |
| @@ -844,13 +851,6 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
| 844 | 851 | ||
| 845 | kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); | 852 | kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); |
| 846 | 853 | ||
| 847 | /* | ||
| 848 | * If the new memory slot is created, we need to clear all | ||
| 849 | * mmio sptes. | ||
| 850 | */ | ||
| 851 | if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) | ||
| 852 | kvm_arch_flush_shadow(kvm); | ||
| 853 | |||
| 854 | kvm_free_physmem_slot(&old, &new); | 854 | kvm_free_physmem_slot(&old, &new); |
| 855 | kfree(old_memslots); | 855 | kfree(old_memslots); |
| 856 | 856 | ||
| @@ -932,53 +932,6 @@ void kvm_disable_largepages(void) | |||
| 932 | } | 932 | } |
| 933 | EXPORT_SYMBOL_GPL(kvm_disable_largepages); | 933 | EXPORT_SYMBOL_GPL(kvm_disable_largepages); |
| 934 | 934 | ||
| 935 | int is_error_page(struct page *page) | ||
| 936 | { | ||
| 937 | return page == bad_page || page == hwpoison_page || page == fault_page; | ||
| 938 | } | ||
| 939 | EXPORT_SYMBOL_GPL(is_error_page); | ||
| 940 | |||
| 941 | int is_error_pfn(pfn_t pfn) | ||
| 942 | { | ||
| 943 | return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn; | ||
| 944 | } | ||
| 945 | EXPORT_SYMBOL_GPL(is_error_pfn); | ||
| 946 | |||
| 947 | int is_hwpoison_pfn(pfn_t pfn) | ||
| 948 | { | ||
| 949 | return pfn == hwpoison_pfn; | ||
| 950 | } | ||
| 951 | EXPORT_SYMBOL_GPL(is_hwpoison_pfn); | ||
| 952 | |||
| 953 | int is_fault_pfn(pfn_t pfn) | ||
| 954 | { | ||
| 955 | return pfn == fault_pfn; | ||
| 956 | } | ||
| 957 | EXPORT_SYMBOL_GPL(is_fault_pfn); | ||
| 958 | |||
| 959 | int is_noslot_pfn(pfn_t pfn) | ||
| 960 | { | ||
| 961 | return pfn == bad_pfn; | ||
| 962 | } | ||
| 963 | EXPORT_SYMBOL_GPL(is_noslot_pfn); | ||
| 964 | |||
| 965 | int is_invalid_pfn(pfn_t pfn) | ||
| 966 | { | ||
| 967 | return pfn == hwpoison_pfn || pfn == fault_pfn; | ||
| 968 | } | ||
| 969 | EXPORT_SYMBOL_GPL(is_invalid_pfn); | ||
| 970 | |||
| 971 | static inline unsigned long bad_hva(void) | ||
| 972 | { | ||
| 973 | return PAGE_OFFSET; | ||
| 974 | } | ||
| 975 | |||
| 976 | int kvm_is_error_hva(unsigned long addr) | ||
| 977 | { | ||
| 978 | return addr == bad_hva(); | ||
| 979 | } | ||
| 980 | EXPORT_SYMBOL_GPL(kvm_is_error_hva); | ||
| 981 | |||
| 982 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | 935 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) |
| 983 | { | 936 | { |
| 984 | return __gfn_to_memslot(kvm_memslots(kvm), gfn); | 937 | return __gfn_to_memslot(kvm_memslots(kvm), gfn); |
| @@ -1021,28 +974,62 @@ out: | |||
| 1021 | return size; | 974 | return size; |
| 1022 | } | 975 | } |
| 1023 | 976 | ||
| 1024 | static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, | 977 | static bool memslot_is_readonly(struct kvm_memory_slot *slot) |
| 1025 | gfn_t *nr_pages) | 978 | { |
| 979 | return slot->flags & KVM_MEM_READONLY; | ||
| 980 | } | ||
| 981 | |||
| 982 | static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, | ||
| 983 | gfn_t *nr_pages, bool write) | ||
| 1026 | { | 984 | { |
| 1027 | if (!slot || slot->flags & KVM_MEMSLOT_INVALID) | 985 | if (!slot || slot->flags & KVM_MEMSLOT_INVALID) |
| 1028 | return bad_hva(); | 986 | return KVM_HVA_ERR_BAD; |
| 987 | |||
| 988 | if (memslot_is_readonly(slot) && write) | ||
| 989 | return KVM_HVA_ERR_RO_BAD; | ||
| 1029 | 990 | ||
| 1030 | if (nr_pages) | 991 | if (nr_pages) |
| 1031 | *nr_pages = slot->npages - (gfn - slot->base_gfn); | 992 | *nr_pages = slot->npages - (gfn - slot->base_gfn); |
| 1032 | 993 | ||
| 1033 | return gfn_to_hva_memslot(slot, gfn); | 994 | return __gfn_to_hva_memslot(slot, gfn); |
| 1034 | } | 995 | } |
| 1035 | 996 | ||
| 997 | static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, | ||
| 998 | gfn_t *nr_pages) | ||
| 999 | { | ||
| 1000 | return __gfn_to_hva_many(slot, gfn, nr_pages, true); | ||
| 1001 | } | ||
| 1002 | |||
| 1003 | unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, | ||
| 1004 | gfn_t gfn) | ||
| 1005 | { | ||
| 1006 | return gfn_to_hva_many(slot, gfn, NULL); | ||
| 1007 | } | ||
| 1008 | EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); | ||
| 1009 | |||
| 1036 | unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) | 1010 | unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) |
| 1037 | { | 1011 | { |
| 1038 | return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); | 1012 | return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); |
| 1039 | } | 1013 | } |
| 1040 | EXPORT_SYMBOL_GPL(gfn_to_hva); | 1014 | EXPORT_SYMBOL_GPL(gfn_to_hva); |
| 1041 | 1015 | ||
| 1042 | static pfn_t get_fault_pfn(void) | 1016 | /* |
| 1017 | * The hva returned by this function is only allowed to be read. | ||
| 1018 | * It should pair with kvm_read_hva() or kvm_read_hva_atomic(). | ||
| 1019 | */ | ||
| 1020 | static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn) | ||
| 1021 | { | ||
| 1022 | return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false); | ||
| 1023 | } | ||
| 1024 | |||
| 1025 | static int kvm_read_hva(void *data, void __user *hva, int len) | ||
| 1043 | { | 1026 | { |
| 1044 | get_page(fault_page); | 1027 | return __copy_from_user(data, hva, len); |
| 1045 | return fault_pfn; | 1028 | } |
| 1029 | |||
| 1030 | static int kvm_read_hva_atomic(void *data, void __user *hva, int len) | ||
| 1031 | { | ||
| 1032 | return __copy_from_user_inatomic(data, hva, len); | ||
| 1046 | } | 1033 | } |
| 1047 | 1034 | ||
| 1048 | int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, | 1035 | int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, |
| @@ -1065,108 +1052,186 @@ static inline int check_user_page_hwpoison(unsigned long addr) | |||
| 1065 | return rc == -EHWPOISON; | 1052 | return rc == -EHWPOISON; |
| 1066 | } | 1053 | } |
| 1067 | 1054 | ||
| 1068 | static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, | 1055 | /* |
| 1069 | bool *async, bool write_fault, bool *writable) | 1056 | * The atomic path to get the writable pfn which will be stored in @pfn, |
| 1057 | * true indicates success, otherwise false is returned. | ||
| 1058 | */ | ||
| 1059 | static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, | ||
| 1060 | bool write_fault, bool *writable, pfn_t *pfn) | ||
| 1070 | { | 1061 | { |
| 1071 | struct page *page[1]; | 1062 | struct page *page[1]; |
| 1072 | int npages = 0; | 1063 | int npages; |
| 1073 | pfn_t pfn; | ||
| 1074 | 1064 | ||
| 1075 | /* we can do it either atomically or asynchronously, not both */ | 1065 | if (!(async || atomic)) |
| 1076 | BUG_ON(atomic && async); | 1066 | return false; |
| 1077 | 1067 | ||
| 1078 | BUG_ON(!write_fault && !writable); | 1068 | /* |
| 1069 | * Fast pin a writable pfn only if it is a write fault request | ||
| 1070 | * or the caller allows to map a writable pfn for a read fault | ||
| 1071 | * request. | ||
| 1072 | */ | ||
| 1073 | if (!(write_fault || writable)) | ||
| 1074 | return false; | ||
| 1079 | 1075 | ||
| 1080 | if (writable) | 1076 | npages = __get_user_pages_fast(addr, 1, 1, page); |
| 1081 | *writable = true; | 1077 | if (npages == 1) { |
| 1078 | *pfn = page_to_pfn(page[0]); | ||
| 1082 | 1079 | ||
| 1083 | if (atomic || async) | 1080 | if (writable) |
| 1084 | npages = __get_user_pages_fast(addr, 1, 1, page); | 1081 | *writable = true; |
| 1082 | return true; | ||
| 1083 | } | ||
| 1085 | 1084 | ||
| 1086 | if (unlikely(npages != 1) && !atomic) { | 1085 | return false; |
| 1087 | might_sleep(); | 1086 | } |
| 1088 | 1087 | ||
| 1089 | if (writable) | 1088 | /* |
| 1090 | *writable = write_fault; | 1089 | * The slow path to get the pfn of the specified host virtual address, |
| 1090 | * 1 indicates success, -errno is returned if error is detected. | ||
| 1091 | */ | ||
| 1092 | static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, | ||
| 1093 | bool *writable, pfn_t *pfn) | ||
| 1094 | { | ||
| 1095 | struct page *page[1]; | ||
| 1096 | int npages = 0; | ||
| 1091 | 1097 | ||
| 1092 | if (async) { | 1098 | might_sleep(); |
| 1093 | down_read(¤t->mm->mmap_sem); | 1099 | |
| 1094 | npages = get_user_page_nowait(current, current->mm, | 1100 | if (writable) |
| 1095 | addr, write_fault, page); | 1101 | *writable = write_fault; |
| 1096 | up_read(¤t->mm->mmap_sem); | 1102 | |
| 1097 | } else | 1103 | if (async) { |
| 1098 | npages = get_user_pages_fast(addr, 1, write_fault, | 1104 | down_read(¤t->mm->mmap_sem); |
| 1099 | page); | 1105 | npages = get_user_page_nowait(current, current->mm, |
| 1100 | 1106 | addr, write_fault, page); | |
| 1101 | /* map read fault as writable if possible */ | 1107 | up_read(¤t->mm->mmap_sem); |
| 1102 | if (unlikely(!write_fault) && npages == 1) { | 1108 | } else |
| 1103 | struct page *wpage[1]; | 1109 | npages = get_user_pages_fast(addr, 1, write_fault, |
| 1104 | 1110 | page); | |
| 1105 | npages = __get_user_pages_fast(addr, 1, 1, wpage); | 1111 | if (npages != 1) |
| 1106 | if (npages == 1) { | 1112 | return npages; |
| 1107 | *writable = true; | 1113 | |
| 1108 | put_page(page[0]); | 1114 | /* map read fault as writable if possible */ |
| 1109 | page[0] = wpage[0]; | 1115 | if (unlikely(!write_fault) && writable) { |
| 1110 | } | 1116 | struct page *wpage[1]; |
| 1111 | npages = 1; | 1117 | |
| 1118 | npages = __get_user_pages_fast(addr, 1, 1, wpage); | ||
| 1119 | if (npages == 1) { | ||
| 1120 | *writable = true; | ||
| 1121 | put_page(page[0]); | ||
| 1122 | page[0] = wpage[0]; | ||
| 1112 | } | 1123 | } |
| 1124 | |||
| 1125 | npages = 1; | ||
| 1113 | } | 1126 | } |
| 1127 | *pfn = page_to_pfn(page[0]); | ||
| 1128 | return npages; | ||
| 1129 | } | ||
| 1114 | 1130 | ||
| 1115 | if (unlikely(npages != 1)) { | 1131 | static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) |
| 1116 | struct vm_area_struct *vma; | 1132 | { |
| 1133 | if (unlikely(!(vma->vm_flags & VM_READ))) | ||
| 1134 | return false; | ||
| 1117 | 1135 | ||
| 1118 | if (atomic) | 1136 | if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) |
| 1119 | return get_fault_pfn(); | 1137 | return false; |
| 1120 | 1138 | ||
| 1121 | down_read(¤t->mm->mmap_sem); | 1139 | return true; |
| 1122 | if (npages == -EHWPOISON || | 1140 | } |
| 1123 | (!async && check_user_page_hwpoison(addr))) { | ||
| 1124 | up_read(¤t->mm->mmap_sem); | ||
| 1125 | get_page(hwpoison_page); | ||
| 1126 | return page_to_pfn(hwpoison_page); | ||
| 1127 | } | ||
| 1128 | 1141 | ||
| 1129 | vma = find_vma_intersection(current->mm, addr, addr+1); | 1142 | /* |
| 1130 | 1143 | * Pin guest page in memory and return its pfn. | |
| 1131 | if (vma == NULL) | 1144 | * @addr: host virtual address which maps memory to the guest |
| 1132 | pfn = get_fault_pfn(); | 1145 | * @atomic: whether this function can sleep |
| 1133 | else if ((vma->vm_flags & VM_PFNMAP)) { | 1146 | * @async: whether this function need to wait IO complete if the |
| 1134 | pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + | 1147 | * host page is not in the memory |
| 1135 | vma->vm_pgoff; | 1148 | * @write_fault: whether we should get a writable host page |
| 1136 | BUG_ON(!kvm_is_mmio_pfn(pfn)); | 1149 | * @writable: whether it allows to map a writable host page for !@write_fault |
| 1137 | } else { | 1150 | * |
| 1138 | if (async && (vma->vm_flags & VM_WRITE)) | 1151 | * The function will map a writable host page for these two cases: |
| 1139 | *async = true; | 1152 | * 1): @write_fault = true |
| 1140 | pfn = get_fault_pfn(); | 1153 | * 2): @write_fault = false && @writable, @writable will tell the caller |
| 1141 | } | 1154 | * whether the mapping is writable. |
| 1142 | up_read(¤t->mm->mmap_sem); | 1155 | */ |
| 1143 | } else | 1156 | static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, |
| 1144 | pfn = page_to_pfn(page[0]); | 1157 | bool write_fault, bool *writable) |
| 1158 | { | ||
| 1159 | struct vm_area_struct *vma; | ||
| 1160 | pfn_t pfn = 0; | ||
| 1161 | int npages; | ||
| 1162 | |||
| 1163 | /* we can do it either atomically or asynchronously, not both */ | ||
| 1164 | BUG_ON(atomic && async); | ||
| 1145 | 1165 | ||
| 1166 | if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) | ||
| 1167 | return pfn; | ||
| 1168 | |||
| 1169 | if (atomic) | ||
| 1170 | return KVM_PFN_ERR_FAULT; | ||
| 1171 | |||
| 1172 | npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); | ||
| 1173 | if (npages == 1) | ||
| 1174 | return pfn; | ||
| 1175 | |||
| 1176 | down_read(¤t->mm->mmap_sem); | ||
| 1177 | if (npages == -EHWPOISON || | ||
| 1178 | (!async && check_user_page_hwpoison(addr))) { | ||
| 1179 | pfn = KVM_PFN_ERR_HWPOISON; | ||
| 1180 | goto exit; | ||
| 1181 | } | ||
| 1182 | |||
| 1183 | vma = find_vma_intersection(current->mm, addr, addr + 1); | ||
| 1184 | |||
| 1185 | if (vma == NULL) | ||
| 1186 | pfn = KVM_PFN_ERR_FAULT; | ||
| 1187 | else if ((vma->vm_flags & VM_PFNMAP)) { | ||
| 1188 | pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + | ||
| 1189 | vma->vm_pgoff; | ||
| 1190 | BUG_ON(!kvm_is_mmio_pfn(pfn)); | ||
| 1191 | } else { | ||
| 1192 | if (async && vma_is_valid(vma, write_fault)) | ||
| 1193 | *async = true; | ||
| 1194 | pfn = KVM_PFN_ERR_FAULT; | ||
| 1195 | } | ||
| 1196 | exit: | ||
| 1197 | up_read(¤t->mm->mmap_sem); | ||
| 1146 | return pfn; | 1198 | return pfn; |
| 1147 | } | 1199 | } |
| 1148 | 1200 | ||
| 1149 | pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) | 1201 | static pfn_t |
| 1202 | __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, | ||
| 1203 | bool *async, bool write_fault, bool *writable) | ||
| 1150 | { | 1204 | { |
| 1151 | return hva_to_pfn(kvm, addr, true, NULL, true, NULL); | 1205 | unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); |
| 1206 | |||
| 1207 | if (addr == KVM_HVA_ERR_RO_BAD) | ||
| 1208 | return KVM_PFN_ERR_RO_FAULT; | ||
| 1209 | |||
| 1210 | if (kvm_is_error_hva(addr)) | ||
| 1211 | return KVM_PFN_ERR_BAD; | ||
| 1212 | |||
| 1213 | /* Do not map writable pfn in the readonly memslot. */ | ||
| 1214 | if (writable && memslot_is_readonly(slot)) { | ||
| 1215 | *writable = false; | ||
| 1216 | writable = NULL; | ||
| 1217 | } | ||
| 1218 | |||
| 1219 | return hva_to_pfn(addr, atomic, async, write_fault, | ||
| 1220 | writable); | ||
| 1152 | } | 1221 | } |
| 1153 | EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); | ||
| 1154 | 1222 | ||
| 1155 | static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, | 1223 | static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, |
| 1156 | bool write_fault, bool *writable) | 1224 | bool write_fault, bool *writable) |
| 1157 | { | 1225 | { |
| 1158 | unsigned long addr; | 1226 | struct kvm_memory_slot *slot; |
| 1159 | 1227 | ||
| 1160 | if (async) | 1228 | if (async) |
| 1161 | *async = false; | 1229 | *async = false; |
| 1162 | 1230 | ||
| 1163 | addr = gfn_to_hva(kvm, gfn); | 1231 | slot = gfn_to_memslot(kvm, gfn); |
| 1164 | if (kvm_is_error_hva(addr)) { | ||
| 1165 | get_page(bad_page); | ||
| 1166 | return page_to_pfn(bad_page); | ||
| 1167 | } | ||
| 1168 | 1232 | ||
| 1169 | return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); | 1233 | return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault, |
| 1234 | writable); | ||
| 1170 | } | 1235 | } |
| 1171 | 1236 | ||
| 1172 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) | 1237 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) |
| @@ -1195,12 +1260,16 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, | |||
| 1195 | } | 1260 | } |
| 1196 | EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); | 1261 | EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); |
| 1197 | 1262 | ||
| 1198 | pfn_t gfn_to_pfn_memslot(struct kvm *kvm, | 1263 | pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) |
| 1199 | struct kvm_memory_slot *slot, gfn_t gfn) | 1264 | { |
| 1265 | return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); | ||
| 1266 | } | ||
| 1267 | |||
| 1268 | pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) | ||
| 1200 | { | 1269 | { |
| 1201 | unsigned long addr = gfn_to_hva_memslot(slot, gfn); | 1270 | return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); |
| 1202 | return hva_to_pfn(kvm, addr, false, NULL, true, NULL); | ||
| 1203 | } | 1271 | } |
| 1272 | EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); | ||
| 1204 | 1273 | ||
| 1205 | int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, | 1274 | int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, |
| 1206 | int nr_pages) | 1275 | int nr_pages) |
| @@ -1219,30 +1288,42 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, | |||
| 1219 | } | 1288 | } |
| 1220 | EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); | 1289 | EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); |
| 1221 | 1290 | ||
| 1291 | static struct page *kvm_pfn_to_page(pfn_t pfn) | ||
| 1292 | { | ||
| 1293 | if (is_error_pfn(pfn)) | ||
| 1294 | return KVM_ERR_PTR_BAD_PAGE; | ||
| 1295 | |||
| 1296 | if (kvm_is_mmio_pfn(pfn)) { | ||
| 1297 | WARN_ON(1); | ||
| 1298 | return KVM_ERR_PTR_BAD_PAGE; | ||
| 1299 | } | ||
| 1300 | |||
| 1301 | return pfn_to_page(pfn); | ||
| 1302 | } | ||
| 1303 | |||
| 1222 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) | 1304 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) |
| 1223 | { | 1305 | { |
| 1224 | pfn_t pfn; | 1306 | pfn_t pfn; |
| 1225 | 1307 | ||
| 1226 | pfn = gfn_to_pfn(kvm, gfn); | 1308 | pfn = gfn_to_pfn(kvm, gfn); |
| 1227 | if (!kvm_is_mmio_pfn(pfn)) | ||
| 1228 | return pfn_to_page(pfn); | ||
| 1229 | |||
| 1230 | WARN_ON(kvm_is_mmio_pfn(pfn)); | ||
| 1231 | 1309 | ||
| 1232 | get_page(bad_page); | 1310 | return kvm_pfn_to_page(pfn); |
| 1233 | return bad_page; | ||
| 1234 | } | 1311 | } |
| 1235 | 1312 | ||
| 1236 | EXPORT_SYMBOL_GPL(gfn_to_page); | 1313 | EXPORT_SYMBOL_GPL(gfn_to_page); |
| 1237 | 1314 | ||
| 1238 | void kvm_release_page_clean(struct page *page) | 1315 | void kvm_release_page_clean(struct page *page) |
| 1239 | { | 1316 | { |
| 1317 | WARN_ON(is_error_page(page)); | ||
| 1318 | |||
| 1240 | kvm_release_pfn_clean(page_to_pfn(page)); | 1319 | kvm_release_pfn_clean(page_to_pfn(page)); |
| 1241 | } | 1320 | } |
| 1242 | EXPORT_SYMBOL_GPL(kvm_release_page_clean); | 1321 | EXPORT_SYMBOL_GPL(kvm_release_page_clean); |
| 1243 | 1322 | ||
| 1244 | void kvm_release_pfn_clean(pfn_t pfn) | 1323 | void kvm_release_pfn_clean(pfn_t pfn) |
| 1245 | { | 1324 | { |
| 1325 | WARN_ON(is_error_pfn(pfn)); | ||
| 1326 | |||
| 1246 | if (!kvm_is_mmio_pfn(pfn)) | 1327 | if (!kvm_is_mmio_pfn(pfn)) |
| 1247 | put_page(pfn_to_page(pfn)); | 1328 | put_page(pfn_to_page(pfn)); |
| 1248 | } | 1329 | } |
| @@ -1250,6 +1331,8 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); | |||
| 1250 | 1331 | ||
| 1251 | void kvm_release_page_dirty(struct page *page) | 1332 | void kvm_release_page_dirty(struct page *page) |
| 1252 | { | 1333 | { |
| 1334 | WARN_ON(is_error_page(page)); | ||
| 1335 | |||
| 1253 | kvm_release_pfn_dirty(page_to_pfn(page)); | 1336 | kvm_release_pfn_dirty(page_to_pfn(page)); |
| 1254 | } | 1337 | } |
| 1255 | EXPORT_SYMBOL_GPL(kvm_release_page_dirty); | 1338 | EXPORT_SYMBOL_GPL(kvm_release_page_dirty); |
| @@ -1305,10 +1388,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, | |||
| 1305 | int r; | 1388 | int r; |
| 1306 | unsigned long addr; | 1389 | unsigned long addr; |
| 1307 | 1390 | ||
| 1308 | addr = gfn_to_hva(kvm, gfn); | 1391 | addr = gfn_to_hva_read(kvm, gfn); |
| 1309 | if (kvm_is_error_hva(addr)) | 1392 | if (kvm_is_error_hva(addr)) |
| 1310 | return -EFAULT; | 1393 | return -EFAULT; |
| 1311 | r = __copy_from_user(data, (void __user *)addr + offset, len); | 1394 | r = kvm_read_hva(data, (void __user *)addr + offset, len); |
| 1312 | if (r) | 1395 | if (r) |
| 1313 | return -EFAULT; | 1396 | return -EFAULT; |
| 1314 | return 0; | 1397 | return 0; |
| @@ -1343,11 +1426,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, | |||
| 1343 | gfn_t gfn = gpa >> PAGE_SHIFT; | 1426 | gfn_t gfn = gpa >> PAGE_SHIFT; |
| 1344 | int offset = offset_in_page(gpa); | 1427 | int offset = offset_in_page(gpa); |
| 1345 | 1428 | ||
| 1346 | addr = gfn_to_hva(kvm, gfn); | 1429 | addr = gfn_to_hva_read(kvm, gfn); |
| 1347 | if (kvm_is_error_hva(addr)) | 1430 | if (kvm_is_error_hva(addr)) |
| 1348 | return -EFAULT; | 1431 | return -EFAULT; |
| 1349 | pagefault_disable(); | 1432 | pagefault_disable(); |
| 1350 | r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); | 1433 | r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len); |
| 1351 | pagefault_enable(); | 1434 | pagefault_enable(); |
| 1352 | if (r) | 1435 | if (r) |
| 1353 | return -EFAULT; | 1436 | return -EFAULT; |
| @@ -1580,6 +1663,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target) | |||
| 1580 | } | 1663 | } |
| 1581 | EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); | 1664 | EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); |
| 1582 | 1665 | ||
| 1666 | #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT | ||
| 1667 | /* | ||
| 1668 | * Helper that checks whether a VCPU is eligible for directed yield. | ||
| 1669 | * Most eligible candidate to yield is decided by following heuristics: | ||
| 1670 | * | ||
| 1671 | * (a) VCPU which has not done pl-exit or cpu relax intercepted recently | ||
| 1672 | * (preempted lock holder), indicated by @in_spin_loop. | ||
| 1673 | * Set at the beiginning and cleared at the end of interception/PLE handler. | ||
| 1674 | * | ||
| 1675 | * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get | ||
| 1676 | * chance last time (mostly it has become eligible now since we have probably | ||
| 1677 | * yielded to lockholder in last iteration. This is done by toggling | ||
| 1678 | * @dy_eligible each time a VCPU checked for eligibility.) | ||
| 1679 | * | ||
| 1680 | * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding | ||
| 1681 | * to preempted lock-holder could result in wrong VCPU selection and CPU | ||
| 1682 | * burning. Giving priority for a potential lock-holder increases lock | ||
| 1683 | * progress. | ||
| 1684 | * | ||
| 1685 | * Since algorithm is based on heuristics, accessing another VCPU data without | ||
| 1686 | * locking does not harm. It may result in trying to yield to same VCPU, fail | ||
| 1687 | * and continue with next VCPU and so on. | ||
| 1688 | */ | ||
| 1689 | bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) | ||
| 1690 | { | ||
| 1691 | bool eligible; | ||
| 1692 | |||
| 1693 | eligible = !vcpu->spin_loop.in_spin_loop || | ||
| 1694 | (vcpu->spin_loop.in_spin_loop && | ||
| 1695 | vcpu->spin_loop.dy_eligible); | ||
| 1696 | |||
| 1697 | if (vcpu->spin_loop.in_spin_loop) | ||
| 1698 | kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); | ||
| 1699 | |||
| 1700 | return eligible; | ||
| 1701 | } | ||
| 1702 | #endif | ||
| 1583 | void kvm_vcpu_on_spin(struct kvm_vcpu *me) | 1703 | void kvm_vcpu_on_spin(struct kvm_vcpu *me) |
| 1584 | { | 1704 | { |
| 1585 | struct kvm *kvm = me->kvm; | 1705 | struct kvm *kvm = me->kvm; |
| @@ -1589,6 +1709,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) | |||
| 1589 | int pass; | 1709 | int pass; |
| 1590 | int i; | 1710 | int i; |
| 1591 | 1711 | ||
| 1712 | kvm_vcpu_set_in_spin_loop(me, true); | ||
| 1592 | /* | 1713 | /* |
| 1593 | * We boost the priority of a VCPU that is runnable but not | 1714 | * We boost the priority of a VCPU that is runnable but not |
| 1594 | * currently running, because it got preempted by something | 1715 | * currently running, because it got preempted by something |
| @@ -1607,6 +1728,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) | |||
| 1607 | continue; | 1728 | continue; |
| 1608 | if (waitqueue_active(&vcpu->wq)) | 1729 | if (waitqueue_active(&vcpu->wq)) |
| 1609 | continue; | 1730 | continue; |
| 1731 | if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) | ||
| 1732 | continue; | ||
| 1610 | if (kvm_vcpu_yield_to(vcpu)) { | 1733 | if (kvm_vcpu_yield_to(vcpu)) { |
| 1611 | kvm->last_boosted_vcpu = i; | 1734 | kvm->last_boosted_vcpu = i; |
| 1612 | yielded = 1; | 1735 | yielded = 1; |
| @@ -1614,6 +1737,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) | |||
| 1614 | } | 1737 | } |
| 1615 | } | 1738 | } |
| 1616 | } | 1739 | } |
| 1740 | kvm_vcpu_set_in_spin_loop(me, false); | ||
| 1741 | |||
| 1742 | /* Ensure vcpu is not eligible during next spinloop */ | ||
| 1743 | kvm_vcpu_set_dy_eligible(me, false); | ||
| 1617 | } | 1744 | } |
| 1618 | EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); | 1745 | EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); |
| 1619 | 1746 | ||
| @@ -1766,7 +1893,9 @@ static long kvm_vcpu_ioctl(struct file *filp, | |||
| 1766 | #endif | 1893 | #endif |
| 1767 | 1894 | ||
| 1768 | 1895 | ||
| 1769 | vcpu_load(vcpu); | 1896 | r = vcpu_load(vcpu); |
| 1897 | if (r) | ||
| 1898 | return r; | ||
| 1770 | switch (ioctl) { | 1899 | switch (ioctl) { |
| 1771 | case KVM_RUN: | 1900 | case KVM_RUN: |
| 1772 | r = -EINVAL; | 1901 | r = -EINVAL; |
| @@ -2094,6 +2223,29 @@ static long kvm_vm_ioctl(struct file *filp, | |||
| 2094 | break; | 2223 | break; |
| 2095 | } | 2224 | } |
| 2096 | #endif | 2225 | #endif |
| 2226 | #ifdef __KVM_HAVE_IRQ_LINE | ||
| 2227 | case KVM_IRQ_LINE_STATUS: | ||
| 2228 | case KVM_IRQ_LINE: { | ||
| 2229 | struct kvm_irq_level irq_event; | ||
| 2230 | |||
| 2231 | r = -EFAULT; | ||
| 2232 | if (copy_from_user(&irq_event, argp, sizeof irq_event)) | ||
| 2233 | goto out; | ||
| 2234 | |||
| 2235 | r = kvm_vm_ioctl_irq_line(kvm, &irq_event); | ||
| 2236 | if (r) | ||
| 2237 | goto out; | ||
| 2238 | |||
| 2239 | r = -EFAULT; | ||
| 2240 | if (ioctl == KVM_IRQ_LINE_STATUS) { | ||
| 2241 | if (copy_to_user(argp, &irq_event, sizeof irq_event)) | ||
| 2242 | goto out; | ||
| 2243 | } | ||
| 2244 | |||
| 2245 | r = 0; | ||
| 2246 | break; | ||
| 2247 | } | ||
| 2248 | #endif | ||
| 2097 | default: | 2249 | default: |
| 2098 | r = kvm_arch_vm_ioctl(filp, ioctl, arg); | 2250 | r = kvm_arch_vm_ioctl(filp, ioctl, arg); |
| 2099 | if (r == -ENOTTY) | 2251 | if (r == -ENOTTY) |
| @@ -2698,9 +2850,6 @@ static struct syscore_ops kvm_syscore_ops = { | |||
| 2698 | .resume = kvm_resume, | 2850 | .resume = kvm_resume, |
| 2699 | }; | 2851 | }; |
| 2700 | 2852 | ||
| 2701 | struct page *bad_page; | ||
| 2702 | pfn_t bad_pfn; | ||
| 2703 | |||
| 2704 | static inline | 2853 | static inline |
| 2705 | struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) | 2854 | struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) |
| 2706 | { | 2855 | { |
| @@ -2732,33 +2881,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, | |||
| 2732 | if (r) | 2881 | if (r) |
| 2733 | goto out_fail; | 2882 | goto out_fail; |
| 2734 | 2883 | ||
| 2735 | bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
| 2736 | |||
| 2737 | if (bad_page == NULL) { | ||
| 2738 | r = -ENOMEM; | ||
| 2739 | goto out; | ||
| 2740 | } | ||
| 2741 | |||
| 2742 | bad_pfn = page_to_pfn(bad_page); | ||
| 2743 | |||
| 2744 | hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
| 2745 | |||
| 2746 | if (hwpoison_page == NULL) { | ||
| 2747 | r = -ENOMEM; | ||
| 2748 | goto out_free_0; | ||
| 2749 | } | ||
| 2750 | |||
| 2751 | hwpoison_pfn = page_to_pfn(hwpoison_page); | ||
| 2752 | |||
| 2753 | fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
| 2754 | |||
| 2755 | if (fault_page == NULL) { | ||
| 2756 | r = -ENOMEM; | ||
| 2757 | goto out_free_0; | ||
| 2758 | } | ||
| 2759 | |||
| 2760 | fault_pfn = page_to_pfn(fault_page); | ||
| 2761 | |||
| 2762 | if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { | 2884 | if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { |
| 2763 | r = -ENOMEM; | 2885 | r = -ENOMEM; |
| 2764 | goto out_free_0; | 2886 | goto out_free_0; |
| @@ -2833,12 +2955,6 @@ out_free_1: | |||
| 2833 | out_free_0a: | 2955 | out_free_0a: |
| 2834 | free_cpumask_var(cpus_hardware_enabled); | 2956 | free_cpumask_var(cpus_hardware_enabled); |
| 2835 | out_free_0: | 2957 | out_free_0: |
| 2836 | if (fault_page) | ||
| 2837 | __free_page(fault_page); | ||
| 2838 | if (hwpoison_page) | ||
| 2839 | __free_page(hwpoison_page); | ||
| 2840 | __free_page(bad_page); | ||
| 2841 | out: | ||
| 2842 | kvm_arch_exit(); | 2958 | kvm_arch_exit(); |
| 2843 | out_fail: | 2959 | out_fail: |
| 2844 | return r; | 2960 | return r; |
| @@ -2858,8 +2974,5 @@ void kvm_exit(void) | |||
| 2858 | kvm_arch_hardware_unsetup(); | 2974 | kvm_arch_hardware_unsetup(); |
| 2859 | kvm_arch_exit(); | 2975 | kvm_arch_exit(); |
| 2860 | free_cpumask_var(cpus_hardware_enabled); | 2976 | free_cpumask_var(cpus_hardware_enabled); |
| 2861 | __free_page(fault_page); | ||
| 2862 | __free_page(hwpoison_page); | ||
| 2863 | __free_page(bad_page); | ||
| 2864 | } | 2977 | } |
| 2865 | EXPORT_SYMBOL_GPL(kvm_exit); | 2978 | EXPORT_SYMBOL_GPL(kvm_exit); |
