author    Linus Torvalds <torvalds@linux-foundation.org>    2013-11-14 23:51:36 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2013-11-14 23:51:36 -0500
commit    f080480488028bcc25357f85e8ae54ccc3bb7173 (patch)
tree      8fcc943f16d26c795b3b6324b478af2d5a30285d /virt/kvm
parent    eda670c626a4f53eb8ac5f20d8c10d3f0b54c583 (diff)
parent    e504c9098ed6acd9e1079c5e10e4910724ad429f (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM changes from Paolo Bonzini:
 "Here are the 3.13 KVM changes.  There was a lot of work on the PPC
  side: the HV and emulation flavors can now coexist in a single kernel
  is probably the most interesting change from a user point of view.
  On the x86 side there are nested virtualization improvements and a
  few bugfixes.  ARM got transparent huge page support, improved
  overcommit, and support for big endian guests.

  Finally, there is a new interface to connect KVM with VFIO.  This
  helps with devices that use NoSnoop PCI transactions, letting the
  driver in the guest execute WBINVD instructions.  This includes some
  nVidia cards on Windows, that fail to start without these patches and
  the corresponding userspace changes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (146 commits)
  kvm, vmx: Fix lazy FPU on nested guest
  arm/arm64: KVM: PSCI: propagate caller endianness to the incoming vcpu
  arm/arm64: KVM: MMIO support for BE guest
  kvm, cpuid: Fix sparse warning
  kvm: Delete prototype for non-existent function kvm_check_iopl
  kvm: Delete prototype for non-existent function complete_pio
  hung_task: add method to reset detector
  pvclock: detect watchdog reset at pvclock read
  kvm: optimize out smp_mb after srcu_read_unlock
  srcu: API for barrier after srcu read unlock
  KVM: remove vm mmap method
  KVM: IOMMU: hva align mapping page size
  KVM: x86: trace cpuid emulation when called from emulator
  KVM: emulator: cleanup decode_register_operand() a bit
  KVM: emulator: check rex prefix inside decode_register()
  KVM: x86: fix emulation of "movzbl %bpl, %eax"
  kvm_host: typo fix
  KVM: x86: emulate SAHF instruction
  MAINTAINERS: add tree for kvm.git
  Documentation/kvm: add a 00-INDEX file
  ...
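Since this merge introduces the KVM-VFIO bridge device, a minimal userspace sketch may help show how the new interface is consumed through the existing device-control API. This is illustrative only, not code from the pull: it assumes vm_fd came from KVM_CREATE_VM and group_fd from an open VFIO group file descriptor; the function name is invented.

/* Illustrative sketch: attach a VFIO group to a VM via the new
 * KVM-VFIO pseudo device.  The kernel returns the device fd in cd.fd;
 * attr.addr points at an int32_t holding the group fd, matching what
 * kvm_vfio_set_group() reads with get_user(). */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int kvm_vfio_attach_group(int vm_fd, int group_fd)
{
        struct kvm_create_device cd = { .type = KVM_DEV_TYPE_VFIO };
        struct kvm_device_attr attr = {
                .group = KVM_DEV_VFIO_GROUP,
                .attr  = KVM_DEV_VFIO_GROUP_ADD,
                .addr  = (__u64)(unsigned long)&group_fd,
        };

        if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
                return -1;      /* device control API unavailable */

        return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
}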
Diffstat (limited to 'virt/kvm')
-rw-r--r--   virt/kvm/Kconfig     |    3
-rw-r--r--   virt/kvm/async_pf.c  |   22
-rw-r--r--   virt/kvm/iommu.c     |   38
-rw-r--r--   virt/kvm/kvm_main.c  |  134
-rw-r--r--   virt/kvm/vfio.c      |  264
5 files changed, 348 insertions(+), 113 deletions(-)
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 779262f59e25..fbe1a48bd629 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -27,3 +27,6 @@ config HAVE_KVM_MSI
 
 config HAVE_KVM_CPU_RELAX_INTERCEPT
        bool
+
+config KVM_VFIO
+       bool
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 8a39dda7a325..8631d9c14320 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -56,7 +56,6 @@ void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
 
 static void async_pf_execute(struct work_struct *work)
 {
-        struct page *page = NULL;
         struct kvm_async_pf *apf =
                 container_of(work, struct kvm_async_pf, work);
         struct mm_struct *mm = apf->mm;
@@ -68,14 +67,12 @@ static void async_pf_execute(struct work_struct *work)
 
         use_mm(mm);
         down_read(&mm->mmap_sem);
-        get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL);
+        get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL);
         up_read(&mm->mmap_sem);
         unuse_mm(mm);
 
         spin_lock(&vcpu->async_pf.lock);
         list_add_tail(&apf->link, &vcpu->async_pf.done);
-        apf->page = page;
-        apf->done = true;
         spin_unlock(&vcpu->async_pf.lock);
 
         /*
@@ -83,7 +80,7 @@ static void async_pf_execute(struct work_struct *work)
          * this point
          */
 
-        trace_kvm_async_pf_completed(addr, page, gva);
+        trace_kvm_async_pf_completed(addr, gva);
 
         if (waitqueue_active(&vcpu->wq))
                 wake_up_interruptible(&vcpu->wq);
@@ -99,9 +96,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
                 struct kvm_async_pf *work =
                         list_entry(vcpu->async_pf.queue.next,
                                    typeof(*work), queue);
-                cancel_work_sync(&work->work);
                 list_del(&work->queue);
-                if (!work->done) { /* work was canceled */
+                if (cancel_work_sync(&work->work)) {
                         mmdrop(work->mm);
                         kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */
                         kmem_cache_free(async_pf_cache, work);
@@ -114,8 +110,6 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
                         list_entry(vcpu->async_pf.done.next,
                                    typeof(*work), link);
                 list_del(&work->link);
-                if (!is_error_page(work->page))
-                        kvm_release_page_clean(work->page);
                 kmem_cache_free(async_pf_cache, work);
         }
         spin_unlock(&vcpu->async_pf.lock);
@@ -135,14 +129,11 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
                 list_del(&work->link);
                 spin_unlock(&vcpu->async_pf.lock);
 
-                if (work->page)
-                        kvm_arch_async_page_ready(vcpu, work);
+                kvm_arch_async_page_ready(vcpu, work);
                 kvm_arch_async_page_present(vcpu, work);
 
                 list_del(&work->queue);
                 vcpu->async_pf.queued--;
-                if (!is_error_page(work->page))
-                        kvm_release_page_clean(work->page);
                 kmem_cache_free(async_pf_cache, work);
         }
 }
@@ -165,8 +156,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
         if (!work)
                 return 0;
 
-        work->page = NULL;
-        work->done = false;
+        work->wakeup_all = false;
         work->vcpu = vcpu;
         work->gva = gva;
         work->addr = gfn_to_hva(vcpu->kvm, gfn);
@@ -206,7 +196,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
         if (!work)
                 return -ENOMEM;
 
-        work->page = KVM_ERR_PTR_BAD_PAGE;
+        work->wakeup_all = true;
         INIT_LIST_HEAD(&work->queue); /* for list_del to work */
 
         spin_lock(&vcpu->async_pf.lock);
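One subtlety in the async_pf rework above: with work->page and work->done gone, cancellation now keys off cancel_work_sync()'s boolean return, which is true only when the work item was still pending, i.e. async_pf_execute() never ran. A condensed sketch of that ownership rule, assuming the surrounding async_pf context (the helper name flush_one is invented):

/* Illustrative only: whoever wins the race cleans up.  If
 * cancel_work_sync() returns true the callback never ran, so the
 * canceller must drop the references the worker would otherwise have
 * dropped; if it returns false, the completion path owns the cleanup. */
static void flush_one(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
        if (cancel_work_sync(&work->work)) {
                mmdrop(work->mm);
                kvm_put_kvm(vcpu->kvm);         /* == work->vcpu->kvm */
                kmem_cache_free(async_pf_cache, work);
        }
}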
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 72a130bc448a..0df7d4b34dfe 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -79,7 +79,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
         flags = IOMMU_READ;
         if (!(slot->flags & KVM_MEM_READONLY))
                 flags |= IOMMU_WRITE;
-        if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
+        if (!kvm->arch.iommu_noncoherent)
                 flags |= IOMMU_CACHE;
 
 
@@ -103,6 +103,10 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
                 while ((gfn << PAGE_SHIFT) & (page_size - 1))
                         page_size >>= 1;
 
+                /* Make sure hva is aligned to the page size we want to map */
+                while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
+                        page_size >>= 1;
+
                 /*
                  * Pin all pages we are about to map in memory. This is
                  * important because we unmap and unpin in 4kb steps later.
@@ -140,6 +144,9 @@ static int kvm_iommu_map_memslots(struct kvm *kvm)
         struct kvm_memslots *slots;
         struct kvm_memory_slot *memslot;
 
+        if (kvm->arch.iommu_noncoherent)
+                kvm_arch_register_noncoherent_dma(kvm);
+
         idx = srcu_read_lock(&kvm->srcu);
         slots = kvm_memslots(kvm);
 
@@ -158,7 +165,8 @@ int kvm_assign_device(struct kvm *kvm,
 {
         struct pci_dev *pdev = NULL;
         struct iommu_domain *domain = kvm->arch.iommu_domain;
-        int r, last_flags;
+        int r;
+        bool noncoherent;
 
         /* check if iommu exists and in use */
         if (!domain)
@@ -174,15 +182,13 @@ int kvm_assign_device(struct kvm *kvm,
                 return r;
         }
 
-        last_flags = kvm->arch.iommu_flags;
-        if (iommu_domain_has_cap(kvm->arch.iommu_domain,
-                                 IOMMU_CAP_CACHE_COHERENCY))
-                kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY;
+        noncoherent = !iommu_domain_has_cap(kvm->arch.iommu_domain,
+                                            IOMMU_CAP_CACHE_COHERENCY);
 
         /* Check if need to update IOMMU page table for guest memory */
-        if ((last_flags ^ kvm->arch.iommu_flags) ==
-                        KVM_IOMMU_CACHE_COHERENCY) {
+        if (noncoherent != kvm->arch.iommu_noncoherent) {
                 kvm_iommu_unmap_memslots(kvm);
+                kvm->arch.iommu_noncoherent = noncoherent;
                 r = kvm_iommu_map_memslots(kvm);
                 if (r)
                         goto out_unmap;
@@ -190,11 +196,7 @@ int kvm_assign_device(struct kvm *kvm,
 
         pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED;
 
-        printk(KERN_DEBUG "assign device %x:%x:%x.%x\n",
-                assigned_dev->host_segnr,
-                assigned_dev->host_busnr,
-                PCI_SLOT(assigned_dev->host_devfn),
-                PCI_FUNC(assigned_dev->host_devfn));
+        dev_info(&pdev->dev, "kvm assign device\n");
 
         return 0;
 out_unmap:
@@ -220,11 +222,7 @@ int kvm_deassign_device(struct kvm *kvm,
 
         pdev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;
 
-        printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n",
-                assigned_dev->host_segnr,
-                assigned_dev->host_busnr,
-                PCI_SLOT(assigned_dev->host_devfn),
-                PCI_FUNC(assigned_dev->host_devfn));
+        dev_info(&pdev->dev, "kvm deassign device\n");
 
         return 0;
 }
@@ -336,6 +334,9 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm)
 
         srcu_read_unlock(&kvm->srcu, idx);
 
+        if (kvm->arch.iommu_noncoherent)
+                kvm_arch_unregister_noncoherent_dma(kvm);
+
         return 0;
 }
 
@@ -350,6 +351,7 @@ int kvm_iommu_unmap_guest(struct kvm *kvm)
         mutex_lock(&kvm->slots_lock);
         kvm_iommu_unmap_memslots(kvm);
         kvm->arch.iommu_domain = NULL;
+        kvm->arch.iommu_noncoherent = false;
         mutex_unlock(&kvm->slots_lock);
 
         iommu_domain_free(domain);
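For reference, the coherency rule the iommu.c changes keep intact while swapping the flag bit for a plain boolean: guest memory may only be mapped with IOMMU_CACHE when the domain can enforce snooping. A standalone sketch of that flag derivation, using the 3.13-era iommu_domain_has_cap() API (the helper name slot_prot is invented):

#include <linux/iommu.h>

/* Sketch: derive IOMMU mapping protection bits from domain coherency.
 * Noncoherent domains must not get IOMMU_CACHE; the VM then has to
 * honor guest WBINVD, which is what the noncoherent-DMA registration
 * added in this series tracks. */
static int slot_prot(struct iommu_domain *domain, bool readonly)
{
        int prot = IOMMU_READ;

        if (!readonly)
                prot |= IOMMU_WRITE;
        if (iommu_domain_has_cap(domain, IOMMU_CAP_CACHE_COHERENCY))
                prot |= IOMMU_CACHE;

        return prot;
}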
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1cf9ccb01013..662f34c3287e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -70,7 +70,8 @@ MODULE_LICENSE("GPL");
  *              kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  */
 
-DEFINE_RAW_SPINLOCK(kvm_lock);
+DEFINE_SPINLOCK(kvm_lock);
+static DEFINE_RAW_SPINLOCK(kvm_count_lock);
 LIST_HEAD(vm_list);
 
 static cpumask_var_t cpus_hardware_enabled;
@@ -186,6 +187,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
         ++kvm->stat.remote_tlb_flush;
         cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
+EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
@@ -490,9 +492,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
         if (r)
                 goto out_err;
 
-        raw_spin_lock(&kvm_lock);
+        spin_lock(&kvm_lock);
         list_add(&kvm->vm_list, &vm_list);
-        raw_spin_unlock(&kvm_lock);
+        spin_unlock(&kvm_lock);
 
         return kvm;
 
@@ -540,13 +542,13 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 /*
  * Free any memory in @free but not in @dont.
  */
-static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
+static void kvm_free_physmem_slot(struct kvm *kvm, struct kvm_memory_slot *free,
                                   struct kvm_memory_slot *dont)
 {
         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
                 kvm_destroy_dirty_bitmap(free);
 
-        kvm_arch_free_memslot(free, dont);
+        kvm_arch_free_memslot(kvm, free, dont);
 
         free->npages = 0;
 }
@@ -557,7 +559,7 @@ void kvm_free_physmem(struct kvm *kvm)
         struct kvm_memory_slot *memslot;
 
         kvm_for_each_memslot(memslot, slots)
-                kvm_free_physmem_slot(memslot, NULL);
+                kvm_free_physmem_slot(kvm, memslot, NULL);
 
         kfree(kvm->memslots);
 }
@@ -581,9 +583,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
         struct mm_struct *mm = kvm->mm;
 
         kvm_arch_sync_events(kvm);
-        raw_spin_lock(&kvm_lock);
+        spin_lock(&kvm_lock);
         list_del(&kvm->vm_list);
-        raw_spin_unlock(&kvm_lock);
+        spin_unlock(&kvm_lock);
         kvm_free_irq_routing(kvm);
         for (i = 0; i < KVM_NR_BUSES; i++)
                 kvm_io_bus_destroy(kvm->buses[i]);
@@ -821,7 +823,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
         if (change == KVM_MR_CREATE) {
                 new.userspace_addr = mem->userspace_addr;
 
-                if (kvm_arch_create_memslot(&new, npages))
+                if (kvm_arch_create_memslot(kvm, &new, npages))
                         goto out_free;
         }
 
@@ -872,6 +874,19 @@ int __kvm_set_memory_region(struct kvm *kvm,
                         goto out_free;
         }
 
+        /* actual memory is freed via old in kvm_free_physmem_slot below */
+        if (change == KVM_MR_DELETE) {
+                new.dirty_bitmap = NULL;
+                memset(&new.arch, 0, sizeof(new.arch));
+        }
+
+        old_memslots = install_new_memslots(kvm, slots, &new);
+
+        kvm_arch_commit_memory_region(kvm, mem, &old, change);
+
+        kvm_free_physmem_slot(kvm, &old, &new);
+        kfree(old_memslots);
+
         /*
          * IOMMU mapping:  New slots need to be mapped.  Old slots need to be
          * un-mapped and re-mapped if their base changes.  Since base change
@@ -883,29 +898,15 @@ int __kvm_set_memory_region(struct kvm *kvm,
          */
         if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
                 r = kvm_iommu_map_pages(kvm, &new);
-                if (r)
-                        goto out_slots;
-        }
-
-        /* actual memory is freed via old in kvm_free_physmem_slot below */
-        if (change == KVM_MR_DELETE) {
-                new.dirty_bitmap = NULL;
-                memset(&new.arch, 0, sizeof(new.arch));
+                return r;
         }
 
-        old_memslots = install_new_memslots(kvm, slots, &new);
-
-        kvm_arch_commit_memory_region(kvm, mem, &old, change);
-
-        kvm_free_physmem_slot(&old, &new);
-        kfree(old_memslots);
-
         return 0;
 
 out_slots:
         kfree(slots);
 out_free:
-        kvm_free_physmem_slot(&new, &old);
+        kvm_free_physmem_slot(kvm, &new, &old);
 out:
         return r;
 }
@@ -964,6 +965,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
 out:
         return r;
 }
+EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 
 bool kvm_largepages_enabled(void)
 {
@@ -1654,6 +1656,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
         memslot = gfn_to_memslot(kvm, gfn);
         mark_page_dirty_in_slot(kvm, memslot, gfn);
 }
+EXPORT_SYMBOL_GPL(mark_page_dirty);
 
 /*
  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
@@ -1679,6 +1682,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 
         finish_wait(&vcpu->wq, &wait);
 }
+EXPORT_SYMBOL_GPL(kvm_vcpu_block);
 
 #ifndef CONFIG_S390
 /*
@@ -2271,6 +2275,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
                 ops = &kvm_xics_ops;
                 break;
 #endif
+#ifdef CONFIG_KVM_VFIO
+        case KVM_DEV_TYPE_VFIO:
+                ops = &kvm_vfio_ops;
+                break;
+#endif
         default:
                 return -ENODEV;
         }
@@ -2519,44 +2528,12 @@ out:
 }
 #endif
 
-static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-        struct page *page[1];
-        unsigned long addr;
-        int npages;
-        gfn_t gfn = vmf->pgoff;
-        struct kvm *kvm = vma->vm_file->private_data;
-
-        addr = gfn_to_hva(kvm, gfn);
-        if (kvm_is_error_hva(addr))
-                return VM_FAULT_SIGBUS;
-
-        npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
-                                NULL);
-        if (unlikely(npages != 1))
-                return VM_FAULT_SIGBUS;
-
-        vmf->page = page[0];
-        return 0;
-}
-
-static const struct vm_operations_struct kvm_vm_vm_ops = {
-        .fault = kvm_vm_fault,
-};
-
-static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
-{
-        vma->vm_ops = &kvm_vm_vm_ops;
-        return 0;
-}
-
 static struct file_operations kvm_vm_fops = {
         .release        = kvm_vm_release,
         .unlocked_ioctl = kvm_vm_ioctl,
 #ifdef CONFIG_COMPAT
         .compat_ioctl   = kvm_vm_compat_ioctl,
 #endif
-        .mmap           = kvm_vm_mmap,
         .llseek         = noop_llseek,
 };
 
@@ -2683,11 +2660,12 @@ static void hardware_enable_nolock(void *junk)
         }
 }
 
-static void hardware_enable(void *junk)
+static void hardware_enable(void)
 {
-        raw_spin_lock(&kvm_lock);
-        hardware_enable_nolock(junk);
-        raw_spin_unlock(&kvm_lock);
+        raw_spin_lock(&kvm_count_lock);
+        if (kvm_usage_count)
+                hardware_enable_nolock(NULL);
+        raw_spin_unlock(&kvm_count_lock);
 }
 
 static void hardware_disable_nolock(void *junk)
@@ -2700,11 +2678,12 @@ static void hardware_disable_nolock(void *junk)
         kvm_arch_hardware_disable(NULL);
 }
 
-static void hardware_disable(void *junk)
+static void hardware_disable(void)
 {
-        raw_spin_lock(&kvm_lock);
-        hardware_disable_nolock(junk);
-        raw_spin_unlock(&kvm_lock);
+        raw_spin_lock(&kvm_count_lock);
+        if (kvm_usage_count)
+                hardware_disable_nolock(NULL);
+        raw_spin_unlock(&kvm_count_lock);
 }
 
 static void hardware_disable_all_nolock(void)
@@ -2718,16 +2697,16 @@ static void hardware_disable_all_nolock(void)
 
 static void hardware_disable_all(void)
 {
-        raw_spin_lock(&kvm_lock);
+        raw_spin_lock(&kvm_count_lock);
         hardware_disable_all_nolock();
-        raw_spin_unlock(&kvm_lock);
+        raw_spin_unlock(&kvm_count_lock);
 }
 
 static int hardware_enable_all(void)
 {
         int r = 0;
 
-        raw_spin_lock(&kvm_lock);
+        raw_spin_lock(&kvm_count_lock);
 
         kvm_usage_count++;
         if (kvm_usage_count == 1) {
@@ -2740,7 +2719,7 @@ static int hardware_enable_all(void)
                 }
         }
 
-        raw_spin_unlock(&kvm_lock);
+        raw_spin_unlock(&kvm_count_lock);
 
         return r;
 }
@@ -2750,20 +2729,17 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 {
         int cpu = (long)v;
 
-        if (!kvm_usage_count)
-                return NOTIFY_OK;
-
         val &= ~CPU_TASKS_FROZEN;
         switch (val) {
         case CPU_DYING:
                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
                        cpu);
-                hardware_disable(NULL);
+                hardware_disable();
                 break;
         case CPU_STARTING:
                 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
                        cpu);
-                hardware_enable(NULL);
+                hardware_enable();
                 break;
         }
         return NOTIFY_OK;
@@ -3056,10 +3032,10 @@ static int vm_stat_get(void *_offset, u64 *val)
         struct kvm *kvm;
 
         *val = 0;
-        raw_spin_lock(&kvm_lock);
+        spin_lock(&kvm_lock);
         list_for_each_entry(kvm, &vm_list, vm_list)
                 *val += *(u32 *)((void *)kvm + offset);
-        raw_spin_unlock(&kvm_lock);
+        spin_unlock(&kvm_lock);
         return 0;
 }
 
@@ -3073,12 +3049,12 @@ static int vcpu_stat_get(void *_offset, u64 *val)
         int i;
 
         *val = 0;
-        raw_spin_lock(&kvm_lock);
+        spin_lock(&kvm_lock);
         list_for_each_entry(kvm, &vm_list, vm_list)
                 kvm_for_each_vcpu(i, vcpu, kvm)
                         *val += *(u32 *)((void *)vcpu + offset);
 
-        raw_spin_unlock(&kvm_lock);
+        spin_unlock(&kvm_lock);
         return 0;
 }
 
@@ -3133,7 +3109,7 @@ static int kvm_suspend(void)
 static void kvm_resume(void)
 {
         if (kvm_usage_count) {
-                WARN_ON(raw_spin_is_locked(&kvm_lock));
+                WARN_ON(raw_spin_is_locked(&kvm_count_lock));
                 hardware_enable_nolock(NULL);
         }
 }
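The kvm_main.c changes above split the old raw kvm_lock in two: vm_list and the debugfs stat walkers move to an ordinary spinlock, while the hardware-enable refcount keeps a raw one because hardware_enable() and hardware_disable() run from CPU hotplug notifiers where sleeping locks (as spinlock_t becomes on PREEMPT_RT) are not allowed. Moving the kvm_usage_count test under kvm_count_lock also closes the unlocked check kvm_cpu_hotplug() used to do. A condensed restatement of the resulting discipline (not a literal excerpt):

/* Illustrative summary of the lock split, not a literal excerpt. */
DEFINE_SPINLOCK(kvm_lock);                  /* vm_list, debugfs stats   */
static DEFINE_RAW_SPINLOCK(kvm_count_lock); /* kvm_usage_count, atomic  */

static void hardware_enable(void)
{
        raw_spin_lock(&kvm_count_lock);
        if (kvm_usage_count)                /* now checked under lock   */
                hardware_enable_nolock(NULL);
        raw_spin_unlock(&kvm_count_lock);
}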
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
new file mode 100644
index 000000000000..ca4260e35037
--- /dev/null
+++ b/virt/kvm/vfio.c
@@ -0,0 +1,264 @@
+/*
+ * VFIO-KVM bridge pseudo device
+ *
+ * Copyright (C) 2013 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/kvm_host.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+
+struct kvm_vfio_group {
+        struct list_head node;
+        struct vfio_group *vfio_group;
+};
+
+struct kvm_vfio {
+        struct list_head group_list;
+        struct mutex lock;
+        bool noncoherent;
+};
+
+static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep)
+{
+        struct vfio_group *vfio_group;
+        struct vfio_group *(*fn)(struct file *);
+
+        fn = symbol_get(vfio_group_get_external_user);
+        if (!fn)
+                return ERR_PTR(-EINVAL);
+
+        vfio_group = fn(filep);
+
+        symbol_put(vfio_group_get_external_user);
+
+        return vfio_group;
+}
+
+static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group)
+{
+        void (*fn)(struct vfio_group *);
+
+        fn = symbol_get(vfio_group_put_external_user);
+        if (!fn)
+                return;
+
+        fn(vfio_group);
+
+        symbol_put(vfio_group_put_external_user);
+}
+
+/*
+ * Groups can use the same or different IOMMU domains.  If the same then
+ * adding a new group may change the coherency of groups we've previously
+ * been told about.  We don't want to care about any of that so we retest
+ * each group and bail as soon as we find one that's noncoherent.  This
+ * means we only ever [un]register_noncoherent_dma once for the whole device.
+ */
+static void kvm_vfio_update_coherency(struct kvm_device *dev)
+{
+        struct kvm_vfio *kv = dev->private;
+        bool noncoherent = false;
+        struct kvm_vfio_group *kvg;
+
+        mutex_lock(&kv->lock);
+
+        list_for_each_entry(kvg, &kv->group_list, node) {
+                /*
+                 * TODO: We need an interface to check the coherency of
+                 * the IOMMU domain this group is using.  For now, assume
+                 * it's always noncoherent.
+                 */
+                noncoherent = true;
+                break;
+        }
+
+        if (noncoherent != kv->noncoherent) {
+                kv->noncoherent = noncoherent;
+
+                if (kv->noncoherent)
+                        kvm_arch_register_noncoherent_dma(dev->kvm);
+                else
+                        kvm_arch_unregister_noncoherent_dma(dev->kvm);
+        }
+
+        mutex_unlock(&kv->lock);
+}
+
+static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
+{
+        struct kvm_vfio *kv = dev->private;
+        struct vfio_group *vfio_group;
+        struct kvm_vfio_group *kvg;
+        void __user *argp = (void __user *)arg;
+        struct fd f;
+        int32_t fd;
+        int ret;
+
+        switch (attr) {
+        case KVM_DEV_VFIO_GROUP_ADD:
+                if (get_user(fd, (int32_t __user *)argp))
+                        return -EFAULT;
+
+                f = fdget(fd);
+                if (!f.file)
+                        return -EBADF;
+
+                vfio_group = kvm_vfio_group_get_external_user(f.file);
+                fdput(f);
+
+                if (IS_ERR(vfio_group))
+                        return PTR_ERR(vfio_group);
+
+                mutex_lock(&kv->lock);
+
+                list_for_each_entry(kvg, &kv->group_list, node) {
+                        if (kvg->vfio_group == vfio_group) {
+                                mutex_unlock(&kv->lock);
+                                kvm_vfio_group_put_external_user(vfio_group);
+                                return -EEXIST;
+                        }
+                }
+
+                kvg = kzalloc(sizeof(*kvg), GFP_KERNEL);
+                if (!kvg) {
+                        mutex_unlock(&kv->lock);
+                        kvm_vfio_group_put_external_user(vfio_group);
+                        return -ENOMEM;
+                }
+
+                list_add_tail(&kvg->node, &kv->group_list);
+                kvg->vfio_group = vfio_group;
+
+                mutex_unlock(&kv->lock);
+
+                kvm_vfio_update_coherency(dev);
+
+                return 0;
+
+        case KVM_DEV_VFIO_GROUP_DEL:
+                if (get_user(fd, (int32_t __user *)argp))
+                        return -EFAULT;
+
+                f = fdget(fd);
+                if (!f.file)
+                        return -EBADF;
+
+                vfio_group = kvm_vfio_group_get_external_user(f.file);
+                fdput(f);
+
+                if (IS_ERR(vfio_group))
+                        return PTR_ERR(vfio_group);
+
+                ret = -ENOENT;
+
+                mutex_lock(&kv->lock);
+
+                list_for_each_entry(kvg, &kv->group_list, node) {
+                        if (kvg->vfio_group != vfio_group)
+                                continue;
+
+                        list_del(&kvg->node);
+                        kvm_vfio_group_put_external_user(kvg->vfio_group);
+                        kfree(kvg);
+                        ret = 0;
+                        break;
+                }
+
+                mutex_unlock(&kv->lock);
+
+                kvm_vfio_group_put_external_user(vfio_group);
+
+                kvm_vfio_update_coherency(dev);
+
+                return ret;
+        }
+
+        return -ENXIO;
+}
+
+static int kvm_vfio_set_attr(struct kvm_device *dev,
+                             struct kvm_device_attr *attr)
+{
+        switch (attr->group) {
+        case KVM_DEV_VFIO_GROUP:
+                return kvm_vfio_set_group(dev, attr->attr, attr->addr);
+        }
+
+        return -ENXIO;
+}
+
+static int kvm_vfio_has_attr(struct kvm_device *dev,
+                             struct kvm_device_attr *attr)
+{
+        switch (attr->group) {
+        case KVM_DEV_VFIO_GROUP:
+                switch (attr->attr) {
+                case KVM_DEV_VFIO_GROUP_ADD:
+                case KVM_DEV_VFIO_GROUP_DEL:
+                        return 0;
+                }
+
+                break;
+        }
+
+        return -ENXIO;
+}
+
+static void kvm_vfio_destroy(struct kvm_device *dev)
+{
+        struct kvm_vfio *kv = dev->private;
+        struct kvm_vfio_group *kvg, *tmp;
+
+        list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) {
+                kvm_vfio_group_put_external_user(kvg->vfio_group);
+                list_del(&kvg->node);
+                kfree(kvg);
+        }
+
+        kvm_vfio_update_coherency(dev);
+
+        kfree(kv);
+        kfree(dev);     /* alloc by kvm_ioctl_create_device, free by .destroy */
+}
+
+static int kvm_vfio_create(struct kvm_device *dev, u32 type)
+{
+        struct kvm_device *tmp;
+        struct kvm_vfio *kv;
+
+        /* Only one VFIO "device" per VM */
+        list_for_each_entry(tmp, &dev->kvm->devices, vm_node)
+                if (tmp->ops == &kvm_vfio_ops)
+                        return -EBUSY;
+
+        kv = kzalloc(sizeof(*kv), GFP_KERNEL);
+        if (!kv)
+                return -ENOMEM;
+
+        INIT_LIST_HEAD(&kv->group_list);
+        mutex_init(&kv->lock);
+
+        dev->private = kv;
+
+        return 0;
+}
+
+struct kvm_device_ops kvm_vfio_ops = {
+        .name = "kvm-vfio",
+        .create = kvm_vfio_create,
+        .destroy = kvm_vfio_destroy,
+        .set_attr = kvm_vfio_set_attr,
+        .has_attr = kvm_vfio_has_attr,
+};
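A design note on the symbol_get()/symbol_put() pairs in vfio.c above: they give kvm a soft dependency on the vfio module, so attaching a group simply fails when VFIO is not loaded instead of forcing a hard module dependency. The pattern in isolation, as a sketch with an invented symbol name:

#include <linux/module.h>

/* Sketch of an optional cross-module call.  some_exported_func() is a
 * stand-in name; symbol_get() resolves it only if the providing module
 * is loaded, and pins that module until the matching symbol_put(). */
extern int some_exported_func(int arg);  /* hypothetical export */

static int call_if_available(int arg)
{
        int (*fn)(int);
        int ret;

        fn = symbol_get(some_exported_func);
        if (!fn)
                return -EOPNOTSUPP;      /* provider not loaded */

        ret = fn(arg);
        symbol_put(some_exported_func);

        return ret;
}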