author     Nicholas Bellinger <nab@linux-iscsi.org>    2011-01-16 16:21:04 -0500
committer  Nicholas Bellinger <nab@linux-iscsi.org>    2011-01-16 16:21:04 -0500
commit     f652f6c5b7cfdf139f4155d78f397e99ae1c4acc (patch)
tree       71c6344688bf56ea6aaf18c586ab69ff4f077ade /virt
parent     140e3008e7fe1526cbb12f8f07dbc273ac713b75 (diff)
parent     c66ac9db8d4ad9994a02b3e933ea2ccc643e1fe5 (diff)
Merge branch 'master' of /pub/scm/linux/kernel/git/jejb/scsi-post-merge-2.6 into for-linus
Diffstat (limited to 'virt')
-rw-r--r--  virt/kvm/Kconfig        |   3
-rw-r--r--  virt/kvm/assigned-dev.c | 125
-rw-r--r--  virt/kvm/async_pf.c     | 216
-rw-r--r--  virt/kvm/async_pf.h     |  36
-rw-r--r--  virt/kvm/eventfd.c      |  91
-rw-r--r--  virt/kvm/irq_comm.c     |   7
-rw-r--r--  virt/kvm/kvm_main.c     | 373
7 files changed, 666 insertions, 185 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 7f1178f6b839..f63ccb0a5982 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -15,3 +15,6 @@ config KVM_APIC_ARCHITECTURE
 
 config KVM_MMIO
 	bool
+
+config KVM_ASYNC_PF
+	bool
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 7c98928b09d9..ae72ae604c89 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -55,58 +55,31 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel | |||
55 | return index; | 55 | return index; |
56 | } | 56 | } |
57 | 57 | ||
58 | static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) | 58 | static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) |
59 | { | 59 | { |
60 | struct kvm_assigned_dev_kernel *assigned_dev; | 60 | struct kvm_assigned_dev_kernel *assigned_dev = dev_id; |
61 | int i; | 61 | u32 vector; |
62 | int index; | ||
62 | 63 | ||
63 | assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, | 64 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { |
64 | interrupt_work); | 65 | spin_lock(&assigned_dev->intx_lock); |
66 | disable_irq_nosync(irq); | ||
67 | assigned_dev->host_irq_disabled = true; | ||
68 | spin_unlock(&assigned_dev->intx_lock); | ||
69 | } | ||
65 | 70 | ||
66 | spin_lock_irq(&assigned_dev->assigned_dev_lock); | ||
67 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { | 71 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { |
68 | struct kvm_guest_msix_entry *guest_entries = | 72 | index = find_index_from_host_irq(assigned_dev, irq); |
69 | assigned_dev->guest_msix_entries; | 73 | if (index >= 0) { |
70 | for (i = 0; i < assigned_dev->entries_nr; i++) { | 74 | vector = assigned_dev-> |
71 | if (!(guest_entries[i].flags & | 75 | guest_msix_entries[index].vector; |
72 | KVM_ASSIGNED_MSIX_PENDING)) | ||
73 | continue; | ||
74 | guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; | ||
75 | kvm_set_irq(assigned_dev->kvm, | 76 | kvm_set_irq(assigned_dev->kvm, |
76 | assigned_dev->irq_source_id, | 77 | assigned_dev->irq_source_id, vector, 1); |
77 | guest_entries[i].vector, 1); | ||
78 | } | 78 | } |
79 | } else | 79 | } else |
80 | kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, | 80 | kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, |
81 | assigned_dev->guest_irq, 1); | 81 | assigned_dev->guest_irq, 1); |
82 | 82 | ||
83 | spin_unlock_irq(&assigned_dev->assigned_dev_lock); | ||
84 | } | ||
85 | |||
86 | static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) | ||
87 | { | ||
88 | unsigned long flags; | ||
89 | struct kvm_assigned_dev_kernel *assigned_dev = | ||
90 | (struct kvm_assigned_dev_kernel *) dev_id; | ||
91 | |||
92 | spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); | ||
93 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { | ||
94 | int index = find_index_from_host_irq(assigned_dev, irq); | ||
95 | if (index < 0) | ||
96 | goto out; | ||
97 | assigned_dev->guest_msix_entries[index].flags |= | ||
98 | KVM_ASSIGNED_MSIX_PENDING; | ||
99 | } | ||
100 | |||
101 | schedule_work(&assigned_dev->interrupt_work); | ||
102 | |||
103 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { | ||
104 | disable_irq_nosync(irq); | ||
105 | assigned_dev->host_irq_disabled = true; | ||
106 | } | ||
107 | |||
108 | out: | ||
109 | spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); | ||
110 | return IRQ_HANDLED; | 83 | return IRQ_HANDLED; |
111 | } | 84 | } |
112 | 85 | ||
@@ -114,7 +87,6 @@ out: | |||
114 | static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) | 87 | static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) |
115 | { | 88 | { |
116 | struct kvm_assigned_dev_kernel *dev; | 89 | struct kvm_assigned_dev_kernel *dev; |
117 | unsigned long flags; | ||
118 | 90 | ||
119 | if (kian->gsi == -1) | 91 | if (kian->gsi == -1) |
120 | return; | 92 | return; |
@@ -127,12 +99,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) | |||
127 | /* The guest irq may be shared so this ack may be | 99 | /* The guest irq may be shared so this ack may be |
128 | * from another device. | 100 | * from another device. |
129 | */ | 101 | */ |
130 | spin_lock_irqsave(&dev->assigned_dev_lock, flags); | 102 | spin_lock(&dev->intx_lock); |
131 | if (dev->host_irq_disabled) { | 103 | if (dev->host_irq_disabled) { |
132 | enable_irq(dev->host_irq); | 104 | enable_irq(dev->host_irq); |
133 | dev->host_irq_disabled = false; | 105 | dev->host_irq_disabled = false; |
134 | } | 106 | } |
135 | spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); | 107 | spin_unlock(&dev->intx_lock); |
136 | } | 108 | } |
137 | 109 | ||
138 | static void deassign_guest_irq(struct kvm *kvm, | 110 | static void deassign_guest_irq(struct kvm *kvm, |
@@ -141,6 +113,9 @@ static void deassign_guest_irq(struct kvm *kvm, | |||
141 | kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); | 113 | kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); |
142 | assigned_dev->ack_notifier.gsi = -1; | 114 | assigned_dev->ack_notifier.gsi = -1; |
143 | 115 | ||
116 | kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, | ||
117 | assigned_dev->guest_irq, 0); | ||
118 | |||
144 | if (assigned_dev->irq_source_id != -1) | 119 | if (assigned_dev->irq_source_id != -1) |
145 | kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); | 120 | kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); |
146 | assigned_dev->irq_source_id = -1; | 121 | assigned_dev->irq_source_id = -1; |
@@ -152,28 +127,19 @@ static void deassign_host_irq(struct kvm *kvm, | |||
152 | struct kvm_assigned_dev_kernel *assigned_dev) | 127 | struct kvm_assigned_dev_kernel *assigned_dev) |
153 | { | 128 | { |
154 | /* | 129 | /* |
155 | * In kvm_free_device_irq, cancel_work_sync return true if: | 130 | * We disable irq here to prevent further events. |
156 | * 1. work is scheduled, and then cancelled. | ||
157 | * 2. work callback is executed. | ||
158 | * | ||
159 | * The first one ensured that the irq is disabled and no more events | ||
160 | * would happen. But for the second one, the irq may be enabled (e.g. | ||
161 | * for MSI). So we disable irq here to prevent further events. | ||
162 | * | 131 | * |
163 | * Notice this maybe result in nested disable if the interrupt type is | 132 | * Notice this maybe result in nested disable if the interrupt type is |
164 | * INTx, but it's OK for we are going to free it. | 133 | * INTx, but it's OK for we are going to free it. |
165 | * | 134 | * |
166 | * If this function is a part of VM destroy, please ensure that till | 135 | * If this function is a part of VM destroy, please ensure that till |
167 | * now, the kvm state is still legal for probably we also have to wait | 136 | * now, the kvm state is still legal for probably we also have to wait |
168 | * interrupt_work done. | 137 | * on a currently running IRQ handler. |
169 | */ | 138 | */ |
170 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { | 139 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { |
171 | int i; | 140 | int i; |
172 | for (i = 0; i < assigned_dev->entries_nr; i++) | 141 | for (i = 0; i < assigned_dev->entries_nr; i++) |
173 | disable_irq_nosync(assigned_dev-> | 142 | disable_irq(assigned_dev->host_msix_entries[i].vector); |
174 | host_msix_entries[i].vector); | ||
175 | |||
176 | cancel_work_sync(&assigned_dev->interrupt_work); | ||
177 | 143 | ||
178 | for (i = 0; i < assigned_dev->entries_nr; i++) | 144 | for (i = 0; i < assigned_dev->entries_nr; i++) |
179 | free_irq(assigned_dev->host_msix_entries[i].vector, | 145 | free_irq(assigned_dev->host_msix_entries[i].vector, |
@@ -185,8 +151,7 @@ static void deassign_host_irq(struct kvm *kvm, | |||
185 | pci_disable_msix(assigned_dev->dev); | 151 | pci_disable_msix(assigned_dev->dev); |
186 | } else { | 152 | } else { |
187 | /* Deal with MSI and INTx */ | 153 | /* Deal with MSI and INTx */ |
188 | disable_irq_nosync(assigned_dev->host_irq); | 154 | disable_irq(assigned_dev->host_irq); |
189 | cancel_work_sync(&assigned_dev->interrupt_work); | ||
190 | 155 | ||
191 | free_irq(assigned_dev->host_irq, (void *)assigned_dev); | 156 | free_irq(assigned_dev->host_irq, (void *)assigned_dev); |
192 | 157 | ||
@@ -232,7 +197,8 @@ static void kvm_free_assigned_device(struct kvm *kvm, | |||
232 | { | 197 | { |
233 | kvm_free_assigned_irq(kvm, assigned_dev); | 198 | kvm_free_assigned_irq(kvm, assigned_dev); |
234 | 199 | ||
235 | pci_reset_function(assigned_dev->dev); | 200 | __pci_reset_function(assigned_dev->dev); |
201 | pci_restore_state(assigned_dev->dev); | ||
236 | 202 | ||
237 | pci_release_regions(assigned_dev->dev); | 203 | pci_release_regions(assigned_dev->dev); |
238 | pci_disable_device(assigned_dev->dev); | 204 | pci_disable_device(assigned_dev->dev); |
@@ -265,8 +231,8 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, | |||
265 | * on the same interrupt line is not a happy situation: there | 231 | * on the same interrupt line is not a happy situation: there |
266 | * are going to be long delays in accepting, acking, etc. | 232 | * are going to be long delays in accepting, acking, etc. |
267 | */ | 233 | */ |
268 | if (request_irq(dev->host_irq, kvm_assigned_dev_intr, | 234 | if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, |
269 | 0, "kvm_assigned_intx_device", (void *)dev)) | 235 | IRQF_ONESHOT, dev->irq_name, (void *)dev)) |
270 | return -EIO; | 236 | return -EIO; |
271 | return 0; | 237 | return 0; |
272 | } | 238 | } |
@@ -284,8 +250,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, | |||
284 | } | 250 | } |
285 | 251 | ||
286 | dev->host_irq = dev->dev->irq; | 252 | dev->host_irq = dev->dev->irq; |
287 | if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, | 253 | if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, |
288 | "kvm_assigned_msi_device", (void *)dev)) { | 254 | 0, dev->irq_name, (void *)dev)) { |
289 | pci_disable_msi(dev->dev); | 255 | pci_disable_msi(dev->dev); |
290 | return -EIO; | 256 | return -EIO; |
291 | } | 257 | } |
@@ -310,10 +276,9 @@ static int assigned_device_enable_host_msix(struct kvm *kvm, | |||
310 | return r; | 276 | return r; |
311 | 277 | ||
312 | for (i = 0; i < dev->entries_nr; i++) { | 278 | for (i = 0; i < dev->entries_nr; i++) { |
313 | r = request_irq(dev->host_msix_entries[i].vector, | 279 | r = request_threaded_irq(dev->host_msix_entries[i].vector, |
314 | kvm_assigned_dev_intr, 0, | 280 | NULL, kvm_assigned_dev_thread, |
315 | "kvm_assigned_msix_device", | 281 | 0, dev->irq_name, (void *)dev); |
316 | (void *)dev); | ||
317 | if (r) | 282 | if (r) |
318 | goto err; | 283 | goto err; |
319 | } | 284 | } |
@@ -370,6 +335,9 @@ static int assign_host_irq(struct kvm *kvm, | |||
370 | if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) | 335 | if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) |
371 | return r; | 336 | return r; |
372 | 337 | ||
338 | snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", | ||
339 | pci_name(dev->dev)); | ||
340 | |||
373 | switch (host_irq_type) { | 341 | switch (host_irq_type) { |
374 | case KVM_DEV_IRQ_HOST_INTX: | 342 | case KVM_DEV_IRQ_HOST_INTX: |
375 | r = assigned_device_enable_host_intx(kvm, dev); | 343 | r = assigned_device_enable_host_intx(kvm, dev); |
@@ -547,6 +515,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, | |||
547 | } | 515 | } |
548 | 516 | ||
549 | pci_reset_function(dev); | 517 | pci_reset_function(dev); |
518 | pci_save_state(dev); | ||
550 | 519 | ||
551 | match->assigned_dev_id = assigned_dev->assigned_dev_id; | 520 | match->assigned_dev_id = assigned_dev->assigned_dev_id; |
552 | match->host_segnr = assigned_dev->segnr; | 521 | match->host_segnr = assigned_dev->segnr; |
@@ -554,12 +523,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, | |||
554 | match->host_devfn = assigned_dev->devfn; | 523 | match->host_devfn = assigned_dev->devfn; |
555 | match->flags = assigned_dev->flags; | 524 | match->flags = assigned_dev->flags; |
556 | match->dev = dev; | 525 | match->dev = dev; |
557 | spin_lock_init(&match->assigned_dev_lock); | 526 | spin_lock_init(&match->intx_lock); |
558 | match->irq_source_id = -1; | 527 | match->irq_source_id = -1; |
559 | match->kvm = kvm; | 528 | match->kvm = kvm; |
560 | match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; | 529 | match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; |
561 | INIT_WORK(&match->interrupt_work, | ||
562 | kvm_assigned_dev_interrupt_work_handler); | ||
563 | 530 | ||
564 | list_add(&match->list, &kvm->arch.assigned_dev_head); | 531 | list_add(&match->list, &kvm->arch.assigned_dev_head); |
565 | 532 | ||
@@ -579,6 +546,7 @@ out: | |||
579 | mutex_unlock(&kvm->lock); | 546 | mutex_unlock(&kvm->lock); |
580 | return r; | 547 | return r; |
581 | out_list_del: | 548 | out_list_del: |
549 | pci_restore_state(dev); | ||
582 | list_del(&match->list); | 550 | list_del(&match->list); |
583 | pci_release_regions(dev); | 551 | pci_release_regions(dev); |
584 | out_disable: | 552 | out_disable: |
@@ -651,9 +619,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, | |||
651 | r = -ENOMEM; | 619 | r = -ENOMEM; |
652 | goto msix_nr_out; | 620 | goto msix_nr_out; |
653 | } | 621 | } |
654 | adev->guest_msix_entries = kzalloc( | 622 | adev->guest_msix_entries = |
655 | sizeof(struct kvm_guest_msix_entry) * | 623 | kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, |
656 | entry_nr->entry_nr, GFP_KERNEL); | 624 | GFP_KERNEL); |
657 | if (!adev->guest_msix_entries) { | 625 | if (!adev->guest_msix_entries) { |
658 | kfree(adev->host_msix_entries); | 626 | kfree(adev->host_msix_entries); |
659 | r = -ENOMEM; | 627 | r = -ENOMEM; |
@@ -706,7 +674,7 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, | |||
706 | unsigned long arg) | 674 | unsigned long arg) |
707 | { | 675 | { |
708 | void __user *argp = (void __user *)arg; | 676 | void __user *argp = (void __user *)arg; |
709 | int r = -ENOTTY; | 677 | int r; |
710 | 678 | ||
711 | switch (ioctl) { | 679 | switch (ioctl) { |
712 | case KVM_ASSIGN_PCI_DEVICE: { | 680 | case KVM_ASSIGN_PCI_DEVICE: { |
@@ -724,7 +692,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, | |||
724 | r = -EOPNOTSUPP; | 692 | r = -EOPNOTSUPP; |
725 | break; | 693 | break; |
726 | } | 694 | } |
727 | #ifdef KVM_CAP_ASSIGN_DEV_IRQ | ||
728 | case KVM_ASSIGN_DEV_IRQ: { | 695 | case KVM_ASSIGN_DEV_IRQ: { |
729 | struct kvm_assigned_irq assigned_irq; | 696 | struct kvm_assigned_irq assigned_irq; |
730 | 697 | ||
@@ -747,8 +714,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, | |||
747 | goto out; | 714 | goto out; |
748 | break; | 715 | break; |
749 | } | 716 | } |
750 | #endif | ||
751 | #ifdef KVM_CAP_DEVICE_DEASSIGNMENT | ||
752 | case KVM_DEASSIGN_PCI_DEVICE: { | 717 | case KVM_DEASSIGN_PCI_DEVICE: { |
753 | struct kvm_assigned_pci_dev assigned_dev; | 718 | struct kvm_assigned_pci_dev assigned_dev; |
754 | 719 | ||
@@ -760,7 +725,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, | |||
760 | goto out; | 725 | goto out; |
761 | break; | 726 | break; |
762 | } | 727 | } |
763 | #endif | ||
764 | #ifdef KVM_CAP_IRQ_ROUTING | 728 | #ifdef KVM_CAP_IRQ_ROUTING |
765 | case KVM_SET_GSI_ROUTING: { | 729 | case KVM_SET_GSI_ROUTING: { |
766 | struct kvm_irq_routing routing; | 730 | struct kvm_irq_routing routing; |
@@ -813,6 +777,9 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, | |||
813 | break; | 777 | break; |
814 | } | 778 | } |
815 | #endif | 779 | #endif |
780 | default: | ||
781 | r = -ENOTTY; | ||
782 | break; | ||
816 | } | 783 | } |
817 | out: | 784 | out: |
818 | return r; | 785 | return r; |
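The assigned-dev.c changes above replace the schedule_work() bottom half and its assigned_dev_lock with a threaded interrupt handler, so the guest interrupt can be injected directly from process context. A minimal sketch of that request_threaded_irq() pattern outside KVM follows; the demo_* names are invented for illustration, while request_threaded_irq(), IRQF_ONESHOT and disable_irq_nosync() are the real genirq interfaces the new code uses.

/* Sketch only: demo_* is hypothetical, the genirq calls are real. */
#include <linux/interrupt.h>
#include <linux/spinlock.h>

struct demo_dev {
	int irq;
	bool host_irq_disabled;
	spinlock_t intx_lock;
};

/* Runs in a kernel thread (process context), so it is allowed to sleep. */
static irqreturn_t demo_irq_thread(int irq, void *dev_id)
{
	struct demo_dev *dev = dev_id;

	/* For level-triggered INTx, keep the line masked until it is acked. */
	spin_lock(&dev->intx_lock);
	disable_irq_nosync(irq);
	dev->host_irq_disabled = true;
	spin_unlock(&dev->intx_lock);

	/* ... forward the event to the consumer (KVM injects the guest IRQ here) ... */
	return IRQ_HANDLED;
}

static int demo_setup(struct demo_dev *dev)
{
	spin_lock_init(&dev->intx_lock);
	/*
	 * NULL primary handler: the core acknowledges the interrupt and wakes
	 * the thread.  IRQF_ONESHOT keeps the line masked until the thread
	 * returns, which the level-triggered INTx path relies on.
	 */
	return request_threaded_irq(dev->irq, NULL, demo_irq_thread,
				    IRQF_ONESHOT, "demo-intx", dev);
}

Because the handler already runs in a thread, teardown only needs disable_irq() plus free_irq(); the old cancel_work_sync() and the irqsave locking disappear, which is what the hunks above do.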
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
new file mode 100644
index 000000000000..74268b4c2ee1
--- /dev/null
+++ b/virt/kvm/async_pf.c
@@ -0,0 +1,216 @@ | |||
1 | /* | ||
2 | * kvm asynchronous fault support | ||
3 | * | ||
4 | * Copyright 2010 Red Hat, Inc. | ||
5 | * | ||
6 | * Author: | ||
7 | * Gleb Natapov <gleb@redhat.com> | ||
8 | * | ||
9 | * This file is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of version 2 of the GNU General Public License | ||
11 | * as published by the Free Software Foundation. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, write to the Free Software Foundation, | ||
20 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. | ||
21 | */ | ||
22 | |||
23 | #include <linux/kvm_host.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/mmu_context.h> | ||
27 | |||
28 | #include "async_pf.h" | ||
29 | #include <trace/events/kvm.h> | ||
30 | |||
31 | static struct kmem_cache *async_pf_cache; | ||
32 | |||
33 | int kvm_async_pf_init(void) | ||
34 | { | ||
35 | async_pf_cache = KMEM_CACHE(kvm_async_pf, 0); | ||
36 | |||
37 | if (!async_pf_cache) | ||
38 | return -ENOMEM; | ||
39 | |||
40 | return 0; | ||
41 | } | ||
42 | |||
43 | void kvm_async_pf_deinit(void) | ||
44 | { | ||
45 | if (async_pf_cache) | ||
46 | kmem_cache_destroy(async_pf_cache); | ||
47 | async_pf_cache = NULL; | ||
48 | } | ||
49 | |||
50 | void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu) | ||
51 | { | ||
52 | INIT_LIST_HEAD(&vcpu->async_pf.done); | ||
53 | INIT_LIST_HEAD(&vcpu->async_pf.queue); | ||
54 | spin_lock_init(&vcpu->async_pf.lock); | ||
55 | } | ||
56 | |||
57 | static void async_pf_execute(struct work_struct *work) | ||
58 | { | ||
59 | struct page *page = NULL; | ||
60 | struct kvm_async_pf *apf = | ||
61 | container_of(work, struct kvm_async_pf, work); | ||
62 | struct mm_struct *mm = apf->mm; | ||
63 | struct kvm_vcpu *vcpu = apf->vcpu; | ||
64 | unsigned long addr = apf->addr; | ||
65 | gva_t gva = apf->gva; | ||
66 | |||
67 | might_sleep(); | ||
68 | |||
69 | use_mm(mm); | ||
70 | down_read(&mm->mmap_sem); | ||
71 | get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL); | ||
72 | up_read(&mm->mmap_sem); | ||
73 | unuse_mm(mm); | ||
74 | |||
75 | spin_lock(&vcpu->async_pf.lock); | ||
76 | list_add_tail(&apf->link, &vcpu->async_pf.done); | ||
77 | apf->page = page; | ||
78 | apf->done = true; | ||
79 | spin_unlock(&vcpu->async_pf.lock); | ||
80 | |||
81 | /* | ||
82 | * apf may be freed by kvm_check_async_pf_completion() after | ||
83 | * this point | ||
84 | */ | ||
85 | |||
86 | trace_kvm_async_pf_completed(addr, page, gva); | ||
87 | |||
88 | if (waitqueue_active(&vcpu->wq)) | ||
89 | wake_up_interruptible(&vcpu->wq); | ||
90 | |||
91 | mmdrop(mm); | ||
92 | kvm_put_kvm(vcpu->kvm); | ||
93 | } | ||
94 | |||
95 | void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) | ||
96 | { | ||
97 | /* cancel outstanding work queue item */ | ||
98 | while (!list_empty(&vcpu->async_pf.queue)) { | ||
99 | struct kvm_async_pf *work = | ||
100 | list_entry(vcpu->async_pf.queue.next, | ||
101 | typeof(*work), queue); | ||
102 | cancel_work_sync(&work->work); | ||
103 | list_del(&work->queue); | ||
104 | if (!work->done) /* work was canceled */ | ||
105 | kmem_cache_free(async_pf_cache, work); | ||
106 | } | ||
107 | |||
108 | spin_lock(&vcpu->async_pf.lock); | ||
109 | while (!list_empty(&vcpu->async_pf.done)) { | ||
110 | struct kvm_async_pf *work = | ||
111 | list_entry(vcpu->async_pf.done.next, | ||
112 | typeof(*work), link); | ||
113 | list_del(&work->link); | ||
114 | if (work->page) | ||
115 | put_page(work->page); | ||
116 | kmem_cache_free(async_pf_cache, work); | ||
117 | } | ||
118 | spin_unlock(&vcpu->async_pf.lock); | ||
119 | |||
120 | vcpu->async_pf.queued = 0; | ||
121 | } | ||
122 | |||
123 | void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) | ||
124 | { | ||
125 | struct kvm_async_pf *work; | ||
126 | |||
127 | while (!list_empty_careful(&vcpu->async_pf.done) && | ||
128 | kvm_arch_can_inject_async_page_present(vcpu)) { | ||
129 | spin_lock(&vcpu->async_pf.lock); | ||
130 | work = list_first_entry(&vcpu->async_pf.done, typeof(*work), | ||
131 | link); | ||
132 | list_del(&work->link); | ||
133 | spin_unlock(&vcpu->async_pf.lock); | ||
134 | |||
135 | if (work->page) | ||
136 | kvm_arch_async_page_ready(vcpu, work); | ||
137 | kvm_arch_async_page_present(vcpu, work); | ||
138 | |||
139 | list_del(&work->queue); | ||
140 | vcpu->async_pf.queued--; | ||
141 | if (work->page) | ||
142 | put_page(work->page); | ||
143 | kmem_cache_free(async_pf_cache, work); | ||
144 | } | ||
145 | } | ||
146 | |||
147 | int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, | ||
148 | struct kvm_arch_async_pf *arch) | ||
149 | { | ||
150 | struct kvm_async_pf *work; | ||
151 | |||
152 | if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) | ||
153 | return 0; | ||
154 | |||
155 | /* setup delayed work */ | ||
156 | |||
157 | /* | ||
158 | * do alloc nowait since if we are going to sleep anyway we | ||
159 | * may as well sleep faulting in page | ||
160 | */ | ||
161 | work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT); | ||
162 | if (!work) | ||
163 | return 0; | ||
164 | |||
165 | work->page = NULL; | ||
166 | work->done = false; | ||
167 | work->vcpu = vcpu; | ||
168 | work->gva = gva; | ||
169 | work->addr = gfn_to_hva(vcpu->kvm, gfn); | ||
170 | work->arch = *arch; | ||
171 | work->mm = current->mm; | ||
172 | atomic_inc(&work->mm->mm_count); | ||
173 | kvm_get_kvm(work->vcpu->kvm); | ||
174 | |||
175 | /* this can't really happen otherwise gfn_to_pfn_async | ||
176 | would succeed */ | ||
177 | if (unlikely(kvm_is_error_hva(work->addr))) | ||
178 | goto retry_sync; | ||
179 | |||
180 | INIT_WORK(&work->work, async_pf_execute); | ||
181 | if (!schedule_work(&work->work)) | ||
182 | goto retry_sync; | ||
183 | |||
184 | list_add_tail(&work->queue, &vcpu->async_pf.queue); | ||
185 | vcpu->async_pf.queued++; | ||
186 | kvm_arch_async_page_not_present(vcpu, work); | ||
187 | return 1; | ||
188 | retry_sync: | ||
189 | kvm_put_kvm(work->vcpu->kvm); | ||
190 | mmdrop(work->mm); | ||
191 | kmem_cache_free(async_pf_cache, work); | ||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) | ||
196 | { | ||
197 | struct kvm_async_pf *work; | ||
198 | |||
199 | if (!list_empty_careful(&vcpu->async_pf.done)) | ||
200 | return 0; | ||
201 | |||
202 | work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC); | ||
203 | if (!work) | ||
204 | return -ENOMEM; | ||
205 | |||
206 | work->page = bad_page; | ||
207 | get_page(bad_page); | ||
208 | INIT_LIST_HEAD(&work->queue); /* for list_del to work */ | ||
209 | |||
210 | spin_lock(&vcpu->async_pf.lock); | ||
211 | list_add_tail(&work->link, &vcpu->async_pf.done); | ||
212 | spin_unlock(&vcpu->async_pf.lock); | ||
213 | |||
214 | vcpu->async_pf.queued++; | ||
215 | return 0; | ||
216 | } | ||
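async_pf.c only provides the queueing machinery; the architecture code decides when a fault may be handled asynchronously and drives completion from its vcpu loop. The sketch below shows the intended call sites under that assumption: the demo_* helpers are invented, while kvm_setup_async_pf(), kvm_check_async_pf_completion() and kvm_clear_async_pf_completion_queue() are the functions defined above.

/* Sketch only: demo_* helpers are hypothetical arch code. */
#include <linux/kvm_host.h>

bool demo_can_do_async_pf(struct kvm_vcpu *vcpu);
int demo_sync_fault(struct kvm_vcpu *vcpu, gfn_t gfn);

static int demo_handle_guest_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
				   struct kvm_arch_async_pf *arch)
{
	/* Queue the fault and let the guest run something else meanwhile. */
	if (demo_can_do_async_pf(vcpu) &&
	    kvm_setup_async_pf(vcpu, gva, gfn, arch))
		return 0;	/* "page not present" was reported to the guest */

	return demo_sync_fault(vcpu, gfn);	/* fall back to a blocking fault */
}

static void demo_vcpu_run_once(struct kvm_vcpu *vcpu)
{
	/* Deliver "page ready" events the worker queued on ->async_pf.done. */
	kvm_check_async_pf_completion(vcpu);
	/* ... enter the guest ... */
}

static void demo_vcpu_free(struct kvm_vcpu *vcpu)
{
	/* Cancel pending work and drop pages still sitting on the lists. */
	kvm_clear_async_pf_completion_queue(vcpu);
}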
diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h
new file mode 100644
index 000000000000..e7ef6447cb82
--- /dev/null
+++ b/virt/kvm/async_pf.h
@@ -0,0 +1,36 @@
+/*
+ * kvm asynchronous fault support
+ *
+ * Copyright 2010 Red Hat, Inc.
+ *
+ * Author:
+ *      Gleb Natapov <gleb@redhat.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __KVM_ASYNC_PF_H__
+#define __KVM_ASYNC_PF_H__
+
+#ifdef CONFIG_KVM_ASYNC_PF
+int kvm_async_pf_init(void);
+void kvm_async_pf_deinit(void);
+void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu);
+#else
+#define kvm_async_pf_init() (0)
+#define kvm_async_pf_deinit() do{}while(0)
+#define kvm_async_pf_vcpu_init(C) do{}while(0)
+#endif
+
+#endif
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index c1f1e3c62984..2ca4535f4fb7 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -44,14 +44,19 @@ | |||
44 | */ | 44 | */ |
45 | 45 | ||
46 | struct _irqfd { | 46 | struct _irqfd { |
47 | struct kvm *kvm; | 47 | /* Used for MSI fast-path */ |
48 | struct eventfd_ctx *eventfd; | 48 | struct kvm *kvm; |
49 | int gsi; | 49 | wait_queue_t wait; |
50 | struct list_head list; | 50 | /* Update side is protected by irqfds.lock */ |
51 | poll_table pt; | 51 | struct kvm_kernel_irq_routing_entry __rcu *irq_entry; |
52 | wait_queue_t wait; | 52 | /* Used for level IRQ fast-path */ |
53 | struct work_struct inject; | 53 | int gsi; |
54 | struct work_struct shutdown; | 54 | struct work_struct inject; |
55 | /* Used for setup/shutdown */ | ||
56 | struct eventfd_ctx *eventfd; | ||
57 | struct list_head list; | ||
58 | poll_table pt; | ||
59 | struct work_struct shutdown; | ||
55 | }; | 60 | }; |
56 | 61 | ||
57 | static struct workqueue_struct *irqfd_cleanup_wq; | 62 | static struct workqueue_struct *irqfd_cleanup_wq; |
@@ -125,14 +130,22 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) | |||
125 | { | 130 | { |
126 | struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); | 131 | struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); |
127 | unsigned long flags = (unsigned long)key; | 132 | unsigned long flags = (unsigned long)key; |
133 | struct kvm_kernel_irq_routing_entry *irq; | ||
134 | struct kvm *kvm = irqfd->kvm; | ||
128 | 135 | ||
129 | if (flags & POLLIN) | 136 | if (flags & POLLIN) { |
137 | rcu_read_lock(); | ||
138 | irq = rcu_dereference(irqfd->irq_entry); | ||
130 | /* An event has been signaled, inject an interrupt */ | 139 | /* An event has been signaled, inject an interrupt */ |
131 | schedule_work(&irqfd->inject); | 140 | if (irq) |
141 | kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); | ||
142 | else | ||
143 | schedule_work(&irqfd->inject); | ||
144 | rcu_read_unlock(); | ||
145 | } | ||
132 | 146 | ||
133 | if (flags & POLLHUP) { | 147 | if (flags & POLLHUP) { |
134 | /* The eventfd is closing, detach from KVM */ | 148 | /* The eventfd is closing, detach from KVM */ |
135 | struct kvm *kvm = irqfd->kvm; | ||
136 | unsigned long flags; | 149 | unsigned long flags; |
137 | 150 | ||
138 | spin_lock_irqsave(&kvm->irqfds.lock, flags); | 151 | spin_lock_irqsave(&kvm->irqfds.lock, flags); |
@@ -163,9 +176,31 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, | |||
163 | add_wait_queue(wqh, &irqfd->wait); | 176 | add_wait_queue(wqh, &irqfd->wait); |
164 | } | 177 | } |
165 | 178 | ||
179 | /* Must be called under irqfds.lock */ | ||
180 | static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, | ||
181 | struct kvm_irq_routing_table *irq_rt) | ||
182 | { | ||
183 | struct kvm_kernel_irq_routing_entry *e; | ||
184 | struct hlist_node *n; | ||
185 | |||
186 | if (irqfd->gsi >= irq_rt->nr_rt_entries) { | ||
187 | rcu_assign_pointer(irqfd->irq_entry, NULL); | ||
188 | return; | ||
189 | } | ||
190 | |||
191 | hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) { | ||
192 | /* Only fast-path MSI. */ | ||
193 | if (e->type == KVM_IRQ_ROUTING_MSI) | ||
194 | rcu_assign_pointer(irqfd->irq_entry, e); | ||
195 | else | ||
196 | rcu_assign_pointer(irqfd->irq_entry, NULL); | ||
197 | } | ||
198 | } | ||
199 | |||
166 | static int | 200 | static int |
167 | kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) | 201 | kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) |
168 | { | 202 | { |
203 | struct kvm_irq_routing_table *irq_rt; | ||
169 | struct _irqfd *irqfd, *tmp; | 204 | struct _irqfd *irqfd, *tmp; |
170 | struct file *file = NULL; | 205 | struct file *file = NULL; |
171 | struct eventfd_ctx *eventfd = NULL; | 206 | struct eventfd_ctx *eventfd = NULL; |
@@ -215,6 +250,10 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) | |||
215 | goto fail; | 250 | goto fail; |
216 | } | 251 | } |
217 | 252 | ||
253 | irq_rt = rcu_dereference_protected(kvm->irq_routing, | ||
254 | lockdep_is_held(&kvm->irqfds.lock)); | ||
255 | irqfd_update(kvm, irqfd, irq_rt); | ||
256 | |||
218 | events = file->f_op->poll(file, &irqfd->pt); | 257 | events = file->f_op->poll(file, &irqfd->pt); |
219 | 258 | ||
220 | list_add_tail(&irqfd->list, &kvm->irqfds.items); | 259 | list_add_tail(&irqfd->list, &kvm->irqfds.items); |
@@ -271,8 +310,17 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) | |||
271 | spin_lock_irq(&kvm->irqfds.lock); | 310 | spin_lock_irq(&kvm->irqfds.lock); |
272 | 311 | ||
273 | list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { | 312 | list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { |
274 | if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) | 313 | if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) { |
314 | /* | ||
315 | * This rcu_assign_pointer is needed for when | ||
316 | * another thread calls kvm_irqfd_update before | ||
317 | * we flush workqueue below. | ||
318 | * It is paired with synchronize_rcu done by caller | ||
319 | * of that function. | ||
320 | */ | ||
321 | rcu_assign_pointer(irqfd->irq_entry, NULL); | ||
275 | irqfd_deactivate(irqfd); | 322 | irqfd_deactivate(irqfd); |
323 | } | ||
276 | } | 324 | } |
277 | 325 | ||
278 | spin_unlock_irq(&kvm->irqfds.lock); | 326 | spin_unlock_irq(&kvm->irqfds.lock); |
@@ -322,6 +370,25 @@ kvm_irqfd_release(struct kvm *kvm) | |||
322 | } | 370 | } |
323 | 371 | ||
324 | /* | 372 | /* |
373 | * Change irq_routing and irqfd. | ||
374 | * Caller must invoke synchronize_rcu afterwards. | ||
375 | */ | ||
376 | void kvm_irq_routing_update(struct kvm *kvm, | ||
377 | struct kvm_irq_routing_table *irq_rt) | ||
378 | { | ||
379 | struct _irqfd *irqfd; | ||
380 | |||
381 | spin_lock_irq(&kvm->irqfds.lock); | ||
382 | |||
383 | rcu_assign_pointer(kvm->irq_routing, irq_rt); | ||
384 | |||
385 | list_for_each_entry(irqfd, &kvm->irqfds.items, list) | ||
386 | irqfd_update(kvm, irqfd, irq_rt); | ||
387 | |||
388 | spin_unlock_irq(&kvm->irqfds.lock); | ||
389 | } | ||
390 | |||
391 | /* | ||
325 | * create a host-wide workqueue for issuing deferred shutdown requests | 392 | * create a host-wide workqueue for issuing deferred shutdown requests |
326 | * aggregated from all vm* instances. We need our own isolated single-thread | 393 | * aggregated from all vm* instances. We need our own isolated single-thread |
327 | * queue to prevent deadlock against flushing the normal work-queue. | 394 | * queue to prevent deadlock against flushing the normal work-queue. |
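The irqfd fast path above keeps a cached routing entry behind RCU: the wait-queue callback reads it with rcu_dereference() and injects the MSI directly, while updaters publish a new pointer under irqfds.lock and rely on the caller's synchronize_rcu() before the old entry can go away. A generic, self-contained sketch of that reader/updater split (demo_* names are invented; the RCU and locking primitives are the ones used above):

/* Sketch only: demo_* is hypothetical, the RCU usage mirrors the real code. */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_entry { int payload; };

struct demo_cache {
	struct demo_entry __rcu *cur;
	spinlock_t lock;			/* serializes updaters */
};

/* Reader: lockless fast path, safe from atomic context. */
static int demo_read(struct demo_cache *c)
{
	struct demo_entry *e;
	int val = -1;

	rcu_read_lock();
	e = rcu_dereference(c->cur);
	if (e)
		val = e->payload;		/* fast path */
	rcu_read_unlock();
	return val;				/* -1 means "take the slow path" */
}

/* Updater: publish the new pointer, wait for readers, then free the old one. */
static void demo_update(struct demo_cache *c, struct demo_entry *new)
{
	struct demo_entry *old;

	spin_lock_irq(&c->lock);
	old = rcu_dereference_protected(c->cur, lockdep_is_held(&c->lock));
	rcu_assign_pointer(c->cur, new);
	spin_unlock_irq(&c->lock);

	synchronize_rcu();			/* no reader can still see "old" */
	kfree(old);
}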
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 8edca9141b78..9f614b4e365f 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -114,8 +114,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 	return r;
 }
 
-static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-		struct kvm *kvm, int irq_source_id, int level)
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+		struct kvm *kvm, int irq_source_id, int level)
 {
 	struct kvm_lapic_irq irq;
 
@@ -409,8 +409,9 @@ int kvm_set_irq_routing(struct kvm *kvm,
 
 	mutex_lock(&kvm->irq_lock);
 	old = kvm->irq_routing;
-	rcu_assign_pointer(kvm->irq_routing, new);
+	kvm_irq_routing_update(kvm, new);
 	mutex_unlock(&kvm->irq_lock);
+
 	synchronize_rcu();
 
 	new = old;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5225052aebc1..f29abeb6a912 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -55,6 +55,7 @@ | |||
55 | #include <asm-generic/bitops/le.h> | 55 | #include <asm-generic/bitops/le.h> |
56 | 56 | ||
57 | #include "coalesced_mmio.h" | 57 | #include "coalesced_mmio.h" |
58 | #include "async_pf.h" | ||
58 | 59 | ||
59 | #define CREATE_TRACE_POINTS | 60 | #define CREATE_TRACE_POINTS |
60 | #include <trace/events/kvm.h> | 61 | #include <trace/events/kvm.h> |
@@ -89,7 +90,8 @@ static void hardware_disable_all(void); | |||
89 | 90 | ||
90 | static void kvm_io_bus_destroy(struct kvm_io_bus *bus); | 91 | static void kvm_io_bus_destroy(struct kvm_io_bus *bus); |
91 | 92 | ||
92 | static bool kvm_rebooting; | 93 | bool kvm_rebooting; |
94 | EXPORT_SYMBOL_GPL(kvm_rebooting); | ||
93 | 95 | ||
94 | static bool largepages_enabled = true; | 96 | static bool largepages_enabled = true; |
95 | 97 | ||
@@ -102,8 +104,26 @@ static pfn_t fault_pfn; | |||
102 | inline int kvm_is_mmio_pfn(pfn_t pfn) | 104 | inline int kvm_is_mmio_pfn(pfn_t pfn) |
103 | { | 105 | { |
104 | if (pfn_valid(pfn)) { | 106 | if (pfn_valid(pfn)) { |
105 | struct page *page = compound_head(pfn_to_page(pfn)); | 107 | int reserved; |
106 | return PageReserved(page); | 108 | struct page *tail = pfn_to_page(pfn); |
109 | struct page *head = compound_trans_head(tail); | ||
110 | reserved = PageReserved(head); | ||
111 | if (head != tail) { | ||
112 | /* | ||
113 | * "head" is not a dangling pointer | ||
114 | * (compound_trans_head takes care of that) | ||
115 | * but the hugepage may have been splitted | ||
116 | * from under us (and we may not hold a | ||
117 | * reference count on the head page so it can | ||
118 | * be reused before we run PageReferenced), so | ||
119 | * we've to check PageTail before returning | ||
120 | * what we just read. | ||
121 | */ | ||
122 | smp_rmb(); | ||
123 | if (PageTail(tail)) | ||
124 | return reserved; | ||
125 | } | ||
126 | return PageReserved(tail); | ||
107 | } | 127 | } |
108 | 128 | ||
109 | return true; | 129 | return true; |
@@ -167,8 +187,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) | |||
167 | 187 | ||
168 | void kvm_flush_remote_tlbs(struct kvm *kvm) | 188 | void kvm_flush_remote_tlbs(struct kvm *kvm) |
169 | { | 189 | { |
190 | int dirty_count = kvm->tlbs_dirty; | ||
191 | |||
192 | smp_mb(); | ||
170 | if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) | 193 | if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) |
171 | ++kvm->stat.remote_tlb_flush; | 194 | ++kvm->stat.remote_tlb_flush; |
195 | cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); | ||
172 | } | 196 | } |
173 | 197 | ||
174 | void kvm_reload_remote_mmus(struct kvm *kvm) | 198 | void kvm_reload_remote_mmus(struct kvm *kvm) |
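kvm_flush_remote_tlbs() now snapshots tlbs_dirty before flushing and clears it afterwards with cmpxchg(), so the counter is reset only if no other CPU dirtied more sptes in the meantime and no flush request is ever lost. A minimal sketch of that lockless idea (demo_* names are invented; smp_mb() and cmpxchg() are used as in the hunk above):

/* Sketch only: demo_* is hypothetical. */
#include <linux/atomic.h>

struct demo_shadow {
	long tlbs_dirty;		/* bumped by lockless spte writers */
};

static void demo_send_flush_ipis(struct demo_shadow *s)
{
	/* stand-in for the actual remote TLB flush */
}

static void demo_flush_remote_tlbs(struct demo_shadow *s)
{
	long dirty_count = s->tlbs_dirty;

	smp_mb();			/* order the read against the flush */
	demo_send_flush_ipis(s);
	/* Reset only what was observed; a newer write keeps its request. */
	cmpxchg(&s->tlbs_dirty, dirty_count, 0);
}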
@@ -186,6 +210,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) | |||
186 | vcpu->kvm = kvm; | 210 | vcpu->kvm = kvm; |
187 | vcpu->vcpu_id = id; | 211 | vcpu->vcpu_id = id; |
188 | init_waitqueue_head(&vcpu->wq); | 212 | init_waitqueue_head(&vcpu->wq); |
213 | kvm_async_pf_vcpu_init(vcpu); | ||
189 | 214 | ||
190 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | 215 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); |
191 | if (!page) { | 216 | if (!page) { |
@@ -247,7 +272,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, | |||
247 | idx = srcu_read_lock(&kvm->srcu); | 272 | idx = srcu_read_lock(&kvm->srcu); |
248 | spin_lock(&kvm->mmu_lock); | 273 | spin_lock(&kvm->mmu_lock); |
249 | kvm->mmu_notifier_seq++; | 274 | kvm->mmu_notifier_seq++; |
250 | need_tlb_flush = kvm_unmap_hva(kvm, address); | 275 | need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; |
251 | spin_unlock(&kvm->mmu_lock); | 276 | spin_unlock(&kvm->mmu_lock); |
252 | srcu_read_unlock(&kvm->srcu, idx); | 277 | srcu_read_unlock(&kvm->srcu, idx); |
253 | 278 | ||
@@ -291,6 +316,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, | |||
291 | kvm->mmu_notifier_count++; | 316 | kvm->mmu_notifier_count++; |
292 | for (; start < end; start += PAGE_SIZE) | 317 | for (; start < end; start += PAGE_SIZE) |
293 | need_tlb_flush |= kvm_unmap_hva(kvm, start); | 318 | need_tlb_flush |= kvm_unmap_hva(kvm, start); |
319 | need_tlb_flush |= kvm->tlbs_dirty; | ||
294 | spin_unlock(&kvm->mmu_lock); | 320 | spin_unlock(&kvm->mmu_lock); |
295 | srcu_read_unlock(&kvm->srcu, idx); | 321 | srcu_read_unlock(&kvm->srcu, idx); |
296 | 322 | ||
@@ -344,6 +370,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, | |||
344 | return young; | 370 | return young; |
345 | } | 371 | } |
346 | 372 | ||
373 | static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, | ||
374 | struct mm_struct *mm, | ||
375 | unsigned long address) | ||
376 | { | ||
377 | struct kvm *kvm = mmu_notifier_to_kvm(mn); | ||
378 | int young, idx; | ||
379 | |||
380 | idx = srcu_read_lock(&kvm->srcu); | ||
381 | spin_lock(&kvm->mmu_lock); | ||
382 | young = kvm_test_age_hva(kvm, address); | ||
383 | spin_unlock(&kvm->mmu_lock); | ||
384 | srcu_read_unlock(&kvm->srcu, idx); | ||
385 | |||
386 | return young; | ||
387 | } | ||
388 | |||
347 | static void kvm_mmu_notifier_release(struct mmu_notifier *mn, | 389 | static void kvm_mmu_notifier_release(struct mmu_notifier *mn, |
348 | struct mm_struct *mm) | 390 | struct mm_struct *mm) |
349 | { | 391 | { |
@@ -360,6 +402,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { | |||
360 | .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, | 402 | .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, |
361 | .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, | 403 | .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, |
362 | .clear_flush_young = kvm_mmu_notifier_clear_flush_young, | 404 | .clear_flush_young = kvm_mmu_notifier_clear_flush_young, |
405 | .test_young = kvm_mmu_notifier_test_young, | ||
363 | .change_pte = kvm_mmu_notifier_change_pte, | 406 | .change_pte = kvm_mmu_notifier_change_pte, |
364 | .release = kvm_mmu_notifier_release, | 407 | .release = kvm_mmu_notifier_release, |
365 | }; | 408 | }; |
@@ -381,11 +424,15 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) | |||
381 | 424 | ||
382 | static struct kvm *kvm_create_vm(void) | 425 | static struct kvm *kvm_create_vm(void) |
383 | { | 426 | { |
384 | int r = 0, i; | 427 | int r, i; |
385 | struct kvm *kvm = kvm_arch_create_vm(); | 428 | struct kvm *kvm = kvm_arch_alloc_vm(); |
386 | 429 | ||
387 | if (IS_ERR(kvm)) | 430 | if (!kvm) |
388 | goto out; | 431 | return ERR_PTR(-ENOMEM); |
432 | |||
433 | r = kvm_arch_init_vm(kvm); | ||
434 | if (r) | ||
435 | goto out_err_nodisable; | ||
389 | 436 | ||
390 | r = hardware_enable_all(); | 437 | r = hardware_enable_all(); |
391 | if (r) | 438 | if (r) |
@@ -399,23 +446,19 @@ static struct kvm *kvm_create_vm(void) | |||
399 | r = -ENOMEM; | 446 | r = -ENOMEM; |
400 | kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); | 447 | kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); |
401 | if (!kvm->memslots) | 448 | if (!kvm->memslots) |
402 | goto out_err; | 449 | goto out_err_nosrcu; |
403 | if (init_srcu_struct(&kvm->srcu)) | 450 | if (init_srcu_struct(&kvm->srcu)) |
404 | goto out_err; | 451 | goto out_err_nosrcu; |
405 | for (i = 0; i < KVM_NR_BUSES; i++) { | 452 | for (i = 0; i < KVM_NR_BUSES; i++) { |
406 | kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), | 453 | kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), |
407 | GFP_KERNEL); | 454 | GFP_KERNEL); |
408 | if (!kvm->buses[i]) { | 455 | if (!kvm->buses[i]) |
409 | cleanup_srcu_struct(&kvm->srcu); | ||
410 | goto out_err; | 456 | goto out_err; |
411 | } | ||
412 | } | 457 | } |
413 | 458 | ||
414 | r = kvm_init_mmu_notifier(kvm); | 459 | r = kvm_init_mmu_notifier(kvm); |
415 | if (r) { | 460 | if (r) |
416 | cleanup_srcu_struct(&kvm->srcu); | ||
417 | goto out_err; | 461 | goto out_err; |
418 | } | ||
419 | 462 | ||
420 | kvm->mm = current->mm; | 463 | kvm->mm = current->mm; |
421 | atomic_inc(&kvm->mm->mm_count); | 464 | atomic_inc(&kvm->mm->mm_count); |
@@ -429,19 +472,35 @@ static struct kvm *kvm_create_vm(void) | |||
429 | spin_lock(&kvm_lock); | 472 | spin_lock(&kvm_lock); |
430 | list_add(&kvm->vm_list, &vm_list); | 473 | list_add(&kvm->vm_list, &vm_list); |
431 | spin_unlock(&kvm_lock); | 474 | spin_unlock(&kvm_lock); |
432 | out: | 475 | |
433 | return kvm; | 476 | return kvm; |
434 | 477 | ||
435 | out_err: | 478 | out_err: |
479 | cleanup_srcu_struct(&kvm->srcu); | ||
480 | out_err_nosrcu: | ||
436 | hardware_disable_all(); | 481 | hardware_disable_all(); |
437 | out_err_nodisable: | 482 | out_err_nodisable: |
438 | for (i = 0; i < KVM_NR_BUSES; i++) | 483 | for (i = 0; i < KVM_NR_BUSES; i++) |
439 | kfree(kvm->buses[i]); | 484 | kfree(kvm->buses[i]); |
440 | kfree(kvm->memslots); | 485 | kfree(kvm->memslots); |
441 | kfree(kvm); | 486 | kvm_arch_free_vm(kvm); |
442 | return ERR_PTR(r); | 487 | return ERR_PTR(r); |
443 | } | 488 | } |
444 | 489 | ||
490 | static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) | ||
491 | { | ||
492 | if (!memslot->dirty_bitmap) | ||
493 | return; | ||
494 | |||
495 | if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) | ||
496 | vfree(memslot->dirty_bitmap_head); | ||
497 | else | ||
498 | kfree(memslot->dirty_bitmap_head); | ||
499 | |||
500 | memslot->dirty_bitmap = NULL; | ||
501 | memslot->dirty_bitmap_head = NULL; | ||
502 | } | ||
503 | |||
445 | /* | 504 | /* |
446 | * Free any memory in @free but not in @dont. | 505 | * Free any memory in @free but not in @dont. |
447 | */ | 506 | */ |
@@ -454,7 +513,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | |||
454 | vfree(free->rmap); | 513 | vfree(free->rmap); |
455 | 514 | ||
456 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) | 515 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) |
457 | vfree(free->dirty_bitmap); | 516 | kvm_destroy_dirty_bitmap(free); |
458 | 517 | ||
459 | 518 | ||
460 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | 519 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { |
@@ -465,7 +524,6 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | |||
465 | } | 524 | } |
466 | 525 | ||
467 | free->npages = 0; | 526 | free->npages = 0; |
468 | free->dirty_bitmap = NULL; | ||
469 | free->rmap = NULL; | 527 | free->rmap = NULL; |
470 | } | 528 | } |
471 | 529 | ||
@@ -499,6 +557,9 @@ static void kvm_destroy_vm(struct kvm *kvm) | |||
499 | kvm_arch_flush_shadow(kvm); | 557 | kvm_arch_flush_shadow(kvm); |
500 | #endif | 558 | #endif |
501 | kvm_arch_destroy_vm(kvm); | 559 | kvm_arch_destroy_vm(kvm); |
560 | kvm_free_physmem(kvm); | ||
561 | cleanup_srcu_struct(&kvm->srcu); | ||
562 | kvm_arch_free_vm(kvm); | ||
502 | hardware_disable_all(); | 563 | hardware_disable_all(); |
503 | mmdrop(mm); | 564 | mmdrop(mm); |
504 | } | 565 | } |
@@ -528,6 +589,27 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) | |||
528 | } | 589 | } |
529 | 590 | ||
530 | /* | 591 | /* |
592 | * Allocation size is twice as large as the actual dirty bitmap size. | ||
593 | * This makes it possible to do double buffering: see x86's | ||
594 | * kvm_vm_ioctl_get_dirty_log(). | ||
595 | */ | ||
596 | static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) | ||
597 | { | ||
598 | unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); | ||
599 | |||
600 | if (dirty_bytes > PAGE_SIZE) | ||
601 | memslot->dirty_bitmap = vzalloc(dirty_bytes); | ||
602 | else | ||
603 | memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL); | ||
604 | |||
605 | if (!memslot->dirty_bitmap) | ||
606 | return -ENOMEM; | ||
607 | |||
608 | memslot->dirty_bitmap_head = memslot->dirty_bitmap; | ||
609 | return 0; | ||
610 | } | ||
611 | |||
612 | /* | ||
531 | * Allocate some memory and give it an address in the guest physical address | 613 | * Allocate some memory and give it an address in the guest physical address |
532 | * space. | 614 | * space. |
533 | * | 615 | * |
@@ -604,13 +686,11 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
604 | /* Allocate if a slot is being created */ | 686 | /* Allocate if a slot is being created */ |
605 | #ifndef CONFIG_S390 | 687 | #ifndef CONFIG_S390 |
606 | if (npages && !new.rmap) { | 688 | if (npages && !new.rmap) { |
607 | new.rmap = vmalloc(npages * sizeof(*new.rmap)); | 689 | new.rmap = vzalloc(npages * sizeof(*new.rmap)); |
608 | 690 | ||
609 | if (!new.rmap) | 691 | if (!new.rmap) |
610 | goto out_free; | 692 | goto out_free; |
611 | 693 | ||
612 | memset(new.rmap, 0, npages * sizeof(*new.rmap)); | ||
613 | |||
614 | new.user_alloc = user_alloc; | 694 | new.user_alloc = user_alloc; |
615 | new.userspace_addr = mem->userspace_addr; | 695 | new.userspace_addr = mem->userspace_addr; |
616 | } | 696 | } |
@@ -633,14 +713,11 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
633 | >> KVM_HPAGE_GFN_SHIFT(level)); | 713 | >> KVM_HPAGE_GFN_SHIFT(level)); |
634 | lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); | 714 | lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); |
635 | 715 | ||
636 | new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); | 716 | new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i])); |
637 | 717 | ||
638 | if (!new.lpage_info[i]) | 718 | if (!new.lpage_info[i]) |
639 | goto out_free; | 719 | goto out_free; |
640 | 720 | ||
641 | memset(new.lpage_info[i], 0, | ||
642 | lpages * sizeof(*new.lpage_info[i])); | ||
643 | |||
644 | if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) | 721 | if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) |
645 | new.lpage_info[i][0].write_count = 1; | 722 | new.lpage_info[i][0].write_count = 1; |
646 | if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) | 723 | if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) |
@@ -661,12 +738,8 @@ skip_lpage: | |||
661 | 738 | ||
662 | /* Allocate page dirty bitmap if needed */ | 739 | /* Allocate page dirty bitmap if needed */ |
663 | if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { | 740 | if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { |
664 | unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new); | 741 | if (kvm_create_dirty_bitmap(&new) < 0) |
665 | |||
666 | new.dirty_bitmap = vmalloc(dirty_bytes); | ||
667 | if (!new.dirty_bitmap) | ||
668 | goto out_free; | 742 | goto out_free; |
669 | memset(new.dirty_bitmap, 0, dirty_bytes); | ||
670 | /* destroy any largepage mappings for dirty tracking */ | 743 | /* destroy any largepage mappings for dirty tracking */ |
671 | if (old.npages) | 744 | if (old.npages) |
672 | flush_shadow = 1; | 745 | flush_shadow = 1; |
@@ -685,6 +758,7 @@ skip_lpage: | |||
685 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); | 758 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); |
686 | if (mem->slot >= slots->nmemslots) | 759 | if (mem->slot >= slots->nmemslots) |
687 | slots->nmemslots = mem->slot + 1; | 760 | slots->nmemslots = mem->slot + 1; |
761 | slots->generation++; | ||
688 | slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; | 762 | slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; |
689 | 763 | ||
690 | old_memslots = kvm->memslots; | 764 | old_memslots = kvm->memslots; |
@@ -719,6 +793,7 @@ skip_lpage: | |||
719 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); | 793 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); |
720 | if (mem->slot >= slots->nmemslots) | 794 | if (mem->slot >= slots->nmemslots) |
721 | slots->nmemslots = mem->slot + 1; | 795 | slots->nmemslots = mem->slot + 1; |
796 | slots->generation++; | ||
722 | 797 | ||
723 | /* actual memory is freed via old in kvm_free_physmem_slot below */ | 798 | /* actual memory is freed via old in kvm_free_physmem_slot below */ |
724 | if (!npages) { | 799 | if (!npages) { |
@@ -849,10 +924,10 @@ int kvm_is_error_hva(unsigned long addr) | |||
849 | } | 924 | } |
850 | EXPORT_SYMBOL_GPL(kvm_is_error_hva); | 925 | EXPORT_SYMBOL_GPL(kvm_is_error_hva); |
851 | 926 | ||
852 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | 927 | static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, |
928 | gfn_t gfn) | ||
853 | { | 929 | { |
854 | int i; | 930 | int i; |
855 | struct kvm_memslots *slots = kvm_memslots(kvm); | ||
856 | 931 | ||
857 | for (i = 0; i < slots->nmemslots; ++i) { | 932 | for (i = 0; i < slots->nmemslots; ++i) { |
858 | struct kvm_memory_slot *memslot = &slots->memslots[i]; | 933 | struct kvm_memory_slot *memslot = &slots->memslots[i]; |
@@ -863,6 +938,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | |||
863 | } | 938 | } |
864 | return NULL; | 939 | return NULL; |
865 | } | 940 | } |
941 | |||
942 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | ||
943 | { | ||
944 | return __gfn_to_memslot(kvm_memslots(kvm), gfn); | ||
945 | } | ||
866 | EXPORT_SYMBOL_GPL(gfn_to_memslot); | 946 | EXPORT_SYMBOL_GPL(gfn_to_memslot); |
867 | 947 | ||
868 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) | 948 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) |
@@ -925,12 +1005,9 @@ int memslot_id(struct kvm *kvm, gfn_t gfn) | |||
925 | return memslot - slots->memslots; | 1005 | return memslot - slots->memslots; |
926 | } | 1006 | } |
927 | 1007 | ||
928 | static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, | 1008 | static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, |
929 | gfn_t *nr_pages) | 1009 | gfn_t *nr_pages) |
930 | { | 1010 | { |
931 | struct kvm_memory_slot *slot; | ||
932 | |||
933 | slot = gfn_to_memslot(kvm, gfn); | ||
934 | if (!slot || slot->flags & KVM_MEMSLOT_INVALID) | 1011 | if (!slot || slot->flags & KVM_MEMSLOT_INVALID) |
935 | return bad_hva(); | 1012 | return bad_hva(); |
936 | 1013 | ||
@@ -942,28 +1019,61 @@ static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, | |||
942 | 1019 | ||
943 | unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) | 1020 | unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) |
944 | { | 1021 | { |
945 | return gfn_to_hva_many(kvm, gfn, NULL); | 1022 | return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); |
946 | } | 1023 | } |
947 | EXPORT_SYMBOL_GPL(gfn_to_hva); | 1024 | EXPORT_SYMBOL_GPL(gfn_to_hva); |
948 | 1025 | ||
949 | static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) | 1026 | static pfn_t get_fault_pfn(void) |
1027 | { | ||
1028 | get_page(fault_page); | ||
1029 | return fault_pfn; | ||
1030 | } | ||
1031 | |||
1032 | static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, | ||
1033 | bool *async, bool write_fault, bool *writable) | ||
950 | { | 1034 | { |
951 | struct page *page[1]; | 1035 | struct page *page[1]; |
952 | int npages; | 1036 | int npages = 0; |
953 | pfn_t pfn; | 1037 | pfn_t pfn; |
954 | 1038 | ||
955 | if (atomic) | 1039 | /* we can do it either atomically or asynchronously, not both */ |
1040 | BUG_ON(atomic && async); | ||
1041 | |||
1042 | BUG_ON(!write_fault && !writable); | ||
1043 | |||
1044 | if (writable) | ||
1045 | *writable = true; | ||
1046 | |||
1047 | if (atomic || async) | ||
956 | npages = __get_user_pages_fast(addr, 1, 1, page); | 1048 | npages = __get_user_pages_fast(addr, 1, 1, page); |
957 | else { | 1049 | |
1050 | if (unlikely(npages != 1) && !atomic) { | ||
958 | might_sleep(); | 1051 | might_sleep(); |
959 | npages = get_user_pages_fast(addr, 1, 1, page); | 1052 | |
1053 | if (writable) | ||
1054 | *writable = write_fault; | ||
1055 | |||
1056 | npages = get_user_pages_fast(addr, 1, write_fault, page); | ||
1057 | |||
1058 | /* map read fault as writable if possible */ | ||
1059 | if (unlikely(!write_fault) && npages == 1) { | ||
1060 | struct page *wpage[1]; | ||
1061 | |||
1062 | npages = __get_user_pages_fast(addr, 1, 1, wpage); | ||
1063 | if (npages == 1) { | ||
1064 | *writable = true; | ||
1065 | put_page(page[0]); | ||
1066 | page[0] = wpage[0]; | ||
1067 | } | ||
1068 | npages = 1; | ||
1069 | } | ||
960 | } | 1070 | } |
961 | 1071 | ||
962 | if (unlikely(npages != 1)) { | 1072 | if (unlikely(npages != 1)) { |
963 | struct vm_area_struct *vma; | 1073 | struct vm_area_struct *vma; |
964 | 1074 | ||
965 | if (atomic) | 1075 | if (atomic) |
966 | goto return_fault_page; | 1076 | return get_fault_pfn(); |
967 | 1077 | ||
968 | down_read(¤t->mm->mmap_sem); | 1078 | down_read(¤t->mm->mmap_sem); |
969 | if (is_hwpoison_address(addr)) { | 1079 | if (is_hwpoison_address(addr)) { |
@@ -972,19 +1082,20 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) | |||
972 | return page_to_pfn(hwpoison_page); | 1082 | return page_to_pfn(hwpoison_page); |
973 | } | 1083 | } |
974 | 1084 | ||
975 | vma = find_vma(current->mm, addr); | 1085 | vma = find_vma_intersection(current->mm, addr, addr+1); |
976 | 1086 | ||
977 | if (vma == NULL || addr < vma->vm_start || | 1087 | if (vma == NULL) |
978 | !(vma->vm_flags & VM_PFNMAP)) { | 1088 | pfn = get_fault_pfn(); |
979 | up_read(¤t->mm->mmap_sem); | 1089 | else if ((vma->vm_flags & VM_PFNMAP)) { |
980 | return_fault_page: | 1090 | pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + |
981 | get_page(fault_page); | 1091 | vma->vm_pgoff; |
982 | return page_to_pfn(fault_page); | 1092 | BUG_ON(!kvm_is_mmio_pfn(pfn)); |
1093 | } else { | ||
1094 | if (async && (vma->vm_flags & VM_WRITE)) | ||
1095 | *async = true; | ||
1096 | pfn = get_fault_pfn(); | ||
983 | } | 1097 | } |
984 | |||
985 | pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
986 | up_read(¤t->mm->mmap_sem); | 1098 | up_read(¤t->mm->mmap_sem); |
987 | BUG_ON(!kvm_is_mmio_pfn(pfn)); | ||
988 | } else | 1099 | } else |
989 | pfn = page_to_pfn(page[0]); | 1100 | pfn = page_to_pfn(page[0]); |
990 | 1101 | ||
@@ -993,40 +1104,58 @@ return_fault_page: | |||
993 | 1104 | ||
994 | pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) | 1105 | pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) |
995 | { | 1106 | { |
996 | return hva_to_pfn(kvm, addr, true); | 1107 | return hva_to_pfn(kvm, addr, true, NULL, true, NULL); |
997 | } | 1108 | } |
998 | EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); | 1109 | EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); |
999 | 1110 | ||
1000 | static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic) | 1111 | static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, |
1112 | bool write_fault, bool *writable) | ||
1001 | { | 1113 | { |
1002 | unsigned long addr; | 1114 | unsigned long addr; |
1003 | 1115 | ||
1116 | if (async) | ||
1117 | *async = false; | ||
1118 | |||
1004 | addr = gfn_to_hva(kvm, gfn); | 1119 | addr = gfn_to_hva(kvm, gfn); |
1005 | if (kvm_is_error_hva(addr)) { | 1120 | if (kvm_is_error_hva(addr)) { |
1006 | get_page(bad_page); | 1121 | get_page(bad_page); |
1007 | return page_to_pfn(bad_page); | 1122 | return page_to_pfn(bad_page); |
1008 | } | 1123 | } |
1009 | 1124 | ||
1010 | return hva_to_pfn(kvm, addr, atomic); | 1125 | return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); |
1011 | } | 1126 | } |
1012 | 1127 | ||
1013 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) | 1128 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) |
1014 | { | 1129 | { |
1015 | return __gfn_to_pfn(kvm, gfn, true); | 1130 | return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); |
1016 | } | 1131 | } |
1017 | EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); | 1132 | EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); |
1018 | 1133 | ||
1134 | pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, | ||
1135 | bool write_fault, bool *writable) | ||
1136 | { | ||
1137 | return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); | ||
1138 | } | ||
1139 | EXPORT_SYMBOL_GPL(gfn_to_pfn_async); | ||
1140 | |||
1019 | pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) | 1141 | pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) |
1020 | { | 1142 | { |
1021 | return __gfn_to_pfn(kvm, gfn, false); | 1143 | return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); |
1022 | } | 1144 | } |
1023 | EXPORT_SYMBOL_GPL(gfn_to_pfn); | 1145 | EXPORT_SYMBOL_GPL(gfn_to_pfn); |
1024 | 1146 | ||
1147 | pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, | ||
1148 | bool *writable) | ||
1149 | { | ||
1150 | return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); | ||
1151 | } | ||
1152 | EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); | ||
1153 | |||
1025 | pfn_t gfn_to_pfn_memslot(struct kvm *kvm, | 1154 | pfn_t gfn_to_pfn_memslot(struct kvm *kvm, |
1026 | struct kvm_memory_slot *slot, gfn_t gfn) | 1155 | struct kvm_memory_slot *slot, gfn_t gfn) |
1027 | { | 1156 | { |
1028 | unsigned long addr = gfn_to_hva_memslot(slot, gfn); | 1157 | unsigned long addr = gfn_to_hva_memslot(slot, gfn); |
1029 | return hva_to_pfn(kvm, addr, false); | 1158 | return hva_to_pfn(kvm, addr, false, NULL, true, NULL); |
1030 | } | 1159 | } |
1031 | 1160 | ||
1032 | int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, | 1161 | int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, |
@@ -1035,7 +1164,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, | |||
1035 | unsigned long addr; | 1164 | unsigned long addr; |
1036 | gfn_t entry; | 1165 | gfn_t entry; |
1037 | 1166 | ||
1038 | addr = gfn_to_hva_many(kvm, gfn, &entry); | 1167 | addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); |
1039 | if (kvm_is_error_hva(addr)) | 1168 | if (kvm_is_error_hva(addr)) |
1040 | return -1; | 1169 | return -1; |
1041 | 1170 | ||
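The new gfn_to_pfn_prot()/gfn_to_pfn_async() entry points report, through the writable out-parameter, whether the caller may install a writable mapping even for a read fault (see the "map read fault as writable if possible" retry earlier in hva_to_pfn). A userspace model of that calling convention, with all names and the toy gfn-to-pfn mapping being stand-ins rather than the kernel implementation:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long pfn_t;

/* pretend host state: gfn 7 is read-only, everything else is writable */
static pfn_t fake_gfn_to_pfn_prot(unsigned long gfn, bool write_fault,
				  bool *writable)
{
	bool host_writable = (gfn != 7);

	if (writable)
		/* a read fault may still be mapped writable if the host allows it */
		*writable = write_fault ? true : host_writable;
	return gfn + 0x1000; /* arbitrary translation for the model */
}

int main(void)
{
	bool map_writable;
	pfn_t pfn = fake_gfn_to_pfn_prot(3, false, &map_writable);

	/* a caller would use map_writable to pick the protection bits of the spte */
	printf("pfn=0x%lx map_writable=%d\n", pfn, map_writable);
	return 0;
}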
@@ -1219,9 +1348,51 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, | |||
1219 | return 0; | 1348 | return 0; |
1220 | } | 1349 | } |
1221 | 1350 | ||
1351 | int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | ||
1352 | gpa_t gpa) | ||
1353 | { | ||
1354 | struct kvm_memslots *slots = kvm_memslots(kvm); | ||
1355 | int offset = offset_in_page(gpa); | ||
1356 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
1357 | |||
1358 | ghc->gpa = gpa; | ||
1359 | ghc->generation = slots->generation; | ||
1360 | ghc->memslot = __gfn_to_memslot(slots, gfn); | ||
1361 | ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); | ||
1362 | if (!kvm_is_error_hva(ghc->hva)) | ||
1363 | ghc->hva += offset; | ||
1364 | else | ||
1365 | return -EFAULT; | ||
1366 | |||
1367 | return 0; | ||
1368 | } | ||
1369 | EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); | ||
1370 | |||
1371 | int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | ||
1372 | void *data, unsigned long len) | ||
1373 | { | ||
1374 | struct kvm_memslots *slots = kvm_memslots(kvm); | ||
1375 | int r; | ||
1376 | |||
1377 | if (slots->generation != ghc->generation) | ||
1378 | kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); | ||
1379 | |||
1380 | if (kvm_is_error_hva(ghc->hva)) | ||
1381 | return -EFAULT; | ||
1382 | |||
1383 | r = copy_to_user((void __user *)ghc->hva, data, len); | ||
1384 | if (r) | ||
1385 | return -EFAULT; | ||
1386 | mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); | ||
1387 | |||
1388 | return 0; | ||
1389 | } | ||
1390 | EXPORT_SYMBOL_GPL(kvm_write_guest_cached); | ||
1391 | |||
1222 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) | 1392 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) |
1223 | { | 1393 | { |
1224 | return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); | 1394 | return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, |
1395 | offset, len); | ||
1225 | } | 1396 | } |
1226 | EXPORT_SYMBOL_GPL(kvm_clear_guest_page); | 1397 | EXPORT_SYMBOL_GPL(kvm_clear_guest_page); |
1227 | 1398 | ||
@@ -1244,11 +1415,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) | |||
1244 | } | 1415 | } |
1245 | EXPORT_SYMBOL_GPL(kvm_clear_guest); | 1416 | EXPORT_SYMBOL_GPL(kvm_clear_guest); |
1246 | 1417 | ||
1247 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn) | 1418 | void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, |
1419 | gfn_t gfn) | ||
1248 | { | 1420 | { |
1249 | struct kvm_memory_slot *memslot; | ||
1250 | |||
1251 | memslot = gfn_to_memslot(kvm, gfn); | ||
1252 | if (memslot && memslot->dirty_bitmap) { | 1421 | if (memslot && memslot->dirty_bitmap) { |
1253 | unsigned long rel_gfn = gfn - memslot->base_gfn; | 1422 | unsigned long rel_gfn = gfn - memslot->base_gfn; |
1254 | 1423 | ||
@@ -1256,6 +1425,14 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) | |||
1256 | } | 1425 | } |
1257 | } | 1426 | } |
1258 | 1427 | ||
1428 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn) | ||
1429 | { | ||
1430 | struct kvm_memory_slot *memslot; | ||
1431 | |||
1432 | memslot = gfn_to_memslot(kvm, gfn); | ||
1433 | mark_page_dirty_in_slot(kvm, memslot, gfn); | ||
1434 | } | ||
1435 | |||
1259 | /* | 1436 | /* |
1260 | * The vCPU has executed a HLT instruction with in-kernel mode enabled. | 1437 | * The vCPU has executed a HLT instruction with in-kernel mode enabled. |
1261 | */ | 1438 | */ |
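The mark_page_dirty() split in the hunk above lets callers that already know the memslot (such as kvm_write_guest_cached()) mark the page dirty without a second gfn lookup. A userspace model of the slot-relative bookkeeping involved (plain C bit setting standing in for the kernel's bitops; all names are stand-ins):

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

struct fake_slot {
	unsigned long base_gfn;
	unsigned long *dirty_bitmap;   /* NULL when dirty logging is off */
};

static unsigned long bitmap_words[4];

static void fake_mark_page_dirty_in_slot(struct fake_slot *slot,
					 unsigned long gfn)
{
	if (slot && slot->dirty_bitmap) {
		/* index into the bitmap is relative to the slot's first gfn */
		unsigned long rel_gfn = gfn - slot->base_gfn;

		slot->dirty_bitmap[rel_gfn / BITS_PER_LONG] |=
			1UL << (rel_gfn % BITS_PER_LONG);
	}
}

int main(void)
{
	struct fake_slot slot = { .base_gfn = 0x100,
				  .dirty_bitmap = bitmap_words };

	fake_mark_page_dirty_in_slot(&slot, 0x105);
	printf("word0=%#lx\n", slot.dirty_bitmap[0]); /* bit 5 set -> 0x20 */
	return 0;
}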
@@ -1457,6 +1634,7 @@ static long kvm_vcpu_ioctl(struct file *filp, | |||
1457 | if (arg) | 1634 | if (arg) |
1458 | goto out; | 1635 | goto out; |
1459 | r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); | 1636 | r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); |
1637 | trace_kvm_userspace_exit(vcpu->run->exit_reason, r); | ||
1460 | break; | 1638 | break; |
1461 | case KVM_GET_REGS: { | 1639 | case KVM_GET_REGS: { |
1462 | struct kvm_regs *kvm_regs; | 1640 | struct kvm_regs *kvm_regs; |
@@ -1824,7 +2002,7 @@ static struct file_operations kvm_vm_fops = { | |||
1824 | 2002 | ||
1825 | static int kvm_dev_ioctl_create_vm(void) | 2003 | static int kvm_dev_ioctl_create_vm(void) |
1826 | { | 2004 | { |
1827 | int fd, r; | 2005 | int r; |
1828 | struct kvm *kvm; | 2006 | struct kvm *kvm; |
1829 | 2007 | ||
1830 | kvm = kvm_create_vm(); | 2008 | kvm = kvm_create_vm(); |
@@ -1837,11 +2015,11 @@ static int kvm_dev_ioctl_create_vm(void) | |||
1837 | return r; | 2015 | return r; |
1838 | } | 2016 | } |
1839 | #endif | 2017 | #endif |
1840 | fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); | 2018 | r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); |
1841 | if (fd < 0) | 2019 | if (r < 0) |
1842 | kvm_put_kvm(kvm); | 2020 | kvm_put_kvm(kvm); |
1843 | 2021 | ||
1844 | return fd; | 2022 | return r; |
1845 | } | 2023 | } |
1846 | 2024 | ||
1847 | static long kvm_dev_ioctl_check_extension_generic(long arg) | 2025 | static long kvm_dev_ioctl_check_extension_generic(long arg) |
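The fd-to-r cleanup above keeps the pattern of dropping the VM reference when handing out the file descriptor fails, and returns the negative errno directly. A small userspace model of that error path (refcounting and the "getfd" step are simulated; none of this is the kernel code):

#include <errno.h>
#include <stdio.h>

struct fake_vm {
	int refcount;
};

static void fake_put_vm(struct fake_vm *vm)
{
	if (--vm->refcount == 0)
		printf("vm destroyed\n");
}

/* stand-in for anon_inode_getfd(): returns an fd >= 0 or a negative errno */
static int fake_getfd(struct fake_vm *vm, int simulate_failure)
{
	(void)vm;
	return simulate_failure ? -EMFILE : 4;
}

static int fake_create_vm(int simulate_failure)
{
	struct fake_vm vm = { .refcount = 1 };
	int r;

	r = fake_getfd(&vm, simulate_failure);
	if (r < 0)
		fake_put_vm(&vm);  /* failed to expose it: drop our reference */

	return r;                  /* fd on success, -errno on failure */
}

int main(void)
{
	printf("ok path: %d\n", fake_create_vm(0));
	printf("error path: %d\n", fake_create_vm(1));
	return 0;
}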
@@ -1922,7 +2100,7 @@ static struct miscdevice kvm_dev = { | |||
1922 | &kvm_chardev_ops, | 2100 | &kvm_chardev_ops, |
1923 | }; | 2101 | }; |
1924 | 2102 | ||
1925 | static void hardware_enable(void *junk) | 2103 | static void hardware_enable_nolock(void *junk) |
1926 | { | 2104 | { |
1927 | int cpu = raw_smp_processor_id(); | 2105 | int cpu = raw_smp_processor_id(); |
1928 | int r; | 2106 | int r; |
@@ -1942,7 +2120,14 @@ static void hardware_enable(void *junk) | |||
1942 | } | 2120 | } |
1943 | } | 2121 | } |
1944 | 2122 | ||
1945 | static void hardware_disable(void *junk) | 2123 | static void hardware_enable(void *junk) |
2124 | { | ||
2125 | spin_lock(&kvm_lock); | ||
2126 | hardware_enable_nolock(junk); | ||
2127 | spin_unlock(&kvm_lock); | ||
2128 | } | ||
2129 | |||
2130 | static void hardware_disable_nolock(void *junk) | ||
1946 | { | 2131 | { |
1947 | int cpu = raw_smp_processor_id(); | 2132 | int cpu = raw_smp_processor_id(); |
1948 | 2133 | ||
@@ -1952,13 +2137,20 @@ static void hardware_disable(void *junk) | |||
1952 | kvm_arch_hardware_disable(NULL); | 2137 | kvm_arch_hardware_disable(NULL); |
1953 | } | 2138 | } |
1954 | 2139 | ||
2140 | static void hardware_disable(void *junk) | ||
2141 | { | ||
2142 | spin_lock(&kvm_lock); | ||
2143 | hardware_disable_nolock(junk); | ||
2144 | spin_unlock(&kvm_lock); | ||
2145 | } | ||
2146 | |||
1955 | static void hardware_disable_all_nolock(void) | 2147 | static void hardware_disable_all_nolock(void) |
1956 | { | 2148 | { |
1957 | BUG_ON(!kvm_usage_count); | 2149 | BUG_ON(!kvm_usage_count); |
1958 | 2150 | ||
1959 | kvm_usage_count--; | 2151 | kvm_usage_count--; |
1960 | if (!kvm_usage_count) | 2152 | if (!kvm_usage_count) |
1961 | on_each_cpu(hardware_disable, NULL, 1); | 2153 | on_each_cpu(hardware_disable_nolock, NULL, 1); |
1962 | } | 2154 | } |
1963 | 2155 | ||
1964 | static void hardware_disable_all(void) | 2156 | static void hardware_disable_all(void) |
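The hunks above split hardware_enable()/hardware_disable() into locked wrappers and _nolock bodies: callers that already hold kvm_lock, or that run where the lock is taken elsewhere (per-CPU IPIs, suspend/resume, reboot), use the _nolock form, while unlocked callers such as the CPU hotplug notifier use the wrapper. A userspace model of the pattern with a pthread mutex standing in for kvm_lock (all names are stand-ins):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t fake_kvm_lock = PTHREAD_MUTEX_INITIALIZER;
static int enabled_count;

static void fake_hardware_enable_nolock(void)
{
	/* caller must hold fake_kvm_lock */
	enabled_count++;
}

static void fake_hardware_enable(void)
{
	pthread_mutex_lock(&fake_kvm_lock);
	fake_hardware_enable_nolock();
	pthread_mutex_unlock(&fake_kvm_lock);
}

static void fake_hardware_enable_all(void)
{
	pthread_mutex_lock(&fake_kvm_lock);
	/* already under the lock, so only the _nolock form is legal here */
	fake_hardware_enable_nolock();
	pthread_mutex_unlock(&fake_kvm_lock);
}

int main(void)
{
	fake_hardware_enable();      /* e.g. CPU hotplug notifier path */
	fake_hardware_enable_all();  /* e.g. first VM created */
	printf("enabled %d time(s)\n", enabled_count);
	return 0;
}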
@@ -1977,7 +2169,7 @@ static int hardware_enable_all(void) | |||
1977 | kvm_usage_count++; | 2169 | kvm_usage_count++; |
1978 | if (kvm_usage_count == 1) { | 2170 | if (kvm_usage_count == 1) { |
1979 | atomic_set(&hardware_enable_failed, 0); | 2171 | atomic_set(&hardware_enable_failed, 0); |
1980 | on_each_cpu(hardware_enable, NULL, 1); | 2172 | on_each_cpu(hardware_enable_nolock, NULL, 1); |
1981 | 2173 | ||
1982 | if (atomic_read(&hardware_enable_failed)) { | 2174 | if (atomic_read(&hardware_enable_failed)) { |
1983 | hardware_disable_all_nolock(); | 2175 | hardware_disable_all_nolock(); |
@@ -2008,27 +2200,19 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, | |||
2008 | case CPU_STARTING: | 2200 | case CPU_STARTING: |
2009 | printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", | 2201 | printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", |
2010 | cpu); | 2202 | cpu); |
2011 | spin_lock(&kvm_lock); | ||
2012 | hardware_enable(NULL); | 2203 | hardware_enable(NULL); |
2013 | spin_unlock(&kvm_lock); | ||
2014 | break; | 2204 | break; |
2015 | } | 2205 | } |
2016 | return NOTIFY_OK; | 2206 | return NOTIFY_OK; |
2017 | } | 2207 | } |
2018 | 2208 | ||
2019 | 2209 | ||
2020 | asmlinkage void kvm_handle_fault_on_reboot(void) | 2210 | asmlinkage void kvm_spurious_fault(void) |
2021 | { | 2211 | { |
2022 | if (kvm_rebooting) { | ||
2023 | /* spin while reset goes on */ | ||
2024 | local_irq_enable(); | ||
2025 | while (true) | ||
2026 | cpu_relax(); | ||
2027 | } | ||
2028 | /* Fault while not rebooting. We want the trace. */ | 2212 | /* Fault while not rebooting. We want the trace. */ |
2029 | BUG(); | 2213 | BUG(); |
2030 | } | 2214 | } |
2031 | EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); | 2215 | EXPORT_SYMBOL_GPL(kvm_spurious_fault); |
2032 | 2216 | ||
2033 | static int kvm_reboot(struct notifier_block *notifier, unsigned long val, | 2217 | static int kvm_reboot(struct notifier_block *notifier, unsigned long val, |
2034 | void *v) | 2218 | void *v) |
@@ -2041,7 +2225,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val, | |||
2041 | */ | 2225 | */ |
2042 | printk(KERN_INFO "kvm: exiting hardware virtualization\n"); | 2226 | printk(KERN_INFO "kvm: exiting hardware virtualization\n"); |
2043 | kvm_rebooting = true; | 2227 | kvm_rebooting = true; |
2044 | on_each_cpu(hardware_disable, NULL, 1); | 2228 | on_each_cpu(hardware_disable_nolock, NULL, 1); |
2045 | return NOTIFY_OK; | 2229 | return NOTIFY_OK; |
2046 | } | 2230 | } |
2047 | 2231 | ||
@@ -2211,7 +2395,7 @@ static void kvm_exit_debug(void) | |||
2211 | static int kvm_suspend(struct sys_device *dev, pm_message_t state) | 2395 | static int kvm_suspend(struct sys_device *dev, pm_message_t state) |
2212 | { | 2396 | { |
2213 | if (kvm_usage_count) | 2397 | if (kvm_usage_count) |
2214 | hardware_disable(NULL); | 2398 | hardware_disable_nolock(NULL); |
2215 | return 0; | 2399 | return 0; |
2216 | } | 2400 | } |
2217 | 2401 | ||
@@ -2219,7 +2403,7 @@ static int kvm_resume(struct sys_device *dev) | |||
2219 | { | 2403 | { |
2220 | if (kvm_usage_count) { | 2404 | if (kvm_usage_count) { |
2221 | WARN_ON(spin_is_locked(&kvm_lock)); | 2405 | WARN_ON(spin_is_locked(&kvm_lock)); |
2222 | hardware_enable(NULL); | 2406 | hardware_enable_nolock(NULL); |
2223 | } | 2407 | } |
2224 | return 0; | 2408 | return 0; |
2225 | } | 2409 | } |
@@ -2336,6 +2520,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, | |||
2336 | goto out_free_5; | 2520 | goto out_free_5; |
2337 | } | 2521 | } |
2338 | 2522 | ||
2523 | r = kvm_async_pf_init(); | ||
2524 | if (r) | ||
2525 | goto out_free; | ||
2526 | |||
2339 | kvm_chardev_ops.owner = module; | 2527 | kvm_chardev_ops.owner = module; |
2340 | kvm_vm_fops.owner = module; | 2528 | kvm_vm_fops.owner = module; |
2341 | kvm_vcpu_fops.owner = module; | 2529 | kvm_vcpu_fops.owner = module; |
@@ -2343,7 +2531,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, | |||
2343 | r = misc_register(&kvm_dev); | 2531 | r = misc_register(&kvm_dev); |
2344 | if (r) { | 2532 | if (r) { |
2345 | printk(KERN_ERR "kvm: misc device register failed\n"); | 2533 | printk(KERN_ERR "kvm: misc device register failed\n"); |
2346 | goto out_free; | 2534 | goto out_unreg; |
2347 | } | 2535 | } |
2348 | 2536 | ||
2349 | kvm_preempt_ops.sched_in = kvm_sched_in; | 2537 | kvm_preempt_ops.sched_in = kvm_sched_in; |
@@ -2353,6 +2541,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, | |||
2353 | 2541 | ||
2354 | return 0; | 2542 | return 0; |
2355 | 2543 | ||
2544 | out_unreg: | ||
2545 | kvm_async_pf_deinit(); | ||
2356 | out_free: | 2546 | out_free: |
2357 | kmem_cache_destroy(kvm_vcpu_cache); | 2547 | kmem_cache_destroy(kvm_vcpu_cache); |
2358 | out_free_5: | 2548 | out_free_5: |
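The new out_unreg label extends kvm_init()'s goto-based unwinding: kvm_async_pf_init() is undone before falling through to the older cleanup labels, so teardown stays in reverse order of setup. A userspace sketch of that pattern with placeholder step names (not the actual kvm_init() steps):

#include <stdio.h>

static int  step_a_init(void) { printf("A init\n"); return 0; }
static void step_a_exit(void) { printf("A exit\n"); }

static int  step_b_init(void) { printf("B init\n"); return 0; } /* async_pf-like step */
static void step_b_exit(void) { printf("B exit\n"); }

static int  step_c_init(int fail) { printf("C init\n"); return fail ? -1 : 0; }

static int fake_module_init(int fail_last)
{
	int r;

	r = step_a_init();
	if (r)
		goto out;

	r = step_b_init();
	if (r)
		goto out_a;

	r = step_c_init(fail_last);
	if (r)
		goto out_b;   /* like out_unreg: undo B, then fall through */

	return 0;

out_b:
	step_b_exit();
out_a:
	step_a_exit();
out:
	return r;
}

int main(void)
{
	printf("failure path returns %d\n", fake_module_init(1));
	return 0;
}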
@@ -2385,11 +2575,12 @@ void kvm_exit(void) | |||
2385 | kvm_exit_debug(); | 2575 | kvm_exit_debug(); |
2386 | misc_deregister(&kvm_dev); | 2576 | misc_deregister(&kvm_dev); |
2387 | kmem_cache_destroy(kvm_vcpu_cache); | 2577 | kmem_cache_destroy(kvm_vcpu_cache); |
2578 | kvm_async_pf_deinit(); | ||
2388 | sysdev_unregister(&kvm_sysdev); | 2579 | sysdev_unregister(&kvm_sysdev); |
2389 | sysdev_class_unregister(&kvm_sysdev_class); | 2580 | sysdev_class_unregister(&kvm_sysdev_class); |
2390 | unregister_reboot_notifier(&kvm_reboot_notifier); | 2581 | unregister_reboot_notifier(&kvm_reboot_notifier); |
2391 | unregister_cpu_notifier(&kvm_cpu_notifier); | 2582 | unregister_cpu_notifier(&kvm_cpu_notifier); |
2392 | on_each_cpu(hardware_disable, NULL, 1); | 2583 | on_each_cpu(hardware_disable_nolock, NULL, 1); |
2393 | kvm_arch_hardware_unsetup(); | 2584 | kvm_arch_hardware_unsetup(); |
2394 | kvm_arch_exit(); | 2585 | kvm_arch_exit(); |
2395 | free_cpumask_var(cpus_hardware_enabled); | 2586 | free_cpumask_var(cpus_hardware_enabled); |