Diffstat (limited to 'drivers')
-rw-r--r--  drivers/Kconfig | 2
-rw-r--r--  drivers/Makefile | 3
-rw-r--r--  drivers/base/bus.c | 41
-rw-r--r--  drivers/base/class.c | 4
-rw-r--r--  drivers/base/core.c | 30
-rw-r--r--  drivers/firewire/fw-cdev.c | 3
-rw-r--r--  drivers/firewire/fw-device.c | 38
-rw-r--r--  drivers/firewire/fw-device.h | 12
-rw-r--r--  drivers/firewire/fw-ohci.c | 390
-rw-r--r--  drivers/firewire/fw-sbp2.c | 127
-rw-r--r--  drivers/firewire/fw-topology.c | 6
-rw-r--r--  drivers/firewire/fw-transaction.c | 4
-rw-r--r--  drivers/ieee1394/dma.c | 39
-rw-r--r--  drivers/ieee1394/ieee1394_transactions.c | 68
-rw-r--r--  drivers/ieee1394/ohci1394.c | 12
-rw-r--r--  drivers/ieee1394/raw1394.c | 4
-rw-r--r--  drivers/ieee1394/sbp2.c | 52
-rw-r--r--  drivers/ieee1394/sbp2.h | 1
-rw-r--r--  drivers/infiniband/ulp/srp/ib_srp.c | 1
-rw-r--r--  drivers/kvm/Kconfig | 54
-rw-r--r--  drivers/kvm/Makefile | 10
-rw-r--r--  drivers/kvm/i8259.c | 450
-rw-r--r--  drivers/kvm/ioapic.c | 388
-rw-r--r--  drivers/kvm/irq.c | 98
-rw-r--r--  drivers/kvm/irq.h | 165
-rw-r--r--  drivers/kvm/kvm.h | 796
-rw-r--r--  drivers/kvm/kvm_main.c | 3628
-rw-r--r--  drivers/kvm/kvm_svm.h | 45
-rw-r--r--  drivers/kvm/lapic.c | 1080
-rw-r--r--  drivers/kvm/mmu.c | 1498
-rw-r--r--  drivers/kvm/paging_tmpl.h | 511
-rw-r--r--  drivers/kvm/segment_descriptor.h | 17
-rw-r--r--  drivers/kvm/svm.c | 1754
-rw-r--r--  drivers/kvm/svm.h | 324
-rw-r--r--  drivers/kvm/vmx.c | 2566
-rw-r--r--  drivers/kvm/vmx.h | 310
-rw-r--r--  drivers/kvm/x86_emulate.c | 1662
-rw-r--r--  drivers/kvm/x86_emulate.h | 155
-rw-r--r--  drivers/lguest/core.c | 46
-rw-r--r--  drivers/lguest/hypercalls.c | 106
-rw-r--r--  drivers/lguest/interrupts_and_traps.c | 149
-rw-r--r--  drivers/lguest/lg.h | 154
-rw-r--r--  drivers/lguest/lguest_user.c | 147
-rw-r--r--  drivers/lguest/page_tables.c | 179
-rw-r--r--  drivers/lguest/segments.c | 48
-rw-r--r--  drivers/lguest/x86/core.c | 127
-rw-r--r--  drivers/s390/scsi/zfcp_fsf.c | 4
-rw-r--r--  drivers/scsi/3w-9xxx.c | 1
-rw-r--r--  drivers/scsi/3w-xxxx.c | 1
-rw-r--r--  drivers/scsi/BusLogic.c | 1
-rw-r--r--  drivers/scsi/Kconfig | 2
-rw-r--r--  drivers/scsi/NCR53c406a.c | 1
-rw-r--r--  drivers/scsi/a100u2w.c | 1
-rw-r--r--  drivers/scsi/aacraid/commctrl.c | 29
-rw-r--r--  drivers/scsi/aacraid/linit.c | 1
-rw-r--r--  drivers/scsi/aha1740.c | 1
-rw-r--r--  drivers/scsi/aic7xxx/aic79xx.h | 5
-rw-r--r--  drivers/scsi/aic7xxx/aic79xx_core.c | 2
-rw-r--r--  drivers/scsi/aic7xxx/aic79xx_osm.c | 3
-rw-r--r--  drivers/scsi/aic7xxx/aic79xx_osm_pci.c | 33
-rw-r--r--  drivers/scsi/aic7xxx/aic79xx_pci.c | 2
-rw-r--r--  drivers/scsi/aic7xxx/aic7xxx.h | 4
-rw-r--r--  drivers/scsi/aic7xxx/aic7xxx_core.c | 3
-rw-r--r--  drivers/scsi/aic7xxx/aic7xxx_osm.c | 10
-rw-r--r--  drivers/scsi/aic7xxx/aic7xxx_osm_pci.c | 33
-rw-r--r--  drivers/scsi/aic7xxx/aic7xxx_pci.c | 2
-rw-r--r--  drivers/scsi/aic7xxx_old.c | 1
-rw-r--r--  drivers/scsi/arcmsr/arcmsr_hba.c | 1
-rw-r--r--  drivers/scsi/dc395x.c | 1
-rw-r--r--  drivers/scsi/dpt_i2o.c | 1
-rw-r--r--  drivers/scsi/eata.c | 1
-rw-r--r--  drivers/scsi/hosts.c | 1
-rw-r--r--  drivers/scsi/hptiop.c | 3
-rw-r--r--  drivers/scsi/ibmmca.c | 1
-rw-r--r--  drivers/scsi/ibmvscsi/ibmvscsi.c | 1
-rw-r--r--  drivers/scsi/initio.c | 1
-rw-r--r--  drivers/scsi/iscsi_tcp.c | 1
-rw-r--r--  drivers/scsi/libsrp.c | 4
-rw-r--r--  drivers/scsi/lpfc/lpfc_scsi.c | 2
-rw-r--r--  drivers/scsi/mac53c94.c | 1
-rw-r--r--  drivers/scsi/megaraid.c | 1
-rw-r--r--  drivers/scsi/megaraid/megaraid_mbox.c | 1
-rw-r--r--  drivers/scsi/megaraid/megaraid_sas.c | 1
-rw-r--r--  drivers/scsi/mesh.c | 1
-rw-r--r--  drivers/scsi/ncr53c8xx.c | 2
-rw-r--r--  drivers/scsi/nsp32.c | 1
-rw-r--r--  drivers/scsi/pcmcia/sym53c500_cs.c | 1
-rw-r--r--  drivers/scsi/qla1280.c | 1
-rw-r--r--  drivers/scsi/qla2xxx/qla_os.c | 2
-rw-r--r--  drivers/scsi/qla4xxx/ql4_os.c | 1
-rw-r--r--  drivers/scsi/qlogicfas.c | 1
-rw-r--r--  drivers/scsi/scsi.c | 2
-rw-r--r--  drivers/scsi/scsi_debug.c | 174
-rw-r--r--  drivers/scsi/scsi_error.c | 33
-rw-r--r--  drivers/scsi/scsi_lib.c | 274
-rw-r--r--  drivers/scsi/scsi_tgt_lib.c | 28
-rw-r--r--  drivers/scsi/sd.c | 4
-rw-r--r--  drivers/scsi/sgiwd93.c | 64
-rw-r--r--  drivers/scsi/sr.c | 25
-rw-r--r--  drivers/scsi/stex.c | 1
-rw-r--r--  drivers/scsi/sym53c416.c | 1
-rw-r--r--  drivers/scsi/sym53c8xx_2/sym_glue.c | 3
-rw-r--r--  drivers/scsi/u14-34f.c | 1
-rw-r--r--  drivers/scsi/ultrastor.c | 1
-rw-r--r--  drivers/scsi/wd7000.c | 1
-rw-r--r--  drivers/usb/storage/isd200.c | 8
-rw-r--r--  drivers/watchdog/Kconfig | 2
107 files changed, 1410 insertions, 16688 deletions
diff --git a/drivers/Kconfig b/drivers/Kconfig
index f4076d9e9902..08d4ae201597 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
90 90
91source "drivers/auxdisplay/Kconfig" 91source "drivers/auxdisplay/Kconfig"
92 92
93source "drivers/kvm/Kconfig"
94
95source "drivers/uio/Kconfig" 93source "drivers/uio/Kconfig"
96 94
97source "drivers/virtio/Kconfig" 95source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index d92d4d82d001..0ee9a8a4095e 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/
47obj-$(CONFIG_PCCARD) += pcmcia/ 47obj-$(CONFIG_PCCARD) += pcmcia/
48obj-$(CONFIG_DIO) += dio/ 48obj-$(CONFIG_DIO) += dio/
49obj-$(CONFIG_SBUS) += sbus/ 49obj-$(CONFIG_SBUS) += sbus/
50obj-$(CONFIG_KVM) += kvm/
51obj-$(CONFIG_ZORRO) += zorro/ 50obj-$(CONFIG_ZORRO) += zorro/
52obj-$(CONFIG_MAC) += macintosh/ 51obj-$(CONFIG_MAC) += macintosh/
53obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ 52obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/
@@ -73,7 +72,7 @@ obj-$(CONFIG_ISDN) += isdn/
73obj-$(CONFIG_EDAC) += edac/ 72obj-$(CONFIG_EDAC) += edac/
74obj-$(CONFIG_MCA) += mca/ 73obj-$(CONFIG_MCA) += mca/
75obj-$(CONFIG_EISA) += eisa/ 74obj-$(CONFIG_EISA) += eisa/
76obj-$(CONFIG_LGUEST_GUEST) += lguest/ 75obj-y += lguest/
77obj-$(CONFIG_CPU_FREQ) += cpufreq/ 76obj-$(CONFIG_CPU_FREQ) += cpufreq/
78obj-$(CONFIG_CPU_IDLE) += cpuidle/ 77obj-$(CONFIG_CPU_IDLE) += cpuidle/
79obj-$(CONFIG_MMC) += mmc/ 78obj-$(CONFIG_MMC) += mmc/
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index f484495b2ad1..055989e94799 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -163,15 +163,6 @@ static struct kset *bus_kset;
163 163
164#ifdef CONFIG_HOTPLUG 164#ifdef CONFIG_HOTPLUG
165/* Manually detach a device from its associated driver. */ 165/* Manually detach a device from its associated driver. */
166static int driver_helper(struct device *dev, void *data)
167{
168 const char *name = data;
169
170 if (strcmp(name, dev->bus_id) == 0)
171 return 1;
172 return 0;
173}
174
175static ssize_t driver_unbind(struct device_driver *drv, 166static ssize_t driver_unbind(struct device_driver *drv,
176 const char *buf, size_t count) 167 const char *buf, size_t count)
177{ 168{
@@ -179,7 +170,7 @@ static ssize_t driver_unbind(struct device_driver *drv,
179 struct device *dev; 170 struct device *dev;
180 int err = -ENODEV; 171 int err = -ENODEV;
181 172
182 dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); 173 dev = bus_find_device_by_name(bus, NULL, buf);
183 if (dev && dev->driver == drv) { 174 if (dev && dev->driver == drv) {
184 if (dev->parent) /* Needed for USB */ 175 if (dev->parent) /* Needed for USB */
185 down(&dev->parent->sem); 176 down(&dev->parent->sem);
@@ -206,7 +197,7 @@ static ssize_t driver_bind(struct device_driver *drv,
206 struct device *dev; 197 struct device *dev;
207 int err = -ENODEV; 198 int err = -ENODEV;
208 199
209 dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); 200 dev = bus_find_device_by_name(bus, NULL, buf);
210 if (dev && dev->driver == NULL) { 201 if (dev && dev->driver == NULL) {
211 if (dev->parent) /* Needed for USB */ 202 if (dev->parent) /* Needed for USB */
212 down(&dev->parent->sem); 203 down(&dev->parent->sem);
@@ -250,7 +241,7 @@ static ssize_t store_drivers_probe(struct bus_type *bus,
250{ 241{
251 struct device *dev; 242 struct device *dev;
252 243
253 dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); 244 dev = bus_find_device_by_name(bus, NULL, buf);
254 if (!dev) 245 if (!dev)
255 return -ENODEV; 246 return -ENODEV;
256 if (bus_rescan_devices_helper(dev, NULL) != 0) 247 if (bus_rescan_devices_helper(dev, NULL) != 0)
@@ -338,6 +329,32 @@ struct device *bus_find_device(struct bus_type *bus,
338} 329}
339EXPORT_SYMBOL_GPL(bus_find_device); 330EXPORT_SYMBOL_GPL(bus_find_device);
340 331
332static int match_name(struct device *dev, void *data)
333{
334 const char *name = data;
335
336 if (strcmp(name, dev->bus_id) == 0)
337 return 1;
338 return 0;
339}
340
341/**
342 * bus_find_device_by_name - device iterator for locating a particular device of a specific name
343 * @bus: bus type
344 * @start: Device to begin with
345 * @name: name of the device to match
346 *
347 * This is similar to the bus_find_device() function above, but it handles
348 * searching by a name automatically, no need to write another strcmp matching
349 * function.
350 */
351struct device *bus_find_device_by_name(struct bus_type *bus,
352 struct device *start, const char *name)
353{
354 return bus_find_device(bus, start, (void *)name, match_name);
355}
356EXPORT_SYMBOL_GPL(bus_find_device_by_name);
357
341static struct device_driver *next_driver(struct klist_iter *i) 358static struct device_driver *next_driver(struct klist_iter *i)
342{ 359{
343 struct klist_node *n = klist_next(i); 360 struct klist_node *n = klist_next(i);
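For illustration, a minimal sketch of how a caller might use the new bus_find_device_by_name() helper instead of open-coding a strcmp() match callback as driver_unbind() used to. The wrapper name and the bus it is called on are hypothetical; <linux/device.h> is assumed:

static int foo_device_exists(struct bus_type *bus, const char *name)
{
	struct device *dev = bus_find_device_by_name(bus, NULL, name);

	if (dev)
		put_device(dev);	/* the lookup took a reference, drop it */
	return dev != NULL;
}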
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 59cf35894cfc..9d915376c313 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -149,7 +149,7 @@ int class_register(struct class *cls)
149 if (error) 149 if (error)
150 return error; 150 return error;
151 151
152#ifdef CONFIG_SYSFS_DEPRECATED 152#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK)
153 /* let the block class directory show up in the root of sysfs */ 153 /* let the block class directory show up in the root of sysfs */
154 if (cls != &block_class) 154 if (cls != &block_class)
155 cls->subsys.kobj.kset = class_kset; 155 cls->subsys.kobj.kset = class_kset;
@@ -863,7 +863,7 @@ EXPORT_SYMBOL_GPL(class_for_each_device);
863 * The callback should return 0 if the device doesn't match and non-zero 863 * The callback should return 0 if the device doesn't match and non-zero
864 * if it does. If the callback returns non-zero, this function will 864 * if it does. If the callback returns non-zero, this function will
865 * return to the caller and not iterate over any more devices. 865 * return to the caller and not iterate over any more devices.
866 866 *
867 * Note, you will need to drop the reference with put_device() after use. 867 * Note, you will need to drop the reference with put_device() after use.
868 * 868 *
869 * We hold class->sem in this function, so it can not be 869 * We hold class->sem in this function, so it can not be
diff --git a/drivers/base/core.c b/drivers/base/core.c
index edf3bbeb8d6a..b1727876182c 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -27,9 +27,17 @@
27int (*platform_notify)(struct device *dev) = NULL; 27int (*platform_notify)(struct device *dev) = NULL;
28int (*platform_notify_remove)(struct device *dev) = NULL; 28int (*platform_notify_remove)(struct device *dev) = NULL;
29 29
30/* 30#ifdef CONFIG_BLOCK
31 * sysfs bindings for devices. 31static inline int device_is_not_partition(struct device *dev)
32 */ 32{
33 return !(dev->type == &part_type);
34}
35#else
36static inline int device_is_not_partition(struct device *dev)
37{
38 return 1;
39}
40#endif
33 41
34/** 42/**
35 * dev_driver_string - Return a device's driver name, if at all possible 43 * dev_driver_string - Return a device's driver name, if at all possible
@@ -652,14 +660,14 @@ static int device_add_class_symlinks(struct device *dev)
652#ifdef CONFIG_SYSFS_DEPRECATED 660#ifdef CONFIG_SYSFS_DEPRECATED
653 /* stacked class devices need a symlink in the class directory */ 661 /* stacked class devices need a symlink in the class directory */
654 if (dev->kobj.parent != &dev->class->subsys.kobj && 662 if (dev->kobj.parent != &dev->class->subsys.kobj &&
655 dev->type != &part_type) { 663 device_is_not_partition(dev)) {
656 error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj, 664 error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj,
657 dev->bus_id); 665 dev->bus_id);
658 if (error) 666 if (error)
659 goto out_subsys; 667 goto out_subsys;
660 } 668 }
661 669
662 if (dev->parent && dev->type != &part_type) { 670 if (dev->parent && device_is_not_partition(dev)) {
663 struct device *parent = dev->parent; 671 struct device *parent = dev->parent;
664 char *class_name; 672 char *class_name;
665 673
@@ -688,11 +696,11 @@ static int device_add_class_symlinks(struct device *dev)
688 return 0; 696 return 0;
689 697
690out_device: 698out_device:
691 if (dev->parent && dev->type != &part_type) 699 if (dev->parent && device_is_not_partition(dev))
692 sysfs_remove_link(&dev->kobj, "device"); 700 sysfs_remove_link(&dev->kobj, "device");
693out_busid: 701out_busid:
694 if (dev->kobj.parent != &dev->class->subsys.kobj && 702 if (dev->kobj.parent != &dev->class->subsys.kobj &&
695 dev->type != &part_type) 703 device_is_not_partition(dev))
696 sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); 704 sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
697#else 705#else
698 /* link in the class directory pointing to the device */ 706 /* link in the class directory pointing to the device */
@@ -701,7 +709,7 @@ out_busid:
701 if (error) 709 if (error)
702 goto out_subsys; 710 goto out_subsys;
703 711
704 if (dev->parent && dev->type != &part_type) { 712 if (dev->parent && device_is_not_partition(dev)) {
705 error = sysfs_create_link(&dev->kobj, &dev->parent->kobj, 713 error = sysfs_create_link(&dev->kobj, &dev->parent->kobj,
706 "device"); 714 "device");
707 if (error) 715 if (error)
@@ -725,7 +733,7 @@ static void device_remove_class_symlinks(struct device *dev)
725 return; 733 return;
726 734
727#ifdef CONFIG_SYSFS_DEPRECATED 735#ifdef CONFIG_SYSFS_DEPRECATED
728 if (dev->parent && dev->type != &part_type) { 736 if (dev->parent && device_is_not_partition(dev)) {
729 char *class_name; 737 char *class_name;
730 738
731 class_name = make_class_name(dev->class->name, &dev->kobj); 739 class_name = make_class_name(dev->class->name, &dev->kobj);
@@ -737,10 +745,10 @@ static void device_remove_class_symlinks(struct device *dev)
737 } 745 }
738 746
739 if (dev->kobj.parent != &dev->class->subsys.kobj && 747 if (dev->kobj.parent != &dev->class->subsys.kobj &&
740 dev->type != &part_type) 748 device_is_not_partition(dev))
741 sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); 749 sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
742#else 750#else
743 if (dev->parent && dev->type != &part_type) 751 if (dev->parent && device_is_not_partition(dev))
744 sysfs_remove_link(&dev->kobj, "device"); 752 sysfs_remove_link(&dev->kobj, "device");
745 753
746 sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); 754 sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c
index 60f1a8924a95..7e73cbaa4121 100644
--- a/drivers/firewire/fw-cdev.c
+++ b/drivers/firewire/fw-cdev.c
@@ -206,12 +206,13 @@ fill_bus_reset_event(struct fw_cdev_event_bus_reset *event,
206 206
207 event->closure = client->bus_reset_closure; 207 event->closure = client->bus_reset_closure;
208 event->type = FW_CDEV_EVENT_BUS_RESET; 208 event->type = FW_CDEV_EVENT_BUS_RESET;
209 event->generation = client->device->generation;
210 smp_rmb(); /* node_id must not be older than generation */
209 event->node_id = client->device->node_id; 211 event->node_id = client->device->node_id;
210 event->local_node_id = card->local_node->node_id; 212 event->local_node_id = card->local_node->node_id;
211 event->bm_node_id = 0; /* FIXME: We don't track the BM. */ 213 event->bm_node_id = 0; /* FIXME: We don't track the BM. */
212 event->irm_node_id = card->irm_node->node_id; 214 event->irm_node_id = card->irm_node->node_id;
213 event->root_node_id = card->root_node->node_id; 215 event->root_node_id = card->root_node->node_id;
214 event->generation = card->generation;
215} 216}
216 217
217static void 218static void
diff --git a/drivers/firewire/fw-device.c b/drivers/firewire/fw-device.c
index 56681b3b297b..de9066e69adf 100644
--- a/drivers/firewire/fw-device.c
+++ b/drivers/firewire/fw-device.c
@@ -27,6 +27,7 @@
27#include <linux/idr.h> 27#include <linux/idr.h>
28#include <linux/rwsem.h> 28#include <linux/rwsem.h>
29#include <asm/semaphore.h> 29#include <asm/semaphore.h>
30#include <asm/system.h>
30#include <linux/ctype.h> 31#include <linux/ctype.h>
31#include "fw-transaction.h" 32#include "fw-transaction.h"
32#include "fw-topology.h" 33#include "fw-topology.h"
@@ -182,9 +183,14 @@ static void fw_device_release(struct device *dev)
182 183
183int fw_device_enable_phys_dma(struct fw_device *device) 184int fw_device_enable_phys_dma(struct fw_device *device)
184{ 185{
186 int generation = device->generation;
187
188 /* device->node_id, accessed below, must not be older than generation */
189 smp_rmb();
190
185 return device->card->driver->enable_phys_dma(device->card, 191 return device->card->driver->enable_phys_dma(device->card,
186 device->node_id, 192 device->node_id,
187 device->generation); 193 generation);
188} 194}
189EXPORT_SYMBOL(fw_device_enable_phys_dma); 195EXPORT_SYMBOL(fw_device_enable_phys_dma);
190 196
@@ -384,17 +390,21 @@ complete_transaction(struct fw_card *card, int rcode,
384 complete(&callback_data->done); 390 complete(&callback_data->done);
385} 391}
386 392
387static int read_rom(struct fw_device *device, int index, u32 * data) 393static int
394read_rom(struct fw_device *device, int generation, int index, u32 *data)
388{ 395{
389 struct read_quadlet_callback_data callback_data; 396 struct read_quadlet_callback_data callback_data;
390 struct fw_transaction t; 397 struct fw_transaction t;
391 u64 offset; 398 u64 offset;
392 399
400 /* device->node_id, accessed below, must not be older than generation */
401 smp_rmb();
402
393 init_completion(&callback_data.done); 403 init_completion(&callback_data.done);
394 404
395 offset = 0xfffff0000400ULL + index * 4; 405 offset = 0xfffff0000400ULL + index * 4;
396 fw_send_request(device->card, &t, TCODE_READ_QUADLET_REQUEST, 406 fw_send_request(device->card, &t, TCODE_READ_QUADLET_REQUEST,
397 device->node_id, device->generation, device->max_speed, 407 device->node_id, generation, device->max_speed,
398 offset, NULL, 4, complete_transaction, &callback_data); 408 offset, NULL, 4, complete_transaction, &callback_data);
399 409
400 wait_for_completion(&callback_data.done); 410 wait_for_completion(&callback_data.done);
@@ -404,7 +414,14 @@ static int read_rom(struct fw_device *device, int index, u32 * data)
404 return callback_data.rcode; 414 return callback_data.rcode;
405} 415}
406 416
407static int read_bus_info_block(struct fw_device *device) 417/*
418 * Read the bus info block, perform a speed probe, and read all of the rest of
419 * the config ROM. We do all this with a cached bus generation. If the bus
420 * generation changes under us, read_bus_info_block will fail and get retried.
421 * It's better to start all over in this case because the node from which we
422 * are reading the ROM may have changed the ROM during the reset.
423 */
424static int read_bus_info_block(struct fw_device *device, int generation)
408{ 425{
409 static u32 rom[256]; 426 static u32 rom[256];
410 u32 stack[16], sp, key; 427 u32 stack[16], sp, key;
@@ -414,7 +431,7 @@ static int read_bus_info_block(struct fw_device *device)
414 431
415 /* First read the bus info block. */ 432 /* First read the bus info block. */
416 for (i = 0; i < 5; i++) { 433 for (i = 0; i < 5; i++) {
417 if (read_rom(device, i, &rom[i]) != RCODE_COMPLETE) 434 if (read_rom(device, generation, i, &rom[i]) != RCODE_COMPLETE)
418 return -1; 435 return -1;
419 /* 436 /*
420 * As per IEEE1212 7.2, during power-up, devices can 437 * As per IEEE1212 7.2, during power-up, devices can
@@ -449,7 +466,8 @@ static int read_bus_info_block(struct fw_device *device)
449 device->max_speed = device->card->link_speed; 466 device->max_speed = device->card->link_speed;
450 467
451 while (device->max_speed > SCODE_100) { 468 while (device->max_speed > SCODE_100) {
452 if (read_rom(device, 0, &dummy) == RCODE_COMPLETE) 469 if (read_rom(device, generation, 0, &dummy) ==
470 RCODE_COMPLETE)
453 break; 471 break;
454 device->max_speed--; 472 device->max_speed--;
455 } 473 }
@@ -482,7 +500,7 @@ static int read_bus_info_block(struct fw_device *device)
482 return -1; 500 return -1;
483 501
484 /* Read header quadlet for the block to get the length. */ 502 /* Read header quadlet for the block to get the length. */
485 if (read_rom(device, i, &rom[i]) != RCODE_COMPLETE) 503 if (read_rom(device, generation, i, &rom[i]) != RCODE_COMPLETE)
486 return -1; 504 return -1;
487 end = i + (rom[i] >> 16) + 1; 505 end = i + (rom[i] >> 16) + 1;
488 i++; 506 i++;
@@ -501,7 +519,8 @@ static int read_bus_info_block(struct fw_device *device)
501 * it references another block, and push it in that case. 519 * it references another block, and push it in that case.
502 */ 520 */
503 while (i < end) { 521 while (i < end) {
504 if (read_rom(device, i, &rom[i]) != RCODE_COMPLETE) 522 if (read_rom(device, generation, i, &rom[i]) !=
523 RCODE_COMPLETE)
505 return -1; 524 return -1;
506 if ((key >> 30) == 3 && (rom[i] >> 30) > 1 && 525 if ((key >> 30) == 3 && (rom[i] >> 30) > 1 &&
507 sp < ARRAY_SIZE(stack)) 526 sp < ARRAY_SIZE(stack))
@@ -648,7 +667,7 @@ static void fw_device_init(struct work_struct *work)
648 * device. 667 * device.
649 */ 668 */
650 669
651 if (read_bus_info_block(device) < 0) { 670 if (read_bus_info_block(device, device->generation) < 0) {
652 if (device->config_rom_retries < MAX_RETRIES) { 671 if (device->config_rom_retries < MAX_RETRIES) {
653 device->config_rom_retries++; 672 device->config_rom_retries++;
654 schedule_delayed_work(&device->work, RETRY_DELAY); 673 schedule_delayed_work(&device->work, RETRY_DELAY);
@@ -801,6 +820,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event)
801 820
802 device = node->data; 821 device = node->data;
803 device->node_id = node->node_id; 822 device->node_id = node->node_id;
823 smp_wmb(); /* update node_id before generation */
804 device->generation = card->generation; 824 device->generation = card->generation;
805 if (atomic_read(&device->state) == FW_DEVICE_RUNNING) { 825 if (atomic_read(&device->state) == FW_DEVICE_RUNNING) {
806 PREPARE_DELAYED_WORK(&device->work, fw_device_update); 826 PREPARE_DELAYED_WORK(&device->work, fw_device_update);
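Tying the pieces of this hunk together, a condensed sketch of the retry policy the read_bus_info_block() comment describes: cache the generation once, pass it to every quadlet read, and restart the whole scan if any read fails. The function name below is illustrative; fw_device_init() in this file is the real caller:

static void config_rom_work(struct work_struct *work)
{
	struct fw_device *device =
		container_of(work, struct fw_device, work.work);

	/* One cached generation is used for the entire config ROM scan. */
	if (read_bus_info_block(device, device->generation) < 0) {
		if (device->config_rom_retries < MAX_RETRIES) {
			device->config_rom_retries++;
			schedule_delayed_work(&device->work, RETRY_DELAY);
		}
		return;		/* a bus reset invalidated the scan; start over */
	}

	/* The ROM image read above is consistent for that generation. */
}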
diff --git a/drivers/firewire/fw-device.h b/drivers/firewire/fw-device.h
index 894d4a92a18e..0854fe2bc110 100644
--- a/drivers/firewire/fw-device.h
+++ b/drivers/firewire/fw-device.h
@@ -35,6 +35,18 @@ struct fw_attribute_group {
35 struct attribute *attrs[11]; 35 struct attribute *attrs[11];
36}; 36};
37 37
38/*
39 * Note, fw_device.generation always has to be read before fw_device.node_id.
40 * Use SMP memory barriers to ensure this. Otherwise requests will be sent
41 * to an outdated node_id if the generation was updated in the meantime due
42 * to a bus reset.
43 *
44 * Likewise, fw-core will take care to update .node_id before .generation so
45 * that whenever fw_device.generation is current WRT the actual bus generation,
46 * fw_device.node_id is guaranteed to be current too.
47 *
48 * The same applies to fw_device.card->node_id vs. fw_device.generation.
49 */
38struct fw_device { 50struct fw_device {
39 atomic_t state; 51 atomic_t state;
40 struct fw_node *node; 52 struct fw_node *node;
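A stand-alone sketch of the barrier pairing the new comment describes, with a simplified structure standing in for fw_device; smp_wmb()/smp_rmb() come from <asm/system.h>, which this patch adds as an include where the barriers are used:

struct topo_snapshot {
	int node_id;
	int generation;
};

/* Bus-reset side (cf. fw_node_event): publish node_id before generation. */
static void topo_publish(struct topo_snapshot *s, int node_id, int generation)
{
	s->node_id = node_id;
	smp_wmb();		/* update node_id before generation */
	s->generation = generation;
}

/* Transaction side (cf. fw_device_enable_phys_dma, sbp2_login):
 * read generation before node_id. */
static void topo_read(const struct topo_snapshot *s, int *node_id, int *generation)
{
	*generation = s->generation;
	smp_rmb();		/* node_id must not be older than generation */
	*node_id = s->node_id;
}

With this ordering, whenever the generation obtained by the reader is current, the node_id it obtains is current as well; at worst the pair is consistently stale, which shows up as a failed transaction rather than a request sent to the wrong node.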
diff --git a/drivers/firewire/fw-ohci.c b/drivers/firewire/fw-ohci.c
index 436a855a4c60..7ebad3c14cb8 100644
--- a/drivers/firewire/fw-ohci.c
+++ b/drivers/firewire/fw-ohci.c
@@ -98,17 +98,48 @@ struct context;
98typedef int (*descriptor_callback_t)(struct context *ctx, 98typedef int (*descriptor_callback_t)(struct context *ctx,
99 struct descriptor *d, 99 struct descriptor *d,
100 struct descriptor *last); 100 struct descriptor *last);
101
102/*
103 * A buffer that contains a block of DMA-able coherent memory used for
104 * storing a portion of a DMA descriptor program.
105 */
106struct descriptor_buffer {
107 struct list_head list;
108 dma_addr_t buffer_bus;
109 size_t buffer_size;
110 size_t used;
111 struct descriptor buffer[0];
112};
113
101struct context { 114struct context {
102 struct fw_ohci *ohci; 115 struct fw_ohci *ohci;
103 u32 regs; 116 u32 regs;
117 int total_allocation;
104 118
105 struct descriptor *buffer; 119 /*
106 dma_addr_t buffer_bus; 120 * List of page-sized buffers for storing DMA descriptors.
107 size_t buffer_size; 121 * Head of list contains buffers in use and tail of list contains
108 struct descriptor *head_descriptor; 122 * free buffers.
109 struct descriptor *tail_descriptor; 123 */
110 struct descriptor *tail_descriptor_last; 124 struct list_head buffer_list;
111 struct descriptor *prev_descriptor; 125
126 /*
127 * Pointer to a buffer inside buffer_list that contains the tail
128 * end of the current DMA program.
129 */
130 struct descriptor_buffer *buffer_tail;
131
132 /*
133 * The descriptor containing the branch address of the first
134 * descriptor that has not yet been filled by the device.
135 */
136 struct descriptor *last;
137
138 /*
139 * The last descriptor in the DMA program. It contains the branch
140 * address that must be updated upon appending a new descriptor.
141 */
142 struct descriptor *prev;
112 143
113 descriptor_callback_t callback; 144 descriptor_callback_t callback;
114 145
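As a rough user-space analog of the buffer_list scheme the new comments describe: a descriptor pool that grows one fixed-size buffer at a time instead of using a single bounded ring. malloc() stands in for dma_alloc_coherent(), all names are made up, and the driver's extra bookkeeping (the 16 MB allocation cap, recycling completed buffers back to the free tail under ohci->lock) is omitted:

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

#define BUF_TOTAL_SIZE 4096	/* stands in for PAGE_SIZE */

struct buf {
	struct buf *next;
	size_t size;		/* usable bytes in data[] */
	size_t used;
	unsigned char data[];
};

struct pool {
	struct buf *head;	/* oldest buffer, still referenced by the "device" */
	struct buf *tail;	/* buffer currently being filled */
};

static struct buf *buf_new(void)
{
	struct buf *b = malloc(BUF_TOTAL_SIZE);

	if (!b)
		return NULL;
	b->next = NULL;
	b->size = BUF_TOTAL_SIZE - offsetof(struct buf, data);
	b->used = 0;
	return b;
}

static int pool_init(struct pool *p)
{
	p->head = p->tail = buf_new();
	return p->head ? 0 : -1;
}

/* Hand out n contiguous zeroed bytes, moving on to a fresh buffer when the
 * current one cannot fit the request (cf. context_get_descriptors()). */
static void *pool_get(struct pool *p, size_t n)
{
	struct buf *b = p->tail;

	if (n > b->size)
		return NULL;			/* larger than a whole buffer */
	if (n > b->size - b->used) {
		struct buf *nb = buf_new();

		if (!nb)
			return NULL;
		b->next = nb;
		p->tail = nb;
		b = nb;
	}
	b->used += n;
	return memset(b->data + b->used - n, 0, n);
}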
@@ -125,6 +156,7 @@ struct context {
125struct iso_context { 156struct iso_context {
126 struct fw_iso_context base; 157 struct fw_iso_context base;
127 struct context context; 158 struct context context;
159 int excess_bytes;
128 void *header; 160 void *header;
129 size_t header_length; 161 size_t header_length;
130}; 162};
@@ -197,8 +229,6 @@ static inline struct fw_ohci *fw_ohci(struct fw_card *card)
197#define SELF_ID_BUF_SIZE 0x800 229#define SELF_ID_BUF_SIZE 0x800
198#define OHCI_TCODE_PHY_PACKET 0x0e 230#define OHCI_TCODE_PHY_PACKET 0x0e
199#define OHCI_VERSION_1_1 0x010010 231#define OHCI_VERSION_1_1 0x010010
200#define ISO_BUFFER_SIZE (64 * 1024)
201#define AT_BUFFER_SIZE 4096
202 232
203static char ohci_driver_name[] = KBUILD_MODNAME; 233static char ohci_driver_name[] = KBUILD_MODNAME;
204 234
@@ -455,71 +485,108 @@ find_branch_descriptor(struct descriptor *d, int z)
455static void context_tasklet(unsigned long data) 485static void context_tasklet(unsigned long data)
456{ 486{
457 struct context *ctx = (struct context *) data; 487 struct context *ctx = (struct context *) data;
458 struct fw_ohci *ohci = ctx->ohci;
459 struct descriptor *d, *last; 488 struct descriptor *d, *last;
460 u32 address; 489 u32 address;
461 int z; 490 int z;
491 struct descriptor_buffer *desc;
462 492
463 dma_sync_single_for_cpu(ohci->card.device, ctx->buffer_bus, 493 desc = list_entry(ctx->buffer_list.next,
464 ctx->buffer_size, DMA_TO_DEVICE); 494 struct descriptor_buffer, list);
465 495 last = ctx->last;
466 d = ctx->tail_descriptor;
467 last = ctx->tail_descriptor_last;
468
469 while (last->branch_address != 0) { 496 while (last->branch_address != 0) {
497 struct descriptor_buffer *old_desc = desc;
470 address = le32_to_cpu(last->branch_address); 498 address = le32_to_cpu(last->branch_address);
471 z = address & 0xf; 499 z = address & 0xf;
472 d = ctx->buffer + (address - ctx->buffer_bus) / sizeof(*d); 500 address &= ~0xf;
501
502 /* If the branch address points to a buffer outside of the
503 * current buffer, advance to the next buffer. */
504 if (address < desc->buffer_bus ||
505 address >= desc->buffer_bus + desc->used)
506 desc = list_entry(desc->list.next,
507 struct descriptor_buffer, list);
508 d = desc->buffer + (address - desc->buffer_bus) / sizeof(*d);
473 last = find_branch_descriptor(d, z); 509 last = find_branch_descriptor(d, z);
474 510
475 if (!ctx->callback(ctx, d, last)) 511 if (!ctx->callback(ctx, d, last))
476 break; 512 break;
477 513
478 ctx->tail_descriptor = d; 514 if (old_desc != desc) {
479 ctx->tail_descriptor_last = last; 515 /* If we've advanced to the next buffer, move the
516 * previous buffer to the free list. */
517 unsigned long flags;
518 old_desc->used = 0;
519 spin_lock_irqsave(&ctx->ohci->lock, flags);
520 list_move_tail(&old_desc->list, &ctx->buffer_list);
521 spin_unlock_irqrestore(&ctx->ohci->lock, flags);
522 }
523 ctx->last = last;
480 } 524 }
481} 525}
482 526
527/*
528 * Allocate a new buffer and add it to the list of free buffers for this
529 * context. Must be called with ohci->lock held.
530 */
531static int
532context_add_buffer(struct context *ctx)
533{
534 struct descriptor_buffer *desc;
535 dma_addr_t bus_addr;
536 int offset;
537
538 /*
539 * 16MB of descriptors should be far more than enough for any DMA
540 * program. This will catch run-away userspace or DoS attacks.
541 */
542 if (ctx->total_allocation >= 16*1024*1024)
543 return -ENOMEM;
544
545 desc = dma_alloc_coherent(ctx->ohci->card.device, PAGE_SIZE,
546 &bus_addr, GFP_ATOMIC);
547 if (!desc)
548 return -ENOMEM;
549
550 offset = (void *)&desc->buffer - (void *)desc;
551 desc->buffer_size = PAGE_SIZE - offset;
552 desc->buffer_bus = bus_addr + offset;
553 desc->used = 0;
554
555 list_add_tail(&desc->list, &ctx->buffer_list);
556 ctx->total_allocation += PAGE_SIZE;
557
558 return 0;
559}
560
483static int 561static int
484context_init(struct context *ctx, struct fw_ohci *ohci, 562context_init(struct context *ctx, struct fw_ohci *ohci,
485 size_t buffer_size, u32 regs, 563 u32 regs, descriptor_callback_t callback)
486 descriptor_callback_t callback)
487{ 564{
488 ctx->ohci = ohci; 565 ctx->ohci = ohci;
489 ctx->regs = regs; 566 ctx->regs = regs;
490 ctx->buffer_size = buffer_size; 567 ctx->total_allocation = 0;
491 ctx->buffer = kmalloc(buffer_size, GFP_KERNEL); 568
492 if (ctx->buffer == NULL) 569 INIT_LIST_HEAD(&ctx->buffer_list);
570 if (context_add_buffer(ctx) < 0)
493 return -ENOMEM; 571 return -ENOMEM;
494 572
573 ctx->buffer_tail = list_entry(ctx->buffer_list.next,
574 struct descriptor_buffer, list);
575
495 tasklet_init(&ctx->tasklet, context_tasklet, (unsigned long)ctx); 576 tasklet_init(&ctx->tasklet, context_tasklet, (unsigned long)ctx);
496 ctx->callback = callback; 577 ctx->callback = callback;
497 578
498 ctx->buffer_bus =
499 dma_map_single(ohci->card.device, ctx->buffer,
500 buffer_size, DMA_TO_DEVICE);
501 if (dma_mapping_error(ctx->buffer_bus)) {
502 kfree(ctx->buffer);
503 return -ENOMEM;
504 }
505
506 ctx->head_descriptor = ctx->buffer;
507 ctx->prev_descriptor = ctx->buffer;
508 ctx->tail_descriptor = ctx->buffer;
509 ctx->tail_descriptor_last = ctx->buffer;
510
511 /* 579 /*
512 * We put a dummy descriptor in the buffer that has a NULL 580 * We put a dummy descriptor in the buffer that has a NULL
513 * branch address and looks like it's been sent. That way we 581 * branch address and looks like it's been sent. That way we
514 * have a descriptor to append DMA programs to. Also, the 582 * have a descriptor to append DMA programs to.
515 * ring buffer invariant is that it always has at least one
516 * element so that head == tail means buffer full.
517 */ 583 */
518 584 memset(ctx->buffer_tail->buffer, 0, sizeof(*ctx->buffer_tail->buffer));
519 memset(ctx->head_descriptor, 0, sizeof(*ctx->head_descriptor)); 585 ctx->buffer_tail->buffer->control = cpu_to_le16(DESCRIPTOR_OUTPUT_LAST);
520 ctx->head_descriptor->control = cpu_to_le16(DESCRIPTOR_OUTPUT_LAST); 586 ctx->buffer_tail->buffer->transfer_status = cpu_to_le16(0x8011);
521 ctx->head_descriptor->transfer_status = cpu_to_le16(0x8011); 587 ctx->buffer_tail->used += sizeof(*ctx->buffer_tail->buffer);
522 ctx->head_descriptor++; 588 ctx->last = ctx->buffer_tail->buffer;
589 ctx->prev = ctx->buffer_tail->buffer;
523 590
524 return 0; 591 return 0;
525} 592}
@@ -528,35 +595,42 @@ static void
528context_release(struct context *ctx) 595context_release(struct context *ctx)
529{ 596{
530 struct fw_card *card = &ctx->ohci->card; 597 struct fw_card *card = &ctx->ohci->card;
598 struct descriptor_buffer *desc, *tmp;
531 599
532 dma_unmap_single(card->device, ctx->buffer_bus, 600 list_for_each_entry_safe(desc, tmp, &ctx->buffer_list, list)
533 ctx->buffer_size, DMA_TO_DEVICE); 601 dma_free_coherent(card->device, PAGE_SIZE, desc,
534 kfree(ctx->buffer); 602 desc->buffer_bus -
603 ((void *)&desc->buffer - (void *)desc));
535} 604}
536 605
606/* Must be called with ohci->lock held */
537static struct descriptor * 607static struct descriptor *
538context_get_descriptors(struct context *ctx, int z, dma_addr_t *d_bus) 608context_get_descriptors(struct context *ctx, int z, dma_addr_t *d_bus)
539{ 609{
540 struct descriptor *d, *tail, *end; 610 struct descriptor *d = NULL;
541 611 struct descriptor_buffer *desc = ctx->buffer_tail;
542 d = ctx->head_descriptor; 612
543 tail = ctx->tail_descriptor; 613 if (z * sizeof(*d) > desc->buffer_size)
544 end = ctx->buffer + ctx->buffer_size / sizeof(*d); 614 return NULL;
545
546 if (d + z <= tail) {
547 goto has_space;
548 } else if (d > tail && d + z <= end) {
549 goto has_space;
550 } else if (d > tail && ctx->buffer + z <= tail) {
551 d = ctx->buffer;
552 goto has_space;
553 }
554 615
555 return NULL; 616 if (z * sizeof(*d) > desc->buffer_size - desc->used) {
617 /* No room for the descriptor in this buffer, so advance to the
618 * next one. */
619
620 if (desc->list.next == &ctx->buffer_list) {
621 /* If there is no free buffer next in the list,
622 * allocate one. */
623 if (context_add_buffer(ctx) < 0)
624 return NULL;
625 }
626 desc = list_entry(desc->list.next,
627 struct descriptor_buffer, list);
628 ctx->buffer_tail = desc;
629 }
556 630
557 has_space: 631 d = desc->buffer + desc->used / sizeof(*d);
558 memset(d, 0, z * sizeof(*d)); 632 memset(d, 0, z * sizeof(*d));
559 *d_bus = ctx->buffer_bus + (d - ctx->buffer) * sizeof(*d); 633 *d_bus = desc->buffer_bus + desc->used;
560 634
561 return d; 635 return d;
562} 636}
@@ -566,7 +640,7 @@ static void context_run(struct context *ctx, u32 extra)
566 struct fw_ohci *ohci = ctx->ohci; 640 struct fw_ohci *ohci = ctx->ohci;
567 641
568 reg_write(ohci, COMMAND_PTR(ctx->regs), 642 reg_write(ohci, COMMAND_PTR(ctx->regs),
569 le32_to_cpu(ctx->tail_descriptor_last->branch_address)); 643 le32_to_cpu(ctx->last->branch_address));
570 reg_write(ohci, CONTROL_CLEAR(ctx->regs), ~0); 644 reg_write(ohci, CONTROL_CLEAR(ctx->regs), ~0);
571 reg_write(ohci, CONTROL_SET(ctx->regs), CONTEXT_RUN | extra); 645 reg_write(ohci, CONTROL_SET(ctx->regs), CONTEXT_RUN | extra);
572 flush_writes(ohci); 646 flush_writes(ohci);
@@ -576,15 +650,13 @@ static void context_append(struct context *ctx,
576 struct descriptor *d, int z, int extra) 650 struct descriptor *d, int z, int extra)
577{ 651{
578 dma_addr_t d_bus; 652 dma_addr_t d_bus;
653 struct descriptor_buffer *desc = ctx->buffer_tail;
579 654
580 d_bus = ctx->buffer_bus + (d - ctx->buffer) * sizeof(*d); 655 d_bus = desc->buffer_bus + (d - desc->buffer) * sizeof(*d);
581 656
582 ctx->head_descriptor = d + z + extra; 657 desc->used += (z + extra) * sizeof(*d);
583 ctx->prev_descriptor->branch_address = cpu_to_le32(d_bus | z); 658 ctx->prev->branch_address = cpu_to_le32(d_bus | z);
584 ctx->prev_descriptor = find_branch_descriptor(d, z); 659 ctx->prev = find_branch_descriptor(d, z);
585
586 dma_sync_single_for_device(ctx->ohci->card.device, ctx->buffer_bus,
587 ctx->buffer_size, DMA_TO_DEVICE);
588 660
589 reg_write(ctx->ohci, CONTROL_SET(ctx->regs), CONTEXT_WAKE); 661 reg_write(ctx->ohci, CONTROL_SET(ctx->regs), CONTEXT_WAKE);
590 flush_writes(ctx->ohci); 662 flush_writes(ctx->ohci);
@@ -1078,6 +1150,13 @@ static irqreturn_t irq_handler(int irq, void *data)
1078 if (unlikely(event & OHCI1394_postedWriteErr)) 1150 if (unlikely(event & OHCI1394_postedWriteErr))
1079 fw_error("PCI posted write error\n"); 1151 fw_error("PCI posted write error\n");
1080 1152
1153 if (unlikely(event & OHCI1394_cycleTooLong)) {
1154 if (printk_ratelimit())
1155 fw_notify("isochronous cycle too long\n");
1156 reg_write(ohci, OHCI1394_LinkControlSet,
1157 OHCI1394_LinkControl_cycleMaster);
1158 }
1159
1081 if (event & OHCI1394_cycle64Seconds) { 1160 if (event & OHCI1394_cycle64Seconds) {
1082 cycle_time = reg_read(ohci, OHCI1394_IsochronousCycleTimer); 1161 cycle_time = reg_read(ohci, OHCI1394_IsochronousCycleTimer);
1083 if ((cycle_time & 0x80000000) == 0) 1162 if ((cycle_time & 0x80000000) == 0)
@@ -1151,8 +1230,8 @@ static int ohci_enable(struct fw_card *card, u32 *config_rom, size_t length)
1151 OHCI1394_RQPkt | OHCI1394_RSPkt | 1230 OHCI1394_RQPkt | OHCI1394_RSPkt |
1152 OHCI1394_reqTxComplete | OHCI1394_respTxComplete | 1231 OHCI1394_reqTxComplete | OHCI1394_respTxComplete |
1153 OHCI1394_isochRx | OHCI1394_isochTx | 1232 OHCI1394_isochRx | OHCI1394_isochTx |
1154 OHCI1394_postedWriteErr | OHCI1394_cycle64Seconds | 1233 OHCI1394_postedWriteErr | OHCI1394_cycleTooLong |
1155 OHCI1394_masterIntEnable); 1234 OHCI1394_cycle64Seconds | OHCI1394_masterIntEnable);
1156 1235
1157 /* Activate link_on bit and contender bit in our self ID packets.*/ 1236 /* Activate link_on bit and contender bit in our self ID packets.*/
1158 if (ohci_update_phy_reg(card, 4, 0, 1237 if (ohci_update_phy_reg(card, 4, 0,
@@ -1408,9 +1487,13 @@ static int handle_ir_dualbuffer_packet(struct context *context,
1408 void *p, *end; 1487 void *p, *end;
1409 int i; 1488 int i;
1410 1489
1411 if (db->first_res_count > 0 && db->second_res_count > 0) 1490 if (db->first_res_count > 0 && db->second_res_count > 0) {
1412 /* This descriptor isn't done yet, stop iteration. */ 1491 if (ctx->excess_bytes <= le16_to_cpu(db->second_req_count)) {
1413 return 0; 1492 /* This descriptor isn't done yet, stop iteration. */
1493 return 0;
1494 }
1495 ctx->excess_bytes -= le16_to_cpu(db->second_req_count);
1496 }
1414 1497
1415 header_length = le16_to_cpu(db->first_req_count) - 1498 header_length = le16_to_cpu(db->first_req_count) -
1416 le16_to_cpu(db->first_res_count); 1499 le16_to_cpu(db->first_res_count);
@@ -1429,11 +1512,15 @@ static int handle_ir_dualbuffer_packet(struct context *context,
1429 *(u32 *) (ctx->header + i) = __swab32(*(u32 *) (p + 4)); 1512 *(u32 *) (ctx->header + i) = __swab32(*(u32 *) (p + 4));
1430 memcpy(ctx->header + i + 4, p + 8, ctx->base.header_size - 4); 1513 memcpy(ctx->header + i + 4, p + 8, ctx->base.header_size - 4);
1431 i += ctx->base.header_size; 1514 i += ctx->base.header_size;
1515 ctx->excess_bytes +=
1516 (le32_to_cpu(*(u32 *)(p + 4)) >> 16) & 0xffff;
1432 p += ctx->base.header_size + 4; 1517 p += ctx->base.header_size + 4;
1433 } 1518 }
1434
1435 ctx->header_length = i; 1519 ctx->header_length = i;
1436 1520
1521 ctx->excess_bytes -= le16_to_cpu(db->second_req_count) -
1522 le16_to_cpu(db->second_res_count);
1523
1437 if (le16_to_cpu(db->control) & DESCRIPTOR_IRQ_ALWAYS) { 1524 if (le16_to_cpu(db->control) & DESCRIPTOR_IRQ_ALWAYS) {
1438 ir_header = (__le32 *) (db + 1); 1525 ir_header = (__le32 *) (db + 1);
1439 ctx->base.callback(&ctx->base, 1526 ctx->base.callback(&ctx->base,
@@ -1452,24 +1539,24 @@ static int handle_ir_packet_per_buffer(struct context *context,
1452{ 1539{
1453 struct iso_context *ctx = 1540 struct iso_context *ctx =
1454 container_of(context, struct iso_context, context); 1541 container_of(context, struct iso_context, context);
1455 struct descriptor *pd = d + 1; 1542 struct descriptor *pd;
1456 __le32 *ir_header; 1543 __le32 *ir_header;
1457 size_t header_length; 1544 void *p;
1458 void *p, *end; 1545 int i;
1459 int i, z;
1460 1546
1461 if (pd->res_count == pd->req_count) 1547 for (pd = d; pd <= last; pd++) {
1548 if (pd->transfer_status)
1549 break;
1550 }
1551 if (pd > last)
1462 /* Descriptor(s) not done yet, stop iteration */ 1552 /* Descriptor(s) not done yet, stop iteration */
1463 return 0; 1553 return 0;
1464 1554
1465 header_length = le16_to_cpu(d->req_count);
1466
1467 i = ctx->header_length; 1555 i = ctx->header_length;
1468 z = le32_to_cpu(pd->branch_address) & 0xf; 1556 p = last + 1;
1469 p = d + z;
1470 end = p + header_length;
1471 1557
1472 while (p < end && i + ctx->base.header_size <= PAGE_SIZE) { 1558 if (ctx->base.header_size > 0 &&
1559 i + ctx->base.header_size <= PAGE_SIZE) {
1473 /* 1560 /*
1474 * The iso header is byteswapped to little endian by 1561 * The iso header is byteswapped to little endian by
1475 * the controller, but the remaining header quadlets 1562 * the controller, but the remaining header quadlets
@@ -1478,14 +1565,11 @@ static int handle_ir_packet_per_buffer(struct context *context,
1478 */ 1565 */
1479 *(u32 *) (ctx->header + i) = __swab32(*(u32 *) (p + 4)); 1566 *(u32 *) (ctx->header + i) = __swab32(*(u32 *) (p + 4));
1480 memcpy(ctx->header + i + 4, p + 8, ctx->base.header_size - 4); 1567 memcpy(ctx->header + i + 4, p + 8, ctx->base.header_size - 4);
1481 i += ctx->base.header_size; 1568 ctx->header_length += ctx->base.header_size;
1482 p += ctx->base.header_size + 4;
1483 } 1569 }
1484 1570
1485 ctx->header_length = i; 1571 if (le16_to_cpu(last->control) & DESCRIPTOR_IRQ_ALWAYS) {
1486 1572 ir_header = (__le32 *) p;
1487 if (le16_to_cpu(pd->control) & DESCRIPTOR_IRQ_ALWAYS) {
1488 ir_header = (__le32 *) (d + z);
1489 ctx->base.callback(&ctx->base, 1573 ctx->base.callback(&ctx->base,
1490 le32_to_cpu(ir_header[0]) & 0xffff, 1574 le32_to_cpu(ir_header[0]) & 0xffff,
1491 ctx->header_length, ctx->header, 1575 ctx->header_length, ctx->header,
@@ -1493,7 +1577,6 @@ static int handle_ir_packet_per_buffer(struct context *context,
1493 ctx->header_length = 0; 1577 ctx->header_length = 0;
1494 } 1578 }
1495 1579
1496
1497 return 1; 1580 return 1;
1498} 1581}
1499 1582
@@ -1559,8 +1642,7 @@ ohci_allocate_iso_context(struct fw_card *card, int type, size_t header_size)
1559 if (ctx->header == NULL) 1642 if (ctx->header == NULL)
1560 goto out; 1643 goto out;
1561 1644
1562 retval = context_init(&ctx->context, ohci, ISO_BUFFER_SIZE, 1645 retval = context_init(&ctx->context, ohci, regs, callback);
1563 regs, callback);
1564 if (retval < 0) 1646 if (retval < 0)
1565 goto out_with_header; 1647 goto out_with_header;
1566 1648
@@ -1775,19 +1857,6 @@ ohci_queue_iso_receive_dualbuffer(struct fw_iso_context *base,
1775 * packet, retransmit or terminate.. 1857 * packet, retransmit or terminate..
1776 */ 1858 */
1777 1859
1778 if (packet->skip) {
1779 d = context_get_descriptors(&ctx->context, 2, &d_bus);
1780 if (d == NULL)
1781 return -ENOMEM;
1782
1783 db = (struct db_descriptor *) d;
1784 db->control = cpu_to_le16(DESCRIPTOR_STATUS |
1785 DESCRIPTOR_BRANCH_ALWAYS |
1786 DESCRIPTOR_WAIT);
1787 db->first_size = cpu_to_le16(ctx->base.header_size + 4);
1788 context_append(&ctx->context, d, 2, 0);
1789 }
1790
1791 p = packet; 1860 p = packet;
1792 z = 2; 1861 z = 2;
1793 1862
@@ -1815,11 +1884,18 @@ ohci_queue_iso_receive_dualbuffer(struct fw_iso_context *base,
1815 db->control = cpu_to_le16(DESCRIPTOR_STATUS | 1884 db->control = cpu_to_le16(DESCRIPTOR_STATUS |
1816 DESCRIPTOR_BRANCH_ALWAYS); 1885 DESCRIPTOR_BRANCH_ALWAYS);
1817 db->first_size = cpu_to_le16(ctx->base.header_size + 4); 1886 db->first_size = cpu_to_le16(ctx->base.header_size + 4);
1818 db->first_req_count = cpu_to_le16(header_size); 1887 if (p->skip && rest == p->payload_length) {
1888 db->control |= cpu_to_le16(DESCRIPTOR_WAIT);
1889 db->first_req_count = db->first_size;
1890 } else {
1891 db->first_req_count = cpu_to_le16(header_size);
1892 }
1819 db->first_res_count = db->first_req_count; 1893 db->first_res_count = db->first_req_count;
1820 db->first_buffer = cpu_to_le32(d_bus + sizeof(*db)); 1894 db->first_buffer = cpu_to_le32(d_bus + sizeof(*db));
1821 1895
1822 if (offset + rest < PAGE_SIZE) 1896 if (p->skip && rest == p->payload_length)
1897 length = 4;
1898 else if (offset + rest < PAGE_SIZE)
1823 length = rest; 1899 length = rest;
1824 else 1900 else
1825 length = PAGE_SIZE - offset; 1901 length = PAGE_SIZE - offset;
@@ -1835,7 +1911,8 @@ ohci_queue_iso_receive_dualbuffer(struct fw_iso_context *base,
1835 context_append(&ctx->context, d, z, header_z); 1911 context_append(&ctx->context, d, z, header_z);
1836 offset = (offset + length) & ~PAGE_MASK; 1912 offset = (offset + length) & ~PAGE_MASK;
1837 rest -= length; 1913 rest -= length;
1838 page++; 1914 if (offset == 0)
1915 page++;
1839 } 1916 }
1840 1917
1841 return 0; 1918 return 0;
@@ -1849,67 +1926,70 @@ ohci_queue_iso_receive_packet_per_buffer(struct fw_iso_context *base,
1849{ 1926{
1850 struct iso_context *ctx = container_of(base, struct iso_context, base); 1927 struct iso_context *ctx = container_of(base, struct iso_context, base);
1851 struct descriptor *d = NULL, *pd = NULL; 1928 struct descriptor *d = NULL, *pd = NULL;
1852 struct fw_iso_packet *p; 1929 struct fw_iso_packet *p = packet;
1853 dma_addr_t d_bus, page_bus; 1930 dma_addr_t d_bus, page_bus;
1854 u32 z, header_z, rest; 1931 u32 z, header_z, rest;
1855 int i, page, offset, packet_count, header_size; 1932 int i, j, length;
1856 1933 int page, offset, packet_count, header_size, payload_per_buffer;
1857 if (packet->skip) {
1858 d = context_get_descriptors(&ctx->context, 1, &d_bus);
1859 if (d == NULL)
1860 return -ENOMEM;
1861
1862 d->control = cpu_to_le16(DESCRIPTOR_STATUS |
1863 DESCRIPTOR_INPUT_LAST |
1864 DESCRIPTOR_BRANCH_ALWAYS |
1865 DESCRIPTOR_WAIT);
1866 context_append(&ctx->context, d, 1, 0);
1867 }
1868
1869 /* one descriptor for header, one for payload */
1870 /* FIXME: handle cases where we need multiple desc. for payload */
1871 z = 2;
1872 p = packet;
1873 1934
1874 /* 1935 /*
1875 * The OHCI controller puts the status word in the 1936 * The OHCI controller puts the status word in the
1876 * buffer too, so we need 4 extra bytes per packet. 1937 * buffer too, so we need 4 extra bytes per packet.
1877 */ 1938 */
1878 packet_count = p->header_length / ctx->base.header_size; 1939 packet_count = p->header_length / ctx->base.header_size;
1879 header_size = packet_count * (ctx->base.header_size + 4); 1940 header_size = ctx->base.header_size + 4;
1880 1941
1881 /* Get header size in number of descriptors. */ 1942 /* Get header size in number of descriptors. */
1882 header_z = DIV_ROUND_UP(header_size, sizeof(*d)); 1943 header_z = DIV_ROUND_UP(header_size, sizeof(*d));
1883 page = payload >> PAGE_SHIFT; 1944 page = payload >> PAGE_SHIFT;
1884 offset = payload & ~PAGE_MASK; 1945 offset = payload & ~PAGE_MASK;
1885 rest = p->payload_length; 1946 payload_per_buffer = p->payload_length / packet_count;
1886 1947
1887 for (i = 0; i < packet_count; i++) { 1948 for (i = 0; i < packet_count; i++) {
1888 /* d points to the header descriptor */ 1949 /* d points to the header descriptor */
1950 z = DIV_ROUND_UP(payload_per_buffer + offset, PAGE_SIZE) + 1;
1889 d = context_get_descriptors(&ctx->context, 1951 d = context_get_descriptors(&ctx->context,
1890 z + header_z, &d_bus); 1952 z + header_z, &d_bus);
1891 if (d == NULL) 1953 if (d == NULL)
1892 return -ENOMEM; 1954 return -ENOMEM;
1893 1955
1894 d->control = cpu_to_le16(DESCRIPTOR_INPUT_MORE); 1956 d->control = cpu_to_le16(DESCRIPTOR_STATUS |
1957 DESCRIPTOR_INPUT_MORE);
1958 if (p->skip && i == 0)
1959 d->control |= cpu_to_le16(DESCRIPTOR_WAIT);
1895 d->req_count = cpu_to_le16(header_size); 1960 d->req_count = cpu_to_le16(header_size);
1896 d->res_count = d->req_count; 1961 d->res_count = d->req_count;
1962 d->transfer_status = 0;
1897 d->data_address = cpu_to_le32(d_bus + (z * sizeof(*d))); 1963 d->data_address = cpu_to_le32(d_bus + (z * sizeof(*d)));
1898 1964
1899 /* pd points to the payload descriptor */ 1965 rest = payload_per_buffer;
1900 pd = d + 1; 1966 for (j = 1; j < z; j++) {
1967 pd = d + j;
1968 pd->control = cpu_to_le16(DESCRIPTOR_STATUS |
1969 DESCRIPTOR_INPUT_MORE);
1970
1971 if (offset + rest < PAGE_SIZE)
1972 length = rest;
1973 else
1974 length = PAGE_SIZE - offset;
1975 pd->req_count = cpu_to_le16(length);
1976 pd->res_count = pd->req_count;
1977 pd->transfer_status = 0;
1978
1979 page_bus = page_private(buffer->pages[page]);
1980 pd->data_address = cpu_to_le32(page_bus + offset);
1981
1982 offset = (offset + length) & ~PAGE_MASK;
1983 rest -= length;
1984 if (offset == 0)
1985 page++;
1986 }
1901 pd->control = cpu_to_le16(DESCRIPTOR_STATUS | 1987 pd->control = cpu_to_le16(DESCRIPTOR_STATUS |
1902 DESCRIPTOR_INPUT_LAST | 1988 DESCRIPTOR_INPUT_LAST |
1903 DESCRIPTOR_BRANCH_ALWAYS); 1989 DESCRIPTOR_BRANCH_ALWAYS);
1904 if (p->interrupt) 1990 if (p->interrupt && i == packet_count - 1)
1905 pd->control |= cpu_to_le16(DESCRIPTOR_IRQ_ALWAYS); 1991 pd->control |= cpu_to_le16(DESCRIPTOR_IRQ_ALWAYS);
1906 1992
1907 pd->req_count = cpu_to_le16(rest);
1908 pd->res_count = pd->req_count;
1909
1910 page_bus = page_private(buffer->pages[page]);
1911 pd->data_address = cpu_to_le32(page_bus + offset);
1912
1913 context_append(&ctx->context, d, z, header_z); 1993 context_append(&ctx->context, d, z, header_z);
1914 } 1994 }
1915 1995
@@ -1923,16 +2003,22 @@ ohci_queue_iso(struct fw_iso_context *base,
1923 unsigned long payload) 2003 unsigned long payload)
1924{ 2004{
1925 struct iso_context *ctx = container_of(base, struct iso_context, base); 2005 struct iso_context *ctx = container_of(base, struct iso_context, base);
2006 unsigned long flags;
2007 int retval;
1926 2008
2009 spin_lock_irqsave(&ctx->context.ohci->lock, flags);
1927 if (base->type == FW_ISO_CONTEXT_TRANSMIT) 2010 if (base->type == FW_ISO_CONTEXT_TRANSMIT)
1928 return ohci_queue_iso_transmit(base, packet, buffer, payload); 2011 retval = ohci_queue_iso_transmit(base, packet, buffer, payload);
1929 else if (ctx->context.ohci->version >= OHCI_VERSION_1_1) 2012 else if (ctx->context.ohci->version >= OHCI_VERSION_1_1)
1930 return ohci_queue_iso_receive_dualbuffer(base, packet, 2013 retval = ohci_queue_iso_receive_dualbuffer(base, packet,
1931 buffer, payload); 2014 buffer, payload);
1932 else 2015 else
1933 return ohci_queue_iso_receive_packet_per_buffer(base, packet, 2016 retval = ohci_queue_iso_receive_packet_per_buffer(base, packet,
1934 buffer, 2017 buffer,
1935 payload); 2018 payload);
2019 spin_unlock_irqrestore(&ctx->context.ohci->lock, flags);
2020
2021 return retval;
1936} 2022}
1937 2023
1938static const struct fw_card_driver ohci_driver = { 2024static const struct fw_card_driver ohci_driver = {
@@ -2004,10 +2090,10 @@ pci_probe(struct pci_dev *dev, const struct pci_device_id *ent)
2004 ar_context_init(&ohci->ar_response_ctx, ohci, 2090 ar_context_init(&ohci->ar_response_ctx, ohci,
2005 OHCI1394_AsRspRcvContextControlSet); 2091 OHCI1394_AsRspRcvContextControlSet);
2006 2092
2007 context_init(&ohci->at_request_ctx, ohci, AT_BUFFER_SIZE, 2093 context_init(&ohci->at_request_ctx, ohci,
2008 OHCI1394_AsReqTrContextControlSet, handle_at_packet); 2094 OHCI1394_AsReqTrContextControlSet, handle_at_packet);
2009 2095
2010 context_init(&ohci->at_response_ctx, ohci, AT_BUFFER_SIZE, 2096 context_init(&ohci->at_response_ctx, ohci,
2011 OHCI1394_AsRspTrContextControlSet, handle_at_packet); 2097 OHCI1394_AsRspTrContextControlSet, handle_at_packet);
2012 2098
2013 reg_write(ohci, OHCI1394_IsoRecvIntMaskSet, ~0); 2099 reg_write(ohci, OHCI1394_IsoRecvIntMaskSet, ~0);
diff --git a/drivers/firewire/fw-sbp2.c b/drivers/firewire/fw-sbp2.c
index c2169d215bf7..19ece9b6d742 100644
--- a/drivers/firewire/fw-sbp2.c
+++ b/drivers/firewire/fw-sbp2.c
@@ -40,6 +40,7 @@
40#include <linux/stringify.h> 40#include <linux/stringify.h>
41#include <linux/timer.h> 41#include <linux/timer.h>
42#include <linux/workqueue.h> 42#include <linux/workqueue.h>
43#include <asm/system.h>
43 44
44#include <scsi/scsi.h> 45#include <scsi/scsi.h>
45#include <scsi/scsi_cmnd.h> 46#include <scsi/scsi_cmnd.h>
@@ -148,18 +149,26 @@ struct sbp2_target {
148 149
149 unsigned workarounds; 150 unsigned workarounds;
150 struct list_head lu_list; 151 struct list_head lu_list;
152
153 unsigned int mgt_orb_timeout;
151}; 154};
152 155
153#define SBP2_MAX_SG_ELEMENT_LENGTH 0xf000 156/*
154#define SBP2_MAX_SECTORS 255 /* Max sectors supported */ 157 * Per section 7.4.8 of the SBP-2 spec, a mgt_ORB_timeout value can be
158 * provided in the config rom. Most devices do provide a value, which
159 * we'll use for login management orbs, but with some sane limits.
160 */
161#define SBP2_MIN_LOGIN_ORB_TIMEOUT 5000U /* Timeout in ms */
162#define SBP2_MAX_LOGIN_ORB_TIMEOUT 40000U /* Timeout in ms */
155#define SBP2_ORB_TIMEOUT 2000 /* Timeout in ms */ 163#define SBP2_ORB_TIMEOUT 2000 /* Timeout in ms */
156
157#define SBP2_ORB_NULL 0x80000000 164#define SBP2_ORB_NULL 0x80000000
165#define SBP2_MAX_SG_ELEMENT_LENGTH 0xf000
158 166
159#define SBP2_DIRECTION_TO_MEDIA 0x0 167#define SBP2_DIRECTION_TO_MEDIA 0x0
160#define SBP2_DIRECTION_FROM_MEDIA 0x1 168#define SBP2_DIRECTION_FROM_MEDIA 0x1
161 169
162/* Unit directory keys */ 170/* Unit directory keys */
171#define SBP2_CSR_UNIT_CHARACTERISTICS 0x3a
163#define SBP2_CSR_FIRMWARE_REVISION 0x3c 172#define SBP2_CSR_FIRMWARE_REVISION 0x3c
164#define SBP2_CSR_LOGICAL_UNIT_NUMBER 0x14 173#define SBP2_CSR_LOGICAL_UNIT_NUMBER 0x14
165#define SBP2_CSR_LOGICAL_UNIT_DIRECTORY 0xd4 174#define SBP2_CSR_LOGICAL_UNIT_DIRECTORY 0xd4
@@ -489,6 +498,7 @@ sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id,
489{ 498{
490 struct fw_device *device = fw_device(lu->tgt->unit->device.parent); 499 struct fw_device *device = fw_device(lu->tgt->unit->device.parent);
491 struct sbp2_management_orb *orb; 500 struct sbp2_management_orb *orb;
501 unsigned int timeout;
492 int retval = -ENOMEM; 502 int retval = -ENOMEM;
493 503
494 orb = kzalloc(sizeof(*orb), GFP_ATOMIC); 504 orb = kzalloc(sizeof(*orb), GFP_ATOMIC);
@@ -516,9 +526,13 @@ sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id,
516 orb->request.status_fifo.low = lu->address_handler.offset; 526 orb->request.status_fifo.low = lu->address_handler.offset;
517 527
518 if (function == SBP2_LOGIN_REQUEST) { 528 if (function == SBP2_LOGIN_REQUEST) {
529 /* Ask for 2^2 == 4 seconds reconnect grace period */
519 orb->request.misc |= 530 orb->request.misc |=
520 MANAGEMENT_ORB_EXCLUSIVE(sbp2_param_exclusive_login) | 531 MANAGEMENT_ORB_RECONNECT(2) |
521 MANAGEMENT_ORB_RECONNECT(0); 532 MANAGEMENT_ORB_EXCLUSIVE(sbp2_param_exclusive_login);
533 timeout = lu->tgt->mgt_orb_timeout;
534 } else {
535 timeout = SBP2_ORB_TIMEOUT;
522 } 536 }
523 537
524 fw_memcpy_to_be32(&orb->request, &orb->request, sizeof(orb->request)); 538 fw_memcpy_to_be32(&orb->request, &orb->request, sizeof(orb->request));
@@ -535,8 +549,7 @@ sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id,
535 sbp2_send_orb(&orb->base, lu, node_id, generation, 549 sbp2_send_orb(&orb->base, lu, node_id, generation,
536 lu->tgt->management_agent_address); 550 lu->tgt->management_agent_address);
537 551
538 wait_for_completion_timeout(&orb->done, 552 wait_for_completion_timeout(&orb->done, msecs_to_jiffies(timeout));
539 msecs_to_jiffies(SBP2_ORB_TIMEOUT));
540 553
541 retval = -EIO; 554 retval = -EIO;
542 if (sbp2_cancel_orbs(lu) == 0) { 555 if (sbp2_cancel_orbs(lu) == 0) {
@@ -608,13 +621,17 @@ static void sbp2_release_target(struct kref *kref)
608 struct sbp2_logical_unit *lu, *next; 621 struct sbp2_logical_unit *lu, *next;
609 struct Scsi_Host *shost = 622 struct Scsi_Host *shost =
610 container_of((void *)tgt, struct Scsi_Host, hostdata[0]); 623 container_of((void *)tgt, struct Scsi_Host, hostdata[0]);
624 struct fw_device *device = fw_device(tgt->unit->device.parent);
611 625
612 list_for_each_entry_safe(lu, next, &tgt->lu_list, link) { 626 list_for_each_entry_safe(lu, next, &tgt->lu_list, link) {
613 if (lu->sdev) 627 if (lu->sdev)
614 scsi_remove_device(lu->sdev); 628 scsi_remove_device(lu->sdev);
615 629
616 sbp2_send_management_orb(lu, tgt->node_id, lu->generation, 630 if (!fw_device_is_shutdown(device))
617 SBP2_LOGOUT_REQUEST, lu->login_id, NULL); 631 sbp2_send_management_orb(lu, tgt->node_id,
632 lu->generation, SBP2_LOGOUT_REQUEST,
633 lu->login_id, NULL);
634
618 fw_core_remove_address_handler(&lu->address_handler); 635 fw_core_remove_address_handler(&lu->address_handler);
619 list_del(&lu->link); 636 list_del(&lu->link);
620 kfree(lu); 637 kfree(lu);
@@ -628,6 +645,21 @@ static void sbp2_release_target(struct kref *kref)
628 645
629static struct workqueue_struct *sbp2_wq; 646static struct workqueue_struct *sbp2_wq;
630 647
648/*
649 * Always get the target's kref when scheduling work on one its units.
650 * Each workqueue job is responsible to call sbp2_target_put() upon return.
651 */
652static void sbp2_queue_work(struct sbp2_logical_unit *lu, unsigned long delay)
653{
654 if (queue_delayed_work(sbp2_wq, &lu->work, delay))
655 kref_get(&lu->tgt->kref);
656}
657
658static void sbp2_target_put(struct sbp2_target *tgt)
659{
660 kref_put(&tgt->kref, sbp2_release_target);
661}
662
631static void sbp2_reconnect(struct work_struct *work); 663static void sbp2_reconnect(struct work_struct *work);
632 664
633static void sbp2_login(struct work_struct *work) 665static void sbp2_login(struct work_struct *work)
@@ -643,22 +675,19 @@ static void sbp2_login(struct work_struct *work)
643 struct sbp2_login_response response; 675 struct sbp2_login_response response;
644 int generation, node_id, local_node_id; 676 int generation, node_id, local_node_id;
645 677
646 generation = device->card->generation; 678 generation = device->generation;
647 node_id = device->node->node_id; 679 smp_rmb(); /* node_id must not be older than generation */
648 local_node_id = device->card->local_node->node_id; 680 node_id = device->node_id;
681 local_node_id = device->card->node_id;
649 682
650 if (sbp2_send_management_orb(lu, node_id, generation, 683 if (sbp2_send_management_orb(lu, node_id, generation,
651 SBP2_LOGIN_REQUEST, lu->lun, &response) < 0) { 684 SBP2_LOGIN_REQUEST, lu->lun, &response) < 0) {
652 if (lu->retries++ < 5) { 685 if (lu->retries++ < 5)
653 if (queue_delayed_work(sbp2_wq, &lu->work, 686 sbp2_queue_work(lu, DIV_ROUND_UP(HZ, 5));
654 DIV_ROUND_UP(HZ, 5))) 687 else
655 kref_get(&lu->tgt->kref);
656 } else {
657 fw_error("failed to login to %s LUN %04x\n", 688 fw_error("failed to login to %s LUN %04x\n",
658 unit->device.bus_id, lu->lun); 689 unit->device.bus_id, lu->lun);
659 } 690 goto out;
660 kref_put(&lu->tgt->kref, sbp2_release_target);
661 return;
662 } 691 }
663 692
664 lu->generation = generation; 693 lu->generation = generation;
@@ -700,7 +729,8 @@ static void sbp2_login(struct work_struct *work)
700 lu->sdev = sdev; 729 lu->sdev = sdev;
701 scsi_device_put(sdev); 730 scsi_device_put(sdev);
702 } 731 }
703 kref_put(&lu->tgt->kref, sbp2_release_target); 732 out:
733 sbp2_target_put(lu->tgt);
704} 734}
705 735
706static int sbp2_add_logical_unit(struct sbp2_target *tgt, int lun_entry) 736static int sbp2_add_logical_unit(struct sbp2_target *tgt, int lun_entry)
@@ -750,6 +780,7 @@ static int sbp2_scan_unit_dir(struct sbp2_target *tgt, u32 *directory,
750{ 780{
751 struct fw_csr_iterator ci; 781 struct fw_csr_iterator ci;
752 int key, value; 782 int key, value;
783 unsigned int timeout;
753 784
754 fw_csr_iterator_init(&ci, directory); 785 fw_csr_iterator_init(&ci, directory);
755 while (fw_csr_iterator_next(&ci, &key, &value)) { 786 while (fw_csr_iterator_next(&ci, &key, &value)) {
@@ -772,6 +803,21 @@ static int sbp2_scan_unit_dir(struct sbp2_target *tgt, u32 *directory,
772 *firmware_revision = value; 803 *firmware_revision = value;
773 break; 804 break;
774 805
806 case SBP2_CSR_UNIT_CHARACTERISTICS:
807 /* the timeout value is stored in 500ms units */
808 timeout = ((unsigned int) value >> 8 & 0xff) * 500;
809 timeout = max(timeout, SBP2_MIN_LOGIN_ORB_TIMEOUT);
810 tgt->mgt_orb_timeout =
811 min(timeout, SBP2_MAX_LOGIN_ORB_TIMEOUT);
812
813 if (timeout > tgt->mgt_orb_timeout)
814 fw_notify("%s: config rom contains %ds "
815 "management ORB timeout, limiting "
816 "to %ds\n", tgt->unit->device.bus_id,
817 timeout / 1000,
818 tgt->mgt_orb_timeout / 1000);
819 break;
820
775 case SBP2_CSR_LOGICAL_UNIT_NUMBER: 821 case SBP2_CSR_LOGICAL_UNIT_NUMBER:
776 if (sbp2_add_logical_unit(tgt, value) < 0) 822 if (sbp2_add_logical_unit(tgt, value) < 0)
777 return -ENOMEM; 823 return -ENOMEM;
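For the SBP2_CSR_UNIT_CHARACTERISTICS case above: the field extracted with (value >> 8 & 0xff) is the management ORB timeout in 500 ms units, so for example a value of 10 in that field means 10 * 500 ms = 5 s. A sketch of the same computation as a standalone helper, assuming SBP2_MIN_LOGIN_ORB_TIMEOUT and SBP2_MAX_LOGIN_ORB_TIMEOUT are millisecond constants as in the patch:

static unsigned int mgt_orb_timeout_ms(u32 unit_characteristics)
{
	/* timeout field: bits 8..15 of the quadlet, in 500 ms units */
	unsigned int t = ((unit_characteristics >> 8) & 0xff) * 500;

	if (t < SBP2_MIN_LOGIN_ORB_TIMEOUT)
		t = SBP2_MIN_LOGIN_ORB_TIMEOUT;
	if (t > SBP2_MAX_LOGIN_ORB_TIMEOUT)
		t = SBP2_MAX_LOGIN_ORB_TIMEOUT;
	return t;
}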
@@ -865,18 +911,13 @@ static int sbp2_probe(struct device *dev)
865 911
866 get_device(&unit->device); 912 get_device(&unit->device);
867 913
868 /* 914 /* Do the login in a workqueue so we can easily reschedule retries. */
869 * We schedule work to do the login so we can easily
870 * reschedule retries. Always get the ref before scheduling
871 * work.
872 */
873 list_for_each_entry(lu, &tgt->lu_list, link) 915 list_for_each_entry(lu, &tgt->lu_list, link)
874 if (queue_delayed_work(sbp2_wq, &lu->work, 0)) 916 sbp2_queue_work(lu, 0);
875 kref_get(&tgt->kref);
876 return 0; 917 return 0;
877 918
878 fail_tgt_put: 919 fail_tgt_put:
879 kref_put(&tgt->kref, sbp2_release_target); 920 sbp2_target_put(tgt);
880 return -ENOMEM; 921 return -ENOMEM;
881 922
882 fail_shost_put: 923 fail_shost_put:
@@ -889,7 +930,7 @@ static int sbp2_remove(struct device *dev)
889 struct fw_unit *unit = fw_unit(dev); 930 struct fw_unit *unit = fw_unit(dev);
890 struct sbp2_target *tgt = unit->device.driver_data; 931 struct sbp2_target *tgt = unit->device.driver_data;
891 932
892 kref_put(&tgt->kref, sbp2_release_target); 933 sbp2_target_put(tgt);
893 return 0; 934 return 0;
894} 935}
895 936
@@ -901,9 +942,10 @@ static void sbp2_reconnect(struct work_struct *work)
901 struct fw_device *device = fw_device(unit->device.parent); 942 struct fw_device *device = fw_device(unit->device.parent);
902 int generation, node_id, local_node_id; 943 int generation, node_id, local_node_id;
903 944
904 generation = device->card->generation; 945 generation = device->generation;
905 node_id = device->node->node_id; 946 smp_rmb(); /* node_id must not be older than generation */
906 local_node_id = device->card->local_node->node_id; 947 node_id = device->node_id;
948 local_node_id = device->card->node_id;
907 949
908 if (sbp2_send_management_orb(lu, node_id, generation, 950 if (sbp2_send_management_orb(lu, node_id, generation,
909 SBP2_RECONNECT_REQUEST, 951 SBP2_RECONNECT_REQUEST,
@@ -915,10 +957,8 @@ static void sbp2_reconnect(struct work_struct *work)
915 lu->retries = 0; 957 lu->retries = 0;
916 PREPARE_DELAYED_WORK(&lu->work, sbp2_login); 958 PREPARE_DELAYED_WORK(&lu->work, sbp2_login);
917 } 959 }
918 if (queue_delayed_work(sbp2_wq, &lu->work, DIV_ROUND_UP(HZ, 5))) 960 sbp2_queue_work(lu, DIV_ROUND_UP(HZ, 5));
919 kref_get(&lu->tgt->kref); 961 goto out;
920 kref_put(&lu->tgt->kref, sbp2_release_target);
921 return;
922 } 962 }
923 963
924 lu->generation = generation; 964 lu->generation = generation;
@@ -930,8 +970,8 @@ static void sbp2_reconnect(struct work_struct *work)
930 970
931 sbp2_agent_reset(lu); 971 sbp2_agent_reset(lu);
932 sbp2_cancel_orbs(lu); 972 sbp2_cancel_orbs(lu);
933 973 out:
934 kref_put(&lu->tgt->kref, sbp2_release_target); 974 sbp2_target_put(lu->tgt);
935} 975}
936 976
937static void sbp2_update(struct fw_unit *unit) 977static void sbp2_update(struct fw_unit *unit)
@@ -947,8 +987,7 @@ static void sbp2_update(struct fw_unit *unit)
947 */ 987 */
948 list_for_each_entry(lu, &tgt->lu_list, link) { 988 list_for_each_entry(lu, &tgt->lu_list, link) {
949 lu->retries = 0; 989 lu->retries = 0;
950 if (queue_delayed_work(sbp2_wq, &lu->work, 0)) 990 sbp2_queue_work(lu, 0);
951 kref_get(&tgt->kref);
952 } 991 }
953} 992}
954 993
@@ -1103,9 +1142,9 @@ sbp2_map_scatterlist(struct sbp2_command_orb *orb, struct fw_device *device,
1103 * elements larger than 65535 bytes, some IOMMUs may merge sg elements 1142 * elements larger than 65535 bytes, some IOMMUs may merge sg elements
1104 * during DMA mapping, and Linux currently doesn't prevent this. 1143 * during DMA mapping, and Linux currently doesn't prevent this.
1105 */ 1144 */
1106 for (i = 0, j = 0; i < count; i++) { 1145 for (i = 0, j = 0; i < count; i++, sg = sg_next(sg)) {
1107 sg_len = sg_dma_len(sg + i); 1146 sg_len = sg_dma_len(sg);
1108 sg_addr = sg_dma_address(sg + i); 1147 sg_addr = sg_dma_address(sg);
1109 while (sg_len) { 1148 while (sg_len) {
1110 /* FIXME: This won't get us out of the pinch. */ 1149 /* FIXME: This won't get us out of the pinch. */
1111 if (unlikely(j >= ARRAY_SIZE(orb->page_table))) { 1150 if (unlikely(j >= ARRAY_SIZE(orb->page_table))) {
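The loop above switches from array indexing (sg + i) to sg_next(), which is required once scatterlists can be chained and dma_map_sg() may return fewer entries than were passed in (the comment above notes that some IOMMUs merge elements during mapping). A sketch of the idiomatic walk, where fill_page_table_entry() is a hypothetical stand-in for the driver-specific consumer:

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

/* stand-in for whatever the driver does with each mapped segment */
static void fill_page_table_entry(dma_addr_t addr, unsigned int len);

static void walk_mapped_sg(struct scatterlist *sglist, int count)
{
	struct scatterlist *sg;
	int i;

	/* 'count' is what dma_map_sg() returned, not the original entry count */
	for_each_sg(sglist, sg, count, i)
		fill_page_table_entry(sg_dma_address(sg), sg_dma_len(sg));
}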
diff --git a/drivers/firewire/fw-topology.c b/drivers/firewire/fw-topology.c
index 0fc9b000e99d..172c1867e9aa 100644
--- a/drivers/firewire/fw-topology.c
+++ b/drivers/firewire/fw-topology.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/wait.h> 22#include <linux/wait.h>
23#include <linux/errno.h> 23#include <linux/errno.h>
24#include <asm/system.h>
24#include "fw-transaction.h" 25#include "fw-transaction.h"
25#include "fw-topology.h" 26#include "fw-topology.h"
26 27
@@ -518,6 +519,11 @@ fw_core_handle_bus_reset(struct fw_card *card,
518 card->bm_retries = 0; 519 card->bm_retries = 0;
519 520
520 card->node_id = node_id; 521 card->node_id = node_id;
522 /*
523 * Update node_id before generation to prevent anybody from using
524 * a stale node_id together with a current generation.
525 */
526 smp_wmb();
521 card->generation = generation; 527 card->generation = generation;
522 card->reset_jiffies = jiffies; 528 card->reset_jiffies = jiffies;
523 schedule_delayed_work(&card->work, 0); 529 schedule_delayed_work(&card->work, 0);
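This smp_wmb() orders the node_id store before the generation store, so that a reader who loads the generation first and then the node ID, with an smp_rmb() in between as in the fw-sbp2.c hunks above, can never see a stale node ID paired with a newer generation. In generic form, with a hypothetical obj:

/* writer: publish the data before bumping the generation */
obj->node_id = new_node_id;
smp_wmb();
obj->generation = new_generation;

/* reader: load the generation first, then the data it guards */
generation = obj->generation;
smp_rmb();
node_id = obj->node_id;		/* not older than the generation read above */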
diff --git a/drivers/firewire/fw-transaction.c b/drivers/firewire/fw-transaction.c
index c00d4a9b39e5..7fcc59dedf08 100644
--- a/drivers/firewire/fw-transaction.c
+++ b/drivers/firewire/fw-transaction.c
@@ -153,7 +153,7 @@ fw_fill_request(struct fw_packet *packet, int tcode, int tlabel,
153 int ext_tcode; 153 int ext_tcode;
154 154
155 if (tcode > 0x10) { 155 if (tcode > 0x10) {
156 ext_tcode = tcode - 0x10; 156 ext_tcode = tcode & ~0x10;
157 tcode = TCODE_LOCK_REQUEST; 157 tcode = TCODE_LOCK_REQUEST;
158 } else 158 } else
159 ext_tcode = 0; 159 ext_tcode = 0;
@@ -650,7 +650,7 @@ fw_core_handle_request(struct fw_card *card, struct fw_packet *p)
650 HEADER_GET_OFFSET_HIGH(p->header[1]) << 32) | p->header[2]; 650 HEADER_GET_OFFSET_HIGH(p->header[1]) << 32) | p->header[2];
651 tcode = HEADER_GET_TCODE(p->header[0]); 651 tcode = HEADER_GET_TCODE(p->header[0]);
652 destination = HEADER_GET_DESTINATION(p->header[0]); 652 destination = HEADER_GET_DESTINATION(p->header[0]);
653 source = HEADER_GET_SOURCE(p->header[0]); 653 source = HEADER_GET_SOURCE(p->header[1]);
654 654
655 spin_lock_irqsave(&address_handler_lock, flags); 655 spin_lock_irqsave(&address_handler_lock, flags);
656 handler = lookup_enclosing_address_handler(&address_handler_list, 656 handler = lookup_enclosing_address_handler(&address_handler_list,
diff --git a/drivers/ieee1394/dma.c b/drivers/ieee1394/dma.c
index 7c4eb39b7024..73685e7dc7e4 100644
--- a/drivers/ieee1394/dma.c
+++ b/drivers/ieee1394/dma.c
@@ -231,37 +231,24 @@ void dma_region_sync_for_device(struct dma_region *dma, unsigned long offset,
231 231
232#ifdef CONFIG_MMU 232#ifdef CONFIG_MMU
233 233
234/* nopage() handler for mmap access */ 234static int dma_region_pagefault(struct vm_area_struct *vma,
235 235 struct vm_fault *vmf)
236static struct page *dma_region_pagefault(struct vm_area_struct *area,
237 unsigned long address, int *type)
238{ 236{
239 unsigned long offset; 237 struct dma_region *dma = (struct dma_region *)vma->vm_private_data;
240 unsigned long kernel_virt_addr;
241 struct page *ret = NOPAGE_SIGBUS;
242
243 struct dma_region *dma = (struct dma_region *)area->vm_private_data;
244 238
245 if (!dma->kvirt) 239 if (!dma->kvirt)
246 goto out; 240 return VM_FAULT_SIGBUS;
247 241
248 if ((address < (unsigned long)area->vm_start) || 242 if (vmf->pgoff >= dma->n_pages)
249 (address > 243 return VM_FAULT_SIGBUS;
250 (unsigned long)area->vm_start + (dma->n_pages << PAGE_SHIFT))) 244
251 goto out; 245 vmf->page = vmalloc_to_page(dma->kvirt + (vmf->pgoff << PAGE_SHIFT));
252 246 get_page(vmf->page);
253 if (type) 247 return 0;
254 *type = VM_FAULT_MINOR;
255 offset = address - area->vm_start;
256 kernel_virt_addr = (unsigned long)dma->kvirt + offset;
257 ret = vmalloc_to_page((void *)kernel_virt_addr);
258 get_page(ret);
259 out:
260 return ret;
261} 248}
262 249
263static struct vm_operations_struct dma_region_vm_ops = { 250static struct vm_operations_struct dma_region_vm_ops = {
264 .nopage = dma_region_pagefault, 251 .fault = dma_region_pagefault,
265}; 252};
266 253
267/** 254/**
@@ -275,7 +262,7 @@ int dma_region_mmap(struct dma_region *dma, struct file *file,
275 if (!dma->kvirt) 262 if (!dma->kvirt)
276 return -EINVAL; 263 return -EINVAL;
277 264
278 /* must be page-aligned */ 265 /* must be page-aligned (XXX: comment is wrong, we could allow pgoff) */
279 if (vma->vm_pgoff != 0) 266 if (vma->vm_pgoff != 0)
280 return -EINVAL; 267 return -EINVAL;
281 268
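The .fault handler above finds its dma_region through vma->vm_private_data, so it only works if mmap() stored that pointer and installed dma_region_vm_ops. A minimal sketch of that wiring (my_mmap is hypothetical; the real dma_region_mmap() is only partially shown in this hunk and may set additional VM flags):

static int my_mmap(struct file *file, struct vm_area_struct *vma,
		   struct dma_region *dma)
{
	if (!dma->kvirt)		/* nothing allocated yet */
		return -EINVAL;
	if (vma->vm_pgoff != 0)		/* must be page-aligned, as above */
		return -EINVAL;

	vma->vm_private_data = dma;	/* looked up again in the fault handler */
	vma->vm_ops = &dma_region_vm_ops;
	return 0;
}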
diff --git a/drivers/ieee1394/ieee1394_transactions.c b/drivers/ieee1394/ieee1394_transactions.c
index 677989320951..10c3d9f8c038 100644
--- a/drivers/ieee1394/ieee1394_transactions.c
+++ b/drivers/ieee1394/ieee1394_transactions.c
@@ -570,71 +570,3 @@ int hpsb_write(struct hpsb_host *host, nodeid_t node, unsigned int generation,
570 570
571 return retval; 571 return retval;
572} 572}
573
574#if 0
575
576int hpsb_lock(struct hpsb_host *host, nodeid_t node, unsigned int generation,
577 u64 addr, int extcode, quadlet_t * data, quadlet_t arg)
578{
579 struct hpsb_packet *packet;
580 int retval = 0;
581
582 BUG_ON(in_interrupt()); // We can't be called in an interrupt, yet
583
584 packet = hpsb_make_lockpacket(host, node, addr, extcode, data, arg);
585 if (!packet)
586 return -ENOMEM;
587
588 packet->generation = generation;
589 retval = hpsb_send_packet_and_wait(packet);
590 if (retval < 0)
591 goto hpsb_lock_fail;
592
593 retval = hpsb_packet_success(packet);
594
595 if (retval == 0) {
596 *data = packet->data[0];
597 }
598
599 hpsb_lock_fail:
600 hpsb_free_tlabel(packet);
601 hpsb_free_packet(packet);
602
603 return retval;
604}
605
606int hpsb_send_gasp(struct hpsb_host *host, int channel, unsigned int generation,
607 quadlet_t * buffer, size_t length, u32 specifier_id,
608 unsigned int version)
609{
610 struct hpsb_packet *packet;
611 int retval = 0;
612 u16 specifier_id_hi = (specifier_id & 0x00ffff00) >> 8;
613 u8 specifier_id_lo = specifier_id & 0xff;
614
615 HPSB_VERBOSE("Send GASP: channel = %d, length = %Zd", channel, length);
616
617 length += 8;
618
619 packet = hpsb_make_streampacket(host, NULL, length, channel, 3, 0);
620 if (!packet)
621 return -ENOMEM;
622
623 packet->data[0] = cpu_to_be32((host->node_id << 16) | specifier_id_hi);
624 packet->data[1] =
625 cpu_to_be32((specifier_id_lo << 24) | (version & 0x00ffffff));
626
627 memcpy(&(packet->data[2]), buffer, length - 8);
628
629 packet->generation = generation;
630
631 packet->no_waiter = 1;
632
633 retval = hpsb_send_packet(packet);
634 if (retval < 0)
635 hpsb_free_packet(packet);
636
637 return retval;
638}
639
640#endif /* 0 */
diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c
index 372c5c16eb31..969de2a2d633 100644
--- a/drivers/ieee1394/ohci1394.c
+++ b/drivers/ieee1394/ohci1394.c
@@ -2126,10 +2126,14 @@ static void ohci_schedule_iso_tasklets(struct ti_ohci *ohci,
2126 list_for_each_entry(t, &ohci->iso_tasklet_list, link) { 2126 list_for_each_entry(t, &ohci->iso_tasklet_list, link) {
2127 mask = 1 << t->context; 2127 mask = 1 << t->context;
2128 2128
2129 if (t->type == OHCI_ISO_TRANSMIT && tx_event & mask) 2129 if (t->type == OHCI_ISO_TRANSMIT) {
2130 tasklet_schedule(&t->tasklet); 2130 if (tx_event & mask)
2131 else if (rx_event & mask) 2131 tasklet_schedule(&t->tasklet);
2132 tasklet_schedule(&t->tasklet); 2132 } else {
2133 /* OHCI_ISO_RECEIVE or OHCI_ISO_MULTICHANNEL_RECEIVE */
2134 if (rx_event & mask)
2135 tasklet_schedule(&t->tasklet);
2136 }
2133 } 2137 }
2134 2138
2135 spin_unlock_irqrestore(&ohci->iso_tasklet_list_lock, flags); 2139 spin_unlock_irqrestore(&ohci->iso_tasklet_list_lock, flags);
diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c
index cadf0479cce5..37e7e109af38 100644
--- a/drivers/ieee1394/raw1394.c
+++ b/drivers/ieee1394/raw1394.c
@@ -858,7 +858,7 @@ static int arm_read(struct hpsb_host *host, int nodeid, quadlet_t * buffer,
858 int found = 0, size = 0, rcode = -1; 858 int found = 0, size = 0, rcode = -1;
859 struct arm_request_response *arm_req_resp = NULL; 859 struct arm_request_response *arm_req_resp = NULL;
860 860
861 DBGMSG("arm_read called by node: %X" 861 DBGMSG("arm_read called by node: %X "
862 "addr: %4.4x %8.8x length: %Zu", nodeid, 862 "addr: %4.4x %8.8x length: %Zu", nodeid,
863 (u16) ((addr >> 32) & 0xFFFF), (u32) (addr & 0xFFFFFFFF), 863 (u16) ((addr >> 32) & 0xFFFF), (u32) (addr & 0xFFFFFFFF),
864 length); 864 length);
@@ -1012,7 +1012,7 @@ static int arm_write(struct hpsb_host *host, int nodeid, int destid,
1012 int found = 0, size = 0, rcode = -1, length_conflict = 0; 1012 int found = 0, size = 0, rcode = -1, length_conflict = 0;
1013 struct arm_request_response *arm_req_resp = NULL; 1013 struct arm_request_response *arm_req_resp = NULL;
1014 1014
1015 DBGMSG("arm_write called by node: %X" 1015 DBGMSG("arm_write called by node: %X "
1016 "addr: %4.4x %8.8x length: %Zu", nodeid, 1016 "addr: %4.4x %8.8x length: %Zu", nodeid,
1017 (u16) ((addr >> 32) & 0xFFFF), (u32) (addr & 0xFFFFFFFF), 1017 (u16) ((addr >> 32) & 0xFFFF), (u32) (addr & 0xFFFFFFFF),
1018 length); 1018 length);
diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index 1eda11abeb1e..2b889d91e673 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -51,6 +51,7 @@
51 * Grep for inline FIXME comments below. 51 * Grep for inline FIXME comments below.
52 */ 52 */
53 53
54#include <linux/blkdev.h>
54#include <linux/compiler.h> 55#include <linux/compiler.h>
55#include <linux/delay.h> 56#include <linux/delay.h>
56#include <linux/device.h> 57#include <linux/device.h>
@@ -127,17 +128,21 @@ MODULE_PARM_DESC(serialize_io, "Serialize requests coming from SCSI drivers "
127 "(default = Y, faster but buggy = N)"); 128 "(default = Y, faster but buggy = N)");
128 129
129/* 130/*
130 * Bump up max_sectors if you'd like to support very large sized 131 * Adjust max_sectors if you'd like to influence how many sectors each SCSI
131 * transfers. Please note that some older sbp2 bridge chips are broken for 132 * command can transfer at most. Please note that some older SBP-2 bridge
132 * transfers greater or equal to 128KB. Default is a value of 255 133 * chips are broken for transfers greater or equal to 128KB, therefore
133 * sectors, or just under 128KB (at 512 byte sector size). I can note that 134 * max_sectors used to be a safe 255 sectors for many years. We now have a
134 * the Oxsemi sbp2 chipsets have no problems supporting very large 135 * default of 0 here which means that we let the SCSI stack choose a limit.
135 * transfer sizes. 136 *
137 * The SBP2_WORKAROUND_128K_MAX_TRANS flag, if set either in the workarounds
138 * module parameter or in the sbp2_workarounds_table[], will override the
139 * value of max_sectors. We should use sbp2_workarounds_table[] to cover any
140 * bridge chip which becomes known to need the 255 sectors limit.
136 */ 141 */
137static int sbp2_max_sectors = SBP2_MAX_SECTORS; 142static int sbp2_max_sectors;
138module_param_named(max_sectors, sbp2_max_sectors, int, 0444); 143module_param_named(max_sectors, sbp2_max_sectors, int, 0444);
139MODULE_PARM_DESC(max_sectors, "Change max sectors per I/O supported " 144MODULE_PARM_DESC(max_sectors, "Change max sectors per I/O supported "
140 "(default = " __stringify(SBP2_MAX_SECTORS) ")"); 145 "(default = 0 = use SCSI stack's default)");
141 146
142/* 147/*
143 * Exclusive login to sbp2 device? In most cases, the sbp2 driver should 148 * Exclusive login to sbp2 device? In most cases, the sbp2 driver should
@@ -1451,7 +1456,7 @@ static void sbp2_prep_command_orb_sg(struct sbp2_command_orb *orb,
1451 struct sbp2_fwhost_info *hi, 1456 struct sbp2_fwhost_info *hi,
1452 struct sbp2_command_info *cmd, 1457 struct sbp2_command_info *cmd,
1453 unsigned int scsi_use_sg, 1458 unsigned int scsi_use_sg,
1454 struct scatterlist *sgpnt, 1459 struct scatterlist *sg,
1455 u32 orb_direction, 1460 u32 orb_direction,
1456 enum dma_data_direction dma_dir) 1461 enum dma_data_direction dma_dir)
1457{ 1462{
@@ -1461,12 +1466,12 @@ static void sbp2_prep_command_orb_sg(struct sbp2_command_orb *orb,
1461 1466
1462 /* special case if only one element (and less than 64KB in size) */ 1467 /* special case if only one element (and less than 64KB in size) */
1463 if ((scsi_use_sg == 1) && 1468 if ((scsi_use_sg == 1) &&
1464 (sgpnt[0].length <= SBP2_MAX_SG_ELEMENT_LENGTH)) { 1469 (sg_dma_len(sg) <= SBP2_MAX_SG_ELEMENT_LENGTH)) {
1465 1470
1466 cmd->dma_size = sgpnt[0].length; 1471 cmd->dma_size = sg_dma_len(sg);
1467 cmd->dma_type = CMD_DMA_PAGE; 1472 cmd->dma_type = CMD_DMA_PAGE;
1468 cmd->cmd_dma = dma_map_page(hi->host->device.parent, 1473 cmd->cmd_dma = dma_map_page(hi->host->device.parent,
1469 sg_page(&sgpnt[0]), sgpnt[0].offset, 1474 sg_page(sg), sg->offset,
1470 cmd->dma_size, cmd->dma_dir); 1475 cmd->dma_size, cmd->dma_dir);
1471 1476
1472 orb->data_descriptor_lo = cmd->cmd_dma; 1477 orb->data_descriptor_lo = cmd->cmd_dma;
@@ -1477,11 +1482,11 @@ static void sbp2_prep_command_orb_sg(struct sbp2_command_orb *orb,
1477 &cmd->scatter_gather_element[0]; 1482 &cmd->scatter_gather_element[0];
1478 u32 sg_count, sg_len; 1483 u32 sg_count, sg_len;
1479 dma_addr_t sg_addr; 1484 dma_addr_t sg_addr;
1480 int i, count = dma_map_sg(hi->host->device.parent, sgpnt, 1485 int i, count = dma_map_sg(hi->host->device.parent, sg,
1481 scsi_use_sg, dma_dir); 1486 scsi_use_sg, dma_dir);
1482 1487
1483 cmd->dma_size = scsi_use_sg; 1488 cmd->dma_size = scsi_use_sg;
1484 cmd->sge_buffer = sgpnt; 1489 cmd->sge_buffer = sg;
1485 1490
1486 /* use page tables (s/g) */ 1491 /* use page tables (s/g) */
1487 orb->misc |= ORB_SET_PAGE_TABLE_PRESENT(0x1); 1492 orb->misc |= ORB_SET_PAGE_TABLE_PRESENT(0x1);
@@ -1489,9 +1494,9 @@ static void sbp2_prep_command_orb_sg(struct sbp2_command_orb *orb,
1489 1494
1490 /* loop through and fill out our SBP-2 page tables 1495 /* loop through and fill out our SBP-2 page tables
1491 * (and split up anything too large) */ 1496 * (and split up anything too large) */
1492 for (i = 0, sg_count = 0 ; i < count; i++, sgpnt++) { 1497 for (i = 0, sg_count = 0; i < count; i++, sg = sg_next(sg)) {
1493 sg_len = sg_dma_len(sgpnt); 1498 sg_len = sg_dma_len(sg);
1494 sg_addr = sg_dma_address(sgpnt); 1499 sg_addr = sg_dma_address(sg);
1495 while (sg_len) { 1500 while (sg_len) {
1496 sg_element[sg_count].segment_base_lo = sg_addr; 1501 sg_element[sg_count].segment_base_lo = sg_addr;
1497 if (sg_len > SBP2_MAX_SG_ELEMENT_LENGTH) { 1502 if (sg_len > SBP2_MAX_SG_ELEMENT_LENGTH) {
@@ -1521,11 +1526,10 @@ static void sbp2_create_command_orb(struct sbp2_lu *lu,
1521 unchar *scsi_cmd, 1526 unchar *scsi_cmd,
1522 unsigned int scsi_use_sg, 1527 unsigned int scsi_use_sg,
1523 unsigned int scsi_request_bufflen, 1528 unsigned int scsi_request_bufflen,
1524 void *scsi_request_buffer, 1529 struct scatterlist *sg,
1525 enum dma_data_direction dma_dir) 1530 enum dma_data_direction dma_dir)
1526{ 1531{
1527 struct sbp2_fwhost_info *hi = lu->hi; 1532 struct sbp2_fwhost_info *hi = lu->hi;
1528 struct scatterlist *sgpnt = (struct scatterlist *)scsi_request_buffer;
1529 struct sbp2_command_orb *orb = &cmd->command_orb; 1533 struct sbp2_command_orb *orb = &cmd->command_orb;
1530 u32 orb_direction; 1534 u32 orb_direction;
1531 1535
@@ -1560,7 +1564,7 @@ static void sbp2_create_command_orb(struct sbp2_lu *lu,
1560 orb->data_descriptor_lo = 0x0; 1564 orb->data_descriptor_lo = 0x0;
1561 orb->misc |= ORB_SET_DIRECTION(1); 1565 orb->misc |= ORB_SET_DIRECTION(1);
1562 } else 1566 } else
1563 sbp2_prep_command_orb_sg(orb, hi, cmd, scsi_use_sg, sgpnt, 1567 sbp2_prep_command_orb_sg(orb, hi, cmd, scsi_use_sg, sg,
1564 orb_direction, dma_dir); 1568 orb_direction, dma_dir);
1565 1569
1566 sbp2util_cpu_to_be32_buffer(orb, sizeof(*orb)); 1570 sbp2util_cpu_to_be32_buffer(orb, sizeof(*orb));
@@ -1650,7 +1654,6 @@ static int sbp2_send_command(struct sbp2_lu *lu, struct scsi_cmnd *SCpnt,
1650 void (*done)(struct scsi_cmnd *)) 1654 void (*done)(struct scsi_cmnd *))
1651{ 1655{
1652 unchar *scsi_cmd = (unchar *)SCpnt->cmnd; 1656 unchar *scsi_cmd = (unchar *)SCpnt->cmnd;
1653 unsigned int request_bufflen = scsi_bufflen(SCpnt);
1654 struct sbp2_command_info *cmd; 1657 struct sbp2_command_info *cmd;
1655 1658
1656 cmd = sbp2util_allocate_command_orb(lu, SCpnt, done); 1659 cmd = sbp2util_allocate_command_orb(lu, SCpnt, done);
@@ -1658,7 +1661,7 @@ static int sbp2_send_command(struct sbp2_lu *lu, struct scsi_cmnd *SCpnt,
1658 return -EIO; 1661 return -EIO;
1659 1662
1660 sbp2_create_command_orb(lu, cmd, scsi_cmd, scsi_sg_count(SCpnt), 1663 sbp2_create_command_orb(lu, cmd, scsi_cmd, scsi_sg_count(SCpnt),
1661 request_bufflen, scsi_sglist(SCpnt), 1664 scsi_bufflen(SCpnt), scsi_sglist(SCpnt),
1662 SCpnt->sc_data_direction); 1665 SCpnt->sc_data_direction);
1663 sbp2_link_orb_command(lu, cmd); 1666 sbp2_link_orb_command(lu, cmd);
1664 1667
@@ -1987,6 +1990,8 @@ static int sbp2scsi_slave_configure(struct scsi_device *sdev)
1987 sdev->skip_ms_page_8 = 1; 1990 sdev->skip_ms_page_8 = 1;
1988 if (lu->workarounds & SBP2_WORKAROUND_FIX_CAPACITY) 1991 if (lu->workarounds & SBP2_WORKAROUND_FIX_CAPACITY)
1989 sdev->fix_capacity = 1; 1992 sdev->fix_capacity = 1;
1993 if (lu->workarounds & SBP2_WORKAROUND_128K_MAX_TRANS)
1994 blk_queue_max_sectors(sdev->request_queue, 128 * 1024 / 512);
1990 return 0; 1995 return 0;
1991} 1996}
1992 1997
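The new blk_queue_max_sectors() call above caps per-command transfers for bridges with the 128 KiB limitation; since the block layer counts in 512-byte sectors, 128 * 1024 / 512 works out to 256 sectors. A sketch of the same hook for a hypothetical driver (device_needs_128k_limit() is made up for illustration):

#include <linux/blkdev.h>
#include <scsi/scsi_device.h>

static bool device_needs_128k_limit(struct scsi_device *sdev);	/* hypothetical */

static int my_slave_configure(struct scsi_device *sdev)
{
	/* 128 KiB / 512 bytes per sector = 256 sectors */
	if (device_needs_128k_limit(sdev))
		blk_queue_max_sectors(sdev->request_queue, 128 * 1024 / 512);
	return 0;
}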
@@ -2093,9 +2098,6 @@ static int sbp2_module_init(void)
2093 sbp2_shost_template.cmd_per_lun = 1; 2098 sbp2_shost_template.cmd_per_lun = 1;
2094 } 2099 }
2095 2100
2096 if (sbp2_default_workarounds & SBP2_WORKAROUND_128K_MAX_TRANS &&
2097 (sbp2_max_sectors * 512) > (128 * 1024))
2098 sbp2_max_sectors = 128 * 1024 / 512;
2099 sbp2_shost_template.max_sectors = sbp2_max_sectors; 2101 sbp2_shost_template.max_sectors = sbp2_max_sectors;
2100 2102
2101 hpsb_register_highlevel(&sbp2_highlevel); 2103 hpsb_register_highlevel(&sbp2_highlevel);
diff --git a/drivers/ieee1394/sbp2.h b/drivers/ieee1394/sbp2.h
index 333a4bb76743..d2ecb0d8a1bb 100644
--- a/drivers/ieee1394/sbp2.h
+++ b/drivers/ieee1394/sbp2.h
@@ -222,7 +222,6 @@ struct sbp2_status_block {
222 */ 222 */
223 223
224#define SBP2_MAX_SG_ELEMENT_LENGTH 0xf000 224#define SBP2_MAX_SG_ELEMENT_LENGTH 0xf000
225#define SBP2_MAX_SECTORS 255
226/* There is no real limitation of the queue depth (i.e. length of the linked 225/* There is no real limitation of the queue depth (i.e. length of the linked
227 * list of command ORBs) at the target. The chosen depth is merely an 226 * list of command ORBs) at the target. The chosen depth is merely an
228 * implementation detail of the sbp2 driver. */ 227 * implementation detail of the sbp2 driver. */
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index f2d2c7e2c76b..195ce7c12319 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1571,7 +1571,6 @@ static struct scsi_host_template srp_template = {
1571 .this_id = -1, 1571 .this_id = -1,
1572 .cmd_per_lun = SRP_SQ_SIZE, 1572 .cmd_per_lun = SRP_SQ_SIZE,
1573 .use_clustering = ENABLE_CLUSTERING, 1573 .use_clustering = ENABLE_CLUSTERING,
1574 .use_sg_chaining = ENABLE_SG_CHAINING,
1575 .shost_attrs = srp_host_attrs 1574 .shost_attrs = srp_host_attrs
1576}; 1575};
1577 1576
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
deleted file mode 100644
index 656920636cb2..000000000000
--- a/drivers/kvm/Kconfig
+++ /dev/null
@@ -1,54 +0,0 @@
1#
2# KVM configuration
3#
4menuconfig VIRTUALIZATION
5 bool "Virtualization"
6 depends on X86
7 default y
8 ---help---
9 Say Y here to get to see options for using your Linux host to run other
10 operating systems inside virtual machines (guests).
11 This option alone does not add any kernel code.
12
13 If you say N, all options in this submenu will be skipped and disabled.
14
15if VIRTUALIZATION
16
17config KVM
18 tristate "Kernel-based Virtual Machine (KVM) support"
19 depends on X86 && EXPERIMENTAL
20 select PREEMPT_NOTIFIERS
21 select ANON_INODES
22 ---help---
23 Support hosting fully virtualized guest machines using hardware
24 virtualization extensions. You will need a fairly recent
25 processor equipped with virtualization extensions. You will also
26 need to select one or more of the processor modules below.
27
28 This module provides access to the hardware capabilities through
29 a character device node named /dev/kvm.
30
31 To compile this as a module, choose M here: the module
32 will be called kvm.
33
34 If unsure, say N.
35
36config KVM_INTEL
37 tristate "KVM for Intel processors support"
38 depends on KVM
39 ---help---
40 Provides support for KVM on Intel processors equipped with the VT
41 extensions.
42
43config KVM_AMD
44 tristate "KVM for AMD processors support"
45 depends on KVM
46 ---help---
47 Provides support for KVM on AMD processors equipped with the AMD-V
48 (SVM) extensions.
49
50# OK, it's a little counter-intuitive to do this, but it puts it neatly under
51# the virtualization menu.
52source drivers/lguest/Kconfig
53
54endif # VIRTUALIZATION
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
deleted file mode 100644
index e5a8f4d3e973..000000000000
--- a/drivers/kvm/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
1#
2# Makefile for Kernel-based Virtual Machine module
3#
4
5kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
6obj-$(CONFIG_KVM) += kvm.o
7kvm-intel-objs = vmx.o
8obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
9kvm-amd-objs = svm.o
10obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/drivers/kvm/i8259.c b/drivers/kvm/i8259.c
deleted file mode 100644
index a679157bc599..000000000000
--- a/drivers/kvm/i8259.c
+++ /dev/null
@@ -1,450 +0,0 @@
1/*
2 * 8259 interrupt controller emulation
3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 * Authors:
25 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
26 * Port from Qemu.
27 */
28#include <linux/mm.h>
29#include "irq.h"
30
31/*
32 * set irq level. If an edge is detected, then the IRR is set to 1
33 */
34static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
35{
36 int mask;
37 mask = 1 << irq;
38 if (s->elcr & mask) /* level triggered */
39 if (level) {
40 s->irr |= mask;
41 s->last_irr |= mask;
42 } else {
43 s->irr &= ~mask;
44 s->last_irr &= ~mask;
45 }
46 else /* edge triggered */
47 if (level) {
48 if ((s->last_irr & mask) == 0)
49 s->irr |= mask;
50 s->last_irr |= mask;
51 } else
52 s->last_irr &= ~mask;
53}
54
55/*
56 * return the highest priority found in mask (highest = smallest
57 * number). Return 8 if no irq
58 */
59static inline int get_priority(struct kvm_kpic_state *s, int mask)
60{
61 int priority;
62 if (mask == 0)
63 return 8;
64 priority = 0;
65 while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
66 priority++;
67 return priority;
68}
69
70/*
71 * return the pic wanted interrupt. return -1 if none
72 */
73static int pic_get_irq(struct kvm_kpic_state *s)
74{
75 int mask, cur_priority, priority;
76
77 mask = s->irr & ~s->imr;
78 priority = get_priority(s, mask);
79 if (priority == 8)
80 return -1;
81 /*
82 * compute current priority. If special fully nested mode on the
83 * master, the IRQ coming from the slave is not taken into account
84 * for the priority computation.
85 */
86 mask = s->isr;
87 if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
88 mask &= ~(1 << 2);
89 cur_priority = get_priority(s, mask);
90 if (priority < cur_priority)
91 /*
92 * higher priority found: an irq should be generated
93 */
94 return (priority + s->priority_add) & 7;
95 else
96 return -1;
97}
98
99/*
100 * raise irq to CPU if necessary. must be called every time the active
101 * irq may change
102 */
103static void pic_update_irq(struct kvm_pic *s)
104{
105 int irq2, irq;
106
107 irq2 = pic_get_irq(&s->pics[1]);
108 if (irq2 >= 0) {
109 /*
110 * if irq request by slave pic, signal master PIC
111 */
112 pic_set_irq1(&s->pics[0], 2, 1);
113 pic_set_irq1(&s->pics[0], 2, 0);
114 }
115 irq = pic_get_irq(&s->pics[0]);
116 if (irq >= 0)
117 s->irq_request(s->irq_request_opaque, 1);
118 else
119 s->irq_request(s->irq_request_opaque, 0);
120}
121
122void kvm_pic_update_irq(struct kvm_pic *s)
123{
124 pic_update_irq(s);
125}
126
127void kvm_pic_set_irq(void *opaque, int irq, int level)
128{
129 struct kvm_pic *s = opaque;
130
131 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
132 pic_update_irq(s);
133}
134
135/*
136 * acknowledge interrupt 'irq'
137 */
138static inline void pic_intack(struct kvm_kpic_state *s, int irq)
139{
140 if (s->auto_eoi) {
141 if (s->rotate_on_auto_eoi)
142 s->priority_add = (irq + 1) & 7;
143 } else
144 s->isr |= (1 << irq);
145 /*
146 * We don't clear a level sensitive interrupt here
147 */
148 if (!(s->elcr & (1 << irq)))
149 s->irr &= ~(1 << irq);
150}
151
152int kvm_pic_read_irq(struct kvm_pic *s)
153{
154 int irq, irq2, intno;
155
156 irq = pic_get_irq(&s->pics[0]);
157 if (irq >= 0) {
158 pic_intack(&s->pics[0], irq);
159 if (irq == 2) {
160 irq2 = pic_get_irq(&s->pics[1]);
161 if (irq2 >= 0)
162 pic_intack(&s->pics[1], irq2);
163 else
164 /*
165 * spurious IRQ on slave controller
166 */
167 irq2 = 7;
168 intno = s->pics[1].irq_base + irq2;
169 irq = irq2 + 8;
170 } else
171 intno = s->pics[0].irq_base + irq;
172 } else {
173 /*
174 * spurious IRQ on host controller
175 */
176 irq = 7;
177 intno = s->pics[0].irq_base + irq;
178 }
179 pic_update_irq(s);
180
181 return intno;
182}
183
184static void pic_reset(void *opaque)
185{
186 struct kvm_kpic_state *s = opaque;
187
188 s->last_irr = 0;
189 s->irr = 0;
190 s->imr = 0;
191 s->isr = 0;
192 s->priority_add = 0;
193 s->irq_base = 0;
194 s->read_reg_select = 0;
195 s->poll = 0;
196 s->special_mask = 0;
197 s->init_state = 0;
198 s->auto_eoi = 0;
199 s->rotate_on_auto_eoi = 0;
200 s->special_fully_nested_mode = 0;
201 s->init4 = 0;
202}
203
204static void pic_ioport_write(void *opaque, u32 addr, u32 val)
205{
206 struct kvm_kpic_state *s = opaque;
207 int priority, cmd, irq;
208
209 addr &= 1;
210 if (addr == 0) {
211 if (val & 0x10) {
212 pic_reset(s); /* init */
213 /*
214 * deassert a pending interrupt
215 */
216 s->pics_state->irq_request(s->pics_state->
217 irq_request_opaque, 0);
218 s->init_state = 1;
219 s->init4 = val & 1;
220 if (val & 0x02)
221 printk(KERN_ERR "single mode not supported");
222 if (val & 0x08)
223 printk(KERN_ERR
224 "level sensitive irq not supported");
225 } else if (val & 0x08) {
226 if (val & 0x04)
227 s->poll = 1;
228 if (val & 0x02)
229 s->read_reg_select = val & 1;
230 if (val & 0x40)
231 s->special_mask = (val >> 5) & 1;
232 } else {
233 cmd = val >> 5;
234 switch (cmd) {
235 case 0:
236 case 4:
237 s->rotate_on_auto_eoi = cmd >> 2;
238 break;
239 case 1: /* end of interrupt */
240 case 5:
241 priority = get_priority(s, s->isr);
242 if (priority != 8) {
243 irq = (priority + s->priority_add) & 7;
244 s->isr &= ~(1 << irq);
245 if (cmd == 5)
246 s->priority_add = (irq + 1) & 7;
247 pic_update_irq(s->pics_state);
248 }
249 break;
250 case 3:
251 irq = val & 7;
252 s->isr &= ~(1 << irq);
253 pic_update_irq(s->pics_state);
254 break;
255 case 6:
256 s->priority_add = (val + 1) & 7;
257 pic_update_irq(s->pics_state);
258 break;
259 case 7:
260 irq = val & 7;
261 s->isr &= ~(1 << irq);
262 s->priority_add = (irq + 1) & 7;
263 pic_update_irq(s->pics_state);
264 break;
265 default:
266 break; /* no operation */
267 }
268 }
269 } else
270 switch (s->init_state) {
271 case 0: /* normal mode */
272 s->imr = val;
273 pic_update_irq(s->pics_state);
274 break;
275 case 1:
276 s->irq_base = val & 0xf8;
277 s->init_state = 2;
278 break;
279 case 2:
280 if (s->init4)
281 s->init_state = 3;
282 else
283 s->init_state = 0;
284 break;
285 case 3:
286 s->special_fully_nested_mode = (val >> 4) & 1;
287 s->auto_eoi = (val >> 1) & 1;
288 s->init_state = 0;
289 break;
290 }
291}
292
293static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
294{
295 int ret;
296
297 ret = pic_get_irq(s);
298 if (ret >= 0) {
299 if (addr1 >> 7) {
300 s->pics_state->pics[0].isr &= ~(1 << 2);
301 s->pics_state->pics[0].irr &= ~(1 << 2);
302 }
303 s->irr &= ~(1 << ret);
304 s->isr &= ~(1 << ret);
305 if (addr1 >> 7 || ret != 2)
306 pic_update_irq(s->pics_state);
307 } else {
308 ret = 0x07;
309 pic_update_irq(s->pics_state);
310 }
311
312 return ret;
313}
314
315static u32 pic_ioport_read(void *opaque, u32 addr1)
316{
317 struct kvm_kpic_state *s = opaque;
318 unsigned int addr;
319 int ret;
320
321 addr = addr1;
322 addr &= 1;
323 if (s->poll) {
324 ret = pic_poll_read(s, addr1);
325 s->poll = 0;
326 } else
327 if (addr == 0)
328 if (s->read_reg_select)
329 ret = s->isr;
330 else
331 ret = s->irr;
332 else
333 ret = s->imr;
334 return ret;
335}
336
337static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
338{
339 struct kvm_kpic_state *s = opaque;
340 s->elcr = val & s->elcr_mask;
341}
342
343static u32 elcr_ioport_read(void *opaque, u32 addr1)
344{
345 struct kvm_kpic_state *s = opaque;
346 return s->elcr;
347}
348
349static int picdev_in_range(struct kvm_io_device *this, gpa_t addr)
350{
351 switch (addr) {
352 case 0x20:
353 case 0x21:
354 case 0xa0:
355 case 0xa1:
356 case 0x4d0:
357 case 0x4d1:
358 return 1;
359 default:
360 return 0;
361 }
362}
363
364static void picdev_write(struct kvm_io_device *this,
365 gpa_t addr, int len, const void *val)
366{
367 struct kvm_pic *s = this->private;
368 unsigned char data = *(unsigned char *)val;
369
370 if (len != 1) {
371 if (printk_ratelimit())
372 printk(KERN_ERR "PIC: non byte write\n");
373 return;
374 }
375 switch (addr) {
376 case 0x20:
377 case 0x21:
378 case 0xa0:
379 case 0xa1:
380 pic_ioport_write(&s->pics[addr >> 7], addr, data);
381 break;
382 case 0x4d0:
383 case 0x4d1:
384 elcr_ioport_write(&s->pics[addr & 1], addr, data);
385 break;
386 }
387}
388
389static void picdev_read(struct kvm_io_device *this,
390 gpa_t addr, int len, void *val)
391{
392 struct kvm_pic *s = this->private;
393 unsigned char data = 0;
394
395 if (len != 1) {
396 if (printk_ratelimit())
397 printk(KERN_ERR "PIC: non byte read\n");
398 return;
399 }
400 switch (addr) {
401 case 0x20:
402 case 0x21:
403 case 0xa0:
404 case 0xa1:
405 data = pic_ioport_read(&s->pics[addr >> 7], addr);
406 break;
407 case 0x4d0:
408 case 0x4d1:
409 data = elcr_ioport_read(&s->pics[addr & 1], addr);
410 break;
411 }
412 *(unsigned char *)val = data;
413}
414
415/*
416 * callback when PIC0 irq status changed
417 */
418static void pic_irq_request(void *opaque, int level)
419{
420 struct kvm *kvm = opaque;
421 struct kvm_vcpu *vcpu = kvm->vcpus[0];
422
423 pic_irqchip(kvm)->output = level;
424 if (vcpu)
425 kvm_vcpu_kick(vcpu);
426}
427
428struct kvm_pic *kvm_create_pic(struct kvm *kvm)
429{
430 struct kvm_pic *s;
431 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
432 if (!s)
433 return NULL;
434 s->pics[0].elcr_mask = 0xf8;
435 s->pics[1].elcr_mask = 0xde;
436 s->irq_request = pic_irq_request;
437 s->irq_request_opaque = kvm;
438 s->pics[0].pics_state = s;
439 s->pics[1].pics_state = s;
440
441 /*
442 * Initialize PIO device
443 */
444 s->dev.read = picdev_read;
445 s->dev.write = picdev_write;
446 s->dev.in_range = picdev_in_range;
447 s->dev.private = s;
448 kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
449 return s;
450}
diff --git a/drivers/kvm/ioapic.c b/drivers/kvm/ioapic.c
deleted file mode 100644
index c7992e667fdb..000000000000
--- a/drivers/kvm/ioapic.c
+++ /dev/null
@@ -1,388 +0,0 @@
1/*
2 * Copyright (C) 2001 MandrakeSoft S.A.
3 *
4 * MandrakeSoft S.A.
5 * 43, rue d'Aboukir
6 * 75002 Paris - France
7 * http://www.linux-mandrake.com/
8 * http://www.mandrakesoft.com/
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 *
24 * Yunhong Jiang <yunhong.jiang@intel.com>
25 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
26 * Based on Xen 3.1 code.
27 */
28
29#include "kvm.h"
30#include <linux/kvm.h>
31#include <linux/mm.h>
32#include <linux/highmem.h>
33#include <linux/smp.h>
34#include <linux/hrtimer.h>
35#include <linux/io.h>
36#include <asm/processor.h>
37#include <asm/msr.h>
38#include <asm/page.h>
39#include <asm/current.h>
40#include <asm/apicdef.h>
41#include <asm/io_apic.h>
42#include "irq.h"
43/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
44#define ioapic_debug(fmt, arg...)
45static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
46
47static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
48 unsigned long addr,
49 unsigned long length)
50{
51 unsigned long result = 0;
52
53 switch (ioapic->ioregsel) {
54 case IOAPIC_REG_VERSION:
55 result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
56 | (IOAPIC_VERSION_ID & 0xff));
57 break;
58
59 case IOAPIC_REG_APIC_ID:
60 case IOAPIC_REG_ARB_ID:
61 result = ((ioapic->id & 0xf) << 24);
62 break;
63
64 default:
65 {
66 u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
67 u64 redir_content;
68
69 ASSERT(redir_index < IOAPIC_NUM_PINS);
70
71 redir_content = ioapic->redirtbl[redir_index].bits;
72 result = (ioapic->ioregsel & 0x1) ?
73 (redir_content >> 32) & 0xffffffff :
74 redir_content & 0xffffffff;
75 break;
76 }
77 }
78
79 return result;
80}
81
82static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
83{
84 union ioapic_redir_entry *pent;
85
86 pent = &ioapic->redirtbl[idx];
87
88 if (!pent->fields.mask) {
89 ioapic_deliver(ioapic, idx);
90 if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
91 pent->fields.remote_irr = 1;
92 }
93 if (!pent->fields.trig_mode)
94 ioapic->irr &= ~(1 << idx);
95}
96
97static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
98{
99 unsigned index;
100
101 switch (ioapic->ioregsel) {
102 case IOAPIC_REG_VERSION:
103 /* Writes are ignored. */
104 break;
105
106 case IOAPIC_REG_APIC_ID:
107 ioapic->id = (val >> 24) & 0xf;
108 break;
109
110 case IOAPIC_REG_ARB_ID:
111 break;
112
113 default:
114 index = (ioapic->ioregsel - 0x10) >> 1;
115
116 ioapic_debug("change redir index %x val %x", index, val);
117 if (index >= IOAPIC_NUM_PINS)
118 return;
119 if (ioapic->ioregsel & 1) {
120 ioapic->redirtbl[index].bits &= 0xffffffff;
121 ioapic->redirtbl[index].bits |= (u64) val << 32;
122 } else {
123 ioapic->redirtbl[index].bits &= ~0xffffffffULL;
124 ioapic->redirtbl[index].bits |= (u32) val;
125 ioapic->redirtbl[index].fields.remote_irr = 0;
126 }
127 if (ioapic->irr & (1 << index))
128 ioapic_service(ioapic, index);
129 break;
130 }
131}
132
133static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
134 struct kvm_lapic *target,
135 u8 vector, u8 trig_mode, u8 delivery_mode)
136{
137 ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode,
138 delivery_mode);
139
140 ASSERT((delivery_mode == dest_Fixed) ||
141 (delivery_mode == dest_LowestPrio));
142
143 kvm_apic_set_irq(target, vector, trig_mode);
144}
145
146static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
147 u8 dest_mode)
148{
149 u32 mask = 0;
150 int i;
151 struct kvm *kvm = ioapic->kvm;
152 struct kvm_vcpu *vcpu;
153
154 ioapic_debug("dest %d dest_mode %d", dest, dest_mode);
155
156 if (dest_mode == 0) { /* Physical mode. */
157 if (dest == 0xFF) { /* Broadcast. */
158 for (i = 0; i < KVM_MAX_VCPUS; ++i)
159 if (kvm->vcpus[i] && kvm->vcpus[i]->apic)
160 mask |= 1 << i;
161 return mask;
162 }
163 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
164 vcpu = kvm->vcpus[i];
165 if (!vcpu)
166 continue;
167 if (kvm_apic_match_physical_addr(vcpu->apic, dest)) {
168 if (vcpu->apic)
169 mask = 1 << i;
170 break;
171 }
172 }
173 } else if (dest != 0) /* Logical mode, MDA non-zero. */
174 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
175 vcpu = kvm->vcpus[i];
176 if (!vcpu)
177 continue;
178 if (vcpu->apic &&
179 kvm_apic_match_logical_addr(vcpu->apic, dest))
180 mask |= 1 << vcpu->vcpu_id;
181 }
182 ioapic_debug("mask %x", mask);
183 return mask;
184}
185
186static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
187{
188 u8 dest = ioapic->redirtbl[irq].fields.dest_id;
189 u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode;
190 u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode;
191 u8 vector = ioapic->redirtbl[irq].fields.vector;
192 u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
193 u32 deliver_bitmask;
194 struct kvm_lapic *target;
195 struct kvm_vcpu *vcpu;
196 int vcpu_id;
197
198 ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
199 "vector=%x trig_mode=%x",
200 dest, dest_mode, delivery_mode, vector, trig_mode);
201
202 deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
203 if (!deliver_bitmask) {
204 ioapic_debug("no target on destination");
205 return;
206 }
207
208 switch (delivery_mode) {
209 case dest_LowestPrio:
210 target =
211 kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask);
212 if (target != NULL)
213 ioapic_inj_irq(ioapic, target, vector,
214 trig_mode, delivery_mode);
215 else
216 ioapic_debug("null round robin: "
217 "mask=%x vector=%x delivery_mode=%x",
218 deliver_bitmask, vector, dest_LowestPrio);
219 break;
220 case dest_Fixed:
221 for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
222 if (!(deliver_bitmask & (1 << vcpu_id)))
223 continue;
224 deliver_bitmask &= ~(1 << vcpu_id);
225 vcpu = ioapic->kvm->vcpus[vcpu_id];
226 if (vcpu) {
227 target = vcpu->apic;
228 ioapic_inj_irq(ioapic, target, vector,
229 trig_mode, delivery_mode);
230 }
231 }
232 break;
233
234 /* TODO: NMI */
235 default:
236 printk(KERN_WARNING "Unsupported delivery mode %d\n",
237 delivery_mode);
238 break;
239 }
240}
241
242void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
243{
244 u32 old_irr = ioapic->irr;
245 u32 mask = 1 << irq;
246 union ioapic_redir_entry entry;
247
248 if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
249 entry = ioapic->redirtbl[irq];
250 level ^= entry.fields.polarity;
251 if (!level)
252 ioapic->irr &= ~mask;
253 else {
254 ioapic->irr |= mask;
255 if ((!entry.fields.trig_mode && old_irr != ioapic->irr)
256 || !entry.fields.remote_irr)
257 ioapic_service(ioapic, irq);
258 }
259 }
260}
261
262static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
263{
264 int i;
265
266 for (i = 0; i < IOAPIC_NUM_PINS; i++)
267 if (ioapic->redirtbl[i].fields.vector == vector)
268 return i;
269 return -1;
270}
271
272void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
273{
274 struct kvm_ioapic *ioapic = kvm->vioapic;
275 union ioapic_redir_entry *ent;
276 int gsi;
277
278 gsi = get_eoi_gsi(ioapic, vector);
279 if (gsi == -1) {
280 printk(KERN_WARNING "Can't find redir item for %d EOI\n",
281 vector);
282 return;
283 }
284
285 ent = &ioapic->redirtbl[gsi];
286 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
287
288 ent->fields.remote_irr = 0;
289 if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
290 ioapic_deliver(ioapic, gsi);
291}
292
293static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
294{
295 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
296
297 return ((addr >= ioapic->base_address &&
298 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
299}
300
301static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
302 void *val)
303{
304 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
305 u32 result;
306
307 ioapic_debug("addr %lx", (unsigned long)addr);
308 ASSERT(!(addr & 0xf)); /* check alignment */
309
310 addr &= 0xff;
311 switch (addr) {
312 case IOAPIC_REG_SELECT:
313 result = ioapic->ioregsel;
314 break;
315
316 case IOAPIC_REG_WINDOW:
317 result = ioapic_read_indirect(ioapic, addr, len);
318 break;
319
320 default:
321 result = 0;
322 break;
323 }
324 switch (len) {
325 case 8:
326 *(u64 *) val = result;
327 break;
328 case 1:
329 case 2:
330 case 4:
331 memcpy(val, (char *)&result, len);
332 break;
333 default:
334 printk(KERN_WARNING "ioapic: wrong length %d\n", len);
335 }
336}
337
338static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
339 const void *val)
340{
341 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
342 u32 data;
343
344 ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n",
345 addr, len, val);
346 ASSERT(!(addr & 0xf)); /* check alignment */
347 if (len == 4 || len == 8)
348 data = *(u32 *) val;
349 else {
350 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
351 return;
352 }
353
354 addr &= 0xff;
355 switch (addr) {
356 case IOAPIC_REG_SELECT:
357 ioapic->ioregsel = data;
358 break;
359
360 case IOAPIC_REG_WINDOW:
361 ioapic_write_indirect(ioapic, data);
362 break;
363
364 default:
365 break;
366 }
367}
368
369int kvm_ioapic_init(struct kvm *kvm)
370{
371 struct kvm_ioapic *ioapic;
372 int i;
373
374 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
375 if (!ioapic)
376 return -ENOMEM;
377 kvm->vioapic = ioapic;
378 for (i = 0; i < IOAPIC_NUM_PINS; i++)
379 ioapic->redirtbl[i].fields.mask = 1;
380 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
381 ioapic->dev.read = ioapic_mmio_read;
382 ioapic->dev.write = ioapic_mmio_write;
383 ioapic->dev.in_range = ioapic_in_range;
384 ioapic->dev.private = ioapic;
385 ioapic->kvm = kvm;
386 kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev);
387 return 0;
388}
diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c
deleted file mode 100644
index 7628c7ff628f..000000000000
--- a/drivers/kvm/irq.c
+++ /dev/null
@@ -1,98 +0,0 @@
1/*
2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#include <linux/module.h>
23
24#include "kvm.h"
25#include "irq.h"
26
27/*
28 * check if there is pending interrupt without
29 * intack.
30 */
31int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
32{
33 struct kvm_pic *s;
34
35 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
36 if (kvm_apic_accept_pic_intr(v)) {
37 s = pic_irqchip(v->kvm); /* PIC */
38 return s->output;
39 } else
40 return 0;
41 }
42 return 1;
43}
44EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
45
46/*
47 * Read pending interrupt vector and intack.
48 */
49int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
50{
51 struct kvm_pic *s;
52 int vector;
53
54 vector = kvm_get_apic_interrupt(v); /* APIC */
55 if (vector == -1) {
56 if (kvm_apic_accept_pic_intr(v)) {
57 s = pic_irqchip(v->kvm);
58 s->output = 0; /* PIC */
59 vector = kvm_pic_read_irq(s);
60 }
61 }
62 return vector;
63}
64EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
65
66static void vcpu_kick_intr(void *info)
67{
68#ifdef DEBUG
69 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
70 printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
71#endif
72}
73
74void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
75{
76 int ipi_pcpu = vcpu->cpu;
77
78 if (waitqueue_active(&vcpu->wq)) {
79 wake_up_interruptible(&vcpu->wq);
80 ++vcpu->stat.halt_wakeup;
81 }
82 if (vcpu->guest_mode)
83 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
84}
85
86void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
87{
88 kvm_inject_apic_timer_irqs(vcpu);
89 /* TODO: PIT, RTC etc. */
90}
91EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
92
93void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
94{
95 kvm_apic_timer_intr_post(vcpu, vec);
96 /* TODO: PIT, RTC etc. */
97}
98EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
deleted file mode 100644
index 11fc014e2b30..000000000000
--- a/drivers/kvm/irq.h
+++ /dev/null
@@ -1,165 +0,0 @@
1/*
2 * irq.h: in kernel interrupt controller related definitions
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#ifndef __IRQ_H
23#define __IRQ_H
24
25#include "kvm.h"
26
27typedef void irq_request_func(void *opaque, int level);
28
29struct kvm_kpic_state {
30 u8 last_irr; /* edge detection */
31 u8 irr; /* interrupt request register */
32 u8 imr; /* interrupt mask register */
33 u8 isr; /* interrupt service register */
34 u8 priority_add; /* highest irq priority */
35 u8 irq_base;
36 u8 read_reg_select;
37 u8 poll;
38 u8 special_mask;
39 u8 init_state;
40 u8 auto_eoi;
41 u8 rotate_on_auto_eoi;
42 u8 special_fully_nested_mode;
43 u8 init4; /* true if 4 byte init */
44 u8 elcr; /* PIIX edge/trigger selection */
45 u8 elcr_mask;
46 struct kvm_pic *pics_state;
47};
48
49struct kvm_pic {
50 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
51 irq_request_func *irq_request;
52 void *irq_request_opaque;
53 int output; /* intr from master PIC */
54 struct kvm_io_device dev;
55};
56
57struct kvm_pic *kvm_create_pic(struct kvm *kvm);
58void kvm_pic_set_irq(void *opaque, int irq, int level);
59int kvm_pic_read_irq(struct kvm_pic *s);
60int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
61int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
62void kvm_pic_update_irq(struct kvm_pic *s);
63
64#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
65#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
66#define IOAPIC_EDGE_TRIG 0
67#define IOAPIC_LEVEL_TRIG 1
68
69#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
70#define IOAPIC_MEM_LENGTH 0x100
71
72/* Direct registers. */
73#define IOAPIC_REG_SELECT 0x00
74#define IOAPIC_REG_WINDOW 0x10
75#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
76
77/* Indirect registers. */
78#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
79#define IOAPIC_REG_VERSION 0x01
80#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
81
82struct kvm_ioapic {
83 u64 base_address;
84 u32 ioregsel;
85 u32 id;
86 u32 irr;
87 u32 pad;
88 union ioapic_redir_entry {
89 u64 bits;
90 struct {
91 u8 vector;
92 u8 delivery_mode:3;
93 u8 dest_mode:1;
94 u8 delivery_status:1;
95 u8 polarity:1;
96 u8 remote_irr:1;
97 u8 trig_mode:1;
98 u8 mask:1;
99 u8 reserve:7;
100 u8 reserved[4];
101 u8 dest_id;
102 } fields;
103 } redirtbl[IOAPIC_NUM_PINS];
104 struct kvm_io_device dev;
105 struct kvm *kvm;
106};
107
108struct kvm_lapic {
109 unsigned long base_address;
110 struct kvm_io_device dev;
111 struct {
112 atomic_t pending;
113 s64 period; /* unit: ns */
114 u32 divide_count;
115 ktime_t last_update;
116 struct hrtimer dev;
117 } timer;
118 struct kvm_vcpu *vcpu;
119 struct page *regs_page;
120 void *regs;
121};
122
123#ifdef DEBUG
124#define ASSERT(x) \
125do { \
126 if (!(x)) { \
127 printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
128 __FILE__, __LINE__, #x); \
129 BUG(); \
130 } \
131} while (0)
132#else
133#define ASSERT(x) do { } while (0)
134#endif
135
136void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
137int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
138int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
139int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
140int kvm_create_lapic(struct kvm_vcpu *vcpu);
141void kvm_lapic_reset(struct kvm_vcpu *vcpu);
142void kvm_free_apic(struct kvm_lapic *apic);
143u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
144void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
145void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
146struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
147 unsigned long bitmap);
148u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
149void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
150int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
151void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
152int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
153int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig);
154void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
155int kvm_ioapic_init(struct kvm *kvm);
156void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
157int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
158int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
159void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
160void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
161void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
162void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
163void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
164
165#endif
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
deleted file mode 100644
index 3b0bc4bda5f2..000000000000
--- a/drivers/kvm/kvm.h
+++ /dev/null
@@ -1,796 +0,0 @@
1#ifndef __KVM_H
2#define __KVM_H
3
4/*
5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory.
7 */
8
9#include <linux/types.h>
10#include <linux/list.h>
11#include <linux/mutex.h>
12#include <linux/spinlock.h>
13#include <linux/signal.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/preempt.h>
17#include <asm/signal.h>
18
19#include <linux/kvm.h>
20#include <linux/kvm_para.h>
21
22#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
23#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
24#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL)
25
26#define KVM_GUEST_CR0_MASK \
27 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
28 | X86_CR0_NW | X86_CR0_CD)
29#define KVM_VM_CR0_ALWAYS_ON \
30 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
31 | X86_CR0_MP)
32#define KVM_GUEST_CR4_MASK \
33 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
34#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
35#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
36
37#define INVALID_PAGE (~(hpa_t)0)
38#define UNMAPPED_GVA (~(gpa_t)0)
39
40#define KVM_MAX_VCPUS 4
41#define KVM_ALIAS_SLOTS 4
42#define KVM_MEMORY_SLOTS 8
43#define KVM_NUM_MMU_PAGES 1024
44#define KVM_MIN_FREE_MMU_PAGES 5
45#define KVM_REFILL_PAGES 25
46#define KVM_MAX_CPUID_ENTRIES 40
47
48#define DE_VECTOR 0
49#define NM_VECTOR 7
50#define DF_VECTOR 8
51#define TS_VECTOR 10
52#define NP_VECTOR 11
53#define SS_VECTOR 12
54#define GP_VECTOR 13
55#define PF_VECTOR 14
56
57#define SELECTOR_TI_MASK (1 << 2)
58#define SELECTOR_RPL_MASK 0x03
59
60#define IOPL_SHIFT 12
61
62#define KVM_PIO_PAGE_OFFSET 1
63
64/*
65 * vcpu->requests bit members
66 */
67#define KVM_TLB_FLUSH 0
68
69/*
70 * Address types:
71 *
72 * gva - guest virtual address
73 * gpa - guest physical address
74 * gfn - guest frame number
75 * hva - host virtual address
76 * hpa - host physical address
77 * hfn - host frame number
78 */
79
80typedef unsigned long gva_t;
81typedef u64 gpa_t;
82typedef unsigned long gfn_t;
83
84typedef unsigned long hva_t;
85typedef u64 hpa_t;
86typedef unsigned long hfn_t;
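/*
 * Illustrative helpers (not part of the original file): the gfn/gpa
 * relationship implied by the typedefs above is just a PAGE_SHIFT.
 */
static inline gpa_t example_gfn_to_gpa(gfn_t gfn)
{
	return (gpa_t)gfn << PAGE_SHIFT;
}

static inline gfn_t example_gpa_to_gfn(gpa_t gpa)
{
	return (gfn_t)(gpa >> PAGE_SHIFT);
}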
87
88#define NR_PTE_CHAIN_ENTRIES 5
89
90struct kvm_pte_chain {
91 u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
92 struct hlist_node link;
93};
94
95/*
96 * kvm_mmu_page_role, below, is defined as:
97 *
98 * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
99 * bits 4:7 - page table level for this shadow (1-4)
100 * bits 8:9 - page table quadrant for 2-level guests
101 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
102 * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde
103 */
104union kvm_mmu_page_role {
105 unsigned word;
106 struct {
107 unsigned glevels : 4;
108 unsigned level : 4;
109 unsigned quadrant : 2;
110 unsigned pad_for_nice_hex_output : 6;
111 unsigned metaphysical : 1;
112 unsigned hugepage_access : 3;
113 };
114};
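/*
 * Illustrative only (not in the original header): building a lookup role
 * for, say, a level-2 shadow page of a 2-level guest, following the bit
 * layout documented above.
 */
static inline union kvm_mmu_page_role example_mmu_role(void)
{
	union kvm_mmu_page_role role;

	role.word = 0;
	role.glevels = 2;	/* 2-level guest paging */
	role.level = 2;		/* this shadow page maps a level-2 table */
	role.quadrant = 0;
	role.metaphysical = 0;
	role.hugepage_access = 0;
	return role;
}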
115
116struct kvm_mmu_page {
117 struct list_head link;
118 struct hlist_node hash_link;
119
120 /*
121 * The following two entries are used to key the shadow page in the
122 * hash table.
123 */
124 gfn_t gfn;
125 union kvm_mmu_page_role role;
126
127 u64 *spt;
128 unsigned long slot_bitmap; /* One bit set per slot which has memory
129 * in this shadow page.
130 */
131 int multimapped; /* More than one parent_pte? */
132 int root_count; /* Currently serving as active root */
133 union {
134 u64 *parent_pte; /* !multimapped */
135 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
136 };
137};
138
139struct kvm_vcpu;
140extern struct kmem_cache *kvm_vcpu_cache;
141
142/*
143 * x86 supports 3 paging modes (4-level 64-bit, 3-level 32-bit PAE, and 2-level
144 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
145 * mode.
146 */
147struct kvm_mmu {
148 void (*new_cr3)(struct kvm_vcpu *vcpu);
149 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
150 void (*free)(struct kvm_vcpu *vcpu);
151 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
152 hpa_t root_hpa;
153 int root_level;
154 int shadow_root_level;
155
156 u64 *pae_root;
157};
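/*
 * Illustrative usage (not part of the original file): callers go through the
 * per-mode hooks rather than walking paging structures directly, e.g.
 *
 *	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
 *
 * as emulator_read_std() does in kvm_main.c.
 */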
158
159#define KVM_NR_MEM_OBJS 20
160
161/*
162 * We don't want allocation failures within the mmu code, so we preallocate
163 * enough memory for a single page fault in a cache.
164 */
165struct kvm_mmu_memory_cache {
166 int nobjs;
167 void *objects[KVM_NR_MEM_OBJS];
168};
169
170struct kvm_guest_debug {
171 int enabled;
172 unsigned long bp[4];
173 int singlestep;
174};
175
176enum {
177 VCPU_REGS_RAX = 0,
178 VCPU_REGS_RCX = 1,
179 VCPU_REGS_RDX = 2,
180 VCPU_REGS_RBX = 3,
181 VCPU_REGS_RSP = 4,
182 VCPU_REGS_RBP = 5,
183 VCPU_REGS_RSI = 6,
184 VCPU_REGS_RDI = 7,
185#ifdef CONFIG_X86_64
186 VCPU_REGS_R8 = 8,
187 VCPU_REGS_R9 = 9,
188 VCPU_REGS_R10 = 10,
189 VCPU_REGS_R11 = 11,
190 VCPU_REGS_R12 = 12,
191 VCPU_REGS_R13 = 13,
192 VCPU_REGS_R14 = 14,
193 VCPU_REGS_R15 = 15,
194#endif
195 NR_VCPU_REGS
196};
197
198enum {
199 VCPU_SREG_CS,
200 VCPU_SREG_DS,
201 VCPU_SREG_ES,
202 VCPU_SREG_FS,
203 VCPU_SREG_GS,
204 VCPU_SREG_SS,
205 VCPU_SREG_TR,
206 VCPU_SREG_LDTR,
207};
208
209struct kvm_pio_request {
210 unsigned long count;
211 int cur_count;
212 struct page *guest_pages[2];
213 unsigned guest_page_offset;
214 int in;
215 int port;
216 int size;
217 int string;
218 int down;
219 int rep;
220};
221
222struct kvm_stat {
223 u32 pf_fixed;
224 u32 pf_guest;
225 u32 tlb_flush;
226 u32 invlpg;
227
228 u32 exits;
229 u32 io_exits;
230 u32 mmio_exits;
231 u32 signal_exits;
232 u32 irq_window_exits;
233 u32 halt_exits;
234 u32 halt_wakeup;
235 u32 request_irq_exits;
236 u32 irq_exits;
237 u32 light_exits;
238 u32 efer_reload;
239};
240
241struct kvm_io_device {
242 void (*read)(struct kvm_io_device *this,
243 gpa_t addr,
244 int len,
245 void *val);
246 void (*write)(struct kvm_io_device *this,
247 gpa_t addr,
248 int len,
249 const void *val);
250 int (*in_range)(struct kvm_io_device *this, gpa_t addr);
251 void (*destructor)(struct kvm_io_device *this);
252
253 void *private;
254};
255
256static inline void kvm_iodevice_read(struct kvm_io_device *dev,
257 gpa_t addr,
258 int len,
259 void *val)
260{
261 dev->read(dev, addr, len, val);
262}
263
264static inline void kvm_iodevice_write(struct kvm_io_device *dev,
265 gpa_t addr,
266 int len,
267 const void *val)
268{
269 dev->write(dev, addr, len, val);
270}
271
272static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
273{
274 return dev->in_range(dev, addr);
275}
276
277static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
278{
279 if (dev->destructor)
280 dev->destructor(dev);
281}
282
283/*
284 * It would be nice to use something smarter than a linear search, TBD...
285 * Thankfully we don't expect many devices to register (famous last words :),
286 * so until then it will suffice. At least it's abstracted so we can change it
287 * in one place.
288 */
289struct kvm_io_bus {
290 int dev_count;
291#define NR_IOBUS_DEVS 6
292 struct kvm_io_device *devs[NR_IOBUS_DEVS];
293};
294
295void kvm_io_bus_init(struct kvm_io_bus *bus);
296void kvm_io_bus_destroy(struct kvm_io_bus *bus);
297struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
298void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
299 struct kvm_io_device *dev);
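/*
 * Illustrative sketch (the real kvm_io_bus_find_dev() lives in kvm_main.c):
 * the linear scan the comment above alludes to.
 */
static inline struct kvm_io_device *
example_io_bus_scan(struct kvm_io_bus *bus, gpa_t addr)
{
	int i;

	for (i = 0; i < bus->dev_count; i++)
		if (kvm_iodevice_inrange(bus->devs[i], addr))
			return bus->devs[i];
	return NULL;
}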
300
301struct kvm_vcpu {
302 struct kvm *kvm;
303 struct preempt_notifier preempt_notifier;
304 int vcpu_id;
305 struct mutex mutex;
306 int cpu;
307 u64 host_tsc;
308 struct kvm_run *run;
309 int interrupt_window_open;
310 int guest_mode;
311 unsigned long requests;
312 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
313 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
314 unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
315 unsigned long rip; /* needs vcpu_load_rsp_rip() */
316
317 unsigned long cr0;
318 unsigned long cr2;
319 unsigned long cr3;
320 gpa_t para_state_gpa;
321 struct page *para_state_page;
322 gpa_t hypercall_gpa;
323 unsigned long cr4;
324 unsigned long cr8;
325 u64 pdptrs[4]; /* pae */
326 u64 shadow_efer;
327 u64 apic_base;
328 struct kvm_lapic *apic; /* kernel irqchip context */
329#define VCPU_MP_STATE_RUNNABLE 0
330#define VCPU_MP_STATE_UNINITIALIZED 1
331#define VCPU_MP_STATE_INIT_RECEIVED 2
332#define VCPU_MP_STATE_SIPI_RECEIVED 3
333#define VCPU_MP_STATE_HALTED 4
334 int mp_state;
335 int sipi_vector;
336 u64 ia32_misc_enable_msr;
337
338 struct kvm_mmu mmu;
339
340 struct kvm_mmu_memory_cache mmu_pte_chain_cache;
341 struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
342 struct kvm_mmu_memory_cache mmu_page_cache;
343 struct kvm_mmu_memory_cache mmu_page_header_cache;
344
345 gfn_t last_pt_write_gfn;
346 int last_pt_write_count;
347
348 struct kvm_guest_debug guest_debug;
349
350 struct i387_fxsave_struct host_fx_image;
351 struct i387_fxsave_struct guest_fx_image;
352 int fpu_active;
353 int guest_fpu_loaded;
354
355 int mmio_needed;
356 int mmio_read_completed;
357 int mmio_is_write;
358 int mmio_size;
359 unsigned char mmio_data[8];
360 gpa_t mmio_phys_addr;
361 gva_t mmio_fault_cr2;
362 struct kvm_pio_request pio;
363 void *pio_data;
364 wait_queue_head_t wq;
365
366 int sigset_active;
367 sigset_t sigset;
368
369 struct kvm_stat stat;
370
371 struct {
372 int active;
373 u8 save_iopl;
374 struct kvm_save_segment {
375 u16 selector;
376 unsigned long base;
377 u32 limit;
378 u32 ar;
379 } tr, es, ds, fs, gs;
380 } rmode;
381 int halt_request; /* real mode on Intel only */
382
383 int cpuid_nent;
384 struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
385};
386
387struct kvm_mem_alias {
388 gfn_t base_gfn;
389 unsigned long npages;
390 gfn_t target_gfn;
391};
392
393struct kvm_memory_slot {
394 gfn_t base_gfn;
395 unsigned long npages;
396 unsigned long flags;
397 struct page **phys_mem;
398 unsigned long *dirty_bitmap;
399};
400
401struct kvm {
402 struct mutex lock; /* protects everything except vcpus */
403 int naliases;
404 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
405 int nmemslots;
406 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
407 /*
408 * Hash table of struct kvm_mmu_page.
409 */
410 struct list_head active_mmu_pages;
411 int n_free_mmu_pages;
412 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
413 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
414 unsigned long rmap_overflow;
415 struct list_head vm_list;
416 struct file *filp;
417 struct kvm_io_bus mmio_bus;
418 struct kvm_io_bus pio_bus;
419 struct kvm_pic *vpic;
420 struct kvm_ioapic *vioapic;
421 int round_robin_prev_vcpu;
422};
423
424static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
425{
426 return kvm->vpic;
427}
428
429static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
430{
431 return kvm->vioapic;
432}
433
434static inline int irqchip_in_kernel(struct kvm *kvm)
435{
436 return pic_irqchip(kvm) != 0;
437}
438
439struct descriptor_table {
440 u16 limit;
441 unsigned long base;
442} __attribute__((packed));
443
444struct kvm_x86_ops {
445 int (*cpu_has_kvm_support)(void); /* __init */
446 int (*disabled_by_bios)(void); /* __init */
447 void (*hardware_enable)(void *dummy); /* __init */
448 void (*hardware_disable)(void *dummy);
449 void (*check_processor_compatibility)(void *rtn);
450 int (*hardware_setup)(void); /* __init */
451 void (*hardware_unsetup)(void); /* __exit */
452
453 /* Create, but do not attach this VCPU */
454 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
455 void (*vcpu_free)(struct kvm_vcpu *vcpu);
456 void (*vcpu_reset)(struct kvm_vcpu *vcpu);
457
458 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
459 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
460 void (*vcpu_put)(struct kvm_vcpu *vcpu);
461 void (*vcpu_decache)(struct kvm_vcpu *vcpu);
462
463 int (*set_guest_debug)(struct kvm_vcpu *vcpu,
464 struct kvm_debug_guest *dbg);
465 void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
466 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
467 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
468 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
469 void (*get_segment)(struct kvm_vcpu *vcpu,
470 struct kvm_segment *var, int seg);
471 void (*set_segment)(struct kvm_vcpu *vcpu,
472 struct kvm_segment *var, int seg);
473 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
474 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
475 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
476 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
477 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
478 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
479 void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
480 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
481 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
482 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
483 unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
484 void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
485 int *exception);
486 void (*cache_regs)(struct kvm_vcpu *vcpu);
487 void (*decache_regs)(struct kvm_vcpu *vcpu);
488 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
489 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
490
491 void (*tlb_flush)(struct kvm_vcpu *vcpu);
492 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
493 unsigned long addr, u32 err_code);
494
495 void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
496
497 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
498 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
499 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
500 void (*patch_hypercall)(struct kvm_vcpu *vcpu,
501 unsigned char *hypercall_addr);
502 int (*get_irq)(struct kvm_vcpu *vcpu);
503 void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
504 void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
505 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
506 struct kvm_run *run);
507};
508
509extern struct kvm_x86_ops *kvm_x86_ops;
510
511/* The guest did something we don't support. */
512#define pr_unimpl(vcpu, fmt, ...) \
513 do { \
514 if (printk_ratelimit()) \
515 printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
516 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
517 } while(0)
518
519#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
520#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
521
522int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
523void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
524
525int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
526 struct module *module);
527void kvm_exit_x86(void);
528
529int kvm_mmu_module_init(void);
530void kvm_mmu_module_exit(void);
531
532void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
533int kvm_mmu_create(struct kvm_vcpu *vcpu);
534int kvm_mmu_setup(struct kvm_vcpu *vcpu);
535
536int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
537void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
538void kvm_mmu_zap_all(struct kvm *kvm);
539
540hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
541#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
542#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
543static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
544hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
545struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
546
547extern hpa_t bad_page_address;
548
549struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
550struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
551void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
552
553enum emulation_result {
554 EMULATE_DONE, /* no further processing */
555 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
556 EMULATE_FAIL, /* can't emulate this instruction */
557};
558
559int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
560 unsigned long cr2, u16 error_code);
561void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
562void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
563void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
564void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
565 unsigned long *rflags);
566
567unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
568void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
569 unsigned long *rflags);
570int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
571int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
572
573struct x86_emulate_ctxt;
574
575int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
576 int size, unsigned port);
577int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
578 int size, unsigned long count, int down,
579 gva_t address, int rep, unsigned port);
580void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
581int kvm_emulate_halt(struct kvm_vcpu *vcpu);
582int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
583int emulate_clts(struct kvm_vcpu *vcpu);
584int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr,
585 unsigned long *dest);
586int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
587 unsigned long value);
588
589void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
590void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0);
591void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0);
592void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0);
593unsigned long get_cr8(struct kvm_vcpu *vcpu);
594void lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
595void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
596
597int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
598int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
599
600void fx_init(struct kvm_vcpu *vcpu);
601
602void kvm_resched(struct kvm_vcpu *vcpu);
603void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
604void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
605void kvm_flush_remote_tlbs(struct kvm *kvm);
606
607int emulator_read_std(unsigned long addr,
608 void *val,
609 unsigned int bytes,
610 struct kvm_vcpu *vcpu);
611int emulator_write_emulated(unsigned long addr,
612 const void *val,
613 unsigned int bytes,
614 struct kvm_vcpu *vcpu);
615
616unsigned long segment_base(u16 selector);
617
618void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
619 const u8 *new, int bytes);
620int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
621void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
622int kvm_mmu_load(struct kvm_vcpu *vcpu);
623void kvm_mmu_unload(struct kvm_vcpu *vcpu);
624
625int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
626
627static inline void kvm_guest_enter(void)
628{
629 current->flags |= PF_VCPU;
630}
631
632static inline void kvm_guest_exit(void)
633{
634 current->flags &= ~PF_VCPU;
635}
636
637static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
638 u32 error_code)
639{
640 return vcpu->mmu.page_fault(vcpu, gva, error_code);
641}
642
643static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
644{
645 if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
646 __kvm_mmu_free_some_pages(vcpu);
647}
648
649static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
650{
651 if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
652 return 0;
653
654 return kvm_mmu_load(vcpu);
655}
656
657static inline int is_long_mode(struct kvm_vcpu *vcpu)
658{
659#ifdef CONFIG_X86_64
660 return vcpu->shadow_efer & EFER_LME;
661#else
662 return 0;
663#endif
664}
665
666static inline int is_pae(struct kvm_vcpu *vcpu)
667{
668 return vcpu->cr4 & X86_CR4_PAE;
669}
670
671static inline int is_pse(struct kvm_vcpu *vcpu)
672{
673 return vcpu->cr4 & X86_CR4_PSE;
674}
675
676static inline int is_paging(struct kvm_vcpu *vcpu)
677{
678 return vcpu->cr0 & X86_CR0_PG;
679}
680
681static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
682{
683 return slot - kvm->memslots;
684}
685
686static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
687{
688 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
689
690 return (struct kvm_mmu_page *)page_private(page);
691}
692
693static inline u16 read_fs(void)
694{
695 u16 seg;
696 asm ("mov %%fs, %0" : "=g"(seg));
697 return seg;
698}
699
700static inline u16 read_gs(void)
701{
702 u16 seg;
703 asm ("mov %%gs, %0" : "=g"(seg));
704 return seg;
705}
706
707static inline u16 read_ldt(void)
708{
709 u16 ldt;
710 asm ("sldt %0" : "=g"(ldt));
711 return ldt;
712}
713
714static inline void load_fs(u16 sel)
715{
716 asm ("mov %0, %%fs" : : "rm"(sel));
717}
718
719static inline void load_gs(u16 sel)
720{
721 asm ("mov %0, %%gs" : : "rm"(sel));
722}
723
724#ifndef load_ldt
725static inline void load_ldt(u16 sel)
726{
727 asm ("lldt %0" : : "rm"(sel));
728}
729#endif
730
731static inline void get_idt(struct descriptor_table *table)
732{
733 asm ("sidt %0" : "=m"(*table));
734}
735
736static inline void get_gdt(struct descriptor_table *table)
737{
738 asm ("sgdt %0" : "=m"(*table));
739}
740
741static inline unsigned long read_tr_base(void)
742{
743 u16 tr;
744 asm ("str %0" : "=g"(tr));
745 return segment_base(tr);
746}
747
748#ifdef CONFIG_X86_64
749static inline unsigned long read_msr(unsigned long msr)
750{
751 u64 value;
752
753 rdmsrl(msr, value);
754 return value;
755}
756#endif
757
758static inline void fx_save(struct i387_fxsave_struct *image)
759{
760 asm ("fxsave (%0)":: "r" (image));
761}
762
763static inline void fx_restore(struct i387_fxsave_struct *image)
764{
765 asm ("fxrstor (%0)":: "r" (image));
766}
767
768static inline void fpu_init(void)
769{
770 asm ("finit");
771}
772
773static inline u32 get_rdx_init_val(void)
774{
775 return 0x600; /* P6 family */
776}
777
778#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
779#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
780#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
781#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
782#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
783#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
784#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
785#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
786#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
787
788#define MSR_IA32_TIME_STAMP_COUNTER 0x010
789
790#define TSS_IOPB_BASE_OFFSET 0x66
791#define TSS_BASE_SIZE 0x68
792#define TSS_IOPB_SIZE (65536 / 8)
793#define TSS_REDIRECTION_SIZE (256 / 8)
794#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
795
796#endif
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
deleted file mode 100644
index c0f372f1d761..000000000000
--- a/drivers/kvm/kvm_main.c
+++ /dev/null
@@ -1,3628 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "kvm.h"
19#include "x86_emulate.h"
20#include "segment_descriptor.h"
21#include "irq.h"
22
23#include <linux/kvm.h>
24#include <linux/module.h>
25#include <linux/errno.h>
26#include <linux/percpu.h>
27#include <linux/gfp.h>
28#include <linux/mm.h>
29#include <linux/miscdevice.h>
30#include <linux/vmalloc.h>
31#include <linux/reboot.h>
32#include <linux/debugfs.h>
33#include <linux/highmem.h>
34#include <linux/file.h>
35#include <linux/sysdev.h>
36#include <linux/cpu.h>
37#include <linux/sched.h>
38#include <linux/cpumask.h>
39#include <linux/smp.h>
40#include <linux/anon_inodes.h>
41#include <linux/profile.h>
42
43#include <asm/processor.h>
44#include <asm/msr.h>
45#include <asm/io.h>
46#include <asm/uaccess.h>
47#include <asm/desc.h>
48
49MODULE_AUTHOR("Qumranet");
50MODULE_LICENSE("GPL");
51
52static DEFINE_SPINLOCK(kvm_lock);
53static LIST_HEAD(vm_list);
54
55static cpumask_t cpus_hardware_enabled;
56
57struct kvm_x86_ops *kvm_x86_ops;
58struct kmem_cache *kvm_vcpu_cache;
59EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
60
61static __read_mostly struct preempt_ops kvm_preempt_ops;
62
63#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
64
65static struct kvm_stats_debugfs_item {
66 const char *name;
67 int offset;
68 struct dentry *dentry;
69} debugfs_entries[] = {
70 { "pf_fixed", STAT_OFFSET(pf_fixed) },
71 { "pf_guest", STAT_OFFSET(pf_guest) },
72 { "tlb_flush", STAT_OFFSET(tlb_flush) },
73 { "invlpg", STAT_OFFSET(invlpg) },
74 { "exits", STAT_OFFSET(exits) },
75 { "io_exits", STAT_OFFSET(io_exits) },
76 { "mmio_exits", STAT_OFFSET(mmio_exits) },
77 { "signal_exits", STAT_OFFSET(signal_exits) },
78 { "irq_window", STAT_OFFSET(irq_window_exits) },
79 { "halt_exits", STAT_OFFSET(halt_exits) },
80 { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
81 { "request_irq", STAT_OFFSET(request_irq_exits) },
82 { "irq_exits", STAT_OFFSET(irq_exits) },
83 { "light_exits", STAT_OFFSET(light_exits) },
84 { "efer_reload", STAT_OFFSET(efer_reload) },
85 { NULL }
86};
87
88static struct dentry *debugfs_dir;
89
90#define MAX_IO_MSRS 256
91
92#define CR0_RESERVED_BITS \
93 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
94 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
95 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
96#define CR4_RESERVED_BITS \
97 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
98 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
99 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
100 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
101
102#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
103#define EFER_RESERVED_BITS 0xfffffffffffff2fe
104
105#ifdef CONFIG_X86_64
106/* LDT or TSS descriptor in the GDT. 16 bytes. */
107struct segment_descriptor_64 {
108 struct segment_descriptor s;
109 u32 base_higher;
110 u32 pad_zero;
111};
112
113#endif
114
115static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
116 unsigned long arg);
117
118unsigned long segment_base(u16 selector)
119{
120 struct descriptor_table gdt;
121 struct segment_descriptor *d;
122 unsigned long table_base;
123 typedef unsigned long ul;
124 unsigned long v;
125
126 if (selector == 0)
127 return 0;
128
129 asm ("sgdt %0" : "=m"(gdt));
130 table_base = gdt.base;
131
132 if (selector & 4) { /* from ldt */
133 u16 ldt_selector;
134
135 asm ("sldt %0" : "=g"(ldt_selector));
136 table_base = segment_base(ldt_selector);
137 }
138 d = (struct segment_descriptor *)(table_base + (selector & ~7));
139 v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
140#ifdef CONFIG_X86_64
141 if (d->system == 0
142 && (d->type == 2 || d->type == 9 || d->type == 11))
143 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
144#endif
145 return v;
146}
147EXPORT_SYMBOL_GPL(segment_base);
148
149static inline int valid_vcpu(int n)
150{
151 return likely(n >= 0 && n < KVM_MAX_VCPUS);
152}
153
154void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
155{
156 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
157 return;
158
159 vcpu->guest_fpu_loaded = 1;
160 fx_save(&vcpu->host_fx_image);
161 fx_restore(&vcpu->guest_fx_image);
162}
163EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
164
165void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
166{
167 if (!vcpu->guest_fpu_loaded)
168 return;
169
170 vcpu->guest_fpu_loaded = 0;
171 fx_save(&vcpu->guest_fx_image);
172 fx_restore(&vcpu->host_fx_image);
173}
174EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
175
176/*
177 * Switches to the specified vcpu, until a matching vcpu_put().
178 */
179static void vcpu_load(struct kvm_vcpu *vcpu)
180{
181 int cpu;
182
183 mutex_lock(&vcpu->mutex);
184 cpu = get_cpu();
185 preempt_notifier_register(&vcpu->preempt_notifier);
186 kvm_x86_ops->vcpu_load(vcpu, cpu);
187 put_cpu();
188}
189
190static void vcpu_put(struct kvm_vcpu *vcpu)
191{
192 preempt_disable();
193 kvm_x86_ops->vcpu_put(vcpu);
194 preempt_notifier_unregister(&vcpu->preempt_notifier);
195 preempt_enable();
196 mutex_unlock(&vcpu->mutex);
197}
198
199static void ack_flush(void *_completed)
200{
201}
202
203void kvm_flush_remote_tlbs(struct kvm *kvm)
204{
205 int i, cpu;
206 cpumask_t cpus;
207 struct kvm_vcpu *vcpu;
208
209 cpus_clear(cpus);
210 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
211 vcpu = kvm->vcpus[i];
212 if (!vcpu)
213 continue;
214 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
215 continue;
216 cpu = vcpu->cpu;
217 if (cpu != -1 && cpu != raw_smp_processor_id())
218 cpu_set(cpu, cpus);
219 }
220 smp_call_function_mask(cpus, ack_flush, NULL, 1);
221}
222
223int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
224{
225 struct page *page;
226 int r;
227
228 mutex_init(&vcpu->mutex);
229 vcpu->cpu = -1;
230 vcpu->mmu.root_hpa = INVALID_PAGE;
231 vcpu->kvm = kvm;
232 vcpu->vcpu_id = id;
233 if (!irqchip_in_kernel(kvm) || id == 0)
234 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
235 else
236 vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
237 init_waitqueue_head(&vcpu->wq);
238
239 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
240 if (!page) {
241 r = -ENOMEM;
242 goto fail;
243 }
244 vcpu->run = page_address(page);
245
246 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
247 if (!page) {
248 r = -ENOMEM;
249 goto fail_free_run;
250 }
251 vcpu->pio_data = page_address(page);
252
253 r = kvm_mmu_create(vcpu);
254 if (r < 0)
255 goto fail_free_pio_data;
256
257 return 0;
258
259fail_free_pio_data:
260 free_page((unsigned long)vcpu->pio_data);
261fail_free_run:
262 free_page((unsigned long)vcpu->run);
263fail:
264 return -ENOMEM;
265}
266EXPORT_SYMBOL_GPL(kvm_vcpu_init);
267
268void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
269{
270 kvm_mmu_destroy(vcpu);
271 if (vcpu->apic)
272 hrtimer_cancel(&vcpu->apic->timer.dev);
273 kvm_free_apic(vcpu->apic);
274 free_page((unsigned long)vcpu->pio_data);
275 free_page((unsigned long)vcpu->run);
276}
277EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
278
279static struct kvm *kvm_create_vm(void)
280{
281 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
282
283 if (!kvm)
284 return ERR_PTR(-ENOMEM);
285
286 kvm_io_bus_init(&kvm->pio_bus);
287 mutex_init(&kvm->lock);
288 INIT_LIST_HEAD(&kvm->active_mmu_pages);
289 kvm_io_bus_init(&kvm->mmio_bus);
290 spin_lock(&kvm_lock);
291 list_add(&kvm->vm_list, &vm_list);
292 spin_unlock(&kvm_lock);
293 return kvm;
294}
295
296/*
297 * Free any memory in @free but not in @dont.
298 */
299static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
300 struct kvm_memory_slot *dont)
301{
302 int i;
303
304 if (!dont || free->phys_mem != dont->phys_mem)
305 if (free->phys_mem) {
306 for (i = 0; i < free->npages; ++i)
307 if (free->phys_mem[i])
308 __free_page(free->phys_mem[i]);
309 vfree(free->phys_mem);
310 }
311
312 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
313 vfree(free->dirty_bitmap);
314
315 free->phys_mem = NULL;
316 free->npages = 0;
317 free->dirty_bitmap = NULL;
318}
319
320static void kvm_free_physmem(struct kvm *kvm)
321{
322 int i;
323
324 for (i = 0; i < kvm->nmemslots; ++i)
325 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
326}
327
328static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
329{
330 int i;
331
332 for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
333 if (vcpu->pio.guest_pages[i]) {
334 __free_page(vcpu->pio.guest_pages[i]);
335 vcpu->pio.guest_pages[i] = NULL;
336 }
337}
338
339static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
340{
341 vcpu_load(vcpu);
342 kvm_mmu_unload(vcpu);
343 vcpu_put(vcpu);
344}
345
346static void kvm_free_vcpus(struct kvm *kvm)
347{
348 unsigned int i;
349
350 /*
351 * Unpin any mmu pages first.
352 */
353 for (i = 0; i < KVM_MAX_VCPUS; ++i)
354 if (kvm->vcpus[i])
355 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
356 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
357 if (kvm->vcpus[i]) {
358 kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
359 kvm->vcpus[i] = NULL;
360 }
361 }
362
363}
364
365static void kvm_destroy_vm(struct kvm *kvm)
366{
367 spin_lock(&kvm_lock);
368 list_del(&kvm->vm_list);
369 spin_unlock(&kvm_lock);
370 kvm_io_bus_destroy(&kvm->pio_bus);
371 kvm_io_bus_destroy(&kvm->mmio_bus);
372 kfree(kvm->vpic);
373 kfree(kvm->vioapic);
374 kvm_free_vcpus(kvm);
375 kvm_free_physmem(kvm);
376 kfree(kvm);
377}
378
379static int kvm_vm_release(struct inode *inode, struct file *filp)
380{
381 struct kvm *kvm = filp->private_data;
382
383 kvm_destroy_vm(kvm);
384 return 0;
385}
386
387static void inject_gp(struct kvm_vcpu *vcpu)
388{
389 kvm_x86_ops->inject_gp(vcpu, 0);
390}
391
392/*
393 * Load the PAE pdptrs. Returns true if they are all valid.
394 */
395static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
396{
397 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
398 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
399 int i;
400 u64 *pdpt;
401 int ret;
402 struct page *page;
403 u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
404
405 mutex_lock(&vcpu->kvm->lock);
406 page = gfn_to_page(vcpu->kvm, pdpt_gfn);
407 if (!page) {
408 ret = 0;
409 goto out;
410 }
411
412 pdpt = kmap_atomic(page, KM_USER0);
413 memcpy(pdpte, pdpt+offset, sizeof(pdpte));
414 kunmap_atomic(pdpt, KM_USER0);
415
416 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
417 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
418 ret = 0;
419 goto out;
420 }
421 }
422 ret = 1;
423
424 memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
425out:
426 mutex_unlock(&vcpu->kvm->lock);
427
428 return ret;
429}
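/*
 * Worked example (illustrative, not in the original): for a PAE cr3 of
 * 0x12345020, pdpt_gfn is 0x12345 and offset is ((0x020 >> 5) << 2) = 4
 * u64 slots, i.e. the four pdptes are read from byte offset 32 of that page.
 */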
430
431void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
432{
433 if (cr0 & CR0_RESERVED_BITS) {
434 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
435 cr0, vcpu->cr0);
436 inject_gp(vcpu);
437 return;
438 }
439
440 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
441 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
442 inject_gp(vcpu);
443 return;
444 }
445
446 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
447 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
448 "and a clear PE flag\n");
449 inject_gp(vcpu);
450 return;
451 }
452
453 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
454#ifdef CONFIG_X86_64
455 if ((vcpu->shadow_efer & EFER_LME)) {
456 int cs_db, cs_l;
457
458 if (!is_pae(vcpu)) {
459 printk(KERN_DEBUG "set_cr0: #GP, start paging "
460 "in long mode while PAE is disabled\n");
461 inject_gp(vcpu);
462 return;
463 }
464 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
465 if (cs_l) {
466 printk(KERN_DEBUG "set_cr0: #GP, start paging "
467 "in long mode while CS.L == 1\n");
468 inject_gp(vcpu);
469 return;
470
471 }
472 } else
473#endif
474 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
475 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
476 "reserved bits\n");
477 inject_gp(vcpu);
478 return;
479 }
480
481 }
482
483 kvm_x86_ops->set_cr0(vcpu, cr0);
484 vcpu->cr0 = cr0;
485
486 mutex_lock(&vcpu->kvm->lock);
487 kvm_mmu_reset_context(vcpu);
488 mutex_unlock(&vcpu->kvm->lock);
489 return;
490}
491EXPORT_SYMBOL_GPL(set_cr0);
492
493void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
494{
495 set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
496}
497EXPORT_SYMBOL_GPL(lmsw);
498
499void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
500{
501 if (cr4 & CR4_RESERVED_BITS) {
502 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
503 inject_gp(vcpu);
504 return;
505 }
506
507 if (is_long_mode(vcpu)) {
508 if (!(cr4 & X86_CR4_PAE)) {
509 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
510 "in long mode\n");
511 inject_gp(vcpu);
512 return;
513 }
514 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
515 && !load_pdptrs(vcpu, vcpu->cr3)) {
516 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
517 inject_gp(vcpu);
518 return;
519 }
520
521 if (cr4 & X86_CR4_VMXE) {
522 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
523 inject_gp(vcpu);
524 return;
525 }
526 kvm_x86_ops->set_cr4(vcpu, cr4);
527 vcpu->cr4 = cr4;
528 mutex_lock(&vcpu->kvm->lock);
529 kvm_mmu_reset_context(vcpu);
530 mutex_unlock(&vcpu->kvm->lock);
531}
532EXPORT_SYMBOL_GPL(set_cr4);
533
534void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
535{
536 if (is_long_mode(vcpu)) {
537 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
538 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
539 inject_gp(vcpu);
540 return;
541 }
542 } else {
543 if (is_pae(vcpu)) {
544 if (cr3 & CR3_PAE_RESERVED_BITS) {
545 printk(KERN_DEBUG
546 "set_cr3: #GP, reserved bits\n");
547 inject_gp(vcpu);
548 return;
549 }
550 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
551 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
552 "reserved bits\n");
553 inject_gp(vcpu);
554 return;
555 }
556 } else {
557 if (cr3 & CR3_NONPAE_RESERVED_BITS) {
558 printk(KERN_DEBUG
559 "set_cr3: #GP, reserved bits\n");
560 inject_gp(vcpu);
561 return;
562 }
563 }
564 }
565
566 mutex_lock(&vcpu->kvm->lock);
567 /*
568 * Does the new cr3 value map to physical memory? (Note, we
569 * catch an invalid cr3 even in real-mode, because it would
570 * cause trouble later on when we turn on paging anyway.)
571 *
572 * A real CPU would silently accept an invalid cr3 and would
573 * attempt to use it - with largely undefined (and often hard
574 * to debug) behavior on the guest side.
575 */
576 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
577 inject_gp(vcpu);
578 else {
579 vcpu->cr3 = cr3;
580 vcpu->mmu.new_cr3(vcpu);
581 }
582 mutex_unlock(&vcpu->kvm->lock);
583}
584EXPORT_SYMBOL_GPL(set_cr3);
585
586void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
587{
588 if (cr8 & CR8_RESERVED_BITS) {
589 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
590 inject_gp(vcpu);
591 return;
592 }
593 if (irqchip_in_kernel(vcpu->kvm))
594 kvm_lapic_set_tpr(vcpu, cr8);
595 else
596 vcpu->cr8 = cr8;
597}
598EXPORT_SYMBOL_GPL(set_cr8);
599
600unsigned long get_cr8(struct kvm_vcpu *vcpu)
601{
602 if (irqchip_in_kernel(vcpu->kvm))
603 return kvm_lapic_get_cr8(vcpu);
604 else
605 return vcpu->cr8;
606}
607EXPORT_SYMBOL_GPL(get_cr8);
608
609u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
610{
611 return vcpu->apic_base;
615}
616EXPORT_SYMBOL_GPL(kvm_get_apic_base);
617
618void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
619{
620 /* TODO: reserve bits check */
621 if (irqchip_in_kernel(vcpu->kvm))
622 kvm_lapic_set_base(vcpu, data);
623 else
624 vcpu->apic_base = data;
625}
626EXPORT_SYMBOL_GPL(kvm_set_apic_base);
627
628void fx_init(struct kvm_vcpu *vcpu)
629{
630 unsigned after_mxcsr_mask;
631
632 /* Initialize guest FPU by resetting ours and saving into guest's */
633 preempt_disable();
634 fx_save(&vcpu->host_fx_image);
635 fpu_init();
636 fx_save(&vcpu->guest_fx_image);
637 fx_restore(&vcpu->host_fx_image);
638 preempt_enable();
639
640 vcpu->cr0 |= X86_CR0_ET;
641 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
642 vcpu->guest_fx_image.mxcsr = 0x1f80;
643 memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
644 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
645}
646EXPORT_SYMBOL_GPL(fx_init);
647
648/*
649 * Allocate some memory and give it an address in the guest physical address
650 * space.
651 *
652 * Discontiguous memory is allowed, mostly for framebuffers.
653 */
654static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
655 struct kvm_memory_region *mem)
656{
657 int r;
658 gfn_t base_gfn;
659 unsigned long npages;
660 unsigned long i;
661 struct kvm_memory_slot *memslot;
662 struct kvm_memory_slot old, new;
663
664 r = -EINVAL;
665 /* General sanity checks */
666 if (mem->memory_size & (PAGE_SIZE - 1))
667 goto out;
668 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
669 goto out;
670 if (mem->slot >= KVM_MEMORY_SLOTS)
671 goto out;
672 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
673 goto out;
674
675 memslot = &kvm->memslots[mem->slot];
676 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
677 npages = mem->memory_size >> PAGE_SHIFT;
678
679 if (!npages)
680 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
681
682 mutex_lock(&kvm->lock);
683
684 new = old = *memslot;
685
686 new.base_gfn = base_gfn;
687 new.npages = npages;
688 new.flags = mem->flags;
689
690 /* Disallow changing a memory slot's size. */
691 r = -EINVAL;
692 if (npages && old.npages && npages != old.npages)
693 goto out_unlock;
694
695 /* Check for overlaps */
696 r = -EEXIST;
697 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
698 struct kvm_memory_slot *s = &kvm->memslots[i];
699
700 if (s == memslot)
701 continue;
702 if (!((base_gfn + npages <= s->base_gfn) ||
703 (base_gfn >= s->base_gfn + s->npages)))
704 goto out_unlock;
705 }
706
707 /* Deallocate if slot is being removed */
708 if (!npages)
709 new.phys_mem = NULL;
710
711 /* Free page dirty bitmap if unneeded */
712 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
713 new.dirty_bitmap = NULL;
714
715 r = -ENOMEM;
716
717 /* Allocate if a slot is being created */
718 if (npages && !new.phys_mem) {
719 new.phys_mem = vmalloc(npages * sizeof(struct page *));
720
721 if (!new.phys_mem)
722 goto out_unlock;
723
724 memset(new.phys_mem, 0, npages * sizeof(struct page *));
725 for (i = 0; i < npages; ++i) {
726 new.phys_mem[i] = alloc_page(GFP_HIGHUSER
727 | __GFP_ZERO);
728 if (!new.phys_mem[i])
729 goto out_unlock;
730 set_page_private(new.phys_mem[i], 0);
731 }
732 }
733
734 /* Allocate page dirty bitmap if needed */
735 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
736 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
737
738 new.dirty_bitmap = vmalloc(dirty_bytes);
739 if (!new.dirty_bitmap)
740 goto out_unlock;
741 memset(new.dirty_bitmap, 0, dirty_bytes);
742 }
743
744 if (mem->slot >= kvm->nmemslots)
745 kvm->nmemslots = mem->slot + 1;
746
747 *memslot = new;
748
749 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
750 kvm_flush_remote_tlbs(kvm);
751
752 mutex_unlock(&kvm->lock);
753
754 kvm_free_physmem_slot(&old, &new);
755 return 0;
756
757out_unlock:
758 mutex_unlock(&kvm->lock);
759 kvm_free_physmem_slot(&new, &old);
760out:
761 return r;
762}
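/*
 * Illustrative userspace usage (assumes the contemporary struct
 * kvm_memory_region layout from <linux/kvm.h>; not part of this file):
 * registering 16 MB of guest RAM at guest physical address 0 in slot 0.
 *
 *	struct kvm_memory_region mem = {
 *		.slot = 0,
 *		.flags = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size = 16 << 20,
 *	};
 *	ioctl(vm_fd, KVM_SET_MEMORY_REGION, &mem);
 */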
763
764/*
765 * Get (and clear) the dirty memory log for a memory slot.
766 */
767static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
768 struct kvm_dirty_log *log)
769{
770 struct kvm_memory_slot *memslot;
771 int r, i;
772 int n;
773 unsigned long any = 0;
774
775 mutex_lock(&kvm->lock);
776
777 r = -EINVAL;
778 if (log->slot >= KVM_MEMORY_SLOTS)
779 goto out;
780
781 memslot = &kvm->memslots[log->slot];
782 r = -ENOENT;
783 if (!memslot->dirty_bitmap)
784 goto out;
785
786 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
787
788 for (i = 0; !any && i < n/sizeof(long); ++i)
789 any = memslot->dirty_bitmap[i];
790
791 r = -EFAULT;
792 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
793 goto out;
794
795 /* If nothing is dirty, don't bother messing with page tables. */
796 if (any) {
797 kvm_mmu_slot_remove_write_access(kvm, log->slot);
798 kvm_flush_remote_tlbs(kvm);
799 memset(memslot->dirty_bitmap, 0, n);
800 }
801
802 r = 0;
803
804out:
805 mutex_unlock(&kvm->lock);
806 return r;
807}
808
809/*
810 * Set a new alias region. Aliases map a portion of physical memory into
811 * another portion. This is useful for memory windows, for example the PC
812 * VGA region.
813 */
814static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
815 struct kvm_memory_alias *alias)
816{
817 int r, n;
818 struct kvm_mem_alias *p;
819
820 r = -EINVAL;
821 /* General sanity checks */
822 if (alias->memory_size & (PAGE_SIZE - 1))
823 goto out;
824 if (alias->guest_phys_addr & (PAGE_SIZE - 1))
825 goto out;
826 if (alias->slot >= KVM_ALIAS_SLOTS)
827 goto out;
828 if (alias->guest_phys_addr + alias->memory_size
829 < alias->guest_phys_addr)
830 goto out;
831 if (alias->target_phys_addr + alias->memory_size
832 < alias->target_phys_addr)
833 goto out;
834
835 mutex_lock(&kvm->lock);
836
837 p = &kvm->aliases[alias->slot];
838 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
839 p->npages = alias->memory_size >> PAGE_SHIFT;
840 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
841
842 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
843 if (kvm->aliases[n - 1].npages)
844 break;
845 kvm->naliases = n;
846
847 kvm_mmu_zap_all(kvm);
848
849 mutex_unlock(&kvm->lock);
850
851 return 0;
852
853out:
854 return r;
855}
856
857static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
858{
859 int r;
860
861 r = 0;
862 switch (chip->chip_id) {
863 case KVM_IRQCHIP_PIC_MASTER:
864 memcpy (&chip->chip.pic,
865 &pic_irqchip(kvm)->pics[0],
866 sizeof(struct kvm_pic_state));
867 break;
868 case KVM_IRQCHIP_PIC_SLAVE:
869 memcpy (&chip->chip.pic,
870 &pic_irqchip(kvm)->pics[1],
871 sizeof(struct kvm_pic_state));
872 break;
873 case KVM_IRQCHIP_IOAPIC:
874 memcpy (&chip->chip.ioapic,
875 ioapic_irqchip(kvm),
876 sizeof(struct kvm_ioapic_state));
877 break;
878 default:
879 r = -EINVAL;
880 break;
881 }
882 return r;
883}
884
885static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
886{
887 int r;
888
889 r = 0;
890 switch (chip->chip_id) {
891 case KVM_IRQCHIP_PIC_MASTER:
892 memcpy (&pic_irqchip(kvm)->pics[0],
893 &chip->chip.pic,
894 sizeof(struct kvm_pic_state));
895 break;
896 case KVM_IRQCHIP_PIC_SLAVE:
897 memcpy (&pic_irqchip(kvm)->pics[1],
898 &chip->chip.pic,
899 sizeof(struct kvm_pic_state));
900 break;
901 case KVM_IRQCHIP_IOAPIC:
902 memcpy (ioapic_irqchip(kvm),
903 &chip->chip.ioapic,
904 sizeof(struct kvm_ioapic_state));
905 break;
906 default:
907 r = -EINVAL;
908 break;
909 }
910 kvm_pic_update_irq(pic_irqchip(kvm));
911 return r;
912}
913
914static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
915{
916 int i;
917 struct kvm_mem_alias *alias;
918
919 for (i = 0; i < kvm->naliases; ++i) {
920 alias = &kvm->aliases[i];
921 if (gfn >= alias->base_gfn
922 && gfn < alias->base_gfn + alias->npages)
923 return alias->target_gfn + gfn - alias->base_gfn;
924 }
925 return gfn;
926}
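/*
 * Worked example (illustrative, not in the original): an alias with
 * base_gfn 0xa0, npages 0x20 and target_gfn 0x100 redirects a guest access
 * to gfn 0xb0 to gfn 0x110; gfns outside [0xa0, 0xc0) pass through unchanged.
 */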
927
928static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
929{
930 int i;
931
932 for (i = 0; i < kvm->nmemslots; ++i) {
933 struct kvm_memory_slot *memslot = &kvm->memslots[i];
934
935 if (gfn >= memslot->base_gfn
936 && gfn < memslot->base_gfn + memslot->npages)
937 return memslot;
938 }
939 return NULL;
940}
941
942struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
943{
944 gfn = unalias_gfn(kvm, gfn);
945 return __gfn_to_memslot(kvm, gfn);
946}
947
948struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
949{
950 struct kvm_memory_slot *slot;
951
952 gfn = unalias_gfn(kvm, gfn);
953 slot = __gfn_to_memslot(kvm, gfn);
954 if (!slot)
955 return NULL;
956 return slot->phys_mem[gfn - slot->base_gfn];
957}
958EXPORT_SYMBOL_GPL(gfn_to_page);
959
960/* WARNING: Does not work on aliased pages. */
961void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
962{
963 struct kvm_memory_slot *memslot;
964
965 memslot = __gfn_to_memslot(kvm, gfn);
966 if (memslot && memslot->dirty_bitmap) {
967 unsigned long rel_gfn = gfn - memslot->base_gfn;
968
969 /* avoid RMW */
970 if (!test_bit(rel_gfn, memslot->dirty_bitmap))
971 set_bit(rel_gfn, memslot->dirty_bitmap);
972 }
973}
974
975int emulator_read_std(unsigned long addr,
976 void *val,
977 unsigned int bytes,
978 struct kvm_vcpu *vcpu)
979{
980 void *data = val;
981
982 while (bytes) {
983 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
984 unsigned offset = addr & (PAGE_SIZE-1);
985 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
986 unsigned long pfn;
987 struct page *page;
988 void *page_virt;
989
990 if (gpa == UNMAPPED_GVA)
991 return X86EMUL_PROPAGATE_FAULT;
992 pfn = gpa >> PAGE_SHIFT;
993 page = gfn_to_page(vcpu->kvm, pfn);
994 if (!page)
995 return X86EMUL_UNHANDLEABLE;
996 page_virt = kmap_atomic(page, KM_USER0);
997
998 memcpy(data, page_virt + offset, tocopy);
999
1000 kunmap_atomic(page_virt, KM_USER0);
1001
1002 bytes -= tocopy;
1003 data += tocopy;
1004 addr += tocopy;
1005 }
1006
1007 return X86EMUL_CONTINUE;
1008}
1009EXPORT_SYMBOL_GPL(emulator_read_std);
1010
1011static int emulator_write_std(unsigned long addr,
1012 const void *val,
1013 unsigned int bytes,
1014 struct kvm_vcpu *vcpu)
1015{
1016 pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
1017 return X86EMUL_UNHANDLEABLE;
1018}
1019
1020/*
1021 * Only the apic needs an MMIO device hook, so take a shortcut for now.
1022 */
1023static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1024 gpa_t addr)
1025{
1026 struct kvm_io_device *dev;
1027
1028 if (vcpu->apic) {
1029 dev = &vcpu->apic->dev;
1030 if (dev->in_range(dev, addr))
1031 return dev;
1032 }
1033 return NULL;
1034}
1035
1036static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1037 gpa_t addr)
1038{
1039 struct kvm_io_device *dev;
1040
1041 dev = vcpu_find_pervcpu_dev(vcpu, addr);
1042 if (dev == NULL)
1043 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1044 return dev;
1045}
1046
1047static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1048 gpa_t addr)
1049{
1050 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1051}
1052
1053static int emulator_read_emulated(unsigned long addr,
1054 void *val,
1055 unsigned int bytes,
1056 struct kvm_vcpu *vcpu)
1057{
1058 struct kvm_io_device *mmio_dev;
1059 gpa_t gpa;
1060
1061 if (vcpu->mmio_read_completed) {
1062 memcpy(val, vcpu->mmio_data, bytes);
1063 vcpu->mmio_read_completed = 0;
1064 return X86EMUL_CONTINUE;
1065 } else if (emulator_read_std(addr, val, bytes, vcpu)
1066 == X86EMUL_CONTINUE)
1067 return X86EMUL_CONTINUE;
1068
1069 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1070 if (gpa == UNMAPPED_GVA)
1071 return X86EMUL_PROPAGATE_FAULT;
1072
1073 /*
1074 * Is this MMIO handled locally?
1075 */
1076 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1077 if (mmio_dev) {
1078 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1079 return X86EMUL_CONTINUE;
1080 }
1081
1082 vcpu->mmio_needed = 1;
1083 vcpu->mmio_phys_addr = gpa;
1084 vcpu->mmio_size = bytes;
1085 vcpu->mmio_is_write = 0;
1086
1087 return X86EMUL_UNHANDLEABLE;
1088}
1089
1090static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1091 const void *val, int bytes)
1092{
1093 struct page *page;
1094 void *virt;
1095
1096 if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
1097 return 0;
1098 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1099 if (!page)
1100 return 0;
1101 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1102 virt = kmap_atomic(page, KM_USER0);
1103 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1104 memcpy(virt + offset_in_page(gpa), val, bytes);
1105 kunmap_atomic(virt, KM_USER0);
1106 return 1;
1107}
1108
1109static int emulator_write_emulated_onepage(unsigned long addr,
1110 const void *val,
1111 unsigned int bytes,
1112 struct kvm_vcpu *vcpu)
1113{
1114 struct kvm_io_device *mmio_dev;
1115 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1116
1117 if (gpa == UNMAPPED_GVA) {
1118 kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
1119 return X86EMUL_PROPAGATE_FAULT;
1120 }
1121
1122 if (emulator_write_phys(vcpu, gpa, val, bytes))
1123 return X86EMUL_CONTINUE;
1124
1125 /*
1126 * Is this MMIO handled locally?
1127 */
1128 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1129 if (mmio_dev) {
1130 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1131 return X86EMUL_CONTINUE;
1132 }
1133
1134 vcpu->mmio_needed = 1;
1135 vcpu->mmio_phys_addr = gpa;
1136 vcpu->mmio_size = bytes;
1137 vcpu->mmio_is_write = 1;
1138 memcpy(vcpu->mmio_data, val, bytes);
1139
1140 return X86EMUL_CONTINUE;
1141}
1142
1143int emulator_write_emulated(unsigned long addr,
1144 const void *val,
1145 unsigned int bytes,
1146 struct kvm_vcpu *vcpu)
1147{
1148 /* Crossing a page boundary? */
1149 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1150 int rc, now;
1151
1152 now = -addr & ~PAGE_MASK;
1153 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1154 if (rc != X86EMUL_CONTINUE)
1155 return rc;
1156 addr += now;
1157 val += now;
1158 bytes -= now;
1159 }
1160 return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1161}
1162EXPORT_SYMBOL_GPL(emulator_write_emulated);
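/*
 * Worked example (illustrative, not in the original): a 4-byte write at
 * addr 0x1ffe with 4 KB pages gives now = (-0x1ffe) & ~PAGE_MASK = 2, so
 * bytes 0-1 are written to the first page and bytes 2-3 are retried on the
 * next page by the second emulator_write_emulated_onepage() call.
 */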
1163
1164static int emulator_cmpxchg_emulated(unsigned long addr,
1165 const void *old,
1166 const void *new,
1167 unsigned int bytes,
1168 struct kvm_vcpu *vcpu)
1169{
1170 static int reported;
1171
1172 if (!reported) {
1173 reported = 1;
1174 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1175 }
1176 return emulator_write_emulated(addr, new, bytes, vcpu);
1177}
1178
1179static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1180{
1181 return kvm_x86_ops->get_segment_base(vcpu, seg);
1182}
1183
1184int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1185{
1186 return X86EMUL_CONTINUE;
1187}
1188
1189int emulate_clts(struct kvm_vcpu *vcpu)
1190{
1191 kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS);
1192 return X86EMUL_CONTINUE;
1193}
1194
1195int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
1196{
1197 struct kvm_vcpu *vcpu = ctxt->vcpu;
1198
1199 switch (dr) {
1200 case 0 ... 3:
1201 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1202 return X86EMUL_CONTINUE;
1203 default:
1204 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1205 return X86EMUL_UNHANDLEABLE;
1206 }
1207}
1208
1209int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1210{
1211 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1212 int exception;
1213
1214 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1215 if (exception) {
1216 /* FIXME: better handling */
1217 return X86EMUL_UNHANDLEABLE;
1218 }
1219 return X86EMUL_CONTINUE;
1220}
1221
1222void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1223{
1224 static int reported;
1225 u8 opcodes[4];
1226 unsigned long rip = vcpu->rip;
1227 unsigned long rip_linear;
1228
1229 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1230
1231 if (reported)
1232 return;
1233
1234 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1235
1236 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1237 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1238 reported = 1;
1239}
1240EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1241
1242struct x86_emulate_ops emulate_ops = {
1243 .read_std = emulator_read_std,
1244 .write_std = emulator_write_std,
1245 .read_emulated = emulator_read_emulated,
1246 .write_emulated = emulator_write_emulated,
1247 .cmpxchg_emulated = emulator_cmpxchg_emulated,
1248};
1249
1250int emulate_instruction(struct kvm_vcpu *vcpu,
1251 struct kvm_run *run,
1252 unsigned long cr2,
1253 u16 error_code)
1254{
1255 struct x86_emulate_ctxt emulate_ctxt;
1256 int r;
1257 int cs_db, cs_l;
1258
1259 vcpu->mmio_fault_cr2 = cr2;
1260 kvm_x86_ops->cache_regs(vcpu);
1261
1262 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1263
1264 emulate_ctxt.vcpu = vcpu;
1265 emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1266 emulate_ctxt.cr2 = cr2;
1267 emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1268 ? X86EMUL_MODE_REAL : cs_l
1269 ? X86EMUL_MODE_PROT64 : cs_db
1270 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1271
1272 if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1273 emulate_ctxt.cs_base = 0;
1274 emulate_ctxt.ds_base = 0;
1275 emulate_ctxt.es_base = 0;
1276 emulate_ctxt.ss_base = 0;
1277 } else {
1278 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1279 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1280 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1281 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1282 }
1283
1284 emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1285 emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1286
1287 vcpu->mmio_is_write = 0;
1288 vcpu->pio.string = 0;
1289 r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1290 if (vcpu->pio.string)
1291 return EMULATE_DO_MMIO;
1292
1293 if ((r || vcpu->mmio_is_write) && run) {
1294 run->exit_reason = KVM_EXIT_MMIO;
1295 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1296 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1297 run->mmio.len = vcpu->mmio_size;
1298 run->mmio.is_write = vcpu->mmio_is_write;
1299 }
1300
1301 if (r) {
1302 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1303 return EMULATE_DONE;
1304 if (!vcpu->mmio_needed) {
1305 kvm_report_emulation_failure(vcpu, "mmio");
1306 return EMULATE_FAIL;
1307 }
1308 return EMULATE_DO_MMIO;
1309 }
1310
1311 kvm_x86_ops->decache_regs(vcpu);
1312 kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1313
1314 if (vcpu->mmio_is_write) {
1315 vcpu->mmio_needed = 0;
1316 return EMULATE_DO_MMIO;
1317 }
1318
1319 return EMULATE_DONE;
1320}
1321EXPORT_SYMBOL_GPL(emulate_instruction);
1322
1323/*
1324 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1325 */
1326static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1327{
1328 DECLARE_WAITQUEUE(wait, current);
1329
1330 add_wait_queue(&vcpu->wq, &wait);
1331
1332 /*
1333 * We will block until either an interrupt or a signal wakes us up
1334 */
1335 while (!kvm_cpu_has_interrupt(vcpu)
1336 && !signal_pending(current)
1337 && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
1338 && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
1339 set_current_state(TASK_INTERRUPTIBLE);
1340 vcpu_put(vcpu);
1341 schedule();
1342 vcpu_load(vcpu);
1343 }
1344
1345 __set_current_state(TASK_RUNNING);
1346 remove_wait_queue(&vcpu->wq, &wait);
1347}
1348
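/*
 * With an in-kernel irqchip, HLT simply blocks the vcpu until it is
 * runnable again; otherwise the halt is reported to userspace via
 * KVM_EXIT_HLT.
 */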
1349int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1350{
1351 ++vcpu->stat.halt_exits;
1352 if (irqchip_in_kernel(vcpu->kvm)) {
1353 vcpu->mp_state = VCPU_MP_STATE_HALTED;
1354 kvm_vcpu_block(vcpu);
1355 if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
1356 return -EINTR;
1357 return 1;
1358 } else {
1359 vcpu->run->exit_reason = KVM_EXIT_HLT;
1360 return 0;
1361 }
1362}
1363EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1364
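/*
 * Fetch the hypercall number and up to six arguments from the guest
 * registers (the register set differs between long mode and 32-bit
 * guests) and forward unknown hypercalls to userspace.
 */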
1365int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1366{
1367 unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1368
1369 kvm_x86_ops->cache_regs(vcpu);
1370 ret = -KVM_EINVAL;
1371#ifdef CONFIG_X86_64
1372 if (is_long_mode(vcpu)) {
1373 nr = vcpu->regs[VCPU_REGS_RAX];
1374 a0 = vcpu->regs[VCPU_REGS_RDI];
1375 a1 = vcpu->regs[VCPU_REGS_RSI];
1376 a2 = vcpu->regs[VCPU_REGS_RDX];
1377 a3 = vcpu->regs[VCPU_REGS_RCX];
1378 a4 = vcpu->regs[VCPU_REGS_R8];
1379 a5 = vcpu->regs[VCPU_REGS_R9];
1380 } else
1381#endif
1382 {
1383 nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1384 a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1385 a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1386 a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1387 a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1388 a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1389 a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1390 }
1391 switch (nr) {
1392 default:
1393 run->hypercall.nr = nr;
1394 run->hypercall.args[0] = a0;
1395 run->hypercall.args[1] = a1;
1396 run->hypercall.args[2] = a2;
1397 run->hypercall.args[3] = a3;
1398 run->hypercall.args[4] = a4;
1399 run->hypercall.args[5] = a5;
1400 run->hypercall.ret = ret;
1401 run->hypercall.longmode = is_long_mode(vcpu);
1402 kvm_x86_ops->decache_regs(vcpu);
1403 return 0;
1404 }
1405 vcpu->regs[VCPU_REGS_RAX] = ret;
1406 kvm_x86_ops->decache_regs(vcpu);
1407 return 1;
1408}
1409EXPORT_SYMBOL_GPL(kvm_hypercall);
1410
1411static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1412{
1413 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1414}
1415
1416void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1417{
1418 struct descriptor_table dt = { limit, base };
1419
1420 kvm_x86_ops->set_gdt(vcpu, &dt);
1421}
1422
1423void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1424{
1425 struct descriptor_table dt = { limit, base };
1426
1427 kvm_x86_ops->set_idt(vcpu, &dt);
1428}
1429
1430void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1431 unsigned long *rflags)
1432{
1433 lmsw(vcpu, msw);
1434 *rflags = kvm_x86_ops->get_rflags(vcpu);
1435}
1436
1437unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1438{
1439 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1440 switch (cr) {
1441 case 0:
1442 return vcpu->cr0;
1443 case 2:
1444 return vcpu->cr2;
1445 case 3:
1446 return vcpu->cr3;
1447 case 4:
1448 return vcpu->cr4;
1449 default:
1450 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1451 return 0;
1452 }
1453}
1454
1455void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1456 unsigned long *rflags)
1457{
1458 switch (cr) {
1459 case 0:
1460 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1461 *rflags = kvm_x86_ops->get_rflags(vcpu);
1462 break;
1463 case 2:
1464 vcpu->cr2 = val;
1465 break;
1466 case 3:
1467 set_cr3(vcpu, val);
1468 break;
1469 case 4:
1470 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1471 break;
1472 default:
1473 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1474 }
1475}
1476
1477/*
1478 * Register the para guest with the host:
1479 */
1480static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1481{
1482 struct kvm_vcpu_para_state *para_state;
1483 hpa_t para_state_hpa, hypercall_hpa;
1484 struct page *para_state_page;
1485 unsigned char *hypercall;
1486 gpa_t hypercall_gpa;
1487
1488 printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1489 printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1490
1491 /*
1492 * Needs to be page aligned:
1493 */
1494 if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1495 goto err_gp;
1496
1497 para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1498 printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1499 if (is_error_hpa(para_state_hpa))
1500 goto err_gp;
1501
1502 mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1503 para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1504 para_state = kmap(para_state_page);
1505
1506 printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version);
1507 printk(KERN_DEBUG ".... size: %d\n", para_state->size);
1508
1509 para_state->host_version = KVM_PARA_API_VERSION;
1510 /*
1511 * We cannot support guests that try to register themselves
1512 * with a newer API version than the host supports:
1513 */
1514 if (para_state->guest_version > KVM_PARA_API_VERSION) {
1515 para_state->ret = -KVM_EINVAL;
1516 goto err_kunmap_skip;
1517 }
1518
1519 hypercall_gpa = para_state->hypercall_gpa;
1520 hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1521 printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1522 if (is_error_hpa(hypercall_hpa)) {
1523 para_state->ret = -KVM_EINVAL;
1524 goto err_kunmap_skip;
1525 }
1526
1527 printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1528 vcpu->para_state_page = para_state_page;
1529 vcpu->para_state_gpa = para_state_gpa;
1530 vcpu->hypercall_gpa = hypercall_gpa;
1531
1532 mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1533 hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1534 KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1535 kvm_x86_ops->patch_hypercall(vcpu, hypercall);
1536 kunmap_atomic(hypercall, KM_USER1);
1537
1538 para_state->ret = 0;
1539err_kunmap_skip:
1540 kunmap(para_state_page);
1541 return 0;
1542err_gp:
1543 return 1;
1544}
1545
1546int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1547{
1548 u64 data;
1549
1550 switch (msr) {
1551 case 0xc0010010: /* SYSCFG */
1552 case 0xc0010015: /* HWCR */
1553 case MSR_IA32_PLATFORM_ID:
1554 case MSR_IA32_P5_MC_ADDR:
1555 case MSR_IA32_P5_MC_TYPE:
1556 case MSR_IA32_MC0_CTL:
1557 case MSR_IA32_MCG_STATUS:
1558 case MSR_IA32_MCG_CAP:
1559 case MSR_IA32_MC0_MISC:
1560 case MSR_IA32_MC0_MISC+4:
1561 case MSR_IA32_MC0_MISC+8:
1562 case MSR_IA32_MC0_MISC+12:
1563 case MSR_IA32_MC0_MISC+16:
1564 case MSR_IA32_UCODE_REV:
1565 case MSR_IA32_PERF_STATUS:
1566 case MSR_IA32_EBL_CR_POWERON:
1567 /* MTRR registers */
1568 case 0xfe:
1569 case 0x200 ... 0x2ff:
1570 data = 0;
1571 break;
1572 case 0xcd: /* fsb frequency */
1573 data = 3;
1574 break;
1575 case MSR_IA32_APICBASE:
1576 data = kvm_get_apic_base(vcpu);
1577 break;
1578 case MSR_IA32_MISC_ENABLE:
1579 data = vcpu->ia32_misc_enable_msr;
1580 break;
1581#ifdef CONFIG_X86_64
1582 case MSR_EFER:
1583 data = vcpu->shadow_efer;
1584 break;
1585#endif
1586 default:
1587 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1588 return 1;
1589 }
1590 *pdata = data;
1591 return 0;
1592}
1593EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1594
1595/*
1596 * Reads an msr value (of 'msr_index') into 'pdata'.
1597 * Returns 0 on success, non-0 otherwise.
1598 * Assumes vcpu_load() was already called.
1599 */
1600int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1601{
1602 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1603}
1604
1605#ifdef CONFIG_X86_64
1606
1607static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1608{
1609 if (efer & EFER_RESERVED_BITS) {
1610 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1611 efer);
1612 inject_gp(vcpu);
1613 return;
1614 }
1615
1616 if (is_paging(vcpu)
1617 && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1618 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1619 inject_gp(vcpu);
1620 return;
1621 }
1622
1623 kvm_x86_ops->set_efer(vcpu, efer);
1624
1625 efer &= ~EFER_LMA;
1626 efer |= vcpu->shadow_efer & EFER_LMA;
1627
1628 vcpu->shadow_efer = efer;
1629}
1630
1631#endif
1632
1633int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1634{
1635 switch (msr) {
1636#ifdef CONFIG_X86_64
1637 case MSR_EFER:
1638 set_efer(vcpu, data);
1639 break;
1640#endif
1641 case MSR_IA32_MC0_STATUS:
1642 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1643 __FUNCTION__, data);
1644 break;
1645 case MSR_IA32_MCG_STATUS:
1646 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1647 __FUNCTION__, data);
1648 break;
1649 case MSR_IA32_UCODE_REV:
1650 case MSR_IA32_UCODE_WRITE:
1651 case 0x200 ... 0x2ff: /* MTRRs */
1652 break;
1653 case MSR_IA32_APICBASE:
1654 kvm_set_apic_base(vcpu, data);
1655 break;
1656 case MSR_IA32_MISC_ENABLE:
1657 vcpu->ia32_misc_enable_msr = data;
1658 break;
1659 /*
1660 * This is the 'probe whether the host is KVM' logic:
1661 */
1662 case MSR_KVM_API_MAGIC:
1663 return vcpu_register_para(vcpu, data);
1664
1665 default:
1666 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
1667 return 1;
1668 }
1669 return 0;
1670}
1671EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1672
1673/*
1674 * Writes msr value into the appropriate "register".
1675 * Returns 0 on success, non-0 otherwise.
1676 * Assumes vcpu_load() was already called.
1677 */
1678int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1679{
1680 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
1681}
1682
1683void kvm_resched(struct kvm_vcpu *vcpu)
1684{
1685 if (!need_resched())
1686 return;
1687 cond_resched();
1688}
1689EXPORT_SYMBOL_GPL(kvm_resched);
1690
1691void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1692{
1693 int i;
1694 u32 function;
1695 struct kvm_cpuid_entry *e, *best;
1696
1697 kvm_x86_ops->cache_regs(vcpu);
1698 function = vcpu->regs[VCPU_REGS_RAX];
1699 vcpu->regs[VCPU_REGS_RAX] = 0;
1700 vcpu->regs[VCPU_REGS_RBX] = 0;
1701 vcpu->regs[VCPU_REGS_RCX] = 0;
1702 vcpu->regs[VCPU_REGS_RDX] = 0;
1703 best = NULL;
1704 for (i = 0; i < vcpu->cpuid_nent; ++i) {
1705 e = &vcpu->cpuid_entries[i];
1706 if (e->function == function) {
1707 best = e;
1708 break;
1709 }
1710 /*
1711 * Both basic or both extended?
1712 */
1713 if (((e->function ^ function) & 0x80000000) == 0)
1714 if (!best || e->function > best->function)
1715 best = e;
1716 }
1717 if (best) {
1718 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1719 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1720 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1721 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1722 }
1723 kvm_x86_ops->decache_regs(vcpu);
1724 kvm_x86_ops->skip_emulated_instruction(vcpu);
1725}
1726EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1727
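/*
 * Copy string PIO data between the pio_data scratch page and the
 * pinned guest pages, which are temporarily mapped with vmap().
 */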
1728static int pio_copy_data(struct kvm_vcpu *vcpu)
1729{
1730 void *p = vcpu->pio_data;
1731 void *q;
1732 unsigned bytes;
1733 int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1734
1735 q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1736 PAGE_KERNEL);
1737 if (!q) {
1738 free_pio_guest_pages(vcpu);
1739 return -ENOMEM;
1740 }
1741 q += vcpu->pio.guest_page_offset;
1742 bytes = vcpu->pio.size * vcpu->pio.cur_count;
1743 if (vcpu->pio.in)
1744 memcpy(q, p, bytes);
1745 else
1746 memcpy(p, q, bytes);
1747 q -= vcpu->pio.guest_page_offset;
1748 vunmap(q);
1749 free_pio_guest_pages(vcpu);
1750 return 0;
1751}
1752
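/*
 * Finish a PIO operation: for a simple 'in' copy the data into RAX,
 * for string I/O copy to/from guest memory and advance RSI/RDI (and
 * RCX when a rep prefix is being emulated).
 */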
1753static int complete_pio(struct kvm_vcpu *vcpu)
1754{
1755 struct kvm_pio_request *io = &vcpu->pio;
1756 long delta;
1757 int r;
1758
1759 kvm_x86_ops->cache_regs(vcpu);
1760
1761 if (!io->string) {
1762 if (io->in)
1763 memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1764 io->size);
1765 } else {
1766 if (io->in) {
1767 r = pio_copy_data(vcpu);
1768 if (r) {
1769 kvm_x86_ops->cache_regs(vcpu);
1770 return r;
1771 }
1772 }
1773
1774 delta = 1;
1775 if (io->rep) {
1776 delta *= io->cur_count;
1777 /*
1778 * The size of the register should really depend on
1779 * current address size.
1780 */
1781 vcpu->regs[VCPU_REGS_RCX] -= delta;
1782 }
1783 if (io->down)
1784 delta = -delta;
1785 delta *= io->size;
1786 if (io->in)
1787 vcpu->regs[VCPU_REGS_RDI] += delta;
1788 else
1789 vcpu->regs[VCPU_REGS_RSI] += delta;
1790 }
1791
1792 kvm_x86_ops->decache_regs(vcpu);
1793
1794 io->count -= io->cur_count;
1795 io->cur_count = 0;
1796
1797 return 0;
1798}
1799
1800static void kernel_pio(struct kvm_io_device *pio_dev,
1801 struct kvm_vcpu *vcpu,
1802 void *pd)
1803{
1804 /* TODO: String I/O for in kernel device */
1805
1806 mutex_lock(&vcpu->kvm->lock);
1807 if (vcpu->pio.in)
1808 kvm_iodevice_read(pio_dev, vcpu->pio.port,
1809 vcpu->pio.size,
1810 pd);
1811 else
1812 kvm_iodevice_write(pio_dev, vcpu->pio.port,
1813 vcpu->pio.size,
1814 pd);
1815 mutex_unlock(&vcpu->kvm->lock);
1816}
1817
1818static void pio_string_write(struct kvm_io_device *pio_dev,
1819 struct kvm_vcpu *vcpu)
1820{
1821 struct kvm_pio_request *io = &vcpu->pio;
1822 void *pd = vcpu->pio_data;
1823 int i;
1824
1825 mutex_lock(&vcpu->kvm->lock);
1826 for (i = 0; i < io->cur_count; i++) {
1827 kvm_iodevice_write(pio_dev, io->port,
1828 io->size,
1829 pd);
1830 pd += io->size;
1831 }
1832 mutex_unlock(&vcpu->kvm->lock);
1833}
1834
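/*
 * Emulate a single non-string in/out. Ports backed by an in-kernel
 * device are completed immediately; anything else is handed to
 * userspace through a KVM_EXIT_IO exit.
 */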
1835int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1836 int size, unsigned port)
1837{
1838 struct kvm_io_device *pio_dev;
1839
1840 vcpu->run->exit_reason = KVM_EXIT_IO;
1841 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1842 vcpu->run->io.size = vcpu->pio.size = size;
1843 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1844 vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
1845 vcpu->run->io.port = vcpu->pio.port = port;
1846 vcpu->pio.in = in;
1847 vcpu->pio.string = 0;
1848 vcpu->pio.down = 0;
1849 vcpu->pio.guest_page_offset = 0;
1850 vcpu->pio.rep = 0;
1851
1852 kvm_x86_ops->cache_regs(vcpu);
1853 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1854 kvm_x86_ops->decache_regs(vcpu);
1855
1856 kvm_x86_ops->skip_emulated_instruction(vcpu);
1857
1858 pio_dev = vcpu_find_pio_dev(vcpu, port);
1859 if (pio_dev) {
1860 kernel_pio(pio_dev, vcpu, vcpu->pio_data);
1861 complete_pio(vcpu);
1862 return 1;
1863 }
1864 return 0;
1865}
1866EXPORT_SYMBOL_GPL(kvm_emulate_pio);
1867
1868int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1869 int size, unsigned long count, int down,
1870 gva_t address, int rep, unsigned port)
1871{
1872 unsigned now, in_page;
1873 int i, ret = 0;
1874 int nr_pages = 1;
1875 struct page *page;
1876 struct kvm_io_device *pio_dev;
1877
1878 vcpu->run->exit_reason = KVM_EXIT_IO;
1879 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1880 vcpu->run->io.size = vcpu->pio.size = size;
1881 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1882 vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
1883 vcpu->run->io.port = vcpu->pio.port = port;
1884 vcpu->pio.in = in;
1885 vcpu->pio.string = 1;
1886 vcpu->pio.down = down;
1887 vcpu->pio.guest_page_offset = offset_in_page(address);
1888 vcpu->pio.rep = rep;
1889
1890 if (!count) {
1891 kvm_x86_ops->skip_emulated_instruction(vcpu);
1892 return 1;
1893 }
1894
1895 if (!down)
1896 in_page = PAGE_SIZE - offset_in_page(address);
1897 else
1898 in_page = offset_in_page(address) + size;
1899 now = min(count, (unsigned long)in_page / size);
1900 if (!now) {
1901 /*
1902 * String I/O straddles page boundary. Pin two guest pages
1903 * so that we satisfy atomicity constraints. Do just one
1904 * transaction to avoid complexity.
1905 */
1906 nr_pages = 2;
1907 now = 1;
1908 }
1909 if (down) {
1910 /*
1911 * String I/O in reverse. Yuck. Kill the guest, fix later.
1912 */
1913 pr_unimpl(vcpu, "guest string pio down\n");
1914 inject_gp(vcpu);
1915 return 1;
1916 }
1917 vcpu->run->io.count = now;
1918 vcpu->pio.cur_count = now;
1919
1920 if (vcpu->pio.cur_count == vcpu->pio.count)
1921 kvm_x86_ops->skip_emulated_instruction(vcpu);
1922
1923 for (i = 0; i < nr_pages; ++i) {
1924 mutex_lock(&vcpu->kvm->lock);
1925 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1926 if (page)
1927 get_page(page);
1928 vcpu->pio.guest_pages[i] = page;
1929 mutex_unlock(&vcpu->kvm->lock);
1930 if (!page) {
1931 inject_gp(vcpu);
1932 free_pio_guest_pages(vcpu);
1933 return 1;
1934 }
1935 }
1936
1937 pio_dev = vcpu_find_pio_dev(vcpu, port);
1938 if (!vcpu->pio.in) {
1939 /* string PIO write */
1940 ret = pio_copy_data(vcpu);
1941 if (ret >= 0 && pio_dev) {
1942 pio_string_write(pio_dev, vcpu);
1943 complete_pio(vcpu);
1944 if (vcpu->pio.count == 0)
1945 ret = 1;
1946 }
1947 } else if (pio_dev)
1948 pr_unimpl(vcpu, "no string pio read support yet, "
1949 "port %x size %d count %ld\n",
1950 port, size, count);
1951
1952 return ret;
1953}
1954EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
1955
1956/*
1957 * Check if userspace requested an interrupt window, and that the
1958 * interrupt window is open.
1959 *
1960 * No need to exit to userspace if we already have an interrupt queued.
1961 */
1962static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1963 struct kvm_run *kvm_run)
1964{
1965 return (!vcpu->irq_summary &&
1966 kvm_run->request_interrupt_window &&
1967 vcpu->interrupt_window_open &&
1968 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
1969}
1970
1971static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1972 struct kvm_run *kvm_run)
1973{
1974 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
1975 kvm_run->cr8 = get_cr8(vcpu);
1976 kvm_run->apic_base = kvm_get_apic_base(vcpu);
1977 if (irqchip_in_kernel(vcpu->kvm))
1978 kvm_run->ready_for_interrupt_injection = 1;
1979 else
1980 kvm_run->ready_for_interrupt_injection =
1981 (vcpu->interrupt_window_open &&
1982 vcpu->irq_summary == 0);
1983}
1984
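/*
 * Main guest execution loop: inject pending interrupts, enter the
 * guest, handle the resulting exit, and repeat until the exit needs
 * userspace attention or a signal is pending.
 */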
1985static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1986{
1987 int r;
1988
1989 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
1990 printk("vcpu %d received sipi with vector # %x\n",
1991 vcpu->vcpu_id, vcpu->sipi_vector);
1992 kvm_lapic_reset(vcpu);
1993 kvm_x86_ops->vcpu_reset(vcpu);
1994 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
1995 }
1996
1997preempted:
1998 if (vcpu->guest_debug.enabled)
1999 kvm_x86_ops->guest_debug_pre(vcpu);
2000
2001again:
2002 r = kvm_mmu_reload(vcpu);
2003 if (unlikely(r))
2004 goto out;
2005
2006 preempt_disable();
2007
2008 kvm_x86_ops->prepare_guest_switch(vcpu);
2009 kvm_load_guest_fpu(vcpu);
2010
2011 local_irq_disable();
2012
2013 if (signal_pending(current)) {
2014 local_irq_enable();
2015 preempt_enable();
2016 r = -EINTR;
2017 kvm_run->exit_reason = KVM_EXIT_INTR;
2018 ++vcpu->stat.signal_exits;
2019 goto out;
2020 }
2021
2022 if (irqchip_in_kernel(vcpu->kvm))
2023 kvm_x86_ops->inject_pending_irq(vcpu);
2024 else if (!vcpu->mmio_read_completed)
2025 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2026
2027 vcpu->guest_mode = 1;
2028 kvm_guest_enter();
2029
2030 if (vcpu->requests)
2031 if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
2032 kvm_x86_ops->tlb_flush(vcpu);
2033
2034 kvm_x86_ops->run(vcpu, kvm_run);
2035
2036 vcpu->guest_mode = 0;
2037 local_irq_enable();
2038
2039 ++vcpu->stat.exits;
2040
2041 /*
2042 * We must have an instruction between local_irq_enable() and
2043 * kvm_guest_exit(), so the timer interrupt isn't delayed by
2044 * the interrupt shadow. The stat.exits increment will do nicely.
2045 * But we need to prevent reordering, hence this barrier():
2046 */
2047 barrier();
2048
2049 kvm_guest_exit();
2050
2051 preempt_enable();
2052
2053 /*
2054 * Profile KVM exit RIPs:
2055 */
2056 if (unlikely(prof_on == KVM_PROFILING)) {
2057 kvm_x86_ops->cache_regs(vcpu);
2058 profile_hit(KVM_PROFILING, (void *)vcpu->rip);
2059 }
2060
2061 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2062
2063 if (r > 0) {
2064 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2065 r = -EINTR;
2066 kvm_run->exit_reason = KVM_EXIT_INTR;
2067 ++vcpu->stat.request_irq_exits;
2068 goto out;
2069 }
2070 if (!need_resched()) {
2071 ++vcpu->stat.light_exits;
2072 goto again;
2073 }
2074 }
2075
2076out:
2077 if (r > 0) {
2078 kvm_resched(vcpu);
2079 goto preempted;
2080 }
2081
2082 post_kvm_run_save(vcpu, kvm_run);
2083
2084 return r;
2085}
2086
2087
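/*
 * KVM_RUN: complete any PIO/MMIO left over from the previous exit,
 * then run the guest until the next exit that userspace must handle.
 */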
2088static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2089{
2090 int r;
2091 sigset_t sigsaved;
2092
2093 vcpu_load(vcpu);
2094
2095 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2096 kvm_vcpu_block(vcpu);
2097 vcpu_put(vcpu);
2098 return -EAGAIN;
2099 }
2100
2101 if (vcpu->sigset_active)
2102 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2103
2104 /* re-sync apic's tpr */
2105 if (!irqchip_in_kernel(vcpu->kvm))
2106 set_cr8(vcpu, kvm_run->cr8);
2107
2108 if (vcpu->pio.cur_count) {
2109 r = complete_pio(vcpu);
2110 if (r)
2111 goto out;
2112 }
2113
2114 if (vcpu->mmio_needed) {
2115 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2116 vcpu->mmio_read_completed = 1;
2117 vcpu->mmio_needed = 0;
2118 r = emulate_instruction(vcpu, kvm_run,
2119 vcpu->mmio_fault_cr2, 0);
2120 if (r == EMULATE_DO_MMIO) {
2121 /*
2122 * Read-modify-write. Back to userspace.
2123 */
2124 r = 0;
2125 goto out;
2126 }
2127 }
2128
2129 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2130 kvm_x86_ops->cache_regs(vcpu);
2131 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2132 kvm_x86_ops->decache_regs(vcpu);
2133 }
2134
2135 r = __vcpu_run(vcpu, kvm_run);
2136
2137out:
2138 if (vcpu->sigset_active)
2139 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2140
2141 vcpu_put(vcpu);
2142 return r;
2143}
2144
2145static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
2146 struct kvm_regs *regs)
2147{
2148 vcpu_load(vcpu);
2149
2150 kvm_x86_ops->cache_regs(vcpu);
2151
2152 regs->rax = vcpu->regs[VCPU_REGS_RAX];
2153 regs->rbx = vcpu->regs[VCPU_REGS_RBX];
2154 regs->rcx = vcpu->regs[VCPU_REGS_RCX];
2155 regs->rdx = vcpu->regs[VCPU_REGS_RDX];
2156 regs->rsi = vcpu->regs[VCPU_REGS_RSI];
2157 regs->rdi = vcpu->regs[VCPU_REGS_RDI];
2158 regs->rsp = vcpu->regs[VCPU_REGS_RSP];
2159 regs->rbp = vcpu->regs[VCPU_REGS_RBP];
2160#ifdef CONFIG_X86_64
2161 regs->r8 = vcpu->regs[VCPU_REGS_R8];
2162 regs->r9 = vcpu->regs[VCPU_REGS_R9];
2163 regs->r10 = vcpu->regs[VCPU_REGS_R10];
2164 regs->r11 = vcpu->regs[VCPU_REGS_R11];
2165 regs->r12 = vcpu->regs[VCPU_REGS_R12];
2166 regs->r13 = vcpu->regs[VCPU_REGS_R13];
2167 regs->r14 = vcpu->regs[VCPU_REGS_R14];
2168 regs->r15 = vcpu->regs[VCPU_REGS_R15];
2169#endif
2170
2171 regs->rip = vcpu->rip;
2172 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2173
2174 /*
2175 * Don't leak debug flags in case they were set for guest debugging
2176 */
2177 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2178 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2179
2180 vcpu_put(vcpu);
2181
2182 return 0;
2183}
2184
2185static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
2186 struct kvm_regs *regs)
2187{
2188 vcpu_load(vcpu);
2189
2190 vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2191 vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2192 vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2193 vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2194 vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2195 vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2196 vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
2197 vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2198#ifdef CONFIG_X86_64
2199 vcpu->regs[VCPU_REGS_R8] = regs->r8;
2200 vcpu->regs[VCPU_REGS_R9] = regs->r9;
2201 vcpu->regs[VCPU_REGS_R10] = regs->r10;
2202 vcpu->regs[VCPU_REGS_R11] = regs->r11;
2203 vcpu->regs[VCPU_REGS_R12] = regs->r12;
2204 vcpu->regs[VCPU_REGS_R13] = regs->r13;
2205 vcpu->regs[VCPU_REGS_R14] = regs->r14;
2206 vcpu->regs[VCPU_REGS_R15] = regs->r15;
2207#endif
2208
2209 vcpu->rip = regs->rip;
2210 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2211
2212 kvm_x86_ops->decache_regs(vcpu);
2213
2214 vcpu_put(vcpu);
2215
2216 return 0;
2217}
2218
2219static void get_segment(struct kvm_vcpu *vcpu,
2220 struct kvm_segment *var, int seg)
2221{
2222 return kvm_x86_ops->get_segment(vcpu, var, seg);
2223}
2224
2225static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2226 struct kvm_sregs *sregs)
2227{
2228 struct descriptor_table dt;
2229 int pending_vec;
2230
2231 vcpu_load(vcpu);
2232
2233 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2234 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2235 get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2236 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2237 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2238 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2239
2240 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2241 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2242
2243 kvm_x86_ops->get_idt(vcpu, &dt);
2244 sregs->idt.limit = dt.limit;
2245 sregs->idt.base = dt.base;
2246 kvm_x86_ops->get_gdt(vcpu, &dt);
2247 sregs->gdt.limit = dt.limit;
2248 sregs->gdt.base = dt.base;
2249
2250 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2251 sregs->cr0 = vcpu->cr0;
2252 sregs->cr2 = vcpu->cr2;
2253 sregs->cr3 = vcpu->cr3;
2254 sregs->cr4 = vcpu->cr4;
2255 sregs->cr8 = get_cr8(vcpu);
2256 sregs->efer = vcpu->shadow_efer;
2257 sregs->apic_base = kvm_get_apic_base(vcpu);
2258
2259 if (irqchip_in_kernel(vcpu->kvm)) {
2260 memset(sregs->interrupt_bitmap, 0,
2261 sizeof sregs->interrupt_bitmap);
2262 pending_vec = kvm_x86_ops->get_irq(vcpu);
2263 if (pending_vec >= 0)
2264 set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap);
2265 } else
2266 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2267 sizeof sregs->interrupt_bitmap);
2268
2269 vcpu_put(vcpu);
2270
2271 return 0;
2272}
2273
2274static void set_segment(struct kvm_vcpu *vcpu,
2275 struct kvm_segment *var, int seg)
2276{
2277 return kvm_x86_ops->set_segment(vcpu, var, seg);
2278}
2279
2280static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2281 struct kvm_sregs *sregs)
2282{
2283 int mmu_reset_needed = 0;
2284 int i, pending_vec, max_bits;
2285 struct descriptor_table dt;
2286
2287 vcpu_load(vcpu);
2288
2289 dt.limit = sregs->idt.limit;
2290 dt.base = sregs->idt.base;
2291 kvm_x86_ops->set_idt(vcpu, &dt);
2292 dt.limit = sregs->gdt.limit;
2293 dt.base = sregs->gdt.base;
2294 kvm_x86_ops->set_gdt(vcpu, &dt);
2295
2296 vcpu->cr2 = sregs->cr2;
2297 mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2298 vcpu->cr3 = sregs->cr3;
2299
2300 set_cr8(vcpu, sregs->cr8);
2301
2302 mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2303#ifdef CONFIG_X86_64
2304 kvm_x86_ops->set_efer(vcpu, sregs->efer);
2305#endif
2306 kvm_set_apic_base(vcpu, sregs->apic_base);
2307
2308 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2309
2310 mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2311 vcpu->cr0 = sregs->cr0;
2312 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2313
2314 mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2315 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2316 if (!is_long_mode(vcpu) && is_pae(vcpu))
2317 load_pdptrs(vcpu, vcpu->cr3);
2318
2319 if (mmu_reset_needed)
2320 kvm_mmu_reset_context(vcpu);
2321
2322 if (!irqchip_in_kernel(vcpu->kvm)) {
2323 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2324 sizeof vcpu->irq_pending);
2325 vcpu->irq_summary = 0;
2326 for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
2327 if (vcpu->irq_pending[i])
2328 __set_bit(i, &vcpu->irq_summary);
2329 } else {
2330 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2331 pending_vec = find_first_bit(
2332 (const unsigned long *)sregs->interrupt_bitmap,
2333 max_bits);
2334 /* Only pending external irq is handled here */
2335 if (pending_vec < max_bits) {
2336 kvm_x86_ops->set_irq(vcpu, pending_vec);
2337 printk("Set back pending irq %d\n", pending_vec);
2338 }
2339 }
2340
2341 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2342 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2343 set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2344 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2345 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2346 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2347
2348 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2349 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2350
2351 vcpu_put(vcpu);
2352
2353 return 0;
2354}
2355
2356void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2357{
2358 struct kvm_segment cs;
2359
2360 get_segment(vcpu, &cs, VCPU_SREG_CS);
2361 *db = cs.db;
2362 *l = cs.l;
2363}
2364EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2365
2366/*
2367 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
2368 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
2369 *
2370 * This list is modified at module load time to reflect the
2371 * capabilities of the host cpu.
2372 */
2373static u32 msrs_to_save[] = {
2374 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2375 MSR_K6_STAR,
2376#ifdef CONFIG_X86_64
2377 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2378#endif
2379 MSR_IA32_TIME_STAMP_COUNTER,
2380};
2381
2382static unsigned num_msrs_to_save;
2383
2384static u32 emulated_msrs[] = {
2385 MSR_IA32_MISC_ENABLE,
2386};
2387
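/*
 * Probe each MSR in msrs_to_save with rdmsr_safe() and compact the
 * list so it only contains MSRs the host cpu actually implements.
 */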
2388static __init void kvm_init_msr_list(void)
2389{
2390 u32 dummy[2];
2391 unsigned i, j;
2392
2393 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2394 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2395 continue;
2396 if (j < i)
2397 msrs_to_save[j] = msrs_to_save[i];
2398 j++;
2399 }
2400 num_msrs_to_save = j;
2401}
2402
2403/*
2404 * Adapt set_msr() to msr_io()'s calling convention
2405 */
2406static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2407{
2408 return kvm_set_msr(vcpu, index, *data);
2409}
2410
2411/*
2412 * Read or write a bunch of msrs. All parameters are kernel addresses.
2413 *
2414 * @return number of msrs set successfully.
2415 */
2416static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2417 struct kvm_msr_entry *entries,
2418 int (*do_msr)(struct kvm_vcpu *vcpu,
2419 unsigned index, u64 *data))
2420{
2421 int i;
2422
2423 vcpu_load(vcpu);
2424
2425 for (i = 0; i < msrs->nmsrs; ++i)
2426 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2427 break;
2428
2429 vcpu_put(vcpu);
2430
2431 return i;
2432}
2433
2434/*
2435 * Read or write a bunch of msrs. Parameters are user addresses.
2436 *
2437 * @return number of msrs set successfully.
2438 */
2439static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2440 int (*do_msr)(struct kvm_vcpu *vcpu,
2441 unsigned index, u64 *data),
2442 int writeback)
2443{
2444 struct kvm_msrs msrs;
2445 struct kvm_msr_entry *entries;
2446 int r, n;
2447 unsigned size;
2448
2449 r = -EFAULT;
2450 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2451 goto out;
2452
2453 r = -E2BIG;
2454 if (msrs.nmsrs >= MAX_IO_MSRS)
2455 goto out;
2456
2457 r = -ENOMEM;
2458 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2459 entries = vmalloc(size);
2460 if (!entries)
2461 goto out;
2462
2463 r = -EFAULT;
2464 if (copy_from_user(entries, user_msrs->entries, size))
2465 goto out_free;
2466
2467 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2468 if (r < 0)
2469 goto out_free;
2470
2471 r = -EFAULT;
2472 if (writeback && copy_to_user(user_msrs->entries, entries, size))
2473 goto out_free;
2474
2475 r = n;
2476
2477out_free:
2478 vfree(entries);
2479out:
2480 return r;
2481}
2482
2483/*
2484 * Translate a guest virtual address to a guest physical address.
2485 */
2486static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2487 struct kvm_translation *tr)
2488{
2489 unsigned long vaddr = tr->linear_address;
2490 gpa_t gpa;
2491
2492 vcpu_load(vcpu);
2493 mutex_lock(&vcpu->kvm->lock);
2494 gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2495 tr->physical_address = gpa;
2496 tr->valid = gpa != UNMAPPED_GVA;
2497 tr->writeable = 1;
2498 tr->usermode = 0;
2499 mutex_unlock(&vcpu->kvm->lock);
2500 vcpu_put(vcpu);
2501
2502 return 0;
2503}
2504
2505static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2506 struct kvm_interrupt *irq)
2507{
2508 if (irq->irq < 0 || irq->irq >= 256)
2509 return -EINVAL;
2510 if (irqchip_in_kernel(vcpu->kvm))
2511 return -ENXIO;
2512 vcpu_load(vcpu);
2513
2514 set_bit(irq->irq, vcpu->irq_pending);
2515 set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2516
2517 vcpu_put(vcpu);
2518
2519 return 0;
2520}
2521
2522static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2523 struct kvm_debug_guest *dbg)
2524{
2525 int r;
2526
2527 vcpu_load(vcpu);
2528
2529 r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
2530
2531 vcpu_put(vcpu);
2532
2533 return r;
2534}
2535
2536static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2537 unsigned long address,
2538 int *type)
2539{
2540 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2541 unsigned long pgoff;
2542 struct page *page;
2543
2544 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2545 if (pgoff == 0)
2546 page = virt_to_page(vcpu->run);
2547 else if (pgoff == KVM_PIO_PAGE_OFFSET)
2548 page = virt_to_page(vcpu->pio_data);
2549 else
2550 return NOPAGE_SIGBUS;
2551 get_page(page);
2552 if (type != NULL)
2553 *type = VM_FAULT_MINOR;
2554
2555 return page;
2556}
2557
2558static struct vm_operations_struct kvm_vcpu_vm_ops = {
2559 .nopage = kvm_vcpu_nopage,
2560};
2561
2562static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2563{
2564 vma->vm_ops = &kvm_vcpu_vm_ops;
2565 return 0;
2566}
2567
2568static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2569{
2570 struct kvm_vcpu *vcpu = filp->private_data;
2571
2572 fput(vcpu->kvm->filp);
2573 return 0;
2574}
2575
2576static struct file_operations kvm_vcpu_fops = {
2577 .release = kvm_vcpu_release,
2578 .unlocked_ioctl = kvm_vcpu_ioctl,
2579 .compat_ioctl = kvm_vcpu_ioctl,
2580 .mmap = kvm_vcpu_mmap,
2581};
2582
2583/*
2584 * Allocates an inode for the vcpu.
2585 */
2586static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2587{
2588 int fd, r;
2589 struct inode *inode;
2590 struct file *file;
2591
2592 r = anon_inode_getfd(&fd, &inode, &file,
2593 "kvm-vcpu", &kvm_vcpu_fops, vcpu);
2594 if (r)
2595 return r;
2596 atomic_inc(&vcpu->kvm->filp->f_count);
2597 return fd;
2598}
2599
2600/*
2601 * Creates some virtual cpus. Good luck creating more than one.
2602 */
2603static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2604{
2605 int r;
2606 struct kvm_vcpu *vcpu;
2607
2608 if (!valid_vcpu(n))
2609 return -EINVAL;
2610
2611 vcpu = kvm_x86_ops->vcpu_create(kvm, n);
2612 if (IS_ERR(vcpu))
2613 return PTR_ERR(vcpu);
2614
2615 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
2616
2617 /* We do fxsave: this must be aligned. */
2618 BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
2619
2620 vcpu_load(vcpu);
2621 r = kvm_mmu_setup(vcpu);
2622 vcpu_put(vcpu);
2623 if (r < 0)
2624 goto free_vcpu;
2625
2626 mutex_lock(&kvm->lock);
2627 if (kvm->vcpus[n]) {
2628 r = -EEXIST;
2629 mutex_unlock(&kvm->lock);
2630 goto mmu_unload;
2631 }
2632 kvm->vcpus[n] = vcpu;
2633 mutex_unlock(&kvm->lock);
2634
2635 /* Now it's all set up, let userspace reach it */
2636 r = create_vcpu_fd(vcpu);
2637 if (r < 0)
2638 goto unlink;
2639 return r;
2640
2641unlink:
2642 mutex_lock(&kvm->lock);
2643 kvm->vcpus[n] = NULL;
2644 mutex_unlock(&kvm->lock);
2645
2646mmu_unload:
2647 vcpu_load(vcpu);
2648 kvm_mmu_unload(vcpu);
2649 vcpu_put(vcpu);
2650
2651free_vcpu:
2652 kvm_x86_ops->vcpu_free(vcpu);
2653 return r;
2654}
2655
2656static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2657{
2658 u64 efer;
2659 int i;
2660 struct kvm_cpuid_entry *e, *entry;
2661
2662 rdmsrl(MSR_EFER, efer);
2663 entry = NULL;
2664 for (i = 0; i < vcpu->cpuid_nent; ++i) {
2665 e = &vcpu->cpuid_entries[i];
2666 if (e->function == 0x80000001) {
2667 entry = e;
2668 break;
2669 }
2670 }
2671 if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
2672 entry->edx &= ~(1 << 20);
2673 printk(KERN_INFO "kvm: guest NX capability removed\n");
2674 }
2675}
2676
2677static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2678 struct kvm_cpuid *cpuid,
2679 struct kvm_cpuid_entry __user *entries)
2680{
2681 int r;
2682
2683 r = -E2BIG;
2684 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2685 goto out;
2686 r = -EFAULT;
2687 if (copy_from_user(&vcpu->cpuid_entries, entries,
2688 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2689 goto out;
2690 vcpu->cpuid_nent = cpuid->nent;
2691 cpuid_fix_nx_cap(vcpu);
2692 return 0;
2693
2694out:
2695 return r;
2696}
2697
2698static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2699{
2700 if (sigset) {
2701 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2702 vcpu->sigset_active = 1;
2703 vcpu->sigset = *sigset;
2704 } else
2705 vcpu->sigset_active = 0;
2706 return 0;
2707}
2708
2709/*
2710 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
2711 * we have asm/x86/processor.h
2712 */
2713struct fxsave {
2714 u16 cwd;
2715 u16 swd;
2716 u16 twd;
2717 u16 fop;
2718 u64 rip;
2719 u64 rdp;
2720 u32 mxcsr;
2721 u32 mxcsr_mask;
2722 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
2723#ifdef CONFIG_X86_64
2724 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
2725#else
2726 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
2727#endif
2728};
2729
2730static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2731{
2732 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2733
2734 vcpu_load(vcpu);
2735
2736 memcpy(fpu->fpr, fxsave->st_space, 128);
2737 fpu->fcw = fxsave->cwd;
2738 fpu->fsw = fxsave->swd;
2739 fpu->ftwx = fxsave->twd;
2740 fpu->last_opcode = fxsave->fop;
2741 fpu->last_ip = fxsave->rip;
2742 fpu->last_dp = fxsave->rdp;
2743 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2744
2745 vcpu_put(vcpu);
2746
2747 return 0;
2748}
2749
2750static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2751{
2752 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2753
2754 vcpu_load(vcpu);
2755
2756 memcpy(fxsave->st_space, fpu->fpr, 128);
2757 fxsave->cwd = fpu->fcw;
2758 fxsave->swd = fpu->fsw;
2759 fxsave->twd = fpu->ftwx;
2760 fxsave->fop = fpu->last_opcode;
2761 fxsave->rip = fpu->last_ip;
2762 fxsave->rdp = fpu->last_dp;
2763 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2764
2765 vcpu_put(vcpu);
2766
2767 return 0;
2768}
2769
2770static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2771 struct kvm_lapic_state *s)
2772{
2773 vcpu_load(vcpu);
2774 memcpy(s->regs, vcpu->apic->regs, sizeof *s);
2775 vcpu_put(vcpu);
2776
2777 return 0;
2778}
2779
2780static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2781 struct kvm_lapic_state *s)
2782{
2783 vcpu_load(vcpu);
2784 memcpy(vcpu->apic->regs, s->regs, sizeof *s);
2785 kvm_apic_post_state_restore(vcpu);
2786 vcpu_put(vcpu);
2787
2788 return 0;
2789}
2790
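/*
 * Dispatcher for the per-vcpu ioctls: KVM_RUN, the register, sreg,
 * MSR, CPUID, FPU and LAPIC accessors, interrupt injection, signal
 * mask handling and guest debugging.
 */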
2791static long kvm_vcpu_ioctl(struct file *filp,
2792 unsigned int ioctl, unsigned long arg)
2793{
2794 struct kvm_vcpu *vcpu = filp->private_data;
2795 void __user *argp = (void __user *)arg;
2796 int r = -EINVAL;
2797
2798 switch (ioctl) {
2799 case KVM_RUN:
2800 r = -EINVAL;
2801 if (arg)
2802 goto out;
2803 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2804 break;
2805 case KVM_GET_REGS: {
2806 struct kvm_regs kvm_regs;
2807
2808 memset(&kvm_regs, 0, sizeof kvm_regs);
2809 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2810 if (r)
2811 goto out;
2812 r = -EFAULT;
2813 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2814 goto out;
2815 r = 0;
2816 break;
2817 }
2818 case KVM_SET_REGS: {
2819 struct kvm_regs kvm_regs;
2820
2821 r = -EFAULT;
2822 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2823 goto out;
2824 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2825 if (r)
2826 goto out;
2827 r = 0;
2828 break;
2829 }
2830 case KVM_GET_SREGS: {
2831 struct kvm_sregs kvm_sregs;
2832
2833 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2834 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2835 if (r)
2836 goto out;
2837 r = -EFAULT;
2838 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2839 goto out;
2840 r = 0;
2841 break;
2842 }
2843 case KVM_SET_SREGS: {
2844 struct kvm_sregs kvm_sregs;
2845
2846 r = -EFAULT;
2847 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2848 goto out;
2849 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2850 if (r)
2851 goto out;
2852 r = 0;
2853 break;
2854 }
2855 case KVM_TRANSLATE: {
2856 struct kvm_translation tr;
2857
2858 r = -EFAULT;
2859 if (copy_from_user(&tr, argp, sizeof tr))
2860 goto out;
2861 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2862 if (r)
2863 goto out;
2864 r = -EFAULT;
2865 if (copy_to_user(argp, &tr, sizeof tr))
2866 goto out;
2867 r = 0;
2868 break;
2869 }
2870 case KVM_INTERRUPT: {
2871 struct kvm_interrupt irq;
2872
2873 r = -EFAULT;
2874 if (copy_from_user(&irq, argp, sizeof irq))
2875 goto out;
2876 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2877 if (r)
2878 goto out;
2879 r = 0;
2880 break;
2881 }
2882 case KVM_DEBUG_GUEST: {
2883 struct kvm_debug_guest dbg;
2884
2885 r = -EFAULT;
2886 if (copy_from_user(&dbg, argp, sizeof dbg))
2887 goto out;
2888 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2889 if (r)
2890 goto out;
2891 r = 0;
2892 break;
2893 }
2894 case KVM_GET_MSRS:
2895 r = msr_io(vcpu, argp, kvm_get_msr, 1);
2896 break;
2897 case KVM_SET_MSRS:
2898 r = msr_io(vcpu, argp, do_set_msr, 0);
2899 break;
2900 case KVM_SET_CPUID: {
2901 struct kvm_cpuid __user *cpuid_arg = argp;
2902 struct kvm_cpuid cpuid;
2903
2904 r = -EFAULT;
2905 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2906 goto out;
2907 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2908 if (r)
2909 goto out;
2910 break;
2911 }
2912 case KVM_SET_SIGNAL_MASK: {
2913 struct kvm_signal_mask __user *sigmask_arg = argp;
2914 struct kvm_signal_mask kvm_sigmask;
2915 sigset_t sigset, *p;
2916
2917 p = NULL;
2918 if (argp) {
2919 r = -EFAULT;
2920 if (copy_from_user(&kvm_sigmask, argp,
2921 sizeof kvm_sigmask))
2922 goto out;
2923 r = -EINVAL;
2924 if (kvm_sigmask.len != sizeof sigset)
2925 goto out;
2926 r = -EFAULT;
2927 if (copy_from_user(&sigset, sigmask_arg->sigset,
2928 sizeof sigset))
2929 goto out;
2930 p = &sigset;
2931 }
2932 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2933 break;
2934 }
2935 case KVM_GET_FPU: {
2936 struct kvm_fpu fpu;
2937
2938 memset(&fpu, 0, sizeof fpu);
2939 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2940 if (r)
2941 goto out;
2942 r = -EFAULT;
2943 if (copy_to_user(argp, &fpu, sizeof fpu))
2944 goto out;
2945 r = 0;
2946 break;
2947 }
2948 case KVM_SET_FPU: {
2949 struct kvm_fpu fpu;
2950
2951 r = -EFAULT;
2952 if (copy_from_user(&fpu, argp, sizeof fpu))
2953 goto out;
2954 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2955 if (r)
2956 goto out;
2957 r = 0;
2958 break;
2959 }
2960 case KVM_GET_LAPIC: {
2961 struct kvm_lapic_state lapic;
2962
2963 memset(&lapic, 0, sizeof lapic);
2964 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
2965 if (r)
2966 goto out;
2967 r = -EFAULT;
2968 if (copy_to_user(argp, &lapic, sizeof lapic))
2969 goto out;
2970 r = 0;
2971 break;
2972 }
2973 case KVM_SET_LAPIC: {
2974 struct kvm_lapic_state lapic;
2975
2976 r = -EFAULT;
2977 if (copy_from_user(&lapic, argp, sizeof lapic))
2978 goto out;
2979 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
2980 if (r)
2981 goto out;
2982 r = 0;
2983 break;
2984 }
2985 default:
2986 ;
2987 }
2988out:
2989 return r;
2990}
2991
2992static long kvm_vm_ioctl(struct file *filp,
2993 unsigned int ioctl, unsigned long arg)
2994{
2995 struct kvm *kvm = filp->private_data;
2996 void __user *argp = (void __user *)arg;
2997 int r = -EINVAL;
2998
2999 switch (ioctl) {
3000 case KVM_CREATE_VCPU:
3001 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
3002 if (r < 0)
3003 goto out;
3004 break;
3005 case KVM_SET_MEMORY_REGION: {
3006 struct kvm_memory_region kvm_mem;
3007
3008 r = -EFAULT;
3009 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
3010 goto out;
3011 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
3012 if (r)
3013 goto out;
3014 break;
3015 }
3016 case KVM_GET_DIRTY_LOG: {
3017 struct kvm_dirty_log log;
3018
3019 r = -EFAULT;
3020 if (copy_from_user(&log, argp, sizeof log))
3021 goto out;
3022 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3023 if (r)
3024 goto out;
3025 break;
3026 }
3027 case KVM_SET_MEMORY_ALIAS: {
3028 struct kvm_memory_alias alias;
3029
3030 r = -EFAULT;
3031 if (copy_from_user(&alias, argp, sizeof alias))
3032 goto out;
3033 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
3034 if (r)
3035 goto out;
3036 break;
3037 }
3038 case KVM_CREATE_IRQCHIP:
3039 r = -ENOMEM;
3040 kvm->vpic = kvm_create_pic(kvm);
3041 if (kvm->vpic) {
3042 r = kvm_ioapic_init(kvm);
3043 if (r) {
3044 kfree(kvm->vpic);
3045 kvm->vpic = NULL;
3046 goto out;
3047 }
3048 }
3049 else
3050 goto out;
3051 break;
3052 case KVM_IRQ_LINE: {
3053 struct kvm_irq_level irq_event;
3054
3055 r = -EFAULT;
3056 if (copy_from_user(&irq_event, argp, sizeof irq_event))
3057 goto out;
3058 if (irqchip_in_kernel(kvm)) {
3059 mutex_lock(&kvm->lock);
3060 if (irq_event.irq < 16)
3061 kvm_pic_set_irq(pic_irqchip(kvm),
3062 irq_event.irq,
3063 irq_event.level);
3064 kvm_ioapic_set_irq(kvm->vioapic,
3065 irq_event.irq,
3066 irq_event.level);
3067 mutex_unlock(&kvm->lock);
3068 r = 0;
3069 }
3070 break;
3071 }
3072 case KVM_GET_IRQCHIP: {
3073 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3074 struct kvm_irqchip chip;
3075
3076 r = -EFAULT;
3077 if (copy_from_user(&chip, argp, sizeof chip))
3078 goto out;
3079 r = -ENXIO;
3080 if (!irqchip_in_kernel(kvm))
3081 goto out;
3082 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
3083 if (r)
3084 goto out;
3085 r = -EFAULT;
3086 if (copy_to_user(argp, &chip, sizeof chip))
3087 goto out;
3088 r = 0;
3089 break;
3090 }
3091 case KVM_SET_IRQCHIP: {
3092 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3093 struct kvm_irqchip chip;
3094
3095 r = -EFAULT;
3096 if (copy_from_user(&chip, argp, sizeof chip))
3097 goto out;
3098 r = -ENXIO;
3099 if (!irqchip_in_kernel(kvm))
3100 goto out;
3101 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
3102 if (r)
3103 goto out;
3104 r = 0;
3105 break;
3106 }
3107 default:
3108 ;
3109 }
3110out:
3111 return r;
3112}
3113
3114static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
3115 unsigned long address,
3116 int *type)
3117{
3118 struct kvm *kvm = vma->vm_file->private_data;
3119 unsigned long pgoff;
3120 struct page *page;
3121
3122 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3123 page = gfn_to_page(kvm, pgoff);
3124 if (!page)
3125 return NOPAGE_SIGBUS;
3126 get_page(page);
3127 if (type != NULL)
3128 *type = VM_FAULT_MINOR;
3129
3130 return page;
3131}
3132
3133static struct vm_operations_struct kvm_vm_vm_ops = {
3134 .nopage = kvm_vm_nopage,
3135};
3136
3137static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
3138{
3139 vma->vm_ops = &kvm_vm_vm_ops;
3140 return 0;
3141}
3142
3143static struct file_operations kvm_vm_fops = {
3144 .release = kvm_vm_release,
3145 .unlocked_ioctl = kvm_vm_ioctl,
3146 .compat_ioctl = kvm_vm_ioctl,
3147 .mmap = kvm_vm_mmap,
3148};
3149
3150static int kvm_dev_ioctl_create_vm(void)
3151{
3152 int fd, r;
3153 struct inode *inode;
3154 struct file *file;
3155 struct kvm *kvm;
3156
3157 kvm = kvm_create_vm();
3158 if (IS_ERR(kvm))
3159 return PTR_ERR(kvm);
3160 r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
3161 if (r) {
3162 kvm_destroy_vm(kvm);
3163 return r;
3164 }
3165
3166 kvm->filp = file;
3167
3168 return fd;
3169}
3170
3171static long kvm_dev_ioctl(struct file *filp,
3172 unsigned int ioctl, unsigned long arg)
3173{
3174 void __user *argp = (void __user *)arg;
3175 long r = -EINVAL;
3176
3177 switch (ioctl) {
3178 case KVM_GET_API_VERSION:
3179 r = -EINVAL;
3180 if (arg)
3181 goto out;
3182 r = KVM_API_VERSION;
3183 break;
3184 case KVM_CREATE_VM:
3185 r = -EINVAL;
3186 if (arg)
3187 goto out;
3188 r = kvm_dev_ioctl_create_vm();
3189 break;
3190 case KVM_GET_MSR_INDEX_LIST: {
3191 struct kvm_msr_list __user *user_msr_list = argp;
3192 struct kvm_msr_list msr_list;
3193 unsigned n;
3194
3195 r = -EFAULT;
3196 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
3197 goto out;
3198 n = msr_list.nmsrs;
3199 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
3200 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
3201 goto out;
3202 r = -E2BIG;
3203 if (n < num_msrs_to_save)
3204 goto out;
3205 r = -EFAULT;
3206 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3207 num_msrs_to_save * sizeof(u32)))
3208 goto out;
3209 if (copy_to_user(user_msr_list->indices
3210 + num_msrs_to_save,
3211 &emulated_msrs,
3212 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
3213 goto out;
3214 r = 0;
3215 break;
3216 }
3217 case KVM_CHECK_EXTENSION: {
3218 int ext = (long)argp;
3219
3220 switch (ext) {
3221 case KVM_CAP_IRQCHIP:
3222 case KVM_CAP_HLT:
3223 r = 1;
3224 break;
3225 default:
3226 r = 0;
3227 break;
3228 }
3229 break;
3230 }
3231 case KVM_GET_VCPU_MMAP_SIZE:
3232 r = -EINVAL;
3233 if (arg)
3234 goto out;
3235 r = 2 * PAGE_SIZE;
3236 break;
3237 default:
3238 ;
3239 }
3240out:
3241 return r;
3242}
3243
3244static struct file_operations kvm_chardev_ops = {
3245 .unlocked_ioctl = kvm_dev_ioctl,
3246 .compat_ioctl = kvm_dev_ioctl,
3247};
3248
3249static struct miscdevice kvm_dev = {
3250 KVM_MINOR,
3251 "kvm",
3252 &kvm_chardev_ops,
3253};
3254
3255/*
3256 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
3257 * cached on it.
3258 */
3259static void decache_vcpus_on_cpu(int cpu)
3260{
3261 struct kvm *vm;
3262 struct kvm_vcpu *vcpu;
3263 int i;
3264
3265 spin_lock(&kvm_lock);
3266 list_for_each_entry(vm, &vm_list, vm_list)
3267 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3268 vcpu = vm->vcpus[i];
3269 if (!vcpu)
3270 continue;
3271 /*
3272 * If the vcpu is locked, then it is running on some
3273 * other cpu and therefore it is not cached on the
3274 * cpu in question.
3275 *
3276 * If it's not locked, check the last cpu it executed
3277 * on.
3278 */
3279 if (mutex_trylock(&vcpu->mutex)) {
3280 if (vcpu->cpu == cpu) {
3281 kvm_x86_ops->vcpu_decache(vcpu);
3282 vcpu->cpu = -1;
3283 }
3284 mutex_unlock(&vcpu->mutex);
3285 }
3286 }
3287 spin_unlock(&kvm_lock);
3288}
3289
3290static void hardware_enable(void *junk)
3291{
3292 int cpu = raw_smp_processor_id();
3293
3294 if (cpu_isset(cpu, cpus_hardware_enabled))
3295 return;
3296 cpu_set(cpu, cpus_hardware_enabled);
3297 kvm_x86_ops->hardware_enable(NULL);
3298}
3299
3300static void hardware_disable(void *junk)
3301{
3302 int cpu = raw_smp_processor_id();
3303
3304 if (!cpu_isset(cpu, cpus_hardware_enabled))
3305 return;
3306 cpu_clear(cpu, cpus_hardware_enabled);
3307 decache_vcpus_on_cpu(cpu);
3308 kvm_x86_ops->hardware_disable(NULL);
3309}
3310
3311static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
3312 void *v)
3313{
3314 int cpu = (long)v;
3315
3316 switch (val) {
3317 case CPU_DYING:
3318 case CPU_DYING_FROZEN:
3319 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3320 cpu);
3321 hardware_disable(NULL);
3322 break;
3323 case CPU_UP_CANCELED:
3324 case CPU_UP_CANCELED_FROZEN:
3325 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3326 cpu);
3327 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
3328 break;
3329 case CPU_ONLINE:
3330 case CPU_ONLINE_FROZEN:
3331 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3332 cpu);
3333 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
3334 break;
3335 }
3336 return NOTIFY_OK;
3337}
3338
3339static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
3340 void *v)
3341{
3342 if (val == SYS_RESTART) {
3343 /*
3344 * Some (well, at least mine) BIOSes hang on reboot if
3345 * in vmx root mode.
3346 */
3347 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
3348 on_each_cpu(hardware_disable, NULL, 0, 1);
3349 }
3350 return NOTIFY_OK;
3351}
3352
3353static struct notifier_block kvm_reboot_notifier = {
3354 .notifier_call = kvm_reboot,
3355 .priority = 0,
3356};
3357
3358void kvm_io_bus_init(struct kvm_io_bus *bus)
3359{
3360 memset(bus, 0, sizeof(*bus));
3361}
3362
3363void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3364{
3365 int i;
3366
3367 for (i = 0; i < bus->dev_count; i++) {
3368 struct kvm_io_device *pos = bus->devs[i];
3369
3370 kvm_iodevice_destructor(pos);
3371 }
3372}
3373
3374struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
3375{
3376 int i;
3377
3378 for (i = 0; i < bus->dev_count; i++) {
3379 struct kvm_io_device *pos = bus->devs[i];
3380
3381 if (pos->in_range(pos, addr))
3382 return pos;
3383 }
3384
3385 return NULL;
3386}
3387
3388void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3389{
3390 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3391
3392 bus->devs[bus->dev_count++] = dev;
3393}
3394
3395static struct notifier_block kvm_cpu_notifier = {
3396 .notifier_call = kvm_cpu_hotplug,
3397 .priority = 20, /* must be > scheduler priority */
3398};
3399
3400static u64 stat_get(void *_offset)
3401{
3402 unsigned offset = (long)_offset;
3403 u64 total = 0;
3404 struct kvm *kvm;
3405 struct kvm_vcpu *vcpu;
3406 int i;
3407
3408 spin_lock(&kvm_lock);
3409 list_for_each_entry(kvm, &vm_list, vm_list)
3410 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3411 vcpu = kvm->vcpus[i];
3412 if (vcpu)
3413 total += *(u32 *)((void *)vcpu + offset);
3414 }
3415 spin_unlock(&kvm_lock);
3416 return total;
3417}
3418
3419DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
3420
3421static __init void kvm_init_debug(void)
3422{
3423 struct kvm_stats_debugfs_item *p;
3424
3425 debugfs_dir = debugfs_create_dir("kvm", NULL);
3426 for (p = debugfs_entries; p->name; ++p)
3427 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3428 (void *)(long)p->offset,
3429 &stat_fops);
3430}
3431
3432static void kvm_exit_debug(void)
3433{
3434 struct kvm_stats_debugfs_item *p;
3435
3436 for (p = debugfs_entries; p->name; ++p)
3437 debugfs_remove(p->dentry);
3438 debugfs_remove(debugfs_dir);
3439}
3440
3441static int kvm_suspend(struct sys_device *dev, pm_message_t state)
3442{
3443 hardware_disable(NULL);
3444 return 0;
3445}
3446
3447static int kvm_resume(struct sys_device *dev)
3448{
3449 hardware_enable(NULL);
3450 return 0;
3451}
3452
3453static struct sysdev_class kvm_sysdev_class = {
3454 .name = "kvm",
3455 .suspend = kvm_suspend,
3456 .resume = kvm_resume,
3457};
3458
3459static struct sys_device kvm_sysdev = {
3460 .id = 0,
3461 .cls = &kvm_sysdev_class,
3462};
3463
3464hpa_t bad_page_address;
3465
3466static inline
3467struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
3468{
3469 return container_of(pn, struct kvm_vcpu, preempt_notifier);
3470}
3471
3472static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
3473{
3474 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3475
3476 kvm_x86_ops->vcpu_load(vcpu, cpu);
3477}
3478
3479static void kvm_sched_out(struct preempt_notifier *pn,
3480 struct task_struct *next)
3481{
3482 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3483
3484 kvm_x86_ops->vcpu_put(vcpu);
3485}
3486
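/*
 * Registration entry point for the hardware-specific module (vmx or svm).
 * Set-up order: hardware_setup, per-cpu compatibility check, hardware_enable
 * on all online cpus, cpu-hotplug and reboot notifiers, sysdev class and
 * device, the vcpu kmem cache, and finally the misc character device.  The
 * out_free_* labels unwind these steps in reverse order.
 */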
3487int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
3488 struct module *module)
3489{
3490 int r;
3491 int cpu;
3492
3493 if (kvm_x86_ops) {
3494 		printk(KERN_ERR "kvm: another hardware module is already loaded\n");
3495 return -EEXIST;
3496 }
3497
3498 if (!ops->cpu_has_kvm_support()) {
3499 printk(KERN_ERR "kvm: no hardware support\n");
3500 return -EOPNOTSUPP;
3501 }
3502 if (ops->disabled_by_bios()) {
3503 printk(KERN_ERR "kvm: disabled by bios\n");
3504 return -EOPNOTSUPP;
3505 }
3506
3507 kvm_x86_ops = ops;
3508
3509 r = kvm_x86_ops->hardware_setup();
3510 if (r < 0)
3511 goto out;
3512
3513 for_each_online_cpu(cpu) {
3514 smp_call_function_single(cpu,
3515 kvm_x86_ops->check_processor_compatibility,
3516 &r, 0, 1);
3517 if (r < 0)
3518 goto out_free_0;
3519 }
3520
3521 on_each_cpu(hardware_enable, NULL, 0, 1);
3522 r = register_cpu_notifier(&kvm_cpu_notifier);
3523 if (r)
3524 goto out_free_1;
3525 register_reboot_notifier(&kvm_reboot_notifier);
3526
3527 r = sysdev_class_register(&kvm_sysdev_class);
3528 if (r)
3529 goto out_free_2;
3530
3531 r = sysdev_register(&kvm_sysdev);
3532 if (r)
3533 goto out_free_3;
3534
3535 /* A kmem cache lets us meet the alignment requirements of fx_save. */
3536 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
3537 __alignof__(struct kvm_vcpu), 0, 0);
3538 if (!kvm_vcpu_cache) {
3539 r = -ENOMEM;
3540 goto out_free_4;
3541 }
3542
3543 kvm_chardev_ops.owner = module;
3544
3545 r = misc_register(&kvm_dev);
3546 if (r) {
3547 		printk(KERN_ERR "kvm: misc device register failed\n");
3548 goto out_free;
3549 }
3550
3551 kvm_preempt_ops.sched_in = kvm_sched_in;
3552 kvm_preempt_ops.sched_out = kvm_sched_out;
3553
3554 return r;
3555
3556out_free:
3557 kmem_cache_destroy(kvm_vcpu_cache);
3558out_free_4:
3559 sysdev_unregister(&kvm_sysdev);
3560out_free_3:
3561 sysdev_class_unregister(&kvm_sysdev_class);
3562out_free_2:
3563 unregister_reboot_notifier(&kvm_reboot_notifier);
3564 unregister_cpu_notifier(&kvm_cpu_notifier);
3565out_free_1:
3566 on_each_cpu(hardware_disable, NULL, 0, 1);
3567out_free_0:
3568 kvm_x86_ops->hardware_unsetup();
3569out:
3570 kvm_x86_ops = NULL;
3571 return r;
3572}
3573
3574void kvm_exit_x86(void)
3575{
3576 misc_deregister(&kvm_dev);
3577 kmem_cache_destroy(kvm_vcpu_cache);
3578 sysdev_unregister(&kvm_sysdev);
3579 sysdev_class_unregister(&kvm_sysdev_class);
3580 unregister_reboot_notifier(&kvm_reboot_notifier);
3581 unregister_cpu_notifier(&kvm_cpu_notifier);
3582 on_each_cpu(hardware_disable, NULL, 0, 1);
3583 kvm_x86_ops->hardware_unsetup();
3584 kvm_x86_ops = NULL;
3585}
3586
3587static __init int kvm_init(void)
3588{
3589 static struct page *bad_page;
3590 int r;
3591
3592 r = kvm_mmu_module_init();
3593 if (r)
3594 goto out4;
3595
3596 kvm_init_debug();
3597
3598 kvm_init_msr_list();
3599
3600 if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3601 r = -ENOMEM;
3602 goto out;
3603 }
3604
3605 bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3606 memset(__va(bad_page_address), 0, PAGE_SIZE);
3607
3608 return 0;
3609
3610out:
3611 kvm_exit_debug();
3612 kvm_mmu_module_exit();
3613out4:
3614 return r;
3615}
3616
3617static __exit void kvm_exit(void)
3618{
3619 kvm_exit_debug();
3620 __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3621 kvm_mmu_module_exit();
3622}
3623
3624module_init(kvm_init)
3625module_exit(kvm_exit)
3626
3627EXPORT_SYMBOL_GPL(kvm_init_x86);
3628EXPORT_SYMBOL_GPL(kvm_exit_x86);
diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h
deleted file mode 100644
index a0e415daef5b..000000000000
--- a/drivers/kvm/kvm_svm.h
+++ /dev/null
@@ -1,45 +0,0 @@
1#ifndef __KVM_SVM_H
2#define __KVM_SVM_H
3
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/list.h>
7#include <asm/msr.h>
8
9#include "svm.h"
10#include "kvm.h"
11
12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64
14 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
15 MSR_FS_BASE,
16#endif
17 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
18};
19
20#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
21#define NUM_DB_REGS 4
22
23struct kvm_vcpu;
24
25struct vcpu_svm {
26 struct kvm_vcpu vcpu;
27 struct vmcb *vmcb;
28 unsigned long vmcb_pa;
29 struct svm_cpu_data *svm_data;
30 uint64_t asid_generation;
31
32 unsigned long db_regs[NUM_DB_REGS];
33
34 u64 next_rip;
35
36 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
37 u64 host_gs_base;
38 unsigned long host_cr2;
39 unsigned long host_db_regs[NUM_DB_REGS];
40 unsigned long host_dr6;
41 unsigned long host_dr7;
42};
43
44#endif
45
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
deleted file mode 100644
index 238fcad3cece..000000000000
--- a/drivers/kvm/lapic.c
+++ /dev/null
@@ -1,1080 +0,0 @@
1
2/*
3 * Local APIC virtualization
4 *
5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel
8 *
9 * Authors:
10 * Dor Laor <dor.laor@qumranet.com>
11 * Gregory Haskins <ghaskins@novell.com>
12 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
13 *
14 * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 */
19
20#include "kvm.h"
21#include <linux/kvm.h>
22#include <linux/mm.h>
23#include <linux/highmem.h>
24#include <linux/smp.h>
25#include <linux/hrtimer.h>
26#include <linux/io.h>
27#include <linux/module.h>
28#include <asm/processor.h>
29#include <asm/msr.h>
30#include <asm/page.h>
31#include <asm/current.h>
32#include <asm/apicdef.h>
33#include <asm/atomic.h>
34#include <asm/div64.h>
35#include "irq.h"
36
37#define PRId64 "d"
38#define PRIx64 "llx"
39#define PRIu64 "u"
40#define PRIo64 "o"
41
42#define APIC_BUS_CYCLE_NS 1
43
44/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
45#define apic_debug(fmt, arg...)
46
47#define APIC_LVT_NUM 6
48/* 0x14 is the local APIC version of the Xeon and Pentium 4 family; see SDM 8.4.8 */
49#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
50#define LAPIC_MMIO_LENGTH (1 << 12)
51/* the following defines are not in apicdef.h */
52#define APIC_SHORT_MASK 0xc0000
53#define APIC_DEST_NOSHORT 0x0
54#define APIC_DEST_MASK 0x800
55#define MAX_APIC_VECTOR 256
56
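/*
 * The 256 vectors of the IRR/ISR/TMR bitmaps are spread over eight 32-bit
 * registers spaced 16 bytes apart: VEC_POS() is the bit within a register
 * and REG_POS() the byte offset of that register.  For example, vector
 * 0x31 is bit 17 of the word at offset 0x10.
 */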
57#define VEC_POS(v) ((v) & (32 - 1))
58#define REG_POS(v) (((v) >> 5) << 4)
59static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
60{
61 return *((u32 *) (apic->regs + reg_off));
62}
63
64static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
65{
66 *((u32 *) (apic->regs + reg_off)) = val;
67}
68
69static inline int apic_test_and_set_vector(int vec, void *bitmap)
70{
71 return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
72}
73
74static inline int apic_test_and_clear_vector(int vec, void *bitmap)
75{
76 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
77}
78
79static inline void apic_set_vector(int vec, void *bitmap)
80{
81 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
82}
83
84static inline void apic_clear_vector(int vec, void *bitmap)
85{
86 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
87}
88
89static inline int apic_hw_enabled(struct kvm_lapic *apic)
90{
91 return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE;
92}
93
94static inline int apic_sw_enabled(struct kvm_lapic *apic)
95{
96 return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
97}
98
99static inline int apic_enabled(struct kvm_lapic *apic)
100{
101 return apic_sw_enabled(apic) && apic_hw_enabled(apic);
102}
103
104#define LVT_MASK \
105 (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
106
107#define LINT_MASK \
108 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
109 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
110
111static inline int kvm_apic_id(struct kvm_lapic *apic)
112{
113 return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
114}
115
116static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
117{
118 return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
119}
120
121static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
122{
123 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
124}
125
126static inline int apic_lvtt_period(struct kvm_lapic *apic)
127{
128 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
129}
130
131static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
132 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
133 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
134 LVT_MASK | APIC_MODE_MASK, /* LVTPC */
135 LINT_MASK, LINT_MASK, /* LVT0-1 */
136 LVT_MASK /* LVTERR */
137};
138
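/*
 * Scan the eight vector words from highest to lowest and return the highest
 * vector set, or -1 if none is.  The "<< 2" turns a register index into a
 * u32 index, since consecutive registers are 16 bytes (four u32s) apart.
 */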
139static int find_highest_vector(void *bitmap)
140{
141 u32 *word = bitmap;
142 int word_offset = MAX_APIC_VECTOR >> 5;
143
144 while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
145 continue;
146
147 if (likely(!word_offset && !word[0]))
148 return -1;
149 else
150 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
151}
152
153static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
154{
155 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
156}
157
158static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
159{
160 apic_clear_vector(vec, apic->regs + APIC_IRR);
161}
162
163static inline int apic_find_highest_irr(struct kvm_lapic *apic)
164{
165 int result;
166
167 result = find_highest_vector(apic->regs + APIC_IRR);
168 ASSERT(result == -1 || result >= 16);
169
170 return result;
171}
172
173int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
174{
175 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
176 int highest_irr;
177
178 if (!apic)
179 return 0;
180 highest_irr = apic_find_highest_irr(apic);
181
182 return highest_irr;
183}
184EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
185
186int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig)
187{
188 if (!apic_test_and_set_irr(vec, apic)) {
189 /* a new pending irq is set in IRR */
190 if (trig)
191 apic_set_vector(vec, apic->regs + APIC_TMR);
192 else
193 apic_clear_vector(vec, apic->regs + APIC_TMR);
194 kvm_vcpu_kick(apic->vcpu);
195 return 1;
196 }
197 return 0;
198}
199
200static inline int apic_find_highest_isr(struct kvm_lapic *apic)
201{
202 int result;
203
204 result = find_highest_vector(apic->regs + APIC_ISR);
205 ASSERT(result == -1 || result >= 16);
206
207 return result;
208}
209
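/*
 * Recompute the processor priority register: PPR is the task priority,
 * unless the priority class (high nibble) of the highest in-service vector
 * is greater, in which case only that class is used.
 */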
210static void apic_update_ppr(struct kvm_lapic *apic)
211{
212 u32 tpr, isrv, ppr;
213 int isr;
214
215 tpr = apic_get_reg(apic, APIC_TASKPRI);
216 isr = apic_find_highest_isr(apic);
217 isrv = (isr != -1) ? isr : 0;
218
219 if ((tpr & 0xf0) >= (isrv & 0xf0))
220 ppr = tpr & 0xff;
221 else
222 ppr = isrv & 0xf0;
223
224 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
225 apic, ppr, isr, isrv);
226
227 apic_set_reg(apic, APIC_PROCPRI, ppr);
228}
229
230static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
231{
232 apic_set_reg(apic, APIC_TASKPRI, tpr);
233 apic_update_ppr(apic);
234}
235
236int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
237{
238 return kvm_apic_id(apic) == dest;
239}
240
241int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
242{
243 int result = 0;
244 u8 logical_id;
245
246 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
247
248 switch (apic_get_reg(apic, APIC_DFR)) {
249 case APIC_DFR_FLAT:
250 if (logical_id & mda)
251 result = 1;
252 break;
253 case APIC_DFR_CLUSTER:
254 if (((logical_id >> 4) == (mda >> 0x4))
255 && (logical_id & mda & 0xf))
256 result = 1;
257 break;
258 default:
259 printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
260 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
261 break;
262 }
263
264 return result;
265}
266
267static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
268 int short_hand, int dest, int dest_mode)
269{
270 int result = 0;
271 struct kvm_lapic *target = vcpu->apic;
272
273 apic_debug("target %p, source %p, dest 0x%x, "
274 "dest_mode 0x%x, short_hand 0x%x",
275 target, source, dest, dest_mode, short_hand);
276
277	ASSERT(target != NULL);
278 switch (short_hand) {
279 case APIC_DEST_NOSHORT:
280 if (dest_mode == 0) {
281 /* Physical mode. */
282 if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
283 result = 1;
284 } else
285 /* Logical mode. */
286 result = kvm_apic_match_logical_addr(target, dest);
287 break;
288 case APIC_DEST_SELF:
289 if (target == source)
290 result = 1;
291 break;
292 case APIC_DEST_ALLINC:
293 result = 1;
294 break;
295 case APIC_DEST_ALLBUT:
296 if (target != source)
297 result = 1;
298 break;
299 default:
300 printk(KERN_WARNING "Bad dest shorthand value %x\n",
301 short_hand);
302 break;
303 }
304
305 return result;
306}
307
308/*
309 * Add a pending IRQ to the local APIC.
310 * Return 1 if the IRQ was successfully added and 0 if it was discarded.
311 */
312static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
313 int vector, int level, int trig_mode)
314{
315 int orig_irr, result = 0;
316 struct kvm_vcpu *vcpu = apic->vcpu;
317
318 switch (delivery_mode) {
319 case APIC_DM_FIXED:
320 case APIC_DM_LOWEST:
321 /* FIXME add logic for vcpu on reset */
322 if (unlikely(!apic_enabled(apic)))
323 break;
324
325 orig_irr = apic_test_and_set_irr(vector, apic);
326 if (orig_irr && trig_mode) {
327 apic_debug("level trig mode repeatedly for vector %d",
328 vector);
329 break;
330 }
331
332 if (trig_mode) {
333 apic_debug("level trig mode for vector %d", vector);
334 apic_set_vector(vector, apic->regs + APIC_TMR);
335 } else
336 apic_clear_vector(vector, apic->regs + APIC_TMR);
337
338 if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
339 kvm_vcpu_kick(vcpu);
340 else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
341 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
342 if (waitqueue_active(&vcpu->wq))
343 wake_up_interruptible(&vcpu->wq);
344 }
345
346 result = (orig_irr == 0);
347 break;
348
349 case APIC_DM_REMRD:
350 printk(KERN_DEBUG "Ignoring delivery mode 3\n");
351 break;
352
353 case APIC_DM_SMI:
354 printk(KERN_DEBUG "Ignoring guest SMI\n");
355 break;
356 case APIC_DM_NMI:
357 printk(KERN_DEBUG "Ignoring guest NMI\n");
358 break;
359
360 case APIC_DM_INIT:
361 if (level) {
362 if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
363 printk(KERN_DEBUG
364 "INIT on a runnable vcpu %d\n",
365 vcpu->vcpu_id);
366 vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
367 kvm_vcpu_kick(vcpu);
368 } else {
369 printk(KERN_DEBUG
370 "Ignoring de-assert INIT to vcpu %d\n",
371 vcpu->vcpu_id);
372 }
373
374 break;
375
376 case APIC_DM_STARTUP:
377 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
378 vcpu->vcpu_id, vector);
379 if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
380 vcpu->sipi_vector = vector;
381 vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
382 if (waitqueue_active(&vcpu->wq))
383 wake_up_interruptible(&vcpu->wq);
384 }
385 break;
386
387 default:
388 printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
389 delivery_mode);
390 break;
391 }
392 return result;
393}
394
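/*
 * Pick a target for lowest-priority delivery.  Real priority arbitration is
 * not implemented; the enabled APICs in @bitmap are simply served
 * round-robin, starting after the previously chosen vcpu.
 */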
395struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
396 unsigned long bitmap)
397{
398 int vcpu_id;
399 int last;
400 int next;
401 struct kvm_lapic *apic;
402
403 last = kvm->round_robin_prev_vcpu;
404 next = last;
405
406 do {
407 if (++next == KVM_MAX_VCPUS)
408 next = 0;
409 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
410 continue;
411 apic = kvm->vcpus[next]->apic;
412 if (apic && apic_enabled(apic))
413 break;
414 apic = NULL;
415 } while (next != last);
416 kvm->round_robin_prev_vcpu = next;
417
418 if (!apic) {
419 vcpu_id = ffs(bitmap) - 1;
420 if (vcpu_id < 0) {
421 vcpu_id = 0;
422 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
423 }
424 apic = kvm->vcpus[vcpu_id]->apic;
425 }
426
427 return apic;
428}
429
430static void apic_set_eoi(struct kvm_lapic *apic)
431{
432 int vector = apic_find_highest_isr(apic);
433
434	/*
435	 * Not every EOI write has a corresponding ISR bit set; one example
436	 * is the kernel's timer check in setup_IO_APIC().
437	 */
438 if (vector == -1)
439 return;
440
441 apic_clear_vector(vector, apic->regs + APIC_ISR);
442 apic_update_ppr(apic);
443
444 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
445 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
446}
447
448static void apic_send_ipi(struct kvm_lapic *apic)
449{
450 u32 icr_low = apic_get_reg(apic, APIC_ICR);
451 u32 icr_high = apic_get_reg(apic, APIC_ICR2);
452
453 unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
454 unsigned int short_hand = icr_low & APIC_SHORT_MASK;
455 unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
456 unsigned int level = icr_low & APIC_INT_ASSERT;
457 unsigned int dest_mode = icr_low & APIC_DEST_MASK;
458 unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
459 unsigned int vector = icr_low & APIC_VECTOR_MASK;
460
461 struct kvm_lapic *target;
462 struct kvm_vcpu *vcpu;
463 unsigned long lpr_map = 0;
464 int i;
465
466 apic_debug("icr_high 0x%x, icr_low 0x%x, "
467 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
468 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
469 icr_high, icr_low, short_hand, dest,
470 trig_mode, level, dest_mode, delivery_mode, vector);
471
472 for (i = 0; i < KVM_MAX_VCPUS; i++) {
473 vcpu = apic->vcpu->kvm->vcpus[i];
474 if (!vcpu)
475 continue;
476
477 if (vcpu->apic &&
478 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
479 if (delivery_mode == APIC_DM_LOWEST)
480 set_bit(vcpu->vcpu_id, &lpr_map);
481 else
482 __apic_accept_irq(vcpu->apic, delivery_mode,
483 vector, level, trig_mode);
484 }
485 }
486
487 if (delivery_mode == APIC_DM_LOWEST) {
488		target = kvm_apic_round_robin(apic->vcpu->kvm, vector, lpr_map);
489 if (target != NULL)
490 __apic_accept_irq(target, delivery_mode,
491 vector, level, trig_mode);
492 }
493}
494
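/*
 * Derive the current timer count from the time elapsed since the last
 * reload: TMCCT = TMICT - elapsed_ns / (bus cycle * divide count), with
 * special cases for ktime wrap-around and for periodic timers that have
 * already expired one or more times.
 */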
495static u32 apic_get_tmcct(struct kvm_lapic *apic)
496{
497 u64 counter_passed;
498 ktime_t passed, now;
499 u32 tmcct;
500
501 ASSERT(apic != NULL);
502
503 now = apic->timer.dev.base->get_time();
504 tmcct = apic_get_reg(apic, APIC_TMICT);
505
506 /* if initial count is 0, current count should also be 0 */
507 if (tmcct == 0)
508 return 0;
509
510 if (unlikely(ktime_to_ns(now) <=
511 ktime_to_ns(apic->timer.last_update))) {
512 /* Wrap around */
513 		passed = ktime_add(({
514 					(ktime_t) { .tv64 = KTIME_MAX -
515 						apic->timer.last_update.tv64 };
516 				   }),
517 				   now);
518 apic_debug("time elapsed\n");
519 } else
520 passed = ktime_sub(now, apic->timer.last_update);
521
522 counter_passed = div64_64(ktime_to_ns(passed),
523 (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
524
525 if (counter_passed > tmcct) {
526 if (unlikely(!apic_lvtt_period(apic))) {
527 /* one-shot timers stick at 0 until reset */
528 tmcct = 0;
529 } else {
530 			/*
531 			 * Periodic timers reload from APIC_TMICT when they
532 			 * hit 0.  The while loop simulates this happening N
533 			 * times.  (counter_passed %= tmcct) would also work,
534 			 * but a 64-bit modulo may be slower on 32-bit hosts.
535 			 */
536 while (counter_passed > tmcct)
537 counter_passed -= tmcct;
538 tmcct -= counter_passed;
539 }
540 } else {
541 tmcct -= counter_passed;
542 }
543
544 return tmcct;
545}
546
547static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
548{
549 u32 val = 0;
550
551 if (offset >= LAPIC_MMIO_LENGTH)
552 return 0;
553
554 switch (offset) {
555 case APIC_ARBPRI:
556 		printk(KERN_WARNING "Access to APIC ARBPRI register, "
557 		       "which only exists on P6 family CPUs\n");
558 break;
559
560 case APIC_TMCCT: /* Timer CCR */
561 val = apic_get_tmcct(apic);
562 break;
563
564 default:
565 apic_update_ppr(apic);
566 val = apic_get_reg(apic, offset);
567 break;
568 }
569
570 return val;
571}
572
573static void apic_mmio_read(struct kvm_io_device *this,
574 gpa_t address, int len, void *data)
575{
576 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
577 unsigned int offset = address - apic->base_address;
578 unsigned char alignment = offset & 0xf;
579 u32 result;
580
581 if ((alignment + len) > 4) {
582 		printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d\n",
583 (unsigned long)address, len);
584 return;
585 }
586 result = __apic_read(apic, offset & ~0xf);
587
588 switch (len) {
589 case 1:
590 case 2:
591 case 4:
592 memcpy(data, (char *)&result + alignment, len);
593 break;
594 default:
595 printk(KERN_ERR "Local APIC read with len = %x, "
596 		       "should be 1, 2, or 4\n", len);
597 break;
598 }
599}
600
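/*
 * Decode APIC_TDCR into timer.divide_count.  The divide value is encoded in
 * bits 0, 1 and 3 of the register, and (encoding + 1) mod 8 is the power of
 * two to divide by: e.g. TDCR = 0x0 divides by 2, TDCR = 0xb divides by 1.
 */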
601static void update_divide_count(struct kvm_lapic *apic)
602{
603 u32 tmp1, tmp2, tdcr;
604
605 tdcr = apic_get_reg(apic, APIC_TDCR);
606 tmp1 = tdcr & 0xf;
607 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
608 apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
609
610 apic_debug("timer divide count is 0x%x\n",
611 apic->timer.divide_count);
612}
613
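/*
 * (Re)arm the host hrtimer backing the guest timer.  The period in
 * nanoseconds is TMICT * bus cycle * divide count, programmed as an
 * absolute expiry of now + period.
 */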
614static void start_apic_timer(struct kvm_lapic *apic)
615{
616 ktime_t now = apic->timer.dev.base->get_time();
617
618 apic->timer.last_update = now;
619
620 apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
621 APIC_BUS_CYCLE_NS * apic->timer.divide_count;
622 atomic_set(&apic->timer.pending, 0);
623 hrtimer_start(&apic->timer.dev,
624 ktime_add_ns(now, apic->timer.period),
625 HRTIMER_MODE_ABS);
626
627 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
628 PRIx64 ", "
629 "timer initial count 0x%x, period %lldns, "
630 "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
631 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
632 apic_get_reg(apic, APIC_TMICT),
633 apic->timer.period,
634 ktime_to_ns(ktime_add_ns(now,
635 apic->timer.period)));
636}
637
638static void apic_mmio_write(struct kvm_io_device *this,
639 gpa_t address, int len, const void *data)
640{
641 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
642 unsigned int offset = address - apic->base_address;
643 unsigned char alignment = offset & 0xf;
644 u32 val;
645
646 	/*
647 	 * APIC registers are aligned on 128-bit boundaries, and the 32/64/128
648 	 * bit registers must be accessed through 32-bit loads and stores.
649 	 * See SDM section 8.4.1.
650 	 */
651 if (len != 4 || alignment) {
652 if (printk_ratelimit())
653 printk(KERN_ERR "apic write: bad size=%d %lx\n",
654 len, (long)address);
655 return;
656 }
657
658 val = *(u32 *) data;
659
660 /* too common printing */
661 if (offset != APIC_EOI)
662 apic_debug("%s: offset 0x%x with length 0x%x, and value is "
663 "0x%x\n", __FUNCTION__, offset, len, val);
664
665 offset &= 0xff0;
666
667 switch (offset) {
668 case APIC_ID: /* Local APIC ID */
669 apic_set_reg(apic, APIC_ID, val);
670 break;
671
672 case APIC_TASKPRI:
673 apic_set_tpr(apic, val & 0xff);
674 break;
675
676 case APIC_EOI:
677 apic_set_eoi(apic);
678 break;
679
680 case APIC_LDR:
681 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
682 break;
683
684 case APIC_DFR:
685 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
686 break;
687
688 case APIC_SPIV:
689 apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
690 if (!(val & APIC_SPIV_APIC_ENABLED)) {
691 int i;
692 u32 lvt_val;
693
694 for (i = 0; i < APIC_LVT_NUM; i++) {
695 lvt_val = apic_get_reg(apic,
696 APIC_LVTT + 0x10 * i);
697 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
698 lvt_val | APIC_LVT_MASKED);
699 }
700 atomic_set(&apic->timer.pending, 0);
701
702 }
703 break;
704
705 case APIC_ICR:
706 /* No delay here, so we always clear the pending bit */
707 apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
708 apic_send_ipi(apic);
709 break;
710
711 case APIC_ICR2:
712 apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
713 break;
714
715 case APIC_LVTT:
716 case APIC_LVTTHMR:
717 case APIC_LVTPC:
718 case APIC_LVT0:
719 case APIC_LVT1:
720 case APIC_LVTERR:
721 /* TODO: Check vector */
722 if (!apic_sw_enabled(apic))
723 val |= APIC_LVT_MASKED;
724
725 val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
726 apic_set_reg(apic, offset, val);
727
728 break;
729
730 case APIC_TMICT:
731 hrtimer_cancel(&apic->timer.dev);
732 apic_set_reg(apic, APIC_TMICT, val);
733 start_apic_timer(apic);
734 return;
735
736 case APIC_TDCR:
737 if (val & 4)
738 printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
739 apic_set_reg(apic, APIC_TDCR, val);
740 update_divide_count(apic);
741 break;
742
743 default:
744 apic_debug("Local APIC Write to read-only register %x\n",
745 offset);
746 break;
747 }
748
749}
750
751static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
752{
753 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
754 int ret = 0;
755
756
757 if (apic_hw_enabled(apic) &&
758 (addr >= apic->base_address) &&
759 (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
760 ret = 1;
761
762 return ret;
763}
764
765void kvm_free_apic(struct kvm_lapic *apic)
766{
767 if (!apic)
768 return;
769
770 hrtimer_cancel(&apic->timer.dev);
771
772 if (apic->regs_page) {
773 __free_page(apic->regs_page);
774 		apic->regs_page = NULL;
775 }
776
777 kfree(apic);
778}
779
780/*
781 *----------------------------------------------------------------------
782 * LAPIC interface
783 *----------------------------------------------------------------------
784 */
785
786void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
787{
788 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
789
790 if (!apic)
791 return;
792 apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
793}
794
795u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
796{
797 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
798 u64 tpr;
799
800 if (!apic)
801 return 0;
802 tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
803
804 return (tpr & 0xf0) >> 4;
805}
806EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
807
808void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
809{
810 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
811
812 if (!apic) {
813 value |= MSR_IA32_APICBASE_BSP;
814 vcpu->apic_base = value;
815 return;
816 }
817 if (apic->vcpu->vcpu_id)
818 value &= ~MSR_IA32_APICBASE_BSP;
819
820 vcpu->apic_base = value;
821 apic->base_address = apic->vcpu->apic_base &
822 MSR_IA32_APICBASE_BASE;
823
824 	/* with FSB-delivered interrupts, APIC functionality can be restarted */
825 	apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
826 		   "0x%lx.\n", vcpu->apic_base, apic->base_address);
827
828}
829
830u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
831{
832 return vcpu->apic_base;
833}
834EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
835
836void kvm_lapic_reset(struct kvm_vcpu *vcpu)
837{
838 struct kvm_lapic *apic;
839 int i;
840
841 apic_debug("%s\n", __FUNCTION__);
842
843 ASSERT(vcpu);
844 apic = vcpu->apic;
845 ASSERT(apic != NULL);
846
847 /* Stop the timer in case it's a reset to an active apic */
848 hrtimer_cancel(&apic->timer.dev);
849
850 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
851 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
852
853 for (i = 0; i < APIC_LVT_NUM; i++)
854 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
855 apic_set_reg(apic, APIC_LVT0,
856 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
857
858 apic_set_reg(apic, APIC_DFR, 0xffffffffU);
859 apic_set_reg(apic, APIC_SPIV, 0xff);
860 apic_set_reg(apic, APIC_TASKPRI, 0);
861 apic_set_reg(apic, APIC_LDR, 0);
862 apic_set_reg(apic, APIC_ESR, 0);
863 apic_set_reg(apic, APIC_ICR, 0);
864 apic_set_reg(apic, APIC_ICR2, 0);
865 apic_set_reg(apic, APIC_TDCR, 0);
866 apic_set_reg(apic, APIC_TMICT, 0);
867 for (i = 0; i < 8; i++) {
868 apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
869 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
870 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
871 }
872 update_divide_count(apic);
873 atomic_set(&apic->timer.pending, 0);
874 if (vcpu->vcpu_id == 0)
875 vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
876 apic_update_ppr(apic);
877
878 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
879 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
880 vcpu, kvm_apic_id(apic),
881 vcpu->apic_base, apic->base_address);
882}
883EXPORT_SYMBOL_GPL(kvm_lapic_reset);
884
885int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
886{
887 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
888 int ret = 0;
889
890 if (!apic)
891 return 0;
892 ret = apic_enabled(apic);
893
894 return ret;
895}
896EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
897
898/*
899 *----------------------------------------------------------------------
900 * timer interface
901 *----------------------------------------------------------------------
902 */
903
904/* TODO: make sure __apic_timer_fn runs in current pCPU */
905static int __apic_timer_fn(struct kvm_lapic *apic)
906{
907 int result = 0;
908 wait_queue_head_t *q = &apic->vcpu->wq;
909
910 atomic_inc(&apic->timer.pending);
911 if (waitqueue_active(q))
912 {
913 apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
914 wake_up_interruptible(q);
915 }
916 if (apic_lvtt_period(apic)) {
917 result = 1;
918 apic->timer.dev.expires = ktime_add_ns(
919 apic->timer.dev.expires,
920 apic->timer.period);
921 }
922 return result;
923}
924
925static int __inject_apic_timer_irq(struct kvm_lapic *apic)
926{
927 int vector;
928
929 vector = apic_lvt_vector(apic, APIC_LVTT);
930 return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
931}
932
933static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
934{
935 struct kvm_lapic *apic;
936 int restart_timer = 0;
937
938 apic = container_of(data, struct kvm_lapic, timer.dev);
939
940 restart_timer = __apic_timer_fn(apic);
941
942 if (restart_timer)
943 return HRTIMER_RESTART;
944 else
945 return HRTIMER_NORESTART;
946}
947
948int kvm_create_lapic(struct kvm_vcpu *vcpu)
949{
950 struct kvm_lapic *apic;
951
952 ASSERT(vcpu != NULL);
953 apic_debug("apic_init %d\n", vcpu->vcpu_id);
954
955 apic = kzalloc(sizeof(*apic), GFP_KERNEL);
956 if (!apic)
957 goto nomem;
958
959 vcpu->apic = apic;
960
961 apic->regs_page = alloc_page(GFP_KERNEL);
962 if (apic->regs_page == NULL) {
963 		printk(KERN_ERR "failed to allocate apic regs page for vcpu %x\n",
964 		       vcpu->vcpu_id);
965 goto nomem;
966 }
967 apic->regs = page_address(apic->regs_page);
968 memset(apic->regs, 0, PAGE_SIZE);
969 apic->vcpu = vcpu;
970
971 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
972 apic->timer.dev.function = apic_timer_fn;
973 apic->base_address = APIC_DEFAULT_PHYS_BASE;
974 vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
975
976 kvm_lapic_reset(vcpu);
977 apic->dev.read = apic_mmio_read;
978 apic->dev.write = apic_mmio_write;
979 apic->dev.in_range = apic_mmio_range;
980 apic->dev.private = apic;
981
982 return 0;
983nomem:
984 kvm_free_apic(apic);
985 return -ENOMEM;
986}
987EXPORT_SYMBOL_GPL(kvm_create_lapic);
988
989int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
990{
991 struct kvm_lapic *apic = vcpu->apic;
992 int highest_irr;
993
994 if (!apic || !apic_enabled(apic))
995 return -1;
996
997 apic_update_ppr(apic);
998 highest_irr = apic_find_highest_irr(apic);
999 if ((highest_irr == -1) ||
1000 ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
1001 return -1;
1002 return highest_irr;
1003}
1004
1005int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1006{
1007 u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0);
1008 int r = 0;
1009
1010 if (vcpu->vcpu_id == 0) {
1011 if (!apic_hw_enabled(vcpu->apic))
1012 r = 1;
1013 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1014 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
1015 r = 1;
1016 }
1017 return r;
1018}
1019
1020void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1021{
1022 struct kvm_lapic *apic = vcpu->apic;
1023
1024 if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
1025 atomic_read(&apic->timer.pending) > 0) {
1026 if (__inject_apic_timer_irq(apic))
1027 atomic_dec(&apic->timer.pending);
1028 }
1029}
1030
1031void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
1032{
1033 struct kvm_lapic *apic = vcpu->apic;
1034
1035 if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
1036 apic->timer.last_update = ktime_add_ns(
1037 apic->timer.last_update,
1038 apic->timer.period);
1039}
1040
1041int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1042{
1043 int vector = kvm_apic_has_interrupt(vcpu);
1044 struct kvm_lapic *apic = vcpu->apic;
1045
1046 if (vector == -1)
1047 return -1;
1048
1049 apic_set_vector(vector, apic->regs + APIC_ISR);
1050 apic_update_ppr(apic);
1051 apic_clear_irr(vector, apic);
1052 return vector;
1053}
1054
1055void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1056{
1057 struct kvm_lapic *apic = vcpu->apic;
1058
1059 apic->base_address = vcpu->apic_base &
1060 MSR_IA32_APICBASE_BASE;
1061 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1062 apic_update_ppr(apic);
1063 hrtimer_cancel(&apic->timer.dev);
1064 update_divide_count(apic);
1065 start_apic_timer(apic);
1066}
1067
1068void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1069{
1070 struct kvm_lapic *apic = vcpu->apic;
1071 struct hrtimer *timer;
1072
1073 if (!apic)
1074 return;
1075
1076 timer = &apic->timer.dev;
1077 if (hrtimer_cancel(timer))
1078 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
1079}
1080EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
deleted file mode 100644
index feb5ac986c5d..000000000000
--- a/drivers/kvm/mmu.c
+++ /dev/null
@@ -1,1498 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include "vmx.h"
21#include "kvm.h"
22
23#include <linux/types.h>
24#include <linux/string.h>
25#include <linux/mm.h>
26#include <linux/highmem.h>
27#include <linux/module.h>
28
29#include <asm/page.h>
30#include <asm/cmpxchg.h>
31
32#undef MMU_DEBUG
33
34#undef AUDIT
35
36#ifdef AUDIT
37static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
38#else
39static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
40#endif
41
42#ifdef MMU_DEBUG
43
44#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
45#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
46
47#else
48
49#define pgprintk(x...) do { } while (0)
50#define rmap_printk(x...) do { } while (0)
51
52#endif
53
54#if defined(MMU_DEBUG) || defined(AUDIT)
55static int dbg = 1;
56#endif
57
58#ifndef MMU_DEBUG
59#define ASSERT(x) do { } while (0)
60#else
61#define ASSERT(x) \
62 if (!(x)) { \
63 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
64 __FILE__, __LINE__, #x); \
65 }
66#endif
67
68#define PT64_PT_BITS 9
69#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
70#define PT32_PT_BITS 10
71#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
72
73#define PT_WRITABLE_SHIFT 1
74
75#define PT_PRESENT_MASK (1ULL << 0)
76#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
77#define PT_USER_MASK (1ULL << 2)
78#define PT_PWT_MASK (1ULL << 3)
79#define PT_PCD_MASK (1ULL << 4)
80#define PT_ACCESSED_MASK (1ULL << 5)
81#define PT_DIRTY_MASK (1ULL << 6)
82#define PT_PAGE_SIZE_MASK (1ULL << 7)
83#define PT_PAT_MASK (1ULL << 7)
84#define PT_GLOBAL_MASK (1ULL << 8)
85#define PT64_NX_MASK (1ULL << 63)
86
87#define PT_PAT_SHIFT 7
88#define PT_DIR_PAT_SHIFT 12
89#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
90
91#define PT32_DIR_PSE36_SIZE 4
92#define PT32_DIR_PSE36_SHIFT 13
93#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
94
95
96#define PT_FIRST_AVAIL_BITS_SHIFT 9
97#define PT64_SECOND_AVAIL_BITS_SHIFT 52
98
99#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
100
101#define VALID_PAGE(x) ((x) != INVALID_PAGE)
102
103#define PT64_LEVEL_BITS 9
104
105#define PT64_LEVEL_SHIFT(level) \
106 ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
107
108#define PT64_LEVEL_MASK(level) \
109 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
110
111#define PT64_INDEX(address, level)\
112 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
113
114
115#define PT32_LEVEL_BITS 10
116
117#define PT32_LEVEL_SHIFT(level) \
118 ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
119
120#define PT32_LEVEL_MASK(level) \
121 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
122
123#define PT32_INDEX(address, level)\
124 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
125
126
127#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
128#define PT64_DIR_BASE_ADDR_MASK \
129 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
130
131#define PT32_BASE_ADDR_MASK PAGE_MASK
132#define PT32_DIR_BASE_ADDR_MASK \
133 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
134
135
136#define PFERR_PRESENT_MASK (1U << 0)
137#define PFERR_WRITE_MASK (1U << 1)
138#define PFERR_USER_MASK (1U << 2)
139#define PFERR_FETCH_MASK (1U << 4)
140
141#define PT64_ROOT_LEVEL 4
142#define PT32_ROOT_LEVEL 2
143#define PT32E_ROOT_LEVEL 3
144
145#define PT_DIRECTORY_LEVEL 2
146#define PT_PAGE_TABLE_LEVEL 1
147
148#define RMAP_EXT 4
149
150struct kvm_rmap_desc {
151 u64 *shadow_ptes[RMAP_EXT];
152 struct kvm_rmap_desc *more;
153};
154
155static struct kmem_cache *pte_chain_cache;
156static struct kmem_cache *rmap_desc_cache;
157static struct kmem_cache *mmu_page_header_cache;
158
159static int is_write_protection(struct kvm_vcpu *vcpu)
160{
161 return vcpu->cr0 & X86_CR0_WP;
162}
163
164static int is_cpuid_PSE36(void)
165{
166 return 1;
167}
168
169static int is_nx(struct kvm_vcpu *vcpu)
170{
171 return vcpu->shadow_efer & EFER_NX;
172}
173
174static int is_present_pte(unsigned long pte)
175{
176 return pte & PT_PRESENT_MASK;
177}
178
179static int is_writeble_pte(unsigned long pte)
180{
181 return pte & PT_WRITABLE_MASK;
182}
183
184static int is_io_pte(unsigned long pte)
185{
186 return pte & PT_SHADOW_IO_MARK;
187}
188
189static int is_rmap_pte(u64 pte)
190{
191 return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
192 == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
193}
194
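/*
 * Install a shadow pte with a single atomic 64-bit store, so that another
 * CPU walking the shadow page tables never observes a half-written entry.
 */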
195static void set_shadow_pte(u64 *sptep, u64 spte)
196{
197#ifdef CONFIG_X86_64
198 set_64bit((unsigned long *)sptep, spte);
199#else
200 set_64bit((unsigned long long *)sptep, spte);
201#endif
202}
203
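/*
 * The shadow MMU takes its small allocations (pte chains, rmap descriptors,
 * shadow pages) from per-vcpu caches that are topped up ahead of time, so
 * the mapping code itself never has to handle an allocation failure.
 */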
204static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
205 struct kmem_cache *base_cache, int min)
206{
207 void *obj;
208
209 if (cache->nobjs >= min)
210 return 0;
211 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
212 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
213 if (!obj)
214 return -ENOMEM;
215 cache->objects[cache->nobjs++] = obj;
216 }
217 return 0;
218}
219
220static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
221{
222 while (mc->nobjs)
223 kfree(mc->objects[--mc->nobjs]);
224}
225
226static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
227 int min)
228{
229 struct page *page;
230
231 if (cache->nobjs >= min)
232 return 0;
233 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
234 page = alloc_page(GFP_KERNEL);
235 if (!page)
236 return -ENOMEM;
237 set_page_private(page, 0);
238 cache->objects[cache->nobjs++] = page_address(page);
239 }
240 return 0;
241}
242
243static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
244{
245 while (mc->nobjs)
246 free_page((unsigned long)mc->objects[--mc->nobjs]);
247}
248
249static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
250{
251 int r;
252
253 kvm_mmu_free_some_pages(vcpu);
254 r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
255 pte_chain_cache, 4);
256 if (r)
257 goto out;
258 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
259 rmap_desc_cache, 1);
260 if (r)
261 goto out;
262 r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4);
263 if (r)
264 goto out;
265 r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
266 mmu_page_header_cache, 4);
267out:
268 return r;
269}
270
271static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
272{
273 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
274 mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
275 mmu_free_memory_cache_page(&vcpu->mmu_page_cache);
276 mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
277}
278
279static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
280 size_t size)
281{
282 void *p;
283
284 BUG_ON(!mc->nobjs);
285 p = mc->objects[--mc->nobjs];
286 memset(p, 0, size);
287 return p;
288}
289
290static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
291{
292 return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
293 sizeof(struct kvm_pte_chain));
294}
295
296static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
297{
298 kfree(pc);
299}
300
301static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
302{
303 return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
304 sizeof(struct kvm_rmap_desc));
305}
306
307static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
308{
309 kfree(rd);
310}
311
312/*
313 * Reverse mapping data structures:
314 *
315 * If page->private bit zero is zero, then page->private points to the
316 * shadow page table entry that points to page_address(page).
317 *
318 * If page->private bit zero is one, then (page->private & ~1) points
319 * to a struct kvm_rmap_desc containing more mappings.
320 */
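/*
 * For example: after the first mapping, page->private holds the spte pointer
 * itself; adding a second mapping allocates a kvm_rmap_desc, moves the first
 * spte into shadow_ptes[0], and stores the descriptor address with bit zero
 * set.
 */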
321static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
322{
323 struct page *page;
324 struct kvm_rmap_desc *desc;
325 int i;
326
327 if (!is_rmap_pte(*spte))
328 return;
329 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
330 if (!page_private(page)) {
331 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
332 		set_page_private(page, (unsigned long)spte);
333 } else if (!(page_private(page) & 1)) {
334 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
335 desc = mmu_alloc_rmap_desc(vcpu);
336 desc->shadow_ptes[0] = (u64 *)page_private(page);
337 desc->shadow_ptes[1] = spte;
338 		set_page_private(page, (unsigned long)desc | 1);
339 } else {
340 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
341 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
342 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
343 desc = desc->more;
344 if (desc->shadow_ptes[RMAP_EXT-1]) {
345 desc->more = mmu_alloc_rmap_desc(vcpu);
346 desc = desc->more;
347 }
348 for (i = 0; desc->shadow_ptes[i]; ++i)
349 ;
350 desc->shadow_ptes[i] = spte;
351 }
352}
353
354static void rmap_desc_remove_entry(struct page *page,
355 struct kvm_rmap_desc *desc,
356 int i,
357 struct kvm_rmap_desc *prev_desc)
358{
359 int j;
360
361 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
362 ;
363 desc->shadow_ptes[i] = desc->shadow_ptes[j];
364 desc->shadow_ptes[j] = NULL;
365 if (j != 0)
366 return;
367 if (!prev_desc && !desc->more)
368 		set_page_private(page, (unsigned long)desc->shadow_ptes[0]);
369 else
370 if (prev_desc)
371 prev_desc->more = desc->more;
372 else
373 			set_page_private(page, (unsigned long)desc->more | 1);
374 mmu_free_rmap_desc(desc);
375}
376
377static void rmap_remove(u64 *spte)
378{
379 struct page *page;
380 struct kvm_rmap_desc *desc;
381 struct kvm_rmap_desc *prev_desc;
382 int i;
383
384 if (!is_rmap_pte(*spte))
385 return;
386 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
387 if (!page_private(page)) {
388 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
389 BUG();
390 } else if (!(page_private(page) & 1)) {
391 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
392 if ((u64 *)page_private(page) != spte) {
393 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
394 spte, *spte);
395 BUG();
396 }
397 		set_page_private(page, 0);
398 } else {
399 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
400 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
401 prev_desc = NULL;
402 while (desc) {
403 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
404 if (desc->shadow_ptes[i] == spte) {
405 rmap_desc_remove_entry(page,
406 desc, i,
407 prev_desc);
408 return;
409 }
410 prev_desc = desc;
411 desc = desc->more;
412 }
413 BUG();
414 }
415}
416
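/*
 * Strip write access from every shadow pte that maps @gfn.  Clearing
 * PT_WRITABLE_MASK makes a pte no longer rmap-worthy, so each entry is
 * removed from the reverse map before being rewritten, and the loop runs
 * until page_private() is empty.
 */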
417static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
418{
419 struct kvm *kvm = vcpu->kvm;
420 struct page *page;
421 struct kvm_rmap_desc *desc;
422 u64 *spte;
423
424 page = gfn_to_page(kvm, gfn);
425 BUG_ON(!page);
426
427 while (page_private(page)) {
428 if (!(page_private(page) & 1))
429 spte = (u64 *)page_private(page);
430 else {
431 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
432 spte = desc->shadow_ptes[0];
433 }
434 BUG_ON(!spte);
435 BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
436 != page_to_pfn(page));
437 BUG_ON(!(*spte & PT_PRESENT_MASK));
438 BUG_ON(!(*spte & PT_WRITABLE_MASK));
439 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
440 rmap_remove(spte);
441 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
442 kvm_flush_remote_tlbs(vcpu->kvm);
443 }
444}
445
446#ifdef MMU_DEBUG
447static int is_empty_shadow_page(u64 *spt)
448{
449 u64 *pos;
450 u64 *end;
451
452 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
453 if (*pos != 0) {
454 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
455 pos, *pos);
456 return 0;
457 }
458 return 1;
459}
460#endif
461
462static void kvm_mmu_free_page(struct kvm *kvm,
463 struct kvm_mmu_page *page_head)
464{
465 ASSERT(is_empty_shadow_page(page_head->spt));
466 list_del(&page_head->link);
467 __free_page(virt_to_page(page_head->spt));
468 kfree(page_head);
469 ++kvm->n_free_mmu_pages;
470}
471
472static unsigned kvm_page_table_hashfn(gfn_t gfn)
473{
474 return gfn;
475}
476
477static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
478 u64 *parent_pte)
479{
480 struct kvm_mmu_page *page;
481
482 if (!vcpu->kvm->n_free_mmu_pages)
483 return NULL;
484
485 page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
486 sizeof *page);
487 page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
488 set_page_private(virt_to_page(page->spt), (unsigned long)page);
489 list_add(&page->link, &vcpu->kvm->active_mmu_pages);
490 ASSERT(is_empty_shadow_page(page->spt));
491 page->slot_bitmap = 0;
492 page->multimapped = 0;
493 page->parent_pte = parent_pte;
494 --vcpu->kvm->n_free_mmu_pages;
495 return page;
496}
497
498static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
499 struct kvm_mmu_page *page, u64 *parent_pte)
500{
501 struct kvm_pte_chain *pte_chain;
502 struct hlist_node *node;
503 int i;
504
505 if (!parent_pte)
506 return;
507 if (!page->multimapped) {
508 u64 *old = page->parent_pte;
509
510 if (!old) {
511 page->parent_pte = parent_pte;
512 return;
513 }
514 page->multimapped = 1;
515 pte_chain = mmu_alloc_pte_chain(vcpu);
516 INIT_HLIST_HEAD(&page->parent_ptes);
517 hlist_add_head(&pte_chain->link, &page->parent_ptes);
518 pte_chain->parent_ptes[0] = old;
519 }
520 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
521 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
522 continue;
523 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
524 if (!pte_chain->parent_ptes[i]) {
525 pte_chain->parent_ptes[i] = parent_pte;
526 return;
527 }
528 }
529 pte_chain = mmu_alloc_pte_chain(vcpu);
530 BUG_ON(!pte_chain);
531 hlist_add_head(&pte_chain->link, &page->parent_ptes);
532 pte_chain->parent_ptes[0] = parent_pte;
533}
534
535static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
536 u64 *parent_pte)
537{
538 struct kvm_pte_chain *pte_chain;
539 struct hlist_node *node;
540 int i;
541
542 if (!page->multimapped) {
543 BUG_ON(page->parent_pte != parent_pte);
544 page->parent_pte = NULL;
545 return;
546 }
547 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
548 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
549 if (!pte_chain->parent_ptes[i])
550 break;
551 if (pte_chain->parent_ptes[i] != parent_pte)
552 continue;
553 while (i + 1 < NR_PTE_CHAIN_ENTRIES
554 && pte_chain->parent_ptes[i + 1]) {
555 pte_chain->parent_ptes[i]
556 = pte_chain->parent_ptes[i + 1];
557 ++i;
558 }
559 pte_chain->parent_ptes[i] = NULL;
560 if (i == 0) {
561 hlist_del(&pte_chain->link);
562 mmu_free_pte_chain(pte_chain);
563 if (hlist_empty(&page->parent_ptes)) {
564 page->multimapped = 0;
565 page->parent_pte = NULL;
566 }
567 }
568 return;
569 }
570 BUG();
571}
572
573static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
574 gfn_t gfn)
575{
576 unsigned index;
577 struct hlist_head *bucket;
578 struct kvm_mmu_page *page;
579 struct hlist_node *node;
580
581 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
582 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
583 bucket = &vcpu->kvm->mmu_page_hash[index];
584 hlist_for_each_entry(page, node, bucket, hash_link)
585 if (page->gfn == gfn && !page->role.metaphysical) {
586 pgprintk("%s: found role %x\n",
587 __FUNCTION__, page->role.word);
588 return page;
589 }
590 return NULL;
591}
592
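/*
 * Find or create the shadow page for @gfn with the given role.  The role
 * (guest paging level, shadow level, access, quadrant) keeps apart shadows
 * of the same guest table that cannot be shared; the quadrant records which
 * portion of a 32-bit guest table this shadow page covers, since such a
 * table holds more entries than a single 64-bit shadow page.
 */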
593static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
594 gfn_t gfn,
595 gva_t gaddr,
596 unsigned level,
597 int metaphysical,
598 unsigned hugepage_access,
599 u64 *parent_pte)
600{
601 union kvm_mmu_page_role role;
602 unsigned index;
603 unsigned quadrant;
604 struct hlist_head *bucket;
605 struct kvm_mmu_page *page;
606 struct hlist_node *node;
607
608 role.word = 0;
609 role.glevels = vcpu->mmu.root_level;
610 role.level = level;
611 role.metaphysical = metaphysical;
612 role.hugepage_access = hugepage_access;
613 if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
614 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
615 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
616 role.quadrant = quadrant;
617 }
618 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
619 gfn, role.word);
620 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
621 bucket = &vcpu->kvm->mmu_page_hash[index];
622 hlist_for_each_entry(page, node, bucket, hash_link)
623 if (page->gfn == gfn && page->role.word == role.word) {
624 mmu_page_add_parent_pte(vcpu, page, parent_pte);
625 pgprintk("%s: found\n", __FUNCTION__);
626 return page;
627 }
628 page = kvm_mmu_alloc_page(vcpu, parent_pte);
629 if (!page)
630 return page;
631 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
632 page->gfn = gfn;
633 page->role = role;
634 hlist_add_head(&page->hash_link, bucket);
635 if (!metaphysical)
636 rmap_write_protect(vcpu, gfn);
637 return page;
638}
639
640static void kvm_mmu_page_unlink_children(struct kvm *kvm,
641 struct kvm_mmu_page *page)
642{
643 unsigned i;
644 u64 *pt;
645 u64 ent;
646
647 pt = page->spt;
648
649 if (page->role.level == PT_PAGE_TABLE_LEVEL) {
650 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
651 if (pt[i] & PT_PRESENT_MASK)
652 rmap_remove(&pt[i]);
653 pt[i] = 0;
654 }
655 kvm_flush_remote_tlbs(kvm);
656 return;
657 }
658
659 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
660 ent = pt[i];
661
662 pt[i] = 0;
663 if (!(ent & PT_PRESENT_MASK))
664 continue;
665 ent &= PT64_BASE_ADDR_MASK;
666 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
667 }
668 kvm_flush_remote_tlbs(kvm);
669}
670
671static void kvm_mmu_put_page(struct kvm_mmu_page *page,
672 u64 *parent_pte)
673{
674 mmu_page_remove_parent_pte(page, parent_pte);
675}
676
677static void kvm_mmu_zap_page(struct kvm *kvm,
678 struct kvm_mmu_page *page)
679{
680 u64 *parent_pte;
681
682 while (page->multimapped || page->parent_pte) {
683 if (!page->multimapped)
684 parent_pte = page->parent_pte;
685 else {
686 struct kvm_pte_chain *chain;
687
688 chain = container_of(page->parent_ptes.first,
689 struct kvm_pte_chain, link);
690 parent_pte = chain->parent_ptes[0];
691 }
692 BUG_ON(!parent_pte);
693 kvm_mmu_put_page(page, parent_pte);
694 set_shadow_pte(parent_pte, 0);
695 }
696 kvm_mmu_page_unlink_children(kvm, page);
697 if (!page->root_count) {
698 hlist_del(&page->hash_link);
699 kvm_mmu_free_page(kvm, page);
700 } else
701 list_move(&page->link, &kvm->active_mmu_pages);
702}
703
704static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
705{
706 unsigned index;
707 struct hlist_head *bucket;
708 struct kvm_mmu_page *page;
709 struct hlist_node *node, *n;
710 int r;
711
712 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
713 r = 0;
714 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
715 bucket = &vcpu->kvm->mmu_page_hash[index];
716 hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
717 if (page->gfn == gfn && !page->role.metaphysical) {
718 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
719 page->role.word);
720 kvm_mmu_zap_page(vcpu->kvm, page);
721 r = 1;
722 }
723 return r;
724}
725
726static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
727{
728 struct kvm_mmu_page *page;
729
730 while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
731 pgprintk("%s: zap %lx %x\n",
732 __FUNCTION__, gfn, page->role.word);
733 kvm_mmu_zap_page(vcpu->kvm, page);
734 }
735}
736
737static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
738{
739 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
740 struct kvm_mmu_page *page_head = page_header(__pa(pte));
741
742 __set_bit(slot, &page_head->slot_bitmap);
743}
744
745hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
746{
747 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
748
749 	return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK) : hpa;
750}
751
752hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
753{
754 struct page *page;
755
756 ASSERT((gpa & HPA_ERR_MASK) == 0);
757 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
758 if (!page)
759 return gpa | HPA_ERR_MASK;
760 return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
761 | (gpa & (PAGE_SIZE-1));
762}
763
764hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
765{
766 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
767
768 if (gpa == UNMAPPED_GVA)
769 return UNMAPPED_GVA;
770 return gpa_to_hpa(vcpu, gpa);
771}
772
773struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
774{
775 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
776
777 if (gpa == UNMAPPED_GVA)
778 return NULL;
779 return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
780}
781
782static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
783{
784}
785
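/*
 * Map one guest page while the guest has paging disabled: walk the shadow
 * hierarchy from the root, allocating intermediate shadow pages as needed,
 * and install a present, writable, user pte for the host page at the
 * lowest level.
 */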
786static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
787{
788 int level = PT32E_ROOT_LEVEL;
789 hpa_t table_addr = vcpu->mmu.root_hpa;
790
791 for (; ; level--) {
792 u32 index = PT64_INDEX(v, level);
793 u64 *table;
794 u64 pte;
795
796 ASSERT(VALID_PAGE(table_addr));
797 table = __va(table_addr);
798
799 if (level == 1) {
800 pte = table[index];
801 if (is_present_pte(pte) && is_writeble_pte(pte))
802 return 0;
803 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
804 page_header_update_slot(vcpu->kvm, table, v);
805 table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
806 PT_USER_MASK;
807 rmap_add(vcpu, &table[index]);
808 return 0;
809 }
810
811 if (table[index] == 0) {
812 struct kvm_mmu_page *new_table;
813 gfn_t pseudo_gfn;
814
815 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
816 >> PAGE_SHIFT;
817 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
818 v, level - 1,
819 1, 0, &table[index]);
820 if (!new_table) {
821 pgprintk("nonpaging_map: ENOMEM\n");
822 return -ENOMEM;
823 }
824
825 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
826 | PT_WRITABLE_MASK | PT_USER_MASK;
827 }
828 table_addr = table[index] & PT64_BASE_ADDR_MASK;
829 }
830}
831
832static void mmu_free_roots(struct kvm_vcpu *vcpu)
833{
834 int i;
835 struct kvm_mmu_page *page;
836
837 if (!VALID_PAGE(vcpu->mmu.root_hpa))
838 return;
839#ifdef CONFIG_X86_64
840 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
841 hpa_t root = vcpu->mmu.root_hpa;
842
843 page = page_header(root);
844 --page->root_count;
845 vcpu->mmu.root_hpa = INVALID_PAGE;
846 return;
847 }
848#endif
849 for (i = 0; i < 4; ++i) {
850 hpa_t root = vcpu->mmu.pae_root[i];
851
852 if (root) {
853 root &= PT64_BASE_ADDR_MASK;
854 page = page_header(root);
855 --page->root_count;
856 }
857 vcpu->mmu.pae_root[i] = INVALID_PAGE;
858 }
859 vcpu->mmu.root_hpa = INVALID_PAGE;
860}
861
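/*
 * Allocate the shadow root(s) for the current guest cr3: a single 4-level
 * root when the shadow uses long-mode paging, otherwise four PAE page
 * directory roots, one per gigabyte of guest address space.
 */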
862static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
863{
864 int i;
865 gfn_t root_gfn;
866 struct kvm_mmu_page *page;
867
868 root_gfn = vcpu->cr3 >> PAGE_SHIFT;
869
870#ifdef CONFIG_X86_64
871 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
872 hpa_t root = vcpu->mmu.root_hpa;
873
874 ASSERT(!VALID_PAGE(root));
875 page = kvm_mmu_get_page(vcpu, root_gfn, 0,
876 PT64_ROOT_LEVEL, 0, 0, NULL);
877 root = __pa(page->spt);
878 ++page->root_count;
879 vcpu->mmu.root_hpa = root;
880 return;
881 }
882#endif
883 for (i = 0; i < 4; ++i) {
884 hpa_t root = vcpu->mmu.pae_root[i];
885
886 ASSERT(!VALID_PAGE(root));
887 if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
888 if (!is_present_pte(vcpu->pdptrs[i])) {
889 vcpu->mmu.pae_root[i] = 0;
890 continue;
891 }
892 root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
893 } else if (vcpu->mmu.root_level == 0)
894 root_gfn = 0;
895 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
896 PT32_ROOT_LEVEL, !is_paging(vcpu),
897 0, NULL);
898 root = __pa(page->spt);
899 ++page->root_count;
900 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
901 }
902 vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
903}
904
905static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
906{
907 return vaddr;
908}
909
910static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
911 u32 error_code)
912{
913 gpa_t addr = gva;
914 hpa_t paddr;
915 int r;
916
917 r = mmu_topup_memory_caches(vcpu);
918 if (r)
919 return r;
920
921 ASSERT(vcpu);
922 ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
923
924
925 paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK);
926
927 if (is_error_hpa(paddr))
928 return 1;
929
930 return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
931}
932
933static void nonpaging_free(struct kvm_vcpu *vcpu)
934{
935 mmu_free_roots(vcpu);
936}
937
938static int nonpaging_init_context(struct kvm_vcpu *vcpu)
939{
940 struct kvm_mmu *context = &vcpu->mmu;
941
942 context->new_cr3 = nonpaging_new_cr3;
943 context->page_fault = nonpaging_page_fault;
944 context->gva_to_gpa = nonpaging_gva_to_gpa;
945 context->free = nonpaging_free;
946 context->root_level = 0;
947 context->shadow_root_level = PT32E_ROOT_LEVEL;
948 context->root_hpa = INVALID_PAGE;
949 return 0;
950}
951
952static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
953{
954 ++vcpu->stat.tlb_flush;
955 kvm_x86_ops->tlb_flush(vcpu);
956}
957
958static void paging_new_cr3(struct kvm_vcpu *vcpu)
959{
960 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
961 mmu_free_roots(vcpu);
962}
963
964static void inject_page_fault(struct kvm_vcpu *vcpu,
965 u64 addr,
966 u32 err_code)
967{
968 kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
969}
970
971static void paging_free(struct kvm_vcpu *vcpu)
972{
973 nonpaging_free(vcpu);
974}
975
976#define PTTYPE 64
977#include "paging_tmpl.h"
978#undef PTTYPE
979
980#define PTTYPE 32
981#include "paging_tmpl.h"
982#undef PTTYPE
983
984static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
985{
986 struct kvm_mmu *context = &vcpu->mmu;
987
988 ASSERT(is_pae(vcpu));
989 context->new_cr3 = paging_new_cr3;
990 context->page_fault = paging64_page_fault;
991 context->gva_to_gpa = paging64_gva_to_gpa;
992 context->free = paging_free;
993 context->root_level = level;
994 context->shadow_root_level = level;
995 context->root_hpa = INVALID_PAGE;
996 return 0;
997}
998
999static int paging64_init_context(struct kvm_vcpu *vcpu)
1000{
1001 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1002}
1003
1004static int paging32_init_context(struct kvm_vcpu *vcpu)
1005{
1006 struct kvm_mmu *context = &vcpu->mmu;
1007
1008 context->new_cr3 = paging_new_cr3;
1009 context->page_fault = paging32_page_fault;
1010 context->gva_to_gpa = paging32_gva_to_gpa;
1011 context->free = paging_free;
1012 context->root_level = PT32_ROOT_LEVEL;
1013 context->shadow_root_level = PT32E_ROOT_LEVEL;
1014 context->root_hpa = INVALID_PAGE;
1015 return 0;
1016}
1017
1018static int paging32E_init_context(struct kvm_vcpu *vcpu)
1019{
1020 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1021}
1022
1023static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1024{
1025 ASSERT(vcpu);
1026 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1027
1028 if (!is_paging(vcpu))
1029 return nonpaging_init_context(vcpu);
1030 else if (is_long_mode(vcpu))
1031 return paging64_init_context(vcpu);
1032 else if (is_pae(vcpu))
1033 return paging32E_init_context(vcpu);
1034 else
1035 return paging32_init_context(vcpu);
1036}
1037
1038static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1039{
1040 ASSERT(vcpu);
1041 if (VALID_PAGE(vcpu->mmu.root_hpa)) {
1042 vcpu->mmu.free(vcpu);
1043 vcpu->mmu.root_hpa = INVALID_PAGE;
1044 }
1045}
1046
1047int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1048{
1049 destroy_kvm_mmu(vcpu);
1050 return init_kvm_mmu(vcpu);
1051}
1052EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1053
1054int kvm_mmu_load(struct kvm_vcpu *vcpu)
1055{
1056 int r;
1057
1058 mutex_lock(&vcpu->kvm->lock);
1059 r = mmu_topup_memory_caches(vcpu);
1060 if (r)
1061 goto out;
1062 mmu_alloc_roots(vcpu);
1063 kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
1064 kvm_mmu_flush_tlb(vcpu);
1065out:
1066 mutex_unlock(&vcpu->kvm->lock);
1067 return r;
1068}
1069EXPORT_SYMBOL_GPL(kvm_mmu_load);
1070
1071void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1072{
1073 mmu_free_roots(vcpu);
1074}
1075
1076static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1077 struct kvm_mmu_page *page,
1078 u64 *spte)
1079{
1080 u64 pte;
1081 struct kvm_mmu_page *child;
1082
1083 pte = *spte;
1084 if (is_present_pte(pte)) {
1085 if (page->role.level == PT_PAGE_TABLE_LEVEL)
1086 rmap_remove(spte);
1087 else {
1088 child = page_header(pte & PT64_BASE_ADDR_MASK);
1089 mmu_page_remove_parent_pte(child, spte);
1090 }
1091 }
1092 set_shadow_pte(spte, 0);
1093 kvm_flush_remote_tlbs(vcpu->kvm);
1094}
1095
1096static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1097 struct kvm_mmu_page *page,
1098 u64 *spte,
1099 const void *new, int bytes)
1100{
1101 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1102 return;
1103
1104 if (page->role.glevels == PT32_ROOT_LEVEL)
1105 paging32_update_pte(vcpu, page, spte, new, bytes);
1106 else
1107 paging64_update_pte(vcpu, page, spte, new, bytes);
1108}
1109
1110void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1111 const u8 *new, int bytes)
1112{
1113 gfn_t gfn = gpa >> PAGE_SHIFT;
1114 struct kvm_mmu_page *page;
1115 struct hlist_node *node, *n;
1116 struct hlist_head *bucket;
1117 unsigned index;
1118 u64 *spte;
1119 unsigned offset = offset_in_page(gpa);
1120 unsigned pte_size;
1121 unsigned page_offset;
1122 unsigned misaligned;
1123 unsigned quadrant;
1124 int level;
1125 int flooded = 0;
1126 int npte;
1127
1128 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1129 if (gfn == vcpu->last_pt_write_gfn) {
1130 ++vcpu->last_pt_write_count;
1131 if (vcpu->last_pt_write_count >= 3)
1132 flooded = 1;
1133 } else {
1134 vcpu->last_pt_write_gfn = gfn;
1135 vcpu->last_pt_write_count = 1;
1136 }
1137 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1138 bucket = &vcpu->kvm->mmu_page_hash[index];
1139 hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
1140 if (page->gfn != gfn || page->role.metaphysical)
1141 continue;
1142 pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1143 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1144 misaligned |= bytes < 4;
1145 if (misaligned || flooded) {
1146 /*
1147 * Misaligned accesses are too much trouble to fix
1148 * up; also, they usually indicate a page is not used
1149 * as a page table.
1150 *
1151 * If we're seeing too many writes to a page,
1152 * it may no longer be a page table, or we may be
1153 * forking, in which case it is better to unmap the
1154 * page.
1155 */
1156 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1157 gpa, bytes, page->role.word);
1158 kvm_mmu_zap_page(vcpu->kvm, page);
1159 continue;
1160 }
1161 page_offset = offset;
1162 level = page->role.level;
1163 npte = 1;
1164 if (page->role.glevels == PT32_ROOT_LEVEL) {
1165 page_offset <<= 1; /* 32->64 */
1166 /*
1167 * A 32-bit pde maps 4MB while the shadow pdes map
1168 * only 2MB. So we need to double the offset again
1169 * and zap two pdes instead of one.
1170 */
1171 if (level == PT32_ROOT_LEVEL) {
1172 page_offset &= ~7; /* kill rounding error */
1173 page_offset <<= 1;
1174 npte = 2;
1175 }
1176 quadrant = page_offset >> PAGE_SHIFT;
1177 page_offset &= ~PAGE_MASK;
1178 if (quadrant != page->role.quadrant)
1179 continue;
1180 }
1181 spte = &page->spt[page_offset / sizeof(*spte)];
1182 while (npte--) {
1183 mmu_pte_write_zap_pte(vcpu, page, spte);
1184 mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
1185 ++spte;
1186 }
1187 }
1188}
1189
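A minimal user-space sketch of the offset/quadrant arithmetic kvm_mmu_pte_write() applies to 32-bit guests above, assuming 4KB pages, 4-byte guest ptes and 8-byte shadow ptes; the function and variable names here are illustrative only:

    #include <stdio.h>

    /*
     * Map the byte offset of a guest pte write onto the shadow page table:
     * 4-byte guest entries become 8-byte shadow entries, and a 32-bit pde
     * (4MB) is split across two shadow pdes (2MB each).
     */
    static void map_pte_write(unsigned offset, int is_pde_level,
                              unsigned *npte, unsigned *quadrant,
                              unsigned *shadow_offset)
    {
            unsigned page_offset = offset << 1;     /* 32-bit -> 64-bit entries */

            *npte = 1;
            if (is_pde_level) {
                    page_offset &= ~7u;             /* kill rounding error */
                    page_offset <<= 1;
                    *npte = 2;                      /* zap two shadow pdes */
            }
            *quadrant = page_offset >> 12;          /* which shadow page it lands in */
            *shadow_offset = page_offset & 0xfff;
    }

    int main(void)
    {
            unsigned npte, quadrant, off;

            map_pte_write(0x804, 1, &npte, &quadrant, &off);
            printf("npte %u quadrant %u offset 0x%x\n", npte, quadrant, off);
            return 0;
    }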
1190int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1191{
1192 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
1193
1194 return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
1195}
1196
1197void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1198{
1199 while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
1200 struct kvm_mmu_page *page;
1201
1202 page = container_of(vcpu->kvm->active_mmu_pages.prev,
1203 struct kvm_mmu_page, link);
1204 kvm_mmu_zap_page(vcpu->kvm, page);
1205 }
1206}
1207
1208static void free_mmu_pages(struct kvm_vcpu *vcpu)
1209{
1210 struct kvm_mmu_page *page;
1211
1212 while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1213 page = container_of(vcpu->kvm->active_mmu_pages.next,
1214 struct kvm_mmu_page, link);
1215 kvm_mmu_zap_page(vcpu->kvm, page);
1216 }
1217 free_page((unsigned long)vcpu->mmu.pae_root);
1218}
1219
1220static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1221{
1222 struct page *page;
1223 int i;
1224
1225 ASSERT(vcpu);
1226
1227 vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
1228
1229 /*
1230 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1231 * Therefore we need to allocate shadow page tables in the first
1232 * 4GB of memory, which happens to fit the DMA32 zone.
1233 */
1234 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1235 if (!page)
1236 goto error_1;
1237 vcpu->mmu.pae_root = page_address(page);
1238 for (i = 0; i < 4; ++i)
1239 vcpu->mmu.pae_root[i] = INVALID_PAGE;
1240
1241 return 0;
1242
1243error_1:
1244 free_mmu_pages(vcpu);
1245 return -ENOMEM;
1246}
1247
1248int kvm_mmu_create(struct kvm_vcpu *vcpu)
1249{
1250 ASSERT(vcpu);
1251 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1252
1253 return alloc_mmu_pages(vcpu);
1254}
1255
1256int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1257{
1258 ASSERT(vcpu);
1259 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1260
1261 return init_kvm_mmu(vcpu);
1262}
1263
1264void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1265{
1266 ASSERT(vcpu);
1267
1268 destroy_kvm_mmu(vcpu);
1269 free_mmu_pages(vcpu);
1270 mmu_free_memory_caches(vcpu);
1271}
1272
1273void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1274{
1275 struct kvm_mmu_page *page;
1276
1277 list_for_each_entry(page, &kvm->active_mmu_pages, link) {
1278 int i;
1279 u64 *pt;
1280
1281 if (!test_bit(slot, &page->slot_bitmap))
1282 continue;
1283
1284 pt = page->spt;
1285 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1286 /* avoid RMW */
1287 if (pt[i] & PT_WRITABLE_MASK) {
1288 rmap_remove(&pt[i]);
1289 pt[i] &= ~PT_WRITABLE_MASK;
1290 }
1291 }
1292}
1293
1294void kvm_mmu_zap_all(struct kvm *kvm)
1295{
1296 struct kvm_mmu_page *page, *node;
1297
1298 list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link)
1299 kvm_mmu_zap_page(kvm, page);
1300
1301 kvm_flush_remote_tlbs(kvm);
1302}
1303
1304void kvm_mmu_module_exit(void)
1305{
1306 if (pte_chain_cache)
1307 kmem_cache_destroy(pte_chain_cache);
1308 if (rmap_desc_cache)
1309 kmem_cache_destroy(rmap_desc_cache);
1310 if (mmu_page_header_cache)
1311 kmem_cache_destroy(mmu_page_header_cache);
1312}
1313
1314int kvm_mmu_module_init(void)
1315{
1316 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1317 sizeof(struct kvm_pte_chain),
1318 0, 0, NULL);
1319 if (!pte_chain_cache)
1320 goto nomem;
1321 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1322 sizeof(struct kvm_rmap_desc),
1323 0, 0, NULL);
1324 if (!rmap_desc_cache)
1325 goto nomem;
1326
1327 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1328 sizeof(struct kvm_mmu_page),
1329 0, 0, NULL);
1330 if (!mmu_page_header_cache)
1331 goto nomem;
1332
1333 return 0;
1334
1335nomem:
1336 kvm_mmu_module_exit();
1337 return -ENOMEM;
1338}
1339
1340#ifdef AUDIT
1341
1342static const char *audit_msg;
1343
1344static gva_t canonicalize(gva_t gva)
1345{
1346#ifdef CONFIG_X86_64
1347 gva = (long long)(gva << 16) >> 16;
1348#endif
1349 return gva;
1350}
1351
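canonicalize() relies on 48-bit virtual addresses: shifting left by 16 and arithmetic-shifting back replicates bit 47 into the upper bits. A self-contained sketch of the same sign extension (the function name is local to the sketch, and it leans on the same arithmetic right shift of a negative value that the kernel code assumes):

    #include <stdint.h>
    #include <stdio.h>

    /* Sign-extend bit 47 into bits 48..63, making a 48-bit address canonical. */
    static uint64_t make_canonical(uint64_t va)
    {
            return (uint64_t)(((int64_t)(va << 16)) >> 16);
    }

    int main(void)
    {
            /* 0x0000800000000000 -> 0xffff800000000000 */
            printf("%llx\n",
                   (unsigned long long)make_canonical(0x0000800000000000ULL));
            return 0;
    }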
1352static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1353 gva_t va, int level)
1354{
1355 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1356 int i;
1357 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1358
1359 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1360 u64 ent = pt[i];
1361
1362 if (!(ent & PT_PRESENT_MASK))
1363 continue;
1364
1365 va = canonicalize(va);
1366 if (level > 1)
1367 audit_mappings_page(vcpu, ent, va, level - 1);
1368 else {
1369 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
1370 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
1371
1372 if ((ent & PT_PRESENT_MASK)
1373 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1374 printk(KERN_ERR "audit error: (%s) levels %d"
1375 " gva %lx gpa %llx hpa %llx ent %llx\n",
1376 audit_msg, vcpu->mmu.root_level,
1377 va, gpa, hpa, ent);
1378 }
1379 }
1380}
1381
1382static void audit_mappings(struct kvm_vcpu *vcpu)
1383{
1384 unsigned i;
1385
1386 if (vcpu->mmu.root_level == 4)
1387 audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
1388 else
1389 for (i = 0; i < 4; ++i)
1390 if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
1391 audit_mappings_page(vcpu,
1392 vcpu->mmu.pae_root[i],
1393 i << 30,
1394 2);
1395}
1396
1397static int count_rmaps(struct kvm_vcpu *vcpu)
1398{
1399 int nmaps = 0;
1400 int i, j, k;
1401
1402 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1403 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1404 struct kvm_rmap_desc *d;
1405
1406 for (j = 0; j < m->npages; ++j) {
1407 struct page *page = m->phys_mem[j];
1408
1409 if (!page->private)
1410 continue;
1411 if (!(page->private & 1)) {
1412 ++nmaps;
1413 continue;
1414 }
1415 d = (struct kvm_rmap_desc *)(page->private & ~1ul);
1416 while (d) {
1417 for (k = 0; k < RMAP_EXT; ++k)
1418 if (d->shadow_ptes[k])
1419 ++nmaps;
1420 else
1421 break;
1422 d = d->more;
1423 }
1424 }
1425 }
1426 return nmaps;
1427}
1428
1429static int count_writable_mappings(struct kvm_vcpu *vcpu)
1430{
1431 int nmaps = 0;
1432 struct kvm_mmu_page *page;
1433 int i;
1434
1435 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1436 u64 *pt = page->spt;
1437
1438 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1439 continue;
1440
1441 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1442 u64 ent = pt[i];
1443
1444 if (!(ent & PT_PRESENT_MASK))
1445 continue;
1446 if (!(ent & PT_WRITABLE_MASK))
1447 continue;
1448 ++nmaps;
1449 }
1450 }
1451 return nmaps;
1452}
1453
1454static void audit_rmap(struct kvm_vcpu *vcpu)
1455{
1456 int n_rmap = count_rmaps(vcpu);
1457 int n_actual = count_writable_mappings(vcpu);
1458
1459 if (n_rmap != n_actual)
1460 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1461 __FUNCTION__, audit_msg, n_rmap, n_actual);
1462}
1463
1464static void audit_write_protection(struct kvm_vcpu *vcpu)
1465{
1466 struct kvm_mmu_page *page;
1467
1468 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1469 hfn_t hfn;
1470 struct page *pg;
1471
1472 if (page->role.metaphysical)
1473 continue;
1474
1475 hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
1476 >> PAGE_SHIFT;
1477 pg = pfn_to_page(hfn);
1478 if (pg->private)
1479 printk(KERN_ERR "%s: (%s) shadow page has writable"
1480 " mappings: gfn %lx role %x\n",
1481 __FUNCTION__, audit_msg, page->gfn,
1482 page->role.word);
1483 }
1484}
1485
1486static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1487{
1488 int olddbg = dbg;
1489
1490 dbg = 0;
1491 audit_msg = msg;
1492 audit_rmap(vcpu);
1493 audit_write_protection(vcpu);
1494 audit_mappings(vcpu);
1495 dbg = olddbg;
1496}
1497
1498#endif
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
deleted file mode 100644
index 6b094b44f8fb..000000000000
--- a/drivers/kvm/paging_tmpl.h
+++ /dev/null
@@ -1,511 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20/*
21 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
22 * so the code in this file is compiled twice, once per pte size.
23 */
24
25#if PTTYPE == 64
26 #define pt_element_t u64
27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #ifdef CONFIG_X86_64
35 #define PT_MAX_FULL_LEVELS 4
36 #else
37 #define PT_MAX_FULL_LEVELS 2
38 #endif
39#elif PTTYPE == 32
40 #define pt_element_t u32
41 #define guest_walker guest_walker32
42 #define FNAME(name) paging##32_##name
43 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
44 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
45 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
46 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
47 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
48 #define PT_MAX_FULL_LEVELS 2
49#else
50 #error Invalid PTTYPE value
51#endif
52
53/*
54 * The guest_walker structure emulates the behavior of the hardware page
55 * table walker.
56 */
57struct guest_walker {
58 int level;
59 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
60 pt_element_t *table;
61 pt_element_t pte;
62 pt_element_t *ptep;
63 struct page *page;
64 int index;
65 pt_element_t inherited_ar;
66 gfn_t gfn;
67 u32 error_code;
68};
69
70/*
71 * Fetch a guest pte for a guest virtual address
72 */
73static int FNAME(walk_addr)(struct guest_walker *walker,
74 struct kvm_vcpu *vcpu, gva_t addr,
75 int write_fault, int user_fault, int fetch_fault)
76{
77 hpa_t hpa;
78 struct kvm_memory_slot *slot;
79 pt_element_t *ptep;
80 pt_element_t root;
81 gfn_t table_gfn;
82
83 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
84 walker->level = vcpu->mmu.root_level;
85 walker->table = NULL;
86 walker->page = NULL;
87 walker->ptep = NULL;
88 root = vcpu->cr3;
89#if PTTYPE == 64
90 if (!is_long_mode(vcpu)) {
91 walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
92 root = *walker->ptep;
93 walker->pte = root;
94 if (!(root & PT_PRESENT_MASK))
95 goto not_present;
96 --walker->level;
97 }
98#endif
99 table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
100 walker->table_gfn[walker->level - 1] = table_gfn;
101 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
102 walker->level - 1, table_gfn);
103 slot = gfn_to_memslot(vcpu->kvm, table_gfn);
104 hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
105 walker->page = pfn_to_page(hpa >> PAGE_SHIFT);
106 walker->table = kmap_atomic(walker->page, KM_USER0);
107
108 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
109 (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
110
111 walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
112
113 for (;;) {
114 int index = PT_INDEX(addr, walker->level);
115 hpa_t paddr;
116
117 ptep = &walker->table[index];
118 walker->index = index;
119 ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
120 ((unsigned long)ptep & PAGE_MASK));
121
122 if (!is_present_pte(*ptep))
123 goto not_present;
124
125 if (write_fault && !is_writeble_pte(*ptep))
126 if (user_fault || is_write_protection(vcpu))
127 goto access_error;
128
129 if (user_fault && !(*ptep & PT_USER_MASK))
130 goto access_error;
131
132#if PTTYPE == 64
133 if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK))
134 goto access_error;
135#endif
136
137 if (!(*ptep & PT_ACCESSED_MASK)) {
138 mark_page_dirty(vcpu->kvm, table_gfn);
139 *ptep |= PT_ACCESSED_MASK;
140 }
141
142 if (walker->level == PT_PAGE_TABLE_LEVEL) {
143 walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
144 >> PAGE_SHIFT;
145 break;
146 }
147
148 if (walker->level == PT_DIRECTORY_LEVEL
149 && (*ptep & PT_PAGE_SIZE_MASK)
150 && (PTTYPE == 64 || is_pse(vcpu))) {
151 walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
152 >> PAGE_SHIFT;
153 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
154 break;
155 }
156
157 walker->inherited_ar &= walker->table[index];
158 table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
159 kunmap_atomic(walker->table, KM_USER0);
160 paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT);
161 walker->page = pfn_to_page(paddr >> PAGE_SHIFT);
162 walker->table = kmap_atomic(walker->page, KM_USER0);
163 --walker->level;
164 walker->table_gfn[walker->level - 1] = table_gfn;
165 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
166 walker->level - 1, table_gfn);
167 }
168 walker->pte = *ptep;
169 if (walker->page)
170 walker->ptep = NULL;
171 if (walker->table)
172 kunmap_atomic(walker->table, KM_USER0);
173 pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
174 return 1;
175
176not_present:
177 walker->error_code = 0;
178 goto err;
179
180access_error:
181 walker->error_code = PFERR_PRESENT_MASK;
182
183err:
184 if (write_fault)
185 walker->error_code |= PFERR_WRITE_MASK;
186 if (user_fault)
187 walker->error_code |= PFERR_USER_MASK;
188 if (fetch_fault)
189 walker->error_code |= PFERR_FETCH_MASK;
190 if (walker->table)
191 kunmap_atomic(walker->table, KM_USER0);
192 return 0;
193}
194
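The error path of walk_addr() above composes an x86 page-fault error code from the fault attributes: the "present" bit is set only when the walk failed a permission check rather than on a missing pte, and the write/user/fetch bits are ORed in afterwards. A small stand-alone sketch using the architectural bit positions; the constant and function names are local to the sketch:

    #include <stdio.h>

    #define PFERR_PRESENT (1u << 0)   /* protection fault on a present entry */
    #define PFERR_WRITE   (1u << 1)
    #define PFERR_USER    (1u << 2)
    #define PFERR_FETCH   (1u << 4)

    static unsigned pf_error_code(int protection, int write, int user, int fetch)
    {
            unsigned ec = protection ? PFERR_PRESENT : 0;

            if (write)
                    ec |= PFERR_WRITE;
            if (user)
                    ec |= PFERR_USER;
            if (fetch)
                    ec |= PFERR_FETCH;
            return ec;
    }

    int main(void)
    {
            /* user-mode write blocked by a present, read-only pte -> 0x7 */
            printf("0x%x\n", pf_error_code(1, 1, 1, 0));
            return 0;
    }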
195static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
196 struct guest_walker *walker)
197{
198 mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
199}
200
201static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
202 u64 *shadow_pte,
203 gpa_t gaddr,
204 pt_element_t gpte,
205 u64 access_bits,
206 int user_fault,
207 int write_fault,
208 int *ptwrite,
209 struct guest_walker *walker,
210 gfn_t gfn)
211{
212 hpa_t paddr;
213 int dirty = gpte & PT_DIRTY_MASK;
214 u64 spte = *shadow_pte;
215 int was_rmapped = is_rmap_pte(spte);
216
217 pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
218 " user_fault %d gfn %lx\n",
219 __FUNCTION__, spte, (u64)gpte, access_bits,
220 write_fault, user_fault, gfn);
221
222 if (write_fault && !dirty) {
223 pt_element_t *guest_ent, *tmp = NULL;
224
225 if (walker->ptep)
226 guest_ent = walker->ptep;
227 else {
228 tmp = kmap_atomic(walker->page, KM_USER0);
229 guest_ent = &tmp[walker->index];
230 }
231
232 *guest_ent |= PT_DIRTY_MASK;
233 if (!walker->ptep)
234 kunmap_atomic(tmp, KM_USER0);
235 dirty = 1;
236 FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
237 }
238
239 spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
240 spte |= gpte & PT64_NX_MASK;
241 if (!dirty)
242 access_bits &= ~PT_WRITABLE_MASK;
243
244 paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
245
246 spte |= PT_PRESENT_MASK;
247 if (access_bits & PT_USER_MASK)
248 spte |= PT_USER_MASK;
249
250 if (is_error_hpa(paddr)) {
251 spte |= gaddr;
252 spte |= PT_SHADOW_IO_MARK;
253 spte &= ~PT_PRESENT_MASK;
254 set_shadow_pte(shadow_pte, spte);
255 return;
256 }
257
258 spte |= paddr;
259
260 if ((access_bits & PT_WRITABLE_MASK)
261 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
262 struct kvm_mmu_page *shadow;
263
264 spte |= PT_WRITABLE_MASK;
265 if (user_fault) {
266 mmu_unshadow(vcpu, gfn);
267 goto unshadowed;
268 }
269
270 shadow = kvm_mmu_lookup_page(vcpu, gfn);
271 if (shadow) {
272 pgprintk("%s: found shadow page for %lx, marking ro\n",
273 __FUNCTION__, gfn);
274 access_bits &= ~PT_WRITABLE_MASK;
275 if (is_writeble_pte(spte)) {
276 spte &= ~PT_WRITABLE_MASK;
277 kvm_x86_ops->tlb_flush(vcpu);
278 }
279 if (write_fault)
280 *ptwrite = 1;
281 }
282 }
283
284unshadowed:
285
286 if (access_bits & PT_WRITABLE_MASK)
287 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
288
289 set_shadow_pte(shadow_pte, spte);
290 page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
291 if (!was_rmapped)
292 rmap_add(vcpu, shadow_pte);
293}
294
295static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
296 u64 *shadow_pte, u64 access_bits,
297 int user_fault, int write_fault, int *ptwrite,
298 struct guest_walker *walker, gfn_t gfn)
299{
300 access_bits &= gpte;
301 FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK,
302 gpte, access_bits, user_fault, write_fault,
303 ptwrite, walker, gfn);
304}
305
306static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
307 u64 *spte, const void *pte, int bytes)
308{
309 pt_element_t gpte;
310
311 if (bytes < sizeof(pt_element_t))
312 return;
313 gpte = *(const pt_element_t *)pte;
314 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
315 return;
316 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
317 FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
318 0, NULL, NULL,
319 (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
320}
321
322static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
323 u64 *shadow_pte, u64 access_bits,
324 int user_fault, int write_fault, int *ptwrite,
325 struct guest_walker *walker, gfn_t gfn)
326{
327 gpa_t gaddr;
328
329 access_bits &= gpde;
330 gaddr = (gpa_t)gfn << PAGE_SHIFT;
331 if (PTTYPE == 32 && is_cpuid_PSE36())
332 gaddr |= (gpde & PT32_DIR_PSE36_MASK) <<
333 (32 - PT32_DIR_PSE36_SHIFT);
334 FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
335 gpde, access_bits, user_fault, write_fault,
336 ptwrite, walker, gfn);
337}
338
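set_pde() above handles PSE-36: a 32-bit 4MB pde can carry physical-address bits above bit 31 in the bits just above the page-size flag, which the code shifts up by (32 - PT32_DIR_PSE36_SHIFT). A sketch of that reconstruction, assuming a 4-bit PSE-36 field at pde bits 13..16 (the exact width is CPU-dependent, and the names here are local to the sketch):

    #include <stdint.h>
    #include <stdio.h>

    #define PSE36_SHIFT 13
    #define PSE36_MASK  (0xfu << PSE36_SHIFT)   /* pde bits 13..16 */

    /* Rebuild the 4MB page base address from a PSE-36 page directory entry. */
    static uint64_t pse36_page_base(uint32_t pde)
    {
            uint64_t base = pde & 0xffc00000u;   /* physical bits 31..22 */

            base |= (uint64_t)(pde & PSE36_MASK) << (32 - PSE36_SHIFT);
            return base;
    }

    int main(void)
    {
            /* pde bit 13 set -> physical bit 32 set */
            printf("%llx\n", (unsigned long long)pse36_page_base(0x80402000u));
            return 0;
    }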
339/*
340 * Fetch a shadow pte for a specific level in the paging hierarchy.
341 */
342static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
343 struct guest_walker *walker,
344 int user_fault, int write_fault, int *ptwrite)
345{
346 hpa_t shadow_addr;
347 int level;
348 u64 *shadow_ent;
349 u64 *prev_shadow_ent = NULL;
350
351 if (!is_present_pte(walker->pte))
352 return NULL;
353
354 shadow_addr = vcpu->mmu.root_hpa;
355 level = vcpu->mmu.shadow_root_level;
356 if (level == PT32E_ROOT_LEVEL) {
357 shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
358 shadow_addr &= PT64_BASE_ADDR_MASK;
359 --level;
360 }
361
362 for (; ; level--) {
363 u32 index = SHADOW_PT_INDEX(addr, level);
364 struct kvm_mmu_page *shadow_page;
365 u64 shadow_pte;
366 int metaphysical;
367 gfn_t table_gfn;
368 unsigned hugepage_access = 0;
369
370 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
371 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
372 if (level == PT_PAGE_TABLE_LEVEL)
373 break;
374 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
375 prev_shadow_ent = shadow_ent;
376 continue;
377 }
378
379 if (level == PT_PAGE_TABLE_LEVEL)
380 break;
381
382 if (level - 1 == PT_PAGE_TABLE_LEVEL
383 && walker->level == PT_DIRECTORY_LEVEL) {
384 metaphysical = 1;
385 hugepage_access = walker->pte;
386 hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
387 if (walker->pte & PT64_NX_MASK)
388 hugepage_access |= (1 << 2);
389 hugepage_access >>= PT_WRITABLE_SHIFT;
390 table_gfn = (walker->pte & PT_BASE_ADDR_MASK)
391 >> PAGE_SHIFT;
392 } else {
393 metaphysical = 0;
394 table_gfn = walker->table_gfn[level - 2];
395 }
396 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
397 metaphysical, hugepage_access,
398 shadow_ent);
399 shadow_addr = __pa(shadow_page->spt);
400 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
401 | PT_WRITABLE_MASK | PT_USER_MASK;
402 *shadow_ent = shadow_pte;
403 prev_shadow_ent = shadow_ent;
404 }
405
406 if (walker->level == PT_DIRECTORY_LEVEL) {
407 FNAME(set_pde)(vcpu, walker->pte, shadow_ent,
408 walker->inherited_ar, user_fault, write_fault,
409 ptwrite, walker, walker->gfn);
410 } else {
411 ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
412 FNAME(set_pte)(vcpu, walker->pte, shadow_ent,
413 walker->inherited_ar, user_fault, write_fault,
414 ptwrite, walker, walker->gfn);
415 }
416 return shadow_ent;
417}
418
419/*
420 * Page fault handler. There are several causes for a page fault:
421 * - there is no shadow pte for the guest pte
422 * - write access through a shadow pte marked read only so that we can set
423 * the dirty bit
424 * - write access to a shadow pte marked read only so we can update the page
425 * dirty bitmap, when userspace requests it
426 * - mmio access; in this case we will never install a present shadow pte
427 * - normal guest page fault due to the guest pte marked not present, not
428 * writable, or not executable
429 *
430 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
431 * a negative value on error.
432 */
433static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
434 u32 error_code)
435{
436 int write_fault = error_code & PFERR_WRITE_MASK;
437 int user_fault = error_code & PFERR_USER_MASK;
438 int fetch_fault = error_code & PFERR_FETCH_MASK;
439 struct guest_walker walker;
440 u64 *shadow_pte;
441 int write_pt = 0;
442 int r;
443
444 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
445 kvm_mmu_audit(vcpu, "pre page fault");
446
447 r = mmu_topup_memory_caches(vcpu);
448 if (r)
449 return r;
450
451 /*
452 * Look up the shadow pte for the faulting address.
453 */
454 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
455 fetch_fault);
456
457 /*
458 * The page is not mapped by the guest. Let the guest handle it.
459 */
460 if (!r) {
461 pgprintk("%s: guest page fault\n", __FUNCTION__);
462 inject_page_fault(vcpu, addr, walker.error_code);
463 vcpu->last_pt_write_count = 0; /* reset fork detector */
464 return 0;
465 }
466
467 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
468 &write_pt);
469 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
470 shadow_pte, *shadow_pte, write_pt);
471
472 if (!write_pt)
473 vcpu->last_pt_write_count = 0; /* reset fork detector */
474
475 /*
476 * mmio: emulate if accessible, otherwise it's a guest fault.
477 */
478 if (is_io_pte(*shadow_pte))
479 return 1;
480
481 ++vcpu->stat.pf_fixed;
482 kvm_mmu_audit(vcpu, "post page fault (fixed)");
483
484 return write_pt;
485}
486
487static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
488{
489 struct guest_walker walker;
490 gpa_t gpa = UNMAPPED_GVA;
491 int r;
492
493 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
494
495 if (r) {
496 gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
497 gpa |= vaddr & ~PAGE_MASK;
498 }
499
500 return gpa;
501}
502
503#undef pt_element_t
504#undef guest_walker
505#undef FNAME
506#undef PT_BASE_ADDR_MASK
507#undef PT_INDEX
508#undef SHADOW_PT_INDEX
509#undef PT_LEVEL_MASK
510#undef PT_DIR_BASE_ADDR_MASK
511#undef PT_MAX_FULL_LEVELS
diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h
deleted file mode 100644
index 71fdf458619a..000000000000
--- a/drivers/kvm/segment_descriptor.h
+++ /dev/null
@@ -1,17 +0,0 @@
1struct segment_descriptor {
2 u16 limit_low;
3 u16 base_low;
4 u8 base_mid;
5 u8 type : 4;
6 u8 system : 1;
7 u8 dpl : 2;
8 u8 present : 1;
9 u8 limit_high : 4;
10 u8 avl : 1;
11 u8 long_mode : 1;
12 u8 default_op : 1;
13 u8 granularity : 1;
14 u8 base_high;
15} __attribute__((packed));
16
17
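The removed segment_descriptor above mirrors the 8-byte legacy GDT descriptor layout, with the 32-bit base split across three fields. A user-space sketch that reproduces the struct and reassembles the base; it assumes GCC/Clang bit-field packing, which is what the original code relied on as well:

    #include <stdint.h>
    #include <stdio.h>

    struct segment_descriptor {
            uint16_t limit_low;
            uint16_t base_low;
            uint8_t  base_mid;
            uint8_t  type : 4;
            uint8_t  system : 1;
            uint8_t  dpl : 2;
            uint8_t  present : 1;
            uint8_t  limit_high : 4;
            uint8_t  avl : 1;
            uint8_t  long_mode : 1;
            uint8_t  default_op : 1;
            uint8_t  granularity : 1;
            uint8_t  base_high;
    } __attribute__((packed));

    _Static_assert(sizeof(struct segment_descriptor) == 8,
                   "a legacy descriptor is 8 bytes");

    /* Reassemble the 32-bit base scattered across three fields. */
    static uint32_t descriptor_base(const struct segment_descriptor *d)
    {
            return d->base_low | ((uint32_t)d->base_mid << 16) |
                   ((uint32_t)d->base_high << 24);
    }

    int main(void)
    {
            struct segment_descriptor d = { .base_low = 0x8000, .base_high = 0xfe };

            printf("base %#x\n", descriptor_base(&d));    /* 0xfe008000 */
            return 0;
    }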
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
deleted file mode 100644
index ced4ac1955db..000000000000
--- a/drivers/kvm/svm.c
+++ /dev/null
@@ -1,1754 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * AMD SVM support
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 *
8 * Authors:
9 * Yaniv Kamay <yaniv@qumranet.com>
10 * Avi Kivity <avi@qumranet.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 *
15 */
16
17#include "kvm_svm.h"
18#include "x86_emulate.h"
19#include "irq.h"
20
21#include <linux/module.h>
22#include <linux/kernel.h>
23#include <linux/vmalloc.h>
24#include <linux/highmem.h>
25#include <linux/sched.h>
26
27#include <asm/desc.h>
28
29MODULE_AUTHOR("Qumranet");
30MODULE_LICENSE("GPL");
31
32#define IOPM_ALLOC_ORDER 2
33#define MSRPM_ALLOC_ORDER 1
34
35#define DB_VECTOR 1
36#define UD_VECTOR 6
37#define GP_VECTOR 13
38
39#define DR7_GD_MASK (1 << 13)
40#define DR6_BD_MASK (1 << 13)
41
42#define SEG_TYPE_LDT 2
43#define SEG_TYPE_BUSY_TSS16 3
44
45#define KVM_EFER_LMA (1 << 10)
46#define KVM_EFER_LME (1 << 8)
47
48#define SVM_FEATURE_NPT (1 << 0)
49#define SVM_FEATURE_LBRV (1 << 1)
50#define SVM_FEATURE_SVML (1 << 2)
51
52static void kvm_reput_irq(struct vcpu_svm *svm);
53
54static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
55{
56 return container_of(vcpu, struct vcpu_svm, vcpu);
57}
58
59unsigned long iopm_base;
60unsigned long msrpm_base;
61
62struct kvm_ldttss_desc {
63 u16 limit0;
64 u16 base0;
65 unsigned base1 : 8, type : 5, dpl : 2, p : 1;
66 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
67 u32 base3;
68 u32 zero1;
69} __attribute__((packed));
70
71struct svm_cpu_data {
72 int cpu;
73
74 u64 asid_generation;
75 u32 max_asid;
76 u32 next_asid;
77 struct kvm_ldttss_desc *tss_desc;
78
79 struct page *save_area;
80};
81
82static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
83static uint32_t svm_features;
84
85struct svm_init_data {
86 int cpu;
87 int r;
88};
89
90static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
91
92#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
93#define MSRS_RANGE_SIZE 2048
94#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
95
96#define MAX_INST_SIZE 15
97
98static inline u32 svm_has(u32 feat)
99{
100 return svm_features & feat;
101}
102
103static inline u8 pop_irq(struct kvm_vcpu *vcpu)
104{
105 int word_index = __ffs(vcpu->irq_summary);
106 int bit_index = __ffs(vcpu->irq_pending[word_index]);
107 int irq = word_index * BITS_PER_LONG + bit_index;
108
109 clear_bit(bit_index, &vcpu->irq_pending[word_index]);
110 if (!vcpu->irq_pending[word_index])
111 clear_bit(word_index, &vcpu->irq_summary);
112 return irq;
113}
114
115static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
116{
117 set_bit(irq, vcpu->irq_pending);
118 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
119}
120
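pop_irq()/push_irq() above keep a two-level structure: irq_pending is a bitmap of pending vectors and irq_summary has one bit per pending word, so the lowest pending vector is found with two find-first-set operations instead of a scan. A stand-alone sketch of the same idea, using GCC/Clang builtins; the names are local to the sketch:

    #include <stdio.h>

    #define NR_WORDS 4
    #define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

    static unsigned long summary;
    static unsigned long pending[NR_WORDS];

    static void push_irq(int irq)
    {
            pending[irq / BITS_PER_LONG] |= 1ul << (irq % BITS_PER_LONG);
            summary |= 1ul << (irq / BITS_PER_LONG);
    }

    static int pop_irq(void)
    {
            int word, bit;

            if (!summary)
                    return -1;              /* nothing pending */
            word = __builtin_ffsl(summary) - 1;
            bit = __builtin_ffsl(pending[word]) - 1;
            pending[word] &= ~(1ul << bit);
            if (!pending[word])
                    summary &= ~(1ul << word);
            return word * BITS_PER_LONG + bit;
    }

    int main(void)
    {
            push_irq(3);
            push_irq(70);
            printf("%d\n", pop_irq());      /* 3 */
            printf("%d\n", pop_irq());      /* 70 */
            printf("%d\n", pop_irq());      /* -1, nothing pending */
            return 0;
    }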
121static inline void clgi(void)
122{
123 asm volatile (SVM_CLGI);
124}
125
126static inline void stgi(void)
127{
128 asm volatile (SVM_STGI);
129}
130
131static inline void invlpga(unsigned long addr, u32 asid)
132{
133 asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid));
134}
135
136static inline unsigned long kvm_read_cr2(void)
137{
138 unsigned long cr2;
139
140 asm volatile ("mov %%cr2, %0" : "=r" (cr2));
141 return cr2;
142}
143
144static inline void kvm_write_cr2(unsigned long val)
145{
146 asm volatile ("mov %0, %%cr2" :: "r" (val));
147}
148
149static inline unsigned long read_dr6(void)
150{
151 unsigned long dr6;
152
153 asm volatile ("mov %%dr6, %0" : "=r" (dr6));
154 return dr6;
155}
156
157static inline void write_dr6(unsigned long val)
158{
159 asm volatile ("mov %0, %%dr6" :: "r" (val));
160}
161
162static inline unsigned long read_dr7(void)
163{
164 unsigned long dr7;
165
166 asm volatile ("mov %%dr7, %0" : "=r" (dr7));
167 return dr7;
168}
169
170static inline void write_dr7(unsigned long val)
171{
172 asm volatile ("mov %0, %%dr7" :: "r" (val));
173}
174
175static inline void force_new_asid(struct kvm_vcpu *vcpu)
176{
177 to_svm(vcpu)->asid_generation--;
178}
179
180static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
181{
182 force_new_asid(vcpu);
183}
184
185static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
186{
187 if (!(efer & KVM_EFER_LMA))
188 efer &= ~KVM_EFER_LME;
189
190 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
191 vcpu->shadow_efer = efer;
192}
193
194static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
195{
196 struct vcpu_svm *svm = to_svm(vcpu);
197
198 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
199 SVM_EVTINJ_VALID_ERR |
200 SVM_EVTINJ_TYPE_EXEPT |
201 GP_VECTOR;
202 svm->vmcb->control.event_inj_err = error_code;
203}
204
205static void inject_ud(struct kvm_vcpu *vcpu)
206{
207 to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID |
208 SVM_EVTINJ_TYPE_EXEPT |
209 UD_VECTOR;
210}
211
212static int is_page_fault(uint32_t info)
213{
214 info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
215 return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT);
216}
217
218static int is_external_interrupt(u32 info)
219{
220 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
221 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
222}
223
224static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
225{
226 struct vcpu_svm *svm = to_svm(vcpu);
227
228 if (!svm->next_rip) {
229 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
230 return;
231 }
232 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) {
233 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
234 __FUNCTION__,
235 svm->vmcb->save.rip,
236 svm->next_rip);
237 }
238
239 vcpu->rip = svm->vmcb->save.rip = svm->next_rip;
240 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
241
242 vcpu->interrupt_window_open = 1;
243}
244
245static int has_svm(void)
246{
247 uint32_t eax, ebx, ecx, edx;
248
249 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
250 printk(KERN_INFO "has_svm: not amd\n");
251 return 0;
252 }
253
254 cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
255 if (eax < SVM_CPUID_FUNC) {
256 printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
257 return 0;
258 }
259
260 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
261 if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
262 printk(KERN_DEBUG "has_svm: svm not available\n");
263 return 0;
264 }
265 return 1;
266}
267
268static void svm_hardware_disable(void *garbage)
269{
270 struct svm_cpu_data *svm_data
271 = per_cpu(svm_data, raw_smp_processor_id());
272
273 if (svm_data) {
274 uint64_t efer;
275
276 wrmsrl(MSR_VM_HSAVE_PA, 0);
277 rdmsrl(MSR_EFER, efer);
278 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
279 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
280 __free_page(svm_data->save_area);
281 kfree(svm_data);
282 }
283}
284
285static void svm_hardware_enable(void *garbage)
286{
287
288 struct svm_cpu_data *svm_data;
289 uint64_t efer;
291 struct desc_ptr gdt_descr;
295 struct desc_struct *gdt;
296 int me = raw_smp_processor_id();
297
298 if (!has_svm()) {
299 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
300 return;
301 }
302 svm_data = per_cpu(svm_data, me);
303
304 if (!svm_data) {
305 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
306 me);
307 return;
308 }
309
310 svm_data->asid_generation = 1;
311 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
312 svm_data->next_asid = svm_data->max_asid + 1;
313 svm_features = cpuid_edx(SVM_CPUID_FUNC);
314
315 asm volatile ( "sgdt %0" : "=m"(gdt_descr) );
316 gdt = (struct desc_struct *)gdt_descr.address;
317 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
318
319 rdmsrl(MSR_EFER, efer);
320 wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);
321
322 wrmsrl(MSR_VM_HSAVE_PA,
323 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
324}
325
326static int svm_cpu_init(int cpu)
327{
328 struct svm_cpu_data *svm_data;
329 int r;
330
331 svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
332 if (!svm_data)
333 return -ENOMEM;
334 svm_data->cpu = cpu;
335 svm_data->save_area = alloc_page(GFP_KERNEL);
336 r = -ENOMEM;
337 if (!svm_data->save_area)
338 goto err_1;
339
340 per_cpu(svm_data, cpu) = svm_data;
341
342 return 0;
343
344err_1:
345 kfree(svm_data);
346 return r;
347
348}
349
350static void set_msr_interception(u32 *msrpm, unsigned msr,
351 int read, int write)
352{
353 int i;
354
355 for (i = 0; i < NUM_MSR_MAPS; i++) {
356 if (msr >= msrpm_ranges[i] &&
357 msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
358 u32 msr_offset = (i * MSRS_IN_RANGE + msr -
359 msrpm_ranges[i]) * 2;
360
361 u32 *base = msrpm + (msr_offset / 32);
362 u32 msr_shift = msr_offset % 32;
363 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
364 *base = (*base & ~(0x3 << msr_shift)) |
365 (mask << msr_shift);
366 return;
367 }
368 }
369 BUG();
370}
371
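set_msr_interception() above treats the MSR permission map as two bits per MSR (one for read intercept, one for write intercept), with each supported MSR range occupying its own 2KB slice of the bitmap. A small sketch of that index arithmetic, reusing the same three range bases; everything else is named locally for illustration:

    #include <stdint.h>
    #include <stdio.h>

    static const uint32_t ranges[] = { 0, 0xc0000000, 0xc0010000 };
    #define MSRS_PER_RANGE (2048 * 8 / 2)   /* 2KB of bitmap, 2 bits per MSR */

    /* Bit offset of an MSR's 2-bit read/write field, or -1 if unmapped. */
    static long msr_bit_offset(uint32_t msr)
    {
            unsigned i;

            for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
                    if (msr >= ranges[i] && msr < ranges[i] + MSRS_PER_RANGE)
                            return ((long)i * MSRS_PER_RANGE +
                                    (msr - ranges[i])) * 2;
            return -1;
    }

    int main(void)
    {
            printf("%ld\n", msr_bit_offset(0xc0000101));  /* MSR_GS_BASE */
            return 0;
    }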
372static __init int svm_hardware_setup(void)
373{
374 int cpu;
375 struct page *iopm_pages;
376 struct page *msrpm_pages;
377 void *iopm_va, *msrpm_va;
378 int r;
379
380 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
381
382 if (!iopm_pages)
383 return -ENOMEM;
384
385 iopm_va = page_address(iopm_pages);
386 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
387 clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
388 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
389
390
391 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
392
393 r = -ENOMEM;
394 if (!msrpm_pages)
395 goto err_1;
396
397 msrpm_va = page_address(msrpm_pages);
398 memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
399 msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
400
401#ifdef CONFIG_X86_64
402 set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
403 set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
404 set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
405 set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
406 set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
407 set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
408#endif
409 set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
410 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
411 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
412 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
413
414 for_each_online_cpu(cpu) {
415 r = svm_cpu_init(cpu);
416 if (r)
417 goto err_2;
418 }
419 return 0;
420
421err_2:
422 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
423 msrpm_base = 0;
424err_1:
425 __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
426 iopm_base = 0;
427 return r;
428}
429
430static __exit void svm_hardware_unsetup(void)
431{
432 __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
433 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
434 iopm_base = msrpm_base = 0;
435}
436
437static void init_seg(struct vmcb_seg *seg)
438{
439 seg->selector = 0;
440 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
441 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
442 seg->limit = 0xffff;
443 seg->base = 0;
444}
445
446static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
447{
448 seg->selector = 0;
449 seg->attrib = SVM_SELECTOR_P_MASK | type;
450 seg->limit = 0xffff;
451 seg->base = 0;
452}
453
454static void init_vmcb(struct vmcb *vmcb)
455{
456 struct vmcb_control_area *control = &vmcb->control;
457 struct vmcb_save_area *save = &vmcb->save;
458
459 control->intercept_cr_read = INTERCEPT_CR0_MASK |
460 INTERCEPT_CR3_MASK |
461 INTERCEPT_CR4_MASK;
462
463 control->intercept_cr_write = INTERCEPT_CR0_MASK |
464 INTERCEPT_CR3_MASK |
465 INTERCEPT_CR4_MASK;
466
467 control->intercept_dr_read = INTERCEPT_DR0_MASK |
468 INTERCEPT_DR1_MASK |
469 INTERCEPT_DR2_MASK |
470 INTERCEPT_DR3_MASK;
471
472 control->intercept_dr_write = INTERCEPT_DR0_MASK |
473 INTERCEPT_DR1_MASK |
474 INTERCEPT_DR2_MASK |
475 INTERCEPT_DR3_MASK |
476 INTERCEPT_DR5_MASK |
477 INTERCEPT_DR7_MASK;
478
479 control->intercept_exceptions = 1 << PF_VECTOR;
480
481
482 control->intercept = (1ULL << INTERCEPT_INTR) |
483 (1ULL << INTERCEPT_NMI) |
484 (1ULL << INTERCEPT_SMI) |
485 /*
486 * selective cr0 intercept bug?
487 * 0: 0f 22 d8 mov %eax,%cr3
488 * 3: 0f 20 c0 mov %cr0,%eax
489 * 6: 0d 00 00 00 80 or $0x80000000,%eax
490 * b: 0f 22 c0 mov %eax,%cr0
491 * set cr3 ->interception
492 * get cr0 ->interception
493 * set cr0 -> no interception
494 */
495 /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */
496 (1ULL << INTERCEPT_CPUID) |
497 (1ULL << INTERCEPT_INVD) |
498 (1ULL << INTERCEPT_HLT) |
499 (1ULL << INTERCEPT_INVLPGA) |
500 (1ULL << INTERCEPT_IOIO_PROT) |
501 (1ULL << INTERCEPT_MSR_PROT) |
502 (1ULL << INTERCEPT_TASK_SWITCH) |
503 (1ULL << INTERCEPT_SHUTDOWN) |
504 (1ULL << INTERCEPT_VMRUN) |
505 (1ULL << INTERCEPT_VMMCALL) |
506 (1ULL << INTERCEPT_VMLOAD) |
507 (1ULL << INTERCEPT_VMSAVE) |
508 (1ULL << INTERCEPT_STGI) |
509 (1ULL << INTERCEPT_CLGI) |
510 (1ULL << INTERCEPT_SKINIT) |
511 (1ULL << INTERCEPT_WBINVD) |
512 (1ULL << INTERCEPT_MONITOR) |
513 (1ULL << INTERCEPT_MWAIT);
514
515 control->iopm_base_pa = iopm_base;
516 control->msrpm_base_pa = msrpm_base;
517 control->tsc_offset = 0;
518 control->int_ctl = V_INTR_MASKING_MASK;
519
520 init_seg(&save->es);
521 init_seg(&save->ss);
522 init_seg(&save->ds);
523 init_seg(&save->fs);
524 init_seg(&save->gs);
525
526 save->cs.selector = 0xf000;
527 /* Executable/Readable Code Segment */
528 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
529 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
530 save->cs.limit = 0xffff;
531 /*
532 * cs.base should really be 0xffff0000, but vmx can't handle that, so
533 * we stay consistent with it.
534 *
535 * Replace when we have real mode working for vmx.
536 */
537 save->cs.base = 0xf0000;
538
539 save->gdtr.limit = 0xffff;
540 save->idtr.limit = 0xffff;
541
542 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
543 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
544
545 save->efer = MSR_EFER_SVME_MASK;
546
547 save->dr6 = 0xffff0ff0;
548 save->dr7 = 0x400;
549 save->rflags = 2;
550 save->rip = 0x0000fff0;
551
552 /*
553 * cr0 at CPU init should be 0x60000010; we enable the CPU
554 * cache by default. The proper place to enable the cache is in the BIOS.
555 */
556 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
557 save->cr4 = X86_CR4_PAE;
558 /* rdx = ?? */
559}
560
561static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
562{
563 struct vcpu_svm *svm = to_svm(vcpu);
564
565 init_vmcb(svm->vmcb);
566
567 if (vcpu->vcpu_id != 0) {
568 svm->vmcb->save.rip = 0;
569 svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12;
570 svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8;
571 }
572}
573
574static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
575{
576 struct vcpu_svm *svm;
577 struct page *page;
578 int err;
579
580 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
581 if (!svm) {
582 err = -ENOMEM;
583 goto out;
584 }
585
586 err = kvm_vcpu_init(&svm->vcpu, kvm, id);
587 if (err)
588 goto free_svm;
589
590 if (irqchip_in_kernel(kvm)) {
591 err = kvm_create_lapic(&svm->vcpu);
592 if (err < 0)
593 goto free_svm;
594 }
595
596 page = alloc_page(GFP_KERNEL);
597 if (!page) {
598 err = -ENOMEM;
599 goto uninit;
600 }
601
602 svm->vmcb = page_address(page);
603 clear_page(svm->vmcb);
604 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
605 svm->asid_generation = 0;
606 memset(svm->db_regs, 0, sizeof(svm->db_regs));
607 init_vmcb(svm->vmcb);
608
609 fx_init(&svm->vcpu);
610 svm->vcpu.fpu_active = 1;
611 svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
612 if (svm->vcpu.vcpu_id == 0)
613 svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP;
614
615 return &svm->vcpu;
616
617uninit:
618 kvm_vcpu_uninit(&svm->vcpu);
619free_svm:
620 kmem_cache_free(kvm_vcpu_cache, svm);
621out:
622 return ERR_PTR(err);
623}
624
625static void svm_free_vcpu(struct kvm_vcpu *vcpu)
626{
627 struct vcpu_svm *svm = to_svm(vcpu);
628
629 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
630 kvm_vcpu_uninit(vcpu);
631 kmem_cache_free(kvm_vcpu_cache, svm);
632}
633
634static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
635{
636 struct vcpu_svm *svm = to_svm(vcpu);
637 int i;
638
639 if (unlikely(cpu != vcpu->cpu)) {
640 u64 tsc_this, delta;
641
642 /*
643 * Make sure that the guest sees a monotonically
644 * increasing TSC.
645 */
646 rdtscll(tsc_this);
647 delta = vcpu->host_tsc - tsc_this;
648 svm->vmcb->control.tsc_offset += delta;
649 vcpu->cpu = cpu;
650 kvm_migrate_apic_timer(vcpu);
651 }
652
653 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
654 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
655}
656
657static void svm_vcpu_put(struct kvm_vcpu *vcpu)
658{
659 struct vcpu_svm *svm = to_svm(vcpu);
660 int i;
661
662 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
663 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
664
665 rdtscll(vcpu->host_tsc);
666 kvm_put_guest_fpu(vcpu);
667}
668
669static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
670{
671}
672
673static void svm_cache_regs(struct kvm_vcpu *vcpu)
674{
675 struct vcpu_svm *svm = to_svm(vcpu);
676
677 vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
678 vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
679 vcpu->rip = svm->vmcb->save.rip;
680}
681
682static void svm_decache_regs(struct kvm_vcpu *vcpu)
683{
684 struct vcpu_svm *svm = to_svm(vcpu);
685 svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX];
686 svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP];
687 svm->vmcb->save.rip = vcpu->rip;
688}
689
690static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
691{
692 return to_svm(vcpu)->vmcb->save.rflags;
693}
694
695static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
696{
697 to_svm(vcpu)->vmcb->save.rflags = rflags;
698}
699
700static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
701{
702 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
703
704 switch (seg) {
705 case VCPU_SREG_CS: return &save->cs;
706 case VCPU_SREG_DS: return &save->ds;
707 case VCPU_SREG_ES: return &save->es;
708 case VCPU_SREG_FS: return &save->fs;
709 case VCPU_SREG_GS: return &save->gs;
710 case VCPU_SREG_SS: return &save->ss;
711 case VCPU_SREG_TR: return &save->tr;
712 case VCPU_SREG_LDTR: return &save->ldtr;
713 }
714 BUG();
715 return NULL;
716}
717
718static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
719{
720 struct vmcb_seg *s = svm_seg(vcpu, seg);
721
722 return s->base;
723}
724
725static void svm_get_segment(struct kvm_vcpu *vcpu,
726 struct kvm_segment *var, int seg)
727{
728 struct vmcb_seg *s = svm_seg(vcpu, seg);
729
730 var->base = s->base;
731 var->limit = s->limit;
732 var->selector = s->selector;
733 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
734 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
735 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
736 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
737 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
738 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
739 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
740 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
741 var->unusable = !var->present;
742}
743
744static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
745{
746 struct vcpu_svm *svm = to_svm(vcpu);
747
748 dt->limit = svm->vmcb->save.idtr.limit;
749 dt->base = svm->vmcb->save.idtr.base;
750}
751
752static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
753{
754 struct vcpu_svm *svm = to_svm(vcpu);
755
756 svm->vmcb->save.idtr.limit = dt->limit;
757 svm->vmcb->save.idtr.base = dt->base;
758}
759
760static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
761{
762 struct vcpu_svm *svm = to_svm(vcpu);
763
764 dt->limit = svm->vmcb->save.gdtr.limit;
765 dt->base = svm->vmcb->save.gdtr.base;
766}
767
768static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
769{
770 struct vcpu_svm *svm = to_svm(vcpu);
771
772 svm->vmcb->save.gdtr.limit = dt->limit;
773 svm->vmcb->save.gdtr.base = dt->base;
774}
775
776static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
777{
778}
779
780static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
781{
782 struct vcpu_svm *svm = to_svm(vcpu);
783
784#ifdef CONFIG_X86_64
785 if (vcpu->shadow_efer & KVM_EFER_LME) {
786 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
787 vcpu->shadow_efer |= KVM_EFER_LMA;
788 svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME;
789 }
790
791 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) {
792 vcpu->shadow_efer &= ~KVM_EFER_LMA;
793 svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME);
794 }
795 }
796#endif
797 if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
798 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
799 vcpu->fpu_active = 1;
800 }
801
802 vcpu->cr0 = cr0;
803 cr0 |= X86_CR0_PG | X86_CR0_WP;
804 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
805 svm->vmcb->save.cr0 = cr0;
806}
807
808static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
809{
810 vcpu->cr4 = cr4;
811 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
812}
813
814static void svm_set_segment(struct kvm_vcpu *vcpu,
815 struct kvm_segment *var, int seg)
816{
817 struct vcpu_svm *svm = to_svm(vcpu);
818 struct vmcb_seg *s = svm_seg(vcpu, seg);
819
820 s->base = var->base;
821 s->limit = var->limit;
822 s->selector = var->selector;
823 if (var->unusable)
824 s->attrib = 0;
825 else {
826 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
827 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
828 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
829 s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
830 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
831 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
832 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
833 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
834 }
835 if (seg == VCPU_SREG_CS)
836 svm->vmcb->save.cpl
837 = (svm->vmcb->save.cs.attrib
838 >> SVM_SELECTOR_DPL_SHIFT) & 3;
839
840}
841
842/* FIXME:
843
844 svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
845 svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
846
847*/
848
849static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
850{
851 return -EOPNOTSUPP;
852}
853
854static int svm_get_irq(struct kvm_vcpu *vcpu)
855{
856 struct vcpu_svm *svm = to_svm(vcpu);
857 u32 exit_int_info = svm->vmcb->control.exit_int_info;
858
859 if (is_external_interrupt(exit_int_info))
860 return exit_int_info & SVM_EVTINJ_VEC_MASK;
861 return -1;
862}
863
864static void load_host_msrs(struct kvm_vcpu *vcpu)
865{
866#ifdef CONFIG_X86_64
867 wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
868#endif
869}
870
871static void save_host_msrs(struct kvm_vcpu *vcpu)
872{
873#ifdef CONFIG_X86_64
874 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
875#endif
876}
877
878static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
879{
880 if (svm_data->next_asid > svm_data->max_asid) {
881 ++svm_data->asid_generation;
882 svm_data->next_asid = 1;
883 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
884 }
885
886 svm->vcpu.cpu = svm_data->cpu;
887 svm->asid_generation = svm_data->asid_generation;
888 svm->vmcb->control.asid = svm_data->next_asid++;
889}
890
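new_asid() above hands out ASIDs sequentially and, when the pool is exhausted, bumps the generation and requests a full TLB flush so every stale ASID becomes safe to reuse. A minimal sketch of that policy; the types and names are local to the sketch:

    #include <stdint.h>
    #include <stdio.h>

    struct asid_pool {
            uint64_t generation;
            uint32_t max_asid;
            uint32_t next_asid;
    };

    static uint32_t assign_asid(struct asid_pool *p, int *flush_all)
    {
            *flush_all = 0;
            if (p->next_asid > p->max_asid) {
                    ++p->generation;
                    p->next_asid = 1;       /* ASID 0 stays with the host */
                    *flush_all = 1;
            }
            return p->next_asid++;
    }

    int main(void)
    {
            struct asid_pool p = { .generation = 1, .max_asid = 2, .next_asid = 1 };
            int i, flush;

            for (i = 0; i < 4; i++) {
                    uint32_t asid = assign_asid(&p, &flush);

                    printf("asid %u gen %llu flush %d\n", asid,
                           (unsigned long long)p.generation, flush);
            }
            return 0;
    }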
891static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
892{
893 return to_svm(vcpu)->db_regs[dr];
894}
895
896static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
897 int *exception)
898{
899 struct vcpu_svm *svm = to_svm(vcpu);
900
901 *exception = 0;
902
903 if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
904 svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
905 svm->vmcb->save.dr6 |= DR6_BD_MASK;
906 *exception = DB_VECTOR;
907 return;
908 }
909
910 switch (dr) {
911 case 0 ... 3:
912 svm->db_regs[dr] = value;
913 return;
914 case 4 ... 5:
915 if (vcpu->cr4 & X86_CR4_DE) {
916 *exception = UD_VECTOR;
917 return;
918 }
919 case 7: {
920 if (value & ~((1ULL << 32) - 1)) {
921 *exception = GP_VECTOR;
922 return;
923 }
924 svm->vmcb->save.dr7 = value;
925 return;
926 }
927 default:
928 printk(KERN_DEBUG "%s: unexpected dr %u\n",
929 __FUNCTION__, dr);
930 *exception = UD_VECTOR;
931 return;
932 }
933}
934
935static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
936{
937 u32 exit_int_info = svm->vmcb->control.exit_int_info;
938 struct kvm *kvm = svm->vcpu.kvm;
939 u64 fault_address;
940 u32 error_code;
941 enum emulation_result er;
942 int r;
943
944 if (!irqchip_in_kernel(kvm) &&
945 is_external_interrupt(exit_int_info))
946 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
947
948 mutex_lock(&kvm->lock);
949
950 fault_address = svm->vmcb->control.exit_info_2;
951 error_code = svm->vmcb->control.exit_info_1;
952 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
953 if (r < 0) {
954 mutex_unlock(&kvm->lock);
955 return r;
956 }
957 if (!r) {
958 mutex_unlock(&kvm->lock);
959 return 1;
960 }
961 er = emulate_instruction(&svm->vcpu, kvm_run, fault_address,
962 error_code);
963 mutex_unlock(&kvm->lock);
964
965 switch (er) {
966 case EMULATE_DONE:
967 return 1;
968 case EMULATE_DO_MMIO:
969 ++svm->vcpu.stat.mmio_exits;
970 return 0;
971 case EMULATE_FAIL:
972 kvm_report_emulation_failure(&svm->vcpu, "pagetable");
973 break;
974 default:
975 BUG();
976 }
977
978 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
979 return 0;
980}
981
982static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
983{
984 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
985 if (!(svm->vcpu.cr0 & X86_CR0_TS))
986 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
987 svm->vcpu.fpu_active = 1;
988
989 return 1;
990}
991
992static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
993{
994 /*
995 * VMCB is undefined after a SHUTDOWN intercept
996 * so reinitialize it.
997 */
998 clear_page(svm->vmcb);
999 init_vmcb(svm->vmcb);
1000
1001 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1002 return 0;
1003}
1004
1005static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1006{
1007	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1008 int size, down, in, string, rep;
1009 unsigned port;
1010
1011 ++svm->vcpu.stat.io_exits;
1012
1013 svm->next_rip = svm->vmcb->control.exit_info_2;
1014
1015 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1016
1017 if (string) {
1018 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
1019 return 0;
1020 return 1;
1021 }
1022
1023 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1024 port = io_info >> 16;
1025 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1026 rep = (io_info & SVM_IOIO_REP_MASK) != 0;
1027 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
1028
1029 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1030}
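/*
 * Worked example (assuming the EXITINFO1 layout implied by the SVM_IOIO_*
 * masks in svm.h; the address-size bits are ignored here): a guest
 * executing "in %al, $0x71" produces io_info = 0x00710011 -- port 0x71 in
 * bits 16-31, SZ8 (bit 4) for a one-byte access, and TYPE (bit 0) set for
 * an IN.  io_interception() above then decodes in = 1, string = rep = 0,
 * port = 0x71, size = 1, and hands the access to kvm_emulate_pio().
 */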
1031
1032static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1033{
1034 return 1;
1035}
1036
1037static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1038{
1039 svm->next_rip = svm->vmcb->save.rip + 1;
1040 skip_emulated_instruction(&svm->vcpu);
1041 return kvm_emulate_halt(&svm->vcpu);
1042}
1043
1044static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1045{
1046 svm->next_rip = svm->vmcb->save.rip + 3;
1047 skip_emulated_instruction(&svm->vcpu);
1048 return kvm_hypercall(&svm->vcpu, kvm_run);
1049}
1050
1051static int invalid_op_interception(struct vcpu_svm *svm,
1052 struct kvm_run *kvm_run)
1053{
1054 inject_ud(&svm->vcpu);
1055 return 1;
1056}
1057
1058static int task_switch_interception(struct vcpu_svm *svm,
1059 struct kvm_run *kvm_run)
1060{
1061 pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
1062 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1063 return 0;
1064}
1065
1066static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1067{
1068 svm->next_rip = svm->vmcb->save.rip + 2;
1069 kvm_emulate_cpuid(&svm->vcpu);
1070 return 1;
1071}
1072
1073static int emulate_on_interception(struct vcpu_svm *svm,
1074 struct kvm_run *kvm_run)
1075{
1076 if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE)
1077 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
1078 return 1;
1079}
1080
1081static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1082{
1083 struct vcpu_svm *svm = to_svm(vcpu);
1084
1085 switch (ecx) {
1086 case MSR_IA32_TIME_STAMP_COUNTER: {
1087 u64 tsc;
1088
1089 rdtscll(tsc);
1090 *data = svm->vmcb->control.tsc_offset + tsc;
1091 break;
1092 }
1093 case MSR_K6_STAR:
1094 *data = svm->vmcb->save.star;
1095 break;
1096#ifdef CONFIG_X86_64
1097 case MSR_LSTAR:
1098 *data = svm->vmcb->save.lstar;
1099 break;
1100 case MSR_CSTAR:
1101 *data = svm->vmcb->save.cstar;
1102 break;
1103 case MSR_KERNEL_GS_BASE:
1104 *data = svm->vmcb->save.kernel_gs_base;
1105 break;
1106 case MSR_SYSCALL_MASK:
1107 *data = svm->vmcb->save.sfmask;
1108 break;
1109#endif
1110 case MSR_IA32_SYSENTER_CS:
1111 *data = svm->vmcb->save.sysenter_cs;
1112 break;
1113 case MSR_IA32_SYSENTER_EIP:
1114 *data = svm->vmcb->save.sysenter_eip;
1115 break;
1116 case MSR_IA32_SYSENTER_ESP:
1117 *data = svm->vmcb->save.sysenter_esp;
1118 break;
1119 default:
1120 return kvm_get_msr_common(vcpu, ecx, data);
1121 }
1122 return 0;
1123}
1124
1125static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1126{
1127 u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
1128 u64 data;
1129
1130 if (svm_get_msr(&svm->vcpu, ecx, &data))
1131 svm_inject_gp(&svm->vcpu, 0);
1132 else {
1133 svm->vmcb->save.rax = data & 0xffffffff;
1134 svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32;
1135 svm->next_rip = svm->vmcb->save.rip + 2;
1136 skip_emulated_instruction(&svm->vcpu);
1137 }
1138 return 1;
1139}
1140
1141static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1142{
1143 struct vcpu_svm *svm = to_svm(vcpu);
1144
1145 switch (ecx) {
1146 case MSR_IA32_TIME_STAMP_COUNTER: {
1147 u64 tsc;
1148
1149 rdtscll(tsc);
1150 svm->vmcb->control.tsc_offset = data - tsc;
1151 break;
1152 }
1153 case MSR_K6_STAR:
1154 svm->vmcb->save.star = data;
1155 break;
1156#ifdef CONFIG_X86_64
1157 case MSR_LSTAR:
1158 svm->vmcb->save.lstar = data;
1159 break;
1160 case MSR_CSTAR:
1161 svm->vmcb->save.cstar = data;
1162 break;
1163 case MSR_KERNEL_GS_BASE:
1164 svm->vmcb->save.kernel_gs_base = data;
1165 break;
1166 case MSR_SYSCALL_MASK:
1167 svm->vmcb->save.sfmask = data;
1168 break;
1169#endif
1170 case MSR_IA32_SYSENTER_CS:
1171 svm->vmcb->save.sysenter_cs = data;
1172 break;
1173 case MSR_IA32_SYSENTER_EIP:
1174 svm->vmcb->save.sysenter_eip = data;
1175 break;
1176 case MSR_IA32_SYSENTER_ESP:
1177 svm->vmcb->save.sysenter_esp = data;
1178 break;
1179 default:
1180 return kvm_set_msr_common(vcpu, ecx, data);
1181 }
1182 return 0;
1183}
1184
1185static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1186{
1187 u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
1188 u64 data = (svm->vmcb->save.rax & -1u)
1189 | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32);
1190 svm->next_rip = svm->vmcb->save.rip + 2;
1191 if (svm_set_msr(&svm->vcpu, ecx, data))
1192 svm_inject_gp(&svm->vcpu, 0);
1193 else
1194 skip_emulated_instruction(&svm->vcpu);
1195 return 1;
1196}
1197
1198static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1199{
1200 if (svm->vmcb->control.exit_info_1)
1201 return wrmsr_interception(svm, kvm_run);
1202 else
1203 return rdmsr_interception(svm, kvm_run);
1204}
1205
1206static int interrupt_window_interception(struct vcpu_svm *svm,
1207 struct kvm_run *kvm_run)
1208{
1209 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
1210 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1211 /*
1212	 * If user space is waiting to inject interrupts, exit as soon as
1213	 * possible.
1214 */
1215 if (kvm_run->request_interrupt_window &&
1216 !svm->vcpu.irq_summary) {
1217 ++svm->vcpu.stat.irq_window_exits;
1218 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1219 return 0;
1220 }
1221
1222 return 1;
1223}
1224
1225static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1226 struct kvm_run *kvm_run) = {
1227 [SVM_EXIT_READ_CR0] = emulate_on_interception,
1228 [SVM_EXIT_READ_CR3] = emulate_on_interception,
1229 [SVM_EXIT_READ_CR4] = emulate_on_interception,
1230 /* for now: */
1231 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
1232 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
1233 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
1234 [SVM_EXIT_READ_DR0] = emulate_on_interception,
1235 [SVM_EXIT_READ_DR1] = emulate_on_interception,
1236 [SVM_EXIT_READ_DR2] = emulate_on_interception,
1237 [SVM_EXIT_READ_DR3] = emulate_on_interception,
1238 [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
1239 [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
1240 [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
1241 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
1242 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
1243 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
1244 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1245 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1246 [SVM_EXIT_INTR] = nop_on_interception,
1247 [SVM_EXIT_NMI] = nop_on_interception,
1248 [SVM_EXIT_SMI] = nop_on_interception,
1249 [SVM_EXIT_INIT] = nop_on_interception,
1250 [SVM_EXIT_VINTR] = interrupt_window_interception,
1251 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
1252 [SVM_EXIT_CPUID] = cpuid_interception,
1253 [SVM_EXIT_INVD] = emulate_on_interception,
1254 [SVM_EXIT_HLT] = halt_interception,
1255 [SVM_EXIT_INVLPG] = emulate_on_interception,
1256 [SVM_EXIT_INVLPGA] = invalid_op_interception,
1257 [SVM_EXIT_IOIO] = io_interception,
1258 [SVM_EXIT_MSR] = msr_interception,
1259 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
1260 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
1261 [SVM_EXIT_VMRUN] = invalid_op_interception,
1262 [SVM_EXIT_VMMCALL] = vmmcall_interception,
1263 [SVM_EXIT_VMLOAD] = invalid_op_interception,
1264 [SVM_EXIT_VMSAVE] = invalid_op_interception,
1265 [SVM_EXIT_STGI] = invalid_op_interception,
1266 [SVM_EXIT_CLGI] = invalid_op_interception,
1267 [SVM_EXIT_SKINIT] = invalid_op_interception,
1268 [SVM_EXIT_WBINVD] = emulate_on_interception,
1269 [SVM_EXIT_MONITOR] = invalid_op_interception,
1270 [SVM_EXIT_MWAIT] = invalid_op_interception,
1271};
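/*
 * handle_exit() below indexes this table directly with the VMCB exit_code:
 * an intercepted I/O instruction, for instance, sets exit_code to
 * SVM_EXIT_IOIO (0x07b) and so lands in io_interception().  Out-of-range
 * or unlisted exit codes are reported to user space as KVM_EXIT_UNKNOWN.
 */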
1272
1273
1274static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1275{
1276 struct vcpu_svm *svm = to_svm(vcpu);
1277 u32 exit_code = svm->vmcb->control.exit_code;
1278
1279 kvm_reput_irq(svm);
1280
1281 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
1282 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1283 kvm_run->fail_entry.hardware_entry_failure_reason
1284 = svm->vmcb->control.exit_code;
1285 return 0;
1286 }
1287
1288 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
1289 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
1290		printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
1291 "exit_code 0x%x\n",
1292 __FUNCTION__, svm->vmcb->control.exit_int_info,
1293 exit_code);
1294
1295 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
1296 || svm_exit_handlers[exit_code] == 0) {
1297 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1298 kvm_run->hw.hardware_exit_reason = exit_code;
1299 return 0;
1300 }
1301
1302 return svm_exit_handlers[exit_code](svm, kvm_run);
1303}
1304
1305static void reload_tss(struct kvm_vcpu *vcpu)
1306{
1307 int cpu = raw_smp_processor_id();
1308
1309 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1310	svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
1311 load_TR_desc();
1312}
1313
1314static void pre_svm_run(struct vcpu_svm *svm)
1315{
1316 int cpu = raw_smp_processor_id();
1317
1318 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1319
1320 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
1321 if (svm->vcpu.cpu != cpu ||
1322 svm->asid_generation != svm_data->asid_generation)
1323 new_asid(svm, svm_data);
1324}
1325
1326
1327static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1328{
1329 struct vmcb_control_area *control;
1330
1331 control = &svm->vmcb->control;
1332 control->int_vector = irq;
1333 control->int_ctl &= ~V_INTR_PRIO_MASK;
1334 control->int_ctl |= V_IRQ_MASK |
1335 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1336}
1337
1338static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
1339{
1340 struct vcpu_svm *svm = to_svm(vcpu);
1341
1342 svm_inject_irq(svm, irq);
1343}
1344
1345static void svm_intr_assist(struct kvm_vcpu *vcpu)
1346{
1347 struct vcpu_svm *svm = to_svm(vcpu);
1348 struct vmcb *vmcb = svm->vmcb;
1349 int intr_vector = -1;
1350
1351 kvm_inject_pending_timer_irqs(vcpu);
1352 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
1353 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
1354 intr_vector = vmcb->control.exit_int_info &
1355 SVM_EVTINJ_VEC_MASK;
1356 vmcb->control.exit_int_info = 0;
1357 svm_inject_irq(svm, intr_vector);
1358 return;
1359 }
1360
1361 if (vmcb->control.int_ctl & V_IRQ_MASK)
1362 return;
1363
1364 if (!kvm_cpu_has_interrupt(vcpu))
1365 return;
1366
1367 if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
1368 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
1369 (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
1370 /* unable to deliver irq, set pending irq */
1371 vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
1372 svm_inject_irq(svm, 0x0);
1373 return;
1374 }
1375 /* Okay, we can deliver the interrupt: grab it and update PIC state. */
1376 intr_vector = kvm_cpu_get_interrupt(vcpu);
1377 svm_inject_irq(svm, intr_vector);
1378 kvm_timer_intr_post(vcpu, intr_vector);
1379}
1380
1381static void kvm_reput_irq(struct vcpu_svm *svm)
1382{
1383 struct vmcb_control_area *control = &svm->vmcb->control;
1384
1385 if ((control->int_ctl & V_IRQ_MASK)
1386 && !irqchip_in_kernel(svm->vcpu.kvm)) {
1387 control->int_ctl &= ~V_IRQ_MASK;
1388 push_irq(&svm->vcpu, control->int_vector);
1389 }
1390
1391 svm->vcpu.interrupt_window_open =
1392 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
1393}
1394
1395static void svm_do_inject_vector(struct vcpu_svm *svm)
1396{
1397 struct kvm_vcpu *vcpu = &svm->vcpu;
1398 int word_index = __ffs(vcpu->irq_summary);
1399 int bit_index = __ffs(vcpu->irq_pending[word_index]);
1400 int irq = word_index * BITS_PER_LONG + bit_index;
1401
1402 clear_bit(bit_index, &vcpu->irq_pending[word_index]);
1403 if (!vcpu->irq_pending[word_index])
1404 clear_bit(word_index, &vcpu->irq_summary);
1405 svm_inject_irq(svm, irq);
1406}
1407
1408static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1409 struct kvm_run *kvm_run)
1410{
1411 struct vcpu_svm *svm = to_svm(vcpu);
1412 struct vmcb_control_area *control = &svm->vmcb->control;
1413
1414 svm->vcpu.interrupt_window_open =
1415 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
1416 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
1417
1418 if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary)
1419 /*
1420		 * Interrupts are enabled and not blocked by sti or mov ss, so inject.
1421 */
1422 svm_do_inject_vector(svm);
1423
1424 /*
1425 * Interrupts blocked. Wait for unblock.
1426 */
1427 if (!svm->vcpu.interrupt_window_open &&
1428 (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) {
1429 control->intercept |= 1ULL << INTERCEPT_VINTR;
1430 } else
1431 control->intercept &= ~(1ULL << INTERCEPT_VINTR);
1432}
1433
1434static void save_db_regs(unsigned long *db_regs)
1435{
1436 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
1437 asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
1438 asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
1439 asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
1440}
1441
1442static void load_db_regs(unsigned long *db_regs)
1443{
1444 asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
1445 asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
1446 asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
1447 asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
1448}
1449
1450static void svm_flush_tlb(struct kvm_vcpu *vcpu)
1451{
1452 force_new_asid(vcpu);
1453}
1454
1455static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
1456{
1457}
1458
1459static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1460{
1461 struct vcpu_svm *svm = to_svm(vcpu);
1462 u16 fs_selector;
1463 u16 gs_selector;
1464 u16 ldt_selector;
1465
1466 pre_svm_run(svm);
1467
1468 save_host_msrs(vcpu);
1469 fs_selector = read_fs();
1470 gs_selector = read_gs();
1471 ldt_selector = read_ldt();
1472 svm->host_cr2 = kvm_read_cr2();
1473 svm->host_dr6 = read_dr6();
1474 svm->host_dr7 = read_dr7();
1475 svm->vmcb->save.cr2 = vcpu->cr2;
1476
1477 if (svm->vmcb->save.dr7 & 0xff) {
1478 write_dr7(0);
1479 save_db_regs(svm->host_db_regs);
1480 load_db_regs(svm->db_regs);
1481 }
1482
1483 clgi();
1484
1485 local_irq_enable();
1486
1487 asm volatile (
1488#ifdef CONFIG_X86_64
1489 "push %%rbx; push %%rcx; push %%rdx;"
1490 "push %%rsi; push %%rdi; push %%rbp;"
1491 "push %%r8; push %%r9; push %%r10; push %%r11;"
1492 "push %%r12; push %%r13; push %%r14; push %%r15;"
1493#else
1494 "push %%ebx; push %%ecx; push %%edx;"
1495 "push %%esi; push %%edi; push %%ebp;"
1496#endif
1497
1498#ifdef CONFIG_X86_64
1499 "mov %c[rbx](%[svm]), %%rbx \n\t"
1500 "mov %c[rcx](%[svm]), %%rcx \n\t"
1501 "mov %c[rdx](%[svm]), %%rdx \n\t"
1502 "mov %c[rsi](%[svm]), %%rsi \n\t"
1503 "mov %c[rdi](%[svm]), %%rdi \n\t"
1504 "mov %c[rbp](%[svm]), %%rbp \n\t"
1505 "mov %c[r8](%[svm]), %%r8 \n\t"
1506 "mov %c[r9](%[svm]), %%r9 \n\t"
1507 "mov %c[r10](%[svm]), %%r10 \n\t"
1508 "mov %c[r11](%[svm]), %%r11 \n\t"
1509 "mov %c[r12](%[svm]), %%r12 \n\t"
1510 "mov %c[r13](%[svm]), %%r13 \n\t"
1511 "mov %c[r14](%[svm]), %%r14 \n\t"
1512 "mov %c[r15](%[svm]), %%r15 \n\t"
1513#else
1514 "mov %c[rbx](%[svm]), %%ebx \n\t"
1515 "mov %c[rcx](%[svm]), %%ecx \n\t"
1516 "mov %c[rdx](%[svm]), %%edx \n\t"
1517 "mov %c[rsi](%[svm]), %%esi \n\t"
1518 "mov %c[rdi](%[svm]), %%edi \n\t"
1519 "mov %c[rbp](%[svm]), %%ebp \n\t"
1520#endif
1521
1522#ifdef CONFIG_X86_64
1523 /* Enter guest mode */
1524 "push %%rax \n\t"
1525 "mov %c[vmcb](%[svm]), %%rax \n\t"
1526 SVM_VMLOAD "\n\t"
1527 SVM_VMRUN "\n\t"
1528 SVM_VMSAVE "\n\t"
1529 "pop %%rax \n\t"
1530#else
1531 /* Enter guest mode */
1532 "push %%eax \n\t"
1533 "mov %c[vmcb](%[svm]), %%eax \n\t"
1534 SVM_VMLOAD "\n\t"
1535 SVM_VMRUN "\n\t"
1536 SVM_VMSAVE "\n\t"
1537 "pop %%eax \n\t"
1538#endif
1539
1540 /* Save guest registers, load host registers */
1541#ifdef CONFIG_X86_64
1542 "mov %%rbx, %c[rbx](%[svm]) \n\t"
1543 "mov %%rcx, %c[rcx](%[svm]) \n\t"
1544 "mov %%rdx, %c[rdx](%[svm]) \n\t"
1545 "mov %%rsi, %c[rsi](%[svm]) \n\t"
1546 "mov %%rdi, %c[rdi](%[svm]) \n\t"
1547 "mov %%rbp, %c[rbp](%[svm]) \n\t"
1548 "mov %%r8, %c[r8](%[svm]) \n\t"
1549 "mov %%r9, %c[r9](%[svm]) \n\t"
1550 "mov %%r10, %c[r10](%[svm]) \n\t"
1551 "mov %%r11, %c[r11](%[svm]) \n\t"
1552 "mov %%r12, %c[r12](%[svm]) \n\t"
1553 "mov %%r13, %c[r13](%[svm]) \n\t"
1554 "mov %%r14, %c[r14](%[svm]) \n\t"
1555 "mov %%r15, %c[r15](%[svm]) \n\t"
1556
1557 "pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
1558 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
1559 "pop %%rbp; pop %%rdi; pop %%rsi;"
1560 "pop %%rdx; pop %%rcx; pop %%rbx; \n\t"
1561#else
1562 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1563 "mov %%ecx, %c[rcx](%[svm]) \n\t"
1564 "mov %%edx, %c[rdx](%[svm]) \n\t"
1565 "mov %%esi, %c[rsi](%[svm]) \n\t"
1566 "mov %%edi, %c[rdi](%[svm]) \n\t"
1567 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1568
1569 "pop %%ebp; pop %%edi; pop %%esi;"
1570 "pop %%edx; pop %%ecx; pop %%ebx; \n\t"
1571#endif
1572 :
1573 : [svm]"a"(svm),
1574 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
1575 [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])),
1576 [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])),
1577 [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])),
1578 [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])),
1579 [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])),
1580 [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP]))
1581#ifdef CONFIG_X86_64
1582 ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])),
1583 [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])),
1584 [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])),
1585 [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])),
1586 [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])),
1587 [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])),
1588 [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])),
1589 [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15]))
1590#endif
1591 : "cc", "memory" );
1592
1593 if ((svm->vmcb->save.dr7 & 0xff))
1594 load_db_regs(svm->host_db_regs);
1595
1596 vcpu->cr2 = svm->vmcb->save.cr2;
1597
1598 write_dr6(svm->host_dr6);
1599 write_dr7(svm->host_dr7);
1600 kvm_write_cr2(svm->host_cr2);
1601
1602 load_fs(fs_selector);
1603 load_gs(gs_selector);
1604 load_ldt(ldt_selector);
1605 load_host_msrs(vcpu);
1606
1607 reload_tss(vcpu);
1608
1609 local_irq_disable();
1610
1611 stgi();
1612
1613 svm->next_rip = 0;
1614}
1615
1616static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1617{
1618 struct vcpu_svm *svm = to_svm(vcpu);
1619
1620 svm->vmcb->save.cr3 = root;
1621 force_new_asid(vcpu);
1622
1623 if (vcpu->fpu_active) {
1624 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
1625 svm->vmcb->save.cr0 |= X86_CR0_TS;
1626 vcpu->fpu_active = 0;
1627 }
1628}
1629
1630static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
1631 unsigned long addr,
1632 uint32_t err_code)
1633{
1634 struct vcpu_svm *svm = to_svm(vcpu);
1635 uint32_t exit_int_info = svm->vmcb->control.exit_int_info;
1636
1637 ++vcpu->stat.pf_guest;
1638
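	/*
	 * Injecting a page fault while the guest was already delivering one
	 * escalates to a double fault (#DF) rather than nesting.
	 */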
1639 if (is_page_fault(exit_int_info)) {
1640
1641 svm->vmcb->control.event_inj_err = 0;
1642 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1643 SVM_EVTINJ_VALID_ERR |
1644 SVM_EVTINJ_TYPE_EXEPT |
1645 DF_VECTOR;
1646 return;
1647 }
1648 vcpu->cr2 = addr;
1649 svm->vmcb->save.cr2 = addr;
1650 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1651 SVM_EVTINJ_VALID_ERR |
1652 SVM_EVTINJ_TYPE_EXEPT |
1653 PF_VECTOR;
1654 svm->vmcb->control.event_inj_err = err_code;
1655}
1656
1657
1658static int is_disabled(void)
1659{
1660 u64 vm_cr;
1661
1662 rdmsrl(MSR_VM_CR, vm_cr);
1663 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
1664 return 1;
1665
1666 return 0;
1667}
1668
1669static void
1670svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1671{
1672 /*
1673 * Patch in the VMMCALL instruction:
1674 */
1675 hypercall[0] = 0x0f;
1676 hypercall[1] = 0x01;
1677 hypercall[2] = 0xd9;
1678 hypercall[3] = 0xc3;
1679}
1680
1681static void svm_check_processor_compat(void *rtn)
1682{
1683 *(int *)rtn = 0;
1684}
1685
1686static struct kvm_x86_ops svm_x86_ops = {
1687 .cpu_has_kvm_support = has_svm,
1688 .disabled_by_bios = is_disabled,
1689 .hardware_setup = svm_hardware_setup,
1690 .hardware_unsetup = svm_hardware_unsetup,
1691 .check_processor_compatibility = svm_check_processor_compat,
1692 .hardware_enable = svm_hardware_enable,
1693 .hardware_disable = svm_hardware_disable,
1694
1695 .vcpu_create = svm_create_vcpu,
1696 .vcpu_free = svm_free_vcpu,
1697 .vcpu_reset = svm_vcpu_reset,
1698
1699 .prepare_guest_switch = svm_prepare_guest_switch,
1700 .vcpu_load = svm_vcpu_load,
1701 .vcpu_put = svm_vcpu_put,
1702 .vcpu_decache = svm_vcpu_decache,
1703
1704 .set_guest_debug = svm_guest_debug,
1705 .get_msr = svm_get_msr,
1706 .set_msr = svm_set_msr,
1707 .get_segment_base = svm_get_segment_base,
1708 .get_segment = svm_get_segment,
1709 .set_segment = svm_set_segment,
1710 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
1711 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
1712 .set_cr0 = svm_set_cr0,
1713 .set_cr3 = svm_set_cr3,
1714 .set_cr4 = svm_set_cr4,
1715 .set_efer = svm_set_efer,
1716 .get_idt = svm_get_idt,
1717 .set_idt = svm_set_idt,
1718 .get_gdt = svm_get_gdt,
1719 .set_gdt = svm_set_gdt,
1720 .get_dr = svm_get_dr,
1721 .set_dr = svm_set_dr,
1722 .cache_regs = svm_cache_regs,
1723 .decache_regs = svm_decache_regs,
1724 .get_rflags = svm_get_rflags,
1725 .set_rflags = svm_set_rflags,
1726
1727 .tlb_flush = svm_flush_tlb,
1728 .inject_page_fault = svm_inject_page_fault,
1729
1730 .inject_gp = svm_inject_gp,
1731
1732 .run = svm_vcpu_run,
1733 .handle_exit = handle_exit,
1734 .skip_emulated_instruction = skip_emulated_instruction,
1735 .patch_hypercall = svm_patch_hypercall,
1736 .get_irq = svm_get_irq,
1737 .set_irq = svm_set_irq,
1738 .inject_pending_irq = svm_intr_assist,
1739 .inject_pending_vectors = do_interrupt_requests,
1740};
1741
1742static int __init svm_init(void)
1743{
1744 return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm),
1745 THIS_MODULE);
1746}
1747
1748static void __exit svm_exit(void)
1749{
1750 kvm_exit_x86();
1751}
1752
1753module_init(svm_init)
1754module_exit(svm_exit)
diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h
deleted file mode 100644
index 3b1b0f35b6cb..000000000000
--- a/drivers/kvm/svm.h
+++ /dev/null
@@ -1,324 +0,0 @@
1#ifndef __SVM_H
2#define __SVM_H
3
4enum {
5 INTERCEPT_INTR,
6 INTERCEPT_NMI,
7 INTERCEPT_SMI,
8 INTERCEPT_INIT,
9 INTERCEPT_VINTR,
10 INTERCEPT_SELECTIVE_CR0,
11 INTERCEPT_STORE_IDTR,
12 INTERCEPT_STORE_GDTR,
13 INTERCEPT_STORE_LDTR,
14 INTERCEPT_STORE_TR,
15 INTERCEPT_LOAD_IDTR,
16 INTERCEPT_LOAD_GDTR,
17 INTERCEPT_LOAD_LDTR,
18 INTERCEPT_LOAD_TR,
19 INTERCEPT_RDTSC,
20 INTERCEPT_RDPMC,
21 INTERCEPT_PUSHF,
22 INTERCEPT_POPF,
23 INTERCEPT_CPUID,
24 INTERCEPT_RSM,
25 INTERCEPT_IRET,
26 INTERCEPT_INTn,
27 INTERCEPT_INVD,
28 INTERCEPT_PAUSE,
29 INTERCEPT_HLT,
30 INTERCEPT_INVLPG,
31 INTERCEPT_INVLPGA,
32 INTERCEPT_IOIO_PROT,
33 INTERCEPT_MSR_PROT,
34 INTERCEPT_TASK_SWITCH,
35 INTERCEPT_FERR_FREEZE,
36 INTERCEPT_SHUTDOWN,
37 INTERCEPT_VMRUN,
38 INTERCEPT_VMMCALL,
39 INTERCEPT_VMLOAD,
40 INTERCEPT_VMSAVE,
41 INTERCEPT_STGI,
42 INTERCEPT_CLGI,
43 INTERCEPT_SKINIT,
44 INTERCEPT_RDTSCP,
45 INTERCEPT_ICEBP,
46 INTERCEPT_WBINVD,
47 INTERCEPT_MONITOR,
48 INTERCEPT_MWAIT,
49 INTERCEPT_MWAIT_COND,
50};
51
52
53struct __attribute__ ((__packed__)) vmcb_control_area {
54 u16 intercept_cr_read;
55 u16 intercept_cr_write;
56 u16 intercept_dr_read;
57 u16 intercept_dr_write;
58 u32 intercept_exceptions;
59 u64 intercept;
60 u8 reserved_1[44];
61 u64 iopm_base_pa;
62 u64 msrpm_base_pa;
63 u64 tsc_offset;
64 u32 asid;
65 u8 tlb_ctl;
66 u8 reserved_2[3];
67 u32 int_ctl;
68 u32 int_vector;
69 u32 int_state;
70 u8 reserved_3[4];
71 u32 exit_code;
72 u32 exit_code_hi;
73 u64 exit_info_1;
74 u64 exit_info_2;
75 u32 exit_int_info;
76 u32 exit_int_info_err;
77 u64 nested_ctl;
78 u8 reserved_4[16];
79 u32 event_inj;
80 u32 event_inj_err;
81 u64 nested_cr3;
82 u64 lbr_ctl;
83 u8 reserved_5[832];
84};
85
86
87#define TLB_CONTROL_DO_NOTHING 0
88#define TLB_CONTROL_FLUSH_ALL_ASID 1
89
90#define V_TPR_MASK 0x0f
91
92#define V_IRQ_SHIFT 8
93#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
94
95#define V_INTR_PRIO_SHIFT 16
96#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
97
98#define V_IGN_TPR_SHIFT 20
99#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
100
101#define V_INTR_MASKING_SHIFT 24
102#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
103
104#define SVM_INTERRUPT_SHADOW_MASK 1
105
106#define SVM_IOIO_STR_SHIFT 2
107#define SVM_IOIO_REP_SHIFT 3
108#define SVM_IOIO_SIZE_SHIFT 4
109#define SVM_IOIO_ASIZE_SHIFT 7
110
111#define SVM_IOIO_TYPE_MASK 1
112#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
113#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
114#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
115#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
116
117struct __attribute__ ((__packed__)) vmcb_seg {
118 u16 selector;
119 u16 attrib;
120 u32 limit;
121 u64 base;
122};
123
124struct __attribute__ ((__packed__)) vmcb_save_area {
125 struct vmcb_seg es;
126 struct vmcb_seg cs;
127 struct vmcb_seg ss;
128 struct vmcb_seg ds;
129 struct vmcb_seg fs;
130 struct vmcb_seg gs;
131 struct vmcb_seg gdtr;
132 struct vmcb_seg ldtr;
133 struct vmcb_seg idtr;
134 struct vmcb_seg tr;
135 u8 reserved_1[43];
136 u8 cpl;
137 u8 reserved_2[4];
138 u64 efer;
139 u8 reserved_3[112];
140 u64 cr4;
141 u64 cr3;
142 u64 cr0;
143 u64 dr7;
144 u64 dr6;
145 u64 rflags;
146 u64 rip;
147 u8 reserved_4[88];
148 u64 rsp;
149 u8 reserved_5[24];
150 u64 rax;
151 u64 star;
152 u64 lstar;
153 u64 cstar;
154 u64 sfmask;
155 u64 kernel_gs_base;
156 u64 sysenter_cs;
157 u64 sysenter_esp;
158 u64 sysenter_eip;
159 u64 cr2;
160 u8 reserved_6[32];
161 u64 g_pat;
162 u64 dbgctl;
163 u64 br_from;
164 u64 br_to;
165 u64 last_excp_from;
166 u64 last_excp_to;
167};
168
169struct __attribute__ ((__packed__)) vmcb {
170 struct vmcb_control_area control;
171 struct vmcb_save_area save;
172};
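/*
 * Layout sanity check -- an illustrative sketch only, not part of the
 * original header.  The AMD APM places the state save area at VMCB offset
 * 0x400, which the reserved_5 padding above is sized to guarantee; the
 * check assumes BUILD_BUG_ON() and offsetof() are available
 * (<linux/kernel.h>, <linux/stddef.h>).
 */
static inline void vmcb_layout_check(void)
{
	BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 0x400);
	BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
}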
173
174#define SVM_CPUID_FEATURE_SHIFT 2
175#define SVM_CPUID_FUNC 0x8000000a
176
177#define MSR_EFER_SVME_MASK (1ULL << 12)
178#define MSR_VM_CR 0xc0010114
179#define MSR_VM_HSAVE_PA 0xc0010117ULL
180
181#define SVM_VM_CR_SVM_DISABLE 4
182
183#define SVM_SELECTOR_S_SHIFT 4
184#define SVM_SELECTOR_DPL_SHIFT 5
185#define SVM_SELECTOR_P_SHIFT 7
186#define SVM_SELECTOR_AVL_SHIFT 8
187#define SVM_SELECTOR_L_SHIFT 9
188#define SVM_SELECTOR_DB_SHIFT 10
189#define SVM_SELECTOR_G_SHIFT 11
190
191#define SVM_SELECTOR_TYPE_MASK (0xf)
192#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
193#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
194#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
195#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
196#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
197#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
198#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
199
200#define SVM_SELECTOR_WRITE_MASK (1 << 1)
201#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
202#define SVM_SELECTOR_CODE_MASK (1 << 3)
203
204#define INTERCEPT_CR0_MASK 1
205#define INTERCEPT_CR3_MASK (1 << 3)
206#define INTERCEPT_CR4_MASK (1 << 4)
207
208#define INTERCEPT_DR0_MASK 1
209#define INTERCEPT_DR1_MASK (1 << 1)
210#define INTERCEPT_DR2_MASK (1 << 2)
211#define INTERCEPT_DR3_MASK (1 << 3)
212#define INTERCEPT_DR4_MASK (1 << 4)
213#define INTERCEPT_DR5_MASK (1 << 5)
214#define INTERCEPT_DR6_MASK (1 << 6)
215#define INTERCEPT_DR7_MASK (1 << 7)
216
217#define SVM_EVTINJ_VEC_MASK 0xff
218
219#define SVM_EVTINJ_TYPE_SHIFT 8
220#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
221
222#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
223#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
224#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
225#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
226
227#define SVM_EVTINJ_VALID (1 << 31)
228#define SVM_EVTINJ_VALID_ERR (1 << 11)
229
230#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
231
232#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
233#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
234#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
235#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
236
237#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
238#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
239
240#define SVM_EXIT_READ_CR0 0x000
241#define SVM_EXIT_READ_CR3 0x003
242#define SVM_EXIT_READ_CR4 0x004
243#define SVM_EXIT_READ_CR8 0x008
244#define SVM_EXIT_WRITE_CR0 0x010
245#define SVM_EXIT_WRITE_CR3 0x013
246#define SVM_EXIT_WRITE_CR4 0x014
247#define SVM_EXIT_WRITE_CR8 0x018
248#define SVM_EXIT_READ_DR0 0x020
249#define SVM_EXIT_READ_DR1 0x021
250#define SVM_EXIT_READ_DR2 0x022
251#define SVM_EXIT_READ_DR3 0x023
252#define SVM_EXIT_READ_DR4 0x024
253#define SVM_EXIT_READ_DR5 0x025
254#define SVM_EXIT_READ_DR6 0x026
255#define SVM_EXIT_READ_DR7 0x027
256#define SVM_EXIT_WRITE_DR0 0x030
257#define SVM_EXIT_WRITE_DR1 0x031
258#define SVM_EXIT_WRITE_DR2 0x032
259#define SVM_EXIT_WRITE_DR3 0x033
260#define SVM_EXIT_WRITE_DR4 0x034
261#define SVM_EXIT_WRITE_DR5 0x035
262#define SVM_EXIT_WRITE_DR6 0x036
263#define SVM_EXIT_WRITE_DR7 0x037
264#define SVM_EXIT_EXCP_BASE 0x040
265#define SVM_EXIT_INTR 0x060
266#define SVM_EXIT_NMI 0x061
267#define SVM_EXIT_SMI 0x062
268#define SVM_EXIT_INIT 0x063
269#define SVM_EXIT_VINTR 0x064
270#define SVM_EXIT_CR0_SEL_WRITE 0x065
271#define SVM_EXIT_IDTR_READ 0x066
272#define SVM_EXIT_GDTR_READ 0x067
273#define SVM_EXIT_LDTR_READ 0x068
274#define SVM_EXIT_TR_READ 0x069
275#define SVM_EXIT_IDTR_WRITE 0x06a
276#define SVM_EXIT_GDTR_WRITE 0x06b
277#define SVM_EXIT_LDTR_WRITE 0x06c
278#define SVM_EXIT_TR_WRITE 0x06d
279#define SVM_EXIT_RDTSC 0x06e
280#define SVM_EXIT_RDPMC 0x06f
281#define SVM_EXIT_PUSHF 0x070
282#define SVM_EXIT_POPF 0x071
283#define SVM_EXIT_CPUID 0x072
284#define SVM_EXIT_RSM 0x073
285#define SVM_EXIT_IRET 0x074
286#define SVM_EXIT_SWINT 0x075
287#define SVM_EXIT_INVD 0x076
288#define SVM_EXIT_PAUSE 0x077
289#define SVM_EXIT_HLT 0x078
290#define SVM_EXIT_INVLPG 0x079
291#define SVM_EXIT_INVLPGA 0x07a
292#define SVM_EXIT_IOIO 0x07b
293#define SVM_EXIT_MSR 0x07c
294#define SVM_EXIT_TASK_SWITCH 0x07d
295#define SVM_EXIT_FERR_FREEZE 0x07e
296#define SVM_EXIT_SHUTDOWN 0x07f
297#define SVM_EXIT_VMRUN 0x080
298#define SVM_EXIT_VMMCALL 0x081
299#define SVM_EXIT_VMLOAD 0x082
300#define SVM_EXIT_VMSAVE 0x083
301#define SVM_EXIT_STGI 0x084
302#define SVM_EXIT_CLGI 0x085
303#define SVM_EXIT_SKINIT 0x086
304#define SVM_EXIT_RDTSCP 0x087
305#define SVM_EXIT_ICEBP 0x088
306#define SVM_EXIT_WBINVD 0x089
307#define SVM_EXIT_MONITOR 0x08a
308#define SVM_EXIT_MWAIT 0x08b
309#define SVM_EXIT_MWAIT_COND 0x08c
310#define SVM_EXIT_NPF 0x400
311
312#define SVM_EXIT_ERR -1
313
314#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
315
316#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
317#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
318#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
319#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd"
320#define SVM_STGI ".byte 0x0f, 0x01, 0xdc"
321#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
322
323#endif
324
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
deleted file mode 100644
index 5b397b6c9f93..000000000000
--- a/drivers/kvm/vmx.c
+++ /dev/null
@@ -1,2566 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "kvm.h"
19#include "x86_emulate.h"
20#include "irq.h"
21#include "vmx.h"
22#include "segment_descriptor.h"
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/sched.h>
29
30#include <asm/io.h>
31#include <asm/desc.h>
32
33MODULE_AUTHOR("Qumranet");
34MODULE_LICENSE("GPL");
35
36struct vmcs {
37 u32 revision_id;
38 u32 abort;
39 char data[0];
40};
41
42struct vcpu_vmx {
43 struct kvm_vcpu vcpu;
44 int launched;
45 u8 fail;
46 struct kvm_msr_entry *guest_msrs;
47 struct kvm_msr_entry *host_msrs;
48 int nmsrs;
49 int save_nmsrs;
50 int msr_offset_efer;
51#ifdef CONFIG_X86_64
52 int msr_offset_kernel_gs_base;
53#endif
54 struct vmcs *vmcs;
55 struct {
56 int loaded;
57 u16 fs_sel, gs_sel, ldt_sel;
58 int gs_ldt_reload_needed;
59 int fs_reload_needed;
60	} host_state;
61
62};
63
64static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
65{
66 return container_of(vcpu, struct vcpu_vmx, vcpu);
67}
68
69static int init_rmode_tss(struct kvm *kvm);
70
71static DEFINE_PER_CPU(struct vmcs *, vmxarea);
72static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
73
74static struct page *vmx_io_bitmap_a;
75static struct page *vmx_io_bitmap_b;
76
77#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
78
79static struct vmcs_config {
80 int size;
81 int order;
82 u32 revision_id;
83 u32 pin_based_exec_ctrl;
84 u32 cpu_based_exec_ctrl;
85 u32 vmexit_ctrl;
86 u32 vmentry_ctrl;
87} vmcs_config;
88
89#define VMX_SEGMENT_FIELD(seg) \
90 [VCPU_SREG_##seg] = { \
91 .selector = GUEST_##seg##_SELECTOR, \
92 .base = GUEST_##seg##_BASE, \
93 .limit = GUEST_##seg##_LIMIT, \
94 .ar_bytes = GUEST_##seg##_AR_BYTES, \
95 }
96
97static struct kvm_vmx_segment_field {
98 unsigned selector;
99 unsigned base;
100 unsigned limit;
101 unsigned ar_bytes;
102} kvm_vmx_segment_fields[] = {
103 VMX_SEGMENT_FIELD(CS),
104 VMX_SEGMENT_FIELD(DS),
105 VMX_SEGMENT_FIELD(ES),
106 VMX_SEGMENT_FIELD(FS),
107 VMX_SEGMENT_FIELD(GS),
108 VMX_SEGMENT_FIELD(SS),
109 VMX_SEGMENT_FIELD(TR),
110 VMX_SEGMENT_FIELD(LDTR),
111};
112
113/*
114 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
115 * away by decrementing the array size.
116 */
117static const u32 vmx_msr_index[] = {
118#ifdef CONFIG_X86_64
119 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
120#endif
121 MSR_EFER, MSR_K6_STAR,
122};
123#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
124
125static void load_msrs(struct kvm_msr_entry *e, int n)
126{
127 int i;
128
129 for (i = 0; i < n; ++i)
130 wrmsrl(e[i].index, e[i].data);
131}
132
133static void save_msrs(struct kvm_msr_entry *e, int n)
134{
135 int i;
136
137 for (i = 0; i < n; ++i)
138 rdmsrl(e[i].index, e[i].data);
139}
140
141static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr)
142{
143 return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
144}
145
146static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx)
147{
148 int efer_offset = vmx->msr_offset_efer;
149 return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) !=
150 msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
151}
152
153static inline int is_page_fault(u32 intr_info)
154{
155 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
156 INTR_INFO_VALID_MASK)) ==
157 (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
158}
159
160static inline int is_no_device(u32 intr_info)
161{
162 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
163 INTR_INFO_VALID_MASK)) ==
164 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
165}
166
167static inline int is_external_interrupt(u32 intr_info)
168{
169 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
170 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
171}
172
173static inline int cpu_has_vmx_tpr_shadow(void)
174{
175 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
176}
177
178static inline int vm_need_tpr_shadow(struct kvm *kvm)
179{
180 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
181}
182
183static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
184{
185 int i;
186
187 for (i = 0; i < vmx->nmsrs; ++i)
188 if (vmx->guest_msrs[i].index == msr)
189 return i;
190 return -1;
191}
192
193static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
194{
195 int i;
196
197 i = __find_msr_index(vmx, msr);
198 if (i >= 0)
199 return &vmx->guest_msrs[i];
200 return NULL;
201}
202
203static void vmcs_clear(struct vmcs *vmcs)
204{
205 u64 phys_addr = __pa(vmcs);
206 u8 error;
207
208 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
209 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
210 : "cc", "memory");
211 if (error)
212 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
213 vmcs, phys_addr);
214}
215
216static void __vcpu_clear(void *arg)
217{
218 struct vcpu_vmx *vmx = arg;
219 int cpu = raw_smp_processor_id();
220
221 if (vmx->vcpu.cpu == cpu)
222 vmcs_clear(vmx->vmcs);
223 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
224 per_cpu(current_vmcs, cpu) = NULL;
225 rdtscll(vmx->vcpu.host_tsc);
226}
227
228static void vcpu_clear(struct vcpu_vmx *vmx)
229{
230 if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1)
231 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear,
232 vmx, 0, 1);
233 else
234 __vcpu_clear(vmx);
235 vmx->launched = 0;
236}
237
238static unsigned long vmcs_readl(unsigned long field)
239{
240 unsigned long value;
241
242 asm volatile (ASM_VMX_VMREAD_RDX_RAX
243 : "=a"(value) : "d"(field) : "cc");
244 return value;
245}
246
247static u16 vmcs_read16(unsigned long field)
248{
249 return vmcs_readl(field);
250}
251
252static u32 vmcs_read32(unsigned long field)
253{
254 return vmcs_readl(field);
255}
256
257static u64 vmcs_read64(unsigned long field)
258{
259#ifdef CONFIG_X86_64
260 return vmcs_readl(field);
261#else
262 return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
263#endif
264}
265
266static noinline void vmwrite_error(unsigned long field, unsigned long value)
267{
268 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
269 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
270 dump_stack();
271}
272
273static void vmcs_writel(unsigned long field, unsigned long value)
274{
275 u8 error;
276
277 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
278 : "=q"(error) : "a"(value), "d"(field) : "cc" );
279 if (unlikely(error))
280 vmwrite_error(field, value);
281}
282
283static void vmcs_write16(unsigned long field, u16 value)
284{
285 vmcs_writel(field, value);
286}
287
288static void vmcs_write32(unsigned long field, u32 value)
289{
290 vmcs_writel(field, value);
291}
292
293static void vmcs_write64(unsigned long field, u64 value)
294{
295#ifdef CONFIG_X86_64
296 vmcs_writel(field, value);
297#else
298 vmcs_writel(field, value);
299	asm volatile ("");	/* compiler barrier between the two 32-bit writes */
300 vmcs_writel(field+1, value >> 32);
301#endif
302}
303
304static void vmcs_clear_bits(unsigned long field, u32 mask)
305{
306 vmcs_writel(field, vmcs_readl(field) & ~mask);
307}
308
309static void vmcs_set_bits(unsigned long field, u32 mask)
310{
311 vmcs_writel(field, vmcs_readl(field) | mask);
312}
313
314static void update_exception_bitmap(struct kvm_vcpu *vcpu)
315{
316 u32 eb;
317
318 eb = 1u << PF_VECTOR;
319 if (!vcpu->fpu_active)
320 eb |= 1u << NM_VECTOR;
321 if (vcpu->guest_debug.enabled)
322 eb |= 1u << 1;
323 if (vcpu->rmode.active)
324 eb = ~0;
325 vmcs_write32(EXCEPTION_BITMAP, eb);
326}
327
328static void reload_tss(void)
329{
330#ifndef CONFIG_X86_64
331
332 /*
333 * VT restores TR but not its size. Useless.
334 */
335 struct descriptor_table gdt;
336 struct segment_descriptor *descs;
337
338 get_gdt(&gdt);
339 descs = (void *)gdt.base;
340 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
341 load_TR_desc();
342#endif
343}
344
345static void load_transition_efer(struct vcpu_vmx *vmx)
346{
347 u64 trans_efer;
348 int efer_offset = vmx->msr_offset_efer;
349
350 trans_efer = vmx->host_msrs[efer_offset].data;
351 trans_efer &= ~EFER_SAVE_RESTORE_BITS;
352 trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
353 wrmsrl(MSR_EFER, trans_efer);
354 vmx->vcpu.stat.efer_reload++;
355}
356
357static void vmx_save_host_state(struct kvm_vcpu *vcpu)
358{
359 struct vcpu_vmx *vmx = to_vmx(vcpu);
360
361 if (vmx->host_state.loaded)
362 return;
363
364 vmx->host_state.loaded = 1;
365 /*
366 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
367 * allow segment selectors with cpl > 0 or ti == 1.
368 */
369 vmx->host_state.ldt_sel = read_ldt();
370 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
371 vmx->host_state.fs_sel = read_fs();
372 if (!(vmx->host_state.fs_sel & 7)) {
373 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
374 vmx->host_state.fs_reload_needed = 0;
375 } else {
376 vmcs_write16(HOST_FS_SELECTOR, 0);
377 vmx->host_state.fs_reload_needed = 1;
378 }
379 vmx->host_state.gs_sel = read_gs();
380 if (!(vmx->host_state.gs_sel & 7))
381 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
382 else {
383 vmcs_write16(HOST_GS_SELECTOR, 0);
384 vmx->host_state.gs_ldt_reload_needed = 1;
385 }
386
387#ifdef CONFIG_X86_64
388 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
389 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
390#else
391 vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
392 vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
393#endif
394
395#ifdef CONFIG_X86_64
396 if (is_long_mode(&vmx->vcpu)) {
397 save_msrs(vmx->host_msrs +
398 vmx->msr_offset_kernel_gs_base, 1);
399 }
400#endif
401 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
402 if (msr_efer_need_save_restore(vmx))
403 load_transition_efer(vmx);
404}
405
406static void vmx_load_host_state(struct vcpu_vmx *vmx)
407{
408 unsigned long flags;
409
410 if (!vmx->host_state.loaded)
411 return;
412
413 vmx->host_state.loaded = 0;
414 if (vmx->host_state.fs_reload_needed)
415 load_fs(vmx->host_state.fs_sel);
416 if (vmx->host_state.gs_ldt_reload_needed) {
417 load_ldt(vmx->host_state.ldt_sel);
418 /*
419 * If we have to reload gs, we must take care to
420 * preserve our gs base.
421 */
422 local_irq_save(flags);
423 load_gs(vmx->host_state.gs_sel);
424#ifdef CONFIG_X86_64
425 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
426#endif
427 local_irq_restore(flags);
428 }
429 reload_tss();
430 save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
431 load_msrs(vmx->host_msrs, vmx->save_nmsrs);
432 if (msr_efer_need_save_restore(vmx))
433 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
434}
435
436/*
437 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
438 * vcpu mutex is already taken.
439 */
440static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
441{
442 struct vcpu_vmx *vmx = to_vmx(vcpu);
443 u64 phys_addr = __pa(vmx->vmcs);
444 u64 tsc_this, delta;
445
446 if (vcpu->cpu != cpu) {
447 vcpu_clear(vmx);
448 kvm_migrate_apic_timer(vcpu);
449 }
450
451 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
452 u8 error;
453
454 per_cpu(current_vmcs, cpu) = vmx->vmcs;
455 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
456 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
457 : "cc");
458 if (error)
459 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
460 vmx->vmcs, phys_addr);
461 }
462
463 if (vcpu->cpu != cpu) {
464 struct descriptor_table dt;
465 unsigned long sysenter_esp;
466
467 vcpu->cpu = cpu;
468 /*
469 * Linux uses per-cpu TSS and GDT, so set these when switching
470 * processors.
471 */
472 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
473 get_gdt(&dt);
474 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
475
476 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
477 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
478
479 /*
480 * Make sure the time stamp counter is monotonous.
481 */
482 rdtscll(tsc_this);
483 delta = vcpu->host_tsc - tsc_this;
484 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
485 }
486}
487
488static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
489{
490 vmx_load_host_state(to_vmx(vcpu));
491 kvm_put_guest_fpu(vcpu);
492}
493
494static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
495{
496 if (vcpu->fpu_active)
497 return;
498 vcpu->fpu_active = 1;
499 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
500 if (vcpu->cr0 & X86_CR0_TS)
501 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
502 update_exception_bitmap(vcpu);
503}
504
505static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
506{
507 if (!vcpu->fpu_active)
508 return;
509 vcpu->fpu_active = 0;
510 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
511 update_exception_bitmap(vcpu);
512}
513
514static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
515{
516 vcpu_clear(to_vmx(vcpu));
517}
518
519static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
520{
521 return vmcs_readl(GUEST_RFLAGS);
522}
523
524static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
525{
526 if (vcpu->rmode.active)
527 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
528 vmcs_writel(GUEST_RFLAGS, rflags);
529}
530
531static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
532{
533 unsigned long rip;
534 u32 interruptibility;
535
536 rip = vmcs_readl(GUEST_RIP);
537 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
538 vmcs_writel(GUEST_RIP, rip);
539
540 /*
541 * We emulated an instruction, so temporary interrupt blocking
542 * should be removed, if set.
543 */
544 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
545	if (interruptibility & 3)	/* blocking by STI or by MOV SS */
546 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
547 interruptibility & ~3);
548 vcpu->interrupt_window_open = 1;
549}
550
551static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
552{
553 printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
554 vmcs_readl(GUEST_RIP));
555 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
556 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
557 GP_VECTOR |
558 INTR_TYPE_EXCEPTION |
559 INTR_INFO_DELIEVER_CODE_MASK |
560 INTR_INFO_VALID_MASK);
561}
562
563/*
564 * Swap MSR entry in host/guest MSR entry array.
565 */
566#ifdef CONFIG_X86_64
567static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
568{
569 struct kvm_msr_entry tmp;
570
571 tmp = vmx->guest_msrs[to];
572 vmx->guest_msrs[to] = vmx->guest_msrs[from];
573 vmx->guest_msrs[from] = tmp;
574 tmp = vmx->host_msrs[to];
575 vmx->host_msrs[to] = vmx->host_msrs[from];
576 vmx->host_msrs[from] = tmp;
577}
578#endif
579
580/*
581 * Set up the vmcs to automatically save and restore system
582 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
583 * mode, as fiddling with msrs is very expensive.
584 */
585static void setup_msrs(struct vcpu_vmx *vmx)
586{
587 int save_nmsrs;
588
589 save_nmsrs = 0;
590#ifdef CONFIG_X86_64
591 if (is_long_mode(&vmx->vcpu)) {
592 int index;
593
594 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
595 if (index >= 0)
596 move_msr_up(vmx, index, save_nmsrs++);
597 index = __find_msr_index(vmx, MSR_LSTAR);
598 if (index >= 0)
599 move_msr_up(vmx, index, save_nmsrs++);
600 index = __find_msr_index(vmx, MSR_CSTAR);
601 if (index >= 0)
602 move_msr_up(vmx, index, save_nmsrs++);
603 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
604 if (index >= 0)
605 move_msr_up(vmx, index, save_nmsrs++);
606 /*
607 * MSR_K6_STAR is only needed on long mode guests, and only
608 * if efer.sce is enabled.
609 */
610 index = __find_msr_index(vmx, MSR_K6_STAR);
611 if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE))
612 move_msr_up(vmx, index, save_nmsrs++);
613 }
614#endif
615 vmx->save_nmsrs = save_nmsrs;
616
617#ifdef CONFIG_X86_64
618 vmx->msr_offset_kernel_gs_base =
619 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
620#endif
621 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
622}
623
624/*
625 * reads and returns guest's timestamp counter "register"
626 * guest_tsc = host_tsc + tsc_offset -- 21.3
627 */
628static u64 guest_read_tsc(void)
629{
630 u64 host_tsc, tsc_offset;
631
632 rdtscll(host_tsc);
633 tsc_offset = vmcs_read64(TSC_OFFSET);
634 return host_tsc + tsc_offset;
635}
636
637/*
638 * writes 'guest_tsc' into guest's timestamp counter "register"
639 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
640 */
641static void guest_write_tsc(u64 guest_tsc)
642{
643 u64 host_tsc;
644
645 rdtscll(host_tsc);
646 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
647}
648
649/*
650 * Reads an msr value (of 'msr_index') into 'pdata'.
651 * Returns 0 on success, non-0 otherwise.
652 * Assumes vcpu_load() was already called.
653 */
654static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
655{
656 u64 data;
657 struct kvm_msr_entry *msr;
658
659 if (!pdata) {
660 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
661 return -EINVAL;
662 }
663
664 switch (msr_index) {
665#ifdef CONFIG_X86_64
666 case MSR_FS_BASE:
667 data = vmcs_readl(GUEST_FS_BASE);
668 break;
669 case MSR_GS_BASE:
670 data = vmcs_readl(GUEST_GS_BASE);
671 break;
672 case MSR_EFER:
673 return kvm_get_msr_common(vcpu, msr_index, pdata);
674#endif
675 case MSR_IA32_TIME_STAMP_COUNTER:
676 data = guest_read_tsc();
677 break;
678 case MSR_IA32_SYSENTER_CS:
679 data = vmcs_read32(GUEST_SYSENTER_CS);
680 break;
681 case MSR_IA32_SYSENTER_EIP:
682 data = vmcs_readl(GUEST_SYSENTER_EIP);
683 break;
684 case MSR_IA32_SYSENTER_ESP:
685 data = vmcs_readl(GUEST_SYSENTER_ESP);
686 break;
687 default:
688 msr = find_msr_entry(to_vmx(vcpu), msr_index);
689 if (msr) {
690 data = msr->data;
691 break;
692 }
693 return kvm_get_msr_common(vcpu, msr_index, pdata);
694 }
695
696 *pdata = data;
697 return 0;
698}
699
700/*
701 * Writes msr value into the appropriate "register".
702 * Returns 0 on success, non-0 otherwise.
703 * Assumes vcpu_load() was already called.
704 */
705static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
706{
707 struct vcpu_vmx *vmx = to_vmx(vcpu);
708 struct kvm_msr_entry *msr;
709 int ret = 0;
710
711 switch (msr_index) {
712#ifdef CONFIG_X86_64
713 case MSR_EFER:
714 ret = kvm_set_msr_common(vcpu, msr_index, data);
715 if (vmx->host_state.loaded)
716 load_transition_efer(vmx);
717 break;
718 case MSR_FS_BASE:
719 vmcs_writel(GUEST_FS_BASE, data);
720 break;
721 case MSR_GS_BASE:
722 vmcs_writel(GUEST_GS_BASE, data);
723 break;
724#endif
725 case MSR_IA32_SYSENTER_CS:
726 vmcs_write32(GUEST_SYSENTER_CS, data);
727 break;
728 case MSR_IA32_SYSENTER_EIP:
729 vmcs_writel(GUEST_SYSENTER_EIP, data);
730 break;
731 case MSR_IA32_SYSENTER_ESP:
732 vmcs_writel(GUEST_SYSENTER_ESP, data);
733 break;
734 case MSR_IA32_TIME_STAMP_COUNTER:
735 guest_write_tsc(data);
736 break;
737 default:
738 msr = find_msr_entry(vmx, msr_index);
739 if (msr) {
740 msr->data = data;
741 if (vmx->host_state.loaded)
742 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
743 break;
744 }
745 ret = kvm_set_msr_common(vcpu, msr_index, data);
746 }
747
748 return ret;
749}
750
751/*
752 * Sync the rsp and rip registers into the vcpu structure. This allows
753 * registers to be accessed by indexing vcpu->regs.
754 */
755static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
756{
757 vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
758 vcpu->rip = vmcs_readl(GUEST_RIP);
759}
760
761/*
762 * Syncs rsp and rip back into the vmcs. Should be called after possible
763 * modification.
764 */
765static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
766{
767 vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
768 vmcs_writel(GUEST_RIP, vcpu->rip);
769}
770
771static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
772{
773 unsigned long dr7 = 0x400;
774 int old_singlestep;
775
776 old_singlestep = vcpu->guest_debug.singlestep;
777
778 vcpu->guest_debug.enabled = dbg->enabled;
779 if (vcpu->guest_debug.enabled) {
780 int i;
781
782 dr7 |= 0x200; /* exact */
783 for (i = 0; i < 4; ++i) {
784 if (!dbg->breakpoints[i].enabled)
785 continue;
786 vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
787 dr7 |= 2 << (i*2); /* global enable */
788 dr7 |= 0 << (i*4+16); /* execution breakpoint */
789 }
790
791 vcpu->guest_debug.singlestep = dbg->singlestep;
792 } else
793 vcpu->guest_debug.singlestep = 0;
794
795 if (old_singlestep && !vcpu->guest_debug.singlestep) {
796 unsigned long flags;
797
798 flags = vmcs_readl(GUEST_RFLAGS);
799 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
800 vmcs_writel(GUEST_RFLAGS, flags);
801 }
802
803 update_exception_bitmap(vcpu);
804 vmcs_writel(GUEST_DR7, dr7);
805
806 return 0;
807}
808
809static int vmx_get_irq(struct kvm_vcpu *vcpu)
810{
811 u32 idtv_info_field;
812
813 idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
814 if (idtv_info_field & INTR_INFO_VALID_MASK) {
815 if (is_external_interrupt(idtv_info_field))
816 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
817 else
818 printk("pending exception: not handled yet\n");
819 }
820 return -1;
821}
822
823static __init int cpu_has_kvm_support(void)
824{
825 unsigned long ecx = cpuid_ecx(1);
826 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
827}
828
829static __init int vmx_disabled_by_bios(void)
830{
831 u64 msr;
832
833 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
834 return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
835 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
836 == MSR_IA32_FEATURE_CONTROL_LOCKED;
837 /* locked but not enabled */
838}
839
840static void hardware_enable(void *garbage)
841{
842 int cpu = raw_smp_processor_id();
843 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
844 u64 old;
845
846 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
847 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
848 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
849 != (MSR_IA32_FEATURE_CONTROL_LOCKED |
850 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
851 /* enable and lock */
852 wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
853 MSR_IA32_FEATURE_CONTROL_LOCKED |
854 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
855 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
856 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
857 : "memory", "cc");
858}
859
860static void hardware_disable(void *garbage)
861{
862 asm volatile (ASM_VMX_VMXOFF : : : "cc");
863}
864
865static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
866 u32 msr, u32* result)
867{
868 u32 vmx_msr_low, vmx_msr_high;
869 u32 ctl = ctl_min | ctl_opt;
870
871 rdmsr(msr, vmx_msr_low, vmx_msr_high);
872
873 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
874 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
875
876 /* Ensure minimum (required) set of control bits are supported. */
877 if (ctl_min & ~ctl)
878 return -EIO;
879
880 *result = ctl;
881 return 0;
882}
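/*
 * Worked example with made-up MSR values: ctl_min = 0x1, ctl_opt = 0x4,
 * and an MSR reporting vmx_msr_low = 0x1 (bit 0 must be 1) and
 * vmx_msr_high = 0x5 (only bits 0 and 2 may be 1).  Then
 * ctl = ((0x1 | 0x4) & 0x5) | 0x1 = 0x5, so the optional bit survives
 * because the CPU allows it.  Had vmx_msr_high been 0x1, the optional bit
 * would simply be dropped; had bit 0 not been allowed at all, the
 * "ctl_min & ~ctl" test would fail and the function would return -EIO.
 */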
883
884static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
885{
886 u32 vmx_msr_low, vmx_msr_high;
887 u32 min, opt;
888 u32 _pin_based_exec_control = 0;
889 u32 _cpu_based_exec_control = 0;
890 u32 _vmexit_control = 0;
891 u32 _vmentry_control = 0;
892
893 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
894 opt = 0;
895 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
896 &_pin_based_exec_control) < 0)
897 return -EIO;
898
899 min = CPU_BASED_HLT_EXITING |
900#ifdef CONFIG_X86_64
901 CPU_BASED_CR8_LOAD_EXITING |
902 CPU_BASED_CR8_STORE_EXITING |
903#endif
904 CPU_BASED_USE_IO_BITMAPS |
905 CPU_BASED_MOV_DR_EXITING |
906 CPU_BASED_USE_TSC_OFFSETING;
907#ifdef CONFIG_X86_64
908 opt = CPU_BASED_TPR_SHADOW;
909#else
910 opt = 0;
911#endif
912 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
913 &_cpu_based_exec_control) < 0)
914 return -EIO;
915#ifdef CONFIG_X86_64
916 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
917 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
918 ~CPU_BASED_CR8_STORE_EXITING;
919#endif
920
921 min = 0;
922#ifdef CONFIG_X86_64
923 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
924#endif
925 opt = 0;
926 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
927 &_vmexit_control) < 0)
928 return -EIO;
929
930 min = opt = 0;
931 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
932 &_vmentry_control) < 0)
933 return -EIO;
934
935 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
936
937 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
938 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
939 return -EIO;
940
941#ifdef CONFIG_X86_64
942 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
943 if (vmx_msr_high & (1u<<16))
944 return -EIO;
945#endif
946
947 /* Require Write-Back (WB) memory type for VMCS accesses. */
948 if (((vmx_msr_high >> 18) & 15) != 6)
949 return -EIO;
950
951 vmcs_conf->size = vmx_msr_high & 0x1fff;
952 vmcs_conf->order = get_order(vmcs_config.size);
953 vmcs_conf->revision_id = vmx_msr_low;
954
955 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
956 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
957 vmcs_conf->vmexit_ctrl = _vmexit_control;
958 vmcs_conf->vmentry_ctrl = _vmentry_control;
959
960 return 0;
961}
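
/*
 * Illustrative aside (not part of the original driver): decoding the high
 * dword of MSR_IA32_VMX_BASIC the same way setup_vmcs_config() does above.
 * The sample value is invented; on hardware it comes from
 * rdmsr(MSR_IA32_VMX_BASIC, ...).
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t basic_high = 0x001a0400;	/* hypothetical value */

	unsigned int vmcs_size = basic_high & 0x1fff;	  /* MSR bits 44:32 */
	unsigned int addr32    = (basic_high >> 16) & 1;  /* MSR bit 48     */
	unsigned int mem_type  = (basic_high >> 18) & 15; /* MSR bits 53:50 */

	printf("VMCS size %u bytes, 32-bit-limited=%u, memory type %u (6 == WB)\n",
	       vmcs_size, addr32, mem_type);
	return 0;
}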
962
963static struct vmcs *alloc_vmcs_cpu(int cpu)
964{
965 int node = cpu_to_node(cpu);
966 struct page *pages;
967 struct vmcs *vmcs;
968
969 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
970 if (!pages)
971 return NULL;
972 vmcs = page_address(pages);
973 memset(vmcs, 0, vmcs_config.size);
974 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
975 return vmcs;
976}
977
978static struct vmcs *alloc_vmcs(void)
979{
980 return alloc_vmcs_cpu(raw_smp_processor_id());
981}
982
983static void free_vmcs(struct vmcs *vmcs)
984{
985 free_pages((unsigned long)vmcs, vmcs_config.order);
986}
987
988static void free_kvm_area(void)
989{
990 int cpu;
991
992 for_each_online_cpu(cpu)
993 free_vmcs(per_cpu(vmxarea, cpu));
994}
995
996static __init int alloc_kvm_area(void)
997{
998 int cpu;
999
1000 for_each_online_cpu(cpu) {
1001 struct vmcs *vmcs;
1002
1003 vmcs = alloc_vmcs_cpu(cpu);
1004 if (!vmcs) {
1005 free_kvm_area();
1006 return -ENOMEM;
1007 }
1008
1009 per_cpu(vmxarea, cpu) = vmcs;
1010 }
1011 return 0;
1012}
1013
1014static __init int hardware_setup(void)
1015{
1016 if (setup_vmcs_config(&vmcs_config) < 0)
1017 return -EIO;
1018 return alloc_kvm_area();
1019}
1020
1021static __exit void hardware_unsetup(void)
1022{
1023 free_kvm_area();
1024}
1025
1026static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1027{
1028 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1029
1030 if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
1031 vmcs_write16(sf->selector, save->selector);
1032 vmcs_writel(sf->base, save->base);
1033 vmcs_write32(sf->limit, save->limit);
1034 vmcs_write32(sf->ar_bytes, save->ar);
1035 } else {
1036 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
1037 << AR_DPL_SHIFT;
1038 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
1039 }
1040}
1041
1042static void enter_pmode(struct kvm_vcpu *vcpu)
1043{
1044 unsigned long flags;
1045
1046 vcpu->rmode.active = 0;
1047
1048 vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
1049 vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
1050 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
1051
1052 flags = vmcs_readl(GUEST_RFLAGS);
1053 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1054 flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
1055 vmcs_writel(GUEST_RFLAGS, flags);
1056
1057 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
1058 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
1059
1060 update_exception_bitmap(vcpu);
1061
1062 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es);
1063 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds);
1064 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs);
1065 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs);
1066
1067 vmcs_write16(GUEST_SS_SELECTOR, 0);
1068 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1069
1070 vmcs_write16(GUEST_CS_SELECTOR,
1071 vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
1072 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1073}
1074
1075static gva_t rmode_tss_base(struct kvm* kvm)
1076{
1077 gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
1078 return base_gfn << PAGE_SHIFT;
1079}
1080
1081static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1082{
1083 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1084
1085 save->selector = vmcs_read16(sf->selector);
1086 save->base = vmcs_readl(sf->base);
1087 save->limit = vmcs_read32(sf->limit);
1088 save->ar = vmcs_read32(sf->ar_bytes);
1089 vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
1090 vmcs_write32(sf->limit, 0xffff);
1091 vmcs_write32(sf->ar_bytes, 0xf3);
1092}
1093
1094static void enter_rmode(struct kvm_vcpu *vcpu)
1095{
1096 unsigned long flags;
1097
1098 vcpu->rmode.active = 1;
1099
1100 vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1101 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1102
1103 vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1104 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1105
1106 vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1107 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1108
1109 flags = vmcs_readl(GUEST_RFLAGS);
1110 vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1111
1112 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1113
1114 vmcs_writel(GUEST_RFLAGS, flags);
1115 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1116 update_exception_bitmap(vcpu);
1117
1118 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1119 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1120 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
1121
1122 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
1123 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1124 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
1125 vmcs_writel(GUEST_CS_BASE, 0xf0000);
1126 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1127
1128 fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
1129 fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
1130 fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
1131 fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
1132
1133 kvm_mmu_reset_context(vcpu);
1134 init_rmode_tss(vcpu->kvm);
1135}
1136
1137#ifdef CONFIG_X86_64
1138
1139static void enter_lmode(struct kvm_vcpu *vcpu)
1140{
1141 u32 guest_tr_ar;
1142
1143 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1144 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1145 printk(KERN_DEBUG "%s: tss fixup for long mode\n",
1146 __FUNCTION__);
1147 vmcs_write32(GUEST_TR_AR_BYTES,
1148 (guest_tr_ar & ~AR_TYPE_MASK)
1149 | AR_TYPE_BUSY_64_TSS);
1150 }
1151
1152 vcpu->shadow_efer |= EFER_LMA;
1153
1154 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
1155 vmcs_write32(VM_ENTRY_CONTROLS,
1156 vmcs_read32(VM_ENTRY_CONTROLS)
1157 | VM_ENTRY_IA32E_MODE);
1158}
1159
1160static void exit_lmode(struct kvm_vcpu *vcpu)
1161{
1162 vcpu->shadow_efer &= ~EFER_LMA;
1163
1164 vmcs_write32(VM_ENTRY_CONTROLS,
1165 vmcs_read32(VM_ENTRY_CONTROLS)
1166 & ~VM_ENTRY_IA32E_MODE);
1167}
1168
1169#endif
1170
1171static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1172{
1173 vcpu->cr4 &= KVM_GUEST_CR4_MASK;
1174 vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
1175}
1176
1177static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1178{
1179 vmx_fpu_deactivate(vcpu);
1180
1181 if (vcpu->rmode.active && (cr0 & X86_CR0_PE))
1182 enter_pmode(vcpu);
1183
1184 if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE))
1185 enter_rmode(vcpu);
1186
1187#ifdef CONFIG_X86_64
1188 if (vcpu->shadow_efer & EFER_LME) {
1189 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1190 enter_lmode(vcpu);
1191 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
1192 exit_lmode(vcpu);
1193 }
1194#endif
1195
1196 vmcs_writel(CR0_READ_SHADOW, cr0);
1197 vmcs_writel(GUEST_CR0,
1198 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
1199 vcpu->cr0 = cr0;
1200
1201 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1202 vmx_fpu_activate(vcpu);
1203}
1204
1205static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1206{
1207 vmcs_writel(GUEST_CR3, cr3);
1208 if (vcpu->cr0 & X86_CR0_PE)
1209 vmx_fpu_deactivate(vcpu);
1210}
1211
1212static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1213{
1214 vmcs_writel(CR4_READ_SHADOW, cr4);
1215 vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
1216 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
1217 vcpu->cr4 = cr4;
1218}
1219
1220#ifdef CONFIG_X86_64
1221
1222static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1223{
1224 struct vcpu_vmx *vmx = to_vmx(vcpu);
1225 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1226
1227 vcpu->shadow_efer = efer;
1228 if (efer & EFER_LMA) {
1229 vmcs_write32(VM_ENTRY_CONTROLS,
1230 vmcs_read32(VM_ENTRY_CONTROLS) |
1231 VM_ENTRY_IA32E_MODE);
1232 msr->data = efer;
1233
1234 } else {
1235 vmcs_write32(VM_ENTRY_CONTROLS,
1236 vmcs_read32(VM_ENTRY_CONTROLS) &
1237 ~VM_ENTRY_IA32E_MODE);
1238
1239 msr->data = efer & ~EFER_LME;
1240 }
1241 setup_msrs(vmx);
1242}
1243
1244#endif
1245
1246static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1247{
1248 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1249
1250 return vmcs_readl(sf->base);
1251}
1252
1253static void vmx_get_segment(struct kvm_vcpu *vcpu,
1254 struct kvm_segment *var, int seg)
1255{
1256 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1257 u32 ar;
1258
1259 var->base = vmcs_readl(sf->base);
1260 var->limit = vmcs_read32(sf->limit);
1261 var->selector = vmcs_read16(sf->selector);
1262 ar = vmcs_read32(sf->ar_bytes);
1263 if (ar & AR_UNUSABLE_MASK)
1264 ar = 0;
1265 var->type = ar & 15;
1266 var->s = (ar >> 4) & 1;
1267 var->dpl = (ar >> 5) & 3;
1268 var->present = (ar >> 7) & 1;
1269 var->avl = (ar >> 12) & 1;
1270 var->l = (ar >> 13) & 1;
1271 var->db = (ar >> 14) & 1;
1272 var->g = (ar >> 15) & 1;
1273 var->unusable = (ar >> 16) & 1;
1274}
1275
1276static u32 vmx_segment_access_rights(struct kvm_segment *var)
1277{
1278 u32 ar;
1279
1280 if (var->unusable)
1281 ar = 1 << 16;
1282 else {
1283 ar = var->type & 15;
1284 ar |= (var->s & 1) << 4;
1285 ar |= (var->dpl & 3) << 5;
1286 ar |= (var->present & 1) << 7;
1287 ar |= (var->avl & 1) << 12;
1288 ar |= (var->l & 1) << 13;
1289 ar |= (var->db & 1) << 14;
1290 ar |= (var->g & 1) << 15;
1291 }
1292 if (ar == 0) /* a 0 value means unusable */
1293 ar = AR_UNUSABLE_MASK;
1294
1295 return ar;
1296}
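
/*
 * Illustrative aside (not part of the original driver): the access-rights
 * encoding round-tripped by vmx_get_segment()/vmx_segment_access_rights().
 * "struct seg" below is a simplified stand-in for struct kvm_segment, and
 * the sketch packs a classic flat 32-bit code segment (AR bytes 0xc09b).
 */
#include <stdint.h>
#include <stdio.h>

struct seg {
	unsigned type:4, s:1, dpl:2, present:1, avl:1, l:1, db:1, g:1;
};

static uint32_t pack_ar(const struct seg *v)
{
	return  (v->type     & 15)       |
		((v->s       & 1) << 4)  |
		((v->dpl     & 3) << 5)  |
		((v->present & 1) << 7)  |
		((v->avl     & 1) << 12) |
		((v->l       & 1) << 13) |
		((v->db      & 1) << 14) |
		((v->g       & 1) << 15);
}

int main(void)
{
	struct seg cs = { .type = 11, .s = 1, .dpl = 0, .present = 1,
			  .avl = 0, .l = 0, .db = 1, .g = 1 };

	printf("AR bytes: %#x\n", pack_ar(&cs));	/* prints 0xc09b */
	return 0;
}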
1297
1298static void vmx_set_segment(struct kvm_vcpu *vcpu,
1299 struct kvm_segment *var, int seg)
1300{
1301 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1302 u32 ar;
1303
1304 if (vcpu->rmode.active && seg == VCPU_SREG_TR) {
1305 vcpu->rmode.tr.selector = var->selector;
1306 vcpu->rmode.tr.base = var->base;
1307 vcpu->rmode.tr.limit = var->limit;
1308 vcpu->rmode.tr.ar = vmx_segment_access_rights(var);
1309 return;
1310 }
1311 vmcs_writel(sf->base, var->base);
1312 vmcs_write32(sf->limit, var->limit);
1313 vmcs_write16(sf->selector, var->selector);
1314 if (vcpu->rmode.active && var->s) {
1315 /*
1316 * Hack real-mode segments into vm86 compatibility.
1317 */
1318 if (var->base == 0xffff0000 && var->selector == 0xf000)
1319 vmcs_writel(sf->base, 0xf0000);
1320 ar = 0xf3;
1321 } else
1322 ar = vmx_segment_access_rights(var);
1323 vmcs_write32(sf->ar_bytes, ar);
1324}
1325
1326static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1327{
1328 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
1329
1330 *db = (ar >> 14) & 1;
1331 *l = (ar >> 13) & 1;
1332}
1333
1334static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1335{
1336 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
1337 dt->base = vmcs_readl(GUEST_IDTR_BASE);
1338}
1339
1340static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1341{
1342 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
1343 vmcs_writel(GUEST_IDTR_BASE, dt->base);
1344}
1345
1346static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1347{
1348 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
1349 dt->base = vmcs_readl(GUEST_GDTR_BASE);
1350}
1351
1352static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1353{
1354 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
1355 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1356}
1357
1358static int init_rmode_tss(struct kvm* kvm)
1359{
1360 struct page *p1, *p2, *p3;
1361 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
1362 char *page;
1363
1364 p1 = gfn_to_page(kvm, fn++);
1365 p2 = gfn_to_page(kvm, fn++);
1366 p3 = gfn_to_page(kvm, fn);
1367
1368 if (!p1 || !p2 || !p3) {
1369 kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
1370 return 0;
1371 }
1372
1373 page = kmap_atomic(p1, KM_USER0);
1374 clear_page(page);
1375 *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1376 kunmap_atomic(page, KM_USER0);
1377
1378 page = kmap_atomic(p2, KM_USER0);
1379 clear_page(page);
1380 kunmap_atomic(page, KM_USER0);
1381
1382 page = kmap_atomic(p3, KM_USER0);
1383 clear_page(page);
1384 *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
1385 kunmap_atomic(page, KM_USER0);
1386
1387 return 1;
1388}
1389
1390static void seg_setup(int seg)
1391{
1392 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1393
1394 vmcs_write16(sf->selector, 0);
1395 vmcs_writel(sf->base, 0);
1396 vmcs_write32(sf->limit, 0xffff);
1397 vmcs_write32(sf->ar_bytes, 0x93);
1398}
1399
1400/*
1401 * Sets up the vmcs for emulated real mode.
1402 */
1403static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1404{
1405 u32 host_sysenter_cs;
1406 u32 junk;
1407 unsigned long a;
1408 struct descriptor_table dt;
1409 int i;
1410 int ret = 0;
1411 unsigned long kvm_vmx_return;
1412 u64 msr;
1413 u32 exec_control;
1414
1415 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1416 ret = -ENOMEM;
1417 goto out;
1418 }
1419
1420 vmx->vcpu.rmode.active = 0;
1421
1422 vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1423 set_cr8(&vmx->vcpu, 0);
1424 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1425 if (vmx->vcpu.vcpu_id == 0)
1426 msr |= MSR_IA32_APICBASE_BSP;
1427 kvm_set_apic_base(&vmx->vcpu, msr);
1428
1429 fx_init(&vmx->vcpu);
1430
1431 /*
1432 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1433 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1434 */
1435 if (vmx->vcpu.vcpu_id == 0) {
1436 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1437 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1438 } else {
1439 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
1440 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
1441 }
1442 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1443 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1444
1445 seg_setup(VCPU_SREG_DS);
1446 seg_setup(VCPU_SREG_ES);
1447 seg_setup(VCPU_SREG_FS);
1448 seg_setup(VCPU_SREG_GS);
1449 seg_setup(VCPU_SREG_SS);
1450
1451 vmcs_write16(GUEST_TR_SELECTOR, 0);
1452 vmcs_writel(GUEST_TR_BASE, 0);
1453 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1454 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1455
1456 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1457 vmcs_writel(GUEST_LDTR_BASE, 0);
1458 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1459 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1460
1461 vmcs_write32(GUEST_SYSENTER_CS, 0);
1462 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1463 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1464
1465 vmcs_writel(GUEST_RFLAGS, 0x02);
1466 if (vmx->vcpu.vcpu_id == 0)
1467 vmcs_writel(GUEST_RIP, 0xfff0);
1468 else
1469 vmcs_writel(GUEST_RIP, 0);
1470 vmcs_writel(GUEST_RSP, 0);
1471
1472 /* TODO: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
1473 vmcs_writel(GUEST_DR7, 0x400);
1474
1475 vmcs_writel(GUEST_GDTR_BASE, 0);
1476 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1477
1478 vmcs_writel(GUEST_IDTR_BASE, 0);
1479 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1480
1481 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1482 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1483 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1484
1485 /* I/O */
1486 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
1487 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
1488
1489 guest_write_tsc(0);
1490
1491 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1492
1493 /* Special registers */
1494 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1495
1496 /* Control */
1497 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1498 vmcs_config.pin_based_exec_ctrl);
1499
1500 exec_control = vmcs_config.cpu_based_exec_ctrl;
1501 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
1502 exec_control &= ~CPU_BASED_TPR_SHADOW;
1503#ifdef CONFIG_X86_64
1504 exec_control |= CPU_BASED_CR8_STORE_EXITING |
1505 CPU_BASED_CR8_LOAD_EXITING;
1506#endif
1507 }
1508 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1509
1510 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1511 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1512 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1513
1514 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
1515 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
1516 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
1517
1518 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
1519 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1520 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1521 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */
1522 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */
1523 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1524#ifdef CONFIG_X86_64
1525 rdmsrl(MSR_FS_BASE, a);
1526 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1527 rdmsrl(MSR_GS_BASE, a);
1528 vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1529#else
1530 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1531 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1532#endif
1533
1534 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
1535
1536 get_idt(&dt);
1537 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1538
1539 asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
1540 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
1541 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1542 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
1543 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
1544
1545 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
1546 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1547 rdmsrl(MSR_IA32_SYSENTER_ESP, a);
1548 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1549 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
1550 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1551
1552 for (i = 0; i < NR_VMX_MSR; ++i) {
1553 u32 index = vmx_msr_index[i];
1554 u32 data_low, data_high;
1555 u64 data;
1556 int j = vmx->nmsrs;
1557
1558 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1559 continue;
1560 if (wrmsr_safe(index, data_low, data_high) < 0)
1561 continue;
1562 data = data_low | ((u64)data_high << 32);
1563 vmx->host_msrs[j].index = index;
1564 vmx->host_msrs[j].reserved = 0;
1565 vmx->host_msrs[j].data = data;
1566 vmx->guest_msrs[j] = vmx->host_msrs[j];
1567 ++vmx->nmsrs;
1568 }
1569
1570 setup_msrs(vmx);
1571
1572 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1573
1574 /* 22.2.1, 20.8.1 */
1575 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1576
1577 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1578
1579#ifdef CONFIG_X86_64
1580 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
1581 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
1582 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
1583 page_to_phys(vmx->vcpu.apic->regs_page));
1584 vmcs_write32(TPR_THRESHOLD, 0);
1585#endif
1586
1587 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
1588 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1589
1590 vmx->vcpu.cr0 = 0x60000010;
1591 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); /* enter rmode */
1592 vmx_set_cr4(&vmx->vcpu, 0);
1593#ifdef CONFIG_X86_64
1594 vmx_set_efer(&vmx->vcpu, 0);
1595#endif
1596 vmx_fpu_activate(&vmx->vcpu);
1597 update_exception_bitmap(&vmx->vcpu);
1598
1599 return 0;
1600
1601out:
1602 return ret;
1603}
1604
1605static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1606{
1607 struct vcpu_vmx *vmx = to_vmx(vcpu);
1608
1609 vmx_vcpu_setup(vmx);
1610}
1611
1612static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1613{
1614 u16 ent[2];
1615 u16 cs;
1616 u16 ip;
1617 unsigned long flags;
1618 unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
1619 u16 sp = vmcs_readl(GUEST_RSP);
1620 u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
1621
1622 if (sp > ss_limit || sp < 6) {
1623 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
1624 __FUNCTION__,
1625 vmcs_readl(GUEST_RSP),
1626 vmcs_readl(GUEST_SS_BASE),
1627 vmcs_read32(GUEST_SS_LIMIT));
1628 return;
1629 }
1630
1631 if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) !=
1632 X86EMUL_CONTINUE) {
1633 vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
1634 return;
1635 }
1636
1637 flags = vmcs_readl(GUEST_RFLAGS);
1638 cs = vmcs_readl(GUEST_CS_BASE) >> 4;
1639 ip = vmcs_readl(GUEST_RIP);
1640
1641
1642 if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE ||
1643 emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE ||
1644 emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) {
1645 vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
1646 return;
1647 }
1648
1649 vmcs_writel(GUEST_RFLAGS, flags &
1650 ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
1651 vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
1652 vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
1653 vmcs_writel(GUEST_RIP, ent[0]);
1654 vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
1655}
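
/*
 * Illustrative aside (not part of the original driver): a user-space model
 * of the real-mode interrupt delivery that inject_rmode_irq() emulates
 * through the guest-memory accessors.  "mem" stands in for guest physical
 * memory; the IVT entry at irq * 4 is IP then CS, and FLAGS/CS/IP are
 * pushed on the 16-bit stack before control transfers to the handler.
 * Assumes a little-endian host (as on x86); all values are invented.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct rm_cpu { uint16_t cs, ip, ss, sp, flags; };

static void rm_inject(struct rm_cpu *c, uint8_t *mem, int irq)
{
	uint16_t new_ip, new_cs;

	memcpy(&new_ip, mem + irq * 4,     2);	/* IVT: offset first */
	memcpy(&new_cs, mem + irq * 4 + 2, 2);	/* ...then segment   */

	c->sp -= 2; memcpy(mem + c->ss * 16 + c->sp, &c->flags, 2);
	c->sp -= 2; memcpy(mem + c->ss * 16 + c->sp, &c->cs,    2);
	c->sp -= 2; memcpy(mem + c->ss * 16 + c->sp, &c->ip,    2);

	c->flags &= ~0x0200;	/* clear IF, as the hardware would */
	c->cs = new_cs;
	c->ip = new_ip;
}

int main(void)
{
	static uint8_t mem[1 << 20];	/* 1 MiB of "guest" memory */
	struct rm_cpu cpu = { .cs = 0, .ip = 0x7c00, .ss = 0, .sp = 0x7000,
			      .flags = 0x0202 };

	/* hypothetical handler for vector 8 at 0xf000:0x1234 */
	mem[8 * 4] = 0x34; mem[8 * 4 + 1] = 0x12;
	mem[8 * 4 + 2] = 0x00; mem[8 * 4 + 3] = 0xf0;

	rm_inject(&cpu, mem, 8);
	printf("cs:ip = %04x:%04x, sp = %#x\n", cpu.cs, cpu.ip, cpu.sp);
	return 0;
}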
1656
1657static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1658{
1659 if (vcpu->rmode.active) {
1660 inject_rmode_irq(vcpu, irq);
1661 return;
1662 }
1663 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1664 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1665}
1666
1667static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1668{
1669 int word_index = __ffs(vcpu->irq_summary);
1670 int bit_index = __ffs(vcpu->irq_pending[word_index]);
1671 int irq = word_index * BITS_PER_LONG + bit_index;
1672
1673 clear_bit(bit_index, &vcpu->irq_pending[word_index]);
1674 if (!vcpu->irq_pending[word_index])
1675 clear_bit(word_index, &vcpu->irq_summary);
1676 vmx_inject_irq(vcpu, irq);
1677}
1678
1679
1680static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1681 struct kvm_run *kvm_run)
1682{
1683 u32 cpu_based_vm_exec_control;
1684
1685 vcpu->interrupt_window_open =
1686 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
1687 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
1688
1689 if (vcpu->interrupt_window_open &&
1690 vcpu->irq_summary &&
1691 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1692 /*
1693 * Interrupts are enabled and not blocked by sti or mov ss: inject now.
1694 */
1695 kvm_do_inject_irq(vcpu);
1696
1697 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
1698 if (!vcpu->interrupt_window_open &&
1699 (vcpu->irq_summary || kvm_run->request_interrupt_window))
1700 /*
1701 * Interrupts blocked. Wait for unblock.
1702 */
1703 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
1704 else
1705 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
1706 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
1707}
1708
1709static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1710{
1711 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
1712
1713 set_debugreg(dbg->bp[0], 0);
1714 set_debugreg(dbg->bp[1], 1);
1715 set_debugreg(dbg->bp[2], 2);
1716 set_debugreg(dbg->bp[3], 3);
1717
1718 if (dbg->singlestep) {
1719 unsigned long flags;
1720
1721 flags = vmcs_readl(GUEST_RFLAGS);
1722 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1723 vmcs_writel(GUEST_RFLAGS, flags);
1724 }
1725}
1726
1727static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1728 int vec, u32 err_code)
1729{
1730 if (!vcpu->rmode.active)
1731 return 0;
1732
1733 /*
1734 * An instruction with the address-size override prefix (opcode 0x67)
1735 * causes a #SS fault with error code 0 in VM86 mode.
1736 */
1737 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
1738 if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
1739 return 1;
1740 return 0;
1741}
1742
1743static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1744{
1745 u32 intr_info, error_code;
1746 unsigned long cr2, rip;
1747 u32 vect_info;
1748 enum emulation_result er;
1749 int r;
1750
1751 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1752 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1753
1754 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1755 !is_page_fault(intr_info)) {
1756 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1757 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1758 }
1759
1760 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
1761 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1762 set_bit(irq, vcpu->irq_pending);
1763 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
1764 }
1765
1766 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
1767 return 1; /* already handled by vmx_vcpu_run() */
1768
1769 if (is_no_device(intr_info)) {
1770 vmx_fpu_activate(vcpu);
1771 return 1;
1772 }
1773
1774 error_code = 0;
1775 rip = vmcs_readl(GUEST_RIP);
1776 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1777 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1778 if (is_page_fault(intr_info)) {
1779 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1780
1781 mutex_lock(&vcpu->kvm->lock);
1782 r = kvm_mmu_page_fault(vcpu, cr2, error_code);
1783 if (r < 0) {
1784 mutex_unlock(&vcpu->kvm->lock);
1785 return r;
1786 }
1787 if (!r) {
1788 mutex_unlock(&vcpu->kvm->lock);
1789 return 1;
1790 }
1791
1792 er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
1793 mutex_unlock(&vcpu->kvm->lock);
1794
1795 switch (er) {
1796 case EMULATE_DONE:
1797 return 1;
1798 case EMULATE_DO_MMIO:
1799 ++vcpu->stat.mmio_exits;
1800 return 0;
1801 case EMULATE_FAIL:
1802 kvm_report_emulation_failure(vcpu, "pagetable");
1803 break;
1804 default:
1805 BUG();
1806 }
1807 }
1808
1809 if (vcpu->rmode.active &&
1810 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1811 error_code)) {
1812 if (vcpu->halt_request) {
1813 vcpu->halt_request = 0;
1814 return kvm_emulate_halt(vcpu);
1815 }
1816 return 1;
1817 }
1818
1819 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
1820 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1821 return 0;
1822 }
1823 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
1824 kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1825 kvm_run->ex.error_code = error_code;
1826 return 0;
1827}
1828
1829static int handle_external_interrupt(struct kvm_vcpu *vcpu,
1830 struct kvm_run *kvm_run)
1831{
1832 ++vcpu->stat.irq_exits;
1833 return 1;
1834}
1835
1836static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1837{
1838 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1839 return 0;
1840}
1841
1842static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1843{
1844 unsigned long exit_qualification;
1845 int size, down, in, string, rep;
1846 unsigned port;
1847
1848 ++vcpu->stat.io_exits;
1849 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1850 string = (exit_qualification & 16) != 0;
1851
1852 if (string) {
1853 if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
1854 return 0;
1855 return 1;
1856 }
1857
1858 size = (exit_qualification & 7) + 1;
1859 in = (exit_qualification & 8) != 0;
1860 down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
1861 rep = (exit_qualification & 32) != 0;
1862 port = exit_qualification >> 16;
1863
1864 return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
1865}
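
/*
 * Illustrative aside (not part of the original driver): the I/O-instruction
 * exit qualification layout decoded by handle_io() above.  The sample value
 * is invented; on hardware it comes from vmcs_readl(EXIT_QUALIFICATION).
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t qual = 0x03f80008;	/* hypothetical: "inb" from port 0x3f8 */

	int size   = (qual & 7) + 1;		/* bits 2:0 = size - 1      */
	int in     = (qual & 8) != 0;		/* bit 3: 1 = IN, 0 = OUT   */
	int string = (qual & 16) != 0;		/* bit 4: INS/OUTS          */
	int rep    = (qual & 32) != 0;		/* bit 5: REP prefix        */
	unsigned port = (qual >> 16) & 0xffff;	/* bits 31:16               */

	printf("%s port %#x, %d byte(s), string=%d rep=%d\n",
	       in ? "in" : "out", port, size, string, rep);
	return 0;
}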
1866
1867static void
1868vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1869{
1870 /*
1871 * Patch in the VMCALL instruction:
1872 */
1873 hypercall[0] = 0x0f;
1874 hypercall[1] = 0x01;
1875 hypercall[2] = 0xc1;
1876 hypercall[3] = 0xc3;
1877}
1878
1879static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1880{
1881 unsigned long exit_qualification;
1882 int cr;
1883 int reg;
1884
1885 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1886 cr = exit_qualification & 15;
1887 reg = (exit_qualification >> 8) & 15;
1888 switch ((exit_qualification >> 4) & 3) {
1889 case 0: /* mov to cr */
1890 switch (cr) {
1891 case 0:
1892 vcpu_load_rsp_rip(vcpu);
1893 set_cr0(vcpu, vcpu->regs[reg]);
1894 skip_emulated_instruction(vcpu);
1895 return 1;
1896 case 3:
1897 vcpu_load_rsp_rip(vcpu);
1898 set_cr3(vcpu, vcpu->regs[reg]);
1899 skip_emulated_instruction(vcpu);
1900 return 1;
1901 case 4:
1902 vcpu_load_rsp_rip(vcpu);
1903 set_cr4(vcpu, vcpu->regs[reg]);
1904 skip_emulated_instruction(vcpu);
1905 return 1;
1906 case 8:
1907 vcpu_load_rsp_rip(vcpu);
1908 set_cr8(vcpu, vcpu->regs[reg]);
1909 skip_emulated_instruction(vcpu);
1910 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1911 return 0;
1912 }
1913 break;
1914 case 2: /* clts */
1915 vcpu_load_rsp_rip(vcpu);
1916 vmx_fpu_deactivate(vcpu);
1917 vcpu->cr0 &= ~X86_CR0_TS;
1918 vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
1919 vmx_fpu_activate(vcpu);
1920 skip_emulated_instruction(vcpu);
1921 return 1;
1922 case 1: /* mov from cr */
1923 switch (cr) {
1924 case 3:
1925 vcpu_load_rsp_rip(vcpu);
1926 vcpu->regs[reg] = vcpu->cr3;
1927 vcpu_put_rsp_rip(vcpu);
1928 skip_emulated_instruction(vcpu);
1929 return 1;
1930 case 8:
1931 vcpu_load_rsp_rip(vcpu);
1932 vcpu->regs[reg] = get_cr8(vcpu);
1933 vcpu_put_rsp_rip(vcpu);
1934 skip_emulated_instruction(vcpu);
1935 return 1;
1936 }
1937 break;
1938 case 3: /* lmsw */
1939 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
1940
1941 skip_emulated_instruction(vcpu);
1942 return 1;
1943 default:
1944 break;
1945 }
1946 kvm_run->exit_reason = 0;
1947 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
1948 (int)(exit_qualification >> 4) & 3, cr);
1949 return 0;
1950}
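
/*
 * Illustrative aside (not part of the original driver): the CR-access exit
 * qualification fields that handle_cr() switches on, matching the
 * CONTROL_REG_ACCESS_* and LMSW_SOURCE_DATA definitions in vmx.h.  The
 * sample value is invented.
 */
#include <stdint.h>
#include <stdio.h>

static const char *types[] = { "mov to cr", "mov from cr", "clts", "lmsw" };

int main(void)
{
	uint64_t qual = 0x00000304;	/* hypothetical: "mov %rbx, %cr4" */

	int cr   = qual & 15;		/* bits 3:0   */
	int type = (qual >> 4) & 3;	/* bits 5:4   */
	int reg  = (qual >> 8) & 15;	/* bits 11:8  */
	unsigned lmsw_src = (qual >> 16) & 0xffff;	/* bits 31:16, lmsw only */

	printf("cr%d, %s, gpr %d, lmsw source %#x\n",
	       cr, types[type], reg, lmsw_src);
	return 0;
}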
1951
1952static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1953{
1954 unsigned long exit_qualification;
1955 unsigned long val;
1956 int dr, reg;
1957
1958 /*
1959 * FIXME: this code assumes the host is debugging the guest.
1960 * need to deal with guest debugging itself too.
1961 */
1962 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1963 dr = exit_qualification & 7;
1964 reg = (exit_qualification >> 8) & 15;
1965 vcpu_load_rsp_rip(vcpu);
1966 if (exit_qualification & 16) {
1967 /* mov from dr */
1968 switch (dr) {
1969 case 6:
1970 val = 0xffff0ff0;
1971 break;
1972 case 7:
1973 val = 0x400;
1974 break;
1975 default:
1976 val = 0;
1977 }
1978 vcpu->regs[reg] = val;
1979 } else {
1980 /* mov to dr */
1981 }
1982 vcpu_put_rsp_rip(vcpu);
1983 skip_emulated_instruction(vcpu);
1984 return 1;
1985}
1986
1987static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1988{
1989 kvm_emulate_cpuid(vcpu);
1990 return 1;
1991}
1992
1993static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1994{
1995 u32 ecx = vcpu->regs[VCPU_REGS_RCX];
1996 u64 data;
1997
1998 if (vmx_get_msr(vcpu, ecx, &data)) {
1999 vmx_inject_gp(vcpu, 0);
2000 return 1;
2001 }
2002
2003 /* FIXME: handling of bits 32:63 of rax, rdx */
2004 vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2005 vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2006 skip_emulated_instruction(vcpu);
2007 return 1;
2008}
2009
2010static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2011{
2012 u32 ecx = vcpu->regs[VCPU_REGS_RCX];
2013 u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2014 | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2015
2016 if (vmx_set_msr(vcpu, ecx, data) != 0) {
2017 vmx_inject_gp(vcpu, 0);
2018 return 1;
2019 }
2020
2021 skip_emulated_instruction(vcpu);
2022 return 1;
2023}
2024
2025static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
2026 struct kvm_run *kvm_run)
2027{
2028 return 1;
2029}
2030
2031static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2032 struct kvm_run *kvm_run)
2033{
2034 u32 cpu_based_vm_exec_control;
2035
2036 /* clear pending irq */
2037 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2038 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2039 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2040 /*
2041 * If user space is waiting to inject interrupts, exit as soon as
2042 * possible.
2043 */
2044 if (kvm_run->request_interrupt_window &&
2045 !vcpu->irq_summary) {
2046 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2047 ++vcpu->stat.irq_window_exits;
2048 return 0;
2049 }
2050 return 1;
2051}
2052
2053static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2054{
2055 skip_emulated_instruction(vcpu);
2056 return kvm_emulate_halt(vcpu);
2057}
2058
2059static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2060{
2061 skip_emulated_instruction(vcpu);
2062 return kvm_hypercall(vcpu, kvm_run);
2063}
2064
2065/*
2066 * The exit handlers return 1 if the exit was handled fully and guest execution
2067 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
2068 * to be done to userspace and return 0.
2069 */
2070static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2071 struct kvm_run *kvm_run) = {
2072 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2073 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2074 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
2075 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2076 [EXIT_REASON_CR_ACCESS] = handle_cr,
2077 [EXIT_REASON_DR_ACCESS] = handle_dr,
2078 [EXIT_REASON_CPUID] = handle_cpuid,
2079 [EXIT_REASON_MSR_READ] = handle_rdmsr,
2080 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2081 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2082 [EXIT_REASON_HLT] = handle_halt,
2083 [EXIT_REASON_VMCALL] = handle_vmcall,
2084 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold
2085};
2086
2087static const int kvm_vmx_max_exit_handlers =
2088 ARRAY_SIZE(kvm_vmx_exit_handlers);
2089
2090/*
2091 * The guest has exited. See if we can fix it or if we need userspace
2092 * assistance.
2093 */
2094static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2095{
2096 u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2097 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
2098 struct vcpu_vmx *vmx = to_vmx(vcpu);
2099
2100 if (unlikely(vmx->fail)) {
2101 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2102 kvm_run->fail_entry.hardware_entry_failure_reason
2103 = vmcs_read32(VM_INSTRUCTION_ERROR);
2104 return 0;
2105 }
2106
2107 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2108 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2109 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
2110 "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2111 if (exit_reason < kvm_vmx_max_exit_handlers
2112 && kvm_vmx_exit_handlers[exit_reason])
2113 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
2114 else {
2115 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
2116 kvm_run->hw.hardware_exit_reason = exit_reason;
2117 }
2118 return 0;
2119}
2120
2121static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
2122{
2123}
2124
2125static void update_tpr_threshold(struct kvm_vcpu *vcpu)
2126{
2127 int max_irr, tpr;
2128
2129 if (!vm_need_tpr_shadow(vcpu->kvm))
2130 return;
2131
2132 if (!kvm_lapic_enabled(vcpu) ||
2133 ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
2134 vmcs_write32(TPR_THRESHOLD, 0);
2135 return;
2136 }
2137
2138 tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
2139 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
2140}
2141
2142static void enable_irq_window(struct kvm_vcpu *vcpu)
2143{
2144 u32 cpu_based_vm_exec_control;
2145
2146 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2147 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2148 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2149}
2150
2151static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2152{
2153 u32 idtv_info_field, intr_info_field;
2154 int has_ext_irq, interrupt_window_open;
2155 int vector;
2156
2157 kvm_inject_pending_timer_irqs(vcpu);
2158 update_tpr_threshold(vcpu);
2159
2160 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2161 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2162 idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2163 if (intr_info_field & INTR_INFO_VALID_MASK) {
2164 if (idtv_info_field & INTR_INFO_VALID_MASK) {
2165 /* TODO: fault when IDT_Vectoring */
2166 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2167 }
2168 if (has_ext_irq)
2169 enable_irq_window(vcpu);
2170 return;
2171 }
2172 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2173 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
2174 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2175 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
2176
2177 if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
2178 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2179 vmcs_read32(IDT_VECTORING_ERROR_CODE));
2180 if (unlikely(has_ext_irq))
2181 enable_irq_window(vcpu);
2182 return;
2183 }
2184 if (!has_ext_irq)
2185 return;
2186 interrupt_window_open =
2187 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2188 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2189 if (interrupt_window_open) {
2190 vector = kvm_cpu_get_interrupt(vcpu);
2191 vmx_inject_irq(vcpu, vector);
2192 kvm_timer_intr_post(vcpu, vector);
2193 } else
2194 enable_irq_window(vcpu);
2195}
2196
2197static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2198{
2199 struct vcpu_vmx *vmx = to_vmx(vcpu);
2200 u32 intr_info;
2201
2202 /*
2203 * Loading guest fpu may have cleared host cr0.ts
2204 */
2205 vmcs_writel(HOST_CR0, read_cr0());
2206
2207 asm (
2208 /* Store host registers */
2209#ifdef CONFIG_X86_64
2210 "push %%rax; push %%rbx; push %%rdx;"
2211 "push %%rsi; push %%rdi; push %%rbp;"
2212 "push %%r8; push %%r9; push %%r10; push %%r11;"
2213 "push %%r12; push %%r13; push %%r14; push %%r15;"
2214 "push %%rcx \n\t"
2215 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2216#else
2217 "pusha; push %%ecx \n\t"
2218 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2219#endif
2220 /* Check if vmlaunch or vmresume is needed */
2221 "cmp $0, %1 \n\t"
2222 /* Load guest registers. Don't clobber flags. */
2223#ifdef CONFIG_X86_64
2224 "mov %c[cr2](%3), %%rax \n\t"
2225 "mov %%rax, %%cr2 \n\t"
2226 "mov %c[rax](%3), %%rax \n\t"
2227 "mov %c[rbx](%3), %%rbx \n\t"
2228 "mov %c[rdx](%3), %%rdx \n\t"
2229 "mov %c[rsi](%3), %%rsi \n\t"
2230 "mov %c[rdi](%3), %%rdi \n\t"
2231 "mov %c[rbp](%3), %%rbp \n\t"
2232 "mov %c[r8](%3), %%r8 \n\t"
2233 "mov %c[r9](%3), %%r9 \n\t"
2234 "mov %c[r10](%3), %%r10 \n\t"
2235 "mov %c[r11](%3), %%r11 \n\t"
2236 "mov %c[r12](%3), %%r12 \n\t"
2237 "mov %c[r13](%3), %%r13 \n\t"
2238 "mov %c[r14](%3), %%r14 \n\t"
2239 "mov %c[r15](%3), %%r15 \n\t"
2240 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
2241#else
2242 "mov %c[cr2](%3), %%eax \n\t"
2243 "mov %%eax, %%cr2 \n\t"
2244 "mov %c[rax](%3), %%eax \n\t"
2245 "mov %c[rbx](%3), %%ebx \n\t"
2246 "mov %c[rdx](%3), %%edx \n\t"
2247 "mov %c[rsi](%3), %%esi \n\t"
2248 "mov %c[rdi](%3), %%edi \n\t"
2249 "mov %c[rbp](%3), %%ebp \n\t"
2250 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
2251#endif
2252 /* Enter guest mode */
2253 "jne .Llaunched \n\t"
2254 ASM_VMX_VMLAUNCH "\n\t"
2255 "jmp .Lkvm_vmx_return \n\t"
2256 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
2257 ".Lkvm_vmx_return: "
2258 /* Save guest registers, load host registers, keep flags */
2259#ifdef CONFIG_X86_64
2260 "xchg %3, (%%rsp) \n\t"
2261 "mov %%rax, %c[rax](%3) \n\t"
2262 "mov %%rbx, %c[rbx](%3) \n\t"
2263 "pushq (%%rsp); popq %c[rcx](%3) \n\t"
2264 "mov %%rdx, %c[rdx](%3) \n\t"
2265 "mov %%rsi, %c[rsi](%3) \n\t"
2266 "mov %%rdi, %c[rdi](%3) \n\t"
2267 "mov %%rbp, %c[rbp](%3) \n\t"
2268 "mov %%r8, %c[r8](%3) \n\t"
2269 "mov %%r9, %c[r9](%3) \n\t"
2270 "mov %%r10, %c[r10](%3) \n\t"
2271 "mov %%r11, %c[r11](%3) \n\t"
2272 "mov %%r12, %c[r12](%3) \n\t"
2273 "mov %%r13, %c[r13](%3) \n\t"
2274 "mov %%r14, %c[r14](%3) \n\t"
2275 "mov %%r15, %c[r15](%3) \n\t"
2276 "mov %%cr2, %%rax \n\t"
2277 "mov %%rax, %c[cr2](%3) \n\t"
2278 "mov (%%rsp), %3 \n\t"
2279
2280 "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
2281 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
2282 "pop %%rbp; pop %%rdi; pop %%rsi;"
2283 "pop %%rdx; pop %%rbx; pop %%rax \n\t"
2284#else
2285 "xchg %3, (%%esp) \n\t"
2286 "mov %%eax, %c[rax](%3) \n\t"
2287 "mov %%ebx, %c[rbx](%3) \n\t"
2288 "pushl (%%esp); popl %c[rcx](%3) \n\t"
2289 "mov %%edx, %c[rdx](%3) \n\t"
2290 "mov %%esi, %c[rsi](%3) \n\t"
2291 "mov %%edi, %c[rdi](%3) \n\t"
2292 "mov %%ebp, %c[rbp](%3) \n\t"
2293 "mov %%cr2, %%eax \n\t"
2294 "mov %%eax, %c[cr2](%3) \n\t"
2295 "mov (%%esp), %3 \n\t"
2296
2297 "pop %%ecx; popa \n\t"
2298#endif
2299 "setbe %0 \n\t"
2300 : "=q" (vmx->fail)
2301 : "r"(vmx->launched), "d"((unsigned long)HOST_RSP),
2302 "c"(vcpu),
2303 [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
2304 [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
2305 [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
2306 [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
2307 [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
2308 [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
2309 [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
2310#ifdef CONFIG_X86_64
2311 [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
2312 [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
2313 [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
2314 [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
2315 [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
2316 [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
2317 [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
2318 [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
2319#endif
2320 [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
2321 : "cc", "memory" );
2322
2323 vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
2324
2325 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2326 vmx->launched = 1;
2327
2328 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2329
2330 /* We need to handle NMIs before interrupts are enabled */
2331 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
2332 asm("int $2");
2333}
2334
2335static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
2336 unsigned long addr,
2337 u32 err_code)
2338{
2339 u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2340
2341 ++vcpu->stat.pf_guest;
2342
2343 if (is_page_fault(vect_info)) {
2344 printk(KERN_DEBUG "inject_page_fault: "
2345 "double fault 0x%lx @ 0x%lx\n",
2346 addr, vmcs_readl(GUEST_RIP));
2347 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
2348 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2349 DF_VECTOR |
2350 INTR_TYPE_EXCEPTION |
2351 INTR_INFO_DELIEVER_CODE_MASK |
2352 INTR_INFO_VALID_MASK);
2353 return;
2354 }
2355 vcpu->cr2 = addr;
2356 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
2357 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2358 PF_VECTOR |
2359 INTR_TYPE_EXCEPTION |
2360 INTR_INFO_DELIEVER_CODE_MASK |
2361 INTR_INFO_VALID_MASK);
2362
2363}
2364
2365static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2366{
2367 struct vcpu_vmx *vmx = to_vmx(vcpu);
2368
2369 if (vmx->vmcs) {
2370 on_each_cpu(__vcpu_clear, vmx, 0, 1);
2371 free_vmcs(vmx->vmcs);
2372 vmx->vmcs = NULL;
2373 }
2374}
2375
2376static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
2377{
2378 struct vcpu_vmx *vmx = to_vmx(vcpu);
2379
2380 vmx_free_vmcs(vcpu);
2381 kfree(vmx->host_msrs);
2382 kfree(vmx->guest_msrs);
2383 kvm_vcpu_uninit(vcpu);
2384 kmem_cache_free(kvm_vcpu_cache, vmx);
2385}
2386
2387static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2388{
2389 int err;
2390 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
2391 int cpu;
2392
2393 if (!vmx)
2394 return ERR_PTR(-ENOMEM);
2395
2396 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
2397 if (err)
2398 goto free_vcpu;
2399
2400 if (irqchip_in_kernel(kvm)) {
2401 err = kvm_create_lapic(&vmx->vcpu);
2402 if (err < 0)
2403 goto free_vcpu;
2404 }
2405
2406 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2407 if (!vmx->guest_msrs) {
2408 err = -ENOMEM;
2409 goto uninit_vcpu;
2410 }
2411
2412 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2413 if (!vmx->host_msrs)
2414 goto free_guest_msrs;
2415
2416 vmx->vmcs = alloc_vmcs();
2417 if (!vmx->vmcs)
2418 goto free_msrs;
2419
2420 vmcs_clear(vmx->vmcs);
2421
2422 cpu = get_cpu();
2423 vmx_vcpu_load(&vmx->vcpu, cpu);
2424 err = vmx_vcpu_setup(vmx);
2425 vmx_vcpu_put(&vmx->vcpu);
2426 put_cpu();
2427 if (err)
2428 goto free_vmcs;
2429
2430 return &vmx->vcpu;
2431
2432free_vmcs:
2433 free_vmcs(vmx->vmcs);
2434free_msrs:
2435 kfree(vmx->host_msrs);
2436free_guest_msrs:
2437 kfree(vmx->guest_msrs);
2438uninit_vcpu:
2439 kvm_vcpu_uninit(&vmx->vcpu);
2440free_vcpu:
2441 kmem_cache_free(kvm_vcpu_cache, vmx);
2442 return ERR_PTR(err);
2443}
2444
2445static void __init vmx_check_processor_compat(void *rtn)
2446{
2447 struct vmcs_config vmcs_conf;
2448
2449 *(int *)rtn = 0;
2450 if (setup_vmcs_config(&vmcs_conf) < 0)
2451 *(int *)rtn = -EIO;
2452 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
2453 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
2454 smp_processor_id());
2455 *(int *)rtn = -EIO;
2456 }
2457}
2458
2459static struct kvm_x86_ops vmx_x86_ops = {
2460 .cpu_has_kvm_support = cpu_has_kvm_support,
2461 .disabled_by_bios = vmx_disabled_by_bios,
2462 .hardware_setup = hardware_setup,
2463 .hardware_unsetup = hardware_unsetup,
2464 .check_processor_compatibility = vmx_check_processor_compat,
2465 .hardware_enable = hardware_enable,
2466 .hardware_disable = hardware_disable,
2467
2468 .vcpu_create = vmx_create_vcpu,
2469 .vcpu_free = vmx_free_vcpu,
2470 .vcpu_reset = vmx_vcpu_reset,
2471
2472 .prepare_guest_switch = vmx_save_host_state,
2473 .vcpu_load = vmx_vcpu_load,
2474 .vcpu_put = vmx_vcpu_put,
2475 .vcpu_decache = vmx_vcpu_decache,
2476
2477 .set_guest_debug = set_guest_debug,
2478 .guest_debug_pre = kvm_guest_debug_pre,
2479 .get_msr = vmx_get_msr,
2480 .set_msr = vmx_set_msr,
2481 .get_segment_base = vmx_get_segment_base,
2482 .get_segment = vmx_get_segment,
2483 .set_segment = vmx_set_segment,
2484 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
2485 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
2486 .set_cr0 = vmx_set_cr0,
2487 .set_cr3 = vmx_set_cr3,
2488 .set_cr4 = vmx_set_cr4,
2489#ifdef CONFIG_X86_64
2490 .set_efer = vmx_set_efer,
2491#endif
2492 .get_idt = vmx_get_idt,
2493 .set_idt = vmx_set_idt,
2494 .get_gdt = vmx_get_gdt,
2495 .set_gdt = vmx_set_gdt,
2496 .cache_regs = vcpu_load_rsp_rip,
2497 .decache_regs = vcpu_put_rsp_rip,
2498 .get_rflags = vmx_get_rflags,
2499 .set_rflags = vmx_set_rflags,
2500
2501 .tlb_flush = vmx_flush_tlb,
2502 .inject_page_fault = vmx_inject_page_fault,
2503
2504 .inject_gp = vmx_inject_gp,
2505
2506 .run = vmx_vcpu_run,
2507 .handle_exit = kvm_handle_exit,
2508 .skip_emulated_instruction = skip_emulated_instruction,
2509 .patch_hypercall = vmx_patch_hypercall,
2510 .get_irq = vmx_get_irq,
2511 .set_irq = vmx_inject_irq,
2512 .inject_pending_irq = vmx_intr_assist,
2513 .inject_pending_vectors = do_interrupt_requests,
2514};
2515
2516static int __init vmx_init(void)
2517{
2518 void *iova;
2519 int r;
2520
2521 vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2522 if (!vmx_io_bitmap_a)
2523 return -ENOMEM;
2524
2525 vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2526 if (!vmx_io_bitmap_b) {
2527 r = -ENOMEM;
2528 goto out;
2529 }
2530
2531 /*
2532 * Allow direct access to the PC debug port (it is often used for I/O
2533 * delays, but the vmexits simply slow things down).
2534 */
2535 iova = kmap(vmx_io_bitmap_a);
2536 memset(iova, 0xff, PAGE_SIZE);
2537 clear_bit(0x80, iova);
2538 kunmap(vmx_io_bitmap_a);
2539
2540 iova = kmap(vmx_io_bitmap_b);
2541 memset(iova, 0xff, PAGE_SIZE);
2542 kunmap(vmx_io_bitmap_b);
2543
2544 r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
2545 if (r)
2546 goto out1;
2547
2548 return 0;
2549
2550out1:
2551 __free_page(vmx_io_bitmap_b);
2552out:
2553 __free_page(vmx_io_bitmap_a);
2554 return r;
2555}
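
/*
 * Illustrative aside (not part of the original driver): how the I/O bitmap
 * pages initialised in vmx_init() are interpreted.  One bit per port; a set
 * bit forces a VM exit, a clear bit passes the access through, which is why
 * the code above fills the page with 0xff and then clears bit 0x80 for the
 * PC debug port.  Standalone user-space sketch.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define IO_BITMAP_BYTES 4096	/* one page covers ports 0x0000-0x7fff */

static int port_causes_exit(const uint8_t *bitmap, unsigned port)
{
	return (bitmap[port / 8] >> (port % 8)) & 1;
}

int main(void)
{
	static uint8_t bitmap_a[IO_BITMAP_BYTES];

	memset(bitmap_a, 0xff, sizeof(bitmap_a));	/* trap everything */
	bitmap_a[0x80 / 8] &= ~(1u << (0x80 % 8));	/* allow port 0x80 */

	printf("port 0x80 exits: %d, port 0x3f8 exits: %d\n",
	       port_causes_exit(bitmap_a, 0x80),
	       port_causes_exit(bitmap_a, 0x3f8));
	return 0;
}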
2556
2557static void __exit vmx_exit(void)
2558{
2559 __free_page(vmx_io_bitmap_b);
2560 __free_page(vmx_io_bitmap_a);
2561
2562 kvm_exit_x86();
2563}
2564
2565module_init(vmx_init)
2566module_exit(vmx_exit)
diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h
deleted file mode 100644
index fd4e14666088..000000000000
--- a/drivers/kvm/vmx.h
+++ /dev/null
@@ -1,310 +0,0 @@
1#ifndef VMX_H
2#define VMX_H
3
4/*
5 * vmx.h: VMX Architecture related definitions
6 * Copyright (c) 2004, Intel Corporation.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 * Place - Suite 330, Boston, MA 02111-1307 USA.
20 *
21 * A few random additions are:
22 * Copyright (C) 2006 Qumranet
23 * Avi Kivity <avi@qumranet.com>
24 * Yaniv Kamay <yaniv@qumranet.com>
25 *
26 */
27
28#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
29#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
30#define CPU_BASED_HLT_EXITING 0x00000080
31#define CPU_BASED_INVLPG_EXITING 0x00000200
32#define CPU_BASED_MWAIT_EXITING 0x00000400
33#define CPU_BASED_RDPMC_EXITING 0x00000800
34#define CPU_BASED_RDTSC_EXITING 0x00001000
35#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
36#define CPU_BASED_CR8_STORE_EXITING 0x00100000
37#define CPU_BASED_TPR_SHADOW 0x00200000
38#define CPU_BASED_MOV_DR_EXITING 0x00800000
39#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
40#define CPU_BASED_USE_IO_BITMAPS 0x02000000
41#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
42#define CPU_BASED_MONITOR_EXITING 0x20000000
43#define CPU_BASED_PAUSE_EXITING 0x40000000
44#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
45
46#define PIN_BASED_EXT_INTR_MASK 0x00000001
47#define PIN_BASED_NMI_EXITING 0x00000008
48#define PIN_BASED_VIRTUAL_NMIS 0x00000020
49
50#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
51#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
52
53#define VM_ENTRY_IA32E_MODE 0x00000200
54#define VM_ENTRY_SMM 0x00000400
55#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
56
57#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
58
59/* VMCS Encodings */
60enum vmcs_field {
61 GUEST_ES_SELECTOR = 0x00000800,
62 GUEST_CS_SELECTOR = 0x00000802,
63 GUEST_SS_SELECTOR = 0x00000804,
64 GUEST_DS_SELECTOR = 0x00000806,
65 GUEST_FS_SELECTOR = 0x00000808,
66 GUEST_GS_SELECTOR = 0x0000080a,
67 GUEST_LDTR_SELECTOR = 0x0000080c,
68 GUEST_TR_SELECTOR = 0x0000080e,
69 HOST_ES_SELECTOR = 0x00000c00,
70 HOST_CS_SELECTOR = 0x00000c02,
71 HOST_SS_SELECTOR = 0x00000c04,
72 HOST_DS_SELECTOR = 0x00000c06,
73 HOST_FS_SELECTOR = 0x00000c08,
74 HOST_GS_SELECTOR = 0x00000c0a,
75 HOST_TR_SELECTOR = 0x00000c0c,
76 IO_BITMAP_A = 0x00002000,
77 IO_BITMAP_A_HIGH = 0x00002001,
78 IO_BITMAP_B = 0x00002002,
79 IO_BITMAP_B_HIGH = 0x00002003,
80 MSR_BITMAP = 0x00002004,
81 MSR_BITMAP_HIGH = 0x00002005,
82 VM_EXIT_MSR_STORE_ADDR = 0x00002006,
83 VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
84 VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
85 VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
86 VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
87 VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
88 TSC_OFFSET = 0x00002010,
89 TSC_OFFSET_HIGH = 0x00002011,
90 VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
91 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
92 VMCS_LINK_POINTER = 0x00002800,
93 VMCS_LINK_POINTER_HIGH = 0x00002801,
94 GUEST_IA32_DEBUGCTL = 0x00002802,
95 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
96 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
97 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
98 EXCEPTION_BITMAP = 0x00004004,
99 PAGE_FAULT_ERROR_CODE_MASK = 0x00004006,
100 PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008,
101 CR3_TARGET_COUNT = 0x0000400a,
102 VM_EXIT_CONTROLS = 0x0000400c,
103 VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
104 VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
105 VM_ENTRY_CONTROLS = 0x00004012,
106 VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
107 VM_ENTRY_INTR_INFO_FIELD = 0x00004016,
108 VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018,
109 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
110 TPR_THRESHOLD = 0x0000401c,
111 SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
112 VM_INSTRUCTION_ERROR = 0x00004400,
113 VM_EXIT_REASON = 0x00004402,
114 VM_EXIT_INTR_INFO = 0x00004404,
115 VM_EXIT_INTR_ERROR_CODE = 0x00004406,
116 IDT_VECTORING_INFO_FIELD = 0x00004408,
117 IDT_VECTORING_ERROR_CODE = 0x0000440a,
118 VM_EXIT_INSTRUCTION_LEN = 0x0000440c,
119 VMX_INSTRUCTION_INFO = 0x0000440e,
120 GUEST_ES_LIMIT = 0x00004800,
121 GUEST_CS_LIMIT = 0x00004802,
122 GUEST_SS_LIMIT = 0x00004804,
123 GUEST_DS_LIMIT = 0x00004806,
124 GUEST_FS_LIMIT = 0x00004808,
125 GUEST_GS_LIMIT = 0x0000480a,
126 GUEST_LDTR_LIMIT = 0x0000480c,
127 GUEST_TR_LIMIT = 0x0000480e,
128 GUEST_GDTR_LIMIT = 0x00004810,
129 GUEST_IDTR_LIMIT = 0x00004812,
130 GUEST_ES_AR_BYTES = 0x00004814,
131 GUEST_CS_AR_BYTES = 0x00004816,
132 GUEST_SS_AR_BYTES = 0x00004818,
133 GUEST_DS_AR_BYTES = 0x0000481a,
134 GUEST_FS_AR_BYTES = 0x0000481c,
135 GUEST_GS_AR_BYTES = 0x0000481e,
136 GUEST_LDTR_AR_BYTES = 0x00004820,
137 GUEST_TR_AR_BYTES = 0x00004822,
138 GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
139 GUEST_ACTIVITY_STATE = 0X00004826,
140 GUEST_SYSENTER_CS = 0x0000482A,
141 HOST_IA32_SYSENTER_CS = 0x00004c00,
142 CR0_GUEST_HOST_MASK = 0x00006000,
143 CR4_GUEST_HOST_MASK = 0x00006002,
144 CR0_READ_SHADOW = 0x00006004,
145 CR4_READ_SHADOW = 0x00006006,
146 CR3_TARGET_VALUE0 = 0x00006008,
147 CR3_TARGET_VALUE1 = 0x0000600a,
148 CR3_TARGET_VALUE2 = 0x0000600c,
149 CR3_TARGET_VALUE3 = 0x0000600e,
150 EXIT_QUALIFICATION = 0x00006400,
151 GUEST_LINEAR_ADDRESS = 0x0000640a,
152 GUEST_CR0 = 0x00006800,
153 GUEST_CR3 = 0x00006802,
154 GUEST_CR4 = 0x00006804,
155 GUEST_ES_BASE = 0x00006806,
156 GUEST_CS_BASE = 0x00006808,
157 GUEST_SS_BASE = 0x0000680a,
158 GUEST_DS_BASE = 0x0000680c,
159 GUEST_FS_BASE = 0x0000680e,
160 GUEST_GS_BASE = 0x00006810,
161 GUEST_LDTR_BASE = 0x00006812,
162 GUEST_TR_BASE = 0x00006814,
163 GUEST_GDTR_BASE = 0x00006816,
164 GUEST_IDTR_BASE = 0x00006818,
165 GUEST_DR7 = 0x0000681a,
166 GUEST_RSP = 0x0000681c,
167 GUEST_RIP = 0x0000681e,
168 GUEST_RFLAGS = 0x00006820,
169 GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
170 GUEST_SYSENTER_ESP = 0x00006824,
171 GUEST_SYSENTER_EIP = 0x00006826,
172 HOST_CR0 = 0x00006c00,
173 HOST_CR3 = 0x00006c02,
174 HOST_CR4 = 0x00006c04,
175 HOST_FS_BASE = 0x00006c06,
176 HOST_GS_BASE = 0x00006c08,
177 HOST_TR_BASE = 0x00006c0a,
178 HOST_GDTR_BASE = 0x00006c0c,
179 HOST_IDTR_BASE = 0x00006c0e,
180 HOST_IA32_SYSENTER_ESP = 0x00006c10,
181 HOST_IA32_SYSENTER_EIP = 0x00006c12,
182 HOST_RSP = 0x00006c14,
183 HOST_RIP = 0x00006c16,
184};
185
186#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
187
188#define EXIT_REASON_EXCEPTION_NMI 0
189#define EXIT_REASON_EXTERNAL_INTERRUPT 1
190#define EXIT_REASON_TRIPLE_FAULT 2
191
192#define EXIT_REASON_PENDING_INTERRUPT 7
193
194#define EXIT_REASON_TASK_SWITCH 9
195#define EXIT_REASON_CPUID 10
196#define EXIT_REASON_HLT 12
197#define EXIT_REASON_INVLPG 14
198#define EXIT_REASON_RDPMC 15
199#define EXIT_REASON_RDTSC 16
200#define EXIT_REASON_VMCALL 18
201#define EXIT_REASON_VMCLEAR 19
202#define EXIT_REASON_VMLAUNCH 20
203#define EXIT_REASON_VMPTRLD 21
204#define EXIT_REASON_VMPTRST 22
205#define EXIT_REASON_VMREAD 23
206#define EXIT_REASON_VMRESUME 24
207#define EXIT_REASON_VMWRITE 25
208#define EXIT_REASON_VMOFF 26
209#define EXIT_REASON_VMON 27
210#define EXIT_REASON_CR_ACCESS 28
211#define EXIT_REASON_DR_ACCESS 29
212#define EXIT_REASON_IO_INSTRUCTION 30
213#define EXIT_REASON_MSR_READ 31
214#define EXIT_REASON_MSR_WRITE 32
215#define EXIT_REASON_MWAIT_INSTRUCTION 36
216#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
217
218/*
219 * Interruption-information format
220 */
221#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
222#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
223#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */
224#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
225
226#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
227#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
228#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK
229#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
230
231#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
232#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
233
234/*
235 * Exit Qualifications for MOV for Control Register Access
236 */
237#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */
238#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
239#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */
240#define LMSW_SOURCE_DATA_SHIFT 16
241#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
242#define REG_EAX (0 << 8)
243#define REG_ECX (1 << 8)
244#define REG_EDX (2 << 8)
245#define REG_EBX (3 << 8)
246#define REG_ESP (4 << 8)
247#define REG_EBP (5 << 8)
248#define REG_ESI (6 << 8)
249#define REG_EDI (7 << 8)
250#define REG_R8 (8 << 8)
251#define REG_R9 (9 << 8)
252#define REG_R10 (10 << 8)
253#define REG_R11 (11 << 8)
254#define REG_R12 (12 << 8)
255#define REG_R13 (13 << 8)
256#define REG_R14 (14 << 8)
257#define REG_R15 (15 << 8)
258
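/*
 * Illustrative sketch only (demo_decode_cr_access is a hypothetical helper,
 * not code from this patch): unpacking a MOV-CR exit qualification with the
 * masks defined above.
 */
static inline void demo_decode_cr_access(unsigned long qual,
					 int *cr, int *type, int *reg)
{
	*cr   = qual & CONTROL_REG_ACCESS_NUM;		/* which CRn */
	*type = (qual & CONTROL_REG_ACCESS_TYPE) >> 4;	/* MOV to/from CR, CLTS, LMSW */
	*reg  = (qual & CONTROL_REG_ACCESS_REG) >> 8;	/* general purpose register */
}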
259/*
260 * Exit Qualifications for MOV for Debug Register Access
261 */
262#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */
263#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
264#define TYPE_MOV_TO_DR (0 << 4)
265#define TYPE_MOV_FROM_DR (1 << 4)
266#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */
267
268
269/* segment AR */
270#define SEGMENT_AR_L_MASK (1 << 13)
271
272#define AR_TYPE_ACCESSES_MASK 1
273#define AR_TYPE_READABLE_MASK (1 << 1)
274#define AR_TYPE_WRITEABLE_MASK (1 << 2)
275#define AR_TYPE_CODE_MASK (1 << 3)
276#define AR_TYPE_MASK 0x0f
277#define AR_TYPE_BUSY_64_TSS 11
278#define AR_TYPE_BUSY_32_TSS 11
279#define AR_TYPE_BUSY_16_TSS 3
280#define AR_TYPE_LDT 2
281
282#define AR_UNUSABLE_MASK (1 << 16)
283#define AR_S_MASK (1 << 4)
284#define AR_P_MASK (1 << 7)
285#define AR_L_MASK (1 << 13)
286#define AR_DB_MASK (1 << 14)
287#define AR_G_MASK (1 << 15)
288#define AR_DPL_SHIFT 5
289#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
290
291#define AR_RESERVD_MASK 0xfffe0f00
292
293#define MSR_IA32_VMX_BASIC 0x480
294#define MSR_IA32_VMX_PINBASED_CTLS 0x481
295#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
296#define MSR_IA32_VMX_EXIT_CTLS 0x483
297#define MSR_IA32_VMX_ENTRY_CTLS 0x484
298#define MSR_IA32_VMX_MISC 0x485
299#define MSR_IA32_VMX_CR0_FIXED0 0x486
300#define MSR_IA32_VMX_CR0_FIXED1 0x487
301#define MSR_IA32_VMX_CR4_FIXED0 0x488
302#define MSR_IA32_VMX_CR4_FIXED1 0x489
303#define MSR_IA32_VMX_VMCS_ENUM 0x48a
304#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
305
306#define MSR_IA32_FEATURE_CONTROL 0x3a
307#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
308#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
309
310#endif
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
deleted file mode 100644
index bd46de6bf891..000000000000
--- a/drivers/kvm/x86_emulate.c
+++ /dev/null
@@ -1,1662 +0,0 @@
1/******************************************************************************
2 * x86_emulate.c
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privileged instructions:
10 *
11 * Copyright (C) 2006 Qumranet
12 *
13 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
20 */
21
22#ifndef __KERNEL__
23#include <stdio.h>
24#include <stdint.h>
25#include <public/xen.h>
26#define DPRINTF(_f, _a ...) printf( _f , ## _a )
27#else
28#include "kvm.h"
29#define DPRINTF(x...) do {} while (0)
30#endif
31#include "x86_emulate.h"
32#include <linux/module.h>
33
34/*
35 * Opcode effective-address decode tables.
36 * Note that we only emulate instructions that have at least one memory
37 * operand (excluding implicit stack references). We assume that stack
38 * references and instruction fetches will never occur in special memory
39 * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
40 * not be handled.
41 */
42
43/* Operand sizes: 8-bit operands or specified/overridden size. */
44#define ByteOp (1<<0) /* 8-bit operands. */
45/* Destination operand type. */
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1)
50/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3)
60/* Generic ModRM decode. */
61#define ModRM (1<<6)
62/* Destination is only written; never read. */
63#define Mov (1<<7)
64#define BitOp (1<<8)
65
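/*
 * Illustrative sketch only (demo_classify is hypothetical, not part of this
 * file): how the decoder further down interprets a table entry.  For example,
 * the entry below for 0x89 (mov r/m,r) is DstMem | SrcReg | ModRM | Mov,
 * telling the decoder to fetch a ModRM byte, take the source from a register
 * and treat the destination as write-only.
 */
static inline void demo_classify(unsigned int d, int *has_modrm,
				 unsigned int *src, unsigned int *dst)
{
	*has_modrm = (d & ModRM) != 0;
	*src = d & SrcMask;	/* SrcReg, SrcMem, SrcImm, ... */
	*dst = d & DstMask;	/* ImplicitOps, DstReg or DstMem */
}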
66static u8 opcode_table[256] = {
67 /* 0x00 - 0x07 */
68 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
69 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
70 0, 0, 0, 0,
71 /* 0x08 - 0x0F */
72 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
73 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
74 0, 0, 0, 0,
75 /* 0x10 - 0x17 */
76 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
77 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
78 0, 0, 0, 0,
79 /* 0x18 - 0x1F */
80 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
81 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
82 0, 0, 0, 0,
83 /* 0x20 - 0x27 */
84 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
85 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
86 SrcImmByte, SrcImm, 0, 0,
87 /* 0x28 - 0x2F */
88 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
89 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
90 0, 0, 0, 0,
91 /* 0x30 - 0x37 */
92 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
93 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
94 0, 0, 0, 0,
95 /* 0x38 - 0x3F */
96 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
97 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
98 0, 0, 0, 0,
99 /* 0x40 - 0x4F */
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 /* 0x50 - 0x57 */
102 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
103 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
104 /* 0x58 - 0x5F */
105 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
106 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
107 /* 0x60 - 0x67 */
108 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
109 0, 0, 0, 0,
110 /* 0x68 - 0x6F */
111 0, 0, ImplicitOps|Mov, 0,
112 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
113 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
114 /* 0x70 - 0x77 */
115 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
116 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
117 /* 0x78 - 0x7F */
118 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
119 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
120 /* 0x80 - 0x87 */
121 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
122 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
123 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
124 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
125 /* 0x88 - 0x8F */
126 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
127 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
128 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
129 /* 0x90 - 0x9F */
130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
131 /* 0xA0 - 0xA7 */
132 ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
133 ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
134 ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
135 ByteOp | ImplicitOps, ImplicitOps,
136 /* 0xA8 - 0xAF */
137 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
138 ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
139 ByteOp | ImplicitOps, ImplicitOps,
140 /* 0xB0 - 0xBF */
141 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
142 /* 0xC0 - 0xC7 */
143 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
144 0, ImplicitOps, 0, 0,
145 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
146 /* 0xC8 - 0xCF */
147 0, 0, 0, 0, 0, 0, 0, 0,
148 /* 0xD0 - 0xD7 */
149 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
150 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
151 0, 0, 0, 0,
152 /* 0xD8 - 0xDF */
153 0, 0, 0, 0, 0, 0, 0, 0,
154 /* 0xE0 - 0xE7 */
155 0, 0, 0, 0, 0, 0, 0, 0,
156 /* 0xE8 - 0xEF */
157 ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
158 /* 0xF0 - 0xF7 */
159 0, 0, 0, 0,
160 ImplicitOps, 0,
161 ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
162 /* 0xF8 - 0xFF */
163 0, 0, 0, 0,
164 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
165};
166
167static u16 twobyte_table[256] = {
168 /* 0x00 - 0x0F */
169 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
170 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
171 /* 0x10 - 0x1F */
172 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
173 /* 0x20 - 0x2F */
174 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0,
176 /* 0x30 - 0x3F */
177 ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178 /* 0x40 - 0x47 */
179 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
180 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
181 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
182 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
183 /* 0x48 - 0x4F */
184 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
185 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
186 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
187 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
188 /* 0x50 - 0x5F */
189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 /* 0x60 - 0x6F */
191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
192 /* 0x70 - 0x7F */
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 /* 0x80 - 0x8F */
195 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
196 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
197 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
198 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
199 /* 0x90 - 0x9F */
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 0xA0 - 0xA7 */
202 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
203 /* 0xA8 - 0xAF */
204 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
205 /* 0xB0 - 0xB7 */
206 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
207 DstMem | SrcReg | ModRM | BitOp,
208 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
209 DstReg | SrcMem16 | ModRM | Mov,
210 /* 0xB8 - 0xBF */
211 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
212 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
213 DstReg | SrcMem16 | ModRM | Mov,
214 /* 0xC0 - 0xCF */
215 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
216 0, 0, 0, 0, 0, 0, 0, 0,
217 /* 0xD0 - 0xDF */
218 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
219 /* 0xE0 - 0xEF */
220 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
221 /* 0xF0 - 0xFF */
222 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
223};
224
225/* Type, address-of, and value of an instruction's operand. */
226struct operand {
227 enum { OP_REG, OP_MEM, OP_IMM } type;
228 unsigned int bytes;
229 unsigned long val, orig_val, *ptr;
230};
231
232/* EFLAGS bit definitions. */
233#define EFLG_OF (1<<11)
234#define EFLG_DF (1<<10)
235#define EFLG_SF (1<<7)
236#define EFLG_ZF (1<<6)
237#define EFLG_AF (1<<4)
238#define EFLG_PF (1<<2)
239#define EFLG_CF (1<<0)
240
241/*
242 * Instruction emulation:
243 * Most instructions are emulated directly via a fragment of inline assembly
244 * code. This allows us to save/restore EFLAGS and thus very easily pick up
245 * any modified flags.
246 */
247
248#if defined(CONFIG_X86_64)
249#define _LO32 "k" /* force 32-bit operand */
250#define _STK "%%rsp" /* stack pointer */
251#elif defined(__i386__)
252#define _LO32 "" /* force 32-bit operand */
253#define _STK "%%esp" /* stack pointer */
254#endif
255
256/*
257 * These EFLAGS bits are restored from saved value during emulation, and
258 * any changes are written back to the saved value after emulation.
259 */
260#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
261
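/*
 * Illustrative sketch only (demo_emulate_add32 is hypothetical, not part of
 * this file): the save/execute/capture idea that the _PRE_EFLAGS,
 * _POST_EFLAGS and emulate_2op_* macros below implement generically, written
 * out for a single 32-bit ADD.  Assumes a GNU-style x86 inline assembler; the
 * real macros additionally load the guest's EFLAGS first so that
 * flag-consuming instructions such as adc behave correctly.
 */
static inline u32 demo_emulate_add32(u32 dst, u32 src, unsigned long *eflags)
{
	unsigned long flags;

	__asm__ __volatile__(
		"addl %2,%1; "	/* run the real instruction on the host */
		"pushf; "	/* capture the flags it produced */
		"pop %0"
		: "=&r" (flags), "+r" (dst)
		: "r" (src)
		: "cc");

	/* fold only the arithmetic flags back into the guest's view */
	*eflags = (*eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
	return dst;
}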
262/* Before executing instruction: restore necessary bits in EFLAGS. */
263#define _PRE_EFLAGS(_sav, _msk, _tmp) \
264 /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \
265 "push %"_sav"; " \
266 "movl %"_msk",%"_LO32 _tmp"; " \
267 "andl %"_LO32 _tmp",("_STK"); " \
268 "pushf; " \
269 "notl %"_LO32 _tmp"; " \
270 "andl %"_LO32 _tmp",("_STK"); " \
271 "pop %"_tmp"; " \
272 "orl %"_LO32 _tmp",("_STK"); " \
273 "popf; " \
274 /* _sav &= ~msk; */ \
275 "movl %"_msk",%"_LO32 _tmp"; " \
276 "notl %"_LO32 _tmp"; " \
277 "andl %"_LO32 _tmp",%"_sav"; "
278
279/* After executing instruction: write-back necessary bits in EFLAGS. */
280#define _POST_EFLAGS(_sav, _msk, _tmp) \
281 /* _sav |= EFLAGS & _msk; */ \
282 "pushf; " \
283 "pop %"_tmp"; " \
284 "andl %"_msk",%"_LO32 _tmp"; " \
285 "orl %"_LO32 _tmp",%"_sav"; "
286
287/* Raw emulation: instruction has two explicit operands. */
288#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
289 do { \
290 unsigned long _tmp; \
291 \
292 switch ((_dst).bytes) { \
293 case 2: \
294 __asm__ __volatile__ ( \
295 _PRE_EFLAGS("0","4","2") \
296 _op"w %"_wx"3,%1; " \
297 _POST_EFLAGS("0","4","2") \
298 : "=m" (_eflags), "=m" ((_dst).val), \
299 "=&r" (_tmp) \
300 : _wy ((_src).val), "i" (EFLAGS_MASK) ); \
301 break; \
302 case 4: \
303 __asm__ __volatile__ ( \
304 _PRE_EFLAGS("0","4","2") \
305 _op"l %"_lx"3,%1; " \
306 _POST_EFLAGS("0","4","2") \
307 : "=m" (_eflags), "=m" ((_dst).val), \
308 "=&r" (_tmp) \
309 : _ly ((_src).val), "i" (EFLAGS_MASK) ); \
310 break; \
311 case 8: \
312 __emulate_2op_8byte(_op, _src, _dst, \
313 _eflags, _qx, _qy); \
314 break; \
315 } \
316 } while (0)
317
318#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
319 do { \
320 unsigned long _tmp; \
321 switch ( (_dst).bytes ) \
322 { \
323 case 1: \
324 __asm__ __volatile__ ( \
325 _PRE_EFLAGS("0","4","2") \
326 _op"b %"_bx"3,%1; " \
327 _POST_EFLAGS("0","4","2") \
328 : "=m" (_eflags), "=m" ((_dst).val), \
329 "=&r" (_tmp) \
330 : _by ((_src).val), "i" (EFLAGS_MASK) ); \
331 break; \
332 default: \
333 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
334 _wx, _wy, _lx, _ly, _qx, _qy); \
335 break; \
336 } \
337 } while (0)
338
339/* Source operand is byte-sized and may be restricted to just %cl. */
340#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
341 __emulate_2op(_op, _src, _dst, _eflags, \
342 "b", "c", "b", "c", "b", "c", "b", "c")
343
344/* Source operand is byte, word, long or quad sized. */
345#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
346 __emulate_2op(_op, _src, _dst, _eflags, \
347 "b", "q", "w", "r", _LO32, "r", "", "r")
348
349/* Source operand is word, long or quad sized. */
350#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
351 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
352 "w", "r", _LO32, "r", "", "r")
353
354/* Instruction has only one explicit operand (no source operand). */
355#define emulate_1op(_op, _dst, _eflags) \
356 do { \
357 unsigned long _tmp; \
358 \
359 switch ( (_dst).bytes ) \
360 { \
361 case 1: \
362 __asm__ __volatile__ ( \
363 _PRE_EFLAGS("0","3","2") \
364 _op"b %1; " \
365 _POST_EFLAGS("0","3","2") \
366 : "=m" (_eflags), "=m" ((_dst).val), \
367 "=&r" (_tmp) \
368 : "i" (EFLAGS_MASK) ); \
369 break; \
370 case 2: \
371 __asm__ __volatile__ ( \
372 _PRE_EFLAGS("0","3","2") \
373 _op"w %1; " \
374 _POST_EFLAGS("0","3","2") \
375 : "=m" (_eflags), "=m" ((_dst).val), \
376 "=&r" (_tmp) \
377 : "i" (EFLAGS_MASK) ); \
378 break; \
379 case 4: \
380 __asm__ __volatile__ ( \
381 _PRE_EFLAGS("0","3","2") \
382 _op"l %1; " \
383 _POST_EFLAGS("0","3","2") \
384 : "=m" (_eflags), "=m" ((_dst).val), \
385 "=&r" (_tmp) \
386 : "i" (EFLAGS_MASK) ); \
387 break; \
388 case 8: \
389 __emulate_1op_8byte(_op, _dst, _eflags); \
390 break; \
391 } \
392 } while (0)
393
394/* Emulate an instruction with quadword operands (x86/64 only). */
395#if defined(CONFIG_X86_64)
396#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
397 do { \
398 __asm__ __volatile__ ( \
399 _PRE_EFLAGS("0","4","2") \
400 _op"q %"_qx"3,%1; " \
401 _POST_EFLAGS("0","4","2") \
402 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
403 : _qy ((_src).val), "i" (EFLAGS_MASK) ); \
404 } while (0)
405
406#define __emulate_1op_8byte(_op, _dst, _eflags) \
407 do { \
408 __asm__ __volatile__ ( \
409 _PRE_EFLAGS("0","3","2") \
410 _op"q %1; " \
411 _POST_EFLAGS("0","3","2") \
412 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
413 : "i" (EFLAGS_MASK) ); \
414 } while (0)
415
416#elif defined(__i386__)
417#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
418#define __emulate_1op_8byte(_op, _dst, _eflags)
419#endif /* __i386__ */
420
421/* Fetch next part of the instruction being emulated. */
422#define insn_fetch(_type, _size, _eip) \
423({ unsigned long _x; \
424 rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \
425 (_size), ctxt->vcpu); \
426 if ( rc != 0 ) \
427 goto done; \
428 (_eip) += (_size); \
429 (_type)_x; \
430})
431
432/* Access/update address held in a register, based on addressing mode. */
433#define address_mask(reg) \
434 ((ad_bytes == sizeof(unsigned long)) ? \
435 (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1)))
436#define register_address(base, reg) \
437 ((base) + address_mask(reg))
438#define register_address_increment(reg, inc) \
439 do { \
440 /* signed type ensures sign extension to long */ \
441 int _inc = (inc); \
442 if ( ad_bytes == sizeof(unsigned long) ) \
443 (reg) += _inc; \
444 else \
445 (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \
446 (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
447 } while (0)
448
449#define JMP_REL(rel) \
450 do { \
451 register_address_increment(_eip, rel); \
452 } while (0)
453
454/*
455 * Given the 'reg' portion of a ModRM byte, and a register block, return a
456 * pointer into the block that addresses the relevant register.
457 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
458 */
459static void *decode_register(u8 modrm_reg, unsigned long *regs,
460 int highbyte_regs)
461{
462 void *p;
463
464 p = &regs[modrm_reg];
465 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
466 p = (unsigned char *)&regs[modrm_reg & 3] + 1;
467 return p;
468}
469
470static int read_descriptor(struct x86_emulate_ctxt *ctxt,
471 struct x86_emulate_ops *ops,
472 void *ptr,
473 u16 *size, unsigned long *address, int op_bytes)
474{
475 int rc;
476
477 if (op_bytes == 2)
478 op_bytes = 3;
479 *address = 0;
480 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
481 ctxt->vcpu);
482 if (rc)
483 return rc;
484 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
485 ctxt->vcpu);
486 return rc;
487}
488
489static int test_cc(unsigned int condition, unsigned int flags)
490{
491 int rc = 0;
492
493 switch ((condition & 15) >> 1) {
494 case 0: /* o */
495 rc |= (flags & EFLG_OF);
496 break;
497 case 1: /* b/c/nae */
498 rc |= (flags & EFLG_CF);
499 break;
500 case 2: /* z/e */
501 rc |= (flags & EFLG_ZF);
502 break;
503 case 3: /* be/na */
504 rc |= (flags & (EFLG_CF|EFLG_ZF));
505 break;
506 case 4: /* s */
507 rc |= (flags & EFLG_SF);
508 break;
509 case 5: /* p/pe */
510 rc |= (flags & EFLG_PF);
511 break;
512 case 7: /* le/ng */
513 rc |= (flags & EFLG_ZF);
514 /* fall through */
515 case 6: /* l/nge */
516 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
517 break;
518 }
519
520 /* Odd condition identifiers (lsb == 1) have inverted sense. */
521 return (!!rc ^ (condition & 1));
522}
523
524int
525x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
526{
527 unsigned d;
528 u8 b, sib, twobyte = 0, rex_prefix = 0;
529 u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
530 unsigned long *override_base = NULL;
531 unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
532 int rc = 0;
533 struct operand src, dst;
534 unsigned long cr2 = ctxt->cr2;
535 int mode = ctxt->mode;
536 unsigned long modrm_ea;
537 int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
538 int no_wb = 0;
539 u64 msr_data;
540
541 /* Shadow copy of register state. Committed on successful emulation. */
542 unsigned long _regs[NR_VCPU_REGS];
543 unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
544 unsigned long modrm_val = 0;
545
546 memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
547
548 switch (mode) {
549 case X86EMUL_MODE_REAL:
550 case X86EMUL_MODE_PROT16:
551 op_bytes = ad_bytes = 2;
552 break;
553 case X86EMUL_MODE_PROT32:
554 op_bytes = ad_bytes = 4;
555 break;
556#ifdef CONFIG_X86_64
557 case X86EMUL_MODE_PROT64:
558 op_bytes = 4;
559 ad_bytes = 8;
560 break;
561#endif
562 default:
563 return -1;
564 }
565
566 /* Legacy prefixes. */
567 for (i = 0; i < 8; i++) {
568 switch (b = insn_fetch(u8, 1, _eip)) {
569 case 0x66: /* operand-size override */
570 op_bytes ^= 6; /* switch between 2/4 bytes */
571 break;
572 case 0x67: /* address-size override */
573 if (mode == X86EMUL_MODE_PROT64)
574 ad_bytes ^= 12; /* switch between 4/8 bytes */
575 else
576 ad_bytes ^= 6; /* switch between 2/4 bytes */
577 break;
578 case 0x2e: /* CS override */
579 override_base = &ctxt->cs_base;
580 break;
581 case 0x3e: /* DS override */
582 override_base = &ctxt->ds_base;
583 break;
584 case 0x26: /* ES override */
585 override_base = &ctxt->es_base;
586 break;
587 case 0x64: /* FS override */
588 override_base = &ctxt->fs_base;
589 break;
590 case 0x65: /* GS override */
591 override_base = &ctxt->gs_base;
592 break;
593 case 0x36: /* SS override */
594 override_base = &ctxt->ss_base;
595 break;
596 case 0xf0: /* LOCK */
597 lock_prefix = 1;
598 break;
599 case 0xf2: /* REPNE/REPNZ */
600 case 0xf3: /* REP/REPE/REPZ */
601 rep_prefix = 1;
602 break;
603 default:
604 goto done_prefixes;
605 }
606 }
607
608done_prefixes:
609
610 /* REX prefix. */
611 if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) {
612 rex_prefix = b;
613 if (b & 8)
614 op_bytes = 8; /* REX.W */
615 modrm_reg = (b & 4) << 1; /* REX.R */
616 index_reg = (b & 2) << 2; /* REX.X */
617 modrm_rm = base_reg = (b & 1) << 3; /* REG.B */
618 b = insn_fetch(u8, 1, _eip);
619 }
620
621 /* Opcode byte(s). */
622 d = opcode_table[b];
623 if (d == 0) {
624 /* Two-byte opcode? */
625 if (b == 0x0f) {
626 twobyte = 1;
627 b = insn_fetch(u8, 1, _eip);
628 d = twobyte_table[b];
629 }
630
631 /* Unrecognised? */
632 if (d == 0)
633 goto cannot_emulate;
634 }
635
636 /* ModRM and SIB bytes. */
637 if (d & ModRM) {
638 modrm = insn_fetch(u8, 1, _eip);
639 modrm_mod |= (modrm & 0xc0) >> 6;
640 modrm_reg |= (modrm & 0x38) >> 3;
641 modrm_rm |= (modrm & 0x07);
642 modrm_ea = 0;
643 use_modrm_ea = 1;
644
645 if (modrm_mod == 3) {
646 modrm_val = *(unsigned long *)
647 decode_register(modrm_rm, _regs, d & ByteOp);
648 goto modrm_done;
649 }
650
651 if (ad_bytes == 2) {
652 unsigned bx = _regs[VCPU_REGS_RBX];
653 unsigned bp = _regs[VCPU_REGS_RBP];
654 unsigned si = _regs[VCPU_REGS_RSI];
655 unsigned di = _regs[VCPU_REGS_RDI];
656
657 /* 16-bit ModR/M decode. */
658 switch (modrm_mod) {
659 case 0:
660 if (modrm_rm == 6)
661 modrm_ea += insn_fetch(u16, 2, _eip);
662 break;
663 case 1:
664 modrm_ea += insn_fetch(s8, 1, _eip);
665 break;
666 case 2:
667 modrm_ea += insn_fetch(u16, 2, _eip);
668 break;
669 }
670 switch (modrm_rm) {
671 case 0:
672 modrm_ea += bx + si;
673 break;
674 case 1:
675 modrm_ea += bx + di;
676 break;
677 case 2:
678 modrm_ea += bp + si;
679 break;
680 case 3:
681 modrm_ea += bp + di;
682 break;
683 case 4:
684 modrm_ea += si;
685 break;
686 case 5:
687 modrm_ea += di;
688 break;
689 case 6:
690 if (modrm_mod != 0)
691 modrm_ea += bp;
692 break;
693 case 7:
694 modrm_ea += bx;
695 break;
696 }
697 if (modrm_rm == 2 || modrm_rm == 3 ||
698 (modrm_rm == 6 && modrm_mod != 0))
699 if (!override_base)
700 override_base = &ctxt->ss_base;
701 modrm_ea = (u16)modrm_ea;
702 } else {
703 /* 32/64-bit ModR/M decode. */
704 switch (modrm_rm) {
705 case 4:
706 case 12:
707 sib = insn_fetch(u8, 1, _eip);
708 index_reg |= (sib >> 3) & 7;
709 base_reg |= sib & 7;
710 scale = sib >> 6;
711
712 switch (base_reg) {
713 case 5:
714 if (modrm_mod != 0)
715 modrm_ea += _regs[base_reg];
716 else
717 modrm_ea += insn_fetch(s32, 4, _eip);
718 break;
719 default:
720 modrm_ea += _regs[base_reg];
721 }
722 switch (index_reg) {
723 case 4:
724 break;
725 default:
726 modrm_ea += _regs[index_reg] << scale;
727
728 }
729 break;
730 case 5:
731 if (modrm_mod != 0)
732 modrm_ea += _regs[modrm_rm];
733 else if (mode == X86EMUL_MODE_PROT64)
734 rip_relative = 1;
735 break;
736 default:
737 modrm_ea += _regs[modrm_rm];
738 break;
739 }
740 switch (modrm_mod) {
741 case 0:
742 if (modrm_rm == 5)
743 modrm_ea += insn_fetch(s32, 4, _eip);
744 break;
745 case 1:
746 modrm_ea += insn_fetch(s8, 1, _eip);
747 break;
748 case 2:
749 modrm_ea += insn_fetch(s32, 4, _eip);
750 break;
751 }
752 }
753 if (!override_base)
754 override_base = &ctxt->ds_base;
755 if (mode == X86EMUL_MODE_PROT64 &&
756 override_base != &ctxt->fs_base &&
757 override_base != &ctxt->gs_base)
758 override_base = NULL;
759
760 if (override_base)
761 modrm_ea += *override_base;
762
763 if (rip_relative) {
764 modrm_ea += _eip;
765 switch (d & SrcMask) {
766 case SrcImmByte:
767 modrm_ea += 1;
768 break;
769 case SrcImm:
770 if (d & ByteOp)
771 modrm_ea += 1;
772 else
773 if (op_bytes == 8)
774 modrm_ea += 4;
775 else
776 modrm_ea += op_bytes;
777 }
778 }
779 if (ad_bytes != 8)
780 modrm_ea = (u32)modrm_ea;
781 cr2 = modrm_ea;
782 modrm_done:
783 ;
784 }
785
786 /*
787 * Decode and fetch the source operand: register, memory
788 * or immediate.
789 */
790 switch (d & SrcMask) {
791 case SrcNone:
792 break;
793 case SrcReg:
794 src.type = OP_REG;
795 if (d & ByteOp) {
796 src.ptr = decode_register(modrm_reg, _regs,
797 (rex_prefix == 0));
798 src.val = src.orig_val = *(u8 *) src.ptr;
799 src.bytes = 1;
800 } else {
801 src.ptr = decode_register(modrm_reg, _regs, 0);
802 switch ((src.bytes = op_bytes)) {
803 case 2:
804 src.val = src.orig_val = *(u16 *) src.ptr;
805 break;
806 case 4:
807 src.val = src.orig_val = *(u32 *) src.ptr;
808 break;
809 case 8:
810 src.val = src.orig_val = *(u64 *) src.ptr;
811 break;
812 }
813 }
814 break;
815 case SrcMem16:
816 src.bytes = 2;
817 goto srcmem_common;
818 case SrcMem32:
819 src.bytes = 4;
820 goto srcmem_common;
821 case SrcMem:
822 src.bytes = (d & ByteOp) ? 1 : op_bytes;
823 /* Don't fetch the address for invlpg: it could be unmapped. */
824 if (twobyte && b == 0x01 && modrm_reg == 7)
825 break;
826 srcmem_common:
827 /*
828 * For instructions with a ModR/M byte, switch to register
829 * access if Mod = 3.
830 */
831 if ((d & ModRM) && modrm_mod == 3) {
832 src.type = OP_REG;
833 break;
834 }
835 src.type = OP_MEM;
836 src.ptr = (unsigned long *)cr2;
837 src.val = 0;
838 if ((rc = ops->read_emulated((unsigned long)src.ptr,
839 &src.val, src.bytes, ctxt->vcpu)) != 0)
840 goto done;
841 src.orig_val = src.val;
842 break;
843 case SrcImm:
844 src.type = OP_IMM;
845 src.ptr = (unsigned long *)_eip;
846 src.bytes = (d & ByteOp) ? 1 : op_bytes;
847 if (src.bytes == 8)
848 src.bytes = 4;
849 /* NB. Immediates are sign-extended as necessary. */
850 switch (src.bytes) {
851 case 1:
852 src.val = insn_fetch(s8, 1, _eip);
853 break;
854 case 2:
855 src.val = insn_fetch(s16, 2, _eip);
856 break;
857 case 4:
858 src.val = insn_fetch(s32, 4, _eip);
859 break;
860 }
861 break;
862 case SrcImmByte:
863 src.type = OP_IMM;
864 src.ptr = (unsigned long *)_eip;
865 src.bytes = 1;
866 src.val = insn_fetch(s8, 1, _eip);
867 break;
868 }
869
870 /* Decode and fetch the destination operand: register or memory. */
871 switch (d & DstMask) {
872 case ImplicitOps:
873 /* Special instructions do their own operand decoding. */
874 goto special_insn;
875 case DstReg:
876 dst.type = OP_REG;
877 if ((d & ByteOp)
878 && !(twobyte && (b == 0xb6 || b == 0xb7))) {
879 dst.ptr = decode_register(modrm_reg, _regs,
880 (rex_prefix == 0));
881 dst.val = *(u8 *) dst.ptr;
882 dst.bytes = 1;
883 } else {
884 dst.ptr = decode_register(modrm_reg, _regs, 0);
885 switch ((dst.bytes = op_bytes)) {
886 case 2:
887 dst.val = *(u16 *)dst.ptr;
888 break;
889 case 4:
890 dst.val = *(u32 *)dst.ptr;
891 break;
892 case 8:
893 dst.val = *(u64 *)dst.ptr;
894 break;
895 }
896 }
897 break;
898 case DstMem:
899 dst.type = OP_MEM;
900 dst.ptr = (unsigned long *)cr2;
901 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
902 dst.val = 0;
903 /*
904 * For instructions with a ModR/M byte, switch to register
905 * access if Mod = 3.
906 */
907 if ((d & ModRM) && modrm_mod == 3) {
908 dst.type = OP_REG;
909 break;
910 }
911 if (d & BitOp) {
912 unsigned long mask = ~(dst.bytes * 8 - 1);
913
914 dst.ptr = (void *)dst.ptr + (src.val & mask) / 8;
915 }
916 if (!(d & Mov) && /* optimisation - avoid slow emulated read */
917 ((rc = ops->read_emulated((unsigned long)dst.ptr,
918 &dst.val, dst.bytes, ctxt->vcpu)) != 0))
919 goto done;
920 break;
921 }
922 dst.orig_val = dst.val;
923
924 if (twobyte)
925 goto twobyte_insn;
926
927 switch (b) {
928 case 0x00 ... 0x05:
929 add: /* add */
930 emulate_2op_SrcV("add", src, dst, _eflags);
931 break;
932 case 0x08 ... 0x0d:
933 or: /* or */
934 emulate_2op_SrcV("or", src, dst, _eflags);
935 break;
936 case 0x10 ... 0x15:
937 adc: /* adc */
938 emulate_2op_SrcV("adc", src, dst, _eflags);
939 break;
940 case 0x18 ... 0x1d:
941 sbb: /* sbb */
942 emulate_2op_SrcV("sbb", src, dst, _eflags);
943 break;
944 case 0x20 ... 0x23:
945 and: /* and */
946 emulate_2op_SrcV("and", src, dst, _eflags);
947 break;
948 case 0x24: /* and al imm8 */
949 dst.type = OP_REG;
950 dst.ptr = &_regs[VCPU_REGS_RAX];
951 dst.val = *(u8 *)dst.ptr;
952 dst.bytes = 1;
953 dst.orig_val = dst.val;
954 goto and;
955 case 0x25: /* and ax imm16, or eax imm32 */
956 dst.type = OP_REG;
957 dst.bytes = op_bytes;
958 dst.ptr = &_regs[VCPU_REGS_RAX];
959 if (op_bytes == 2)
960 dst.val = *(u16 *)dst.ptr;
961 else
962 dst.val = *(u32 *)dst.ptr;
963 dst.orig_val = dst.val;
964 goto and;
965 case 0x28 ... 0x2d:
966 sub: /* sub */
967 emulate_2op_SrcV("sub", src, dst, _eflags);
968 break;
969 case 0x30 ... 0x35:
970 xor: /* xor */
971 emulate_2op_SrcV("xor", src, dst, _eflags);
972 break;
973 case 0x38 ... 0x3d:
974 cmp: /* cmp */
975 emulate_2op_SrcV("cmp", src, dst, _eflags);
976 break;
977 case 0x63: /* movsxd */
978 if (mode != X86EMUL_MODE_PROT64)
979 goto cannot_emulate;
980 dst.val = (s32) src.val;
981 break;
982 case 0x80 ... 0x83: /* Grp1 */
983 switch (modrm_reg) {
984 case 0:
985 goto add;
986 case 1:
987 goto or;
988 case 2:
989 goto adc;
990 case 3:
991 goto sbb;
992 case 4:
993 goto and;
994 case 5:
995 goto sub;
996 case 6:
997 goto xor;
998 case 7:
999 goto cmp;
1000 }
1001 break;
1002 case 0x84 ... 0x85:
1003 test: /* test */
1004 emulate_2op_SrcV("test", src, dst, _eflags);
1005 break;
1006 case 0x86 ... 0x87: /* xchg */
1007 /* Write back the register source. */
1008 switch (dst.bytes) {
1009 case 1:
1010 *(u8 *) src.ptr = (u8) dst.val;
1011 break;
1012 case 2:
1013 *(u16 *) src.ptr = (u16) dst.val;
1014 break;
1015 case 4:
1016 *src.ptr = (u32) dst.val;
1017 break; /* 64b reg: zero-extend */
1018 case 8:
1019 *src.ptr = dst.val;
1020 break;
1021 }
1022 /*
1023 * Write back the memory destination with implicit LOCK
1024 * prefix.
1025 */
1026 dst.val = src.val;
1027 lock_prefix = 1;
1028 break;
1029 case 0x88 ... 0x8b: /* mov */
1030 goto mov;
1031 case 0x8d: /* lea r16/r32, m */
1032 dst.val = modrm_val;
1033 break;
1034 case 0x8f: /* pop (sole member of Grp1a) */
1035 /* 64-bit mode: POP always pops a 64-bit operand. */
1036 if (mode == X86EMUL_MODE_PROT64)
1037 dst.bytes = 8;
1038 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1039 _regs[VCPU_REGS_RSP]),
1040 &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1041 goto done;
1042 register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
1043 break;
1044 case 0xa0 ... 0xa1: /* mov */
1045 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1046 dst.val = src.val;
1047 _eip += ad_bytes; /* skip src displacement */
1048 break;
1049 case 0xa2 ... 0xa3: /* mov */
1050 dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
1051 _eip += ad_bytes; /* skip dst displacement */
1052 break;
1053 case 0xc0 ... 0xc1:
1054 grp2: /* Grp2 */
1055 switch (modrm_reg) {
1056 case 0: /* rol */
1057 emulate_2op_SrcB("rol", src, dst, _eflags);
1058 break;
1059 case 1: /* ror */
1060 emulate_2op_SrcB("ror", src, dst, _eflags);
1061 break;
1062 case 2: /* rcl */
1063 emulate_2op_SrcB("rcl", src, dst, _eflags);
1064 break;
1065 case 3: /* rcr */
1066 emulate_2op_SrcB("rcr", src, dst, _eflags);
1067 break;
1068 case 4: /* sal/shl */
1069 case 6: /* sal/shl */
1070 emulate_2op_SrcB("sal", src, dst, _eflags);
1071 break;
1072 case 5: /* shr */
1073 emulate_2op_SrcB("shr", src, dst, _eflags);
1074 break;
1075 case 7: /* sar */
1076 emulate_2op_SrcB("sar", src, dst, _eflags);
1077 break;
1078 }
1079 break;
1080 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1081 mov:
1082 dst.val = src.val;
1083 break;
1084 case 0xd0 ... 0xd1: /* Grp2 */
1085 src.val = 1;
1086 goto grp2;
1087 case 0xd2 ... 0xd3: /* Grp2 */
1088 src.val = _regs[VCPU_REGS_RCX];
1089 goto grp2;
1090 case 0xf6 ... 0xf7: /* Grp3 */
1091 switch (modrm_reg) {
1092 case 0 ... 1: /* test */
1093 /*
1094 * Special case in Grp3: test has an immediate
1095 * source operand.
1096 */
1097 src.type = OP_IMM;
1098 src.ptr = (unsigned long *)_eip;
1099 src.bytes = (d & ByteOp) ? 1 : op_bytes;
1100 if (src.bytes == 8)
1101 src.bytes = 4;
1102 switch (src.bytes) {
1103 case 1:
1104 src.val = insn_fetch(s8, 1, _eip);
1105 break;
1106 case 2:
1107 src.val = insn_fetch(s16, 2, _eip);
1108 break;
1109 case 4:
1110 src.val = insn_fetch(s32, 4, _eip);
1111 break;
1112 }
1113 goto test;
1114 case 2: /* not */
1115 dst.val = ~dst.val;
1116 break;
1117 case 3: /* neg */
1118 emulate_1op("neg", dst, _eflags);
1119 break;
1120 default:
1121 goto cannot_emulate;
1122 }
1123 break;
1124 case 0xfe ... 0xff: /* Grp4/Grp5 */
1125 switch (modrm_reg) {
1126 case 0: /* inc */
1127 emulate_1op("inc", dst, _eflags);
1128 break;
1129 case 1: /* dec */
1130 emulate_1op("dec", dst, _eflags);
1131 break;
1132 case 4: /* jmp abs */
1133 if (b == 0xff)
1134 _eip = dst.val;
1135 else
1136 goto cannot_emulate;
1137 break;
1138 case 6: /* push */
1139 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1140 if (mode == X86EMUL_MODE_PROT64) {
1141 dst.bytes = 8;
1142 if ((rc = ops->read_std((unsigned long)dst.ptr,
1143 &dst.val, 8,
1144 ctxt->vcpu)) != 0)
1145 goto done;
1146 }
1147 register_address_increment(_regs[VCPU_REGS_RSP],
1148 -dst.bytes);
1149 if ((rc = ops->write_emulated(
1150 register_address(ctxt->ss_base,
1151 _regs[VCPU_REGS_RSP]),
1152 &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1153 goto done;
1154 no_wb = 1;
1155 break;
1156 default:
1157 goto cannot_emulate;
1158 }
1159 break;
1160 }
1161
1162writeback:
1163 if (!no_wb) {
1164 switch (dst.type) {
1165 case OP_REG:
1166 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
1167 switch (dst.bytes) {
1168 case 1:
1169 *(u8 *)dst.ptr = (u8)dst.val;
1170 break;
1171 case 2:
1172 *(u16 *)dst.ptr = (u16)dst.val;
1173 break;
1174 case 4:
1175 *dst.ptr = (u32)dst.val;
1176 break; /* 64b: zero-ext */
1177 case 8:
1178 *dst.ptr = dst.val;
1179 break;
1180 }
1181 break;
1182 case OP_MEM:
1183 if (lock_prefix)
1184 rc = ops->cmpxchg_emulated((unsigned long)dst.
1185 ptr, &dst.orig_val,
1186 &dst.val, dst.bytes,
1187 ctxt->vcpu);
1188 else
1189 rc = ops->write_emulated((unsigned long)dst.ptr,
1190 &dst.val, dst.bytes,
1191 ctxt->vcpu);
1192 if (rc != 0)
1193 goto done;
1194 default:
1195 break;
1196 }
1197 }
1198
1199 /* Commit shadow register state. */
1200 memcpy(ctxt->vcpu->regs, _regs, sizeof _regs);
1201 ctxt->eflags = _eflags;
1202 ctxt->vcpu->rip = _eip;
1203
1204done:
1205 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1206
1207special_insn:
1208 if (twobyte)
1209 goto twobyte_special_insn;
1210 switch(b) {
1211 case 0x50 ... 0x57: /* push reg */
1212 if (op_bytes == 2)
1213 src.val = (u16) _regs[b & 0x7];
1214 else
1215 src.val = (u32) _regs[b & 0x7];
1216 dst.type = OP_MEM;
1217 dst.bytes = op_bytes;
1218 dst.val = src.val;
1219 register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
1220 dst.ptr = (void *) register_address(
1221 ctxt->ss_base, _regs[VCPU_REGS_RSP]);
1222 break;
1223 case 0x58 ... 0x5f: /* pop reg */
1224 dst.ptr = (unsigned long *)&_regs[b & 0x7];
1225 pop_instruction:
1226 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1227 _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
1228 != 0)
1229 goto done;
1230
1231 register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
1232 no_wb = 1; /* Disable writeback. */
1233 break;
1234 case 0x6a: /* push imm8 */
1235 src.val = 0L;
1236 src.val = insn_fetch(s8, 1, _eip);
1237 push:
1238 dst.type = OP_MEM;
1239 dst.bytes = op_bytes;
1240 dst.val = src.val;
1241 register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
1242 dst.ptr = (void *) register_address(ctxt->ss_base,
1243 _regs[VCPU_REGS_RSP]);
1244 break;
1245 case 0x6c: /* insb */
1246 case 0x6d: /* insw/insd */
1247 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1248 1, /* in */
1249 (d & ByteOp) ? 1 : op_bytes, /* size */
1250 rep_prefix ?
1251 address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
1252 (_eflags & EFLG_DF), /* down */
1253 register_address(ctxt->es_base,
1254 _regs[VCPU_REGS_RDI]), /* address */
1255 rep_prefix,
1256 _regs[VCPU_REGS_RDX] /* port */
1257 ) == 0)
1258 return -1;
1259 return 0;
1260 case 0x6e: /* outsb */
1261 case 0x6f: /* outsw/outsd */
1262 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1263 0, /* in */
1264 (d & ByteOp) ? 1 : op_bytes, /* size */
1265 rep_prefix ?
1266 address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
1267 (_eflags & EFLG_DF), /* down */
1268 register_address(override_base ?
1269 *override_base : ctxt->ds_base,
1270 _regs[VCPU_REGS_RSI]), /* address */
1271 rep_prefix,
1272 _regs[VCPU_REGS_RDX] /* port */
1273 ) == 0)
1274 return -1;
1275 return 0;
1276 case 0x70 ... 0x7f: /* jcc (short) */ {
1277 int rel = insn_fetch(s8, 1, _eip);
1278
1279 if (test_cc(b, _eflags))
1280 JMP_REL(rel);
1281 break;
1282 }
1283 case 0x9c: /* pushf */
1284 src.val = (unsigned long) _eflags;
1285 goto push;
1286 case 0x9d: /* popf */
1287 dst.ptr = (unsigned long *) &_eflags;
1288 goto pop_instruction;
1289 case 0xc3: /* ret */
1290 dst.ptr = &_eip;
1291 goto pop_instruction;
1292 case 0xf4: /* hlt */
1293 ctxt->vcpu->halt_request = 1;
1294 goto done;
1295 }
1296 if (rep_prefix) {
1297 if (_regs[VCPU_REGS_RCX] == 0) {
1298 ctxt->vcpu->rip = _eip;
1299 goto done;
1300 }
1301 _regs[VCPU_REGS_RCX]--;
1302 _eip = ctxt->vcpu->rip;
1303 }
1304 switch (b) {
1305 case 0xa4 ... 0xa5: /* movs */
1306 dst.type = OP_MEM;
1307 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1308 dst.ptr = (unsigned long *)register_address(ctxt->es_base,
1309 _regs[VCPU_REGS_RDI]);
1310 if ((rc = ops->read_emulated(register_address(
1311 override_base ? *override_base : ctxt->ds_base,
1312 _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1313 goto done;
1314 register_address_increment(_regs[VCPU_REGS_RSI],
1315 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1316 register_address_increment(_regs[VCPU_REGS_RDI],
1317 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1318 break;
1319 case 0xa6 ... 0xa7: /* cmps */
1320 DPRINTF("Urk! I don't handle CMPS.\n");
1321 goto cannot_emulate;
1322 case 0xaa ... 0xab: /* stos */
1323 dst.type = OP_MEM;
1324 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1325 dst.ptr = (unsigned long *)cr2;
1326 dst.val = _regs[VCPU_REGS_RAX];
1327 register_address_increment(_regs[VCPU_REGS_RDI],
1328 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1329 break;
1330 case 0xac ... 0xad: /* lods */
1331 dst.type = OP_REG;
1332 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1333 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1334 if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
1335 ctxt->vcpu)) != 0)
1336 goto done;
1337 register_address_increment(_regs[VCPU_REGS_RSI],
1338 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1339 break;
1340 case 0xae ... 0xaf: /* scas */
1341 DPRINTF("Urk! I don't handle SCAS.\n");
1342 goto cannot_emulate;
1343 case 0xe8: /* call (near) */ {
1344 long int rel;
1345 switch (op_bytes) {
1346 case 2:
1347 rel = insn_fetch(s16, 2, _eip);
1348 break;
1349 case 4:
1350 rel = insn_fetch(s32, 4, _eip);
1351 break;
1352 case 8:
1353 rel = insn_fetch(s64, 8, _eip);
1354 break;
1355 default:
1356 DPRINTF("Call: Invalid op_bytes\n");
1357 goto cannot_emulate;
1358 }
1359 src.val = (unsigned long) _eip;
1360 JMP_REL(rel);
1361 op_bytes = ad_bytes;
1362 goto push;
1363 }
1364 case 0xe9: /* jmp rel */
1365 case 0xeb: /* jmp rel short */
1366 JMP_REL(src.val);
1367 no_wb = 1; /* Disable writeback. */
1368 break;
1369
1370
1371 }
1372 goto writeback;
1373
1374twobyte_insn:
1375 switch (b) {
1376 case 0x01: /* lgdt, lidt, lmsw */
1377 /* Disable writeback. */
1378 no_wb = 1;
1379 switch (modrm_reg) {
1380 u16 size;
1381 unsigned long address;
1382
1383 case 2: /* lgdt */
1384 rc = read_descriptor(ctxt, ops, src.ptr,
1385 &size, &address, op_bytes);
1386 if (rc)
1387 goto done;
1388 realmode_lgdt(ctxt->vcpu, size, address);
1389 break;
1390 case 3: /* lidt */
1391 rc = read_descriptor(ctxt, ops, src.ptr,
1392 &size, &address, op_bytes);
1393 if (rc)
1394 goto done;
1395 realmode_lidt(ctxt->vcpu, size, address);
1396 break;
1397 case 4: /* smsw */
1398 if (modrm_mod != 3)
1399 goto cannot_emulate;
1400 *(u16 *)&_regs[modrm_rm]
1401 = realmode_get_cr(ctxt->vcpu, 0);
1402 break;
1403 case 6: /* lmsw */
1404 if (modrm_mod != 3)
1405 goto cannot_emulate;
1406 realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags);
1407 break;
1408 case 7: /* invlpg*/
1409 emulate_invlpg(ctxt->vcpu, cr2);
1410 break;
1411 default:
1412 goto cannot_emulate;
1413 }
1414 break;
1415 case 0x21: /* mov from dr to reg */
1416 no_wb = 1;
1417 if (modrm_mod != 3)
1418 goto cannot_emulate;
1419 rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]);
1420 break;
1421 case 0x23: /* mov from reg to dr */
1422 no_wb = 1;
1423 if (modrm_mod != 3)
1424 goto cannot_emulate;
1425 rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]);
1426 break;
1427 case 0x40 ... 0x4f: /* cmov */
1428 dst.val = dst.orig_val = src.val;
1429 no_wb = 1;
1430 /*
1431 * First, assume we're decoding an even cmov opcode
1432 * (lsb == 0).
1433 */
1434 switch ((b & 15) >> 1) {
1435 case 0: /* cmovo */
1436 no_wb = (_eflags & EFLG_OF) ? 0 : 1;
1437 break;
1438 case 1: /* cmovb/cmovc/cmovnae */
1439 no_wb = (_eflags & EFLG_CF) ? 0 : 1;
1440 break;
1441 case 2: /* cmovz/cmove */
1442 no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
1443 break;
1444 case 3: /* cmovbe/cmovna */
1445 no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
1446 break;
1447 case 4: /* cmovs */
1448 no_wb = (_eflags & EFLG_SF) ? 0 : 1;
1449 break;
1450 case 5: /* cmovp/cmovpe */
1451 no_wb = (_eflags & EFLG_PF) ? 0 : 1;
1452 break;
1453 case 7: /* cmovle/cmovng */
1454 no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
1455 /* fall through */
1456 case 6: /* cmovl/cmovnge */
1457 no_wb &= (!(_eflags & EFLG_SF) !=
1458 !(_eflags & EFLG_OF)) ? 0 : 1;
1459 break;
1460 }
1461 /* Odd cmov opcodes (lsb == 1) have inverted sense. */
1462 no_wb ^= b & 1;
1463 break;
1464 case 0xa3:
1465 bt: /* bt */
1466 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1467 emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
1468 break;
1469 case 0xab:
1470 bts: /* bts */
1471 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1472 emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
1473 break;
1474 case 0xb0 ... 0xb1: /* cmpxchg */
1475 /*
1476 * Save real source value, then compare EAX against
1477 * destination.
1478 */
1479 src.orig_val = src.val;
1480 src.val = _regs[VCPU_REGS_RAX];
1481 emulate_2op_SrcV("cmp", src, dst, _eflags);
1482 if (_eflags & EFLG_ZF) {
1483 /* Success: write back to memory. */
1484 dst.val = src.orig_val;
1485 } else {
1486 /* Failure: write the value we saw to EAX. */
1487 dst.type = OP_REG;
1488 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1489 }
1490 break;
1491 case 0xb3:
1492 btr: /* btr */
1493 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1494 emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
1495 break;
1496 case 0xb6 ... 0xb7: /* movzx */
1497 dst.bytes = op_bytes;
1498 dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
1499 break;
1500 case 0xba: /* Grp8 */
1501 switch (modrm_reg & 3) {
1502 case 0:
1503 goto bt;
1504 case 1:
1505 goto bts;
1506 case 2:
1507 goto btr;
1508 case 3:
1509 goto btc;
1510 }
1511 break;
1512 case 0xbb:
1513 btc: /* btc */
1514 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1515 emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
1516 break;
1517 case 0xbe ... 0xbf: /* movsx */
1518 dst.bytes = op_bytes;
1519 dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
1520 break;
1521 case 0xc3: /* movnti */
1522 dst.bytes = op_bytes;
1523 dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
1524 break;
1525 }
1526 goto writeback;
1527
1528twobyte_special_insn:
1529 /* Disable writeback. */
1530 no_wb = 1;
1531 switch (b) {
1532 case 0x06:
1533 emulate_clts(ctxt->vcpu);
1534 break;
1535 case 0x08: /* invd */
1536 break;
1537 case 0x09: /* wbinvd */
1538 break;
1539 case 0x0d: /* GrpP (prefetch) */
1540 case 0x18: /* Grp16 (prefetch/nop) */
1541 break;
1542 case 0x20: /* mov cr, reg */
1543 if (modrm_mod != 3)
1544 goto cannot_emulate;
1545 _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg);
1546 break;
1547 case 0x22: /* mov reg, cr */
1548 if (modrm_mod != 3)
1549 goto cannot_emulate;
1550 realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
1551 break;
1552 case 0x30:
1553 /* wrmsr */
1554 msr_data = (u32)_regs[VCPU_REGS_RAX]
1555 | ((u64)_regs[VCPU_REGS_RDX] << 32);
1556 rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
1557 if (rc) {
1558 kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
1559 _eip = ctxt->vcpu->rip;
1560 }
1561 rc = X86EMUL_CONTINUE;
1562 break;
1563 case 0x32:
1564 /* rdmsr */
1565 rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
1566 if (rc) {
1567 kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
1568 _eip = ctxt->vcpu->rip;
1569 } else {
1570 _regs[VCPU_REGS_RAX] = (u32)msr_data;
1571 _regs[VCPU_REGS_RDX] = msr_data >> 32;
1572 }
1573 rc = X86EMUL_CONTINUE;
1574 break;
1575 case 0x80 ... 0x8f: /* jnz rel, etc*/ {
1576 long int rel;
1577
1578 switch (op_bytes) {
1579 case 2:
1580 rel = insn_fetch(s16, 2, _eip);
1581 break;
1582 case 4:
1583 rel = insn_fetch(s32, 4, _eip);
1584 break;
1585 case 8:
1586 rel = insn_fetch(s64, 8, _eip);
1587 break;
1588 default:
1589 DPRINTF("jnz: Invalid op_bytes\n");
1590 goto cannot_emulate;
1591 }
1592 if (test_cc(b, _eflags))
1593 JMP_REL(rel);
1594 break;
1595 }
1596 case 0xc7: /* Grp9 (cmpxchg8b) */
1597 {
1598 u64 old, new;
1599 if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
1600 != 0)
1601 goto done;
1602 if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
1603 ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
1604 _regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1605 _regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1606 _eflags &= ~EFLG_ZF;
1607 } else {
1608 new = ((u64)_regs[VCPU_REGS_RCX] << 32)
1609 | (u32) _regs[VCPU_REGS_RBX];
1610 if ((rc = ops->cmpxchg_emulated(cr2, &old,
1611 &new, 8, ctxt->vcpu)) != 0)
1612 goto done;
1613 _eflags |= EFLG_ZF;
1614 }
1615 break;
1616 }
1617 }
1618 goto writeback;
1619
1620cannot_emulate:
1621 DPRINTF("Cannot emulate %02x\n", b);
1622 return -1;
1623}
1624
1625#ifdef __XEN__
1626
1627#include <asm/mm.h>
1628#include <asm/uaccess.h>
1629
1630int
1631x86_emulate_read_std(unsigned long addr,
1632 unsigned long *val,
1633 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1634{
1635 unsigned int rc;
1636
1637 *val = 0;
1638
1639 if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) {
1640 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
1641 return X86EMUL_PROPAGATE_FAULT;
1642 }
1643
1644 return X86EMUL_CONTINUE;
1645}
1646
1647int
1648x86_emulate_write_std(unsigned long addr,
1649 unsigned long val,
1650 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1651{
1652 unsigned int rc;
1653
1654 if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) {
1655 propagate_page_fault(addr + bytes - rc, PGERR_write_access);
1656 return X86EMUL_PROPAGATE_FAULT;
1657 }
1658
1659 return X86EMUL_CONTINUE;
1660}
1661
1662#endif
diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h
deleted file mode 100644
index 92c73aa7f9ac..000000000000
--- a/drivers/kvm/x86_emulate.h
+++ /dev/null
@@ -1,155 +0,0 @@
1/******************************************************************************
2 * x86_emulate.h
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
9 */
10
11#ifndef __X86_EMULATE_H__
12#define __X86_EMULATE_H__
13
14struct x86_emulate_ctxt;
15
16/*
17 * x86_emulate_ops:
18 *
19 * These operations represent the instruction emulator's interface to memory.
20 * There are two categories of operation: those that act on ordinary memory
21 * regions (*_std), and those that act on memory regions known to require
22 * special treatment or emulation (*_emulated).
23 *
24 * The emulator assumes that an instruction accesses only one 'emulated memory'
25 * location, that this location is the given linear faulting address (cr2), and
26 * that this is one of the instruction's data operands. Instruction fetches and
27 * stack operations are assumed never to access emulated memory. The emulator
28 * automatically deduces which operand of a string-move operation is accessing
29 * emulated memory, and assumes that the other operand accesses normal memory.
30 *
31 * NOTES:
32 * 1. The emulator isn't very smart about emulated vs. standard memory.
33 * 'Emulated memory' access addresses should be checked for sanity.
34 * 'Normal memory' accesses may fault, and the caller must arrange to
35 * detect and handle reentrancy into the emulator via recursive faults.
36 * Accesses may be unaligned and may cross page boundaries.
37 * 2. If the access fails (cannot emulate, or a standard access faults) then
38 * it is up to the memop to propagate the fault to the guest VM via
39 * some out-of-band mechanism, unknown to the emulator. The memop signals
40 * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
41 * then immediately bail.
42 * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
43 * cmpxchg8b_emulated need support 8-byte accesses.
44 * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
45 */
46/* Access completed successfully: continue emulation as normal. */
47#define X86EMUL_CONTINUE 0
48/* Access is unhandleable: bail from emulation and return error to caller. */
49#define X86EMUL_UNHANDLEABLE 1
50/* Terminate emulation but return success to the caller. */
51#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
52#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
53#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
54struct x86_emulate_ops {
55 /*
56 * read_std: Read bytes of standard (non-emulated/special) memory.
57 * Used for instruction fetch, stack operations, and others.
58 * @addr: [IN ] Linear address from which to read.
59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
60 * @bytes: [IN ] Number of bytes to read from memory.
61 */
62 int (*read_std)(unsigned long addr, void *val,
63 unsigned int bytes, struct kvm_vcpu *vcpu);
64
65 /*
66 * write_std: Write bytes of standard (non-emulated/special) memory.
67 * Used for stack operations, and others.
68 * @addr: [IN ] Linear address to which to write.
69 * @val: [IN ] Value to write to memory (low-order bytes used as
70 * required).
71 * @bytes: [IN ] Number of bytes to write to memory.
72 */
73 int (*write_std)(unsigned long addr, const void *val,
74 unsigned int bytes, struct kvm_vcpu *vcpu);
75
76 /*
77 * read_emulated: Read bytes from emulated/special memory area.
78 * @addr: [IN ] Linear address from which to read.
79 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
80 * @bytes: [IN ] Number of bytes to read from memory.
81 */
82 int (*read_emulated) (unsigned long addr,
83 void *val,
84 unsigned int bytes,
85 struct kvm_vcpu *vcpu);
86
87 /*
88 * write_emulated: Write bytes to emulated/special memory area.
89 * @addr: [IN ] Linear address to which to write.
90 * @val: [IN ] Value to write to memory (low-order bytes used as
91 * required).
92 * @bytes: [IN ] Number of bytes to write to memory.
93 */
94 int (*write_emulated) (unsigned long addr,
95 const void *val,
96 unsigned int bytes,
97 struct kvm_vcpu *vcpu);
98
99 /*
100 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
101 * emulated/special memory area.
102 * @addr: [IN ] Linear address to access.
103 * @old: [IN ] Value expected to be current at @addr.
104 * @new: [IN ] Value to write to @addr.
105 * @bytes: [IN ] Number of bytes to access using CMPXCHG.
106 */
107 int (*cmpxchg_emulated) (unsigned long addr,
108 const void *old,
109 const void *new,
110 unsigned int bytes,
111 struct kvm_vcpu *vcpu);
112
113};
114
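/*
 * Illustrative sketch only -- everything named demo_* below is hypothetical
 * and not part of this header: the smallest useful binding of these ops,
 * backing both the "standard" and the "emulated" accessors with one flat
 * buffer, e.g. for exercising the decoder in a test harness.  Assumes the
 * usual linux/string.h helpers.
 */
static u8 demo_ram[65536];

static int demo_read(unsigned long addr, void *val,
		     unsigned int bytes, struct kvm_vcpu *vcpu)
{
	if (addr + bytes > sizeof(demo_ram))
		return X86EMUL_PROPAGATE_FAULT;
	memcpy(val, demo_ram + addr, bytes);
	return X86EMUL_CONTINUE;
}

static int demo_write(unsigned long addr, const void *val,
		      unsigned int bytes, struct kvm_vcpu *vcpu)
{
	if (addr + bytes > sizeof(demo_ram))
		return X86EMUL_PROPAGATE_FAULT;
	memcpy(demo_ram + addr, val, bytes);
	return X86EMUL_CONTINUE;
}

static int demo_cmpxchg(unsigned long addr, const void *old, const void *new,
			unsigned int bytes, struct kvm_vcpu *vcpu)
{
	/* single-threaded harness: a plain compare-and-copy is enough */
	if (addr + bytes > sizeof(demo_ram))
		return X86EMUL_PROPAGATE_FAULT;
	if (memcmp(demo_ram + addr, old, bytes) == 0)
		memcpy(demo_ram + addr, new, bytes);
	return X86EMUL_CONTINUE;
}

static struct x86_emulate_ops demo_ops = {
	.read_std	  = demo_read,
	.write_std	  = demo_write,
	.read_emulated	  = demo_read,
	.write_emulated	  = demo_write,
	.cmpxchg_emulated = demo_cmpxchg,
};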
115struct x86_emulate_ctxt {
116 /* Register state before/after emulation. */
117 struct kvm_vcpu *vcpu;
118
119 /* Linear faulting address (if emulating a page-faulting instruction). */
120 unsigned long eflags;
121 unsigned long cr2;
122
123 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
124 int mode;
125
126 unsigned long cs_base;
127 unsigned long ds_base;
128 unsigned long es_base;
129 unsigned long ss_base;
130 unsigned long gs_base;
131 unsigned long fs_base;
132};
133
134/* Execution mode, passed to the emulator. */
135#define X86EMUL_MODE_REAL 0 /* Real mode. */
136#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
137#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
138#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
139
140/* Host execution mode. */
141#if defined(__i386__)
142#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
143#elif defined(CONFIG_X86_64)
144#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
145#endif
146
147/*
148 * x86_emulate_memop: Emulate an instruction that faulted attempting to
149 * read/write a 'special' memory area.
150 * Returns -1 on failure, 0 on success.
151 */
152int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
153 struct x86_emulate_ops *ops);
154
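/*
 * Illustrative sketch only (demo_emulate_fault and its arguments are
 * hypothetical, not part of this header): the shape of a call into
 * x86_emulate_memop().  In KVM proper the mode, eflags and segment bases are
 * filled in from the VMCS/VMCB by the arch code before calling in, and the
 * updated ctxt.eflags is written back to the guest afterwards.
 */
static inline int demo_emulate_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
				     unsigned long eflags,
				     unsigned long cs_base,
				     struct x86_emulate_ops *ops)
{
	struct x86_emulate_ctxt ctxt = {
		.vcpu	 = vcpu,
		.eflags	 = eflags,
		.cr2	 = cr2,
		.mode	 = X86EMUL_MODE_PROT32,	/* example guest mode */
		.cs_base = cs_base,
	};

	/* returns -1 on failure, 0 on success */
	return x86_emulate_memop(&ctxt, ops);
}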
155#endif /* __X86_EMULATE_H__ */
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index cb4c67025d52..7743d73768df 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -151,43 +151,43 @@ int lguest_address_ok(const struct lguest *lg,
151/* This routine copies memory from the Guest. Here we can see how useful the 151/* This routine copies memory from the Guest. Here we can see how useful the
152 * kill_lguest() routine we met in the Launcher can be: we return a random 152 * kill_lguest() routine we met in the Launcher can be: we return a random
153 * value (all zeroes) instead of needing to return an error. */ 153 * value (all zeroes) instead of needing to return an error. */
154void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) 154void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
155{ 155{
156 if (!lguest_address_ok(lg, addr, bytes) 156 if (!lguest_address_ok(cpu->lg, addr, bytes)
157 || copy_from_user(b, lg->mem_base + addr, bytes) != 0) { 157 || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
158 /* copy_from_user should do this, but as we rely on it... */ 158 /* copy_from_user should do this, but as we rely on it... */
159 memset(b, 0, bytes); 159 memset(b, 0, bytes);
160 kill_guest(lg, "bad read address %#lx len %u", addr, bytes); 160 kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
161 } 161 }
162} 162}
163 163
164/* This is the write (copy into guest) version. */ 164/* This is the write (copy into guest) version. */
165void __lgwrite(struct lguest *lg, unsigned long addr, const void *b, 165void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
166 unsigned bytes) 166 unsigned bytes)
167{ 167{
168 if (!lguest_address_ok(lg, addr, bytes) 168 if (!lguest_address_ok(cpu->lg, addr, bytes)
169 || copy_to_user(lg->mem_base + addr, b, bytes) != 0) 169 || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
170 kill_guest(lg, "bad write address %#lx len %u", addr, bytes); 170 kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
171} 171}
172/*:*/ 172/*:*/
173 173
174/*H:030 Let's jump straight to the main loop which runs the Guest. 174/*H:030 Let's jump straight to the main loop which runs the Guest.
175 * Remember, this is called by the Launcher reading /dev/lguest, and we keep 175 * Remember, this is called by the Launcher reading /dev/lguest, and we keep
176 * going around and around until something interesting happens. */ 176 * going around and around until something interesting happens. */
177int run_guest(struct lguest *lg, unsigned long __user *user) 177int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
178{ 178{
179 /* We stop running once the Guest is dead. */ 179 /* We stop running once the Guest is dead. */
180 while (!lg->dead) { 180 while (!cpu->lg->dead) {
181 /* First we run any hypercalls the Guest wants done. */ 181 /* First we run any hypercalls the Guest wants done. */
182 if (lg->hcall) 182 if (cpu->hcall)
183 do_hypercalls(lg); 183 do_hypercalls(cpu);
184 184
185 /* It's possible the Guest did a NOTIFY hypercall to the 185 /* It's possible the Guest did a NOTIFY hypercall to the
186 * Launcher, in which case we return from the read() now. */ 186 * Launcher, in which case we return from the read() now. */
187 if (lg->pending_notify) { 187 if (cpu->pending_notify) {
188 if (put_user(lg->pending_notify, user)) 188 if (put_user(cpu->pending_notify, user))
189 return -EFAULT; 189 return -EFAULT;
190 return sizeof(lg->pending_notify); 190 return sizeof(cpu->pending_notify);
191 } 191 }
192 192
193 /* Check for signals */ 193 /* Check for signals */
@@ -195,13 +195,13 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
195 return -ERESTARTSYS; 195 return -ERESTARTSYS;
196 196
197 /* If Waker set break_out, return to Launcher. */ 197 /* If Waker set break_out, return to Launcher. */
198 if (lg->break_out) 198 if (cpu->break_out)
199 return -EAGAIN; 199 return -EAGAIN;
200 200
201 /* Check if there are any interrupts which can be delivered 201 /* Check if there are any interrupts which can be delivered
202 * now: if so, this sets up the handler to be executed when we 202 * now: if so, this sets up the handler to be executed when we
203 * next run the Guest. */ 203 * next run the Guest. */
204 maybe_do_interrupt(lg); 204 maybe_do_interrupt(cpu);
205 205
206 /* All long-lived kernel loops need to check with this horrible 206 /* All long-lived kernel loops need to check with this horrible
207 * thing called the freezer. If the Host is trying to suspend, 207 * thing called the freezer. If the Host is trying to suspend,
@@ -210,12 +210,12 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
210 210
211 /* Just make absolutely sure the Guest is still alive. One of 211 /* Just make absolutely sure the Guest is still alive. One of
212 * those hypercalls could have been fatal, for example. */ 212 * those hypercalls could have been fatal, for example. */
213 if (lg->dead) 213 if (cpu->lg->dead)
214 break; 214 break;
215 215
216 /* If the Guest asked to be stopped, we sleep. The Guest's 216 /* If the Guest asked to be stopped, we sleep. The Guest's
217 * clock timer or LHCALL_BREAK from the Waker will wake us. */ 217 * clock timer or LHCALL_BREAK from the Waker will wake us. */
218 if (lg->halted) { 218 if (cpu->halted) {
219 set_current_state(TASK_INTERRUPTIBLE); 219 set_current_state(TASK_INTERRUPTIBLE);
220 schedule(); 220 schedule();
221 continue; 221 continue;
@@ -226,15 +226,17 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
226 local_irq_disable(); 226 local_irq_disable();
227 227
228 /* Actually run the Guest until something happens. */ 228 /* Actually run the Guest until something happens. */
229 lguest_arch_run_guest(lg); 229 lguest_arch_run_guest(cpu);
230 230
231 /* Now we're ready to be interrupted or moved to other CPUs */ 231 /* Now we're ready to be interrupted or moved to other CPUs */
232 local_irq_enable(); 232 local_irq_enable();
233 233
234 /* Now we deal with whatever happened to the Guest. */ 234 /* Now we deal with whatever happened to the Guest. */
235 lguest_arch_handle_trap(lg); 235 lguest_arch_handle_trap(cpu);
236 } 236 }
237 237
238 if (cpu->lg->dead == ERR_PTR(-ERESTART))
239 return -ERESTART;
238 /* The Guest is dead => "No such file or directory" */ 240 /* The Guest is dead => "No such file or directory" */
239 return -ENOENT; 241 return -ENOENT;
240} 242}
@@ -253,7 +255,7 @@ static int __init init(void)
253 255
254 /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */ 256 /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
255 if (paravirt_enabled()) { 257 if (paravirt_enabled()) {
256 printk("lguest is afraid of %s\n", pv_info.name); 258 printk("lguest is afraid of being a guest\n");
257 return -EPERM; 259 return -EPERM;
258 } 260 }
259 261
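
The return values of run_guest() above are what the Launcher ultimately sees from its read() on /dev/lguest: the size of the notify value after LHCALL_NOTIFY, -EAGAIN when the Waker sets break_out, -ERESTART when the Guest asked to reboot, and -ENOENT once the Guest is dead. A hypothetical Launcher-side loop is sketched below; lguest_fd, handle_notify() and service_break() are illustrative stand-ins (not the real Launcher's code), and the sketch assumes those kernel codes surface as the matching errno values. It compiles on its own but needs a real, initialized lguest fd to do anything.

/* Hypothetical Launcher-side consumer of run_guest()'s return values. */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

static void handle_notify(unsigned long val)   /* value passed to LHCALL_NOTIFY */
{
    printf("NOTIFY %#lx\n", val);
}

static void service_break(void) { /* talk to the Waker here */ }

void drive_guest(int lguest_fd)
{
    for (;;) {
        unsigned long notify;
        ssize_t r = read(lguest_fd, &notify, sizeof(notify));

        if (r == sizeof(notify)) {             /* Guest did LHCALL_NOTIFY */
            handle_notify(notify);
        } else if (r < 0 && errno == EAGAIN) {
            service_break();                   /* Waker asked us to break out */
        } else if (r < 0 && errno == ERESTART) {
            printf("guest asked for a reboot\n");
            break;
        } else {                               /* ENOENT: the Guest is dead */
            printf("guest is gone (errno %d)\n", errno);
            break;
        }
    }
}
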
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index b478affe8f91..0f2cb4fd7c69 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -23,13 +23,14 @@
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/ktime.h>
26#include <asm/page.h> 27#include <asm/page.h>
27#include <asm/pgtable.h> 28#include <asm/pgtable.h>
28#include "lg.h" 29#include "lg.h"
29 30
30/*H:120 This is the core hypercall routine: where the Guest gets what it wants. 31/*H:120 This is the core hypercall routine: where the Guest gets what it wants.
31 * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ 32 * Or gets killed. Or, in the case of LHCALL_CRASH, both. */
32static void do_hcall(struct lguest *lg, struct hcall_args *args) 33static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
33{ 34{
34 switch (args->arg0) { 35 switch (args->arg0) {
35 case LHCALL_FLUSH_ASYNC: 36 case LHCALL_FLUSH_ASYNC:
@@ -39,60 +40,62 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args)
39 case LHCALL_LGUEST_INIT: 40 case LHCALL_LGUEST_INIT:
40 /* You can't get here unless you're already initialized. Don't 41 /* You can't get here unless you're already initialized. Don't
41 * do that. */ 42 * do that. */
42 kill_guest(lg, "already have lguest_data"); 43 kill_guest(cpu, "already have lguest_data");
43 break; 44 break;
44 case LHCALL_CRASH: { 45 case LHCALL_SHUTDOWN: {
45 /* Crash is such a trivial hypercall that we do it in four 46 /* Shutdown is such a trivial hypercall that we do it in four
46 * lines right here. */ 47 * lines right here. */
47 char msg[128]; 48 char msg[128];
48 /* If the lgread fails, it will call kill_guest() itself; the 49 /* If the lgread fails, it will call kill_guest() itself; the
49 * kill_guest() with the message will be ignored. */ 50 * kill_guest() with the message will be ignored. */
50 __lgread(lg, msg, args->arg1, sizeof(msg)); 51 __lgread(cpu, msg, args->arg1, sizeof(msg));
51 msg[sizeof(msg)-1] = '\0'; 52 msg[sizeof(msg)-1] = '\0';
52 kill_guest(lg, "CRASH: %s", msg); 53 kill_guest(cpu, "CRASH: %s", msg);
54 if (args->arg2 == LGUEST_SHUTDOWN_RESTART)
55 cpu->lg->dead = ERR_PTR(-ERESTART);
53 break; 56 break;
54 } 57 }
55 case LHCALL_FLUSH_TLB: 58 case LHCALL_FLUSH_TLB:
56 /* FLUSH_TLB comes in two flavors, depending on the 59 /* FLUSH_TLB comes in two flavors, depending on the
57 * argument: */ 60 * argument: */
58 if (args->arg1) 61 if (args->arg1)
59 guest_pagetable_clear_all(lg); 62 guest_pagetable_clear_all(cpu);
60 else 63 else
61 guest_pagetable_flush_user(lg); 64 guest_pagetable_flush_user(cpu);
62 break; 65 break;
63 66
64 /* All these calls simply pass the arguments through to the right 67 /* All these calls simply pass the arguments through to the right
65 * routines. */ 68 * routines. */
66 case LHCALL_NEW_PGTABLE: 69 case LHCALL_NEW_PGTABLE:
67 guest_new_pagetable(lg, args->arg1); 70 guest_new_pagetable(cpu, args->arg1);
68 break; 71 break;
69 case LHCALL_SET_STACK: 72 case LHCALL_SET_STACK:
70 guest_set_stack(lg, args->arg1, args->arg2, args->arg3); 73 guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
71 break; 74 break;
72 case LHCALL_SET_PTE: 75 case LHCALL_SET_PTE:
73 guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3)); 76 guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
74 break; 77 break;
75 case LHCALL_SET_PMD: 78 case LHCALL_SET_PMD:
76 guest_set_pmd(lg, args->arg1, args->arg2); 79 guest_set_pmd(cpu->lg, args->arg1, args->arg2);
77 break; 80 break;
78 case LHCALL_SET_CLOCKEVENT: 81 case LHCALL_SET_CLOCKEVENT:
79 guest_set_clockevent(lg, args->arg1); 82 guest_set_clockevent(cpu, args->arg1);
80 break; 83 break;
81 case LHCALL_TS: 84 case LHCALL_TS:
82 /* This sets the TS flag, as we saw used in run_guest(). */ 85 /* This sets the TS flag, as we saw used in run_guest(). */
83 lg->ts = args->arg1; 86 cpu->ts = args->arg1;
84 break; 87 break;
85 case LHCALL_HALT: 88 case LHCALL_HALT:
86 /* Similarly, this sets the halted flag for run_guest(). */ 89 /* Similarly, this sets the halted flag for run_guest(). */
87 lg->halted = 1; 90 cpu->halted = 1;
88 break; 91 break;
89 case LHCALL_NOTIFY: 92 case LHCALL_NOTIFY:
90 lg->pending_notify = args->arg1; 93 cpu->pending_notify = args->arg1;
91 break; 94 break;
92 default: 95 default:
93 /* It should be an architecture-specific hypercall. */ 96 /* It should be an architecture-specific hypercall. */
94 if (lguest_arch_do_hcall(lg, args)) 97 if (lguest_arch_do_hcall(cpu, args))
95 kill_guest(lg, "Bad hypercall %li\n", args->arg0); 98 kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
96 } 99 }
97} 100}
98/*:*/ 101/*:*/
@@ -104,13 +107,13 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args)
104 * Guest put them in the ring, but we also promise the Guest that they will 107 * Guest put them in the ring, but we also promise the Guest that they will
105 * happen before any normal hypercall (which is why we check this before 108 * happen before any normal hypercall (which is why we check this before
106 * checking for a normal hcall). */ 109 * checking for a normal hcall). */
107static void do_async_hcalls(struct lguest *lg) 110static void do_async_hcalls(struct lg_cpu *cpu)
108{ 111{
109 unsigned int i; 112 unsigned int i;
110 u8 st[LHCALL_RING_SIZE]; 113 u8 st[LHCALL_RING_SIZE];
111 114
112 /* For simplicity, we copy the entire call status array in at once. */ 115 /* For simplicity, we copy the entire call status array in at once. */
113 if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) 116 if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st)))
114 return; 117 return;
115 118
116 /* We process "struct lguest_data"s hcalls[] ring once. */ 119 /* We process "struct lguest_data"s hcalls[] ring once. */
@@ -119,7 +122,7 @@ static void do_async_hcalls(struct lguest *lg)
119 /* We remember where we were up to from last time. This makes 122 /* We remember where we were up to from last time. This makes
120 * sure that the hypercalls are done in the order the Guest 123 * sure that the hypercalls are done in the order the Guest
121 * places them in the ring. */ 124 * places them in the ring. */
122 unsigned int n = lg->next_hcall; 125 unsigned int n = cpu->next_hcall;
123 126
124 /* 0xFF means there's no call here (yet). */ 127 /* 0xFF means there's no call here (yet). */
125 if (st[n] == 0xFF) 128 if (st[n] == 0xFF)
@@ -127,65 +130,65 @@ static void do_async_hcalls(struct lguest *lg)
127 130
128 /* OK, we have a hypercall. Increment the "next_hcall" cursor, 131 /* OK, we have a hypercall. Increment the "next_hcall" cursor,
129 * and wrap back to 0 if we reach the end. */ 132 * and wrap back to 0 if we reach the end. */
130 if (++lg->next_hcall == LHCALL_RING_SIZE) 133 if (++cpu->next_hcall == LHCALL_RING_SIZE)
131 lg->next_hcall = 0; 134 cpu->next_hcall = 0;
132 135
133 /* Copy the hypercall arguments into a local copy of 136 /* Copy the hypercall arguments into a local copy of
134 * the hcall_args struct. */ 137 * the hcall_args struct. */
135 if (copy_from_user(&args, &lg->lguest_data->hcalls[n], 138 if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
136 sizeof(struct hcall_args))) { 139 sizeof(struct hcall_args))) {
137 kill_guest(lg, "Fetching async hypercalls"); 140 kill_guest(cpu, "Fetching async hypercalls");
138 break; 141 break;
139 } 142 }
140 143
141 /* Do the hypercall, same as a normal one. */ 144 /* Do the hypercall, same as a normal one. */
142 do_hcall(lg, &args); 145 do_hcall(cpu, &args);
143 146
144 /* Mark the hypercall done. */ 147 /* Mark the hypercall done. */
145 if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { 148 if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) {
146 kill_guest(lg, "Writing result for async hypercall"); 149 kill_guest(cpu, "Writing result for async hypercall");
147 break; 150 break;
148 } 151 }
149 152
150 /* Stop doing hypercalls if they want to notify the Launcher: 153 /* Stop doing hypercalls if they want to notify the Launcher:
151 * it needs to service this first. */ 154 * it needs to service this first. */
152 if (lg->pending_notify) 155 if (cpu->pending_notify)
153 break; 156 break;
154 } 157 }
155} 158}
156 159
157/* Last of all, we look at what happens first of all. The very first time the 160/* Last of all, we look at what happens first of all. The very first time the
158 * Guest makes a hypercall, we end up here to set things up: */ 161 * Guest makes a hypercall, we end up here to set things up: */
159static void initialize(struct lguest *lg) 162static void initialize(struct lg_cpu *cpu)
160{ 163{
161 /* You can't do anything until you're initialized. The Guest knows the 164 /* You can't do anything until you're initialized. The Guest knows the
162 * rules, so we're unforgiving here. */ 165 * rules, so we're unforgiving here. */
163 if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) { 166 if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
164 kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0); 167 kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
165 return; 168 return;
166 } 169 }
167 170
168 if (lguest_arch_init_hypercalls(lg)) 171 if (lguest_arch_init_hypercalls(cpu))
169 kill_guest(lg, "bad guest page %p", lg->lguest_data); 172 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
170 173
171 /* The Guest tells us where we're not to deliver interrupts by putting 174 /* The Guest tells us where we're not to deliver interrupts by putting
172 * the range of addresses into "struct lguest_data". */ 175 * the range of addresses into "struct lguest_data". */
173 if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) 176 if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start)
174 || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) 177 || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end))
175 kill_guest(lg, "bad guest page %p", lg->lguest_data); 178 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
176 179
177 /* We write the current time into the Guest's data page once so it can 180 /* We write the current time into the Guest's data page once so it can
178 * set its clock. */ 181 * set its clock. */
179 write_timestamp(lg); 182 write_timestamp(cpu);
180 183
181 /* page_tables.c will also do some setup. */ 184 /* page_tables.c will also do some setup. */
182 page_table_guest_data_init(lg); 185 page_table_guest_data_init(cpu);
183 186
184 /* This is the one case where the above accesses might have been the 187 /* This is the one case where the above accesses might have been the
185 * first write to a Guest page. This may have caused a copy-on-write 188 * first write to a Guest page. This may have caused a copy-on-write
186 * fault, but the old page might be (read-only) in the Guest 189 * fault, but the old page might be (read-only) in the Guest
187 * pagetable. */ 190 * pagetable. */
188 guest_pagetable_clear_all(lg); 191 guest_pagetable_clear_all(cpu);
189} 192}
190 193
191/*H:100 194/*H:100
@@ -194,27 +197,27 @@ static void initialize(struct lguest *lg)
194 * Remember from the Guest, hypercalls come in two flavors: normal and 197 * Remember from the Guest, hypercalls come in two flavors: normal and
195 * asynchronous. This file handles both types. 198 * asynchronous. This file handles both types.
196 */ 199 */
197void do_hypercalls(struct lguest *lg) 200void do_hypercalls(struct lg_cpu *cpu)
198{ 201{
199 /* Not initialized yet? This hypercall must do it. */ 202 /* Not initialized yet? This hypercall must do it. */
200 if (unlikely(!lg->lguest_data)) { 203 if (unlikely(!cpu->lg->lguest_data)) {
201 /* Set up the "struct lguest_data" */ 204 /* Set up the "struct lguest_data" */
202 initialize(lg); 205 initialize(cpu);
203 /* Hcall is done. */ 206 /* Hcall is done. */
204 lg->hcall = NULL; 207 cpu->hcall = NULL;
205 return; 208 return;
206 } 209 }
207 210
208 /* The Guest has initialized. 211 /* The Guest has initialized.
209 * 212 *
210 * Look in the hypercall ring for the async hypercalls: */ 213 * Look in the hypercall ring for the async hypercalls: */
211 do_async_hcalls(lg); 214 do_async_hcalls(cpu);
212 215
213 /* If we stopped reading the hypercall ring because the Guest did a 216 /* If we stopped reading the hypercall ring because the Guest did a
214 * NOTIFY to the Launcher, we want to return now. Otherwise we do 217 * NOTIFY to the Launcher, we want to return now. Otherwise we do
215 * the hypercall. */ 218 * the hypercall. */
216 if (!lg->pending_notify) { 219 if (!cpu->pending_notify) {
217 do_hcall(lg, lg->hcall); 220 do_hcall(cpu, cpu->hcall);
218 /* Tricky point: we reset the hcall pointer to mark the 221 /* Tricky point: we reset the hcall pointer to mark the
219 * hypercall as "done". We use the hcall pointer rather than 222 * hypercall as "done". We use the hcall pointer rather than
220 * the trap number to indicate a hypercall is pending. 223 * the trap number to indicate a hypercall is pending.
@@ -225,16 +228,17 @@ void do_hypercalls(struct lguest *lg)
225 * Launcher, the run_guest() loop will exit without running the 228 * Launcher, the run_guest() loop will exit without running the
226 * Guest. When it comes back it would try to re-run the 229 * Guest. When it comes back it would try to re-run the
227 * hypercall. */ 230 * hypercall. */
228 lg->hcall = NULL; 231 cpu->hcall = NULL;
229 } 232 }
230} 233}
231 234
232/* This routine supplies the Guest with time: it's used for wallclock time at 235/* This routine supplies the Guest with time: it's used for wallclock time at
233 * initial boot and as a rough time source if the TSC isn't available. */ 236 * initial boot and as a rough time source if the TSC isn't available. */
234void write_timestamp(struct lguest *lg) 237void write_timestamp(struct lg_cpu *cpu)
235{ 238{
236 struct timespec now; 239 struct timespec now;
237 ktime_get_real_ts(&now); 240 ktime_get_real_ts(&now);
238 if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec))) 241 if (copy_to_user(&cpu->lg->lguest_data->time,
239 kill_guest(lg, "Writing timestamp"); 242 &now, sizeof(struct timespec)))
243 kill_guest(cpu, "Writing timestamp");
240} 244}
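
The ring-walking logic in do_async_hcalls() above boils down to a small pattern: 0xFF marks an empty slot, and a cursor that persists across calls keeps hypercalls in the order the Guest queued them. Here is a user-space model of just that pattern; names are simplified and the ring size is an assumption rather than a value taken from the lguest headers.

/* Minimal model of the async hypercall ring protocol. */
#include <stdio.h>
#include <string.h>

#define RING_SIZE 64                           /* assumption, not the real constant */

struct hcall_args { unsigned long arg0, arg1, arg2, arg3; };

static struct hcall_args hcalls[RING_SIZE];
static unsigned char status[RING_SIZE];        /* 0xFF == no call here (yet) */
static unsigned int next_hcall;                /* cursor, persists across runs */

static void consume_ring(void (*do_one)(const struct hcall_args *))
{
    unsigned int i;

    for (i = 0; i < RING_SIZE; i++) {
        unsigned int n = next_hcall;

        if (status[n] == 0xFF)                 /* ring is drained */
            break;
        if (++next_hcall == RING_SIZE)         /* advance and wrap the cursor */
            next_hcall = 0;
        do_one(&hcalls[n]);
        status[n] = 0xFF;                      /* hand the slot back */
    }
}

static void print_call(const struct hcall_args *a)
{
    printf("hcall %lu(%lu)\n", a->arg0, a->arg1);
}

int main(void)
{
    memset(status, 0xFF, sizeof(status));      /* start with an empty ring */
    hcalls[0] = (struct hcall_args){ .arg0 = 12, .arg1 = 34 };
    status[0] = 0;                             /* anything != 0xFF means "queued" */
    consume_ring(print_call);
    return 0;
}
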
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 2b66f79c208b..32e97c1858e5 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -41,11 +41,11 @@ static int idt_present(u32 lo, u32 hi)
41 41
42/* We need a helper to "push" a value onto the Guest's stack, since that's a 42/* We need a helper to "push" a value onto the Guest's stack, since that's a
43 * big part of what delivering an interrupt does. */ 43 * big part of what delivering an interrupt does. */
44static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) 44static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
45{ 45{
46 /* Stack grows upwards: move stack then write value. */ 46 /* Stack grows upwards: move stack then write value. */
47 *gstack -= 4; 47 *gstack -= 4;
48 lgwrite(lg, *gstack, u32, val); 48 lgwrite(cpu, *gstack, u32, val);
49} 49}
50 50
51/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or 51/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or
@@ -60,7 +60,7 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
60 * We set up the stack just like the CPU does for a real interrupt, so it's 60 * We set up the stack just like the CPU does for a real interrupt, so it's
61 * identical for the Guest (and the standard "iret" instruction will undo 61 * identical for the Guest (and the standard "iret" instruction will undo
62 * it). */ 62 * it). */
63static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) 63static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err)
64{ 64{
65 unsigned long gstack, origstack; 65 unsigned long gstack, origstack;
66 u32 eflags, ss, irq_enable; 66 u32 eflags, ss, irq_enable;
@@ -69,59 +69,59 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
69 /* There are two cases for interrupts: one where the Guest is already 69 /* There are two cases for interrupts: one where the Guest is already
70 * in the kernel, and a more complex one where the Guest is in 70 * in the kernel, and a more complex one where the Guest is in
71 * userspace. We check the privilege level to find out. */ 71 * userspace. We check the privilege level to find out. */
72 if ((lg->regs->ss&0x3) != GUEST_PL) { 72 if ((cpu->regs->ss&0x3) != GUEST_PL) {
73 /* The Guest told us their kernel stack with the SET_STACK 73 /* The Guest told us their kernel stack with the SET_STACK
74 * hypercall: both the virtual address and the segment */ 74 * hypercall: both the virtual address and the segment */
75 virtstack = lg->esp1; 75 virtstack = cpu->esp1;
76 ss = lg->ss1; 76 ss = cpu->ss1;
77 77
78 origstack = gstack = guest_pa(lg, virtstack); 78 origstack = gstack = guest_pa(cpu, virtstack);
79 /* We push the old stack segment and pointer onto the new 79 /* We push the old stack segment and pointer onto the new
80 * stack: when the Guest does an "iret" back from the interrupt 80 * stack: when the Guest does an "iret" back from the interrupt
81 * handler the CPU will notice they're dropping privilege 81 * handler the CPU will notice they're dropping privilege
82 * levels and expect these here. */ 82 * levels and expect these here. */
83 push_guest_stack(lg, &gstack, lg->regs->ss); 83 push_guest_stack(cpu, &gstack, cpu->regs->ss);
84 push_guest_stack(lg, &gstack, lg->regs->esp); 84 push_guest_stack(cpu, &gstack, cpu->regs->esp);
85 } else { 85 } else {
86 /* We're staying on the same Guest (kernel) stack. */ 86 /* We're staying on the same Guest (kernel) stack. */
87 virtstack = lg->regs->esp; 87 virtstack = cpu->regs->esp;
88 ss = lg->regs->ss; 88 ss = cpu->regs->ss;
89 89
90 origstack = gstack = guest_pa(lg, virtstack); 90 origstack = gstack = guest_pa(cpu, virtstack);
91 } 91 }
92 92
93 /* Remember that we never let the Guest actually disable interrupts, so 93 /* Remember that we never let the Guest actually disable interrupts, so
94 * the "Interrupt Flag" bit is always set. We copy that bit from the 94 * the "Interrupt Flag" bit is always set. We copy that bit from the
95 * Guest's "irq_enabled" field into the eflags word: we saw the Guest 95 * Guest's "irq_enabled" field into the eflags word: we saw the Guest
96 * copy it back in "lguest_iret". */ 96 * copy it back in "lguest_iret". */
97 eflags = lg->regs->eflags; 97 eflags = cpu->regs->eflags;
98 if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 98 if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0
99 && !(irq_enable & X86_EFLAGS_IF)) 99 && !(irq_enable & X86_EFLAGS_IF))
100 eflags &= ~X86_EFLAGS_IF; 100 eflags &= ~X86_EFLAGS_IF;
101 101
102 /* An interrupt is expected to push three things on the stack: the old 102 /* An interrupt is expected to push three things on the stack: the old
103 * "eflags" word, the old code segment, and the old instruction 103 * "eflags" word, the old code segment, and the old instruction
104 * pointer. */ 104 * pointer. */
105 push_guest_stack(lg, &gstack, eflags); 105 push_guest_stack(cpu, &gstack, eflags);
106 push_guest_stack(lg, &gstack, lg->regs->cs); 106 push_guest_stack(cpu, &gstack, cpu->regs->cs);
107 push_guest_stack(lg, &gstack, lg->regs->eip); 107 push_guest_stack(cpu, &gstack, cpu->regs->eip);
108 108
109 /* For the six traps which supply an error code, we push that, too. */ 109 /* For the six traps which supply an error code, we push that, too. */
110 if (has_err) 110 if (has_err)
111 push_guest_stack(lg, &gstack, lg->regs->errcode); 111 push_guest_stack(cpu, &gstack, cpu->regs->errcode);
112 112
113 /* Now we've pushed all the old state, we change the stack, the code 113 /* Now we've pushed all the old state, we change the stack, the code
114 * segment and the address to execute. */ 114 * segment and the address to execute. */
115 lg->regs->ss = ss; 115 cpu->regs->ss = ss;
116 lg->regs->esp = virtstack + (gstack - origstack); 116 cpu->regs->esp = virtstack + (gstack - origstack);
117 lg->regs->cs = (__KERNEL_CS|GUEST_PL); 117 cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
118 lg->regs->eip = idt_address(lo, hi); 118 cpu->regs->eip = idt_address(lo, hi);
119 119
120 /* There are two kinds of interrupt handlers: 0xE is an "interrupt 120 /* There are two kinds of interrupt handlers: 0xE is an "interrupt
121 * gate" which expects interrupts to be disabled on entry. */ 121 * gate" which expects interrupts to be disabled on entry. */
122 if (idt_type(lo, hi) == 0xE) 122 if (idt_type(lo, hi) == 0xE)
123 if (put_user(0, &lg->lguest_data->irq_enabled)) 123 if (put_user(0, &cpu->lg->lguest_data->irq_enabled))
124 kill_guest(lg, "Disabling interrupts"); 124 kill_guest(cpu, "Disabling interrupts");
125} 125}
126 126
127/*H:205 127/*H:205
@@ -129,23 +129,23 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
129 * 129 *
130 * maybe_do_interrupt() gets called before every entry to the Guest, to see if 130 * maybe_do_interrupt() gets called before every entry to the Guest, to see if
131 * we should divert the Guest to running an interrupt handler. */ 131 * we should divert the Guest to running an interrupt handler. */
132void maybe_do_interrupt(struct lguest *lg) 132void maybe_do_interrupt(struct lg_cpu *cpu)
133{ 133{
134 unsigned int irq; 134 unsigned int irq;
135 DECLARE_BITMAP(blk, LGUEST_IRQS); 135 DECLARE_BITMAP(blk, LGUEST_IRQS);
136 struct desc_struct *idt; 136 struct desc_struct *idt;
137 137
138 /* If the Guest hasn't even initialized yet, we can do nothing. */ 138 /* If the Guest hasn't even initialized yet, we can do nothing. */
139 if (!lg->lguest_data) 139 if (!cpu->lg->lguest_data)
140 return; 140 return;
141 141
142 /* Take our "irqs_pending" array and remove any interrupts the Guest 142 /* Take our "irqs_pending" array and remove any interrupts the Guest
143 * wants blocked: the result ends up in "blk". */ 143 * wants blocked: the result ends up in "blk". */
144 if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts, 144 if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
145 sizeof(blk))) 145 sizeof(blk)))
146 return; 146 return;
147 147
148 bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); 148 bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
149 149
150 /* Find the first interrupt. */ 150 /* Find the first interrupt. */
151 irq = find_first_bit(blk, LGUEST_IRQS); 151 irq = find_first_bit(blk, LGUEST_IRQS);
@@ -155,19 +155,20 @@ void maybe_do_interrupt(struct lguest *lg)
155 155
156 /* They may be in the middle of an iret, where they asked us never to 156 /* They may be in the middle of an iret, where they asked us never to
157 * deliver interrupts. */ 157 * deliver interrupts. */
158 if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) 158 if (cpu->regs->eip >= cpu->lg->noirq_start &&
159 (cpu->regs->eip < cpu->lg->noirq_end))
159 return; 160 return;
160 161
161 /* If they're halted, interrupts restart them. */ 162 /* If they're halted, interrupts restart them. */
162 if (lg->halted) { 163 if (cpu->halted) {
163 /* Re-enable interrupts. */ 164 /* Re-enable interrupts. */
164 if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled)) 165 if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled))
165 kill_guest(lg, "Re-enabling interrupts"); 166 kill_guest(cpu, "Re-enabling interrupts");
166 lg->halted = 0; 167 cpu->halted = 0;
167 } else { 168 } else {
168 /* Otherwise we check if they have interrupts disabled. */ 169 /* Otherwise we check if they have interrupts disabled. */
169 u32 irq_enabled; 170 u32 irq_enabled;
170 if (get_user(irq_enabled, &lg->lguest_data->irq_enabled)) 171 if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
171 irq_enabled = 0; 172 irq_enabled = 0;
172 if (!irq_enabled) 173 if (!irq_enabled)
173 return; 174 return;
@@ -176,15 +177,15 @@ void maybe_do_interrupt(struct lguest *lg)
176 /* Look at the IDT entry the Guest gave us for this interrupt. The 177 /* Look at the IDT entry the Guest gave us for this interrupt. The
177 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip 178 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
178 * over them. */ 179 * over them. */
179 idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; 180 idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
180 /* If they don't have a handler (yet?), we just ignore it */ 181 /* If they don't have a handler (yet?), we just ignore it */
181 if (idt_present(idt->a, idt->b)) { 182 if (idt_present(idt->a, idt->b)) {
182 /* OK, mark it no longer pending and deliver it. */ 183 /* OK, mark it no longer pending and deliver it. */
183 clear_bit(irq, lg->irqs_pending); 184 clear_bit(irq, cpu->irqs_pending);
184 /* set_guest_interrupt() takes the interrupt descriptor and a 185 /* set_guest_interrupt() takes the interrupt descriptor and a
185 * flag to say whether this interrupt pushes an error code onto 186 * flag to say whether this interrupt pushes an error code onto
186 * the stack as well: virtual interrupts never do. */ 187 * the stack as well: virtual interrupts never do. */
187 set_guest_interrupt(lg, idt->a, idt->b, 0); 188 set_guest_interrupt(cpu, idt->a, idt->b, 0);
188 } 189 }
189 190
190 /* Every time we deliver an interrupt, we update the timestamp in the 191 /* Every time we deliver an interrupt, we update the timestamp in the
@@ -192,7 +193,7 @@ void maybe_do_interrupt(struct lguest *lg)
192 * did this more often, but it can actually be quite slow: doing it 193 * did this more often, but it can actually be quite slow: doing it
193 * here is a compromise which means at least it gets updated every 194 * here is a compromise which means at least it gets updated every
194 * timer interrupt. */ 195 * timer interrupt. */
195 write_timestamp(lg); 196 write_timestamp(cpu);
196} 197}
197/*:*/ 198/*:*/
198 199
@@ -245,19 +246,19 @@ static int has_err(unsigned int trap)
245} 246}
246 247
247/* deliver_trap() returns true if it could deliver the trap. */ 248/* deliver_trap() returns true if it could deliver the trap. */
248int deliver_trap(struct lguest *lg, unsigned int num) 249int deliver_trap(struct lg_cpu *cpu, unsigned int num)
249{ 250{
250 /* Trap numbers are always 8 bit, but we set an impossible trap number 251 /* Trap numbers are always 8 bit, but we set an impossible trap number
251 * for traps inside the Switcher, so check that here. */ 252 * for traps inside the Switcher, so check that here. */
252 if (num >= ARRAY_SIZE(lg->arch.idt)) 253 if (num >= ARRAY_SIZE(cpu->arch.idt))
253 return 0; 254 return 0;
254 255
255 /* Early on the Guest hasn't set the IDT entries (or maybe it put a 256 /* Early on the Guest hasn't set the IDT entries (or maybe it put a
256 * bogus one in): if we fail here, the Guest will be killed. */ 257 * bogus one in): if we fail here, the Guest will be killed. */
257 if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b)) 258 if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
258 return 0; 259 return 0;
259 set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, 260 set_guest_interrupt(cpu, cpu->arch.idt[num].a,
260 has_err(num)); 261 cpu->arch.idt[num].b, has_err(num));
261 return 1; 262 return 1;
262} 263}
263 264
@@ -309,18 +310,18 @@ static int direct_trap(unsigned int num)
309 * the Guest. 310 * the Guest.
310 * 311 *
311 * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ 312 * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */
312void pin_stack_pages(struct lguest *lg) 313void pin_stack_pages(struct lg_cpu *cpu)
313{ 314{
314 unsigned int i; 315 unsigned int i;
315 316
316 /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or 317 /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or
317 * two pages of stack space. */ 318 * two pages of stack space. */
318 for (i = 0; i < lg->stack_pages; i++) 319 for (i = 0; i < cpu->lg->stack_pages; i++)
319 /* The stack grows *upwards*, so the address we're given is the 320 /* The stack grows *upwards*, so the address we're given is the
320 * start of the page after the kernel stack. Subtract one to 321 * start of the page after the kernel stack. Subtract one to
321 * get back onto the first stack page, and keep subtracting to 322 * get back onto the first stack page, and keep subtracting to
322 * get to the rest of the stack pages. */ 323 * get to the rest of the stack pages. */
323 pin_page(lg, lg->esp1 - 1 - i * PAGE_SIZE); 324 pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE);
324} 325}
325 326
326/* Direct traps also mean that we need to know whenever the Guest wants to use 327/* Direct traps also mean that we need to know whenever the Guest wants to use
@@ -331,21 +332,21 @@ void pin_stack_pages(struct lguest *lg)
331 * 332 *
332 * In Linux each process has its own kernel stack, so this happens a lot: we 333 * In Linux each process has its own kernel stack, so this happens a lot: we
333 * change stacks on each context switch. */ 334 * change stacks on each context switch. */
334void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) 335void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
335{ 336{
336 /* You are not allowed to have a stack segment with privilege level 0: bad 337 /* You are not allowed to have a stack segment with privilege level 0: bad
337 * Guest! */ 338 * Guest! */
338 if ((seg & 0x3) != GUEST_PL) 339 if ((seg & 0x3) != GUEST_PL)
339 kill_guest(lg, "bad stack segment %i", seg); 340 kill_guest(cpu, "bad stack segment %i", seg);
340 /* We only expect one or two stack pages. */ 341 /* We only expect one or two stack pages. */
341 if (pages > 2) 342 if (pages > 2)
342 kill_guest(lg, "bad stack pages %u", pages); 343 kill_guest(cpu, "bad stack pages %u", pages);
343 /* Save where the stack is, and how many pages */ 344 /* Save where the stack is, and how many pages */
344 lg->ss1 = seg; 345 cpu->ss1 = seg;
345 lg->esp1 = esp; 346 cpu->esp1 = esp;
346 lg->stack_pages = pages; 347 cpu->lg->stack_pages = pages;
347 /* Make sure the new stack pages are mapped */ 348 /* Make sure the new stack pages are mapped */
348 pin_stack_pages(lg); 349 pin_stack_pages(cpu);
349} 350}
350 351
351/* All this reference to mapping stacks leads us neatly into the other complex 352/* All this reference to mapping stacks leads us neatly into the other complex
@@ -353,7 +354,7 @@ void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
353 354
354/*H:235 This is the routine which actually checks the Guest's IDT entry and 355/*H:235 This is the routine which actually checks the Guest's IDT entry and
355 * transfers it into the entry in "struct lguest": */ 356 * transfers it into the entry in "struct lguest": */
356static void set_trap(struct lguest *lg, struct desc_struct *trap, 357static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
357 unsigned int num, u32 lo, u32 hi) 358 unsigned int num, u32 lo, u32 hi)
358{ 359{
359 u8 type = idt_type(lo, hi); 360 u8 type = idt_type(lo, hi);
@@ -366,7 +367,7 @@ static void set_trap(struct lguest *lg, struct desc_struct *trap,
366 367
367 /* We only support interrupt and trap gates. */ 368 /* We only support interrupt and trap gates. */
368 if (type != 0xE && type != 0xF) 369 if (type != 0xE && type != 0xF)
369 kill_guest(lg, "bad IDT type %i", type); 370 kill_guest(cpu, "bad IDT type %i", type);
370 371
371 /* We only copy the handler address, present bit, privilege level and 372 /* We only copy the handler address, present bit, privilege level and
372 * type. The privilege level controls where the trap can be triggered 373 * type. The privilege level controls where the trap can be triggered
@@ -383,7 +384,7 @@ static void set_trap(struct lguest *lg, struct desc_struct *trap,
383 * 384 *
384 * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the 385 * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
385 * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ 386 * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */
386void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) 387void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
387{ 388{
388 /* Guest never handles: NMI, doublefault, spurious interrupt or 389 /* Guest never handles: NMI, doublefault, spurious interrupt or
389 * hypercall. We ignore when it tries to set them. */ 390 * hypercall. We ignore when it tries to set them. */
@@ -392,13 +393,13 @@ void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
392 393
393 /* Mark the IDT as changed: next time the Guest runs we'll know we have 394 /* Mark the IDT as changed: next time the Guest runs we'll know we have
394 * to copy this again. */ 395 * to copy this again. */
395 lg->changed |= CHANGED_IDT; 396 cpu->changed |= CHANGED_IDT;
396 397
397 /* Check that the Guest doesn't try to step outside the bounds. */ 398 /* Check that the Guest doesn't try to step outside the bounds. */
398 if (num >= ARRAY_SIZE(lg->arch.idt)) 399 if (num >= ARRAY_SIZE(cpu->arch.idt))
399 kill_guest(lg, "Setting idt entry %u", num); 400 kill_guest(cpu, "Setting idt entry %u", num);
400 else 401 else
401 set_trap(lg, &lg->arch.idt[num], num, lo, hi); 402 set_trap(cpu, &cpu->arch.idt[num], num, lo, hi);
402} 403}
403 404
404/* The default entry for each interrupt points into the Switcher routines which 405/* The default entry for each interrupt points into the Switcher routines which
@@ -434,14 +435,14 @@ void setup_default_idt_entries(struct lguest_ro_state *state,
434/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead 435/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead
435 * we copy them into the IDT which we've set up for Guests on this CPU, just 436 * we copy them into the IDT which we've set up for Guests on this CPU, just
436 * before we run the Guest. This routine does that copy. */ 437 * before we run the Guest. This routine does that copy. */
437void copy_traps(const struct lguest *lg, struct desc_struct *idt, 438void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
438 const unsigned long *def) 439 const unsigned long *def)
439{ 440{
440 unsigned int i; 441 unsigned int i;
441 442
442 /* We can simply copy the direct traps, otherwise we use the default 443 /* We can simply copy the direct traps, otherwise we use the default
443 * ones in the Switcher: they will return to the Host. */ 444 * ones in the Switcher: they will return to the Host. */
444 for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) { 445 for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) {
445 /* If no Guest can ever override this trap, leave it alone. */ 446 /* If no Guest can ever override this trap, leave it alone. */
446 if (!direct_trap(i)) 447 if (!direct_trap(i))
447 continue; 448 continue;
@@ -450,8 +451,8 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
450 * Interrupt gates (type 14) disable interrupts as they are 451 * Interrupt gates (type 14) disable interrupts as they are
451 * entered, which we never let the Guest do. Not present 452 * entered, which we never let the Guest do. Not present
452 * entries (type 0x0) also can't go direct, of course. */ 453 * entries (type 0x0) also can't go direct, of course. */
453 if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF) 454 if (idt_type(cpu->arch.idt[i].a, cpu->arch.idt[i].b) == 0xF)
454 idt[i] = lg->arch.idt[i]; 455 idt[i] = cpu->arch.idt[i];
455 else 456 else
456 /* Reset it to the default. */ 457 /* Reset it to the default. */
457 default_idt_entry(&idt[i], i, def[i]); 458 default_idt_entry(&idt[i], i, def[i]);
@@ -470,13 +471,13 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
470 * infrastructure to set a callback at that time. 471 * infrastructure to set a callback at that time.
471 * 472 *
472 * 0 means "turn off the clock". */ 473 * 0 means "turn off the clock". */
473void guest_set_clockevent(struct lguest *lg, unsigned long delta) 474void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
474{ 475{
475 ktime_t expires; 476 ktime_t expires;
476 477
477 if (unlikely(delta == 0)) { 478 if (unlikely(delta == 0)) {
478 /* Clock event device is shutting down. */ 479 /* Clock event device is shutting down. */
479 hrtimer_cancel(&lg->hrt); 480 hrtimer_cancel(&cpu->hrt);
480 return; 481 return;
481 } 482 }
482 483
@@ -484,25 +485,25 @@ void guest_set_clockevent(struct lguest *lg, unsigned long delta)
484 * all the time between now and the timer interrupt it asked for. This 485 * all the time between now and the timer interrupt it asked for. This
485 * is almost always the right thing to do. */ 486 * is almost always the right thing to do. */
486 expires = ktime_add_ns(ktime_get_real(), delta); 487 expires = ktime_add_ns(ktime_get_real(), delta);
487 hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS); 488 hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS);
488} 489}
489 490
490/* This is the function called when the Guest's timer expires. */ 491/* This is the function called when the Guest's timer expires. */
491static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) 492static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
492{ 493{
493 struct lguest *lg = container_of(timer, struct lguest, hrt); 494 struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
494 495
495 /* Remember the first interrupt is the timer interrupt. */ 496 /* Remember the first interrupt is the timer interrupt. */
496 set_bit(0, lg->irqs_pending); 497 set_bit(0, cpu->irqs_pending);
497 /* If the Guest is actually stopped, we need to wake it up. */ 498 /* If the Guest is actually stopped, we need to wake it up. */
498 if (lg->halted) 499 if (cpu->halted)
499 wake_up_process(lg->tsk); 500 wake_up_process(cpu->tsk);
500 return HRTIMER_NORESTART; 501 return HRTIMER_NORESTART;
501} 502}
502 503
503/* This sets up the timer for this Guest. */ 504/* This sets up the timer for this Guest. */
504void init_clockdev(struct lguest *lg) 505void init_clockdev(struct lg_cpu *cpu)
505{ 506{
506 hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); 507 hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
507 lg->hrt.function = clockdev_fn; 508 cpu->hrt.function = clockdev_fn;
508} 509}
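
set_guest_interrupt() above builds the same frame the CPU would build for a real interrupt: the old ss and esp only when the Guest was at its user privilege level, then eflags, cs and eip, plus an error code for the six traps that supply one, moving the stack pointer before each store just like push_guest_stack(). The toy below models nothing but that frame layout; the segment and address values are made up and a plain array stands in for Guest memory.

/* Toy model of the interrupt frame layout, not the lguest code itself. */
#include <stdint.h>
#include <stdio.h>

static uint32_t stack[64];                 /* stand-in for the Guest stack page */
static size_t sp = sizeof(stack);          /* byte offset of the stack top */

static void push(uint32_t val)
{
    sp -= 4;                               /* move the pointer, then store */
    stack[sp / 4] = val;
}

int main(void)
{
    int from_userspace = 1, has_err = 0;   /* illustrative flags */

    if (from_userspace) {                  /* dropping to the kernel stack: */
        push(0x2b);                        /*   old ss  (made-up selector)  */
        push(0xbfff0000);                  /*   old esp (made-up address)   */
    }
    push(0x202);                           /* eflags, with IF set           */
    push(0x23);                            /* cs (made-up selector)         */
    push(0x08048000);                      /* eip (made-up address)         */
    if (has_err)                           /* six traps also push an error  */
        push(0);

    printf("frame is %zu bytes\n", sizeof(stack) - sp);
    return 0;
}
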
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 86924891b5eb..2337e1a06f02 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -8,6 +8,7 @@
8#include <linux/lguest.h> 8#include <linux/lguest.h>
9#include <linux/lguest_launcher.h> 9#include <linux/lguest_launcher.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hrtimer.h>
11#include <linux/err.h> 12#include <linux/err.h>
12#include <asm/semaphore.h> 13#include <asm/semaphore.h>
13 14
@@ -38,58 +39,72 @@ struct lguest_pages
38#define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */ 39#define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */
39#define CHANGED_ALL 3 40#define CHANGED_ALL 3
40 41
41/* The private info the thread maintains about the guest. */ 42struct lguest;
42struct lguest 43
43{ 44struct lg_cpu {
44 /* At end of a page shared mapped over lguest_pages in guest. */ 45 unsigned int id;
45 unsigned long regs_page; 46 struct lguest *lg;
46 struct lguest_regs *regs;
47 struct lguest_data __user *lguest_data;
48 struct task_struct *tsk; 47 struct task_struct *tsk;
49 struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ 48 struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
50 u32 pfn_limit; 49
51 /* This provides the offset to the base of guest-physical
52 * memory in the Launcher. */
53 void __user *mem_base;
54 unsigned long kernel_address;
55 u32 cr2; 50 u32 cr2;
56 int halted;
57 int ts; 51 int ts;
58 u32 next_hcall;
59 u32 esp1; 52 u32 esp1;
60 u8 ss1; 53 u8 ss1;
61 54
55 /* Bitmap of what has changed: see CHANGED_* above. */
56 int changed;
57
58 unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
59
60 /* At end of a page shared mapped over lguest_pages in guest. */
61 unsigned long regs_page;
62 struct lguest_regs *regs;
63
64 struct lguest_pages *last_pages;
65
66 int cpu_pgd; /* which pgd this cpu is currently using */
67
62 /* If a hypercall was asked for, this points to the arguments. */ 68 /* If a hypercall was asked for, this points to the arguments. */
63 struct hcall_args *hcall; 69 struct hcall_args *hcall;
70 u32 next_hcall;
71
72 /* Virtual clock device */
73 struct hrtimer hrt;
64 74
65 /* Do we need to stop what we're doing and return to userspace? */ 75 /* Do we need to stop what we're doing and return to userspace? */
66 int break_out; 76 int break_out;
67 wait_queue_head_t break_wq; 77 wait_queue_head_t break_wq;
78 int halted;
68 79
69 /* Bitmap of what has changed: see CHANGED_* above. */ 80 /* Pending virtual interrupts */
70 int changed; 81 DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
71 struct lguest_pages *last_pages; 82
83 struct lg_cpu_arch arch;
84};
85
86/* The private info the thread maintains about the guest. */
87struct lguest
88{
89 struct lguest_data __user *lguest_data;
90 struct lg_cpu cpus[NR_CPUS];
91 unsigned int nr_cpus;
92
93 u32 pfn_limit;
94 /* This provides the offset to the base of guest-physical
95 * memory in the Launcher. */
96 void __user *mem_base;
97 unsigned long kernel_address;
72 98
73 /* We keep a small number of these. */
74 u32 pgdidx;
75 struct pgdir pgdirs[4]; 99 struct pgdir pgdirs[4];
76 100
77 unsigned long noirq_start, noirq_end; 101 unsigned long noirq_start, noirq_end;
78 unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
79 102
80 unsigned int stack_pages; 103 unsigned int stack_pages;
81 u32 tsc_khz; 104 u32 tsc_khz;
82 105
83 /* Dead? */ 106 /* Dead? */
84 const char *dead; 107 const char *dead;
85
86 struct lguest_arch arch;
87
88 /* Virtual clock device */
89 struct hrtimer hrt;
90
91 /* Pending virtual interrupts */
92 DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
93}; 108};
94 109
95extern struct mutex lguest_lock; 110extern struct mutex lguest_lock;
@@ -97,26 +112,26 @@ extern struct mutex lguest_lock;
97/* core.c: */ 112/* core.c: */
98int lguest_address_ok(const struct lguest *lg, 113int lguest_address_ok(const struct lguest *lg,
99 unsigned long addr, unsigned long len); 114 unsigned long addr, unsigned long len);
100void __lgread(struct lguest *, void *, unsigned long, unsigned); 115void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
101void __lgwrite(struct lguest *, unsigned long, const void *, unsigned); 116void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
102 117
103/*H:035 Using memory-copy operations like that is usually inconvenient, so we 118/*H:035 Using memory-copy operations like that is usually inconvenient, so we
104 * have the following helper macros which read and write a specific type (often 119 * have the following helper macros which read and write a specific type (often
105 * an unsigned long). 120 * an unsigned long).
106 * 121 *
107 * This reads into a variable of the given type then returns that. */ 122 * This reads into a variable of the given type then returns that. */
108#define lgread(lg, addr, type) \ 123#define lgread(cpu, addr, type) \
109 ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; }) 124 ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
110 125
111/* This checks that the variable is of the given type, then writes it out. */ 126/* This checks that the variable is of the given type, then writes it out. */
112#define lgwrite(lg, addr, type, val) \ 127#define lgwrite(cpu, addr, type, val) \
113 do { \ 128 do { \
114 typecheck(type, val); \ 129 typecheck(type, val); \
115 __lgwrite((lg), (addr), &(val), sizeof(val)); \ 130 __lgwrite((cpu), (addr), &(val), sizeof(val)); \
116 } while(0) 131 } while(0)
117/* (end of memory access helper routines) :*/ 132/* (end of memory access helper routines) :*/
118 133
119int run_guest(struct lguest *lg, unsigned long __user *user); 134int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
120 135
121/* Helper macros to obtain the first 12 or the last 20 bits, this is only the 136/* Helper macros to obtain the first 12 or the last 20 bits, this is only the
122 * first step in the migration to the kernel types. pte_pfn is already defined 137 * first step in the migration to the kernel types. pte_pfn is already defined
@@ -126,52 +141,53 @@ int run_guest(struct lguest *lg, unsigned long __user *user);
126#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) 141#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
127 142
128/* interrupts_and_traps.c: */ 143/* interrupts_and_traps.c: */
129void maybe_do_interrupt(struct lguest *lg); 144void maybe_do_interrupt(struct lg_cpu *cpu);
130int deliver_trap(struct lguest *lg, unsigned int num); 145int deliver_trap(struct lg_cpu *cpu, unsigned int num);
131void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi); 146void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
132void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages); 147 u32 low, u32 hi);
133void pin_stack_pages(struct lguest *lg); 148void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
149void pin_stack_pages(struct lg_cpu *cpu);
134void setup_default_idt_entries(struct lguest_ro_state *state, 150void setup_default_idt_entries(struct lguest_ro_state *state,
135 const unsigned long *def); 151 const unsigned long *def);
136void copy_traps(const struct lguest *lg, struct desc_struct *idt, 152void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
137 const unsigned long *def); 153 const unsigned long *def);
138void guest_set_clockevent(struct lguest *lg, unsigned long delta); 154void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
139void init_clockdev(struct lguest *lg); 155void init_clockdev(struct lg_cpu *cpu);
140bool check_syscall_vector(struct lguest *lg); 156bool check_syscall_vector(struct lguest *lg);
141int init_interrupts(void); 157int init_interrupts(void);
142void free_interrupts(void); 158void free_interrupts(void);
143 159
144/* segments.c: */ 160/* segments.c: */
145void setup_default_gdt_entries(struct lguest_ro_state *state); 161void setup_default_gdt_entries(struct lguest_ro_state *state);
146void setup_guest_gdt(struct lguest *lg); 162void setup_guest_gdt(struct lg_cpu *cpu);
147void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num); 163void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num);
148void guest_load_tls(struct lguest *lg, unsigned long tls_array); 164void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
149void copy_gdt(const struct lguest *lg, struct desc_struct *gdt); 165void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
150void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt); 166void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
151 167
152/* page_tables.c: */ 168/* page_tables.c: */
153int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); 169int init_guest_pagetable(struct lguest *lg, unsigned long pgtable);
154void free_guest_pagetable(struct lguest *lg); 170void free_guest_pagetable(struct lguest *lg);
155void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); 171void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
156void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); 172void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
157void guest_pagetable_clear_all(struct lguest *lg); 173void guest_pagetable_clear_all(struct lg_cpu *cpu);
158void guest_pagetable_flush_user(struct lguest *lg); 174void guest_pagetable_flush_user(struct lg_cpu *cpu);
159void guest_set_pte(struct lguest *lg, unsigned long gpgdir, 175void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
160 unsigned long vaddr, pte_t val); 176 unsigned long vaddr, pte_t val);
161void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); 177void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
162int demand_page(struct lguest *info, unsigned long cr2, int errcode); 178int demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode);
163void pin_page(struct lguest *lg, unsigned long vaddr); 179void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
164unsigned long guest_pa(struct lguest *lg, unsigned long vaddr); 180unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
165void page_table_guest_data_init(struct lguest *lg); 181void page_table_guest_data_init(struct lg_cpu *cpu);
166 182
167/* <arch>/core.c: */ 183/* <arch>/core.c: */
168void lguest_arch_host_init(void); 184void lguest_arch_host_init(void);
169void lguest_arch_host_fini(void); 185void lguest_arch_host_fini(void);
170void lguest_arch_run_guest(struct lguest *lg); 186void lguest_arch_run_guest(struct lg_cpu *cpu);
171void lguest_arch_handle_trap(struct lguest *lg); 187void lguest_arch_handle_trap(struct lg_cpu *cpu);
172int lguest_arch_init_hypercalls(struct lguest *lg); 188int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
173int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args); 189int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
174void lguest_arch_setup_regs(struct lguest *lg, unsigned long start); 190void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
175 191
176/* <arch>/switcher.S: */ 192/* <arch>/switcher.S: */
177extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; 193extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
@@ -181,8 +197,8 @@ int lguest_device_init(void);
181void lguest_device_remove(void); 197void lguest_device_remove(void);
182 198
183/* hypercalls.c: */ 199/* hypercalls.c: */
184void do_hypercalls(struct lguest *lg); 200void do_hypercalls(struct lg_cpu *cpu);
185void write_timestamp(struct lguest *lg); 201void write_timestamp(struct lg_cpu *cpu);
186 202
187/*L:035 203/*L:035
188 * Let's step aside for the moment, to study one important routine that's used 204 * Let's step aside for the moment, to study one important routine that's used
@@ -208,12 +224,12 @@ void write_timestamp(struct lguest *lg);
208 * Like any macro which uses an "if", it is safely wrapped in a run-once "do { 224 * Like any macro which uses an "if", it is safely wrapped in a run-once "do {
209 * } while(0)". 225 * } while(0)".
210 */ 226 */
211#define kill_guest(lg, fmt...) \ 227#define kill_guest(cpu, fmt...) \
212do { \ 228do { \
213 if (!(lg)->dead) { \ 229 if (!(cpu)->lg->dead) { \
214 (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \ 230 (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \
215 if (!(lg)->dead) \ 231 if (!(cpu)->lg->dead) \
216 (lg)->dead = ERR_PTR(-ENOMEM); \ 232 (cpu)->lg->dead = ERR_PTR(-ENOMEM); \
217 } \ 233 } \
218} while(0) 234} while(0)
219/* (End of aside) :*/ 235/* (End of aside) :*/
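
The lg.h change above is the heart of this series: per-virtual-CPU state (registers, pending interrupts, the clock timer, the current hypercall) moves into struct lg_cpu, while struct lguest keeps only VM-wide state plus a cpus[] array and nr_cpus, and anything fatal on one vcpu reaches the whole guest through the cpu->lg back-pointer. A minimal stand-alone model of that ownership pattern follows; NR_CPUS and kill_guest() here are simplified stand-ins for the kernel versions.

/* Stand-alone model of the lguest/lg_cpu split. */
#include <stdio.h>

#define NR_CPUS 4                          /* stand-in for the kernel's NR_CPUS */

struct lguest;                             /* VM-wide state, defined below */

struct lg_cpu {                            /* per-virtual-CPU state */
    unsigned int id;
    struct lguest *lg;                     /* back-pointer to the owning guest */
    int halted;
};

struct lguest {
    const char *dead;                      /* why the whole guest died, if it did */
    unsigned int nr_cpus;
    struct lg_cpu cpus[NR_CPUS];
};

/* Anything fatal on one vcpu kills the whole guest via the back-pointer,
 * which is what the reworked kill_guest() macro relies on. */
static void kill_guest(struct lg_cpu *cpu, const char *why)
{
    if (!cpu->lg->dead)
        cpu->lg->dead = why;
}

int main(void)
{
    struct lguest lg = { .nr_cpus = 1 };
    lg.cpus[0] = (struct lg_cpu){ .id = 0, .lg = &lg };

    kill_guest(&lg.cpus[0], "bad hypercall");
    printf("guest dead: %s\n", lg.dead ? lg.dead : "no");
    return 0;
}
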
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 3b92a61ba8d2..85d42d3d01a9 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -6,6 +6,7 @@
6#include <linux/uaccess.h> 6#include <linux/uaccess.h>
7#include <linux/miscdevice.h> 7#include <linux/miscdevice.h>
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/sched.h>
9#include "lg.h" 10#include "lg.h"
10 11
11/*L:055 When something happens, the Waker process needs a way to stop the 12/*L:055 When something happens, the Waker process needs a way to stop the
@@ -13,7 +14,7 @@
13 * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher 14 * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher
14 * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release 15 * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
15 * the Waker. */ 16 * the Waker. */
16static int break_guest_out(struct lguest *lg, const unsigned long __user *input) 17static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
17{ 18{
18 unsigned long on; 19 unsigned long on;
19 20
@@ -22,21 +23,21 @@ static int break_guest_out(struct lguest *lg, const unsigned long __user *input)
22 return -EFAULT; 23 return -EFAULT;
23 24
24 if (on) { 25 if (on) {
25 lg->break_out = 1; 26 cpu->break_out = 1;
26 /* Pop it out of the Guest (may be running on different CPU) */ 27 /* Pop it out of the Guest (may be running on different CPU) */
27 wake_up_process(lg->tsk); 28 wake_up_process(cpu->tsk);
28 /* Wait for them to reset it */ 29 /* Wait for them to reset it */
29 return wait_event_interruptible(lg->break_wq, !lg->break_out); 30 return wait_event_interruptible(cpu->break_wq, !cpu->break_out);
30 } else { 31 } else {
31 lg->break_out = 0; 32 cpu->break_out = 0;
32 wake_up(&lg->break_wq); 33 wake_up(&cpu->break_wq);
33 return 0; 34 return 0;
34 } 35 }
35} 36}
36 37
37/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 38/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
38 * number to /dev/lguest. */ 39 * number to /dev/lguest. */
39static int user_send_irq(struct lguest *lg, const unsigned long __user *input) 40static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
40{ 41{
41 unsigned long irq; 42 unsigned long irq;
42 43
@@ -46,7 +47,7 @@ static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
46 return -EINVAL; 47 return -EINVAL;
47 /* Next time the Guest runs, the core code will see if it can deliver 48 /* Next time the Guest runs, the core code will see if it can deliver
48 * this interrupt. */ 49 * this interrupt. */
49 set_bit(irq, lg->irqs_pending); 50 set_bit(irq, cpu->irqs_pending);
50 return 0; 51 return 0;
51} 52}
52 53
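For context, here is a hedged sketch of the Launcher/Waker side of the two requests above, in userspace C. It assumes the LHREQ_* codes are available from linux/lguest_launcher.h and that "fd" is an open /dev/lguest descriptor; the kernel handlers above return 0 on success rather than a byte count, so only an error return is checked:

#include <unistd.h>
#include <linux/lguest_launcher.h>

/* Waker: write LHREQ_BREAK 1 to pop the Guest out and hold it,
 * LHREQ_BREAK 0 to release it again (see break_guest_out() above). */
static int break_guest(int fd, int stop)
{
        unsigned long buf[] = { LHREQ_BREAK, stop ? 1 : 0 };

        return write(fd, buf, sizeof(buf)) < 0 ? -1 : 0;
}

/* Launcher: queue interrupt "irq" for delivery next time the Guest runs
 * (see user_send_irq() above). */
static int send_guest_irq(int fd, unsigned long irq)
{
        unsigned long buf[] = { LHREQ_IRQ, irq };

        return write(fd, buf, sizeof(buf)) < 0 ? -1 : 0;
}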
@@ -55,13 +56,21 @@ static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
55static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) 56static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
56{ 57{
57 struct lguest *lg = file->private_data; 58 struct lguest *lg = file->private_data;
59 struct lg_cpu *cpu;
60 unsigned int cpu_id = *o;
58 61
59 /* You must write LHREQ_INITIALIZE first! */ 62 /* You must write LHREQ_INITIALIZE first! */
60 if (!lg) 63 if (!lg)
61 return -EINVAL; 64 return -EINVAL;
62 65
66 /* Watch out for arbitrary vcpu indexes! */
67 if (cpu_id >= lg->nr_cpus)
68 return -EINVAL;
69
70 cpu = &lg->cpus[cpu_id];
71
63 /* If you're not the task which owns the Guest, go away. */ 72 /* If you're not the task which owns the Guest, go away. */
64 if (current != lg->tsk) 73 if (current != cpu->tsk)
65 return -EPERM; 74 return -EPERM;
66 75
67 /* If the guest is already dead, we indicate why */ 76 /* If the guest is already dead, we indicate why */
@@ -81,11 +90,53 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
81 90
82 /* If we returned from read() last time because the Guest notified, 91 /* If we returned from read() last time because the Guest notified,
83 * clear the flag. */ 92 * clear the flag. */
84 if (lg->pending_notify) 93 if (cpu->pending_notify)
85 lg->pending_notify = 0; 94 cpu->pending_notify = 0;
86 95
87 /* Run the Guest until something interesting happens. */ 96 /* Run the Guest until something interesting happens. */
88 return run_guest(lg, (unsigned long __user *)user); 97 return run_guest(cpu, (unsigned long __user *)user);
98}
99
100static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
101{
102 if (id >= NR_CPUS)
103 return -EINVAL;
104
105 cpu->id = id;
106 cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
107 cpu->lg->nr_cpus++;
108 init_clockdev(cpu);
109
110 /* We need a complete page for the Guest registers: they are accessible
111 * to the Guest and we can only grant it access to whole pages. */
112 cpu->regs_page = get_zeroed_page(GFP_KERNEL);
113 if (!cpu->regs_page)
114 return -ENOMEM;
115
116 /* We actually put the registers at the bottom of the page. */
117 cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
118
119 /* Now we initialize the Guest's registers, handing it the start
120 * address. */
121 lguest_arch_setup_regs(cpu, start_ip);
122
123 /* Initialize the queue for the waker to wait on */
124 init_waitqueue_head(&cpu->break_wq);
125
126 /* We keep a pointer to the Launcher task (ie. current task) for when
127 * other Guests want to wake this one (inter-Guest I/O). */
128 cpu->tsk = current;
129
130 /* We need to keep a pointer to the Launcher's memory map, because if
131 * the Launcher dies we need to clean it up. If we don't keep a
132 * reference, it is destroyed before close() is called. */
133 cpu->mm = get_task_mm(cpu->tsk);
134
135 /* We remember which CPU's pages this Guest used last, for optimization
136 * when the same Guest runs on the same CPU twice. */
137 cpu->last_pages = NULL;
138
139 return 0;
89} 140}
90 141
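One consequence of the changes above is that the vcpu to run (or poke) comes from the file position, "cpu_id = *o" in read() and "*off" in write() further down. A hedged userspace sketch of what that seems to imply for a multi-vcpu Launcher, with the buffer contents left unspecified since run_guest() decides what actually comes back:

#include <unistd.h>

/* Run vcpu "cpu_id" until the Host has something to report.  pread()'s
 * offset argument becomes *o in the kernel's read() above, so it selects
 * the vcpu; a plain read()/write() at offset 0 keeps targeting vcpu 0. */
static ssize_t run_vcpu(int fd, unsigned int cpu_id, void *status, size_t len)
{
        return pread(fd, status, len, cpu_id);
}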
91/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit) 142/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit)
@@ -134,15 +185,10 @@ static int initialize(struct file *file, const unsigned long __user *input)
134 lg->mem_base = (void __user *)(long)args[0]; 185 lg->mem_base = (void __user *)(long)args[0];
135 lg->pfn_limit = args[1]; 186 lg->pfn_limit = args[1];
136 187
137 /* We need a complete page for the Guest registers: they are accessible 188 /* This is the first cpu */
138 * to the Guest and we can only grant it access to whole pages. */ 189 err = lg_cpu_start(&lg->cpus[0], 0, args[3]);
139 lg->regs_page = get_zeroed_page(GFP_KERNEL); 190 if (err)
140 if (!lg->regs_page) {
141 err = -ENOMEM;
142 goto release_guest; 191 goto release_guest;
143 }
144 /* We actually put the registers at the bottom of the page. */
145 lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs);
146 192
147 /* Initialize the Guest's shadow page tables, using the toplevel 193 /* Initialize the Guest's shadow page tables, using the toplevel
148 * address the Launcher gave us. This allocates memory, so can 194 * address the Launcher gave us. This allocates memory, so can
@@ -151,28 +197,6 @@ static int initialize(struct file *file, const unsigned long __user *input)
151 if (err) 197 if (err)
152 goto free_regs; 198 goto free_regs;
153 199
154 /* Now we initialize the Guest's registers, handing it the start
155 * address. */
156 lguest_arch_setup_regs(lg, args[3]);
157
158 /* The timer for lguest's clock needs initialization. */
159 init_clockdev(lg);
160
161 /* We keep a pointer to the Launcher task (ie. current task) for when
162 * other Guests want to wake this one (inter-Guest I/O). */
163 lg->tsk = current;
164 /* We need to keep a pointer to the Launcher's memory map, because if
165 * the Launcher dies we need to clean it up. If we don't keep a
166 * reference, it is destroyed before close() is called. */
167 lg->mm = get_task_mm(lg->tsk);
168
169 /* Initialize the queue for the waker to wait on */
170 init_waitqueue_head(&lg->break_wq);
171
172 /* We remember which CPU's pages this Guest used last, for optimization
173 * when the same Guest runs on the same CPU twice. */
174 lg->last_pages = NULL;
175
176 /* We keep our "struct lguest" in the file's private_data. */ 200 /* We keep our "struct lguest" in the file's private_data. */
177 file->private_data = lg; 201 file->private_data = lg;
178 202
@@ -182,7 +206,8 @@ static int initialize(struct file *file, const unsigned long __user *input)
182 return sizeof(args); 206 return sizeof(args);
183 207
184free_regs: 208free_regs:
185 free_page(lg->regs_page); 209 /* FIXME: This should be in free_vcpu */
210 free_page(lg->cpus[0].regs_page);
186release_guest: 211release_guest:
187 kfree(lg); 212 kfree(lg);
188unlock: 213unlock:
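Putting this hunk together: the LHREQ_INITIALIZE payload is the request code followed by four pointer-sized values, of which args[0] is the Guest memory base, args[1] the page-frame limit and args[3] the start address handed to lg_cpu_start(); args[2] is consumed outside the visible context, presumably as the toplevel page-table address the shadow page-table comment above refers to. A sketch of the Launcher side under those assumptions, again taking the LHREQ_* codes from linux/lguest_launcher.h:

#include <unistd.h>
#include <linux/lguest_launcher.h>

static int init_guest(int fd, unsigned long mem_base, unsigned long pfn_limit,
                      unsigned long pgdir, unsigned long start_ip)
{
        /* Request code first, then the four values initialize() reads. */
        unsigned long buf[] = {
                LHREQ_INITIALIZE, mem_base, pfn_limit, pgdir, start_ip
        };

        return write(fd, buf, sizeof(buf)) < 0 ? -1 : 0;
}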
@@ -202,30 +227,37 @@ static ssize_t write(struct file *file, const char __user *in,
202 struct lguest *lg = file->private_data; 227 struct lguest *lg = file->private_data;
203 const unsigned long __user *input = (const unsigned long __user *)in; 228 const unsigned long __user *input = (const unsigned long __user *)in;
204 unsigned long req; 229 unsigned long req;
230 struct lg_cpu *uninitialized_var(cpu);
231 unsigned int cpu_id = *off;
205 232
206 if (get_user(req, input) != 0) 233 if (get_user(req, input) != 0)
207 return -EFAULT; 234 return -EFAULT;
208 input++; 235 input++;
209 236
210 /* If you haven't initialized, you must do that first. */ 237 /* If you haven't initialized, you must do that first. */
211 if (req != LHREQ_INITIALIZE && !lg) 238 if (req != LHREQ_INITIALIZE) {
212 return -EINVAL; 239 if (!lg || (cpu_id >= lg->nr_cpus))
240 return -EINVAL;
241 cpu = &lg->cpus[cpu_id];
242 if (!cpu)
243 return -EINVAL;
244 }
213 245
214 /* Once the Guest is dead, all you can do is read() why it died. */ 246 /* Once the Guest is dead, all you can do is read() why it died. */
215 if (lg && lg->dead) 247 if (lg && lg->dead)
216 return -ENOENT; 248 return -ENOENT;
217 249
218 /* If you're not the task which owns the Guest, you can only break */ 250 /* If you're not the task which owns the Guest, you can only break */
219 if (lg && current != lg->tsk && req != LHREQ_BREAK) 251 if (lg && current != cpu->tsk && req != LHREQ_BREAK)
220 return -EPERM; 252 return -EPERM;
221 253
222 switch (req) { 254 switch (req) {
223 case LHREQ_INITIALIZE: 255 case LHREQ_INITIALIZE:
224 return initialize(file, input); 256 return initialize(file, input);
225 case LHREQ_IRQ: 257 case LHREQ_IRQ:
226 return user_send_irq(lg, input); 258 return user_send_irq(cpu, input);
227 case LHREQ_BREAK: 259 case LHREQ_BREAK:
228 return break_guest_out(lg, input); 260 return break_guest_out(cpu, input);
229 default: 261 default:
230 return -EINVAL; 262 return -EINVAL;
231 } 263 }
@@ -241,6 +273,7 @@ static ssize_t write(struct file *file, const char __user *in,
241static int close(struct inode *inode, struct file *file) 273static int close(struct inode *inode, struct file *file)
242{ 274{
243 struct lguest *lg = file->private_data; 275 struct lguest *lg = file->private_data;
276 unsigned int i;
244 277
245 /* If we never successfully initialized, there's nothing to clean up */ 278 /* If we never successfully initialized, there's nothing to clean up */
246 if (!lg) 279 if (!lg)
@@ -249,19 +282,23 @@ static int close(struct inode *inode, struct file *file)
249 /* We need the big lock, to protect from inter-guest I/O and other 282 /* We need the big lock, to protect from inter-guest I/O and other
250 * Launchers initializing guests. */ 283 * Launchers initializing guests. */
251 mutex_lock(&lguest_lock); 284 mutex_lock(&lguest_lock);
252 /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ 285
253 hrtimer_cancel(&lg->hrt);
254 /* Free up the shadow page tables for the Guest. */ 286 /* Free up the shadow page tables for the Guest. */
255 free_guest_pagetable(lg); 287 free_guest_pagetable(lg);
256 /* Now all the memory cleanups are done, it's safe to release the 288
257 * Launcher's memory management structure. */ 289 for (i = 0; i < lg->nr_cpus; i++) {
258 mmput(lg->mm); 290 /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
291 hrtimer_cancel(&lg->cpus[i].hrt);
292 /* We can free up the register page we allocated. */
293 free_page(lg->cpus[i].regs_page);
294 /* Now all the memory cleanups are done, it's safe to release
295 * the Launcher's memory management structure. */
296 mmput(lg->cpus[i].mm);
297 }
259 /* If lg->dead doesn't contain an error code it will be NULL or a 298 /* If lg->dead doesn't contain an error code it will be NULL or a
260 * kmalloc()ed string, either of which is ok to hand to kfree(). */ 299 * kmalloc()ed string, either of which is ok to hand to kfree(). */
261 if (!IS_ERR(lg->dead)) 300 if (!IS_ERR(lg->dead))
262 kfree(lg->dead); 301 kfree(lg->dead);
263 /* We can free up the register page we allocated. */
264 free_page(lg->regs_page);
265 /* We clear the entire structure, which also marks it as free for the 302 /* We clear the entire structure, which also marks it as free for the
266 * next user. */ 303 * next user. */
267 memset(lg, 0, sizeof(*lg)); 304 memset(lg, 0, sizeof(*lg));
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index fffabb327157..74b4cf2a6c41 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -68,23 +68,23 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
68 * page directory entry (PGD) for that address. Since we keep track of several 68 * page directory entry (PGD) for that address. Since we keep track of several
69 * page tables, the "i" argument tells us which one we're interested in (it's 69 * page tables, the "i" argument tells us which one we're interested in (it's
70 * usually the current one). */ 70 * usually the current one). */
71static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) 71static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
72{ 72{
73 unsigned int index = pgd_index(vaddr); 73 unsigned int index = pgd_index(vaddr);
74 74
75 /* We kill any Guest trying to touch the Switcher addresses. */ 75 /* We kill any Guest trying to touch the Switcher addresses. */
76 if (index >= SWITCHER_PGD_INDEX) { 76 if (index >= SWITCHER_PGD_INDEX) {
77 kill_guest(lg, "attempt to access switcher pages"); 77 kill_guest(cpu, "attempt to access switcher pages");
78 index = 0; 78 index = 0;
79 } 79 }
 80 /* Return a pointer to the index'th pgd entry for the i'th page table. */ 80 /* Return a pointer to the index'th pgd entry for the i'th page table. */
81 return &lg->pgdirs[i].pgdir[index]; 81 return &cpu->lg->pgdirs[i].pgdir[index];
82} 82}
83 83
84/* This routine then takes the page directory entry returned above, which 84/* This routine then takes the page directory entry returned above, which
85 * contains the address of the page table entry (PTE) page. It then returns a 85 * contains the address of the page table entry (PTE) page. It then returns a
86 * pointer to the PTE entry for the given address. */ 86 * pointer to the PTE entry for the given address. */
87static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) 87static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr)
88{ 88{
89 pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); 89 pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
90 /* You should never call this if the PGD entry wasn't valid */ 90 /* You should never call this if the PGD entry wasn't valid */
@@ -94,14 +94,13 @@ static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
94 94
95/* These two functions are just like the above two, except they access the Guest 95/* These two functions are just like the above two, except they access the Guest
96 * page tables. Hence they return a Guest address. */ 96 * page tables. Hence they return a Guest address. */
97static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) 97static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
98{ 98{
99 unsigned int index = vaddr >> (PGDIR_SHIFT); 99 unsigned int index = vaddr >> (PGDIR_SHIFT);
100 return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t); 100 return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
101} 101}
102 102
103static unsigned long gpte_addr(struct lguest *lg, 103static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
104 pgd_t gpgd, unsigned long vaddr)
105{ 104{
106 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; 105 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
107 BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); 106 BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
@@ -138,7 +137,7 @@ static unsigned long get_pfn(unsigned long virtpfn, int write)
138 * entry can be a little tricky. The flags are (almost) the same, but the 137 * entry can be a little tricky. The flags are (almost) the same, but the
139 * Guest PTE contains a virtual page number: the CPU needs the real page 138 * Guest PTE contains a virtual page number: the CPU needs the real page
140 * number. */ 139 * number. */
141static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) 140static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
142{ 141{
143 unsigned long pfn, base, flags; 142 unsigned long pfn, base, flags;
144 143
@@ -149,7 +148,7 @@ static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
149 flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); 148 flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
150 149
151 /* The Guest's pages are offset inside the Launcher. */ 150 /* The Guest's pages are offset inside the Launcher. */
152 base = (unsigned long)lg->mem_base / PAGE_SIZE; 151 base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
153 152
154 /* We need a temporary "unsigned long" variable to hold the answer from 153 /* We need a temporary "unsigned long" variable to hold the answer from
155 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't 154 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
@@ -157,7 +156,7 @@ static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
157 * page, given the virtual number. */ 156 * page, given the virtual number. */
158 pfn = get_pfn(base + pte_pfn(gpte), write); 157 pfn = get_pfn(base + pte_pfn(gpte), write);
159 if (pfn == -1UL) { 158 if (pfn == -1UL) {
160 kill_guest(lg, "failed to get page %lu", pte_pfn(gpte)); 159 kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
161 /* When we destroy the Guest, we'll go through the shadow page 160 /* When we destroy the Guest, we'll go through the shadow page
162 * tables and release_pte() them. Make sure we don't think 161 * tables and release_pte() them. Make sure we don't think
163 * this one is valid! */ 162 * this one is valid! */
@@ -177,17 +176,18 @@ static void release_pte(pte_t pte)
177} 176}
178/*:*/ 177/*:*/
179 178
180static void check_gpte(struct lguest *lg, pte_t gpte) 179static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
181{ 180{
182 if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) 181 if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE))
183 || pte_pfn(gpte) >= lg->pfn_limit) 182 || pte_pfn(gpte) >= cpu->lg->pfn_limit)
184 kill_guest(lg, "bad page table entry"); 183 kill_guest(cpu, "bad page table entry");
185} 184}
186 185
187static void check_gpgd(struct lguest *lg, pgd_t gpgd) 186static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
188{ 187{
189 if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit) 188 if ((pgd_flags(gpgd) & ~_PAGE_TABLE) ||
190 kill_guest(lg, "bad page directory entry"); 189 (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
190 kill_guest(cpu, "bad page directory entry");
191} 191}
192 192
193/*H:330 193/*H:330
@@ -200,7 +200,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd)
200 * 200 *
201 * If we fixed up the fault (ie. we mapped the address), this routine returns 201 * If we fixed up the fault (ie. we mapped the address), this routine returns
202 * true. Otherwise, it was a real fault and we need to tell the Guest. */ 202 * true. Otherwise, it was a real fault and we need to tell the Guest. */
203int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) 203int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
204{ 204{
205 pgd_t gpgd; 205 pgd_t gpgd;
206 pgd_t *spgd; 206 pgd_t *spgd;
@@ -209,24 +209,24 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
209 pte_t *spte; 209 pte_t *spte;
210 210
211 /* First step: get the top-level Guest page table entry. */ 211 /* First step: get the top-level Guest page table entry. */
212 gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); 212 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
213 /* Toplevel not present? We can't map it in. */ 213 /* Toplevel not present? We can't map it in. */
214 if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) 214 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
215 return 0; 215 return 0;
216 216
217 /* Now look at the matching shadow entry. */ 217 /* Now look at the matching shadow entry. */
218 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 218 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
219 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 219 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
220 /* No shadow entry: allocate a new shadow PTE page. */ 220 /* No shadow entry: allocate a new shadow PTE page. */
221 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 221 unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
222 /* This is not really the Guest's fault, but killing it is 222 /* This is not really the Guest's fault, but killing it is
223 * simple for this corner case. */ 223 * simple for this corner case. */
224 if (!ptepage) { 224 if (!ptepage) {
225 kill_guest(lg, "out of memory allocating pte page"); 225 kill_guest(cpu, "out of memory allocating pte page");
226 return 0; 226 return 0;
227 } 227 }
228 /* We check that the Guest pgd is OK. */ 228 /* We check that the Guest pgd is OK. */
229 check_gpgd(lg, gpgd); 229 check_gpgd(cpu, gpgd);
230 /* And we copy the flags to the shadow PGD entry. The page 230 /* And we copy the flags to the shadow PGD entry. The page
231 * number in the shadow PGD is the page we just allocated. */ 231 * number in the shadow PGD is the page we just allocated. */
232 *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); 232 *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
@@ -234,8 +234,8 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
234 234
235 /* OK, now we look at the lower level in the Guest page table: keep its 235 /* OK, now we look at the lower level in the Guest page table: keep its
236 * address, because we might update it later. */ 236 * address, because we might update it later. */
237 gpte_ptr = gpte_addr(lg, gpgd, vaddr); 237 gpte_ptr = gpte_addr(gpgd, vaddr);
238 gpte = lgread(lg, gpte_ptr, pte_t); 238 gpte = lgread(cpu, gpte_ptr, pte_t);
239 239
240 /* If this page isn't in the Guest page tables, we can't page it in. */ 240 /* If this page isn't in the Guest page tables, we can't page it in. */
241 if (!(pte_flags(gpte) & _PAGE_PRESENT)) 241 if (!(pte_flags(gpte) & _PAGE_PRESENT))
@@ -252,7 +252,7 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
252 252
253 /* Check that the Guest PTE flags are OK, and the page number is below 253 /* Check that the Guest PTE flags are OK, and the page number is below
254 * the pfn_limit (ie. not mapping the Launcher binary). */ 254 * the pfn_limit (ie. not mapping the Launcher binary). */
255 check_gpte(lg, gpte); 255 check_gpte(cpu, gpte);
256 256
257 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 257 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
258 gpte = pte_mkyoung(gpte); 258 gpte = pte_mkyoung(gpte);
@@ -260,7 +260,7 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
260 gpte = pte_mkdirty(gpte); 260 gpte = pte_mkdirty(gpte);
261 261
262 /* Get the pointer to the shadow PTE entry we're going to set. */ 262 /* Get the pointer to the shadow PTE entry we're going to set. */
263 spte = spte_addr(lg, *spgd, vaddr); 263 spte = spte_addr(*spgd, vaddr);
264 /* If there was a valid shadow PTE entry here before, we release it. 264 /* If there was a valid shadow PTE entry here before, we release it.
265 * This can happen with a write to a previously read-only entry. */ 265 * This can happen with a write to a previously read-only entry. */
266 release_pte(*spte); 266 release_pte(*spte);
@@ -268,17 +268,17 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
268 /* If this is a write, we insist that the Guest page is writable (the 268 /* If this is a write, we insist that the Guest page is writable (the
269 * final arg to gpte_to_spte()). */ 269 * final arg to gpte_to_spte()). */
270 if (pte_dirty(gpte)) 270 if (pte_dirty(gpte))
271 *spte = gpte_to_spte(lg, gpte, 1); 271 *spte = gpte_to_spte(cpu, gpte, 1);
272 else 272 else
273 /* If this is a read, don't set the "writable" bit in the page 273 /* If this is a read, don't set the "writable" bit in the page
274 * table entry, even if the Guest says it's writable. That way 274 * table entry, even if the Guest says it's writable. That way
275 * we will come back here when a write does actually occur, so 275 * we will come back here when a write does actually occur, so
276 * we can update the Guest's _PAGE_DIRTY flag. */ 276 * we can update the Guest's _PAGE_DIRTY flag. */
277 *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); 277 *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0);
278 278
279 /* Finally, we write the Guest PTE entry back: we've set the 279 /* Finally, we write the Guest PTE entry back: we've set the
280 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 280 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
281 lgwrite(lg, gpte_ptr, pte_t, gpte); 281 lgwrite(cpu, gpte_ptr, pte_t, gpte);
282 282
283 /* The fault is fixed, the page table is populated, the mapping 283 /* The fault is fixed, the page table is populated, the mapping
284 * manipulated, the result returned and the code complete. A small 284 * manipulated, the result returned and the code complete. A small
@@ -297,19 +297,19 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
297 * 297 *
298 * This is a quick version which answers the question: is this virtual address 298 * This is a quick version which answers the question: is this virtual address
299 * mapped by the shadow page tables, and is it writable? */ 299 * mapped by the shadow page tables, and is it writable? */
300static int page_writable(struct lguest *lg, unsigned long vaddr) 300static int page_writable(struct lg_cpu *cpu, unsigned long vaddr)
301{ 301{
302 pgd_t *spgd; 302 pgd_t *spgd;
303 unsigned long flags; 303 unsigned long flags;
304 304
305 /* Look at the current top level entry: is it present? */ 305 /* Look at the current top level entry: is it present? */
306 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 306 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
307 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) 307 if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
308 return 0; 308 return 0;
309 309
310 /* Check the flags on the pte entry itself: it must be present and 310 /* Check the flags on the pte entry itself: it must be present and
311 * writable. */ 311 * writable. */
312 flags = pte_flags(*(spte_addr(lg, *spgd, vaddr))); 312 flags = pte_flags(*(spte_addr(*spgd, vaddr)));
313 313
314 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 314 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
315} 315}
@@ -317,10 +317,10 @@ static int page_writable(struct lguest *lg, unsigned long vaddr)
317/* So, when pin_stack_pages() asks us to pin a page, we check if it's already 317/* So, when pin_stack_pages() asks us to pin a page, we check if it's already
318 * in the page tables, and if not, we call demand_page() with error code 2 318 * in the page tables, and if not, we call demand_page() with error code 2
319 * (meaning "write"). */ 319 * (meaning "write"). */
320void pin_page(struct lguest *lg, unsigned long vaddr) 320void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
321{ 321{
322 if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2)) 322 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
323 kill_guest(lg, "bad stack page %#lx", vaddr); 323 kill_guest(cpu, "bad stack page %#lx", vaddr);
324} 324}
325 325
326/*H:450 If we chase down the release_pgd() code, it looks like this: */ 326/*H:450 If we chase down the release_pgd() code, it looks like this: */
@@ -358,28 +358,28 @@ static void flush_user_mappings(struct lguest *lg, int idx)
358 * 358 *
359 * The Guest has a hypercall to throw away the page tables: it's used when a 359 * The Guest has a hypercall to throw away the page tables: it's used when a
360 * large number of mappings have been changed. */ 360 * large number of mappings have been changed. */
361void guest_pagetable_flush_user(struct lguest *lg) 361void guest_pagetable_flush_user(struct lg_cpu *cpu)
362{ 362{
363 /* Drop the userspace part of the current page table. */ 363 /* Drop the userspace part of the current page table. */
364 flush_user_mappings(lg, lg->pgdidx); 364 flush_user_mappings(cpu->lg, cpu->cpu_pgd);
365} 365}
366/*:*/ 366/*:*/
367 367
368/* We walk down the guest page tables to get a guest-physical address */ 368/* We walk down the guest page tables to get a guest-physical address */
369unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) 369unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
370{ 370{
371 pgd_t gpgd; 371 pgd_t gpgd;
372 pte_t gpte; 372 pte_t gpte;
373 373
374 /* First step: get the top-level Guest page table entry. */ 374 /* First step: get the top-level Guest page table entry. */
375 gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); 375 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
376 /* Toplevel not present? We can't map it in. */ 376 /* Toplevel not present? We can't map it in. */
377 if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) 377 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
378 kill_guest(lg, "Bad address %#lx", vaddr); 378 kill_guest(cpu, "Bad address %#lx", vaddr);
379 379
380 gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t); 380 gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t);
381 if (!(pte_flags(gpte) & _PAGE_PRESENT)) 381 if (!(pte_flags(gpte) & _PAGE_PRESENT))
382 kill_guest(lg, "Bad address %#lx", vaddr); 382 kill_guest(cpu, "Bad address %#lx", vaddr);
383 383
384 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); 384 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
385} 385}
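The return line above is just the frame number times the page size, OR'd with the offset inside the page. A stand-alone model of that arithmetic, using the usual 4KB x86 page constants rather than the kernel's headers:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long pfn   = 0x1234;          /* pte_pfn(gpte)          */
        unsigned long vaddr = 0xc0100abcUL;    /* Guest virtual address  */
        unsigned long gpa   = pfn * PAGE_SIZE | (vaddr & ~PAGE_MASK);

        /* 0x1234 << 12 is 0x1234000; the low 12 bits of vaddr are 0xabc. */
        printf("guest-physical address: %#lx\n", gpa);   /* 0x1234abc */
        return 0;
}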
@@ -399,7 +399,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
399/*H:435 And this is us, creating the new page directory. If we really do 399/*H:435 And this is us, creating the new page directory. If we really do
400 * allocate a new one (and so the kernel parts are not there), we set 400 * allocate a new one (and so the kernel parts are not there), we set
401 * blank_pgdir. */ 401 * blank_pgdir. */
402static unsigned int new_pgdir(struct lguest *lg, 402static unsigned int new_pgdir(struct lg_cpu *cpu,
403 unsigned long gpgdir, 403 unsigned long gpgdir,
404 int *blank_pgdir) 404 int *blank_pgdir)
405{ 405{
@@ -407,22 +407,23 @@ static unsigned int new_pgdir(struct lguest *lg,
407 407
408 /* We pick one entry at random to throw out. Choosing the Least 408 /* We pick one entry at random to throw out. Choosing the Least
409 * Recently Used might be better, but this is easy. */ 409 * Recently Used might be better, but this is easy. */
410 next = random32() % ARRAY_SIZE(lg->pgdirs); 410 next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
411 /* If it's never been allocated at all before, try now. */ 411 /* If it's never been allocated at all before, try now. */
412 if (!lg->pgdirs[next].pgdir) { 412 if (!cpu->lg->pgdirs[next].pgdir) {
413 lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); 413 cpu->lg->pgdirs[next].pgdir =
414 (pgd_t *)get_zeroed_page(GFP_KERNEL);
414 /* If the allocation fails, just keep using the one we have */ 415 /* If the allocation fails, just keep using the one we have */
415 if (!lg->pgdirs[next].pgdir) 416 if (!cpu->lg->pgdirs[next].pgdir)
416 next = lg->pgdidx; 417 next = cpu->cpu_pgd;
417 else 418 else
418 /* This is a blank page, so there are no kernel 419 /* This is a blank page, so there are no kernel
419 * mappings: caller must map the stack! */ 420 * mappings: caller must map the stack! */
420 *blank_pgdir = 1; 421 *blank_pgdir = 1;
421 } 422 }
422 /* Record which Guest toplevel this shadows. */ 423 /* Record which Guest toplevel this shadows. */
423 lg->pgdirs[next].gpgdir = gpgdir; 424 cpu->lg->pgdirs[next].gpgdir = gpgdir;
424 /* Release all the non-kernel mappings. */ 425 /* Release all the non-kernel mappings. */
425 flush_user_mappings(lg, next); 426 flush_user_mappings(cpu->lg, next);
426 427
427 return next; 428 return next;
428} 429}
@@ -432,21 +433,21 @@ static unsigned int new_pgdir(struct lguest *lg,
432 * Now we've seen all the page table setting and manipulation, let's see 433 * Now we've seen all the page table setting and manipulation, let's see
433 * what happens when the Guest changes page tables (ie. changes the top-level 434 * what happens when the Guest changes page tables (ie. changes the top-level
434 * pgdir). This occurs on almost every context switch. */ 435 * pgdir). This occurs on almost every context switch. */
435void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) 436void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
436{ 437{
437 int newpgdir, repin = 0; 438 int newpgdir, repin = 0;
438 439
439 /* Look to see if we have this one already. */ 440 /* Look to see if we have this one already. */
440 newpgdir = find_pgdir(lg, pgtable); 441 newpgdir = find_pgdir(cpu->lg, pgtable);
441 /* If not, we allocate or mug an existing one: if it's a fresh one, 442 /* If not, we allocate or mug an existing one: if it's a fresh one,
442 * repin gets set to 1. */ 443 * repin gets set to 1. */
443 if (newpgdir == ARRAY_SIZE(lg->pgdirs)) 444 if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
444 newpgdir = new_pgdir(lg, pgtable, &repin); 445 newpgdir = new_pgdir(cpu, pgtable, &repin);
445 /* Change the current pgd index to the new one. */ 446 /* Change the current pgd index to the new one. */
446 lg->pgdidx = newpgdir; 447 cpu->cpu_pgd = newpgdir;
447 /* If it was completely blank, we map in the Guest kernel stack */ 448 /* If it was completely blank, we map in the Guest kernel stack */
448 if (repin) 449 if (repin)
449 pin_stack_pages(lg); 450 pin_stack_pages(cpu);
450} 451}
451 452
452/*H:470 Finally, a routine which throws away everything: all PGD entries in all 453/*H:470 Finally, a routine which throws away everything: all PGD entries in all
@@ -468,11 +469,11 @@ static void release_all_pagetables(struct lguest *lg)
468 * mapping. Since kernel mappings are in every page table, it's easiest to 469 * mapping. Since kernel mappings are in every page table, it's easiest to
469 * throw them all away. This traps the Guest in amber for a while as 470 * throw them all away. This traps the Guest in amber for a while as
470 * everything faults back in, but it's rare. */ 471 * everything faults back in, but it's rare. */
471void guest_pagetable_clear_all(struct lguest *lg) 472void guest_pagetable_clear_all(struct lg_cpu *cpu)
472{ 473{
473 release_all_pagetables(lg); 474 release_all_pagetables(cpu->lg);
474 /* We need the Guest kernel stack mapped again. */ 475 /* We need the Guest kernel stack mapped again. */
475 pin_stack_pages(lg); 476 pin_stack_pages(cpu);
476} 477}
477/*:*/ 478/*:*/
478/*M:009 Since we throw away all mappings when a kernel mapping changes, our 479/*M:009 Since we throw away all mappings when a kernel mapping changes, our
@@ -497,24 +498,24 @@ void guest_pagetable_clear_all(struct lguest *lg)
497 * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if 498 * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
498 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. 499 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
499 */ 500 */
500static void do_set_pte(struct lguest *lg, int idx, 501static void do_set_pte(struct lg_cpu *cpu, int idx,
501 unsigned long vaddr, pte_t gpte) 502 unsigned long vaddr, pte_t gpte)
502{ 503{
503 /* Look up the matching shadow page directory entry. */ 504 /* Look up the matching shadow page directory entry. */
504 pgd_t *spgd = spgd_addr(lg, idx, vaddr); 505 pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
505 506
506 /* If the top level isn't present, there's no entry to update. */ 507 /* If the top level isn't present, there's no entry to update. */
507 if (pgd_flags(*spgd) & _PAGE_PRESENT) { 508 if (pgd_flags(*spgd) & _PAGE_PRESENT) {
508 /* Otherwise, we start by releasing the existing entry. */ 509 /* Otherwise, we start by releasing the existing entry. */
509 pte_t *spte = spte_addr(lg, *spgd, vaddr); 510 pte_t *spte = spte_addr(*spgd, vaddr);
510 release_pte(*spte); 511 release_pte(*spte);
511 512
512 /* If they're setting this entry as dirty or accessed, we might 513 /* If they're setting this entry as dirty or accessed, we might
513 * as well put that entry they've given us in now. This shaves 514 * as well put that entry they've given us in now. This shaves
514 * 10% off a copy-on-write micro-benchmark. */ 515 * 10% off a copy-on-write micro-benchmark. */
515 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 516 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
516 check_gpte(lg, gpte); 517 check_gpte(cpu, gpte);
517 *spte = gpte_to_spte(lg, gpte, 518 *spte = gpte_to_spte(cpu, gpte,
518 pte_flags(gpte) & _PAGE_DIRTY); 519 pte_flags(gpte) & _PAGE_DIRTY);
519 } else 520 } else
520 /* Otherwise kill it and we can demand_page() it in 521 /* Otherwise kill it and we can demand_page() it in
@@ -533,22 +534,22 @@ static void do_set_pte(struct lguest *lg, int idx,
533 * 534 *
534 * The benefit is that when we have to track a new page table, we can keep 535 * The benefit is that when we have to track a new page table, we can keep
535 * all the kernel mappings. This speeds up context switch immensely. */ 536 * all the kernel mappings. This speeds up context switch immensely. */
536void guest_set_pte(struct lguest *lg, 537void guest_set_pte(struct lg_cpu *cpu,
537 unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 538 unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
538{ 539{
539 /* Kernel mappings must be changed on all top levels. Slow, but 540 /* Kernel mappings must be changed on all top levels. Slow, but
540 * doesn't happen often. */ 541 * doesn't happen often. */
541 if (vaddr >= lg->kernel_address) { 542 if (vaddr >= cpu->lg->kernel_address) {
542 unsigned int i; 543 unsigned int i;
543 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 544 for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
544 if (lg->pgdirs[i].pgdir) 545 if (cpu->lg->pgdirs[i].pgdir)
545 do_set_pte(lg, i, vaddr, gpte); 546 do_set_pte(cpu, i, vaddr, gpte);
546 } else { 547 } else {
547 /* Is this page table one we have a shadow for? */ 548 /* Is this page table one we have a shadow for? */
548 int pgdir = find_pgdir(lg, gpgdir); 549 int pgdir = find_pgdir(cpu->lg, gpgdir);
549 if (pgdir != ARRAY_SIZE(lg->pgdirs)) 550 if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
550 /* If so, do the update. */ 551 /* If so, do the update. */
551 do_set_pte(lg, pgdir, vaddr, gpte); 552 do_set_pte(cpu, pgdir, vaddr, gpte);
552 } 553 }
553} 554}
554 555
@@ -590,30 +591,32 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
590{ 591{
591 /* We start on the first shadow page table, and give it a blank PGD 592 /* We start on the first shadow page table, and give it a blank PGD
592 * page. */ 593 * page. */
593 lg->pgdidx = 0; 594 lg->pgdirs[0].gpgdir = pgtable;
594 lg->pgdirs[lg->pgdidx].gpgdir = pgtable; 595 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
595 lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL); 596 if (!lg->pgdirs[0].pgdir)
596 if (!lg->pgdirs[lg->pgdidx].pgdir)
597 return -ENOMEM; 597 return -ENOMEM;
598 lg->cpus[0].cpu_pgd = 0;
598 return 0; 599 return 0;
599} 600}
600 601
601/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 602/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
602void page_table_guest_data_init(struct lguest *lg) 603void page_table_guest_data_init(struct lg_cpu *cpu)
603{ 604{
604 /* We get the kernel address: above this is all kernel memory. */ 605 /* We get the kernel address: above this is all kernel memory. */
605 if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address) 606 if (get_user(cpu->lg->kernel_address,
607 &cpu->lg->lguest_data->kernel_address)
606 /* We tell the Guest that it can't use the top 4MB of virtual 608 /* We tell the Guest that it can't use the top 4MB of virtual
607 * addresses used by the Switcher. */ 609 * addresses used by the Switcher. */
608 || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) 610 || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem)
609 || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir)) 611 || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir))
610 kill_guest(lg, "bad guest page %p", lg->lguest_data); 612 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
611 613
612 /* In flush_user_mappings() we loop from 0 to 614 /* In flush_user_mappings() we loop from 0 to
613 * "pgd_index(lg->kernel_address)". This assumes it won't hit the 615 * "pgd_index(lg->kernel_address)". This assumes it won't hit the
614 * Switcher mappings, so check that now. */ 616 * Switcher mappings, so check that now. */
615 if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX) 617 if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
616 kill_guest(lg, "bad kernel address %#lx", lg->kernel_address); 618 kill_guest(cpu, "bad kernel address %#lx",
619 cpu->lg->kernel_address);
617} 620}
618 621
619/* When a Guest dies, our cleanup is fairly simple. */ 622/* When a Guest dies, our cleanup is fairly simple. */
@@ -634,17 +637,18 @@ void free_guest_pagetable(struct lguest *lg)
634 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages 637 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
635 * for each CPU already set up, we just need to hook them in now that we know which 638 * for each CPU already set up, we just need to hook them in now that we know which
636 * Guest is about to run on this CPU. */ 639 * Guest is about to run on this CPU. */
637void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) 640void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
638{ 641{
639 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 642 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
640 pgd_t switcher_pgd; 643 pgd_t switcher_pgd;
641 pte_t regs_pte; 644 pte_t regs_pte;
645 unsigned long pfn;
642 646
643 /* Make the last PGD entry for this Guest point to the Switcher's PTE 647 /* Make the last PGD entry for this Guest point to the Switcher's PTE
644 * page for this CPU (with appropriate flags). */ 648 * page for this CPU (with appropriate flags). */
645 switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL); 649 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL);
646 650
647 lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; 651 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
648 652
649 /* We also change the Switcher PTE page. When we're running the Guest, 653 /* We also change the Switcher PTE page. When we're running the Guest,
650 * we want the Guest's "regs" page to appear where the first Switcher 654 * we want the Guest's "regs" page to appear where the first Switcher
@@ -653,7 +657,8 @@ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
653 * CPU's "struct lguest_pages": if we make sure the Guest's register 657 * CPU's "struct lguest_pages": if we make sure the Guest's register
654 * page is already mapped there, we don't have to copy them out 658 * page is already mapped there, we don't have to copy them out
655 * again. */ 659 * again. */
656 regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL)); 660 pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
661 regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL));
657 switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; 662 switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
658} 663}
659/*:*/ 664/*:*/
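The final assignment in map_switcher_in_guest() picks the PTE slot that corresponds to this CPU's "struct lguest_pages" inside the Switcher area. The index arithmetic on its own, with a hypothetical address standing in for the real SWITCHER_ADDR-derived one:

#include <stdio.h>

#define PAGE_SIZE    4096UL
#define PTRS_PER_PTE 1024UL     /* two-level x86 paging */

int main(void)
{
        /* Made-up per-CPU "struct lguest_pages" address for illustration. */
        unsigned long pages = 0xffc01000UL;
        unsigned long idx   = (unsigned long)pages / PAGE_SIZE % PTRS_PER_PTE;

        /* This is the slot the kernel code fills with regs_pte. */
        printf("switcher_pte_page[%lu]\n", idx);    /* prints 1 here */
        return 0;
}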
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index 9e189cbec7dd..ec6aa3f1c36b 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -58,7 +58,7 @@ static int ignored_gdt(unsigned int num)
58 * Protection Fault in the Switcher when it restores a Guest segment register 58 * Protection Fault in the Switcher when it restores a Guest segment register
59 * which tries to use that entry. Then we kill the Guest for causing such a 59 * which tries to use that entry. Then we kill the Guest for causing such a
60 * mess: the message will be "unhandled trap 256". */ 60 * mess: the message will be "unhandled trap 256". */
61static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) 61static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
62{ 62{
63 unsigned int i; 63 unsigned int i;
64 64
@@ -71,14 +71,14 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
71 /* Segment descriptors contain a privilege level: the Guest is 71 /* Segment descriptors contain a privilege level: the Guest is
72 * sometimes careless and leaves this as 0, even though it's 72 * sometimes careless and leaves this as 0, even though it's
73 * running at privilege level 1. If so, we fix it here. */ 73 * running at privilege level 1. If so, we fix it here. */
74 if ((lg->arch.gdt[i].b & 0x00006000) == 0) 74 if ((cpu->arch.gdt[i].b & 0x00006000) == 0)
75 lg->arch.gdt[i].b |= (GUEST_PL << 13); 75 cpu->arch.gdt[i].b |= (GUEST_PL << 13);
76 76
77 /* Each descriptor has an "accessed" bit. If we don't set it 77 /* Each descriptor has an "accessed" bit. If we don't set it
78 * now, the CPU will try to set it when the Guest first loads 78 * now, the CPU will try to set it when the Guest first loads
79 * that entry into a segment register. But the GDT isn't 79 * that entry into a segment register. But the GDT isn't
80 * writable by the Guest, so bad things can happen. */ 80 * writable by the Guest, so bad things can happen. */
81 lg->arch.gdt[i].b |= 0x00000100; 81 cpu->arch.gdt[i].b |= 0x00000100;
82 } 82 }
83} 83}
84 84
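fixup_gdt_table() touches two fields in the high word ("b") of each descriptor: the privilege level in bits 13-14 and the "accessed" bit at 0x00000100. The same two operations on a plain unsigned int, assuming GUEST_PL is 1 as the privilege-level comments above indicate:

#include <stdio.h>

#define GUEST_PL 1U     /* the Guest runs at privilege level 1 */

int main(void)
{
        /* Hypothetical flat code-segment descriptor high word, DPL 0. */
        unsigned int b = 0x00c09a00;

        if ((b & 0x00006000) == 0)      /* DPL field left at 0 by the Guest? */
                b |= GUEST_PL << 13;    /* ...then drop it to the Guest's level */

        b |= 0x00000100;                /* pre-set the "accessed" bit */

        printf("fixed descriptor high word: %#010x\n", b);
        return 0;
}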
@@ -109,31 +109,31 @@ void setup_default_gdt_entries(struct lguest_ro_state *state)
109 109
110/* This routine sets up the initial Guest GDT for booting. All entries start 110/* This routine sets up the initial Guest GDT for booting. All entries start
111 * as 0 (unusable). */ 111 * as 0 (unusable). */
112void setup_guest_gdt(struct lguest *lg) 112void setup_guest_gdt(struct lg_cpu *cpu)
113{ 113{
114 /* Start with full 0-4G segments... */ 114 /* Start with full 0-4G segments... */
115 lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; 115 cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
116 lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; 116 cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
117 /* ...except the Guest is allowed to use them, so set the privilege 117 /* ...except the Guest is allowed to use them, so set the privilege
118 * level appropriately in the flags. */ 118 * level appropriately in the flags. */
119 lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); 119 cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
120 lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 120 cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
121} 121}
122 122
123/*H:650 An optimization of copy_gdt(), for just the three "thread-local storage" 123/*H:650 An optimization of copy_gdt(), for just the three "thread-local storage"
124 * entries. */ 124 * entries. */
125void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) 125void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
126{ 126{
127 unsigned int i; 127 unsigned int i;
128 128
129 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) 129 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
130 gdt[i] = lg->arch.gdt[i]; 130 gdt[i] = cpu->arch.gdt[i];
131} 131}
132 132
133/*H:640 When the Guest is run on a different CPU, or the GDT entries have 133/*H:640 When the Guest is run on a different CPU, or the GDT entries have
134 * changed, copy_gdt() is called to copy the Guest's GDT entries across to this 134 * changed, copy_gdt() is called to copy the Guest's GDT entries across to this
135 * CPU's GDT. */ 135 * CPU's GDT. */
136void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) 136void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
137{ 137{
138 unsigned int i; 138 unsigned int i;
139 139
@@ -141,38 +141,38 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
141 * replaced. See ignored_gdt() above. */ 141 * replaced. See ignored_gdt() above. */
142 for (i = 0; i < GDT_ENTRIES; i++) 142 for (i = 0; i < GDT_ENTRIES; i++)
143 if (!ignored_gdt(i)) 143 if (!ignored_gdt(i))
144 gdt[i] = lg->arch.gdt[i]; 144 gdt[i] = cpu->arch.gdt[i];
145} 145}
146 146
147/*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). 147/*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT).
148 * We copy it from the Guest and tweak the entries. */ 148 * We copy it from the Guest and tweak the entries. */
149void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) 149void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num)
150{ 150{
151 /* We assume the Guest has the same number of GDT entries as the 151 /* We assume the Guest has the same number of GDT entries as the
152 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ 152 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */
153 if (num > ARRAY_SIZE(lg->arch.gdt)) 153 if (num > ARRAY_SIZE(cpu->arch.gdt))
154 kill_guest(lg, "too many gdt entries %i", num); 154 kill_guest(cpu, "too many gdt entries %i", num);
155 155
156 /* We read the whole thing in, then fix it up. */ 156 /* We read the whole thing in, then fix it up. */
157 __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0])); 157 __lgread(cpu, cpu->arch.gdt, table, num * sizeof(cpu->arch.gdt[0]));
158 fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt)); 158 fixup_gdt_table(cpu, 0, ARRAY_SIZE(cpu->arch.gdt));
159 /* Mark that the GDT changed so the core knows it has to copy it again, 159 /* Mark that the GDT changed so the core knows it has to copy it again,
160 * even if the Guest is run on the same CPU. */ 160 * even if the Guest is run on the same CPU. */
161 lg->changed |= CHANGED_GDT; 161 cpu->changed |= CHANGED_GDT;
162} 162}
163 163
164/* This is the fast-track version for just changing the three TLS entries. 164/* This is the fast-track version for just changing the three TLS entries.
165 * Remember that this happens on every context switch, so it's worth 165 * Remember that this happens on every context switch, so it's worth
166 * optimizing. But wouldn't it be neater to have a single hypercall to cover 166 * optimizing. But wouldn't it be neater to have a single hypercall to cover
167 * both cases? */ 167 * both cases? */
168void guest_load_tls(struct lguest *lg, unsigned long gtls) 168void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
169{ 169{
170 struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN]; 170 struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
171 171
172 __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); 172 __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
173 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); 173 fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
174 /* Note that just the TLS entries have changed. */ 174 /* Note that just the TLS entries have changed. */
175 lg->changed |= CHANGED_GDT_TLS; 175 cpu->changed |= CHANGED_GDT_TLS;
176} 176}
177/*:*/ 177/*:*/
178 178
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 44adb00e1490..61f2f8eb8cad 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -60,7 +60,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu)
60 (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); 60 (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
61} 61}
62 62
63static DEFINE_PER_CPU(struct lguest *, last_guest); 63static DEFINE_PER_CPU(struct lg_cpu *, last_cpu);
64 64
65/*S:010 65/*S:010
66 * We approach the Switcher. 66 * We approach the Switcher.
@@ -73,16 +73,16 @@ static DEFINE_PER_CPU(struct lguest *, last_guest);
73 * since it last ran. We saw this set in interrupts_and_traps.c and 73 * since it last ran. We saw this set in interrupts_and_traps.c and
74 * segments.c. 74 * segments.c.
75 */ 75 */
76static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) 76static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
77{ 77{
78 /* Copying all this data can be quite expensive. We usually run the 78 /* Copying all this data can be quite expensive. We usually run the
79 * same Guest we ran last time (and that Guest hasn't run anywhere else 79 * same Guest we ran last time (and that Guest hasn't run anywhere else
80 * meanwhile). If that's not the case, we pretend everything in the 80 * meanwhile). If that's not the case, we pretend everything in the
81 * Guest has changed. */ 81 * Guest has changed. */
82 if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { 82 if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) {
83 __get_cpu_var(last_guest) = lg; 83 __get_cpu_var(last_cpu) = cpu;
84 lg->last_pages = pages; 84 cpu->last_pages = pages;
85 lg->changed = CHANGED_ALL; 85 cpu->changed = CHANGED_ALL;
86 } 86 }
87 87
88 /* These copies are pretty cheap, so we do them unconditionally: */ 88 /* These copies are pretty cheap, so we do them unconditionally: */
@@ -90,42 +90,42 @@ static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
90 pages->state.host_cr3 = __pa(current->mm->pgd); 90 pages->state.host_cr3 = __pa(current->mm->pgd);
91 /* Set up the Guest's page tables to see this CPU's pages (and no 91 /* Set up the Guest's page tables to see this CPU's pages (and no
92 * other CPU's pages). */ 92 * other CPU's pages). */
93 map_switcher_in_guest(lg, pages); 93 map_switcher_in_guest(cpu, pages);
94 /* Set up the two "TSS" members which tell the CPU what stack to use 94 /* Set up the two "TSS" members which tell the CPU what stack to use
95 * for traps which go directly into the Guest (ie. traps at privilege 95 * for traps which go directly into the Guest (ie. traps at privilege
96 * level 1). */ 96 * level 1). */
97 pages->state.guest_tss.sp1 = lg->esp1; 97 pages->state.guest_tss.esp1 = cpu->esp1;
98 pages->state.guest_tss.ss1 = lg->ss1; 98 pages->state.guest_tss.ss1 = cpu->ss1;
99 99
100 /* Copy direct-to-Guest trap entries. */ 100 /* Copy direct-to-Guest trap entries. */
101 if (lg->changed & CHANGED_IDT) 101 if (cpu->changed & CHANGED_IDT)
102 copy_traps(lg, pages->state.guest_idt, default_idt_entries); 102 copy_traps(cpu, pages->state.guest_idt, default_idt_entries);
103 103
104 /* Copy all GDT entries which the Guest can change. */ 104 /* Copy all GDT entries which the Guest can change. */
105 if (lg->changed & CHANGED_GDT) 105 if (cpu->changed & CHANGED_GDT)
106 copy_gdt(lg, pages->state.guest_gdt); 106 copy_gdt(cpu, pages->state.guest_gdt);
107 /* If only the TLS entries have changed, copy them. */ 107 /* If only the TLS entries have changed, copy them. */
108 else if (lg->changed & CHANGED_GDT_TLS) 108 else if (cpu->changed & CHANGED_GDT_TLS)
109 copy_gdt_tls(lg, pages->state.guest_gdt); 109 copy_gdt_tls(cpu, pages->state.guest_gdt);
110 110
111 /* Mark the Guest as unchanged for next time. */ 111 /* Mark the Guest as unchanged for next time. */
112 lg->changed = 0; 112 cpu->changed = 0;
113} 113}
114 114
115/* Finally: the code to actually call into the Switcher to run the Guest. */ 115/* Finally: the code to actually call into the Switcher to run the Guest. */
116static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) 116static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
117{ 117{
118 /* This is a dummy value we need for GCC's sake. */ 118 /* This is a dummy value we need for GCC's sake. */
119 unsigned int clobber; 119 unsigned int clobber;
120 120
121 /* Copy the guest-specific information into this CPU's "struct 121 /* Copy the guest-specific information into this CPU's "struct
122 * lguest_pages". */ 122 * lguest_pages". */
123 copy_in_guest_info(lg, pages); 123 copy_in_guest_info(cpu, pages);
124 124
125 /* Set the trap number to 256 (impossible value). If we fault while 125 /* Set the trap number to 256 (impossible value). If we fault while
126 * switching to the Guest (bad segment registers or bug), this will 126 * switching to the Guest (bad segment registers or bug), this will
127 * cause us to abort the Guest. */ 127 * cause us to abort the Guest. */
128 lg->regs->trapnum = 256; 128 cpu->regs->trapnum = 256;
129 129
130 /* Now: we push the "eflags" register on the stack, then do an "lcall". 130 /* Now: we push the "eflags" register on the stack, then do an "lcall".
131 * This is how we change from using the kernel code segment to using 131 * This is how we change from using the kernel code segment to using
@@ -143,7 +143,7 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
143 * 0-th argument above, ie "a"). %ebx contains the 143 * 0-th argument above, ie "a"). %ebx contains the
144 * physical address of the Guest's top-level page 144 * physical address of the Guest's top-level page
145 * directory. */ 145 * directory. */
146 : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) 146 : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir))
147 /* We tell gcc that all these registers could change, 147 /* We tell gcc that all these registers could change,
148 * which means we don't have to save and restore them in 148 * which means we don't have to save and restore them in
149 * the Switcher. */ 149 * the Switcher. */
@@ -161,12 +161,12 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
161 161
162/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts 162/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts
163 * are disabled: we own the CPU. */ 163 * are disabled: we own the CPU. */
164void lguest_arch_run_guest(struct lguest *lg) 164void lguest_arch_run_guest(struct lg_cpu *cpu)
165{ 165{
166 /* Remember the awfully-named TS bit? If the Guest has asked to set it 166 /* Remember the awfully-named TS bit? If the Guest has asked to set it
167 * we set it now, so we can trap and pass that trap to the Guest if it 167 * we set it now, so we can trap and pass that trap to the Guest if it
168 * uses the FPU. */ 168 * uses the FPU. */
169 if (lg->ts) 169 if (cpu->ts)
170 lguest_set_ts(); 170 lguest_set_ts();
171 171
172 /* SYSENTER is an optimized way of doing system calls. We can't allow 172 /* SYSENTER is an optimized way of doing system calls. We can't allow
@@ -180,7 +180,7 @@ void lguest_arch_run_guest(struct lguest *lg)
180 /* Now we actually run the Guest. It will return when something 180 /* Now we actually run the Guest. It will return when something
181 * interesting happens, and we can examine its registers to see what it 181 * interesting happens, and we can examine its registers to see what it
182 * was doing. */ 182 * was doing. */
183 run_guest_once(lg, lguest_pages(raw_smp_processor_id())); 183 run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
184 184
185 /* Note that the "regs" pointer contains two extra entries which are 185 /* Note that the "regs" pointer contains two extra entries which are
186 * not really registers: a trap number which says what interrupt or 186 * not really registers: a trap number which says what interrupt or
@@ -191,11 +191,11 @@ void lguest_arch_run_guest(struct lguest *lg)
191 * bad virtual address. We have to grab this now, because once we 191 * bad virtual address. We have to grab this now, because once we
192 * re-enable interrupts an interrupt could fault and thus overwrite 192 * re-enable interrupts an interrupt could fault and thus overwrite
193 * cr2, or we could even move off to a different CPU. */ 193 * cr2, or we could even move off to a different CPU. */
194 if (lg->regs->trapnum == 14) 194 if (cpu->regs->trapnum == 14)
195 lg->arch.last_pagefault = read_cr2(); 195 cpu->arch.last_pagefault = read_cr2();
196 /* Similarly, if we took a trap because the Guest used the FPU, 196 /* Similarly, if we took a trap because the Guest used the FPU,
197 * we have to restore the FPU it expects to see. */ 197 * we have to restore the FPU it expects to see. */
198 else if (lg->regs->trapnum == 7) 198 else if (cpu->regs->trapnum == 7)
199 math_state_restore(); 199 math_state_restore();
200 200
201 /* Restore SYSENTER if it's supposed to be on. */ 201 /* Restore SYSENTER if it's supposed to be on. */
@@ -214,22 +214,22 @@ void lguest_arch_run_guest(struct lguest *lg)
214 * When the Guest uses one of these instructions, we get a trap (General 214 * When the Guest uses one of these instructions, we get a trap (General
215 * Protection Fault) and come here. We see if it's one of those troublesome 215 * Protection Fault) and come here. We see if it's one of those troublesome
216 * instructions and skip over it. We return true if we did. */ 216 * instructions and skip over it. We return true if we did. */
217static int emulate_insn(struct lguest *lg) 217static int emulate_insn(struct lg_cpu *cpu)
218{ 218{
219 u8 insn; 219 u8 insn;
220 unsigned int insnlen = 0, in = 0, shift = 0; 220 unsigned int insnlen = 0, in = 0, shift = 0;
221 /* The eip contains the *virtual* address of the Guest's instruction: 221 /* The eip contains the *virtual* address of the Guest's instruction:
222 * guest_pa just subtracts the Guest's page_offset. */ 222 * guest_pa just subtracts the Guest's page_offset. */
223 unsigned long physaddr = guest_pa(lg, lg->regs->eip); 223 unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
224 224
225 /* This must be the Guest kernel trying to do something, not userspace! 225 /* This must be the Guest kernel trying to do something, not userspace!
226 * The bottom two bits of the CS segment register are the privilege 226 * The bottom two bits of the CS segment register are the privilege
227 * level. */ 227 * level. */
228 if ((lg->regs->cs & 3) != GUEST_PL) 228 if ((cpu->regs->cs & 3) != GUEST_PL)
229 return 0; 229 return 0;
230 230
231 /* Decoding x86 instructions is icky. */ 231 /* Decoding x86 instructions is icky. */
232 insn = lgread(lg, physaddr, u8); 232 insn = lgread(cpu, physaddr, u8);
233 233
234 /* 0x66 is an "operand prefix". It means it's using the upper 16 bits 234 /* 0x66 is an "operand prefix". It means it's using the upper 16 bits
235 of the eax register. */ 235 of the eax register. */
@@ -237,7 +237,7 @@ static int emulate_insn(struct lguest *lg)
237 shift = 16; 237 shift = 16;
238 /* The instruction is 1 byte so far, read the next byte. */ 238 /* The instruction is 1 byte so far, read the next byte. */
239 insnlen = 1; 239 insnlen = 1;
240 insn = lgread(lg, physaddr + insnlen, u8); 240 insn = lgread(cpu, physaddr + insnlen, u8);
241 } 241 }
242 242
243 /* We can ignore the lower bit for the moment and decode the 4 opcodes 243 /* We can ignore the lower bit for the moment and decode the 4 opcodes
@@ -268,26 +268,26 @@ static int emulate_insn(struct lguest *lg)
268 if (in) { 268 if (in) {
269 /* Lower bit tells us whether it's a 16 or 32 bit access */ 269 /* Lower bit tells us whether it's a 16 or 32 bit access */
270 if (insn & 0x1) 270 if (insn & 0x1)
271 lg->regs->eax = 0xFFFFFFFF; 271 cpu->regs->eax = 0xFFFFFFFF;
272 else 272 else
273 lg->regs->eax |= (0xFFFF << shift); 273 cpu->regs->eax |= (0xFFFF << shift);
274 } 274 }
275 /* Finally, we've "done" the instruction, so move past it. */ 275 /* Finally, we've "done" the instruction, so move past it. */
276 lg->regs->eip += insnlen; 276 cpu->regs->eip += insnlen;
277 /* Success! */ 277 /* Success! */
278 return 1; 278 return 1;
279} 279}
280 280
281/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ 281/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
282void lguest_arch_handle_trap(struct lguest *lg) 282void lguest_arch_handle_trap(struct lg_cpu *cpu)
283{ 283{
284 switch (lg->regs->trapnum) { 284 switch (cpu->regs->trapnum) {
285 case 13: /* We've intercepted a General Protection Fault. */ 285 case 13: /* We've intercepted a General Protection Fault. */
286 /* Check if this was one of those annoying IN or OUT 286 /* Check if this was one of those annoying IN or OUT
287 * instructions which we need to emulate. If so, we just go 287 * instructions which we need to emulate. If so, we just go
288 * back into the Guest after we've done it. */ 288 * back into the Guest after we've done it. */
289 if (lg->regs->errcode == 0) { 289 if (cpu->regs->errcode == 0) {
290 if (emulate_insn(lg)) 290 if (emulate_insn(cpu))
291 return; 291 return;
292 } 292 }
293 break; 293 break;
@@ -301,7 +301,8 @@ void lguest_arch_handle_trap(struct lguest *lg)
301 * 301 *
302 * The errcode tells whether this was a read or a write, and 302 * The errcode tells whether this was a read or a write, and
303 * whether kernel or userspace code. */ 303 * whether kernel or userspace code. */
304 if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode)) 304 if (demand_page(cpu, cpu->arch.last_pagefault,
305 cpu->regs->errcode))
305 return; 306 return;
306 307
307 /* OK, it's really not there (or not OK): the Guest needs to 308 /* OK, it's really not there (or not OK): the Guest needs to
@@ -311,15 +312,16 @@ void lguest_arch_handle_trap(struct lguest *lg)
311 * Note that if the Guest were really messed up, this could 312 * Note that if the Guest were really messed up, this could
312 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so 313 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
313 * lg->lguest_data could be NULL */ 314 * lg->lguest_data could be NULL */
314 if (lg->lguest_data && 315 if (cpu->lg->lguest_data &&
315 put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2)) 316 put_user(cpu->arch.last_pagefault,
316 kill_guest(lg, "Writing cr2"); 317 &cpu->lg->lguest_data->cr2))
318 kill_guest(cpu, "Writing cr2");
317 break; 319 break;
318 case 7: /* We've intercepted a Device Not Available fault. */ 320 case 7: /* We've intercepted a Device Not Available fault. */
319 /* If the Guest doesn't want to know, we already restored the 321 /* If the Guest doesn't want to know, we already restored the
320 * Floating Point Unit, so we just continue without telling 322 * Floating Point Unit, so we just continue without telling
321 * it. */ 323 * it. */
322 if (!lg->ts) 324 if (!cpu->ts)
323 return; 325 return;
324 break; 326 break;
325 case 32 ... 255: 327 case 32 ... 255:
@@ -332,19 +334,19 @@ void lguest_arch_handle_trap(struct lguest *lg)
332 case LGUEST_TRAP_ENTRY: 334 case LGUEST_TRAP_ENTRY:
333 /* Our 'struct hcall_args' maps directly over our regs: we set 335 /* Our 'struct hcall_args' maps directly over our regs: we set
334 * up the pointer now to indicate a hypercall is pending. */ 336 * up the pointer now to indicate a hypercall is pending. */
335 lg->hcall = (struct hcall_args *)lg->regs; 337 cpu->hcall = (struct hcall_args *)cpu->regs;
336 return; 338 return;
337 } 339 }
338 340
339 /* We didn't handle the trap, so it needs to go to the Guest. */ 341 /* We didn't handle the trap, so it needs to go to the Guest. */
340 if (!deliver_trap(lg, lg->regs->trapnum)) 342 if (!deliver_trap(cpu, cpu->regs->trapnum))
341 /* If the Guest doesn't have a handler (either it hasn't 343 /* If the Guest doesn't have a handler (either it hasn't
342 * registered any yet, or it's one of the faults we don't let 344 * registered any yet, or it's one of the faults we don't let
343 * it handle), it dies with a cryptic error message. */ 345 * it handle), it dies with a cryptic error message. */
344 kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", 346 kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
345 lg->regs->trapnum, lg->regs->eip, 347 cpu->regs->trapnum, cpu->regs->eip,
346 lg->regs->trapnum == 14 ? lg->arch.last_pagefault 348 cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
347 : lg->regs->errcode); 349 : cpu->regs->errcode);
348} 350}
349 351
350/* Now we can look at each of the routines this calls, in increasing order of 352/* Now we can look at each of the routines this calls, in increasing order of
@@ -487,17 +489,17 @@ void __exit lguest_arch_host_fini(void)
487 489
488 490
489/*H:122 The i386-specific hypercalls simply farm out to the right functions. */ 491/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
490int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args) 492int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
491{ 493{
492 switch (args->arg0) { 494 switch (args->arg0) {
493 case LHCALL_LOAD_GDT: 495 case LHCALL_LOAD_GDT:
494 load_guest_gdt(lg, args->arg1, args->arg2); 496 load_guest_gdt(cpu, args->arg1, args->arg2);
495 break; 497 break;
496 case LHCALL_LOAD_IDT_ENTRY: 498 case LHCALL_LOAD_IDT_ENTRY:
497 load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3); 499 load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
498 break; 500 break;
499 case LHCALL_LOAD_TLS: 501 case LHCALL_LOAD_TLS:
500 guest_load_tls(lg, args->arg1); 502 guest_load_tls(cpu, args->arg1);
501 break; 503 break;
502 default: 504 default:
503 /* Bad Guest. Bad! */ 505 /* Bad Guest. Bad! */
@@ -507,13 +509,14 @@ int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args)
507} 509}
508 510
509/*H:126 i386-specific hypercall initialization: */ 511/*H:126 i386-specific hypercall initialization: */
510int lguest_arch_init_hypercalls(struct lguest *lg) 512int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
511{ 513{
512 u32 tsc_speed; 514 u32 tsc_speed;
513 515
514 /* The pointer to the Guest's "struct lguest_data" is the only 516 /* The pointer to the Guest's "struct lguest_data" is the only
515 * argument. We check that address now. */ 517 * argument. We check that address now. */
516 if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data))) 518 if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
519 sizeof(*cpu->lg->lguest_data)))
517 return -EFAULT; 520 return -EFAULT;
518 521
519 /* Having checked it, we simply set lg->lguest_data to point straight 522 /* Having checked it, we simply set lg->lguest_data to point straight
@@ -521,7 +524,7 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
521 * copy_to_user/from_user from now on, instead of lgread/write. I put 524 * copy_to_user/from_user from now on, instead of lgread/write. I put
522 * this in to show that I'm not immune to writing stupid 525 * this in to show that I'm not immune to writing stupid
523 * optimizations. */ 526 * optimizations. */
524 lg->lguest_data = lg->mem_base + lg->hcall->arg1; 527 cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
525 528
526 /* We insist that the Time Stamp Counter exists and doesn't change with 529 /* We insist that the Time Stamp Counter exists and doesn't change with
527 * cpu frequency. Some devious chip manufacturers decided that TSC 530 * cpu frequency. Some devious chip manufacturers decided that TSC
@@ -534,12 +537,12 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
534 tsc_speed = tsc_khz; 537 tsc_speed = tsc_khz;
535 else 538 else
536 tsc_speed = 0; 539 tsc_speed = 0;
537 if (put_user(tsc_speed, &lg->lguest_data->tsc_khz)) 540 if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz))
538 return -EFAULT; 541 return -EFAULT;
539 542
540 /* The interrupt code might not like the system call vector. */ 543 /* The interrupt code might not like the system call vector. */
541 if (!check_syscall_vector(lg)) 544 if (!check_syscall_vector(cpu->lg))
542 kill_guest(lg, "bad syscall vector"); 545 kill_guest(cpu, "bad syscall vector");
543 546
544 return 0; 547 return 0;
545} 548}
@@ -548,9 +551,9 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
548 * 551 *
549 * Most of the Guest's registers are left alone: we used get_zeroed_page() to 552 * Most of the Guest's registers are left alone: we used get_zeroed_page() to
550 * allocate the structure, so they will be 0. */ 553 * allocate the structure, so they will be 0. */
551void lguest_arch_setup_regs(struct lguest *lg, unsigned long start) 554void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
552{ 555{
553 struct lguest_regs *regs = lg->regs; 556 struct lguest_regs *regs = cpu->regs;
554 557
555 /* There are four "segment" registers which the Guest needs to boot: 558 /* There are four "segment" registers which the Guest needs to boot:
556 * The "code segment" register (cs) refers to the kernel code segment 559 * The "code segment" register (cs) refers to the kernel code segment
@@ -577,5 +580,5 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
577 580
578 /* There are a couple of GDT entries the Guest expects when first 581 /* There are a couple of GDT entries the Guest expects when first
579 * booting. */ 582 * booting. */
580 setup_guest_gdt(lg); 583 setup_guest_gdt(cpu);
581} 584}
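The hunks above convert lguest's i386 entry/exit path from taking a whole-Guest "struct lguest *lg" to a per-virtual-CPU "struct lg_cpu *cpu", and the opening comment spells out the optimisation being preserved: copying Guest state into this physical CPU's "struct lguest_pages" is expensive, so it is skipped when the same vCPU ran here last time. A minimal sketch of that last-owner/dirty-flag pattern, with illustrative names rather than lguest's real structures:

/* Minimal sketch of the "recopy only if the owner changed or something
 * is dirty" pattern used by copy_in_guest_info().  Types and names are
 * stand-ins, not lguest's. */
#include <string.h>

#define CHANGED_REGS    (1 << 0)
#define CHANGED_ALL     (~0)

struct vcpu {
    unsigned long regs[16];
    int changed;                        /* what must be recopied */
};

struct percpu_pages {
    unsigned long regs[16];
    const struct vcpu *last_owner;      /* which vCPU filled this slot */
};

void copy_in_guest_info(struct vcpu *v, struct percpu_pages *p)
{
    /* A different vCPU used this physical CPU last time: assume
     * everything is stale and recopy the lot. */
    if (p->last_owner != v) {
        p->last_owner = v;
        v->changed = CHANGED_ALL;
    }

    /* Copy only the pieces marked dirty. */
    if (v->changed & CHANGED_REGS)
        memcpy(p->regs, v->regs, sizeof(p->regs));

    /* Mark the vCPU clean for next time. */
    v->changed = 0;
}

The kernel version keys the owner test on a per-CPU variable (__get_cpu_var(last_cpu)) and tracks finer-grained flags (CHANGED_IDT, CHANGED_GDT, CHANGED_GDT_TLS) plus a few fields it copies unconditionally, but the shape is the same.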
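The emulate_insn() hunks carry the same per-vCPU conversion into the part of the exit path the comments describe in most detail: a privileged IN or OUT from the Guest kernel traps as a General Protection Fault, and rather than emulate real port I/O the host just makes IN read as all ones (honouring a 0x66 operand-size prefix) and advances eip past the instruction. The opcode switch itself is elided in the hunk, so the subset handled in this user-space sketch (the 0xE4/0xE6 immediate-port forms and the 0xEC/0xEE %dx forms, width bit ignored) is illustrative:

/* Stand-alone sketch of lguest's IN/OUT "emulation": decode just enough
 * to know the instruction length, fake the IN result, skip the insn.
 * struct fake_regs stands in for lguest's register page. */
#include <stdint.h>
#include <stdio.h>

struct fake_regs {
    uint32_t eax;
    uint32_t eip;
};

static int emulate_in_out(struct fake_regs *regs, const uint8_t *insn)
{
    unsigned int insnlen = 0, in = 0, shift = 0;
    uint8_t opcode;

    /* 0x66 operand prefix: the access uses the upper 16 bits of eax. */
    if (insn[0] == 0x66) {
        shift = 16;
        insnlen = 1;
    }
    opcode = insn[insnlen];

    /* Ignore the low (width) bit and decode the opcode pairs. */
    switch (opcode & 0xFE) {
    case 0xE4:              /* in  %al/%ax/%eax, <port byte> */
        in = 1;
        /* fallthrough */
    case 0xE6:              /* out <port byte> */
        insnlen += 2;       /* opcode plus immediate port */
        break;
    case 0xEC:              /* in  via %dx */
        in = 1;
        /* fallthrough */
    case 0xEE:              /* out via %dx */
        insnlen += 1;
        break;
    default:
        return 0;           /* not something we pretend to handle */
    }

    /* No device answers, so IN reads as all ones; the width bit decides
     * how much of eax to clobber. */
    if (in) {
        if (opcode & 0x1)
            regs->eax = 0xFFFFFFFF;
        else
            regs->eax |= 0xFFFFu << shift;
    }

    regs->eip += insnlen;   /* "done", move past the instruction */
    return 1;
}

int main(void)
{
    struct fake_regs regs = { .eax = 0, .eip = 0x1000 };
    const uint8_t in_al_imm8[] = { 0xE4, 0x70 };    /* in %al, $0x70 */

    if (emulate_in_out(&regs, in_al_imm8))
        printf("eax=%#x eip=%#x\n", (unsigned)regs.eax, (unsigned)regs.eip);
    return 0;
}

In the kernel the instruction bytes come from the Guest via lgread() at guest_pa(cpu, eip) and the result lands in cpu->regs, but the decoding and the eip bump are the ones visible in the hunks.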
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index e45f85f7c7ed..0dff05840ee2 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -4224,10 +4224,10 @@ zfcp_fsf_send_fcp_command_task_handler(struct zfcp_fsf_req *fsf_req)
4224 4224
4225 ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n", 4225 ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n",
4226 fcp_rsp_iu->fcp_sns_len); 4226 fcp_rsp_iu->fcp_sns_len);
4227 memcpy(&scpnt->sense_buffer, 4227 memcpy(scpnt->sense_buffer,
4228 zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len); 4228 zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len);
4229 ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE, 4229 ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE,
4230 (void *) &scpnt->sense_buffer, sns_len); 4230 (void *)scpnt->sense_buffer, sns_len);
4231 } 4231 }
4232 4232
4233 /* check for overrun */ 4233 /* check for overrun */
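This zfcp hunk, like the hptiop, aic79xx_osm and ncr53c8xx hunks further down, drops the address-of operator when handing sense_buffer to memcpy() and the debug dump. With an embedded array the two spellings happen to name the same address (only the pointer type differs), so nothing was broken; they stop being interchangeable once sense_buffer becomes a separately allocated pointer, which this series appears to prepare for. A small stand-alone illustration with hypothetical struct names:

#include <stdio.h>
#include <string.h>

#define SENSE_SIZE 96   /* placeholder; the drivers use SCSI_SENSE_BUFFERSIZE */

struct cmd_with_array   { unsigned char sense_buffer[SENSE_SIZE]; };
struct cmd_with_pointer { unsigned char *sense_buffer; };

int main(void)
{
    struct cmd_with_array a;
    unsigned char backing[SENSE_SIZE];
    struct cmd_with_pointer p = { .sense_buffer = backing };

    /* Embedded array: both expressions give the same address, so the
     * old "&...sense_buffer" spelling was merely sloppy. */
    printf("array:   %p %p\n", (void *)a.sense_buffer,
           (void *)&a.sense_buffer);

    /* Pointer member: &p.sense_buffer is the address of the pointer
     * itself, so memcpy(&p.sense_buffer, src, n) would overwrite the
     * pointer (and its neighbours) instead of filling in sense data. */
    printf("pointer: %p %p\n", (void *)p.sense_buffer,
           (void *)&p.sense_buffer);

    memcpy(p.sense_buffer, "sense data", 11);   /* the correct form */
    return 0;
}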
diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c
index 1c244832c6c8..b4912d1cee2a 100644
--- a/drivers/scsi/3w-9xxx.c
+++ b/drivers/scsi/3w-9xxx.c
@@ -1990,7 +1990,6 @@ static struct scsi_host_template driver_template = {
1990 .max_sectors = TW_MAX_SECTORS, 1990 .max_sectors = TW_MAX_SECTORS,
1991 .cmd_per_lun = TW_MAX_CMDS_PER_LUN, 1991 .cmd_per_lun = TW_MAX_CMDS_PER_LUN,
1992 .use_clustering = ENABLE_CLUSTERING, 1992 .use_clustering = ENABLE_CLUSTERING,
1993 .use_sg_chaining = ENABLE_SG_CHAINING,
1994 .shost_attrs = twa_host_attrs, 1993 .shost_attrs = twa_host_attrs,
1995 .emulated = 1 1994 .emulated = 1
1996}; 1995};
diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c
index 59716ebeb10c..d09532162217 100644
--- a/drivers/scsi/3w-xxxx.c
+++ b/drivers/scsi/3w-xxxx.c
@@ -2261,7 +2261,6 @@ static struct scsi_host_template driver_template = {
2261 .max_sectors = TW_MAX_SECTORS, 2261 .max_sectors = TW_MAX_SECTORS,
2262 .cmd_per_lun = TW_MAX_CMDS_PER_LUN, 2262 .cmd_per_lun = TW_MAX_CMDS_PER_LUN,
2263 .use_clustering = ENABLE_CLUSTERING, 2263 .use_clustering = ENABLE_CLUSTERING,
2264 .use_sg_chaining = ENABLE_SG_CHAINING,
2265 .shost_attrs = tw_host_attrs, 2264 .shost_attrs = tw_host_attrs,
2266 .emulated = 1 2265 .emulated = 1
2267}; 2266};
diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c
index ead47c143ce0..4d3ebb1af490 100644
--- a/drivers/scsi/BusLogic.c
+++ b/drivers/scsi/BusLogic.c
@@ -3575,7 +3575,6 @@ static struct scsi_host_template Bus_Logic_template = {
3575 .unchecked_isa_dma = 1, 3575 .unchecked_isa_dma = 1,
3576 .max_sectors = 128, 3576 .max_sectors = 128,
3577 .use_clustering = ENABLE_CLUSTERING, 3577 .use_clustering = ENABLE_CLUSTERING,
3578 .use_sg_chaining = ENABLE_SG_CHAINING,
3579}; 3578};
3580 3579
3581/* 3580/*
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 3e161cd66463..14fc7f39e83e 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -345,7 +345,7 @@ config ISCSI_TCP
345 345
346config SGIWD93_SCSI 346config SGIWD93_SCSI
347 tristate "SGI WD93C93 SCSI Driver" 347 tristate "SGI WD93C93 SCSI Driver"
348 depends on SGI_IP22 && SCSI 348 depends on SGI_HAS_WD93 && SCSI
349 help 349 help
350 If you have a Western Digital WD93 SCSI controller on 350 If you have a Western Digital WD93 SCSI controller on
351 an SGI MIPS system, say Y. Otherwise, say N. 351 an SGI MIPS system, say Y. Otherwise, say N.
diff --git a/drivers/scsi/NCR53c406a.c b/drivers/scsi/NCR53c406a.c
index 137d065db3da..6961f78742ae 100644
--- a/drivers/scsi/NCR53c406a.c
+++ b/drivers/scsi/NCR53c406a.c
@@ -1065,7 +1065,6 @@ static struct scsi_host_template driver_template =
1065 .cmd_per_lun = 1 /* commands per lun */, 1065 .cmd_per_lun = 1 /* commands per lun */,
1066 .unchecked_isa_dma = 1 /* unchecked_isa_dma */, 1066 .unchecked_isa_dma = 1 /* unchecked_isa_dma */,
1067 .use_clustering = ENABLE_CLUSTERING, 1067 .use_clustering = ENABLE_CLUSTERING,
1068 .use_sg_chaining = ENABLE_SG_CHAINING,
1069}; 1068};
1070 1069
1071#include "scsi_module.c" 1070#include "scsi_module.c"
diff --git a/drivers/scsi/a100u2w.c b/drivers/scsi/a100u2w.c
index d3a6d15fb77a..f608d4a1d6da 100644
--- a/drivers/scsi/a100u2w.c
+++ b/drivers/scsi/a100u2w.c
@@ -1071,7 +1071,6 @@ static struct scsi_host_template inia100_template = {
1071 .sg_tablesize = SG_ALL, 1071 .sg_tablesize = SG_ALL,
1072 .cmd_per_lun = 1, 1072 .cmd_per_lun = 1,
1073 .use_clustering = ENABLE_CLUSTERING, 1073 .use_clustering = ENABLE_CLUSTERING,
1074 .use_sg_chaining = ENABLE_SG_CHAINING,
1075}; 1074};
1076 1075
1077static int __devinit inia100_probe_one(struct pci_dev *pdev, 1076static int __devinit inia100_probe_one(struct pci_dev *pdev,
diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c
index 851a7e599c50..f8afa358b6b6 100644
--- a/drivers/scsi/aacraid/commctrl.c
+++ b/drivers/scsi/aacraid/commctrl.c
@@ -243,7 +243,6 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg)
243 * Search the list of AdapterFibContext addresses on the adapter 243 * Search the list of AdapterFibContext addresses on the adapter
244 * to be sure this is a valid address 244 * to be sure this is a valid address
245 */ 245 */
246 spin_lock_irqsave(&dev->fib_lock, flags);
247 entry = dev->fib_list.next; 246 entry = dev->fib_list.next;
248 fibctx = NULL; 247 fibctx = NULL;
249 248
@@ -252,25 +251,24 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg)
252 /* 251 /*
253 * Extract the AdapterFibContext from the Input parameters. 252 * Extract the AdapterFibContext from the Input parameters.
254 */ 253 */
255 if (fibctx->unique == f.fibctx) { /* We found a winner */ 254 if (fibctx->unique == f.fibctx) { /* We found a winner */
256 break; 255 break;
257 } 256 }
258 entry = entry->next; 257 entry = entry->next;
259 fibctx = NULL; 258 fibctx = NULL;
260 } 259 }
261 if (!fibctx) { 260 if (!fibctx) {
262 spin_unlock_irqrestore(&dev->fib_lock, flags);
263 dprintk ((KERN_INFO "Fib Context not found\n")); 261 dprintk ((KERN_INFO "Fib Context not found\n"));
264 return -EINVAL; 262 return -EINVAL;
265 } 263 }
266 264
267 if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) || 265 if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) ||
268 (fibctx->size != sizeof(struct aac_fib_context))) { 266 (fibctx->size != sizeof(struct aac_fib_context))) {
269 spin_unlock_irqrestore(&dev->fib_lock, flags);
270 dprintk ((KERN_INFO "Fib Context corrupt?\n")); 267 dprintk ((KERN_INFO "Fib Context corrupt?\n"));
271 return -EINVAL; 268 return -EINVAL;
272 } 269 }
273 status = 0; 270 status = 0;
271 spin_lock_irqsave(&dev->fib_lock, flags);
274 /* 272 /*
275 * If there are no fibs to send back, then either wait or return 273 * If there are no fibs to send back, then either wait or return
276 * -EAGAIN 274 * -EAGAIN
@@ -328,9 +326,7 @@ return_fib:
328int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx) 326int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx)
329{ 327{
330 struct fib *fib; 328 struct fib *fib;
331 unsigned long flags;
332 329
333 spin_lock_irqsave(&dev->fib_lock, flags);
334 /* 330 /*
335 * First free any FIBs that have not been consumed. 331 * First free any FIBs that have not been consumed.
336 */ 332 */
@@ -353,7 +349,6 @@ int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx)
353 * Remove the Context from the AdapterFibContext List 349 * Remove the Context from the AdapterFibContext List
354 */ 350 */
355 list_del(&fibctx->next); 351 list_del(&fibctx->next);
356 spin_unlock_irqrestore(&dev->fib_lock, flags);
357 /* 352 /*
358 * Invalidate context 353 * Invalidate context
359 */ 354 */
@@ -419,8 +414,8 @@ static int close_getadapter_fib(struct aac_dev * dev, void __user *arg)
419 * @arg: ioctl arguments 414 * @arg: ioctl arguments
420 * 415 *
421 * This routine returns the driver version. 416 * This routine returns the driver version.
422 * Under Linux, there have been no version incompatibilities, so this is 417 * Under Linux, there have been no version incompatibilities, so this is
423 * simple! 418 * simple!
424 */ 419 */
425 420
426static int check_revision(struct aac_dev *dev, void __user *arg) 421static int check_revision(struct aac_dev *dev, void __user *arg)
@@ -468,7 +463,7 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg)
468 u32 data_dir; 463 u32 data_dir;
469 void __user *sg_user[32]; 464 void __user *sg_user[32];
470 void *sg_list[32]; 465 void *sg_list[32];
471 u32 sg_indx = 0; 466 u32 sg_indx = 0;
472 u32 byte_count = 0; 467 u32 byte_count = 0;
473 u32 actual_fibsize64, actual_fibsize = 0; 468 u32 actual_fibsize64, actual_fibsize = 0;
474 int i; 469 int i;
@@ -522,11 +517,11 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg)
522 // Fix up srb for endian and force some values 517 // Fix up srb for endian and force some values
523 518
524 srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi); // Force this 519 srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi); // Force this
525 srbcmd->channel = cpu_to_le32(user_srbcmd->channel); 520 srbcmd->channel = cpu_to_le32(user_srbcmd->channel);
526 srbcmd->id = cpu_to_le32(user_srbcmd->id); 521 srbcmd->id = cpu_to_le32(user_srbcmd->id);
527 srbcmd->lun = cpu_to_le32(user_srbcmd->lun); 522 srbcmd->lun = cpu_to_le32(user_srbcmd->lun);
528 srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout); 523 srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout);
529 srbcmd->flags = cpu_to_le32(flags); 524 srbcmd->flags = cpu_to_le32(flags);
530 srbcmd->retry_limit = 0; // Obsolete parameter 525 srbcmd->retry_limit = 0; // Obsolete parameter
531 srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size); 526 srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size);
532 memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb)); 527 memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb));
@@ -791,9 +786,9 @@ static int aac_get_pci_info(struct aac_dev* dev, void __user *arg)
791 pci_info.bus = dev->pdev->bus->number; 786 pci_info.bus = dev->pdev->bus->number;
792 pci_info.slot = PCI_SLOT(dev->pdev->devfn); 787 pci_info.slot = PCI_SLOT(dev->pdev->devfn);
793 788
794 if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) { 789 if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) {
795 dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n")); 790 dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n"));
796 return -EFAULT; 791 return -EFAULT;
797 } 792 }
798 return 0; 793 return 0;
799} 794}
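In next_getadapter_fib() the dev->fib_lock handling changes scope: the walk that validates the user-supplied fib context no longer runs under the spinlock (and the early-error unlocks go with it), the lock is taken only after validation, right before the FIB list is consumed, and aac_close_fib_context() drops its locking entirely. The spin_lock_irqsave()/spin_unlock_irqrestore() pair that remains is the usual idiom for data also touched from interrupt context; a minimal sketch with illustrative types:

/* Sketch of the irqsave locking idiom used around dev->fib_lock.
 * The structure is a stand-in, not aacraid's real aac_dev. */
#include <linux/spinlock.h>
#include <linux/list.h>

struct fib_dev {
    spinlock_t fib_lock;
    struct list_head fib_list;
};

static struct list_head *first_pending_fib(struct fib_dev *dev)
{
    struct list_head *entry = NULL;
    unsigned long flags;

    /* Disable local interrupts and take the lock: the list is also
     * manipulated from the adapter's completion path. */
    spin_lock_irqsave(&dev->fib_lock, flags);
    if (!list_empty(&dev->fib_list))
        entry = dev->fib_list.next;
    spin_unlock_irqrestore(&dev->fib_lock, flags);

    return entry;
}

Whether running the context lookup unlocked is safe depends on what else can change the context list concurrently; the hunks themselves only show the narrowing of the critical section.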
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 61be22774e99..0e8267c1e915 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -1032,7 +1032,6 @@ static struct scsi_host_template aac_driver_template = {
1032 .cmd_per_lun = AAC_NUM_IO_FIB, 1032 .cmd_per_lun = AAC_NUM_IO_FIB,
1033#endif 1033#endif
1034 .use_clustering = ENABLE_CLUSTERING, 1034 .use_clustering = ENABLE_CLUSTERING,
1035 .use_sg_chaining = ENABLE_SG_CHAINING,
1036 .emulated = 1, 1035 .emulated = 1,
1037}; 1036};
1038 1037
diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c
index be58a0b097c7..7c45d88a205b 100644
--- a/drivers/scsi/aha1740.c
+++ b/drivers/scsi/aha1740.c
@@ -563,7 +563,6 @@ static struct scsi_host_template aha1740_template = {
563 .sg_tablesize = AHA1740_SCATTER, 563 .sg_tablesize = AHA1740_SCATTER,
564 .cmd_per_lun = AHA1740_CMDLUN, 564 .cmd_per_lun = AHA1740_CMDLUN,
565 .use_clustering = ENABLE_CLUSTERING, 565 .use_clustering = ENABLE_CLUSTERING,
566 .use_sg_chaining = ENABLE_SG_CHAINING,
567 .eh_abort_handler = aha1740_eh_abort_handler, 566 .eh_abort_handler = aha1740_eh_abort_handler,
568}; 567};
569 568
diff --git a/drivers/scsi/aic7xxx/aic79xx.h b/drivers/scsi/aic7xxx/aic79xx.h
index ce638aa6005a..2f00467b6b8c 100644
--- a/drivers/scsi/aic7xxx/aic79xx.h
+++ b/drivers/scsi/aic7xxx/aic79xx.h
@@ -1340,8 +1340,10 @@ struct ahd_pci_identity *ahd_find_pci_device(ahd_dev_softc_t);
1340int ahd_pci_config(struct ahd_softc *, 1340int ahd_pci_config(struct ahd_softc *,
1341 struct ahd_pci_identity *); 1341 struct ahd_pci_identity *);
1342int ahd_pci_test_register_access(struct ahd_softc *); 1342int ahd_pci_test_register_access(struct ahd_softc *);
1343#ifdef CONFIG_PM
1343void ahd_pci_suspend(struct ahd_softc *); 1344void ahd_pci_suspend(struct ahd_softc *);
1344void ahd_pci_resume(struct ahd_softc *); 1345void ahd_pci_resume(struct ahd_softc *);
1346#endif
1345 1347
1346/************************** SCB and SCB queue management **********************/ 1348/************************** SCB and SCB queue management **********************/
1347void ahd_qinfifo_requeue_tail(struct ahd_softc *ahd, 1349void ahd_qinfifo_requeue_tail(struct ahd_softc *ahd,
@@ -1352,8 +1354,10 @@ struct ahd_softc *ahd_alloc(void *platform_arg, char *name);
1352int ahd_softc_init(struct ahd_softc *); 1354int ahd_softc_init(struct ahd_softc *);
1353void ahd_controller_info(struct ahd_softc *ahd, char *buf); 1355void ahd_controller_info(struct ahd_softc *ahd, char *buf);
1354int ahd_init(struct ahd_softc *ahd); 1356int ahd_init(struct ahd_softc *ahd);
1357#ifdef CONFIG_PM
1355int ahd_suspend(struct ahd_softc *ahd); 1358int ahd_suspend(struct ahd_softc *ahd);
1356void ahd_resume(struct ahd_softc *ahd); 1359void ahd_resume(struct ahd_softc *ahd);
1360#endif
1357int ahd_default_config(struct ahd_softc *ahd); 1361int ahd_default_config(struct ahd_softc *ahd);
1358int ahd_parse_vpddata(struct ahd_softc *ahd, 1362int ahd_parse_vpddata(struct ahd_softc *ahd,
1359 struct vpd_config *vpd); 1363 struct vpd_config *vpd);
@@ -1361,7 +1365,6 @@ int ahd_parse_cfgdata(struct ahd_softc *ahd,
1361 struct seeprom_config *sc); 1365 struct seeprom_config *sc);
1362void ahd_intr_enable(struct ahd_softc *ahd, int enable); 1366void ahd_intr_enable(struct ahd_softc *ahd, int enable);
1363void ahd_pause_and_flushwork(struct ahd_softc *ahd); 1367void ahd_pause_and_flushwork(struct ahd_softc *ahd);
1364int ahd_suspend(struct ahd_softc *ahd);
1365void ahd_set_unit(struct ahd_softc *, int); 1368void ahd_set_unit(struct ahd_softc *, int);
1366void ahd_set_name(struct ahd_softc *, char *); 1369void ahd_set_name(struct ahd_softc *, char *);
1367struct scb *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx); 1370struct scb *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx);
diff --git a/drivers/scsi/aic7xxx/aic79xx_core.c b/drivers/scsi/aic7xxx/aic79xx_core.c
index a7dd8cdda472..ade0fb8fbdb2 100644
--- a/drivers/scsi/aic7xxx/aic79xx_core.c
+++ b/drivers/scsi/aic7xxx/aic79xx_core.c
@@ -7175,6 +7175,7 @@ ahd_pause_and_flushwork(struct ahd_softc *ahd)
7175 ahd->flags &= ~AHD_ALL_INTERRUPTS; 7175 ahd->flags &= ~AHD_ALL_INTERRUPTS;
7176} 7176}
7177 7177
7178#ifdef CONFIG_PM
7178int 7179int
7179ahd_suspend(struct ahd_softc *ahd) 7180ahd_suspend(struct ahd_softc *ahd)
7180{ 7181{
@@ -7197,6 +7198,7 @@ ahd_resume(struct ahd_softc *ahd)
7197 ahd_intr_enable(ahd, TRUE); 7198 ahd_intr_enable(ahd, TRUE);
7198 ahd_restart(ahd); 7199 ahd_restart(ahd);
7199} 7200}
7201#endif
7200 7202
7201/************************** Busy Target Table *********************************/ 7203/************************** Busy Target Table *********************************/
7202/* 7204/*
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c
index 0e4708fd43c8..014654792901 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm.c
@@ -766,7 +766,6 @@ struct scsi_host_template aic79xx_driver_template = {
766 .max_sectors = 8192, 766 .max_sectors = 8192,
767 .cmd_per_lun = 2, 767 .cmd_per_lun = 2,
768 .use_clustering = ENABLE_CLUSTERING, 768 .use_clustering = ENABLE_CLUSTERING,
769 .use_sg_chaining = ENABLE_SG_CHAINING,
770 .slave_alloc = ahd_linux_slave_alloc, 769 .slave_alloc = ahd_linux_slave_alloc,
771 .slave_configure = ahd_linux_slave_configure, 770 .slave_configure = ahd_linux_slave_configure,
772 .target_alloc = ahd_linux_target_alloc, 771 .target_alloc = ahd_linux_target_alloc,
@@ -1922,7 +1921,7 @@ ahd_linux_queue_cmd_complete(struct ahd_softc *ahd, struct scsi_cmnd *cmd)
1922 struct scsi_sense_data *sense; 1921 struct scsi_sense_data *sense;
1923 1922
1924 sense = (struct scsi_sense_data *) 1923 sense = (struct scsi_sense_data *)
1925 &cmd->sense_buffer; 1924 cmd->sense_buffer;
1926 if (sense->extra_len >= 5 && 1925 if (sense->extra_len >= 5 &&
1927 (sense->add_sense_code == 0x47 1926 (sense->add_sense_code == 0x47
1928 || sense->add_sense_code == 0x48)) 1927 || sense->add_sense_code == 0x48))
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
index 66f0259edb69..4150c8a8fdc2 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
@@ -43,17 +43,6 @@
43#include "aic79xx_inline.h" 43#include "aic79xx_inline.h"
44#include "aic79xx_pci.h" 44#include "aic79xx_pci.h"
45 45
46static int ahd_linux_pci_dev_probe(struct pci_dev *pdev,
47 const struct pci_device_id *ent);
48static int ahd_linux_pci_reserve_io_regions(struct ahd_softc *ahd,
49 u_long *base, u_long *base2);
50static int ahd_linux_pci_reserve_mem_region(struct ahd_softc *ahd,
51 u_long *bus_addr,
52 uint8_t __iomem **maddr);
53static int ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg);
54static int ahd_linux_pci_dev_resume(struct pci_dev *pdev);
55static void ahd_linux_pci_dev_remove(struct pci_dev *pdev);
56
57/* Define the macro locally since it's different for different class of chips. 46/* Define the macro locally since it's different for different class of chips.
58 */ 47 */
59#define ID(x) \ 48#define ID(x) \
@@ -85,17 +74,7 @@ static struct pci_device_id ahd_linux_pci_id_table[] = {
85 74
86MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table); 75MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table);
87 76
88static struct pci_driver aic79xx_pci_driver = {
89 .name = "aic79xx",
90 .probe = ahd_linux_pci_dev_probe,
91#ifdef CONFIG_PM 77#ifdef CONFIG_PM
92 .suspend = ahd_linux_pci_dev_suspend,
93 .resume = ahd_linux_pci_dev_resume,
94#endif
95 .remove = ahd_linux_pci_dev_remove,
96 .id_table = ahd_linux_pci_id_table
97};
98
99static int 78static int
100ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) 79ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg)
101{ 80{
@@ -139,6 +118,7 @@ ahd_linux_pci_dev_resume(struct pci_dev *pdev)
139 118
140 return rc; 119 return rc;
141} 120}
121#endif
142 122
143static void 123static void
144ahd_linux_pci_dev_remove(struct pci_dev *pdev) 124ahd_linux_pci_dev_remove(struct pci_dev *pdev)
@@ -245,6 +225,17 @@ ahd_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
245 return (0); 225 return (0);
246} 226}
247 227
228static struct pci_driver aic79xx_pci_driver = {
229 .name = "aic79xx",
230 .probe = ahd_linux_pci_dev_probe,
231#ifdef CONFIG_PM
232 .suspend = ahd_linux_pci_dev_suspend,
233 .resume = ahd_linux_pci_dev_resume,
234#endif
235 .remove = ahd_linux_pci_dev_remove,
236 .id_table = ahd_linux_pci_id_table
237};
238
248int 239int
249ahd_linux_pci_init(void) 240ahd_linux_pci_init(void)
250{ 241{
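In both OSM PCI files (this one and aic7xxx_osm_pci.c below) the block of forward declarations and the early struct pci_driver definition are deleted, and the driver structure is re-emitted after the probe/suspend/resume/remove functions it points to, with the suspend/resume pair kept under #ifdef CONFIG_PM (the header and core hunks guard the prototypes and implementations the same way). Defining the ops table after its handlers is what lets the static prototypes go; the resulting shape is roughly:

/* Sketch of the "define handlers first, ops table last" layout the
 * patch switches to.  Function bodies are stubs for illustration. */
#include <linux/pci.h>

static int example_pci_probe(struct pci_dev *pdev,
                             const struct pci_device_id *ent)
{
    return 0;
}

static void example_pci_remove(struct pci_dev *pdev)
{
}

#ifdef CONFIG_PM
static int example_pci_suspend(struct pci_dev *pdev, pm_message_t mesg)
{
    return 0;
}

static int example_pci_resume(struct pci_dev *pdev)
{
    return 0;
}
#endif

static struct pci_device_id example_pci_id_table[] = {
    { }                                 /* terminator */
};

/* Because this table comes after the functions, no forward
 * declarations are needed, and the PM entries compile away when
 * CONFIG_PM is not set. */
static struct pci_driver example_pci_driver = {
    .name     = "example",
    .probe    = example_pci_probe,
#ifdef CONFIG_PM
    .suspend  = example_pci_suspend,
    .resume   = example_pci_resume,
#endif
    .remove   = example_pci_remove,
    .id_table = example_pci_id_table,
};

When CONFIG_PM is off, the handlers, their prototypes and the .suspend/.resume initialisers all drop out together, which is the point of pushing the guards into the headers and core files as well.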
diff --git a/drivers/scsi/aic7xxx/aic79xx_pci.c b/drivers/scsi/aic7xxx/aic79xx_pci.c
index 7a203a90601a..df853676e66a 100644
--- a/drivers/scsi/aic7xxx/aic79xx_pci.c
+++ b/drivers/scsi/aic7xxx/aic79xx_pci.c
@@ -389,6 +389,7 @@ ahd_pci_config(struct ahd_softc *ahd, struct ahd_pci_identity *entry)
389 return error; 389 return error;
390} 390}
391 391
392#ifdef CONFIG_PM
392void 393void
393ahd_pci_suspend(struct ahd_softc *ahd) 394ahd_pci_suspend(struct ahd_softc *ahd)
394{ 395{
@@ -415,6 +416,7 @@ ahd_pci_resume(struct ahd_softc *ahd)
415 ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME, 416 ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME,
416 ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1); 417 ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1);
417} 418}
419#endif
418 420
419/* 421/*
420 * Perform some simple tests that should catch situations where 422 * Perform some simple tests that should catch situations where
diff --git a/drivers/scsi/aic7xxx/aic7xxx.h b/drivers/scsi/aic7xxx/aic7xxx.h
index 3d4e42d90452..c0344e617651 100644
--- a/drivers/scsi/aic7xxx/aic7xxx.h
+++ b/drivers/scsi/aic7xxx/aic7xxx.h
@@ -1143,7 +1143,9 @@ struct ahc_pci_identity *ahc_find_pci_device(ahc_dev_softc_t);
1143int ahc_pci_config(struct ahc_softc *, 1143int ahc_pci_config(struct ahc_softc *,
1144 struct ahc_pci_identity *); 1144 struct ahc_pci_identity *);
1145int ahc_pci_test_register_access(struct ahc_softc *); 1145int ahc_pci_test_register_access(struct ahc_softc *);
1146#ifdef CONFIG_PM
1146void ahc_pci_resume(struct ahc_softc *ahc); 1147void ahc_pci_resume(struct ahc_softc *ahc);
1148#endif
1147 1149
1148/*************************** EISA/VL Front End ********************************/ 1150/*************************** EISA/VL Front End ********************************/
1149struct aic7770_identity *aic7770_find_device(uint32_t); 1151struct aic7770_identity *aic7770_find_device(uint32_t);
@@ -1170,8 +1172,10 @@ int ahc_chip_init(struct ahc_softc *ahc);
1170int ahc_init(struct ahc_softc *ahc); 1172int ahc_init(struct ahc_softc *ahc);
1171void ahc_intr_enable(struct ahc_softc *ahc, int enable); 1173void ahc_intr_enable(struct ahc_softc *ahc, int enable);
1172void ahc_pause_and_flushwork(struct ahc_softc *ahc); 1174void ahc_pause_and_flushwork(struct ahc_softc *ahc);
1175#ifdef CONFIG_PM
1173int ahc_suspend(struct ahc_softc *ahc); 1176int ahc_suspend(struct ahc_softc *ahc);
1174int ahc_resume(struct ahc_softc *ahc); 1177int ahc_resume(struct ahc_softc *ahc);
1178#endif
1175void ahc_set_unit(struct ahc_softc *, int); 1179void ahc_set_unit(struct ahc_softc *, int);
1176void ahc_set_name(struct ahc_softc *, char *); 1180void ahc_set_name(struct ahc_softc *, char *);
1177void ahc_alloc_scbs(struct ahc_softc *ahc); 1181void ahc_alloc_scbs(struct ahc_softc *ahc);
diff --git a/drivers/scsi/aic7xxx/aic7xxx_core.c b/drivers/scsi/aic7xxx/aic7xxx_core.c
index f350b5e89e76..6d2ae641273c 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_core.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_core.c
@@ -5078,6 +5078,7 @@ ahc_pause_and_flushwork(struct ahc_softc *ahc)
5078 ahc->flags &= ~AHC_ALL_INTERRUPTS; 5078 ahc->flags &= ~AHC_ALL_INTERRUPTS;
5079} 5079}
5080 5080
5081#ifdef CONFIG_PM
5081int 5082int
5082ahc_suspend(struct ahc_softc *ahc) 5083ahc_suspend(struct ahc_softc *ahc)
5083{ 5084{
@@ -5113,7 +5114,7 @@ ahc_resume(struct ahc_softc *ahc)
5113 ahc_restart(ahc); 5114 ahc_restart(ahc);
5114 return (0); 5115 return (0);
5115} 5116}
5116 5117#endif
5117/************************** Busy Target Table *********************************/ 5118/************************** Busy Target Table *********************************/
5118/* 5119/*
5119 * Return the untagged transaction id for a given target/channel lun. 5120 * Return the untagged transaction id for a given target/channel lun.
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c
index e310e414067f..99a3b33a3233 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c
@@ -747,7 +747,6 @@ struct scsi_host_template aic7xxx_driver_template = {
747 .max_sectors = 8192, 747 .max_sectors = 8192,
748 .cmd_per_lun = 2, 748 .cmd_per_lun = 2,
749 .use_clustering = ENABLE_CLUSTERING, 749 .use_clustering = ENABLE_CLUSTERING,
750 .use_sg_chaining = ENABLE_SG_CHAINING,
751 .slave_alloc = ahc_linux_slave_alloc, 750 .slave_alloc = ahc_linux_slave_alloc,
752 .slave_configure = ahc_linux_slave_configure, 751 .slave_configure = ahc_linux_slave_configure,
753 .target_alloc = ahc_linux_target_alloc, 752 .target_alloc = ahc_linux_target_alloc,
@@ -1658,9 +1657,12 @@ ahc_done(struct ahc_softc *ahc, struct scb *scb)
1658 untagged_q = &(ahc->untagged_queues[target_offset]); 1657 untagged_q = &(ahc->untagged_queues[target_offset]);
1659 TAILQ_REMOVE(untagged_q, scb, links.tqe); 1658 TAILQ_REMOVE(untagged_q, scb, links.tqe);
1660 BUG_ON(!TAILQ_EMPTY(untagged_q)); 1659 BUG_ON(!TAILQ_EMPTY(untagged_q));
1661 } 1660 } else if ((scb->flags & SCB_ACTIVE) == 0) {
1662 1661 /*
1663 if ((scb->flags & SCB_ACTIVE) == 0) { 1662 * Transactions aborted from the untagged queue may
1663 * not have been dispatched to the controller, so
1664 * only check the SCB_ACTIVE flag for tagged transactions.
1665 */
1664 printf("SCB %d done'd twice\n", scb->hscb->tag); 1666 printf("SCB %d done'd twice\n", scb->hscb->tag);
1665 ahc_dump_card_state(ahc); 1667 ahc_dump_card_state(ahc);
1666 panic("Stopping for safety"); 1668 panic("Stopping for safety");
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
index 4488946cff2e..dd6e21d6f1dd 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
@@ -42,17 +42,6 @@
42#include "aic7xxx_osm.h" 42#include "aic7xxx_osm.h"
43#include "aic7xxx_pci.h" 43#include "aic7xxx_pci.h"
44 44
45static int ahc_linux_pci_dev_probe(struct pci_dev *pdev,
46 const struct pci_device_id *ent);
47static int ahc_linux_pci_reserve_io_region(struct ahc_softc *ahc,
48 u_long *base);
49static int ahc_linux_pci_reserve_mem_region(struct ahc_softc *ahc,
50 u_long *bus_addr,
51 uint8_t __iomem **maddr);
52static int ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg);
53static int ahc_linux_pci_dev_resume(struct pci_dev *pdev);
54static void ahc_linux_pci_dev_remove(struct pci_dev *pdev);
55
56/* Define the macro locally since it's different for different class of chips. 45/* Define the macro locally since it's different for different class of chips.
57*/ 46*/
58#define ID(x) ID_C(x, PCI_CLASS_STORAGE_SCSI) 47#define ID(x) ID_C(x, PCI_CLASS_STORAGE_SCSI)
@@ -132,17 +121,7 @@ static struct pci_device_id ahc_linux_pci_id_table[] = {
132 121
133MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table); 122MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table);
134 123
135static struct pci_driver aic7xxx_pci_driver = {
136 .name = "aic7xxx",
137 .probe = ahc_linux_pci_dev_probe,
138#ifdef CONFIG_PM 124#ifdef CONFIG_PM
139 .suspend = ahc_linux_pci_dev_suspend,
140 .resume = ahc_linux_pci_dev_resume,
141#endif
142 .remove = ahc_linux_pci_dev_remove,
143 .id_table = ahc_linux_pci_id_table
144};
145
146static int 125static int
147ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) 126ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg)
148{ 127{
@@ -182,6 +161,7 @@ ahc_linux_pci_dev_resume(struct pci_dev *pdev)
182 161
183 return (ahc_resume(ahc)); 162 return (ahc_resume(ahc));
184} 163}
164#endif
185 165
186static void 166static void
187ahc_linux_pci_dev_remove(struct pci_dev *pdev) 167ahc_linux_pci_dev_remove(struct pci_dev *pdev)
@@ -289,6 +269,17 @@ ahc_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
289 return (0); 269 return (0);
290} 270}
291 271
272static struct pci_driver aic7xxx_pci_driver = {
273 .name = "aic7xxx",
274 .probe = ahc_linux_pci_dev_probe,
275#ifdef CONFIG_PM
276 .suspend = ahc_linux_pci_dev_suspend,
277 .resume = ahc_linux_pci_dev_resume,
278#endif
279 .remove = ahc_linux_pci_dev_remove,
280 .id_table = ahc_linux_pci_id_table
281};
282
292int 283int
293ahc_linux_pci_init(void) 284ahc_linux_pci_init(void)
294{ 285{
diff --git a/drivers/scsi/aic7xxx/aic7xxx_pci.c b/drivers/scsi/aic7xxx/aic7xxx_pci.c
index ae35937b8055..56848f41e4f9 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_pci.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_pci.c
@@ -2020,6 +2020,7 @@ ahc_pci_chip_init(struct ahc_softc *ahc)
2020 return (ahc_chip_init(ahc)); 2020 return (ahc_chip_init(ahc));
2021} 2021}
2022 2022
2023#ifdef CONFIG_PM
2023void 2024void
2024ahc_pci_resume(struct ahc_softc *ahc) 2025ahc_pci_resume(struct ahc_softc *ahc)
2025{ 2026{
@@ -2051,6 +2052,7 @@ ahc_pci_resume(struct ahc_softc *ahc)
2051 ahc_release_seeprom(&sd); 2052 ahc_release_seeprom(&sd);
2052 } 2053 }
2053} 2054}
2055#endif
2054 2056
2055static int 2057static int
2056ahc_aic785X_setup(struct ahc_softc *ahc) 2058ahc_aic785X_setup(struct ahc_softc *ahc)
diff --git a/drivers/scsi/aic7xxx_old.c b/drivers/scsi/aic7xxx_old.c
index bcb0b870320c..3bfd9296bbfa 100644
--- a/drivers/scsi/aic7xxx_old.c
+++ b/drivers/scsi/aic7xxx_old.c
@@ -11141,7 +11141,6 @@ static struct scsi_host_template driver_template = {
11141 .max_sectors = 2048, 11141 .max_sectors = 2048,
11142 .cmd_per_lun = 3, 11142 .cmd_per_lun = 3,
11143 .use_clustering = ENABLE_CLUSTERING, 11143 .use_clustering = ENABLE_CLUSTERING,
11144 .use_sg_chaining = ENABLE_SG_CHAINING,
11145}; 11144};
11146 11145
11147#include "scsi_module.c" 11146#include "scsi_module.c"
diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c
index d80dba913a75..f4a202e8df26 100644
--- a/drivers/scsi/arcmsr/arcmsr_hba.c
+++ b/drivers/scsi/arcmsr/arcmsr_hba.c
@@ -122,7 +122,6 @@ static struct scsi_host_template arcmsr_scsi_host_template = {
122 .max_sectors = ARCMSR_MAX_XFER_SECTORS, 122 .max_sectors = ARCMSR_MAX_XFER_SECTORS,
123 .cmd_per_lun = ARCMSR_MAX_CMD_PERLUN, 123 .cmd_per_lun = ARCMSR_MAX_CMD_PERLUN,
124 .use_clustering = ENABLE_CLUSTERING, 124 .use_clustering = ENABLE_CLUSTERING,
125 .use_sg_chaining = ENABLE_SG_CHAINING,
126 .shost_attrs = arcmsr_host_attrs, 125 .shost_attrs = arcmsr_host_attrs,
127}; 126};
128#ifdef CONFIG_SCSI_ARCMSR_AER 127#ifdef CONFIG_SCSI_ARCMSR_AER
diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c
index f93c73c0ba53..22ef3716e786 100644
--- a/drivers/scsi/dc395x.c
+++ b/drivers/scsi/dc395x.c
@@ -4763,7 +4763,6 @@ static struct scsi_host_template dc395x_driver_template = {
4763 .eh_bus_reset_handler = dc395x_eh_bus_reset, 4763 .eh_bus_reset_handler = dc395x_eh_bus_reset,
4764 .unchecked_isa_dma = 0, 4764 .unchecked_isa_dma = 0,
4765 .use_clustering = DISABLE_CLUSTERING, 4765 .use_clustering = DISABLE_CLUSTERING,
4766 .use_sg_chaining = ENABLE_SG_CHAINING,
4767}; 4766};
4768 4767
4769 4768
diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c
index 19cce125124c..c9dd8392aab2 100644
--- a/drivers/scsi/dpt_i2o.c
+++ b/drivers/scsi/dpt_i2o.c
@@ -3340,7 +3340,6 @@ static struct scsi_host_template driver_template = {
3340 .this_id = 7, 3340 .this_id = 7,
3341 .cmd_per_lun = 1, 3341 .cmd_per_lun = 1,
3342 .use_clustering = ENABLE_CLUSTERING, 3342 .use_clustering = ENABLE_CLUSTERING,
3343 .use_sg_chaining = ENABLE_SG_CHAINING,
3344}; 3343};
3345#include "scsi_module.c" 3344#include "scsi_module.c"
3346MODULE_LICENSE("GPL"); 3345MODULE_LICENSE("GPL");
diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c
index 05163cefec12..8be3d76656fa 100644
--- a/drivers/scsi/eata.c
+++ b/drivers/scsi/eata.c
@@ -524,7 +524,6 @@ static struct scsi_host_template driver_template = {
524 .this_id = 7, 524 .this_id = 7,
525 .unchecked_isa_dma = 1, 525 .unchecked_isa_dma = 1,
526 .use_clustering = ENABLE_CLUSTERING, 526 .use_clustering = ENABLE_CLUSTERING,
527 .use_sg_chaining = ENABLE_SG_CHAINING,
528}; 527};
529 528
530#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) 529#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 5ea1f986220c..880c78bff0e1 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -342,7 +342,6 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
342 shost->use_clustering = sht->use_clustering; 342 shost->use_clustering = sht->use_clustering;
343 shost->ordered_tag = sht->ordered_tag; 343 shost->ordered_tag = sht->ordered_tag;
344 shost->active_mode = sht->supported_mode; 344 shost->active_mode = sht->supported_mode;
345 shost->use_sg_chaining = sht->use_sg_chaining;
346 345
347 if (sht->supported_mode == MODE_UNKNOWN) 346 if (sht->supported_mode == MODE_UNKNOWN)
348 /* means we didn't set it ... default to INITIATOR */ 347 /* means we didn't set it ... default to INITIATOR */
diff --git a/drivers/scsi/hptiop.c b/drivers/scsi/hptiop.c
index e7b2f3575ce9..ff149ad6bc4e 100644
--- a/drivers/scsi/hptiop.c
+++ b/drivers/scsi/hptiop.c
@@ -573,7 +573,7 @@ static void hptiop_finish_scsi_req(struct hptiop_hba *hba, u32 tag,
573 scsi_set_resid(scp, 573 scsi_set_resid(scp,
574 scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length)); 574 scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length));
575 scp->result = SAM_STAT_CHECK_CONDITION; 575 scp->result = SAM_STAT_CHECK_CONDITION;
576 memcpy(&scp->sense_buffer, &req->sg_list, 576 memcpy(scp->sense_buffer, &req->sg_list,
577 min_t(size_t, SCSI_SENSE_BUFFERSIZE, 577 min_t(size_t, SCSI_SENSE_BUFFERSIZE,
578 le32_to_cpu(req->dataxfer_length))); 578 le32_to_cpu(req->dataxfer_length)));
579 break; 579 break;
@@ -906,7 +906,6 @@ static struct scsi_host_template driver_template = {
906 .unchecked_isa_dma = 0, 906 .unchecked_isa_dma = 0,
907 .emulated = 0, 907 .emulated = 0,
908 .use_clustering = ENABLE_CLUSTERING, 908 .use_clustering = ENABLE_CLUSTERING,
909 .use_sg_chaining = ENABLE_SG_CHAINING,
910 .proc_name = driver_name, 909 .proc_name = driver_name,
911 .shost_attrs = hptiop_attrs, 910 .shost_attrs = hptiop_attrs,
912 .this_id = -1, 911 .this_id = -1,
diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c
index db004a450732..4d15a62914e9 100644
--- a/drivers/scsi/ibmmca.c
+++ b/drivers/scsi/ibmmca.c
@@ -1501,7 +1501,6 @@ static struct scsi_host_template ibmmca_driver_template = {
1501 .sg_tablesize = 16, 1501 .sg_tablesize = 16,
1502 .cmd_per_lun = 1, 1502 .cmd_per_lun = 1,
1503 .use_clustering = ENABLE_CLUSTERING, 1503 .use_clustering = ENABLE_CLUSTERING,
1504 .use_sg_chaining = ENABLE_SG_CHAINING,
1505}; 1504};
1506 1505
1507static int ibmmca_probe(struct device *dev) 1506static int ibmmca_probe(struct device *dev)
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index 30819012898f..78d46a900bb5 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -1600,7 +1600,6 @@ static struct scsi_host_template driver_template = {
1600 .this_id = -1, 1600 .this_id = -1,
1601 .sg_tablesize = SG_ALL, 1601 .sg_tablesize = SG_ALL,
1602 .use_clustering = ENABLE_CLUSTERING, 1602 .use_clustering = ENABLE_CLUSTERING,
1603 .use_sg_chaining = ENABLE_SG_CHAINING,
1604 .shost_attrs = ibmvscsi_attrs, 1603 .shost_attrs = ibmvscsi_attrs,
1605}; 1604};
1606 1605
diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c
index a10a5c74b48d..0cc8868ea35d 100644
--- a/drivers/scsi/initio.c
+++ b/drivers/scsi/initio.c
@@ -2833,7 +2833,6 @@ static struct scsi_host_template initio_template = {
2833 .sg_tablesize = SG_ALL, 2833 .sg_tablesize = SG_ALL,
2834 .cmd_per_lun = 1, 2834 .cmd_per_lun = 1,
2835 .use_clustering = ENABLE_CLUSTERING, 2835 .use_clustering = ENABLE_CLUSTERING,
2836 .use_sg_chaining = ENABLE_SG_CHAINING,
2837}; 2836};
2838 2837
2839static int initio_probe_one(struct pci_dev *pdev, 2838static int initio_probe_one(struct pci_dev *pdev,
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index e5be5fd4ef58..b6f99dfbb038 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -1933,7 +1933,6 @@ static struct scsi_host_template iscsi_sht = {
1933 .eh_device_reset_handler= iscsi_eh_device_reset, 1933 .eh_device_reset_handler= iscsi_eh_device_reset,
1934 .eh_host_reset_handler = iscsi_eh_host_reset, 1934 .eh_host_reset_handler = iscsi_eh_host_reset,
1935 .use_clustering = DISABLE_CLUSTERING, 1935 .use_clustering = DISABLE_CLUSTERING,
1936 .use_sg_chaining = ENABLE_SG_CHAINING,
1937 .slave_configure = iscsi_tcp_slave_configure, 1936 .slave_configure = iscsi_tcp_slave_configure,
1938 .proc_name = "iscsi_tcp", 1937 .proc_name = "iscsi_tcp",
1939 .this_id = -1, 1938 .this_id = -1,
diff --git a/drivers/scsi/libsrp.c b/drivers/scsi/libsrp.c
index 5cff0204227d..6d6a76e65a6c 100644
--- a/drivers/scsi/libsrp.c
+++ b/drivers/scsi/libsrp.c
@@ -426,8 +426,8 @@ int srp_cmd_queue(struct Scsi_Host *shost, struct srp_cmd *cmd, void *info,
426 426
427 sc->SCp.ptr = info; 427 sc->SCp.ptr = info;
428 memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE); 428 memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE);
429 sc->request_bufflen = len; 429 sc->sdb.length = len;
430 sc->request_buffer = (void *) (unsigned long) addr; 430 sc->sdb.table.sgl = (void *) (unsigned long) addr;
431 sc->tag = tag; 431 sc->tag = tag;
432 err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun, 432 err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun,
433 cmd->tag); 433 cmd->tag);
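srp_cmd_queue() stops writing sc->request_bufflen and sc->request_buffer and fills the scsi_data_buffer members (sc->sdb.length, sc->sdb.table.sgl) instead; on the target side it is building the command, so it pokes the raw fields directly. Ordinary initiator drivers sit on the consumer side of the same change and go through the accessor helpers, as the hptiop hunk above already does with scsi_set_resid()/scsi_bufflen(). A sketch of that consumer side, with an illustrative function name:

/* Sketch of reading a command's data buffer through the accessors
 * rather than the raw fields. */
#include <scsi/scsi_cmnd.h>

static unsigned int example_map_command(struct scsi_cmnd *sc)
{
    struct scatterlist *sg = scsi_sglist(sc);   /* was sc->request_buffer  */
    unsigned int len       = scsi_bufflen(sc);  /* was sc->request_bufflen */

    /* ... hand sg/len to the HBA's DMA setup here ... */
    (void)sg;
    return len;
}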
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 6483c62730b3..fc5c3a42b05a 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -1459,7 +1459,6 @@ struct scsi_host_template lpfc_template = {
1459 .scan_finished = lpfc_scan_finished, 1459 .scan_finished = lpfc_scan_finished,
1460 .this_id = -1, 1460 .this_id = -1,
1461 .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, 1461 .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT,
1462 .use_sg_chaining = ENABLE_SG_CHAINING,
1463 .cmd_per_lun = LPFC_CMD_PER_LUN, 1462 .cmd_per_lun = LPFC_CMD_PER_LUN,
1464 .use_clustering = ENABLE_CLUSTERING, 1463 .use_clustering = ENABLE_CLUSTERING,
1465 .shost_attrs = lpfc_hba_attrs, 1464 .shost_attrs = lpfc_hba_attrs,
@@ -1482,7 +1481,6 @@ struct scsi_host_template lpfc_vport_template = {
1482 .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, 1481 .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT,
1483 .cmd_per_lun = LPFC_CMD_PER_LUN, 1482 .cmd_per_lun = LPFC_CMD_PER_LUN,
1484 .use_clustering = ENABLE_CLUSTERING, 1483 .use_clustering = ENABLE_CLUSTERING,
1485 .use_sg_chaining = ENABLE_SG_CHAINING,
1486 .shost_attrs = lpfc_vport_attrs, 1484 .shost_attrs = lpfc_vport_attrs,
1487 .max_sectors = 0xFFFF, 1485 .max_sectors = 0xFFFF,
1488}; 1486};
diff --git a/drivers/scsi/mac53c94.c b/drivers/scsi/mac53c94.c
index a035001f4438..b12ad7c7c673 100644
--- a/drivers/scsi/mac53c94.c
+++ b/drivers/scsi/mac53c94.c
@@ -402,7 +402,6 @@ static struct scsi_host_template mac53c94_template = {
402 .sg_tablesize = SG_ALL, 402 .sg_tablesize = SG_ALL,
403 .cmd_per_lun = 1, 403 .cmd_per_lun = 1,
404 .use_clustering = DISABLE_CLUSTERING, 404 .use_clustering = DISABLE_CLUSTERING,
405 .use_sg_chaining = ENABLE_SG_CHAINING,
406}; 405};
407 406
408static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match) 407static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match)
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c
index 765c24d2bc38..4d59ae8491a4 100644
--- a/drivers/scsi/megaraid.c
+++ b/drivers/scsi/megaraid.c
@@ -4490,7 +4490,6 @@ static struct scsi_host_template megaraid_template = {
4490 .sg_tablesize = MAX_SGLIST, 4490 .sg_tablesize = MAX_SGLIST,
4491 .cmd_per_lun = DEF_CMD_PER_LUN, 4491 .cmd_per_lun = DEF_CMD_PER_LUN,
4492 .use_clustering = ENABLE_CLUSTERING, 4492 .use_clustering = ENABLE_CLUSTERING,
4493 .use_sg_chaining = ENABLE_SG_CHAINING,
4494 .eh_abort_handler = megaraid_abort, 4493 .eh_abort_handler = megaraid_abort,
4495 .eh_device_reset_handler = megaraid_reset, 4494 .eh_device_reset_handler = megaraid_reset,
4496 .eh_bus_reset_handler = megaraid_reset, 4495 .eh_bus_reset_handler = megaraid_reset,
diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c
index 24e32e446e76..6db77c00e3ee 100644
--- a/drivers/scsi/megaraid/megaraid_mbox.c
+++ b/drivers/scsi/megaraid/megaraid_mbox.c
@@ -361,7 +361,6 @@ static struct scsi_host_template megaraid_template_g = {
361 .eh_host_reset_handler = megaraid_reset_handler, 361 .eh_host_reset_handler = megaraid_reset_handler,
362 .change_queue_depth = megaraid_change_queue_depth, 362 .change_queue_depth = megaraid_change_queue_depth,
363 .use_clustering = ENABLE_CLUSTERING, 363 .use_clustering = ENABLE_CLUSTERING,
364 .use_sg_chaining = ENABLE_SG_CHAINING,
365 .sdev_attrs = megaraid_sdev_attrs, 364 .sdev_attrs = megaraid_sdev_attrs,
366 .shost_attrs = megaraid_shost_attrs, 365 .shost_attrs = megaraid_shost_attrs,
367}; 366};
diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c
index d7ec921865c4..672c759ac24d 100644
--- a/drivers/scsi/megaraid/megaraid_sas.c
+++ b/drivers/scsi/megaraid/megaraid_sas.c
@@ -1192,7 +1192,6 @@ static struct scsi_host_template megasas_template = {
1192 .eh_timed_out = megasas_reset_timer, 1192 .eh_timed_out = megasas_reset_timer,
1193 .bios_param = megasas_bios_param, 1193 .bios_param = megasas_bios_param,
1194 .use_clustering = ENABLE_CLUSTERING, 1194 .use_clustering = ENABLE_CLUSTERING,
1195 .use_sg_chaining = ENABLE_SG_CHAINING,
1196}; 1195};
1197 1196
1198/** 1197/**
diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c
index 7470ff39ab22..651d09b08f2a 100644
--- a/drivers/scsi/mesh.c
+++ b/drivers/scsi/mesh.c
@@ -1843,7 +1843,6 @@ static struct scsi_host_template mesh_template = {
1843 .sg_tablesize = SG_ALL, 1843 .sg_tablesize = SG_ALL,
1844 .cmd_per_lun = 2, 1844 .cmd_per_lun = 2,
1845 .use_clustering = DISABLE_CLUSTERING, 1845 .use_clustering = DISABLE_CLUSTERING,
1846 .use_sg_chaining = ENABLE_SG_CHAINING,
1847}; 1846};
1848 1847
1849static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match) 1848static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match)
diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c
index c02771aa6c9b..c5ebf018b378 100644
--- a/drivers/scsi/ncr53c8xx.c
+++ b/drivers/scsi/ncr53c8xx.c
@@ -4967,7 +4967,7 @@ void ncr_complete (struct ncb *np, struct ccb *cp)
4967 sizeof(cp->sense_buf))); 4967 sizeof(cp->sense_buf)));
4968 4968
4969 if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) { 4969 if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) {
4970 u_char * p = (u_char*) & cmd->sense_buffer; 4970 u_char *p = cmd->sense_buffer;
4971 int i; 4971 int i;
4972 PRINT_ADDR(cmd, "sense data:"); 4972 PRINT_ADDR(cmd, "sense data:");
4973 for (i=0; i<14; i++) printk (" %x", *p++); 4973 for (i=0; i<14; i++) printk (" %x", *p++);
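The small-looking change above matters once sense_buffer stops being an embedded array: for an array member, &cmd->sense_buffer and cmd->sense_buffer name the same address (only the types differ), so the old cast happened to work; for a pointer member, &cmd->sense_buffer is the address of the pointer itself, and printing through it would dump the wrong bytes. A small userspace illustration (struct names here are made up):

#include <stdio.h>

struct with_array   { unsigned char sense_buffer[96]; };
struct with_pointer { unsigned char *sense_buffer; };

int main(void)
{
    static unsigned char data_buf[96];
    struct with_array   a;
    struct with_pointer p = { .sense_buffer = data_buf };

    /* Array member: both expressions name the same storage. */
    printf("array:   %p == %p\n",
           (void *)&a.sense_buffer, (void *)a.sense_buffer);

    /* Pointer member: &p.sense_buffer is where the pointer lives,
     * p.sense_buffer is where the data lives -- not the same thing. */
    printf("pointer: %p != %p\n",
           (void *)&p.sense_buffer, (void *)p.sense_buffer);
    return 0;
}
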
diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c
index 28161dc95e0d..7fed35372150 100644
--- a/drivers/scsi/nsp32.c
+++ b/drivers/scsi/nsp32.c
@@ -281,7 +281,6 @@ static struct scsi_host_template nsp32_template = {
281 .cmd_per_lun = 1, 281 .cmd_per_lun = 1,
282 .this_id = NSP32_HOST_SCSIID, 282 .this_id = NSP32_HOST_SCSIID,
283 .use_clustering = DISABLE_CLUSTERING, 283 .use_clustering = DISABLE_CLUSTERING,
284 .use_sg_chaining = ENABLE_SG_CHAINING,
285 .eh_abort_handler = nsp32_eh_abort, 284 .eh_abort_handler = nsp32_eh_abort,
286 .eh_bus_reset_handler = nsp32_eh_bus_reset, 285 .eh_bus_reset_handler = nsp32_eh_bus_reset,
287 .eh_host_reset_handler = nsp32_eh_host_reset, 286 .eh_host_reset_handler = nsp32_eh_host_reset,
diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c
index 969b9387a0c3..3454a5714749 100644
--- a/drivers/scsi/pcmcia/sym53c500_cs.c
+++ b/drivers/scsi/pcmcia/sym53c500_cs.c
@@ -692,7 +692,6 @@ static struct scsi_host_template sym53c500_driver_template = {
692 .sg_tablesize = 32, 692 .sg_tablesize = 32,
693 .cmd_per_lun = 1, 693 .cmd_per_lun = 1,
694 .use_clustering = ENABLE_CLUSTERING, 694 .use_clustering = ENABLE_CLUSTERING,
695 .use_sg_chaining = ENABLE_SG_CHAINING,
696 .shost_attrs = SYM53C500_shost_attrs 695 .shost_attrs = SYM53C500_shost_attrs
697}; 696};
698 697
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index c94906abfee3..68c0d09ffe78 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -4204,7 +4204,6 @@ static struct scsi_host_template qla1280_driver_template = {
4204 .sg_tablesize = SG_ALL, 4204 .sg_tablesize = SG_ALL,
4205 .cmd_per_lun = 1, 4205 .cmd_per_lun = 1,
4206 .use_clustering = ENABLE_CLUSTERING, 4206 .use_clustering = ENABLE_CLUSTERING,
4207 .use_sg_chaining = ENABLE_SG_CHAINING,
4208}; 4207};
4209 4208
4210 4209
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index aba1e6d48066..3954ed2d7b51 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -131,7 +131,6 @@ static struct scsi_host_template qla2x00_driver_template = {
131 .this_id = -1, 131 .this_id = -1,
132 .cmd_per_lun = 3, 132 .cmd_per_lun = 3,
133 .use_clustering = ENABLE_CLUSTERING, 133 .use_clustering = ENABLE_CLUSTERING,
134 .use_sg_chaining = ENABLE_SG_CHAINING,
135 .sg_tablesize = SG_ALL, 134 .sg_tablesize = SG_ALL,
136 135
137 /* 136 /*
@@ -163,7 +162,6 @@ struct scsi_host_template qla24xx_driver_template = {
163 .this_id = -1, 162 .this_id = -1,
164 .cmd_per_lun = 3, 163 .cmd_per_lun = 3,
165 .use_clustering = ENABLE_CLUSTERING, 164 .use_clustering = ENABLE_CLUSTERING,
166 .use_sg_chaining = ENABLE_SG_CHAINING,
167 .sg_tablesize = SG_ALL, 165 .sg_tablesize = SG_ALL,
168 166
169 .max_sectors = 0xFFFF, 167 .max_sectors = 0xFFFF,
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index d3f86646cb08..2e2b9fedffcc 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -94,7 +94,6 @@ static struct scsi_host_template qla4xxx_driver_template = {
94 .this_id = -1, 94 .this_id = -1,
95 .cmd_per_lun = 3, 95 .cmd_per_lun = 3,
96 .use_clustering = ENABLE_CLUSTERING, 96 .use_clustering = ENABLE_CLUSTERING,
97 .use_sg_chaining = ENABLE_SG_CHAINING,
98 .sg_tablesize = SG_ALL, 97 .sg_tablesize = SG_ALL,
99 98
100 .max_sectors = 0xFFFF, 99 .max_sectors = 0xFFFF,
diff --git a/drivers/scsi/qlogicfas.c b/drivers/scsi/qlogicfas.c
index 1769f965eedf..1e874f1fb5c6 100644
--- a/drivers/scsi/qlogicfas.c
+++ b/drivers/scsi/qlogicfas.c
@@ -197,7 +197,6 @@ static struct scsi_host_template qlogicfas_driver_template = {
197 .sg_tablesize = SG_ALL, 197 .sg_tablesize = SG_ALL,
198 .cmd_per_lun = 1, 198 .cmd_per_lun = 1,
199 .use_clustering = DISABLE_CLUSTERING, 199 .use_clustering = DISABLE_CLUSTERING,
200 .use_sg_chaining = ENABLE_SG_CHAINING,
201}; 200};
202 201
203static __init int qlogicfas_init(void) 202static __init int qlogicfas_init(void)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 1a9fba6a9f92..b35d19472caa 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -757,7 +757,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd)
757 "Notifying upper driver of completion " 757 "Notifying upper driver of completion "
758 "(result %x)\n", cmd->result)); 758 "(result %x)\n", cmd->result));
759 759
760 good_bytes = cmd->request_bufflen; 760 good_bytes = scsi_bufflen(cmd);
761 if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) { 761 if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) {
762 drv = scsi_cmd_to_driver(cmd); 762 drv = scsi_cmd_to_driver(cmd);
763 if (drv->done) 763 if (drv->done)
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 82c06f0a9d02..1541c174937a 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -280,6 +280,8 @@ static int resp_write(struct scsi_cmnd * SCpnt, unsigned long long lba,
280 unsigned int num, struct sdebug_dev_info * devip); 280 unsigned int num, struct sdebug_dev_info * devip);
281static int resp_report_luns(struct scsi_cmnd * SCpnt, 281static int resp_report_luns(struct scsi_cmnd * SCpnt,
282 struct sdebug_dev_info * devip); 282 struct sdebug_dev_info * devip);
283static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba,
284 unsigned int num, struct sdebug_dev_info *devip);
283static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, 285static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
284 int arr_len); 286 int arr_len);
285static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, 287static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
@@ -311,12 +313,48 @@ static void sdebug_max_tgts_luns(void);
311static struct device pseudo_primary; 313static struct device pseudo_primary;
312static struct bus_type pseudo_lld_bus; 314static struct bus_type pseudo_lld_bus;
313 315
316static void get_data_transfer_info(unsigned char *cmd,
317 unsigned long long *lba, unsigned int *num)
318{
319 int i;
320
321 switch (*cmd) {
322 case WRITE_16:
323 case READ_16:
324 for (*lba = 0, i = 0; i < 8; ++i) {
325 if (i > 0)
326 *lba <<= 8;
327 *lba += cmd[2 + i];
328 }
329 *num = cmd[13] + (cmd[12] << 8) +
330 (cmd[11] << 16) + (cmd[10] << 24);
331 break;
332 case WRITE_12:
333 case READ_12:
334 *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24);
335 *num = cmd[9] + (cmd[8] << 8) + (cmd[7] << 16) + (cmd[6] << 24);
336 break;
337 case WRITE_10:
338 case READ_10:
339 case XDWRITEREAD_10:
340 *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24);
341 *num = cmd[8] + (cmd[7] << 8);
342 break;
343 case WRITE_6:
344 case READ_6:
345 *lba = cmd[3] + (cmd[2] << 8) + ((cmd[1] & 0x1f) << 16);
346 *num = (0 == cmd[4]) ? 256 : cmd[4];
347 break;
348 default:
349 break;
350 }
351}
314 352
315static 353static
316int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) 354int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
317{ 355{
318 unsigned char *cmd = (unsigned char *) SCpnt->cmnd; 356 unsigned char *cmd = (unsigned char *) SCpnt->cmnd;
319 int len, k, j; 357 int len, k;
320 unsigned int num; 358 unsigned int num;
321 unsigned long long lba; 359 unsigned long long lba;
322 int errsts = 0; 360 int errsts = 0;
@@ -452,28 +490,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
452 break; 490 break;
453 if (scsi_debug_fake_rw) 491 if (scsi_debug_fake_rw)
454 break; 492 break;
455 if ((*cmd) == READ_16) { 493 get_data_transfer_info(cmd, &lba, &num);
456 for (lba = 0, j = 0; j < 8; ++j) {
457 if (j > 0)
458 lba <<= 8;
459 lba += cmd[2 + j];
460 }
461 num = cmd[13] + (cmd[12] << 8) +
462 (cmd[11] << 16) + (cmd[10] << 24);
463 } else if ((*cmd) == READ_12) {
464 lba = cmd[5] + (cmd[4] << 8) +
465 (cmd[3] << 16) + (cmd[2] << 24);
466 num = cmd[9] + (cmd[8] << 8) +
467 (cmd[7] << 16) + (cmd[6] << 24);
468 } else if ((*cmd) == READ_10) {
469 lba = cmd[5] + (cmd[4] << 8) +
470 (cmd[3] << 16) + (cmd[2] << 24);
471 num = cmd[8] + (cmd[7] << 8);
472 } else { /* READ (6) */
473 lba = cmd[3] + (cmd[2] << 8) +
474 ((cmd[1] & 0x1f) << 16);
475 num = (0 == cmd[4]) ? 256 : cmd[4];
476 }
477 errsts = resp_read(SCpnt, lba, num, devip); 494 errsts = resp_read(SCpnt, lba, num, devip);
478 if (inj_recovered && (0 == errsts)) { 495 if (inj_recovered && (0 == errsts)) {
479 mk_sense_buffer(devip, RECOVERED_ERROR, 496 mk_sense_buffer(devip, RECOVERED_ERROR,
@@ -500,28 +517,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
500 break; 517 break;
501 if (scsi_debug_fake_rw) 518 if (scsi_debug_fake_rw)
502 break; 519 break;
503 if ((*cmd) == WRITE_16) { 520 get_data_transfer_info(cmd, &lba, &num);
504 for (lba = 0, j = 0; j < 8; ++j) {
505 if (j > 0)
506 lba <<= 8;
507 lba += cmd[2 + j];
508 }
509 num = cmd[13] + (cmd[12] << 8) +
510 (cmd[11] << 16) + (cmd[10] << 24);
511 } else if ((*cmd) == WRITE_12) {
512 lba = cmd[5] + (cmd[4] << 8) +
513 (cmd[3] << 16) + (cmd[2] << 24);
514 num = cmd[9] + (cmd[8] << 8) +
515 (cmd[7] << 16) + (cmd[6] << 24);
516 } else if ((*cmd) == WRITE_10) {
517 lba = cmd[5] + (cmd[4] << 8) +
518 (cmd[3] << 16) + (cmd[2] << 24);
519 num = cmd[8] + (cmd[7] << 8);
520 } else { /* WRITE (6) */
521 lba = cmd[3] + (cmd[2] << 8) +
522 ((cmd[1] & 0x1f) << 16);
523 num = (0 == cmd[4]) ? 256 : cmd[4];
524 }
525 errsts = resp_write(SCpnt, lba, num, devip); 521 errsts = resp_write(SCpnt, lba, num, devip);
526 if (inj_recovered && (0 == errsts)) { 522 if (inj_recovered && (0 == errsts)) {
527 mk_sense_buffer(devip, RECOVERED_ERROR, 523 mk_sense_buffer(devip, RECOVERED_ERROR,
@@ -549,6 +545,28 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
549 case WRITE_BUFFER: 545 case WRITE_BUFFER:
550 errsts = check_readiness(SCpnt, 1, devip); 546 errsts = check_readiness(SCpnt, 1, devip);
551 break; 547 break;
548 case XDWRITEREAD_10:
549 if (!scsi_bidi_cmnd(SCpnt)) {
550 mk_sense_buffer(devip, ILLEGAL_REQUEST,
551 INVALID_FIELD_IN_CDB, 0);
552 errsts = check_condition_result;
553 break;
554 }
555
556 errsts = check_readiness(SCpnt, 0, devip);
557 if (errsts)
558 break;
559 if (scsi_debug_fake_rw)
560 break;
561 get_data_transfer_info(cmd, &lba, &num);
562 errsts = resp_read(SCpnt, lba, num, devip);
563 if (errsts)
564 break;
565 errsts = resp_write(SCpnt, lba, num, devip);
566 if (errsts)
567 break;
568 errsts = resp_xdwriteread(SCpnt, lba, num, devip);
569 break;
552 default: 570 default:
553 if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) 571 if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
554 printk(KERN_INFO "scsi_debug: Opcode: 0x%x not " 572 printk(KERN_INFO "scsi_debug: Opcode: 0x%x not "
@@ -601,18 +619,18 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
601 int k, req_len, act_len, len, active; 619 int k, req_len, act_len, len, active;
602 void * kaddr; 620 void * kaddr;
603 void * kaddr_off; 621 void * kaddr_off;
604 struct scatterlist * sg; 622 struct scatterlist *sg;
623 struct scsi_data_buffer *sdb = scsi_in(scp);
605 624
606 if (0 == scsi_bufflen(scp)) 625 if (!sdb->length)
607 return 0; 626 return 0;
608 if (NULL == scsi_sglist(scp)) 627 if (!sdb->table.sgl)
609 return (DID_ERROR << 16); 628 return (DID_ERROR << 16);
610 if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) || 629 if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE))
611 (scp->sc_data_direction == DMA_FROM_DEVICE)))
612 return (DID_ERROR << 16); 630 return (DID_ERROR << 16);
613 active = 1; 631 active = 1;
614 req_len = act_len = 0; 632 req_len = act_len = 0;
615 scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { 633 for_each_sg(sdb->table.sgl, sg, sdb->table.nents, k) {
616 if (active) { 634 if (active) {
617 kaddr = (unsigned char *) 635 kaddr = (unsigned char *)
618 kmap_atomic(sg_page(sg), KM_USER0); 636 kmap_atomic(sg_page(sg), KM_USER0);
@@ -630,10 +648,10 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
630 } 648 }
631 req_len += sg->length; 649 req_len += sg->length;
632 } 650 }
633 if (scsi_get_resid(scp)) 651 if (sdb->resid)
634 scsi_set_resid(scp, scsi_get_resid(scp) - act_len); 652 sdb->resid -= act_len;
635 else 653 else
636 scsi_set_resid(scp, req_len - act_len); 654 sdb->resid = req_len - act_len;
637 return 0; 655 return 0;
638} 656}
639 657
@@ -650,8 +668,7 @@ static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
650 return 0; 668 return 0;
651 if (NULL == scsi_sglist(scp)) 669 if (NULL == scsi_sglist(scp))
652 return -1; 670 return -1;
653 if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) || 671 if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_TO_DEVICE))
654 (scp->sc_data_direction == DMA_TO_DEVICE)))
655 return -1; 672 return -1;
656 req_len = fin = 0; 673 req_len = fin = 0;
657 scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { 674 scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) {
@@ -1956,6 +1973,50 @@ static int resp_report_luns(struct scsi_cmnd * scp,
1956 min((int)alloc_len, SDEBUG_RLUN_ARR_SZ)); 1973 min((int)alloc_len, SDEBUG_RLUN_ARR_SZ));
1957} 1974}
1958 1975
1976static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba,
1977 unsigned int num, struct sdebug_dev_info *devip)
1978{
1979 int i, j, ret = -1;
1980 unsigned char *kaddr, *buf;
1981 unsigned int offset;
1982 struct scatterlist *sg;
1983 struct scsi_data_buffer *sdb = scsi_in(scp);
1984
1985 /* better not to use temporary buffer. */
1986 buf = kmalloc(scsi_bufflen(scp), GFP_ATOMIC);
1987 if (!buf)
1988 return ret;
1989
1990 offset = 0;
1991 scsi_for_each_sg(scp, sg, scsi_sg_count(scp), i) {
1992 kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0);
1993 if (!kaddr)
1994 goto out;
1995
1996 memcpy(buf + offset, kaddr + sg->offset, sg->length);
1997 offset += sg->length;
1998 kunmap_atomic(kaddr, KM_USER0);
1999 }
2000
2001 offset = 0;
2002 for_each_sg(sdb->table.sgl, sg, sdb->table.nents, i) {
2003 kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0);
2004 if (!kaddr)
2005 goto out;
2006
2007 for (j = 0; j < sg->length; j++)
2008 *(kaddr + sg->offset + j) ^= *(buf + offset + j);
2009
2010 offset += sg->length;
2011 kunmap_atomic(kaddr, KM_USER0);
2012 }
2013 ret = 0;
2014out:
2015 kfree(buf);
2016
2017 return ret;
2018}
2019
1959/* When timer goes off this function is called. */ 2020/* When timer goes off this function is called. */
1960static void timer_intr_handler(unsigned long indx) 2021static void timer_intr_handler(unsigned long indx)
1961{ 2022{
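resp_xdwriteread above models the XDWRITEREAD(10) data path in this emulation: the outgoing (write) data is gathered into a bounce buffer, then XORed byte-for-byte into the incoming (read) side of the bidirectional command. The core operation, reduced to a plain userspace sketch over two flat buffers (segment walking and the atomic kmaps are omitted):

#include <stdio.h>

/* XOR 'len' bytes of 'out' (the data that was written) into 'in'
 * (the data that was read back), mirroring what resp_xdwriteread does
 * per scatterlist segment. */
static void xdwriteread_xor(unsigned char *in, const unsigned char *out,
                            size_t len)
{
    for (size_t i = 0; i < len; i++)
        in[i] ^= out[i];
}

int main(void)
{
    unsigned char read_back[8] = "ABCDEFG";   /* data returned by the read side */
    unsigned char written[8]   = "ABCDEFG";   /* data supplied on the write side */
    int all_zero = 1;

    xdwriteread_xor(read_back, written, sizeof(read_back));

    /* Identical read/write data XORs to all zeroes -- a common way to
     * verify media contents against a host-side copy. */
    for (size_t i = 0; i < sizeof(read_back); i++)
        if (read_back[i])
            all_zero = 0;
    printf("all zero: %d\n", all_zero);
    return 0;
}
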
@@ -1989,6 +2050,7 @@ static int scsi_debug_slave_alloc(struct scsi_device * sdp)
1989 if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) 2050 if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
1990 printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n", 2051 printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n",
1991 sdp->host->host_no, sdp->channel, sdp->id, sdp->lun); 2052 sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
2053 set_bit(QUEUE_FLAG_BIDI, &sdp->request_queue->queue_flags);
1992 return 0; 2054 return 0;
1993} 2055}
1994 2056
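get_data_transfer_info folds four copies of the same CDB parsing into one switch keyed on the opcode; the LBA and transfer-length fields are big-endian and sit at opcode-dependent offsets. A standalone sketch of the 10-byte case (the opcode value and field offsets follow SBC; the helper name is just for the example):

#include <stdio.h>
#include <stdint.h>

/* Decode the big-endian LBA and transfer length from a 10-byte
 * READ(10)/WRITE(10) CDB: bytes 2-5 hold the LBA, bytes 7-8 the length. */
static void decode_rw10(const uint8_t *cdb, uint64_t *lba, uint32_t *num)
{
    *lba = ((uint64_t)cdb[2] << 24) | (cdb[3] << 16) | (cdb[4] << 8) | cdb[5];
    *num = (cdb[7] << 8) | cdb[8];
}

int main(void)
{
    /* READ(10), LBA 0x12345678, 16 blocks. */
    const uint8_t cdb[10] = { 0x28, 0, 0x12, 0x34, 0x56, 0x78, 0, 0, 16, 0 };
    uint64_t lba;
    uint32_t num;

    decode_rw10(cdb, &lba, &num);
    printf("lba=0x%llx num=%u\n", (unsigned long long)lba, num);
    return 0;
}
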
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 547e85aa414f..045a0868fc7b 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -617,29 +617,27 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses,
617 ses->cmd_len = scmd->cmd_len; 617 ses->cmd_len = scmd->cmd_len;
618 memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd)); 618 memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd));
619 ses->data_direction = scmd->sc_data_direction; 619 ses->data_direction = scmd->sc_data_direction;
620 ses->bufflen = scmd->request_bufflen; 620 ses->sdb = scmd->sdb;
621 ses->buffer = scmd->request_buffer; 621 ses->next_rq = scmd->request->next_rq;
622 ses->use_sg = scmd->use_sg;
623 ses->resid = scmd->resid;
624 ses->result = scmd->result; 622 ses->result = scmd->result;
625 623
624 memset(&scmd->sdb, 0, sizeof(scmd->sdb));
625 scmd->request->next_rq = NULL;
626
626 if (sense_bytes) { 627 if (sense_bytes) {
627 scmd->request_bufflen = min_t(unsigned, 628 scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE,
628 SCSI_SENSE_BUFFERSIZE, sense_bytes); 629 sense_bytes);
629 sg_init_one(&ses->sense_sgl, scmd->sense_buffer, 630 sg_init_one(&ses->sense_sgl, scmd->sense_buffer,
630 scmd->request_bufflen); 631 scmd->sdb.length);
631 scmd->request_buffer = &ses->sense_sgl; 632 scmd->sdb.table.sgl = &ses->sense_sgl;
632 scmd->sc_data_direction = DMA_FROM_DEVICE; 633 scmd->sc_data_direction = DMA_FROM_DEVICE;
633 scmd->use_sg = 1; 634 scmd->sdb.table.nents = 1;
634 memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); 635 memset(scmd->cmnd, 0, sizeof(scmd->cmnd));
635 scmd->cmnd[0] = REQUEST_SENSE; 636 scmd->cmnd[0] = REQUEST_SENSE;
636 scmd->cmnd[4] = scmd->request_bufflen; 637 scmd->cmnd[4] = scmd->sdb.length;
637 scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); 638 scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]);
638 } else { 639 } else {
639 scmd->request_buffer = NULL;
640 scmd->request_bufflen = 0;
641 scmd->sc_data_direction = DMA_NONE; 640 scmd->sc_data_direction = DMA_NONE;
642 scmd->use_sg = 0;
643 if (cmnd) { 641 if (cmnd) {
644 memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); 642 memset(scmd->cmnd, 0, sizeof(scmd->cmnd));
645 memcpy(scmd->cmnd, cmnd, cmnd_size); 643 memcpy(scmd->cmnd, cmnd, cmnd_size);
@@ -676,10 +674,8 @@ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses)
676 scmd->cmd_len = ses->cmd_len; 674 scmd->cmd_len = ses->cmd_len;
677 memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd)); 675 memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd));
678 scmd->sc_data_direction = ses->data_direction; 676 scmd->sc_data_direction = ses->data_direction;
679 scmd->request_bufflen = ses->bufflen; 677 scmd->sdb = ses->sdb;
680 scmd->request_buffer = ses->buffer; 678 scmd->request->next_rq = ses->next_rq;
681 scmd->use_sg = ses->use_sg;
682 scmd->resid = ses->resid;
683 scmd->result = ses->result; 679 scmd->result = ses->result;
684} 680}
685EXPORT_SYMBOL(scsi_eh_restore_cmnd); 681EXPORT_SYMBOL(scsi_eh_restore_cmnd);
@@ -1700,8 +1696,7 @@ scsi_reset_provider(struct scsi_device *dev, int flag)
1700 memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd)); 1696 memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd));
1701 1697
1702 scmd->scsi_done = scsi_reset_provider_done_command; 1698 scmd->scsi_done = scsi_reset_provider_done_command;
1703 scmd->request_buffer = NULL; 1699 memset(&scmd->sdb, 0, sizeof(scmd->sdb));
1704 scmd->request_bufflen = 0;
1705 1700
1706 scmd->cmd_len = 0; 1701 scmd->cmd_len = 0;
1707 1702
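With the data-buffer fields gathered into scmd->sdb, error handling can snapshot and restore the whole descriptor by structure assignment instead of copying request_buffer/request_bufflen/use_sg/resid one by one. The shape of that save / temporary-override / restore dance, in a trimmed-down userspace form (struct and field names are illustrative):

#include <stdio.h>
#include <string.h>

/* Illustrative stand-in for scsi_data_buffer: one table + length + resid. */
struct data_buffer {
    void    *sgl;
    int      nents;
    unsigned length;
    int      resid;
};

struct command {
    struct data_buffer sdb;
    unsigned char sense_buffer[96];
};

int main(void)
{
    unsigned char io_payload[512];
    struct command cmd = {
        .sdb = { .sgl = io_payload, .nents = 1, .length = sizeof(io_payload) },
    };

    /* "prep": save the original descriptor, then retarget the command
     * at the sense buffer for a REQUEST SENSE style transfer. */
    struct data_buffer saved = cmd.sdb;
    memset(&cmd.sdb, 0, sizeof(cmd.sdb));
    cmd.sdb.sgl    = cmd.sense_buffer;
    cmd.sdb.nents  = 1;
    cmd.sdb.length = sizeof(cmd.sense_buffer);

    printf("temporary length: %u\n", cmd.sdb.length);

    /* "restore": a single structure assignment puts everything back. */
    cmd.sdb = saved;
    printf("restored length: %u\n", cmd.sdb.length);
    return 0;
}
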
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7c4c889c5221..b12fb310e399 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/bio.h> 10#include <linux/bio.h>
11#include <linux/bitops.h>
11#include <linux/blkdev.h> 12#include <linux/blkdev.h>
12#include <linux/completion.h> 13#include <linux/completion.h>
13#include <linux/kernel.h> 14#include <linux/kernel.h>
@@ -34,13 +35,6 @@
34#define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools) 35#define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools)
35#define SG_MEMPOOL_SIZE 2 36#define SG_MEMPOOL_SIZE 2
36 37
37/*
38 * The maximum number of SG segments that we will put inside a scatterlist
39 * (unless chaining is used). Should ideally fit inside a single page, to
40 * avoid a higher order allocation.
41 */
42#define SCSI_MAX_SG_SEGMENTS 128
43
44struct scsi_host_sg_pool { 38struct scsi_host_sg_pool {
45 size_t size; 39 size_t size;
46 char *name; 40 char *name;
@@ -48,22 +42,31 @@ struct scsi_host_sg_pool {
48 mempool_t *pool; 42 mempool_t *pool;
49}; 43};
50 44
51#define SP(x) { x, "sgpool-" #x } 45#define SP(x) { x, "sgpool-" __stringify(x) }
46#if (SCSI_MAX_SG_SEGMENTS < 32)
47#error SCSI_MAX_SG_SEGMENTS is too small (must be 32 or greater)
48#endif
52static struct scsi_host_sg_pool scsi_sg_pools[] = { 49static struct scsi_host_sg_pool scsi_sg_pools[] = {
53 SP(8), 50 SP(8),
54 SP(16), 51 SP(16),
55#if (SCSI_MAX_SG_SEGMENTS > 16)
56 SP(32),
57#if (SCSI_MAX_SG_SEGMENTS > 32) 52#if (SCSI_MAX_SG_SEGMENTS > 32)
58 SP(64), 53 SP(32),
59#if (SCSI_MAX_SG_SEGMENTS > 64) 54#if (SCSI_MAX_SG_SEGMENTS > 64)
55 SP(64),
56#if (SCSI_MAX_SG_SEGMENTS > 128)
60 SP(128), 57 SP(128),
58#if (SCSI_MAX_SG_SEGMENTS > 256)
59#error SCSI_MAX_SG_SEGMENTS is too large (256 MAX)
60#endif
61#endif 61#endif
62#endif 62#endif
63#endif 63#endif
64 SP(SCSI_MAX_SG_SEGMENTS)
64}; 65};
65#undef SP 66#undef SP
66 67
68static struct kmem_cache *scsi_bidi_sdb_cache;
69
67static void scsi_run_queue(struct request_queue *q); 70static void scsi_run_queue(struct request_queue *q);
68 71
69/* 72/*
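The SP() change swaps the bare # operator for __stringify() so that a macro argument such as SCSI_MAX_SG_SEGMENTS is expanded before being turned into the "sgpool-..." name; with plain #x the last pool would have been literally named "sgpool-SCSI_MAX_SG_SEGMENTS". The classic two-level trick, shown standalone (the STR macro names below mirror the usual stringify idiom, not kernel headers):

#include <stdio.h>

#define POOL_SEGMENTS 128

/* One level of # stringizes the token exactly as written... */
#define STR_RAW(x)  #x
/* ...two levels let the argument expand first, like __stringify(). */
#define STR(x)      STR_RAW(x)

int main(void)
{
    printf("raw:      sgpool-%s\n", STR_RAW(POOL_SEGMENTS)); /* sgpool-POOL_SEGMENTS */
    printf("expanded: sgpool-%s\n", STR(POOL_SEGMENTS));     /* sgpool-128 */
    return 0;
}
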
@@ -440,7 +443,7 @@ EXPORT_SYMBOL_GPL(scsi_execute_async);
440static void scsi_init_cmd_errh(struct scsi_cmnd *cmd) 443static void scsi_init_cmd_errh(struct scsi_cmnd *cmd)
441{ 444{
442 cmd->serial_number = 0; 445 cmd->serial_number = 0;
443 cmd->resid = 0; 446 scsi_set_resid(cmd, 0);
444 memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); 447 memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
445 if (cmd->cmd_len == 0) 448 if (cmd->cmd_len == 0)
446 cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]); 449 cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]);
@@ -690,42 +693,16 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
690 return NULL; 693 return NULL;
691} 694}
692 695
693/*
694 * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
695 * is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
696 */
697#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048
698
699static inline unsigned int scsi_sgtable_index(unsigned short nents) 696static inline unsigned int scsi_sgtable_index(unsigned short nents)
700{ 697{
701 unsigned int index; 698 unsigned int index;
702 699
703 switch (nents) { 700 BUG_ON(nents > SCSI_MAX_SG_SEGMENTS);
704 case 1 ... 8: 701
702 if (nents <= 8)
705 index = 0; 703 index = 0;
706 break; 704 else
707 case 9 ... 16: 705 index = get_count_order(nents) - 3;
708 index = 1;
709 break;
710#if (SCSI_MAX_SG_SEGMENTS > 16)
711 case 17 ... 32:
712 index = 2;
713 break;
714#if (SCSI_MAX_SG_SEGMENTS > 32)
715 case 33 ... 64:
716 index = 3;
717 break;
718#if (SCSI_MAX_SG_SEGMENTS > 64)
719 case 65 ... 128:
720 index = 4;
721 break;
722#endif
723#endif
724#endif
725 default:
726 printk(KERN_ERR "scsi: bad segment count=%d\n", nents);
727 BUG();
728 }
729 706
730 return index; 707 return index;
731} 708}
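The switch ladder in scsi_sgtable_index collapses to arithmetic: the pools hold 8, 16, 32, 64, 128 and 256 entries, so for nents above 8 the matching pool index is ceil(log2(nents)) - 3, which is what get_count_order(nents) - 3 computes. A userspace sketch of the same mapping (count_order below is a local reimplementation, not the kernel helper):

#include <stdio.h>

/* ceil(log2(n)) for n >= 1, i.e. what the kernel's get_count_order() returns. */
static int count_order(unsigned int n)
{
    int order = 0;
    unsigned int v = 1;

    while (v < n) {
        v <<= 1;
        order++;
    }
    return order;
}

static unsigned int sgtable_index(unsigned int nents)
{
    return (nents <= 8) ? 0 : (unsigned int)(count_order(nents) - 3);
}

int main(void)
{
    static const unsigned int pool_size[] = { 8, 16, 32, 64, 128, 256 };
    unsigned int samples[] = { 1, 8, 9, 16, 17, 33, 65, 129, 256 };

    for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        unsigned int idx = sgtable_index(samples[i]);
        printf("nents=%3u -> pool %u (%u entries)\n",
               samples[i], idx, pool_size[idx]);
    }
    return 0;
}
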
@@ -746,31 +723,27 @@ static struct scatterlist *scsi_sg_alloc(unsigned int nents, gfp_t gfp_mask)
746 return mempool_alloc(sgp->pool, gfp_mask); 723 return mempool_alloc(sgp->pool, gfp_mask);
747} 724}
748 725
749int scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask) 726static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb, int nents,
727 gfp_t gfp_mask)
750{ 728{
751 int ret; 729 int ret;
752 730
753 BUG_ON(!cmd->use_sg); 731 BUG_ON(!nents);
754 732
755 ret = __sg_alloc_table(&cmd->sg_table, cmd->use_sg, 733 ret = __sg_alloc_table(&sdb->table, nents, SCSI_MAX_SG_SEGMENTS,
756 SCSI_MAX_SG_SEGMENTS, gfp_mask, scsi_sg_alloc); 734 gfp_mask, scsi_sg_alloc);
757 if (unlikely(ret)) 735 if (unlikely(ret))
758 __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, 736 __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS,
759 scsi_sg_free); 737 scsi_sg_free);
760 738
761 cmd->request_buffer = cmd->sg_table.sgl;
762 return ret; 739 return ret;
763} 740}
764 741
765EXPORT_SYMBOL(scsi_alloc_sgtable); 742static void scsi_free_sgtable(struct scsi_data_buffer *sdb)
766
767void scsi_free_sgtable(struct scsi_cmnd *cmd)
768{ 743{
769 __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); 744 __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free);
770} 745}
771 746
772EXPORT_SYMBOL(scsi_free_sgtable);
773
774/* 747/*
775 * Function: scsi_release_buffers() 748 * Function: scsi_release_buffers()
776 * 749 *
@@ -788,17 +761,49 @@ EXPORT_SYMBOL(scsi_free_sgtable);
788 * the scatter-gather table, and potentially any bounce 761 * the scatter-gather table, and potentially any bounce
789 * buffers. 762 * buffers.
790 */ 763 */
791static void scsi_release_buffers(struct scsi_cmnd *cmd) 764void scsi_release_buffers(struct scsi_cmnd *cmd)
765{
766 if (cmd->sdb.table.nents)
767 scsi_free_sgtable(&cmd->sdb);
768
769 memset(&cmd->sdb, 0, sizeof(cmd->sdb));
770
771 if (scsi_bidi_cmnd(cmd)) {
772 struct scsi_data_buffer *bidi_sdb =
773 cmd->request->next_rq->special;
774 scsi_free_sgtable(bidi_sdb);
775 kmem_cache_free(scsi_bidi_sdb_cache, bidi_sdb);
776 cmd->request->next_rq->special = NULL;
777 }
778}
779EXPORT_SYMBOL(scsi_release_buffers);
780
781/*
782 * Bidi commands Must be complete as a whole, both sides at once.
783 * If part of the bytes were written and lld returned
784 * scsi_in()->resid and/or scsi_out()->resid this information will be left
785 * in req->data_len and req->next_rq->data_len. The upper-layer driver can
786 * decide what to do with this information.
787 */
788void scsi_end_bidi_request(struct scsi_cmnd *cmd)
792{ 789{
793 if (cmd->use_sg) 790 struct request *req = cmd->request;
794 scsi_free_sgtable(cmd); 791 unsigned int dlen = req->data_len;
792 unsigned int next_dlen = req->next_rq->data_len;
793
794 req->data_len = scsi_out(cmd)->resid;
795 req->next_rq->data_len = scsi_in(cmd)->resid;
796
797 /* The req and req->next_rq have not been completed */
798 BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen));
799
800 scsi_release_buffers(cmd);
795 801
796 /* 802 /*
797 * Zero these out. They now point to freed memory, and it is 803 * This will goose the queue request function at the end, so we don't
798 * dangerous to hang onto the pointers. 804 * need to worry about launching another command.
799 */ 805 */
800 cmd->request_buffer = NULL; 806 scsi_next_command(cmd);
801 cmd->request_bufflen = 0;
802} 807}
803 808
804/* 809/*
@@ -832,7 +837,7 @@ static void scsi_release_buffers(struct scsi_cmnd *cmd)
832void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) 837void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
833{ 838{
834 int result = cmd->result; 839 int result = cmd->result;
835 int this_count = cmd->request_bufflen; 840 int this_count = scsi_bufflen(cmd);
836 struct request_queue *q = cmd->device->request_queue; 841 struct request_queue *q = cmd->device->request_queue;
837 struct request *req = cmd->request; 842 struct request *req = cmd->request;
838 int clear_errors = 1; 843 int clear_errors = 1;
@@ -840,8 +845,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
840 int sense_valid = 0; 845 int sense_valid = 0;
841 int sense_deferred = 0; 846 int sense_deferred = 0;
842 847
843 scsi_release_buffers(cmd);
844
845 if (result) { 848 if (result) {
846 sense_valid = scsi_command_normalize_sense(cmd, &sshdr); 849 sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
847 if (sense_valid) 850 if (sense_valid)
@@ -864,9 +867,17 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
864 req->sense_len = len; 867 req->sense_len = len;
865 } 868 }
866 } 869 }
867 req->data_len = cmd->resid; 870 if (scsi_bidi_cmnd(cmd)) {
871 /* will also release_buffers */
872 scsi_end_bidi_request(cmd);
873 return;
874 }
875 req->data_len = scsi_get_resid(cmd);
868 } 876 }
869 877
878 BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */
879 scsi_release_buffers(cmd);
880
870 /* 881 /*
871 * Next deal with any sectors which we were able to correctly 882 * Next deal with any sectors which we were able to correctly
872 * handle. 883 * handle.
@@ -874,7 +885,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
874 SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, " 885 SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, "
875 "%d bytes done.\n", 886 "%d bytes done.\n",
876 req->nr_sectors, good_bytes)); 887 req->nr_sectors, good_bytes));
877 SCSI_LOG_HLCOMPLETE(1, printk("use_sg is %d\n", cmd->use_sg));
878 888
879 if (clear_errors) 889 if (clear_errors)
880 req->errors = 0; 890 req->errors = 0;
@@ -991,52 +1001,80 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
991 scsi_end_request(cmd, -EIO, this_count, !result); 1001 scsi_end_request(cmd, -EIO, this_count, !result);
992} 1002}
993 1003
994/* 1004static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
995 * Function: scsi_init_io() 1005 gfp_t gfp_mask)
996 *
997 * Purpose: SCSI I/O initialize function.
998 *
999 * Arguments: cmd - Command descriptor we wish to initialize
1000 *
1001 * Returns: 0 on success
1002 * BLKPREP_DEFER if the failure is retryable
1003 */
1004static int scsi_init_io(struct scsi_cmnd *cmd)
1005{ 1006{
1006 struct request *req = cmd->request; 1007 int count;
1007 int count;
1008
1009 /*
1010 * We used to not use scatter-gather for single segment request,
1011 * but now we do (it makes highmem I/O easier to support without
1012 * kmapping pages)
1013 */
1014 cmd->use_sg = req->nr_phys_segments;
1015 1008
1016 /* 1009 /*
1017 * If sg table allocation fails, requeue request later. 1010 * If sg table allocation fails, requeue request later.
1018 */ 1011 */
1019 if (unlikely(scsi_alloc_sgtable(cmd, GFP_ATOMIC))) { 1012 if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
1020 scsi_unprep_request(req); 1013 gfp_mask))) {
1021 return BLKPREP_DEFER; 1014 return BLKPREP_DEFER;
1022 } 1015 }
1023 1016
1024 req->buffer = NULL; 1017 req->buffer = NULL;
1025 if (blk_pc_request(req)) 1018 if (blk_pc_request(req))
1026 cmd->request_bufflen = req->data_len; 1019 sdb->length = req->data_len;
1027 else 1020 else
1028 cmd->request_bufflen = req->nr_sectors << 9; 1021 sdb->length = req->nr_sectors << 9;
1029 1022
1030 /* 1023 /*
1031 * Next, walk the list, and fill in the addresses and sizes of 1024 * Next, walk the list, and fill in the addresses and sizes of
1032 * each segment. 1025 * each segment.
1033 */ 1026 */
1034 count = blk_rq_map_sg(req->q, req, cmd->request_buffer); 1027 count = blk_rq_map_sg(req->q, req, sdb->table.sgl);
1035 BUG_ON(count > cmd->use_sg); 1028 BUG_ON(count > sdb->table.nents);
1036 cmd->use_sg = count; 1029 sdb->table.nents = count;
1037 return BLKPREP_OK; 1030 return BLKPREP_OK;
1038} 1031}
1039 1032
1033/*
1034 * Function: scsi_init_io()
1035 *
1036 * Purpose: SCSI I/O initialize function.
1037 *
1038 * Arguments: cmd - Command descriptor we wish to initialize
1039 *
1040 * Returns: 0 on success
1041 * BLKPREP_DEFER if the failure is retryable
1042 * BLKPREP_KILL if the failure is fatal
1043 */
1044int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
1045{
1046 int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask);
1047 if (error)
1048 goto err_exit;
1049
1050 if (blk_bidi_rq(cmd->request)) {
1051 struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc(
1052 scsi_bidi_sdb_cache, GFP_ATOMIC);
1053 if (!bidi_sdb) {
1054 error = BLKPREP_DEFER;
1055 goto err_exit;
1056 }
1057
1058 cmd->request->next_rq->special = bidi_sdb;
1059 error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb,
1060 GFP_ATOMIC);
1061 if (error)
1062 goto err_exit;
1063 }
1064
1065 return BLKPREP_OK ;
1066
1067err_exit:
1068 scsi_release_buffers(cmd);
1069 if (error == BLKPREP_KILL)
1070 scsi_put_command(cmd);
1071 else /* BLKPREP_DEFER */
1072 scsi_unprep_request(cmd->request);
1073
1074 return error;
1075}
1076EXPORT_SYMBOL(scsi_init_io);
1077
1040static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev, 1078static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev,
1041 struct request *req) 1079 struct request *req)
1042{ 1080{
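scsi_init_io now has two allocation steps (the main sg table and, for bidirectional requests, a second scsi_data_buffer) and funnels every failure through one err_exit label so the partially built state is torn down in a single place. The shape of that goto-unwind idiom in plain C (the allocation functions and names below are placeholders):

#include <stdio.h>
#include <stdlib.h>

struct io_state {
    void *sg_table;   /* stands in for the main scatterlist table */
    void *bidi_sdb;   /* stands in for the second, bidi data buffer */
};

static int init_io(struct io_state *s, int bidirectional)
{
    int error = -1;

    s->sg_table = NULL;
    s->bidi_sdb = NULL;

    s->sg_table = malloc(64);
    if (!s->sg_table)
        goto err_exit;

    if (bidirectional) {
        s->bidi_sdb = calloc(1, 32);
        if (!s->bidi_sdb)
            goto err_exit;
    }

    return 0;                     /* both steps succeeded */

err_exit:
    /* One cleanup path handles every failure point above. */
    free(s->bidi_sdb);
    free(s->sg_table);
    s->sg_table = s->bidi_sdb = NULL;
    return error;
}

int main(void)
{
    struct io_state s = { 0 };

    printf("init_io: %d\n", init_io(&s, 1));
    free(s.bidi_sdb);
    free(s.sg_table);
    return 0;
}
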
@@ -1081,16 +1119,14 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
1081 1119
1082 BUG_ON(!req->nr_phys_segments); 1120 BUG_ON(!req->nr_phys_segments);
1083 1121
1084 ret = scsi_init_io(cmd); 1122 ret = scsi_init_io(cmd, GFP_ATOMIC);
1085 if (unlikely(ret)) 1123 if (unlikely(ret))
1086 return ret; 1124 return ret;
1087 } else { 1125 } else {
1088 BUG_ON(req->data_len); 1126 BUG_ON(req->data_len);
1089 BUG_ON(req->data); 1127 BUG_ON(req->data);
1090 1128
1091 cmd->request_bufflen = 0; 1129 memset(&cmd->sdb, 0, sizeof(cmd->sdb));
1092 cmd->request_buffer = NULL;
1093 cmd->use_sg = 0;
1094 req->buffer = NULL; 1130 req->buffer = NULL;
1095 } 1131 }
1096 1132
@@ -1132,7 +1168,7 @@ int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
1132 if (unlikely(!cmd)) 1168 if (unlikely(!cmd))
1133 return BLKPREP_DEFER; 1169 return BLKPREP_DEFER;
1134 1170
1135 return scsi_init_io(cmd); 1171 return scsi_init_io(cmd, GFP_ATOMIC);
1136} 1172}
1137EXPORT_SYMBOL(scsi_setup_fs_cmnd); 1173EXPORT_SYMBOL(scsi_setup_fs_cmnd);
1138 1174
@@ -1542,20 +1578,7 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
1542 * this limit is imposed by hardware restrictions 1578 * this limit is imposed by hardware restrictions
1543 */ 1579 */
1544 blk_queue_max_hw_segments(q, shost->sg_tablesize); 1580 blk_queue_max_hw_segments(q, shost->sg_tablesize);
1545 1581 blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
1546 /*
1547 * In the future, sg chaining support will be mandatory and this
1548 * ifdef can then go away. Right now we don't have all archs
1549 * converted, so better keep it safe.
1550 */
1551#ifdef ARCH_HAS_SG_CHAIN
1552 if (shost->use_sg_chaining)
1553 blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
1554 else
1555 blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
1556#else
1557 blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
1558#endif
1559 1582
1560 blk_queue_max_sectors(q, shost->max_sectors); 1583 blk_queue_max_sectors(q, shost->max_sectors);
1561 blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); 1584 blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
@@ -1654,6 +1677,14 @@ int __init scsi_init_queue(void)
1654 return -ENOMEM; 1677 return -ENOMEM;
1655 } 1678 }
1656 1679
1680 scsi_bidi_sdb_cache = kmem_cache_create("scsi_bidi_sdb",
1681 sizeof(struct scsi_data_buffer),
1682 0, 0, NULL);
1683 if (!scsi_bidi_sdb_cache) {
1684 printk(KERN_ERR "SCSI: can't init scsi bidi sdb cache\n");
1685 goto cleanup_io_context;
1686 }
1687
1657 for (i = 0; i < SG_MEMPOOL_NR; i++) { 1688 for (i = 0; i < SG_MEMPOOL_NR; i++) {
1658 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; 1689 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
1659 int size = sgp->size * sizeof(struct scatterlist); 1690 int size = sgp->size * sizeof(struct scatterlist);
@@ -1663,6 +1694,7 @@ int __init scsi_init_queue(void)
1663 if (!sgp->slab) { 1694 if (!sgp->slab) {
1664 printk(KERN_ERR "SCSI: can't init sg slab %s\n", 1695 printk(KERN_ERR "SCSI: can't init sg slab %s\n",
1665 sgp->name); 1696 sgp->name);
1697 goto cleanup_bidi_sdb;
1666 } 1698 }
1667 1699
1668 sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, 1700 sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
@@ -1670,10 +1702,25 @@ int __init scsi_init_queue(void)
1670 if (!sgp->pool) { 1702 if (!sgp->pool) {
1671 printk(KERN_ERR "SCSI: can't init sg mempool %s\n", 1703 printk(KERN_ERR "SCSI: can't init sg mempool %s\n",
1672 sgp->name); 1704 sgp->name);
1705 goto cleanup_bidi_sdb;
1673 } 1706 }
1674 } 1707 }
1675 1708
1676 return 0; 1709 return 0;
1710
1711cleanup_bidi_sdb:
1712 for (i = 0; i < SG_MEMPOOL_NR; i++) {
1713 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
1714 if (sgp->pool)
1715 mempool_destroy(sgp->pool);
1716 if (sgp->slab)
1717 kmem_cache_destroy(sgp->slab);
1718 }
1719 kmem_cache_destroy(scsi_bidi_sdb_cache);
1720cleanup_io_context:
1721 kmem_cache_destroy(scsi_io_context_cache);
1722
1723 return -ENOMEM;
1677} 1724}
1678 1725
1679void scsi_exit_queue(void) 1726void scsi_exit_queue(void)
@@ -1681,6 +1728,7 @@ void scsi_exit_queue(void)
1681 int i; 1728 int i;
1682 1729
1683 kmem_cache_destroy(scsi_io_context_cache); 1730 kmem_cache_destroy(scsi_io_context_cache);
1731 kmem_cache_destroy(scsi_bidi_sdb_cache);
1684 1732
1685 for (i = 0; i < SG_MEMPOOL_NR; i++) { 1733 for (i = 0; i < SG_MEMPOOL_NR; i++) {
1686 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; 1734 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c
index 01e03f3f6ffa..91630baea532 100644
--- a/drivers/scsi/scsi_tgt_lib.c
+++ b/drivers/scsi/scsi_tgt_lib.c
@@ -331,8 +331,7 @@ static void scsi_tgt_cmd_done(struct scsi_cmnd *cmd)
331 331
332 scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag); 332 scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag);
333 333
334 if (scsi_sglist(cmd)) 334 scsi_release_buffers(cmd);
335 scsi_free_sgtable(cmd);
336 335
337 queue_work(scsi_tgtd, &tcmd->work); 336 queue_work(scsi_tgtd, &tcmd->work);
338} 337}
@@ -353,25 +352,6 @@ static int scsi_tgt_transfer_response(struct scsi_cmnd *cmd)
353 return 0; 352 return 0;
354} 353}
355 354
356static int scsi_tgt_init_cmd(struct scsi_cmnd *cmd, gfp_t gfp_mask)
357{
358 struct request *rq = cmd->request;
359 int count;
360
361 cmd->use_sg = rq->nr_phys_segments;
362 if (scsi_alloc_sgtable(cmd, gfp_mask))
363 return -ENOMEM;
364
365 cmd->request_bufflen = rq->data_len;
366
367 dprintk("cmd %p cnt %d %lu\n", cmd, scsi_sg_count(cmd),
368 rq_data_dir(rq));
369 count = blk_rq_map_sg(rq->q, rq, scsi_sglist(cmd));
370 BUG_ON(count > cmd->use_sg);
371 cmd->use_sg = count;
372 return 0;
373}
374
375/* TODO: test this crap and replace bio_map_user with new interface maybe */ 355/* TODO: test this crap and replace bio_map_user with new interface maybe */
376static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd, 356static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
377 unsigned long uaddr, unsigned int len, int rw) 357 unsigned long uaddr, unsigned int len, int rw)
@@ -397,9 +377,11 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
397 } 377 }
398 378
399 tcmd->bio = rq->bio; 379 tcmd->bio = rq->bio;
400 err = scsi_tgt_init_cmd(cmd, GFP_KERNEL); 380 err = scsi_init_io(cmd, GFP_KERNEL);
401 if (err) 381 if (err) {
382 scsi_release_buffers(cmd);
402 goto unmap_rq; 383 goto unmap_rq;
384 }
403 385
404 return 0; 386 return 0;
405 387
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 24eba3118b5a..51a5557f42dd 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -519,7 +519,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
519 SCpnt->cmnd[4] = (unsigned char) this_count; 519 SCpnt->cmnd[4] = (unsigned char) this_count;
520 SCpnt->cmnd[5] = 0; 520 SCpnt->cmnd[5] = 0;
521 } 521 }
522 SCpnt->request_bufflen = this_count * sdp->sector_size; 522 SCpnt->sdb.length = this_count * sdp->sector_size;
523 523
524 /* 524 /*
525 * We shouldn't disconnect in the middle of a sector, so with a dumb 525 * We shouldn't disconnect in the middle of a sector, so with a dumb
@@ -926,7 +926,7 @@ static struct block_device_operations sd_fops = {
926static int sd_done(struct scsi_cmnd *SCpnt) 926static int sd_done(struct scsi_cmnd *SCpnt)
927{ 927{
928 int result = SCpnt->result; 928 int result = SCpnt->result;
929 unsigned int xfer_size = SCpnt->request_bufflen; 929 unsigned int xfer_size = scsi_bufflen(SCpnt);
930 unsigned int good_bytes = result ? 0 : xfer_size; 930 unsigned int good_bytes = result ? 0 : xfer_size;
931 u64 start_lba = SCpnt->request->sector; 931 u64 start_lba = SCpnt->request->sector;
932 u64 bad_lba; 932 u64 bad_lba;
diff --git a/drivers/scsi/sgiwd93.c b/drivers/scsi/sgiwd93.c
index d4ebe8c67ba9..26cfc56c7091 100644
--- a/drivers/scsi/sgiwd93.c
+++ b/drivers/scsi/sgiwd93.c
@@ -33,10 +33,9 @@
33 33
34struct ip22_hostdata { 34struct ip22_hostdata {
35 struct WD33C93_hostdata wh; 35 struct WD33C93_hostdata wh;
36 struct hpc_data { 36 dma_addr_t dma;
37 dma_addr_t dma; 37 void *cpu;
38 void *cpu; 38 struct device *dev;
39 } hd;
40}; 39};
41 40
42#define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata)) 41#define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata))
@@ -46,6 +45,11 @@ struct hpc_chunk {
46 u32 _padding; /* align to quadword boundary */ 45 u32 _padding; /* align to quadword boundary */
47}; 46};
48 47
48/* space for hpc dma descriptors */
49#define HPC_DMA_SIZE PAGE_SIZE
50
51#define DMA_DIR(d) ((d == DATA_OUT_DIR) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
52
49static irqreturn_t sgiwd93_intr(int irq, void *dev_id) 53static irqreturn_t sgiwd93_intr(int irq, void *dev_id)
50{ 54{
51 struct Scsi_Host * host = dev_id; 55 struct Scsi_Host * host = dev_id;
@@ -59,15 +63,17 @@ static irqreturn_t sgiwd93_intr(int irq, void *dev_id)
59} 63}
60 64
61static inline 65static inline
62void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp) 66void fill_hpc_entries(struct ip22_hostdata *hd, struct scsi_cmnd *cmd, int din)
63{ 67{
64 unsigned long len = cmd->SCp.this_residual; 68 unsigned long len = cmd->SCp.this_residual;
65 void *addr = cmd->SCp.ptr; 69 void *addr = cmd->SCp.ptr;
66 dma_addr_t physaddr; 70 dma_addr_t physaddr;
67 unsigned long count; 71 unsigned long count;
72 struct hpc_chunk *hcp;
68 73
69 physaddr = dma_map_single(NULL, addr, len, cmd->sc_data_direction); 74 physaddr = dma_map_single(hd->dev, addr, len, DMA_DIR(din));
70 cmd->SCp.dma_handle = physaddr; 75 cmd->SCp.dma_handle = physaddr;
76 hcp = hd->cpu;
71 77
72 while (len) { 78 while (len) {
73 /* 79 /*
@@ -89,6 +95,9 @@ void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp)
89 */ 95 */
90 hcp->desc.pbuf = 0; 96 hcp->desc.pbuf = 0;
91 hcp->desc.cntinfo = HPCDMA_EOX; 97 hcp->desc.cntinfo = HPCDMA_EOX;
98 dma_cache_sync(hd->dev, hd->cpu,
99 (unsigned long)(hcp + 1) - (unsigned long)hd->cpu,
100 DMA_TO_DEVICE);
92} 101}
93 102
94static int dma_setup(struct scsi_cmnd *cmd, int datainp) 103static int dma_setup(struct scsi_cmnd *cmd, int datainp)
@@ -96,9 +105,8 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp)
96 struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host); 105 struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host);
97 struct hpc3_scsiregs *hregs = 106 struct hpc3_scsiregs *hregs =
98 (struct hpc3_scsiregs *) cmd->device->host->base; 107 (struct hpc3_scsiregs *) cmd->device->host->base;
99 struct hpc_chunk *hcp = (struct hpc_chunk *) hdata->hd.cpu;
100 108
101 pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hcp); 109 pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hdata->cpu);
102 110
103 hdata->wh.dma_dir = datainp; 111 hdata->wh.dma_dir = datainp;
104 112
@@ -111,12 +119,12 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp)
111 if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0) 119 if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0)
112 return 1; 120 return 1;
113 121
114 fill_hpc_entries(hcp, cmd, datainp); 122 fill_hpc_entries(hdata, cmd, datainp);
115 123
116 pr_debug(" HPCGO\n"); 124 pr_debug(" HPCGO\n");
117 125
118 /* Start up the HPC. */ 126 /* Start up the HPC. */
119 hregs->ndptr = hdata->hd.dma; 127 hregs->ndptr = hdata->dma;
120 if (datainp) 128 if (datainp)
121 hregs->ctrl = HPC3_SCTRL_ACTIVE; 129 hregs->ctrl = HPC3_SCTRL_ACTIVE;
122 else 130 else
@@ -134,6 +142,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt,
134 if (!SCpnt) 142 if (!SCpnt)
135 return; 143 return;
136 144
145 if (SCpnt->SCp.ptr == NULL || SCpnt->SCp.this_residual == 0)
146 return;
147
137 hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base; 148 hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base;
138 149
139 pr_debug("dma_stop: status<%d> ", status); 150 pr_debug("dma_stop: status<%d> ", status);
@@ -145,8 +156,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt,
145 barrier(); 156 barrier();
146 } 157 }
147 hregs->ctrl = 0; 158 hregs->ctrl = 0;
148 dma_unmap_single(NULL, SCpnt->SCp.dma_handle, SCpnt->SCp.this_residual, 159 dma_unmap_single(hdata->dev, SCpnt->SCp.dma_handle,
149 SCpnt->sc_data_direction); 160 SCpnt->SCp.this_residual,
161 DMA_DIR(hdata->wh.dma_dir));
150 162
151 pr_debug("\n"); 163 pr_debug("\n");
152} 164}
@@ -161,22 +173,23 @@ void sgiwd93_reset(unsigned long base)
161} 173}
162EXPORT_SYMBOL_GPL(sgiwd93_reset); 174EXPORT_SYMBOL_GPL(sgiwd93_reset);
163 175
164static inline void init_hpc_chain(struct hpc_data *hd) 176static inline void init_hpc_chain(struct ip22_hostdata *hdata)
165{ 177{
166 struct hpc_chunk *hcp = (struct hpc_chunk *) hd->cpu; 178 struct hpc_chunk *hcp = (struct hpc_chunk *)hdata->cpu;
167 struct hpc_chunk *dma = (struct hpc_chunk *) hd->dma; 179 dma_addr_t dma = hdata->dma;
168 unsigned long start, end; 180 unsigned long start, end;
169 181
170 start = (unsigned long) hcp; 182 start = (unsigned long) hcp;
171 end = start + PAGE_SIZE; 183 end = start + HPC_DMA_SIZE;
172 while (start < end) { 184 while (start < end) {
173 hcp->desc.pnext = (u32) (dma + 1); 185 hcp->desc.pnext = (u32) (dma + sizeof(struct hpc_chunk));
174 hcp->desc.cntinfo = HPCDMA_EOX; 186 hcp->desc.cntinfo = HPCDMA_EOX;
175 hcp++; dma++; 187 hcp++;
188 dma += sizeof(struct hpc_chunk);
176 start += sizeof(struct hpc_chunk); 189 start += sizeof(struct hpc_chunk);
177 }; 190 };
178 hcp--; 191 hcp--;
179 hcp->desc.pnext = hd->dma; 192 hcp->desc.pnext = hdata->dma;
180} 193}
181 194
182static int sgiwd93_bus_reset(struct scsi_cmnd *cmd) 195static int sgiwd93_bus_reset(struct scsi_cmnd *cmd)
@@ -235,16 +248,17 @@ static int __init sgiwd93_probe(struct platform_device *pdev)
235 host->irq = irq; 248 host->irq = irq;
236 249
237 hdata = host_to_hostdata(host); 250 hdata = host_to_hostdata(host);
238 hdata->hd.cpu = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, 251 hdata->dev = &pdev->dev;
239 &hdata->hd.dma, GFP_KERNEL); 252 hdata->cpu = dma_alloc_noncoherent(&pdev->dev, HPC_DMA_SIZE,
240 if (!hdata->hd.cpu) { 253 &hdata->dma, GFP_KERNEL);
254 if (!hdata->cpu) {
241 printk(KERN_WARNING "sgiwd93: Could not allocate memory for " 255 printk(KERN_WARNING "sgiwd93: Could not allocate memory for "
242 "host %d buffer.\n", unit); 256 "host %d buffer.\n", unit);
243 err = -ENOMEM; 257 err = -ENOMEM;
244 goto out_put; 258 goto out_put;
245 } 259 }
246 260
247 init_hpc_chain(&hdata->hd); 261 init_hpc_chain(hdata);
248 262
249 regs.SASR = wdregs + 3; 263 regs.SASR = wdregs + 3;
250 regs.SCMD = wdregs + 7; 264 regs.SCMD = wdregs + 7;
@@ -274,7 +288,7 @@ static int __init sgiwd93_probe(struct platform_device *pdev)
274out_irq: 288out_irq:
275 free_irq(irq, host); 289 free_irq(irq, host);
276out_free: 290out_free:
277 dma_free_coherent(NULL, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); 291 dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma);
278out_put: 292out_put:
279 scsi_host_put(host); 293 scsi_host_put(host);
280out: 294out:
@@ -290,7 +304,7 @@ static void __exit sgiwd93_remove(struct platform_device *pdev)
290 304
291 scsi_remove_host(host); 305 scsi_remove_host(host);
292 free_irq(pd->irq, host); 306 free_irq(pd->irq, host);
293 dma_free_coherent(&pdev->dev, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); 307 dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma);
294 scsi_host_put(host); 308 scsi_host_put(host);
295} 309}
296 310
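The init_hpc_chain rework makes the descriptor-ring arithmetic explicit: the CPU-side pointer walks the array while a separate bus address advances by sizeof(struct hpc_chunk) per step (the old code got the same result by doing pointer arithmetic on a struct pointer cast from the dma_addr_t), and the final descriptor wraps back to the first. A host-side sketch of the same chaining (the types and the fake bus base are illustrative):

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-in for an hpc_chunk DMA descriptor. */
struct desc {
    uint32_t pnext;    /* bus address of the next descriptor */
    uint32_t cntinfo;  /* count/flags word */
};

#define NDESC 8

int main(void)
{
    struct desc ring[NDESC];
    /* Pretend bus address of ring[0]; on real hardware this comes from
     * the DMA mapping, here it is just an illustrative constant. */
    uint32_t bus = 0x08800000;

    for (int i = 0; i < NDESC; i++) {
        ring[i].pnext   = bus + (uint32_t)((i + 1) * sizeof(struct desc));
        ring[i].cntinfo = 0;
    }
    /* Close the ring: the last descriptor points back at the first. */
    ring[NDESC - 1].pnext = bus;

    for (int i = 0; i < NDESC; i++)
        printf("desc %d -> 0x%08x\n", i, ring[i].pnext);
    return 0;
}
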
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 1fcee16fa36d..50ba49250203 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -231,7 +231,7 @@ out:
231static int sr_done(struct scsi_cmnd *SCpnt) 231static int sr_done(struct scsi_cmnd *SCpnt)
232{ 232{
233 int result = SCpnt->result; 233 int result = SCpnt->result;
234 int this_count = SCpnt->request_bufflen; 234 int this_count = scsi_bufflen(SCpnt);
235 int good_bytes = (result == 0 ? this_count : 0); 235 int good_bytes = (result == 0 ? this_count : 0);
236 int block_sectors = 0; 236 int block_sectors = 0;
237 long error_sector; 237 long error_sector;
@@ -379,17 +379,18 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
379 } 379 }
380 380
381 { 381 {
382 struct scatterlist *sg = SCpnt->request_buffer; 382 struct scatterlist *sg;
383 int i, size = 0; 383 int i, size = 0, sg_count = scsi_sg_count(SCpnt);
384 for (i = 0; i < SCpnt->use_sg; i++)
385 size += sg[i].length;
386 384
387 if (size != SCpnt->request_bufflen && SCpnt->use_sg) { 385 scsi_for_each_sg(SCpnt, sg, sg_count, i)
386 size += sg->length;
387
388 if (size != scsi_bufflen(SCpnt)) {
388 scmd_printk(KERN_ERR, SCpnt, 389 scmd_printk(KERN_ERR, SCpnt,
389 "mismatch count %d, bytes %d\n", 390 "mismatch count %d, bytes %d\n",
390 size, SCpnt->request_bufflen); 391 size, scsi_bufflen(SCpnt));
391 if (SCpnt->request_bufflen > size) 392 if (scsi_bufflen(SCpnt) > size)
392 SCpnt->request_bufflen = size; 393 SCpnt->sdb.length = size;
393 } 394 }
394 } 395 }
395 396
@@ -397,12 +398,12 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
397 * request doesn't start on hw block boundary, add scatter pads 398 * request doesn't start on hw block boundary, add scatter pads
398 */ 399 */
399 if (((unsigned int)rq->sector % (s_size >> 9)) || 400 if (((unsigned int)rq->sector % (s_size >> 9)) ||
400 (SCpnt->request_bufflen % s_size)) { 401 (scsi_bufflen(SCpnt) % s_size)) {
401 scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n"); 402 scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n");
402 goto out; 403 goto out;
403 } 404 }
404 405
405 this_count = (SCpnt->request_bufflen >> 9) / (s_size >> 9); 406 this_count = (scsi_bufflen(SCpnt) >> 9) / (s_size >> 9);
406 407
407 408
408 SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n", 409 SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n",
@@ -416,7 +417,7 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
416 417
417 if (this_count > 0xffff) { 418 if (this_count > 0xffff) {
418 this_count = 0xffff; 419 this_count = 0xffff;
419 SCpnt->request_bufflen = this_count * s_size; 420 SCpnt->sdb.length = this_count * s_size;
420 } 421 }
421 422
422 SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff; 423 SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff;
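sr_prep_fn cross-checks the command's stated buffer length against the sum of its scatterlist segment lengths and clamps the former if the two disagree. Reduced to flat arrays, the check looks like this (the segment values are made up):

#include <stdio.h>

struct segment { unsigned int length; };

int main(void)
{
    struct segment sg[] = { { 4096 }, { 4096 }, { 2048 } };
    unsigned int bufflen = 12288;      /* what the command claims */
    unsigned int size = 0;

    for (unsigned int i = 0; i < sizeof(sg) / sizeof(sg[0]); i++)
        size += sg[i].length;

    if (size != bufflen) {
        printf("mismatch count %u, bytes %u\n", size, bufflen);
        if (bufflen > size)
            bufflen = size;            /* never claim more than the sgl holds */
    }
    printf("final bufflen: %u\n", bufflen);
    return 0;
}
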
diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c
index e3fab3a6aed7..72f6d8015358 100644
--- a/drivers/scsi/stex.c
+++ b/drivers/scsi/stex.c
@@ -1123,7 +1123,6 @@ static struct scsi_host_template driver_template = {
1123 .this_id = -1, 1123 .this_id = -1,
1124 .sg_tablesize = ST_MAX_SG, 1124 .sg_tablesize = ST_MAX_SG,
1125 .cmd_per_lun = ST_CMD_PER_LUN, 1125 .cmd_per_lun = ST_CMD_PER_LUN,
1126 .use_sg_chaining = ENABLE_SG_CHAINING,
1127}; 1126};
1128 1127
1129static int stex_set_dma_mask(struct pci_dev * pdev) 1128static int stex_set_dma_mask(struct pci_dev * pdev)
diff --git a/drivers/scsi/sym53c416.c b/drivers/scsi/sym53c416.c
index 1f6fd1680335..6325901e5093 100644
--- a/drivers/scsi/sym53c416.c
+++ b/drivers/scsi/sym53c416.c
@@ -840,6 +840,5 @@ static struct scsi_host_template driver_template = {
840 .cmd_per_lun = 1, 840 .cmd_per_lun = 1,
841 .unchecked_isa_dma = 1, 841 .unchecked_isa_dma = 1,
842 .use_clustering = ENABLE_CLUSTERING, 842 .use_clustering = ENABLE_CLUSTERING,
843 .use_sg_chaining = ENABLE_SG_CHAINING,
844}; 843};
845#include "scsi_module.c" 844#include "scsi_module.c"
diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c
index 21e926dcdab0..d39107b7669b 100644
--- a/drivers/scsi/sym53c8xx_2/sym_glue.c
+++ b/drivers/scsi/sym53c8xx_2/sym_glue.c
@@ -207,7 +207,7 @@ void sym_set_cam_result_error(struct sym_hcb *np, struct sym_ccb *cp, int resid)
207 /* 207 /*
208 * Bounce back the sense data to user. 208 * Bounce back the sense data to user.
209 */ 209 */
210 memset(&cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); 210 memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
211 memcpy(cmd->sense_buffer, cp->sns_bbuf, 211 memcpy(cmd->sense_buffer, cp->sns_bbuf,
212 min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN)); 212 min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN));
213#if 0 213#if 0
@@ -1681,7 +1681,6 @@ static struct scsi_host_template sym2_template = {
1681 .eh_host_reset_handler = sym53c8xx_eh_host_reset_handler, 1681 .eh_host_reset_handler = sym53c8xx_eh_host_reset_handler,
1682 .this_id = 7, 1682 .this_id = 7,
1683 .use_clustering = ENABLE_CLUSTERING, 1683 .use_clustering = ENABLE_CLUSTERING,
1684 .use_sg_chaining = ENABLE_SG_CHAINING,
1685 .max_sectors = 0xFFFF, 1684 .max_sectors = 0xFFFF,
1686#ifdef SYM_LINUX_PROC_INFO_SUPPORT 1685#ifdef SYM_LINUX_PROC_INFO_SUPPORT
1687 .proc_info = sym53c8xx_proc_info, 1686 .proc_info = sym53c8xx_proc_info,
diff --git a/drivers/scsi/u14-34f.c b/drivers/scsi/u14-34f.c
index 4bc5407f9695..662c00451be4 100644
--- a/drivers/scsi/u14-34f.c
+++ b/drivers/scsi/u14-34f.c
@@ -451,7 +451,6 @@ static struct scsi_host_template driver_template = {
451 .this_id = 7, 451 .this_id = 7,
452 .unchecked_isa_dma = 1, 452 .unchecked_isa_dma = 1,
453 .use_clustering = ENABLE_CLUSTERING, 453 .use_clustering = ENABLE_CLUSTERING,
454 .use_sg_chaining = ENABLE_SG_CHAINING,
455 }; 454 };
456 455
457#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) 456#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/drivers/scsi/ultrastor.c b/drivers/scsi/ultrastor.c
index 75eca6b22db5..f385dce8dfbe 100644
--- a/drivers/scsi/ultrastor.c
+++ b/drivers/scsi/ultrastor.c
@@ -1204,6 +1204,5 @@ static struct scsi_host_template driver_template = {
1204 .cmd_per_lun = ULTRASTOR_MAX_CMDS_PER_LUN, 1204 .cmd_per_lun = ULTRASTOR_MAX_CMDS_PER_LUN,
1205 .unchecked_isa_dma = 1, 1205 .unchecked_isa_dma = 1,
1206 .use_clustering = ENABLE_CLUSTERING, 1206 .use_clustering = ENABLE_CLUSTERING,
1207 .use_sg_chaining = ENABLE_SG_CHAINING,
1208}; 1207};
1209#include "scsi_module.c" 1208#include "scsi_module.c"
diff --git a/drivers/scsi/wd7000.c b/drivers/scsi/wd7000.c
index b4304ae78527..c975c01b3a02 100644
--- a/drivers/scsi/wd7000.c
+++ b/drivers/scsi/wd7000.c
@@ -1671,7 +1671,6 @@ static struct scsi_host_template driver_template = {
1671 .cmd_per_lun = 1, 1671 .cmd_per_lun = 1,
1672 .unchecked_isa_dma = 1, 1672 .unchecked_isa_dma = 1,
1673 .use_clustering = ENABLE_CLUSTERING, 1673 .use_clustering = ENABLE_CLUSTERING,
1674 .use_sg_chaining = ENABLE_SG_CHAINING,
1675}; 1674};
1676 1675
1677#include "scsi_module.c" 1676#include "scsi_module.c"
diff --git a/drivers/usb/storage/isd200.c b/drivers/usb/storage/isd200.c
index 178e8c2a8a2f..0db488624ab1 100644
--- a/drivers/usb/storage/isd200.c
+++ b/drivers/usb/storage/isd200.c
@@ -415,14 +415,14 @@ static void isd200_set_srb(struct isd200_info *info,
415 sg_init_one(&info->sg, buff, bufflen); 415 sg_init_one(&info->sg, buff, bufflen);
416 416
417 srb->sc_data_direction = dir; 417 srb->sc_data_direction = dir;
418 srb->request_buffer = buff ? &info->sg : NULL; 418 srb->sdb.table.sgl = buff ? &info->sg : NULL;
419 srb->request_bufflen = bufflen; 419 srb->sdb.length = bufflen;
420 srb->use_sg = buff ? 1 : 0; 420 srb->sdb.table.nents = buff ? 1 : 0;
421} 421}
422 422
423static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen) 423static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen)
424{ 424{
425 srb->request_bufflen = bufflen; 425 srb->sdb.length = bufflen;
426} 426}
427 427
428 428
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 899fc13d0612..afcdc69e37d6 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -609,7 +609,7 @@ config SBC_EPX_C3_WATCHDOG
609 609
610config INDYDOG 610config INDYDOG
611 tristate "Indy/I2 Hardware Watchdog" 611 tristate "Indy/I2 Hardware Watchdog"
612 depends on SGI_IP22 612 depends on SGI_HAS_INDYDOG
613 help 613 help
614 Hardware driver for the Indy's/I2's watchdog. This is a 614 Hardware driver for the Indy's/I2's watchdog. This is a
615 watchdog timer that will reboot the machine after a 60 second 615 watchdog timer that will reboot the machine after a 60 second