-rw-r--r--  Documentation/DocBook/kernel-api.tmpl | 8
-rw-r--r--  Documentation/lguest/lguest.c | 49
-rw-r--r--  arch/ia64/hp/sim/simscsi.c | 1
-rw-r--r--  arch/powerpc/kernel/vio.c | 13
-rw-r--r--  arch/x86/Kconfig | 3
-rw-r--r--  arch/x86/Makefile | 2
-rw-r--r--  arch/x86/kvm/Kconfig (renamed from drivers/kvm/Kconfig) | 7
-rw-r--r--  arch/x86/kvm/Makefile (renamed from drivers/kvm/Makefile) | 6
-rw-r--r--  arch/x86/kvm/i8259.c (renamed from drivers/kvm/i8259.c) | 8
-rw-r--r--  arch/x86/kvm/irq.c (renamed from drivers/kvm/irq.c) | 22
-rw-r--r--  arch/x86/kvm/irq.h | 88
-rw-r--r--  arch/x86/kvm/kvm_svm.h (renamed from drivers/kvm/kvm_svm.h) | 2
-rw-r--r--  arch/x86/kvm/lapic.c (renamed from drivers/kvm/lapic.c) | 216
-rw-r--r--  arch/x86/kvm/lapic.h | 50
-rw-r--r--  arch/x86/kvm/mmu.c | 1885
-rw-r--r--  arch/x86/kvm/mmu.h | 44
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 484
-rw-r--r--  arch/x86/kvm/segment_descriptor.h (renamed from drivers/kvm/segment_descriptor.h) | 12
-rw-r--r--  arch/x86/kvm/svm.c (renamed from drivers/kvm/svm.c) | 353
-rw-r--r--  arch/x86/kvm/svm.h (renamed from drivers/kvm/svm.h) | 3
-rw-r--r--  arch/x86/kvm/vmx.c (renamed from drivers/kvm/vmx.c) | 1079
-rw-r--r--  arch/x86/kvm/vmx.h (renamed from drivers/kvm/vmx.h) | 26
-rw-r--r--  arch/x86/kvm/x86.c (renamed from drivers/kvm/kvm_main.c) | 4243
-rw-r--r--  arch/x86/kvm/x86_emulate.c | 1912
-rw-r--r--  arch/x86/lguest/boot.c | 11
-rw-r--r--  block/bsg.c | 1
-rw-r--r--  drivers/Kconfig | 2
-rw-r--r--  drivers/Makefile | 3
-rw-r--r--  drivers/base/bus.c | 41
-rw-r--r--  drivers/base/class.c | 4
-rw-r--r--  drivers/base/core.c | 30
-rw-r--r--  drivers/infiniband/ulp/srp/ib_srp.c | 1
-rw-r--r--  drivers/kvm/irq.h | 165
-rw-r--r--  drivers/kvm/mmu.c | 1498
-rw-r--r--  drivers/kvm/paging_tmpl.h | 511
-rw-r--r--  drivers/kvm/x86_emulate.c | 1662
-rw-r--r--  drivers/lguest/core.c | 46
-rw-r--r--  drivers/lguest/hypercalls.c | 106
-rw-r--r--  drivers/lguest/interrupts_and_traps.c | 149
-rw-r--r--  drivers/lguest/lg.h | 154
-rw-r--r--  drivers/lguest/lguest_user.c | 147
-rw-r--r--  drivers/lguest/page_tables.c | 179
-rw-r--r--  drivers/lguest/segments.c | 48
-rw-r--r--  drivers/lguest/x86/core.c | 127
-rw-r--r--  drivers/s390/scsi/zfcp_fsf.c | 4
-rw-r--r--  drivers/scsi/3w-9xxx.c | 1
-rw-r--r--  drivers/scsi/3w-xxxx.c | 1
-rw-r--r--  drivers/scsi/BusLogic.c | 1
-rw-r--r--  drivers/scsi/Kconfig | 2
-rw-r--r--  drivers/scsi/NCR53c406a.c | 1
-rw-r--r--  drivers/scsi/a100u2w.c | 1
-rw-r--r--  drivers/scsi/aacraid/commctrl.c | 29
-rw-r--r--  drivers/scsi/aacraid/linit.c | 1
-rw-r--r--  drivers/scsi/aha1740.c | 1
-rw-r--r--  drivers/scsi/aic7xxx/aic79xx.h | 5
-rw-r--r--  drivers/scsi/aic7xxx/aic79xx_core.c | 2
-rw-r--r--  drivers/scsi/aic7xxx/aic79xx_osm.c | 3
-rw-r--r--  drivers/scsi/aic7xxx/aic79xx_osm_pci.c | 33
-rw-r--r--  drivers/scsi/aic7xxx/aic79xx_pci.c | 2
-rw-r--r--  drivers/scsi/aic7xxx/aic7xxx.h | 4
-rw-r--r--  drivers/scsi/aic7xxx/aic7xxx_core.c | 3
-rw-r--r--  drivers/scsi/aic7xxx/aic7xxx_osm.c | 10
-rw-r--r--  drivers/scsi/aic7xxx/aic7xxx_osm_pci.c | 33
-rw-r--r--  drivers/scsi/aic7xxx/aic7xxx_pci.c | 2
-rw-r--r--  drivers/scsi/aic7xxx_old.c | 1
-rw-r--r--  drivers/scsi/arcmsr/arcmsr_hba.c | 1
-rw-r--r--  drivers/scsi/dc395x.c | 1
-rw-r--r--  drivers/scsi/dpt_i2o.c | 1
-rw-r--r--  drivers/scsi/eata.c | 1
-rw-r--r--  drivers/scsi/hosts.c | 1
-rw-r--r--  drivers/scsi/hptiop.c | 3
-rw-r--r--  drivers/scsi/ibmmca.c | 1
-rw-r--r--  drivers/scsi/ibmvscsi/ibmvscsi.c | 1
-rw-r--r--  drivers/scsi/initio.c | 1
-rw-r--r--  drivers/scsi/iscsi_tcp.c | 1
-rw-r--r--  drivers/scsi/libsrp.c | 4
-rw-r--r--  drivers/scsi/lpfc/lpfc_scsi.c | 2
-rw-r--r--  drivers/scsi/mac53c94.c | 1
-rw-r--r--  drivers/scsi/megaraid.c | 1
-rw-r--r--  drivers/scsi/megaraid/megaraid_mbox.c | 1
-rw-r--r--  drivers/scsi/megaraid/megaraid_sas.c | 1
-rw-r--r--  drivers/scsi/mesh.c | 1
-rw-r--r--  drivers/scsi/ncr53c8xx.c | 2
-rw-r--r--  drivers/scsi/nsp32.c | 1
-rw-r--r--  drivers/scsi/pcmcia/sym53c500_cs.c | 1
-rw-r--r--  drivers/scsi/qla1280.c | 1
-rw-r--r--  drivers/scsi/qla2xxx/qla_os.c | 2
-rw-r--r--  drivers/scsi/qla4xxx/ql4_os.c | 1
-rw-r--r--  drivers/scsi/qlogicfas.c | 1
-rw-r--r--  drivers/scsi/scsi.c | 2
-rw-r--r--  drivers/scsi/scsi_debug.c | 174
-rw-r--r--  drivers/scsi/scsi_error.c | 33
-rw-r--r--  drivers/scsi/scsi_lib.c | 274
-rw-r--r--  drivers/scsi/scsi_tgt_lib.c | 28
-rw-r--r--  drivers/scsi/sd.c | 4
-rw-r--r--  drivers/scsi/sgiwd93.c | 64
-rw-r--r--  drivers/scsi/sr.c | 25
-rw-r--r--  drivers/scsi/stex.c | 1
-rw-r--r--  drivers/scsi/sym53c416.c | 1
-rw-r--r--  drivers/scsi/sym53c8xx_2/sym_glue.c | 3
-rw-r--r--  drivers/scsi/u14-34f.c | 1
-rw-r--r--  drivers/scsi/ultrastor.c | 1
-rw-r--r--  drivers/scsi/wd7000.c | 1
-rw-r--r--  drivers/usb/storage/isd200.c | 8
-rw-r--r--  drivers/watchdog/Kconfig | 2
-rw-r--r--  fs/dlm/dir.c | 76
-rw-r--r--  fs/dlm/dlm_internal.h | 16
-rw-r--r--  fs/dlm/lock.c | 249
-rw-r--r--  fs/dlm/lock.h | 2
-rw-r--r--  fs/dlm/lockspace.c | 16
-rw-r--r--  fs/dlm/lowcomms.c | 15
-rw-r--r--  fs/dlm/main.c | 10
-rw-r--r--  fs/dlm/member.c | 4
-rw-r--r--  fs/dlm/member.h | 3
-rw-r--r--  fs/dlm/memory.c | 32
-rw-r--r--  fs/dlm/memory.h | 16
-rw-r--r--  fs/dlm/midcomms.c | 15
-rw-r--r--  fs/dlm/rcom.c | 25
-rw-r--r--  fs/dlm/recover.c | 27
-rw-r--r--  fs/dlm/recoverd.c | 11
-rw-r--r--  fs/dlm/user.c | 29
-rw-r--r--  fs/dlm/util.c | 82
-rw-r--r--  include/asm-x86/Kbuild | 1
-rw-r--r--  include/asm-x86/kvm.h | 191
-rw-r--r--  include/asm-x86/kvm_host.h (renamed from drivers/kvm/kvm.h) | 537
-rw-r--r--  include/asm-x86/kvm_para.h | 105
-rw-r--r--  include/asm-x86/kvm_x86_emulate.h (renamed from drivers/kvm/x86_emulate.h) | 69
-rw-r--r--  include/asm-x86/lguest.h | 2
-rw-r--r--  include/asm-x86/lguest_hcall.h | 6
-rw-r--r--  include/linux/Kbuild | 2
-rw-r--r--  include/linux/audit.h | 2
-rw-r--r--  include/linux/device.h | 3
-rw-r--r--  include/linux/kvm.h | 203
-rw-r--r--  include/linux/kvm_host.h | 299
-rw-r--r--  include/linux/kvm_para.h | 82
-rw-r--r--  include/linux/kvm_types.h | 54
-rw-r--r--  include/linux/selinux.h | 45
-rw-r--r--  include/net/netlabel.h | 99
-rw-r--r--  include/scsi/scsi.h | 20
-rw-r--r--  include/scsi/scsi_cmnd.h | 59
-rw-r--r--  include/scsi/scsi_eh.h | 9
-rw-r--r--  include/scsi/scsi_host.h | 17
-rw-r--r--  kernel/fork.c | 1
-rw-r--r--  net/ipv4/cipso_ipv4.c | 59
-rw-r--r--  net/netfilter/xt_SECMARK.c | 13
-rw-r--r--  net/netlabel/netlabel_cipso_v4.c | 5
-rw-r--r--  net/netlabel/netlabel_domainhash.c | 77
-rw-r--r--  net/netlabel/netlabel_kapi.c | 21
-rw-r--r--  net/netlabel/netlabel_mgmt.c | 63
-rw-r--r--  net/netlabel/netlabel_mgmt.h | 7
-rw-r--r--  net/netlabel/netlabel_unlabeled.c | 1565
-rw-r--r--  net/netlabel/netlabel_unlabeled.h | 145
-rw-r--r--  security/Kconfig | 1
-rw-r--r--  security/selinux/Kconfig | 2
-rw-r--r--  security/selinux/Makefile | 9
-rw-r--r--  security/selinux/avc.c | 15
-rw-r--r--  security/selinux/exports.c | 20
-rw-r--r--  security/selinux/hooks.c | 667
-rw-r--r--  security/selinux/include/av_perm_to_string.h | 9
-rw-r--r--  security/selinux/include/av_permissions.h | 9
-rw-r--r--  security/selinux/include/avc.h | 2
-rw-r--r--  security/selinux/include/class_to_string.h | 7
-rw-r--r--  security/selinux/include/flask.h | 1
-rw-r--r--  security/selinux/include/netif.h | 4
-rw-r--r--  security/selinux/include/netlabel.h | 11
-rw-r--r--  security/selinux/include/netnode.h | 32
-rw-r--r--  security/selinux/include/objsec.h | 16
-rw-r--r--  security/selinux/include/security.h | 24
-rw-r--r--  security/selinux/include/xfrm.h | 12
-rw-r--r--  security/selinux/netif.c | 263
-rw-r--r--  security/selinux/netlabel.c | 75
-rw-r--r--  security/selinux/netnode.c | 354
-rw-r--r--  security/selinux/selinuxfs.c | 89
-rw-r--r--  security/selinux/ss/mls.c | 10
-rw-r--r--  security/selinux/ss/policydb.c | 18
-rw-r--r--  security/selinux/ss/policydb.h | 2
-rw-r--r--  security/selinux/ss/services.c | 291
-rw-r--r--  security/selinux/xfrm.c | 18
-rw-r--r--  virt/kvm/ioapic.c (renamed from drivers/kvm/ioapic.c) | 99
-rw-r--r--  virt/kvm/ioapic.h | 95
-rw-r--r--  virt/kvm/iodev.h | 63
-rw-r--r--  virt/kvm/kvm_main.c | 1400
182 files changed, 14842 insertions, 9360 deletions
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index aa38cc5692a0..77436d735013 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -419,7 +419,13 @@ X!Edrivers/pnp/system.c
419 419
420 <chapter id="blkdev"> 420 <chapter id="blkdev">
421 <title>Block Devices</title> 421 <title>Block Devices</title>
422!Eblock/ll_rw_blk.c 422!Eblock/blk-core.c
423!Eblock/blk-map.c
424!Iblock/blk-sysfs.c
425!Eblock/blk-settings.c
426!Eblock/blk-exec.c
427!Eblock/blk-barrier.c
428!Eblock/blk-tag.c
423 </chapter> 429 </chapter>
424 430
425 <chapter id="chrdev"> 431 <chapter id="chrdev">
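The hunk above drops the retired block/ll_rw_blk.c reference and points the Block Devices chapter at the files the block layer was split into; !E pulls in kernel-doc comments for exported functions, !I the internal ones (as used for blk-sysfs.c). A minimal sketch of the kind of comment those directives extract — the function name here is hypothetical, not something this patch adds:

/**
 * blk_example_helper - one-line summary shown in the Block Devices chapter
 * @q: request queue being operated on
 *
 * Only functions carrying a kernel-doc comment like this one are collected:
 * with !E the function must also be EXPORT_SYMBOL()ed, with !I it must not be.
 */
void blk_example_helper(struct request_queue *q)
{
}
EXPORT_SYMBOL(blk_example_helper);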
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 9b0e322118b5..6c8a2386cd50 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -79,6 +79,9 @@ static void *guest_base;
79/* The maximum guest physical address allowed, and maximum possible. */ 79/* The maximum guest physical address allowed, and maximum possible. */
80static unsigned long guest_limit, guest_max; 80static unsigned long guest_limit, guest_max;
81 81
82/* a per-cpu variable indicating whose vcpu is currently running */
83static unsigned int __thread cpu_id;
84
82/* This is our list of devices. */ 85/* This is our list of devices. */
83struct device_list 86struct device_list
84{ 87{
@@ -153,6 +156,9 @@ struct virtqueue
153 void (*handle_output)(int fd, struct virtqueue *me); 156 void (*handle_output)(int fd, struct virtqueue *me);
154}; 157};
155 158
159/* Remember the arguments to the program so we can "reboot" */
160static char **main_args;
161
156/* Since guest is UP and we don't run at the same time, we don't need barriers. 162/* Since guest is UP and we don't run at the same time, we don't need barriers.
157 * But I include them in the code in case others copy it. */ 163 * But I include them in the code in case others copy it. */
158#define wmb() 164#define wmb()
@@ -554,7 +560,7 @@ static void wake_parent(int pipefd, int lguest_fd)
554 else 560 else
555 FD_CLR(-fd - 1, &devices.infds); 561 FD_CLR(-fd - 1, &devices.infds);
556 } else /* Send LHREQ_BREAK command. */ 562 } else /* Send LHREQ_BREAK command. */
557 write(lguest_fd, args, sizeof(args)); 563 pwrite(lguest_fd, args, sizeof(args), cpu_id);
558 } 564 }
559} 565}
560 566
@@ -1489,7 +1495,9 @@ static void setup_block_file(const char *filename)
1489 1495
1490 /* Create stack for thread and run it */ 1496 /* Create stack for thread and run it */
1491 stack = malloc(32768); 1497 stack = malloc(32768);
1492 if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1) 1498 /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
1499 * becoming a zombie. */
1500 if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
1493 err(1, "Creating clone"); 1501 err(1, "Creating clone");
1494 1502
1495 /* We don't need to keep the I/O thread's end of the pipes open. */ 1503 /* We don't need to keep the I/O thread's end of the pipes open. */
@@ -1499,7 +1507,21 @@ static void setup_block_file(const char *filename)
1499 verbose("device %u: virtblock %llu sectors\n", 1507 verbose("device %u: virtblock %llu sectors\n",
1500 devices.device_num, cap); 1508 devices.device_num, cap);
1501} 1509}
1502/* That's the end of device setup. */ 1510/* That's the end of device setup. :*/
1511
1512/* Reboot */
1513static void __attribute__((noreturn)) restart_guest(void)
1514{
1515 unsigned int i;
1516
1517 /* Closing pipes causes the waker thread and io_threads to die, and
1518 * closing /dev/lguest cleans up the Guest. Since we don't track all
1519 * open fds, we simply close everything beyond stderr. */
1520 for (i = 3; i < FD_SETSIZE; i++)
1521 close(i);
1522 execv(main_args[0], main_args);
1523 err(1, "Could not exec %s", main_args[0]);
1524}
1503 1525
1504/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves 1526/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
1505 * its input and output, and finally, lays it to rest. */ 1527 * its input and output, and finally, lays it to rest. */
@@ -1511,7 +1533,8 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
1511 int readval; 1533 int readval;
1512 1534
1513 /* We read from the /dev/lguest device to run the Guest. */ 1535 /* We read from the /dev/lguest device to run the Guest. */
1514 readval = read(lguest_fd, &notify_addr, sizeof(notify_addr)); 1536 readval = pread(lguest_fd, &notify_addr,
1537 sizeof(notify_addr), cpu_id);
1515 1538
1516 /* One unsigned long means the Guest did HCALL_NOTIFY */ 1539 /* One unsigned long means the Guest did HCALL_NOTIFY */
1517 if (readval == sizeof(notify_addr)) { 1540 if (readval == sizeof(notify_addr)) {
@@ -1521,16 +1544,23 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
1521 /* ENOENT means the Guest died. Reading tells us why. */ 1544 /* ENOENT means the Guest died. Reading tells us why. */
1522 } else if (errno == ENOENT) { 1545 } else if (errno == ENOENT) {
1523 char reason[1024] = { 0 }; 1546 char reason[1024] = { 0 };
1524 read(lguest_fd, reason, sizeof(reason)-1); 1547 pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
1525 errx(1, "%s", reason); 1548 errx(1, "%s", reason);
1549 /* ERESTART means that we need to reboot the guest */
1550 } else if (errno == ERESTART) {
1551 restart_guest();
1526 /* EAGAIN means the Waker wanted us to look at some input. 1552 /* EAGAIN means the Waker wanted us to look at some input.
1527 * Anything else means a bug or incompatible change. */ 1553 * Anything else means a bug or incompatible change. */
1528 } else if (errno != EAGAIN) 1554 } else if (errno != EAGAIN)
1529 err(1, "Running guest failed"); 1555 err(1, "Running guest failed");
1530 1556
1557 /* Only service input on thread for CPU 0. */
1558 if (cpu_id != 0)
1559 continue;
1560
1531 /* Service input, then unset the BREAK to release the Waker. */ 1561 /* Service input, then unset the BREAK to release the Waker. */
1532 handle_input(lguest_fd); 1562 handle_input(lguest_fd);
1533 if (write(lguest_fd, args, sizeof(args)) < 0) 1563 if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
1534 err(1, "Resetting break"); 1564 err(1, "Resetting break");
1535 } 1565 }
1536} 1566}
@@ -1571,6 +1601,12 @@ int main(int argc, char *argv[])
1571 /* If they specify an initrd file to load. */ 1601 /* If they specify an initrd file to load. */
1572 const char *initrd_name = NULL; 1602 const char *initrd_name = NULL;
1573 1603
1604 /* Save the args: we "reboot" by execing ourselves again. */
1605 main_args = argv;
1606 /* We don't "wait" for the children, so prevent them from becoming
1607 * zombies. */
1608 signal(SIGCHLD, SIG_IGN);
1609
1574 /* First we initialize the device list. Since console and network 1610 /* First we initialize the device list. Since console and network
1575 * device receive input from a file descriptor, we keep an fdset 1611 * device receive input from a file descriptor, we keep an fdset
1576 * (infds) and the maximum fd number (max_infd) with the head of the 1612 * (infds) and the maximum fd number (max_infd) with the head of the
@@ -1582,6 +1618,7 @@ int main(int argc, char *argv[])
1582 devices.lastdev = &devices.dev; 1618 devices.lastdev = &devices.dev;
1583 devices.next_irq = 1; 1619 devices.next_irq = 1;
1584 1620
1621 cpu_id = 0;
1585 /* We need to know how much memory so we can set up the device 1622 /* We need to know how much memory so we can set up the device
1586 * descriptor and memory pages for the devices as we parse the command 1623 * descriptor and memory pages for the devices as we parse the command
1587 * line. So we quickly look through the arguments to find the amount 1624 * line. So we quickly look through the arguments to find the amount
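Taken together, the lguest.c hunks switch the Launcher to per-vCPU pread()/pwrite() on /dev/lguest (the file offset carries cpu_id) and add an ERESTART path that reboots the Guest by re-execing the Launcher. A condensed, illustrative sketch of the resulting run loop — it assumes the handle_output()/handle_input() helpers and the LHREQ_BREAK args array defined elsewhere in lguest.c, and is not a drop-in replacement for run_guest():

        for (;;) {
                unsigned long notify_addr;
                int readval = pread(lguest_fd, &notify_addr,
                                    sizeof(notify_addr), cpu_id);

                if (readval == sizeof(notify_addr)) {
                        /* One unsigned long back means HCALL_NOTIFY. */
                        handle_output(lguest_fd, notify_addr);
                } else if (errno == ENOENT) {
                        /* The Guest died; reading again tells us why. */
                        char reason[1024] = { 0 };
                        pread(lguest_fd, reason, sizeof(reason) - 1, cpu_id);
                        errx(1, "%s", reason);
                } else if (errno == ERESTART) {
                        /* Close our fds and execv() ourselves again. */
                        restart_guest();
                } else if (errno != EAGAIN)
                        err(1, "Running guest failed");

                /* Only the thread for CPU 0 services input. */
                if (cpu_id != 0)
                        continue;
                handle_input(lguest_fd);
                if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
                        err(1, "Resetting break");
        }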
diff --git a/arch/ia64/hp/sim/simscsi.c b/arch/ia64/hp/sim/simscsi.c
index 6ef9b5219930..7661bb065fa5 100644
--- a/arch/ia64/hp/sim/simscsi.c
+++ b/arch/ia64/hp/sim/simscsi.c
@@ -360,7 +360,6 @@ static struct scsi_host_template driver_template = {
360 .max_sectors = 1024, 360 .max_sectors = 1024,
361 .cmd_per_lun = SIMSCSI_REQ_QUEUE_LEN, 361 .cmd_per_lun = SIMSCSI_REQ_QUEUE_LEN,
362 .use_clustering = DISABLE_CLUSTERING, 362 .use_clustering = DISABLE_CLUSTERING,
363 .use_sg_chaining = ENABLE_SG_CHAINING,
364}; 363};
365 364
366static int __init 365static int __init
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 19a5656001c0..f0bad7070fb5 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -37,8 +37,6 @@
37#include <asm/iseries/hv_call_xm.h> 37#include <asm/iseries/hv_call_xm.h>
38#include <asm/iseries/iommu.h> 38#include <asm/iseries/iommu.h>
39 39
40extern struct kset devices_subsys; /* needed for vio_find_name() */
41
42static struct bus_type vio_bus_type; 40static struct bus_type vio_bus_type;
43 41
44static struct vio_dev vio_bus_device = { /* fake "parent" device */ 42static struct vio_dev vio_bus_device = { /* fake "parent" device */
@@ -361,19 +359,16 @@ EXPORT_SYMBOL(vio_get_attribute);
361#ifdef CONFIG_PPC_PSERIES 359#ifdef CONFIG_PPC_PSERIES
362/* vio_find_name() - internal because only vio.c knows how we formatted the 360/* vio_find_name() - internal because only vio.c knows how we formatted the
363 * kobject name 361 * kobject name
364 * XXX once vio_bus_type.devices is actually used as a kset in
365 * drivers/base/bus.c, this function should be removed in favor of
366 * "device_find(kobj_name, &vio_bus_type)"
367 */ 362 */
368static struct vio_dev *vio_find_name(const char *kobj_name) 363static struct vio_dev *vio_find_name(const char *name)
369{ 364{
370 struct kobject *found; 365 struct device *found;
371 366
372 found = kset_find_obj(&devices_subsys, kobj_name); 367 found = bus_find_device_by_name(&vio_bus_type, NULL, name);
373 if (!found) 368 if (!found)
374 return NULL; 369 return NULL;
375 370
376 return to_vio_dev(container_of(found, struct device, kobj)); 371 return to_vio_dev(found);
377} 372}
378 373
379/** 374/**
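With this change vio_find_name() goes through the driver core instead of reaching into the devices kset directly. A minimal sketch of the same lookup pattern on a hypothetical bus — my_bus_type and to_my_dev() stand in for vio_bus_type/to_vio_dev(); bus_find_device_by_name() takes a reference on the match, so a caller that does not keep the device should drop it with put_device():

        /* Illustrative only: look up a device on my_bus_type by kobject name. */
        static struct my_dev *my_find_by_name(const char *name)
        {
                struct device *dev;

                dev = bus_find_device_by_name(&my_bus_type, NULL, name);
                if (!dev)
                        return NULL;
                /* Reference held by bus_find_device_by_name(); the caller
                 * is responsible for put_device() when finished. */
                return to_my_dev(dev);
        }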
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fb3eea3e38ee..65b449134cf7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -107,6 +107,7 @@ config ARCH_SUPPORTS_OPROFILE
107 bool 107 bool
108 default y 108 default y
109 109
110select HAVE_KVM
110 111
111config ZONE_DMA32 112config ZONE_DMA32
112 bool 113 bool
@@ -1598,4 +1599,6 @@ source "security/Kconfig"
1598 1599
1599source "crypto/Kconfig" 1600source "crypto/Kconfig"
1600 1601
1602source "arch/x86/kvm/Kconfig"
1603
1601source "lib/Kconfig" 1604source "lib/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b08f18261df6..da8f4129780b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,6 +7,8 @@ else
7 KBUILD_DEFCONFIG := $(ARCH)_defconfig 7 KBUILD_DEFCONFIG := $(ARCH)_defconfig
8endif 8endif
9 9
10core-$(CONFIG_KVM) += arch/x86/kvm/
11
10# BITS is used as extension for files which are available in a 32 bit 12# BITS is used as extension for files which are available in a 32 bit
11# and a 64 bit version to simplify shared Makefiles. 13# and a 64 bit version to simplify shared Makefiles.
12# e.g.: obj-y += foo_$(BITS).o 14# e.g.: obj-y += foo_$(BITS).o
diff --git a/drivers/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 656920636cb2..c83e1c9b5129 100644
--- a/drivers/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -1,9 +1,12 @@
1# 1#
2# KVM configuration 2# KVM configuration
3# 3#
4config HAVE_KVM
5 bool
6
4menuconfig VIRTUALIZATION 7menuconfig VIRTUALIZATION
5 bool "Virtualization" 8 bool "Virtualization"
6 depends on X86 9 depends on HAVE_KVM || X86
7 default y 10 default y
8 ---help--- 11 ---help---
9 Say Y here to get to see options for using your Linux host to run other 12 Say Y here to get to see options for using your Linux host to run other
@@ -16,7 +19,7 @@ if VIRTUALIZATION
16 19
17config KVM 20config KVM
18 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
19 depends on X86 && EXPERIMENTAL 22 depends on HAVE_KVM && EXPERIMENTAL
20 select PREEMPT_NOTIFIERS 23 select PREEMPT_NOTIFIERS
21 select ANON_INODES 24 select ANON_INODES
22 ---help--- 25 ---help---
diff --git a/drivers/kvm/Makefile b/arch/x86/kvm/Makefile
index e5a8f4d3e973..ffdd0b310784 100644
--- a/drivers/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,7 +2,11 @@
2# Makefile for Kernel-based Virtual Machine module 2# Makefile for Kernel-based Virtual Machine module
3# 3#
4 4
5kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o 5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
6
7EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
8
9kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
6obj-$(CONFIG_KVM) += kvm.o 10obj-$(CONFIG_KVM) += kvm.o
7kvm-intel-objs = vmx.o 11kvm-intel-objs = vmx.o
8obj-$(CONFIG_KVM_INTEL) += kvm-intel.o 12obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a679157bc599..ab29cf2def47 100644
--- a/drivers/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -28,6 +28,8 @@
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include "irq.h" 29#include "irq.h"
30 30
31#include <linux/kvm_host.h>
32
31/* 33/*
32 * set irq level. If an edge is detected, then the IRR is set to 1 34 * set irq level. If an edge is detected, then the IRR is set to 1
33 */ 35 */
@@ -181,10 +183,8 @@ int kvm_pic_read_irq(struct kvm_pic *s)
181 return intno; 183 return intno;
182} 184}
183 185
184static void pic_reset(void *opaque) 186void kvm_pic_reset(struct kvm_kpic_state *s)
185{ 187{
186 struct kvm_kpic_state *s = opaque;
187
188 s->last_irr = 0; 188 s->last_irr = 0;
189 s->irr = 0; 189 s->irr = 0;
190 s->imr = 0; 190 s->imr = 0;
@@ -209,7 +209,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
209 addr &= 1; 209 addr &= 1;
210 if (addr == 0) { 210 if (addr == 0) {
211 if (val & 0x10) { 211 if (val & 0x10) {
212 pic_reset(s); /* init */ 212 kvm_pic_reset(s); /* init */
213 /* 213 /*
214 * deassert a pending interrupt 214 * deassert a pending interrupt
215 */ 215 */
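The reset helper loses its void-pointer indirection and is exported as kvm_pic_reset(), declared in the new arch/x86/kvm/irq.h below. An illustrative, hypothetical caller resetting both cascaded PICs of a VM's in-kernel irqchip — nothing in this patch necessarily calls it this way:

        static void reset_vm_pics(struct kvm *kvm)
        {
                struct kvm_pic *pic = pic_irqchip(kvm);     /* from irq.h */

                kvm_pic_reset(&pic->pics[0]);   /* master */
                kvm_pic_reset(&pic->pics[1]);   /* slave */
        }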
diff --git a/drivers/kvm/irq.c b/arch/x86/kvm/irq.c
index 7628c7ff628f..e5714759e97f 100644
--- a/drivers/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -20,8 +20,8 @@
20 */ 20 */
21 21
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/kvm_host.h>
23 24
24#include "kvm.h"
25#include "irq.h" 25#include "irq.h"
26 26
27/* 27/*
@@ -63,26 +63,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
63} 63}
64EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); 64EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
65 65
66static void vcpu_kick_intr(void *info)
67{
68#ifdef DEBUG
69 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
70 printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
71#endif
72}
73
74void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
75{
76 int ipi_pcpu = vcpu->cpu;
77
78 if (waitqueue_active(&vcpu->wq)) {
79 wake_up_interruptible(&vcpu->wq);
80 ++vcpu->stat.halt_wakeup;
81 }
82 if (vcpu->guest_mode)
83 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
84}
85
86void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 66void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
87{ 67{
88 kvm_inject_apic_timer_irqs(vcpu); 68 kvm_inject_apic_timer_irqs(vcpu);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644
index 000000000000..fa5ed5d59b5d
--- /dev/null
+++ b/arch/x86/kvm/irq.h
@@ -0,0 +1,88 @@
1/*
2 * irq.h: in kernel interrupt controller related definitions
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#ifndef __IRQ_H
23#define __IRQ_H
24
25#include <linux/mm_types.h>
26#include <linux/hrtimer.h>
27#include <linux/kvm_host.h>
28
29#include "iodev.h"
30#include "ioapic.h"
31#include "lapic.h"
32
33struct kvm;
34struct kvm_vcpu;
35
36typedef void irq_request_func(void *opaque, int level);
37
38struct kvm_kpic_state {
39 u8 last_irr; /* edge detection */
40 u8 irr; /* interrupt request register */
41 u8 imr; /* interrupt mask register */
42 u8 isr; /* interrupt service register */
43 u8 priority_add; /* highest irq priority */
44 u8 irq_base;
45 u8 read_reg_select;
46 u8 poll;
47 u8 special_mask;
48 u8 init_state;
49 u8 auto_eoi;
50 u8 rotate_on_auto_eoi;
51 u8 special_fully_nested_mode;
52 u8 init4; /* true if 4 byte init */
53 u8 elcr; /* PIIX edge/trigger selection */
54 u8 elcr_mask;
55 struct kvm_pic *pics_state;
56};
57
58struct kvm_pic {
59 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
60 irq_request_func *irq_request;
61 void *irq_request_opaque;
62 int output; /* intr from master PIC */
63 struct kvm_io_device dev;
64};
65
66struct kvm_pic *kvm_create_pic(struct kvm *kvm);
67void kvm_pic_set_irq(void *opaque, int irq, int level);
68int kvm_pic_read_irq(struct kvm_pic *s);
69void kvm_pic_update_irq(struct kvm_pic *s);
70
71static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
72{
73 return kvm->arch.vpic;
74}
75
76static inline int irqchip_in_kernel(struct kvm *kvm)
77{
78 return pic_irqchip(kvm) != NULL;
79}
80
81void kvm_pic_reset(struct kvm_kpic_state *s);
82
83void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
84void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
85void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
86void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
87
88#endif
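Besides the register layout of the virtual 8259, the header carries the two inline helpers the rest of KVM uses to check for an in-kernel irqchip before touching it. A short, illustrative sketch of that gating pattern around kvm_pic_set_irq(); the calling function is hypothetical:

        /* Raise then lower IRQ 4 on the in-kernel PIC, but only when
         * userspace asked for an in-kernel irqchip at all. */
        static void pulse_irq4(struct kvm *kvm)
        {
                if (!irqchip_in_kernel(kvm))
                        return;         /* userspace emulates the PIC instead */

                kvm_pic_set_irq(pic_irqchip(kvm), 4, 1);
                kvm_pic_set_irq(pic_irqchip(kvm), 4, 0);
        }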
diff --git a/drivers/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index a0e415daef5b..ecdfe97e4635 100644
--- a/drivers/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -4,10 +4,10 @@
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/list.h> 6#include <linux/list.h>
7#include <linux/kvm_host.h>
7#include <asm/msr.h> 8#include <asm/msr.h>
8 9
9#include "svm.h" 10#include "svm.h"
10#include "kvm.h"
11 11
12static const u32 host_save_user_msrs[] = { 12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64 13#ifdef CONFIG_X86_64
diff --git a/drivers/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 238fcad3cece..2cbee9479ce4 100644
--- a/drivers/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -17,7 +17,7 @@
17 * the COPYING file in the top-level directory. 17 * the COPYING file in the top-level directory.
18 */ 18 */
19 19
20#include "kvm.h" 20#include <linux/kvm_host.h>
21#include <linux/kvm.h> 21#include <linux/kvm.h>
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
@@ -56,6 +56,7 @@
56 56
57#define VEC_POS(v) ((v) & (32 - 1)) 57#define VEC_POS(v) ((v) & (32 - 1))
58#define REG_POS(v) (((v) >> 5) << 4) 58#define REG_POS(v) (((v) >> 5) << 4)
59
59static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) 60static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
60{ 61{
61 return *((u32 *) (apic->regs + reg_off)); 62 return *((u32 *) (apic->regs + reg_off));
@@ -88,7 +89,7 @@ static inline void apic_clear_vector(int vec, void *bitmap)
88 89
89static inline int apic_hw_enabled(struct kvm_lapic *apic) 90static inline int apic_hw_enabled(struct kvm_lapic *apic)
90{ 91{
91 return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE; 92 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
92} 93}
93 94
94static inline int apic_sw_enabled(struct kvm_lapic *apic) 95static inline int apic_sw_enabled(struct kvm_lapic *apic)
@@ -172,7 +173,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
172 173
173int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 174int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
174{ 175{
175 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 176 struct kvm_lapic *apic = vcpu->arch.apic;
176 int highest_irr; 177 int highest_irr;
177 178
178 if (!apic) 179 if (!apic)
@@ -183,8 +184,10 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
183} 184}
184EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); 185EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
185 186
186int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig) 187int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
187{ 188{
189 struct kvm_lapic *apic = vcpu->arch.apic;
190
188 if (!apic_test_and_set_irr(vec, apic)) { 191 if (!apic_test_and_set_irr(vec, apic)) {
189 /* a new pending irq is set in IRR */ 192 /* a new pending irq is set in IRR */
190 if (trig) 193 if (trig)
@@ -268,7 +271,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
268 int short_hand, int dest, int dest_mode) 271 int short_hand, int dest, int dest_mode)
269{ 272{
270 int result = 0; 273 int result = 0;
271 struct kvm_lapic *target = vcpu->apic; 274 struct kvm_lapic *target = vcpu->arch.apic;
272 275
273 apic_debug("target %p, source %p, dest 0x%x, " 276 apic_debug("target %p, source %p, dest 0x%x, "
274 "dest_mode 0x%x, short_hand 0x%x", 277 "dest_mode 0x%x, short_hand 0x%x",
@@ -335,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
335 } else 338 } else
336 apic_clear_vector(vector, apic->regs + APIC_TMR); 339 apic_clear_vector(vector, apic->regs + APIC_TMR);
337 340
338 if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) 341 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
339 kvm_vcpu_kick(vcpu); 342 kvm_vcpu_kick(vcpu);
340 else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) { 343 else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
341 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; 344 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
342 if (waitqueue_active(&vcpu->wq)) 345 if (waitqueue_active(&vcpu->wq))
343 wake_up_interruptible(&vcpu->wq); 346 wake_up_interruptible(&vcpu->wq);
344 } 347 }
@@ -359,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
359 362
360 case APIC_DM_INIT: 363 case APIC_DM_INIT:
361 if (level) { 364 if (level) {
362 if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) 365 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
363 printk(KERN_DEBUG 366 printk(KERN_DEBUG
364 "INIT on a runnable vcpu %d\n", 367 "INIT on a runnable vcpu %d\n",
365 vcpu->vcpu_id); 368 vcpu->vcpu_id);
366 vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED; 369 vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
367 kvm_vcpu_kick(vcpu); 370 kvm_vcpu_kick(vcpu);
368 } else { 371 } else {
369 printk(KERN_DEBUG 372 printk(KERN_DEBUG
@@ -376,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
376 case APIC_DM_STARTUP: 379 case APIC_DM_STARTUP:
377 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", 380 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
378 vcpu->vcpu_id, vector); 381 vcpu->vcpu_id, vector);
379 if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) { 382 if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
380 vcpu->sipi_vector = vector; 383 vcpu->arch.sipi_vector = vector;
381 vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED; 384 vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
382 if (waitqueue_active(&vcpu->wq)) 385 if (waitqueue_active(&vcpu->wq))
383 wake_up_interruptible(&vcpu->wq); 386 wake_up_interruptible(&vcpu->wq);
384 } 387 }
@@ -392,15 +395,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
392 return result; 395 return result;
393} 396}
394 397
395struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, 398static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
396 unsigned long bitmap) 399 unsigned long bitmap)
397{ 400{
398 int vcpu_id;
399 int last; 401 int last;
400 int next; 402 int next;
401 struct kvm_lapic *apic; 403 struct kvm_lapic *apic = NULL;
402 404
403 last = kvm->round_robin_prev_vcpu; 405 last = kvm->arch.round_robin_prev_vcpu;
404 next = last; 406 next = last;
405 407
406 do { 408 do {
@@ -408,25 +410,30 @@ struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
408 next = 0; 410 next = 0;
409 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) 411 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
410 continue; 412 continue;
411 apic = kvm->vcpus[next]->apic; 413 apic = kvm->vcpus[next]->arch.apic;
412 if (apic && apic_enabled(apic)) 414 if (apic && apic_enabled(apic))
413 break; 415 break;
414 apic = NULL; 416 apic = NULL;
415 } while (next != last); 417 } while (next != last);
416 kvm->round_robin_prev_vcpu = next; 418 kvm->arch.round_robin_prev_vcpu = next;
417 419
418 if (!apic) { 420 if (!apic)
419 vcpu_id = ffs(bitmap) - 1; 421 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
420 if (vcpu_id < 0) {
421 vcpu_id = 0;
422 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
423 }
424 apic = kvm->vcpus[vcpu_id]->apic;
425 }
426 422
427 return apic; 423 return apic;
428} 424}
429 425
426struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
427 unsigned long bitmap)
428{
429 struct kvm_lapic *apic;
430
431 apic = kvm_apic_round_robin(kvm, vector, bitmap);
432 if (apic)
433 return apic->vcpu;
434 return NULL;
435}
436
430static void apic_set_eoi(struct kvm_lapic *apic) 437static void apic_set_eoi(struct kvm_lapic *apic)
431{ 438{
432 int vector = apic_find_highest_isr(apic); 439 int vector = apic_find_highest_isr(apic);
@@ -458,7 +465,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
458 unsigned int delivery_mode = icr_low & APIC_MODE_MASK; 465 unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
459 unsigned int vector = icr_low & APIC_VECTOR_MASK; 466 unsigned int vector = icr_low & APIC_VECTOR_MASK;
460 467
461 struct kvm_lapic *target; 468 struct kvm_vcpu *target;
462 struct kvm_vcpu *vcpu; 469 struct kvm_vcpu *vcpu;
463 unsigned long lpr_map = 0; 470 unsigned long lpr_map = 0;
464 int i; 471 int i;
@@ -474,20 +481,20 @@ static void apic_send_ipi(struct kvm_lapic *apic)
474 if (!vcpu) 481 if (!vcpu)
475 continue; 482 continue;
476 483
477 if (vcpu->apic && 484 if (vcpu->arch.apic &&
478 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { 485 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
479 if (delivery_mode == APIC_DM_LOWEST) 486 if (delivery_mode == APIC_DM_LOWEST)
480 set_bit(vcpu->vcpu_id, &lpr_map); 487 set_bit(vcpu->vcpu_id, &lpr_map);
481 else 488 else
482 __apic_accept_irq(vcpu->apic, delivery_mode, 489 __apic_accept_irq(vcpu->arch.apic, delivery_mode,
483 vector, level, trig_mode); 490 vector, level, trig_mode);
484 } 491 }
485 } 492 }
486 493
487 if (delivery_mode == APIC_DM_LOWEST) { 494 if (delivery_mode == APIC_DM_LOWEST) {
488 target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map); 495 target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
489 if (target != NULL) 496 if (target != NULL)
490 __apic_accept_irq(target, delivery_mode, 497 __apic_accept_irq(target->arch.apic, delivery_mode,
491 vector, level, trig_mode); 498 vector, level, trig_mode);
492 } 499 }
493} 500}
@@ -544,6 +551,23 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
544 return tmcct; 551 return tmcct;
545} 552}
546 553
554static void __report_tpr_access(struct kvm_lapic *apic, bool write)
555{
556 struct kvm_vcpu *vcpu = apic->vcpu;
557 struct kvm_run *run = vcpu->run;
558
559 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
560 kvm_x86_ops->cache_regs(vcpu);
561 run->tpr_access.rip = vcpu->arch.rip;
562 run->tpr_access.is_write = write;
563}
564
565static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
566{
567 if (apic->vcpu->arch.tpr_access_reporting)
568 __report_tpr_access(apic, write);
569}
570
547static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) 571static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
548{ 572{
549 u32 val = 0; 573 u32 val = 0;
@@ -561,6 +585,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
561 val = apic_get_tmcct(apic); 585 val = apic_get_tmcct(apic);
562 break; 586 break;
563 587
588 case APIC_TASKPRI:
589 report_tpr_access(apic, false);
590 /* fall thru */
564 default: 591 default:
565 apic_update_ppr(apic); 592 apic_update_ppr(apic);
566 val = apic_get_reg(apic, offset); 593 val = apic_get_reg(apic, offset);
@@ -670,6 +697,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
670 break; 697 break;
671 698
672 case APIC_TASKPRI: 699 case APIC_TASKPRI:
700 report_tpr_access(apic, true);
673 apic_set_tpr(apic, val & 0xff); 701 apic_set_tpr(apic, val & 0xff);
674 break; 702 break;
675 703
@@ -762,19 +790,17 @@ static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
762 return ret; 790 return ret;
763} 791}
764 792
765void kvm_free_apic(struct kvm_lapic *apic) 793void kvm_free_lapic(struct kvm_vcpu *vcpu)
766{ 794{
767 if (!apic) 795 if (!vcpu->arch.apic)
768 return; 796 return;
769 797
770 hrtimer_cancel(&apic->timer.dev); 798 hrtimer_cancel(&vcpu->arch.apic->timer.dev);
771 799
772 if (apic->regs_page) { 800 if (vcpu->arch.apic->regs_page)
773 __free_page(apic->regs_page); 801 __free_page(vcpu->arch.apic->regs_page);
774 apic->regs_page = 0;
775 }
776 802
777 kfree(apic); 803 kfree(vcpu->arch.apic);
778} 804}
779 805
780/* 806/*
@@ -785,16 +811,17 @@ void kvm_free_apic(struct kvm_lapic *apic)
785 811
786void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) 812void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
787{ 813{
788 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 814 struct kvm_lapic *apic = vcpu->arch.apic;
789 815
790 if (!apic) 816 if (!apic)
791 return; 817 return;
792 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)); 818 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
819 | (apic_get_reg(apic, APIC_TASKPRI) & 4));
793} 820}
794 821
795u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) 822u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
796{ 823{
797 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 824 struct kvm_lapic *apic = vcpu->arch.apic;
798 u64 tpr; 825 u64 tpr;
799 826
800 if (!apic) 827 if (!apic)
@@ -807,29 +834,29 @@ EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
807 834
808void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) 835void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
809{ 836{
810 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 837 struct kvm_lapic *apic = vcpu->arch.apic;
811 838
812 if (!apic) { 839 if (!apic) {
813 value |= MSR_IA32_APICBASE_BSP; 840 value |= MSR_IA32_APICBASE_BSP;
814 vcpu->apic_base = value; 841 vcpu->arch.apic_base = value;
815 return; 842 return;
816 } 843 }
817 if (apic->vcpu->vcpu_id) 844 if (apic->vcpu->vcpu_id)
818 value &= ~MSR_IA32_APICBASE_BSP; 845 value &= ~MSR_IA32_APICBASE_BSP;
819 846
820 vcpu->apic_base = value; 847 vcpu->arch.apic_base = value;
821 apic->base_address = apic->vcpu->apic_base & 848 apic->base_address = apic->vcpu->arch.apic_base &
822 MSR_IA32_APICBASE_BASE; 849 MSR_IA32_APICBASE_BASE;
823 850
824 /* with FSB delivery interrupt, we can restart APIC functionality */ 851 /* with FSB delivery interrupt, we can restart APIC functionality */
825 apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " 852 apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
826 "0x%lx.\n", apic->apic_base, apic->base_address); 853 "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
827 854
828} 855}
829 856
830u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) 857u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
831{ 858{
832 return vcpu->apic_base; 859 return vcpu->arch.apic_base;
833} 860}
834EXPORT_SYMBOL_GPL(kvm_lapic_get_base); 861EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
835 862
@@ -841,7 +868,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
841 apic_debug("%s\n", __FUNCTION__); 868 apic_debug("%s\n", __FUNCTION__);
842 869
843 ASSERT(vcpu); 870 ASSERT(vcpu);
844 apic = vcpu->apic; 871 apic = vcpu->arch.apic;
845 ASSERT(apic != NULL); 872 ASSERT(apic != NULL);
846 873
847 /* Stop the timer in case it's a reset to an active apic */ 874 /* Stop the timer in case it's a reset to an active apic */
@@ -872,19 +899,19 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
872 update_divide_count(apic); 899 update_divide_count(apic);
873 atomic_set(&apic->timer.pending, 0); 900 atomic_set(&apic->timer.pending, 0);
874 if (vcpu->vcpu_id == 0) 901 if (vcpu->vcpu_id == 0)
875 vcpu->apic_base |= MSR_IA32_APICBASE_BSP; 902 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
876 apic_update_ppr(apic); 903 apic_update_ppr(apic);
877 904
878 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" 905 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
879 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, 906 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
880 vcpu, kvm_apic_id(apic), 907 vcpu, kvm_apic_id(apic),
881 vcpu->apic_base, apic->base_address); 908 vcpu->arch.apic_base, apic->base_address);
882} 909}
883EXPORT_SYMBOL_GPL(kvm_lapic_reset); 910EXPORT_SYMBOL_GPL(kvm_lapic_reset);
884 911
885int kvm_lapic_enabled(struct kvm_vcpu *vcpu) 912int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
886{ 913{
887 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 914 struct kvm_lapic *apic = vcpu->arch.apic;
888 int ret = 0; 915 int ret = 0;
889 916
890 if (!apic) 917 if (!apic)
@@ -908,9 +935,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
908 wait_queue_head_t *q = &apic->vcpu->wq; 935 wait_queue_head_t *q = &apic->vcpu->wq;
909 936
910 atomic_inc(&apic->timer.pending); 937 atomic_inc(&apic->timer.pending);
911 if (waitqueue_active(q)) 938 if (waitqueue_active(q)) {
912 { 939 apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
913 apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
914 wake_up_interruptible(q); 940 wake_up_interruptible(q);
915 } 941 }
916 if (apic_lvtt_period(apic)) { 942 if (apic_lvtt_period(apic)) {
@@ -956,13 +982,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
956 if (!apic) 982 if (!apic)
957 goto nomem; 983 goto nomem;
958 984
959 vcpu->apic = apic; 985 vcpu->arch.apic = apic;
960 986
961 apic->regs_page = alloc_page(GFP_KERNEL); 987 apic->regs_page = alloc_page(GFP_KERNEL);
962 if (apic->regs_page == NULL) { 988 if (apic->regs_page == NULL) {
963 printk(KERN_ERR "malloc apic regs error for vcpu %x\n", 989 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
964 vcpu->vcpu_id); 990 vcpu->vcpu_id);
965 goto nomem; 991 goto nomem_free_apic;
966 } 992 }
967 apic->regs = page_address(apic->regs_page); 993 apic->regs = page_address(apic->regs_page);
968 memset(apic->regs, 0, PAGE_SIZE); 994 memset(apic->regs, 0, PAGE_SIZE);
@@ -971,7 +997,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
971 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 997 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
972 apic->timer.dev.function = apic_timer_fn; 998 apic->timer.dev.function = apic_timer_fn;
973 apic->base_address = APIC_DEFAULT_PHYS_BASE; 999 apic->base_address = APIC_DEFAULT_PHYS_BASE;
974 vcpu->apic_base = APIC_DEFAULT_PHYS_BASE; 1000 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
975 1001
976 kvm_lapic_reset(vcpu); 1002 kvm_lapic_reset(vcpu);
977 apic->dev.read = apic_mmio_read; 1003 apic->dev.read = apic_mmio_read;
@@ -980,15 +1006,16 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
980 apic->dev.private = apic; 1006 apic->dev.private = apic;
981 1007
982 return 0; 1008 return 0;
1009nomem_free_apic:
1010 kfree(apic);
983nomem: 1011nomem:
984 kvm_free_apic(apic);
985 return -ENOMEM; 1012 return -ENOMEM;
986} 1013}
987EXPORT_SYMBOL_GPL(kvm_create_lapic); 1014EXPORT_SYMBOL_GPL(kvm_create_lapic);
988 1015
989int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) 1016int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
990{ 1017{
991 struct kvm_lapic *apic = vcpu->apic; 1018 struct kvm_lapic *apic = vcpu->arch.apic;
992 int highest_irr; 1019 int highest_irr;
993 1020
994 if (!apic || !apic_enabled(apic)) 1021 if (!apic || !apic_enabled(apic))
@@ -1004,11 +1031,11 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
1004 1031
1005int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) 1032int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1006{ 1033{
1007 u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0); 1034 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1008 int r = 0; 1035 int r = 0;
1009 1036
1010 if (vcpu->vcpu_id == 0) { 1037 if (vcpu->vcpu_id == 0) {
1011 if (!apic_hw_enabled(vcpu->apic)) 1038 if (!apic_hw_enabled(vcpu->arch.apic))
1012 r = 1; 1039 r = 1;
1013 if ((lvt0 & APIC_LVT_MASKED) == 0 && 1040 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1014 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 1041 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1019,7 +1046,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1019 1046
1020void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) 1047void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1021{ 1048{
1022 struct kvm_lapic *apic = vcpu->apic; 1049 struct kvm_lapic *apic = vcpu->arch.apic;
1023 1050
1024 if (apic && apic_lvt_enabled(apic, APIC_LVTT) && 1051 if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
1025 atomic_read(&apic->timer.pending) > 0) { 1052 atomic_read(&apic->timer.pending) > 0) {
@@ -1030,7 +1057,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1030 1057
1031void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) 1058void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
1032{ 1059{
1033 struct kvm_lapic *apic = vcpu->apic; 1060 struct kvm_lapic *apic = vcpu->arch.apic;
1034 1061
1035 if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) 1062 if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
1036 apic->timer.last_update = ktime_add_ns( 1063 apic->timer.last_update = ktime_add_ns(
@@ -1041,7 +1068,7 @@ void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
1041int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) 1068int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1042{ 1069{
1043 int vector = kvm_apic_has_interrupt(vcpu); 1070 int vector = kvm_apic_has_interrupt(vcpu);
1044 struct kvm_lapic *apic = vcpu->apic; 1071 struct kvm_lapic *apic = vcpu->arch.apic;
1045 1072
1046 if (vector == -1) 1073 if (vector == -1)
1047 return -1; 1074 return -1;
@@ -1054,9 +1081,9 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1054 1081
1055void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) 1082void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1056{ 1083{
1057 struct kvm_lapic *apic = vcpu->apic; 1084 struct kvm_lapic *apic = vcpu->arch.apic;
1058 1085
1059 apic->base_address = vcpu->apic_base & 1086 apic->base_address = vcpu->arch.apic_base &
1060 MSR_IA32_APICBASE_BASE; 1087 MSR_IA32_APICBASE_BASE;
1061 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 1088 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1062 apic_update_ppr(apic); 1089 apic_update_ppr(apic);
@@ -1065,9 +1092,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1065 start_apic_timer(apic); 1092 start_apic_timer(apic);
1066} 1093}
1067 1094
1068void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1095void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1069{ 1096{
1070 struct kvm_lapic *apic = vcpu->apic; 1097 struct kvm_lapic *apic = vcpu->arch.apic;
1071 struct hrtimer *timer; 1098 struct hrtimer *timer;
1072 1099
1073 if (!apic) 1100 if (!apic)
@@ -1077,4 +1104,51 @@ void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1077 if (hrtimer_cancel(timer)) 1104 if (hrtimer_cancel(timer))
1078 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); 1105 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
1079} 1106}
1080EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer); 1107
1108void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1109{
1110 u32 data;
1111 void *vapic;
1112
1113 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
1114 return;
1115
1116 vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
1117 data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
1118 kunmap_atomic(vapic, KM_USER0);
1119
1120 apic_set_tpr(vcpu->arch.apic, data & 0xff);
1121}
1122
1123void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1124{
1125 u32 data, tpr;
1126 int max_irr, max_isr;
1127 struct kvm_lapic *apic;
1128 void *vapic;
1129
1130 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
1131 return;
1132
1133 apic = vcpu->arch.apic;
1134 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
1135 max_irr = apic_find_highest_irr(apic);
1136 if (max_irr < 0)
1137 max_irr = 0;
1138 max_isr = apic_find_highest_isr(apic);
1139 if (max_isr < 0)
1140 max_isr = 0;
1141 data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
1142
1143 vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
1144 *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
1145 kunmap_atomic(vapic, KM_USER0);
1146}
1147
1148void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
1149{
1150 if (!irqchip_in_kernel(vcpu->kvm))
1151 return;
1152
1153 vcpu->arch.apic->vapic_addr = vapic_addr;
1154}
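kvm_lapic_sync_to_vapic() above packs the task priority, the high nibble of the highest in-service vector, and the highest pending vector into a single 32-bit word in the guest-shared vapic page. A small decode helper matching that packing, shown only to make the layout explicit (not part of the patch):

        /* Word written by kvm_lapic_sync_to_vapic():
         *   bits  0-7   TPR (APIC_TASKPRI & 0xff)
         *   bits 12-15  high nibble of the highest in-service vector
         *   bits 24-31  highest pending IRR vector */
        static inline void decode_vapic_word(u32 data, u8 *tpr, u8 *isr_hi, u8 *irr)
        {
                *tpr    = data & 0xff;
                *isr_hi = (data >> 8) & 0xf0;
                *irr    = data >> 24;
        }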
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
new file mode 100644
index 000000000000..676c396c9cee
--- /dev/null
+++ b/arch/x86/kvm/lapic.h
@@ -0,0 +1,50 @@
1#ifndef __KVM_X86_LAPIC_H
2#define __KVM_X86_LAPIC_H
3
4#include "iodev.h"
5
6#include <linux/kvm_host.h>
7
8struct kvm_lapic {
9 unsigned long base_address;
10 struct kvm_io_device dev;
11 struct {
12 atomic_t pending;
13 s64 period; /* unit: ns */
14 u32 divide_count;
15 ktime_t last_update;
16 struct hrtimer dev;
17 } timer;
18 struct kvm_vcpu *vcpu;
19 struct page *regs_page;
20 void *regs;
21 gpa_t vapic_addr;
22 struct page *vapic_page;
23};
24int kvm_create_lapic(struct kvm_vcpu *vcpu);
25void kvm_free_lapic(struct kvm_vcpu *vcpu);
26
27int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
28int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
29int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
30void kvm_lapic_reset(struct kvm_vcpu *vcpu);
31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
34
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
37int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
38
39u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
40void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
41void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
42int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
43int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
44void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
45
46void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
47void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
48void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
49
50#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
new file mode 100644
index 000000000000..8efdcdbebb03
--- /dev/null
+++ b/arch/x86/kvm/mmu.c
@@ -0,0 +1,1885 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include "vmx.h"
21#include "mmu.h"
22
23#include <linux/kvm_host.h>
24#include <linux/types.h>
25#include <linux/string.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/module.h>
29#include <linux/swap.h>
30
31#include <asm/page.h>
32#include <asm/cmpxchg.h>
33#include <asm/io.h>
34
35#undef MMU_DEBUG
36
37#undef AUDIT
38
39#ifdef AUDIT
40static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
41#else
42static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
43#endif
44
45#ifdef MMU_DEBUG
46
47#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
48#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
49
50#else
51
52#define pgprintk(x...) do { } while (0)
53#define rmap_printk(x...) do { } while (0)
54
55#endif
56
57#if defined(MMU_DEBUG) || defined(AUDIT)
58static int dbg = 1;
59#endif
60
61#ifndef MMU_DEBUG
62#define ASSERT(x) do { } while (0)
63#else
64#define ASSERT(x) \
65 if (!(x)) { \
66 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
67 __FILE__, __LINE__, #x); \
68 }
69#endif
70
71#define PT64_PT_BITS 9
72#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
73#define PT32_PT_BITS 10
74#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
75
76#define PT_WRITABLE_SHIFT 1
77
78#define PT_PRESENT_MASK (1ULL << 0)
79#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
80#define PT_USER_MASK (1ULL << 2)
81#define PT_PWT_MASK (1ULL << 3)
82#define PT_PCD_MASK (1ULL << 4)
83#define PT_ACCESSED_MASK (1ULL << 5)
84#define PT_DIRTY_MASK (1ULL << 6)
85#define PT_PAGE_SIZE_MASK (1ULL << 7)
86#define PT_PAT_MASK (1ULL << 7)
87#define PT_GLOBAL_MASK (1ULL << 8)
88#define PT64_NX_SHIFT 63
89#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
90
91#define PT_PAT_SHIFT 7
92#define PT_DIR_PAT_SHIFT 12
93#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
94
95#define PT32_DIR_PSE36_SIZE 4
96#define PT32_DIR_PSE36_SHIFT 13
97#define PT32_DIR_PSE36_MASK \
98 (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
99
100
101#define PT_FIRST_AVAIL_BITS_SHIFT 9
102#define PT64_SECOND_AVAIL_BITS_SHIFT 52
103
104#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
105
106#define VALID_PAGE(x) ((x) != INVALID_PAGE)
107
108#define PT64_LEVEL_BITS 9
109
110#define PT64_LEVEL_SHIFT(level) \
111 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
112
113#define PT64_LEVEL_MASK(level) \
114 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
115
116#define PT64_INDEX(address, level)\
117 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
118
119
120#define PT32_LEVEL_BITS 10
121
122#define PT32_LEVEL_SHIFT(level) \
123 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
124
125#define PT32_LEVEL_MASK(level) \
126 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
127
128#define PT32_INDEX(address, level)\
129 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
130
131
132#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
133#define PT64_DIR_BASE_ADDR_MASK \
134 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
135
136#define PT32_BASE_ADDR_MASK PAGE_MASK
137#define PT32_DIR_BASE_ADDR_MASK \
138 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
139
140#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
141 | PT64_NX_MASK)
142
143#define PFERR_PRESENT_MASK (1U << 0)
144#define PFERR_WRITE_MASK (1U << 1)
145#define PFERR_USER_MASK (1U << 2)
146#define PFERR_FETCH_MASK (1U << 4)
147
148#define PT64_ROOT_LEVEL 4
149#define PT32_ROOT_LEVEL 2
150#define PT32E_ROOT_LEVEL 3
151
152#define PT_DIRECTORY_LEVEL 2
153#define PT_PAGE_TABLE_LEVEL 1
154
155#define RMAP_EXT 4
156
157#define ACC_EXEC_MASK 1
158#define ACC_WRITE_MASK PT_WRITABLE_MASK
159#define ACC_USER_MASK PT_USER_MASK
160#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
161
162struct kvm_rmap_desc {
163 u64 *shadow_ptes[RMAP_EXT];
164 struct kvm_rmap_desc *more;
165};
166
167static struct kmem_cache *pte_chain_cache;
168static struct kmem_cache *rmap_desc_cache;
169static struct kmem_cache *mmu_page_header_cache;
170
171static u64 __read_mostly shadow_trap_nonpresent_pte;
172static u64 __read_mostly shadow_notrap_nonpresent_pte;
173
174void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
175{
176 shadow_trap_nonpresent_pte = trap_pte;
177 shadow_notrap_nonpresent_pte = notrap_pte;
178}
179EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
180
181static int is_write_protection(struct kvm_vcpu *vcpu)
182{
183 return vcpu->arch.cr0 & X86_CR0_WP;
184}
185
186static int is_cpuid_PSE36(void)
187{
188 return 1;
189}
190
191static int is_nx(struct kvm_vcpu *vcpu)
192{
193 return vcpu->arch.shadow_efer & EFER_NX;
194}
195
196static int is_present_pte(unsigned long pte)
197{
198 return pte & PT_PRESENT_MASK;
199}
200
201static int is_shadow_present_pte(u64 pte)
202{
203 pte &= ~PT_SHADOW_IO_MARK;
204 return pte != shadow_trap_nonpresent_pte
205 && pte != shadow_notrap_nonpresent_pte;
206}
207
208static int is_writeble_pte(unsigned long pte)
209{
210 return pte & PT_WRITABLE_MASK;
211}
212
213static int is_dirty_pte(unsigned long pte)
214{
215 return pte & PT_DIRTY_MASK;
216}
217
218static int is_io_pte(unsigned long pte)
219{
220 return pte & PT_SHADOW_IO_MARK;
221}
222
223static int is_rmap_pte(u64 pte)
224{
225 return pte != shadow_trap_nonpresent_pte
226 && pte != shadow_notrap_nonpresent_pte;
227}
228
229static gfn_t pse36_gfn_delta(u32 gpte)
230{
231 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
232
233 return (gpte & PT32_DIR_PSE36_MASK) << shift;
234}
235
236static void set_shadow_pte(u64 *sptep, u64 spte)
237{
238#ifdef CONFIG_X86_64
239 set_64bit((unsigned long *)sptep, spte);
240#else
241 set_64bit((unsigned long long *)sptep, spte);
242#endif
243}
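Why set_shadow_pte() goes through set_64bit(): a shadow pte is 8 bytes, and on a 32-bit host a plain assignment can be emitted as two 32-bit stores, so the hardware walker could observe a half-written entry. A rough user-space sketch of the difference (not kernel code; the GCC __atomic builtin only stands in for set_64bit()):

#include <stdint.h>

/* May tear on a 32-bit target: the compiler is free to emit two stores. */
static void spte_store_plain(volatile uint64_t *sptep, uint64_t spte)
{
	*sptep = spte;
}

/* Single atomic 64-bit store, analogous to the set_64bit() guarantee. */
static void spte_store_atomic(uint64_t *sptep, uint64_t spte)
{
	__atomic_store_n(sptep, spte, __ATOMIC_RELAXED);
}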
244
245static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
246 struct kmem_cache *base_cache, int min)
247{
248 void *obj;
249
250 if (cache->nobjs >= min)
251 return 0;
252 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
253 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
254 if (!obj)
255 return -ENOMEM;
256 cache->objects[cache->nobjs++] = obj;
257 }
258 return 0;
259}
260
261static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
262{
263 while (mc->nobjs)
264 kfree(mc->objects[--mc->nobjs]);
265}
266
267static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
268 int min)
269{
270 struct page *page;
271
272 if (cache->nobjs >= min)
273 return 0;
274 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
275 page = alloc_page(GFP_KERNEL);
276 if (!page)
277 return -ENOMEM;
278 set_page_private(page, 0);
279 cache->objects[cache->nobjs++] = page_address(page);
280 }
281 return 0;
282}
283
284static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
285{
286 while (mc->nobjs)
287 free_page((unsigned long)mc->objects[--mc->nobjs]);
288}
289
290static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
291{
292 int r;
293
294 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
295 pte_chain_cache, 4);
296 if (r)
297 goto out;
298 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
299 rmap_desc_cache, 1);
300 if (r)
301 goto out;
302 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
303 if (r)
304 goto out;
305 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
306 mmu_page_header_cache, 4);
307out:
308 return r;
309}
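The point of the memory-cache helpers above is ordering: topping up happens before mmu_lock is taken and may sleep, while consuming an object later never calls the allocator. A simplified user-space sketch of the pattern (hypothetical types; unlike the kernel version it only fills up to min instead of the whole array):

#include <stdlib.h>

#define EX_CACHE_OBJS 16

struct obj_cache {
	int   nobjs;
	void *objects[EX_CACHE_OBJS];
};

/* Fill while sleeping/allocating is still allowed... */
static int cache_topup(struct obj_cache *c, size_t size, int min)
{
	while (c->nobjs < min) {
		void *p = calloc(1, size);

		if (!p)
			return -1;
		c->objects[c->nobjs++] = p;
	}
	return 0;
}

/* ...then consume under the spinlock without touching the allocator. */
static void *cache_alloc(struct obj_cache *c)
{
	return c->objects[--c->nobjs];
}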
310
311static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
312{
313 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
314 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
315 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
316 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
317}
318
319static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
320 size_t size)
321{
322 void *p;
323
324 BUG_ON(!mc->nobjs);
325 p = mc->objects[--mc->nobjs];
326 memset(p, 0, size);
327 return p;
328}
329
330static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
331{
332 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
333 sizeof(struct kvm_pte_chain));
334}
335
336static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
337{
338 kfree(pc);
339}
340
341static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
342{
343 return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
344 sizeof(struct kvm_rmap_desc));
345}
346
347static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
348{
349 kfree(rd);
350}
351
352/*
353 * Take gfn and return the reverse mapping to it.
 354 * Note: gfn must be unaliased before this function gets called
355 */
356
357static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
358{
359 struct kvm_memory_slot *slot;
360
361 slot = gfn_to_memslot(kvm, gfn);
362 return &slot->rmap[gfn - slot->base_gfn];
363}
364
365/*
366 * Reverse mapping data structures:
367 *
 368 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
369 * that points to page_address(page).
370 *
 371 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
372 * containing more mappings.
373 */
374static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
375{
376 struct kvm_mmu_page *sp;
377 struct kvm_rmap_desc *desc;
378 unsigned long *rmapp;
379 int i;
380
381 if (!is_rmap_pte(*spte))
382 return;
383 gfn = unalias_gfn(vcpu->kvm, gfn);
384 sp = page_header(__pa(spte));
385 sp->gfns[spte - sp->spt] = gfn;
386 rmapp = gfn_to_rmap(vcpu->kvm, gfn);
387 if (!*rmapp) {
388 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
389 *rmapp = (unsigned long)spte;
390 } else if (!(*rmapp & 1)) {
391 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
392 desc = mmu_alloc_rmap_desc(vcpu);
393 desc->shadow_ptes[0] = (u64 *)*rmapp;
394 desc->shadow_ptes[1] = spte;
395 *rmapp = (unsigned long)desc | 1;
396 } else {
397 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
398 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
399 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
400 desc = desc->more;
401 if (desc->shadow_ptes[RMAP_EXT-1]) {
402 desc->more = mmu_alloc_rmap_desc(vcpu);
403 desc = desc->more;
404 }
405 for (i = 0; desc->shadow_ptes[i]; ++i)
406 ;
407 desc->shadow_ptes[i] = spte;
408 }
409}
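rmap_add() depends on rmapp being a tagged word, as the comment above describes: bit 0 clear means it is a single spte pointer, bit 0 set means (rmapp & ~1) is a kvm_rmap_desc chain. A standalone sketch of that tagging trick (assumes at least 2-byte pointer alignment; the struct name is just a stand-in):

#include <assert.h>

struct rmap_desc;	/* stand-in for struct kvm_rmap_desc */

static int rmap_is_desc(unsigned long rmapp)
{
	return rmapp & 1;
}

static struct rmap_desc *rmap_to_desc(unsigned long rmapp)
{
	assert(rmap_is_desc(rmapp));
	return (struct rmap_desc *)(rmapp & ~1ul);
}

static unsigned long rmap_from_desc(struct rmap_desc *desc)
{
	return (unsigned long)desc | 1;	/* low bit tags "many mappings" */
}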
410
411static void rmap_desc_remove_entry(unsigned long *rmapp,
412 struct kvm_rmap_desc *desc,
413 int i,
414 struct kvm_rmap_desc *prev_desc)
415{
416 int j;
417
418 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
419 ;
420 desc->shadow_ptes[i] = desc->shadow_ptes[j];
421 desc->shadow_ptes[j] = NULL;
422 if (j != 0)
423 return;
424 if (!prev_desc && !desc->more)
425 *rmapp = (unsigned long)desc->shadow_ptes[0];
426 else
427 if (prev_desc)
428 prev_desc->more = desc->more;
429 else
430 *rmapp = (unsigned long)desc->more | 1;
431 mmu_free_rmap_desc(desc);
432}
433
434static void rmap_remove(struct kvm *kvm, u64 *spte)
435{
436 struct kvm_rmap_desc *desc;
437 struct kvm_rmap_desc *prev_desc;
438 struct kvm_mmu_page *sp;
439 struct page *page;
440 unsigned long *rmapp;
441 int i;
442
443 if (!is_rmap_pte(*spte))
444 return;
445 sp = page_header(__pa(spte));
446 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
447 mark_page_accessed(page);
448 if (is_writeble_pte(*spte))
449 kvm_release_page_dirty(page);
450 else
451 kvm_release_page_clean(page);
452 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
453 if (!*rmapp) {
454 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
455 BUG();
456 } else if (!(*rmapp & 1)) {
457 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
458 if ((u64 *)*rmapp != spte) {
459 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
460 spte, *spte);
461 BUG();
462 }
463 *rmapp = 0;
464 } else {
465 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
466 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
467 prev_desc = NULL;
468 while (desc) {
469 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
470 if (desc->shadow_ptes[i] == spte) {
471 rmap_desc_remove_entry(rmapp,
472 desc, i,
473 prev_desc);
474 return;
475 }
476 prev_desc = desc;
477 desc = desc->more;
478 }
479 BUG();
480 }
481}
482
483static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
484{
485 struct kvm_rmap_desc *desc;
486 struct kvm_rmap_desc *prev_desc;
487 u64 *prev_spte;
488 int i;
489
490 if (!*rmapp)
491 return NULL;
492 else if (!(*rmapp & 1)) {
493 if (!spte)
494 return (u64 *)*rmapp;
495 return NULL;
496 }
497 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
498 prev_desc = NULL;
499 prev_spte = NULL;
500 while (desc) {
501 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
502 if (prev_spte == spte)
503 return desc->shadow_ptes[i];
504 prev_spte = desc->shadow_ptes[i];
505 }
506 desc = desc->more;
507 }
508 return NULL;
509}
510
511static void rmap_write_protect(struct kvm *kvm, u64 gfn)
512{
513 unsigned long *rmapp;
514 u64 *spte;
515 int write_protected = 0;
516
517 gfn = unalias_gfn(kvm, gfn);
518 rmapp = gfn_to_rmap(kvm, gfn);
519
520 spte = rmap_next(kvm, rmapp, NULL);
521 while (spte) {
522 BUG_ON(!spte);
523 BUG_ON(!(*spte & PT_PRESENT_MASK));
524 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
525 if (is_writeble_pte(*spte)) {
526 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
527 write_protected = 1;
528 }
529 spte = rmap_next(kvm, rmapp, spte);
530 }
531 if (write_protected)
532 kvm_flush_remote_tlbs(kvm);
533}
534
535#ifdef MMU_DEBUG
536static int is_empty_shadow_page(u64 *spt)
537{
538 u64 *pos;
539 u64 *end;
540
541 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
542 if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
543 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
544 pos, *pos);
545 return 0;
546 }
547 return 1;
548}
549#endif
550
551static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
552{
553 ASSERT(is_empty_shadow_page(sp->spt));
554 list_del(&sp->link);
555 __free_page(virt_to_page(sp->spt));
556 __free_page(virt_to_page(sp->gfns));
557 kfree(sp);
558 ++kvm->arch.n_free_mmu_pages;
559}
560
561static unsigned kvm_page_table_hashfn(gfn_t gfn)
562{
563 return gfn;
564}
565
566static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
567 u64 *parent_pte)
568{
569 struct kvm_mmu_page *sp;
570
571 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
572 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
573 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
574 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
575 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
576 ASSERT(is_empty_shadow_page(sp->spt));
577 sp->slot_bitmap = 0;
578 sp->multimapped = 0;
579 sp->parent_pte = parent_pte;
580 --vcpu->kvm->arch.n_free_mmu_pages;
581 return sp;
582}
583
584static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
585 struct kvm_mmu_page *sp, u64 *parent_pte)
586{
587 struct kvm_pte_chain *pte_chain;
588 struct hlist_node *node;
589 int i;
590
591 if (!parent_pte)
592 return;
593 if (!sp->multimapped) {
594 u64 *old = sp->parent_pte;
595
596 if (!old) {
597 sp->parent_pte = parent_pte;
598 return;
599 }
600 sp->multimapped = 1;
601 pte_chain = mmu_alloc_pte_chain(vcpu);
602 INIT_HLIST_HEAD(&sp->parent_ptes);
603 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
604 pte_chain->parent_ptes[0] = old;
605 }
606 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
607 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
608 continue;
609 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
610 if (!pte_chain->parent_ptes[i]) {
611 pte_chain->parent_ptes[i] = parent_pte;
612 return;
613 }
614 }
615 pte_chain = mmu_alloc_pte_chain(vcpu);
616 BUG_ON(!pte_chain);
617 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
618 pte_chain->parent_ptes[0] = parent_pte;
619}
620
621static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
622 u64 *parent_pte)
623{
624 struct kvm_pte_chain *pte_chain;
625 struct hlist_node *node;
626 int i;
627
628 if (!sp->multimapped) {
629 BUG_ON(sp->parent_pte != parent_pte);
630 sp->parent_pte = NULL;
631 return;
632 }
633 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
634 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
635 if (!pte_chain->parent_ptes[i])
636 break;
637 if (pte_chain->parent_ptes[i] != parent_pte)
638 continue;
639 while (i + 1 < NR_PTE_CHAIN_ENTRIES
640 && pte_chain->parent_ptes[i + 1]) {
641 pte_chain->parent_ptes[i]
642 = pte_chain->parent_ptes[i + 1];
643 ++i;
644 }
645 pte_chain->parent_ptes[i] = NULL;
646 if (i == 0) {
647 hlist_del(&pte_chain->link);
648 mmu_free_pte_chain(pte_chain);
649 if (hlist_empty(&sp->parent_ptes)) {
650 sp->multimapped = 0;
651 sp->parent_pte = NULL;
652 }
653 }
654 return;
655 }
656 BUG();
657}
658
659static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
660{
661 unsigned index;
662 struct hlist_head *bucket;
663 struct kvm_mmu_page *sp;
664 struct hlist_node *node;
665
666 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
667 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
668 bucket = &kvm->arch.mmu_page_hash[index];
669 hlist_for_each_entry(sp, node, bucket, hash_link)
670 if (sp->gfn == gfn && !sp->role.metaphysical) {
671 pgprintk("%s: found role %x\n",
672 __FUNCTION__, sp->role.word);
673 return sp;
674 }
675 return NULL;
676}
677
678static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
679 gfn_t gfn,
680 gva_t gaddr,
681 unsigned level,
682 int metaphysical,
683 unsigned access,
684 u64 *parent_pte,
685 bool *new_page)
686{
687 union kvm_mmu_page_role role;
688 unsigned index;
689 unsigned quadrant;
690 struct hlist_head *bucket;
691 struct kvm_mmu_page *sp;
692 struct hlist_node *node;
693
694 role.word = 0;
695 role.glevels = vcpu->arch.mmu.root_level;
696 role.level = level;
697 role.metaphysical = metaphysical;
698 role.access = access;
699 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
700 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
701 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
702 role.quadrant = quadrant;
703 }
704 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
705 gfn, role.word);
706 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
707 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
708 hlist_for_each_entry(sp, node, bucket, hash_link)
709 if (sp->gfn == gfn && sp->role.word == role.word) {
710 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
711 pgprintk("%s: found\n", __FUNCTION__);
712 return sp;
713 }
714 ++vcpu->kvm->stat.mmu_cache_miss;
715 sp = kvm_mmu_alloc_page(vcpu, parent_pte);
716 if (!sp)
717 return sp;
718 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
719 sp->gfn = gfn;
720 sp->role = role;
721 hlist_add_head(&sp->hash_link, bucket);
722 vcpu->arch.mmu.prefetch_page(vcpu, sp);
723 if (!metaphysical)
724 rmap_write_protect(vcpu->kvm, gfn);
725 if (new_page)
726 *new_page = 1;
727 return sp;
728}
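kvm_mmu_get_page() keys shadow pages by the pair (gfn, role.word), so two shadow pages for the same guest frame but with different roles (level, access, quadrant, metaphysical, ...) can coexist in the hash. A simplified sketch of the idea (hypothetical bitfield layout; the kernel's union kvm_mmu_page_role differs in detail):

#include <stdint.h>

union shadow_role {
	uint32_t word;
	struct {
		uint32_t glevels      : 4;
		uint32_t level        : 4;
		uint32_t quadrant     : 2;
		uint32_t metaphysical : 1;
		uint32_t access       : 3;
	};
};

/* Two shadow pages match iff both the gfn and the packed role match. */
static int same_shadow_page(uint64_t gfn_a, union shadow_role a,
			    uint64_t gfn_b, union shadow_role b)
{
	return gfn_a == gfn_b && a.word == b.word;
}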
729
730static void kvm_mmu_page_unlink_children(struct kvm *kvm,
731 struct kvm_mmu_page *sp)
732{
733 unsigned i;
734 u64 *pt;
735 u64 ent;
736
737 pt = sp->spt;
738
739 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
740 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
741 if (is_shadow_present_pte(pt[i]))
742 rmap_remove(kvm, &pt[i]);
743 pt[i] = shadow_trap_nonpresent_pte;
744 }
745 kvm_flush_remote_tlbs(kvm);
746 return;
747 }
748
749 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
750 ent = pt[i];
751
752 pt[i] = shadow_trap_nonpresent_pte;
753 if (!is_shadow_present_pte(ent))
754 continue;
755 ent &= PT64_BASE_ADDR_MASK;
756 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
757 }
758 kvm_flush_remote_tlbs(kvm);
759}
760
761static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
762{
763 mmu_page_remove_parent_pte(sp, parent_pte);
764}
765
766static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
767{
768 int i;
769
770 for (i = 0; i < KVM_MAX_VCPUS; ++i)
771 if (kvm->vcpus[i])
772 kvm->vcpus[i]->arch.last_pte_updated = NULL;
773}
774
775static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
776{
777 u64 *parent_pte;
778
779 ++kvm->stat.mmu_shadow_zapped;
780 while (sp->multimapped || sp->parent_pte) {
781 if (!sp->multimapped)
782 parent_pte = sp->parent_pte;
783 else {
784 struct kvm_pte_chain *chain;
785
786 chain = container_of(sp->parent_ptes.first,
787 struct kvm_pte_chain, link);
788 parent_pte = chain->parent_ptes[0];
789 }
790 BUG_ON(!parent_pte);
791 kvm_mmu_put_page(sp, parent_pte);
792 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
793 }
794 kvm_mmu_page_unlink_children(kvm, sp);
795 if (!sp->root_count) {
796 hlist_del(&sp->hash_link);
797 kvm_mmu_free_page(kvm, sp);
798 } else
799 list_move(&sp->link, &kvm->arch.active_mmu_pages);
800 kvm_mmu_reset_last_pte_updated(kvm);
801}
802
803/*
 804 * Change the number of mmu pages allocated to the vm.
 805 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
806 */
807void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
808{
809 /*
 810	 * If we set the number of mmu pages to be smaller than the
 811	 * number of active pages, we must free some mmu pages before we
 812	 * can change the value.
813 */
814
815 if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
816 kvm_nr_mmu_pages) {
817 int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
818 - kvm->arch.n_free_mmu_pages;
819
820 while (n_used_mmu_pages > kvm_nr_mmu_pages) {
821 struct kvm_mmu_page *page;
822
823 page = container_of(kvm->arch.active_mmu_pages.prev,
824 struct kvm_mmu_page, link);
825 kvm_mmu_zap_page(kvm, page);
826 n_used_mmu_pages--;
827 }
828 kvm->arch.n_free_mmu_pages = 0;
829 }
830 else
831 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
832 - kvm->arch.n_alloc_mmu_pages;
833
834 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
835}
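A worked example of the accounting in kvm_mmu_change_mmu_pages(), with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned int n_alloc = 1024, n_free = 200, new_limit = 600;
	unsigned int n_used  = n_alloc - n_free;	/* 824 pages in use */

	if (n_used > new_limit)
		/* shrink: zap until only new_limit pages remain in use */
		printf("zap %u pages, free count drops to 0\n",
		       n_used - new_limit);		/* 224 */
	else
		/* grow: the difference simply becomes additional free pages */
		printf("free count grows by %d\n", (int)(new_limit - n_alloc));
	return 0;
}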
836
837static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
838{
839 unsigned index;
840 struct hlist_head *bucket;
841 struct kvm_mmu_page *sp;
842 struct hlist_node *node, *n;
843 int r;
844
845 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
846 r = 0;
847 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
848 bucket = &kvm->arch.mmu_page_hash[index];
849 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
850 if (sp->gfn == gfn && !sp->role.metaphysical) {
851 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
852 sp->role.word);
853 kvm_mmu_zap_page(kvm, sp);
854 r = 1;
855 }
856 return r;
857}
858
859static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
860{
861 struct kvm_mmu_page *sp;
862
863 while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
864 pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
865 kvm_mmu_zap_page(kvm, sp);
866 }
867}
868
869static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
870{
871 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
872 struct kvm_mmu_page *sp = page_header(__pa(pte));
873
874 __set_bit(slot, &sp->slot_bitmap);
875}
876
877struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
878{
879 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
880
881 if (gpa == UNMAPPED_GVA)
882 return NULL;
883 return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
884}
885
886static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
887 unsigned pt_access, unsigned pte_access,
888 int user_fault, int write_fault, int dirty,
889 int *ptwrite, gfn_t gfn, struct page *page)
890{
891 u64 spte;
892 int was_rmapped = is_rmap_pte(*shadow_pte);
893 int was_writeble = is_writeble_pte(*shadow_pte);
894
895 pgprintk("%s: spte %llx access %x write_fault %d"
896 " user_fault %d gfn %lx\n",
897 __FUNCTION__, *shadow_pte, pt_access,
898 write_fault, user_fault, gfn);
899
900 /*
901 * We don't set the accessed bit, since we sometimes want to see
902 * whether the guest actually used the pte (in order to detect
903 * demand paging).
904 */
905 spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
906 if (!dirty)
907 pte_access &= ~ACC_WRITE_MASK;
908 if (!(pte_access & ACC_EXEC_MASK))
909 spte |= PT64_NX_MASK;
910
911 spte |= PT_PRESENT_MASK;
912 if (pte_access & ACC_USER_MASK)
913 spte |= PT_USER_MASK;
914
915 if (is_error_page(page)) {
916 set_shadow_pte(shadow_pte,
917 shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
918 kvm_release_page_clean(page);
919 return;
920 }
921
922 spte |= page_to_phys(page);
923
924 if ((pte_access & ACC_WRITE_MASK)
925 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
926 struct kvm_mmu_page *shadow;
927
928 spte |= PT_WRITABLE_MASK;
929 if (user_fault) {
930 mmu_unshadow(vcpu->kvm, gfn);
931 goto unshadowed;
932 }
933
934 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
935 if (shadow) {
936 pgprintk("%s: found shadow page for %lx, marking ro\n",
937 __FUNCTION__, gfn);
938 pte_access &= ~ACC_WRITE_MASK;
939 if (is_writeble_pte(spte)) {
940 spte &= ~PT_WRITABLE_MASK;
941 kvm_x86_ops->tlb_flush(vcpu);
942 }
943 if (write_fault)
944 *ptwrite = 1;
945 }
946 }
947
948unshadowed:
949
950 if (pte_access & ACC_WRITE_MASK)
951 mark_page_dirty(vcpu->kvm, gfn);
952
953 pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
954 set_shadow_pte(shadow_pte, spte);
955 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
956 if (!was_rmapped) {
957 rmap_add(vcpu, shadow_pte, gfn);
958 if (!is_rmap_pte(*shadow_pte))
959 kvm_release_page_clean(page);
960 } else {
961 if (was_writeble)
962 kvm_release_page_dirty(page);
963 else
964 kvm_release_page_clean(page);
965 }
966 if (!ptwrite || !*ptwrite)
967 vcpu->arch.last_pte_updated = shadow_pte;
968}
969
970static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
971{
972}
973
974static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
975 gfn_t gfn, struct page *page)
976{
977 int level = PT32E_ROOT_LEVEL;
978 hpa_t table_addr = vcpu->arch.mmu.root_hpa;
979 int pt_write = 0;
980
981 for (; ; level--) {
982 u32 index = PT64_INDEX(v, level);
983 u64 *table;
984
985 ASSERT(VALID_PAGE(table_addr));
986 table = __va(table_addr);
987
988 if (level == 1) {
989 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
990 0, write, 1, &pt_write, gfn, page);
991 return pt_write || is_io_pte(table[index]);
992 }
993
994 if (table[index] == shadow_trap_nonpresent_pte) {
995 struct kvm_mmu_page *new_table;
996 gfn_t pseudo_gfn;
997
998 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
999 >> PAGE_SHIFT;
1000 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
1001 v, level - 1,
1002 1, ACC_ALL, &table[index],
1003 NULL);
1004 if (!new_table) {
1005 pgprintk("nonpaging_map: ENOMEM\n");
1006 kvm_release_page_clean(page);
1007 return -ENOMEM;
1008 }
1009
1010 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
1011 | PT_WRITABLE_MASK | PT_USER_MASK;
1012 }
1013 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1014 }
1015}
1016
1017static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1018{
1019 int r;
1020
1021 struct page *page;
1022
1023 down_read(&current->mm->mmap_sem);
1024 page = gfn_to_page(vcpu->kvm, gfn);
1025
1026 spin_lock(&vcpu->kvm->mmu_lock);
1027 kvm_mmu_free_some_pages(vcpu);
1028 r = __nonpaging_map(vcpu, v, write, gfn, page);
1029 spin_unlock(&vcpu->kvm->mmu_lock);
1030
1031 up_read(&current->mm->mmap_sem);
1032
1033 return r;
1034}
1035
1036
1037static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1038 struct kvm_mmu_page *sp)
1039{
1040 int i;
1041
1042 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1043 sp->spt[i] = shadow_trap_nonpresent_pte;
1044}
1045
1046static void mmu_free_roots(struct kvm_vcpu *vcpu)
1047{
1048 int i;
1049 struct kvm_mmu_page *sp;
1050
1051 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1052 return;
1053 spin_lock(&vcpu->kvm->mmu_lock);
1054#ifdef CONFIG_X86_64
1055 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1056 hpa_t root = vcpu->arch.mmu.root_hpa;
1057
1058 sp = page_header(root);
1059 --sp->root_count;
1060 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1061 spin_unlock(&vcpu->kvm->mmu_lock);
1062 return;
1063 }
1064#endif
1065 for (i = 0; i < 4; ++i) {
1066 hpa_t root = vcpu->arch.mmu.pae_root[i];
1067
1068 if (root) {
1069 root &= PT64_BASE_ADDR_MASK;
1070 sp = page_header(root);
1071 --sp->root_count;
1072 }
1073 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1074 }
1075 spin_unlock(&vcpu->kvm->mmu_lock);
1076 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1077}
1078
1079static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1080{
1081 int i;
1082 gfn_t root_gfn;
1083 struct kvm_mmu_page *sp;
1084
1085 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1086
1087#ifdef CONFIG_X86_64
1088 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1089 hpa_t root = vcpu->arch.mmu.root_hpa;
1090
1091 ASSERT(!VALID_PAGE(root));
1092 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1093 PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
1094 root = __pa(sp->spt);
1095 ++sp->root_count;
1096 vcpu->arch.mmu.root_hpa = root;
1097 return;
1098 }
1099#endif
1100 for (i = 0; i < 4; ++i) {
1101 hpa_t root = vcpu->arch.mmu.pae_root[i];
1102
1103 ASSERT(!VALID_PAGE(root));
1104 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1105 if (!is_present_pte(vcpu->arch.pdptrs[i])) {
1106 vcpu->arch.mmu.pae_root[i] = 0;
1107 continue;
1108 }
1109 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1110 } else if (vcpu->arch.mmu.root_level == 0)
1111 root_gfn = 0;
1112 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1113 PT32_ROOT_LEVEL, !is_paging(vcpu),
1114 ACC_ALL, NULL, NULL);
1115 root = __pa(sp->spt);
1116 ++sp->root_count;
1117 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1118 }
1119 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1120}
1121
1122static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1123{
1124 return vaddr;
1125}
1126
1127static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1128 u32 error_code)
1129{
1130 gfn_t gfn;
1131 int r;
1132
1133 pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
1134 r = mmu_topup_memory_caches(vcpu);
1135 if (r)
1136 return r;
1137
1138 ASSERT(vcpu);
1139 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1140
1141 gfn = gva >> PAGE_SHIFT;
1142
1143 return nonpaging_map(vcpu, gva & PAGE_MASK,
1144 error_code & PFERR_WRITE_MASK, gfn);
1145}
1146
1147static void nonpaging_free(struct kvm_vcpu *vcpu)
1148{
1149 mmu_free_roots(vcpu);
1150}
1151
1152static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1153{
1154 struct kvm_mmu *context = &vcpu->arch.mmu;
1155
1156 context->new_cr3 = nonpaging_new_cr3;
1157 context->page_fault = nonpaging_page_fault;
1158 context->gva_to_gpa = nonpaging_gva_to_gpa;
1159 context->free = nonpaging_free;
1160 context->prefetch_page = nonpaging_prefetch_page;
1161 context->root_level = 0;
1162 context->shadow_root_level = PT32E_ROOT_LEVEL;
1163 context->root_hpa = INVALID_PAGE;
1164 return 0;
1165}
1166
1167void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1168{
1169 ++vcpu->stat.tlb_flush;
1170 kvm_x86_ops->tlb_flush(vcpu);
1171}
1172
1173static void paging_new_cr3(struct kvm_vcpu *vcpu)
1174{
1175 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
1176 mmu_free_roots(vcpu);
1177}
1178
1179static void inject_page_fault(struct kvm_vcpu *vcpu,
1180 u64 addr,
1181 u32 err_code)
1182{
1183 kvm_inject_page_fault(vcpu, addr, err_code);
1184}
1185
1186static void paging_free(struct kvm_vcpu *vcpu)
1187{
1188 nonpaging_free(vcpu);
1189}
1190
1191#define PTTYPE 64
1192#include "paging_tmpl.h"
1193#undef PTTYPE
1194
1195#define PTTYPE 32
1196#include "paging_tmpl.h"
1197#undef PTTYPE
1198
1199static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1200{
1201 struct kvm_mmu *context = &vcpu->arch.mmu;
1202
1203 ASSERT(is_pae(vcpu));
1204 context->new_cr3 = paging_new_cr3;
1205 context->page_fault = paging64_page_fault;
1206 context->gva_to_gpa = paging64_gva_to_gpa;
1207 context->prefetch_page = paging64_prefetch_page;
1208 context->free = paging_free;
1209 context->root_level = level;
1210 context->shadow_root_level = level;
1211 context->root_hpa = INVALID_PAGE;
1212 return 0;
1213}
1214
1215static int paging64_init_context(struct kvm_vcpu *vcpu)
1216{
1217 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1218}
1219
1220static int paging32_init_context(struct kvm_vcpu *vcpu)
1221{
1222 struct kvm_mmu *context = &vcpu->arch.mmu;
1223
1224 context->new_cr3 = paging_new_cr3;
1225 context->page_fault = paging32_page_fault;
1226 context->gva_to_gpa = paging32_gva_to_gpa;
1227 context->free = paging_free;
1228 context->prefetch_page = paging32_prefetch_page;
1229 context->root_level = PT32_ROOT_LEVEL;
1230 context->shadow_root_level = PT32E_ROOT_LEVEL;
1231 context->root_hpa = INVALID_PAGE;
1232 return 0;
1233}
1234
1235static int paging32E_init_context(struct kvm_vcpu *vcpu)
1236{
1237 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1238}
1239
1240static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1241{
1242 ASSERT(vcpu);
1243 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1244
1245 if (!is_paging(vcpu))
1246 return nonpaging_init_context(vcpu);
1247 else if (is_long_mode(vcpu))
1248 return paging64_init_context(vcpu);
1249 else if (is_pae(vcpu))
1250 return paging32E_init_context(vcpu);
1251 else
1252 return paging32_init_context(vcpu);
1253}
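init_kvm_mmu() picks one of the four contexts purely from the guest's paging-mode bits. The same decision, restated as a standalone helper for readability (sketch only; it takes raw flags instead of a vcpu):

enum mmu_ctx { EX_NONPAGING, EX_PAGING32, EX_PAGING32E, EX_PAGING64 };

/* Mirrors init_kvm_mmu(): CR0.PG off -> nonpaging, EFER.LME -> 4-level,
 * CR4.PAE -> 3-level, otherwise classic 2-level 32-bit paging. */
static enum mmu_ctx pick_context(int cr0_pg, int cr4_pae, int efer_lme)
{
	if (!cr0_pg)
		return EX_NONPAGING;
	if (efer_lme)
		return EX_PAGING64;
	if (cr4_pae)
		return EX_PAGING32E;
	return EX_PAGING32;
}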
1254
1255static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1256{
1257 ASSERT(vcpu);
1258 if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
1259 vcpu->arch.mmu.free(vcpu);
1260 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1261 }
1262}
1263
1264int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1265{
1266 destroy_kvm_mmu(vcpu);
1267 return init_kvm_mmu(vcpu);
1268}
1269EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1270
1271int kvm_mmu_load(struct kvm_vcpu *vcpu)
1272{
1273 int r;
1274
1275 r = mmu_topup_memory_caches(vcpu);
1276 if (r)
1277 goto out;
1278 spin_lock(&vcpu->kvm->mmu_lock);
1279 kvm_mmu_free_some_pages(vcpu);
1280 mmu_alloc_roots(vcpu);
1281 spin_unlock(&vcpu->kvm->mmu_lock);
1282 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1283 kvm_mmu_flush_tlb(vcpu);
1284out:
1285 return r;
1286}
1287EXPORT_SYMBOL_GPL(kvm_mmu_load);
1288
1289void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1290{
1291 mmu_free_roots(vcpu);
1292}
1293
1294static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1295 struct kvm_mmu_page *sp,
1296 u64 *spte)
1297{
1298 u64 pte;
1299 struct kvm_mmu_page *child;
1300
1301 pte = *spte;
1302 if (is_shadow_present_pte(pte)) {
1303 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1304 rmap_remove(vcpu->kvm, spte);
1305 else {
1306 child = page_header(pte & PT64_BASE_ADDR_MASK);
1307 mmu_page_remove_parent_pte(child, spte);
1308 }
1309 }
1310 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1311}
1312
1313static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1314 struct kvm_mmu_page *sp,
1315 u64 *spte,
1316 const void *new, int bytes,
1317 int offset_in_pte)
1318{
1319 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
1320 ++vcpu->kvm->stat.mmu_pde_zapped;
1321 return;
1322 }
1323
1324 ++vcpu->kvm->stat.mmu_pte_updated;
1325 if (sp->role.glevels == PT32_ROOT_LEVEL)
1326 paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1327 else
1328 paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1329}
1330
1331static bool need_remote_flush(u64 old, u64 new)
1332{
1333 if (!is_shadow_present_pte(old))
1334 return false;
1335 if (!is_shadow_present_pte(new))
1336 return true;
1337 if ((old ^ new) & PT64_BASE_ADDR_MASK)
1338 return true;
1339 old ^= PT64_NX_MASK;
1340 new ^= PT64_NX_MASK;
1341 return (old & ~new & PT64_PERM_MASK) != 0;
1342}
1343
1344static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
1345{
1346 if (need_remote_flush(old, new))
1347 kvm_flush_remote_tlbs(vcpu->kvm);
1348 else
1349 kvm_mmu_flush_tlb(vcpu);
1350}
1351
1352static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
1353{
1354 u64 *spte = vcpu->arch.last_pte_updated;
1355
1356 return !!(spte && (*spte & PT_ACCESSED_MASK));
1357}
1358
1359static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1360 const u8 *new, int bytes)
1361{
1362 gfn_t gfn;
1363 int r;
1364 u64 gpte = 0;
1365
1366 if (bytes != 4 && bytes != 8)
1367 return;
1368
1369 /*
 1370 * Assume that the pte write is on a page table of the same type
1371 * as the current vcpu paging mode. This is nearly always true
1372 * (might be false while changing modes). Note it is verified later
1373 * by update_pte().
1374 */
1375 if (is_pae(vcpu)) {
1376 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
1377 if ((bytes == 4) && (gpa % 4 == 0)) {
1378 r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
1379 if (r)
1380 return;
1381 memcpy((void *)&gpte + (gpa % 8), new, 4);
1382 } else if ((bytes == 8) && (gpa % 8 == 0)) {
1383 memcpy((void *)&gpte, new, 8);
1384 }
1385 } else {
1386 if ((bytes == 4) && (gpa % 4 == 0))
1387 memcpy((void *)&gpte, new, 4);
1388 }
1389 if (!is_present_pte(gpte))
1390 return;
1391 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1392 vcpu->arch.update_pte.gfn = gfn;
1393 vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
1394}
1395
1396void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1397 const u8 *new, int bytes)
1398{
1399 gfn_t gfn = gpa >> PAGE_SHIFT;
1400 struct kvm_mmu_page *sp;
1401 struct hlist_node *node, *n;
1402 struct hlist_head *bucket;
1403 unsigned index;
1404 u64 entry;
1405 u64 *spte;
1406 unsigned offset = offset_in_page(gpa);
1407 unsigned pte_size;
1408 unsigned page_offset;
1409 unsigned misaligned;
1410 unsigned quadrant;
1411 int level;
1412 int flooded = 0;
1413 int npte;
1414
1415 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1416 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1417 spin_lock(&vcpu->kvm->mmu_lock);
1418 kvm_mmu_free_some_pages(vcpu);
1419 ++vcpu->kvm->stat.mmu_pte_write;
1420 kvm_mmu_audit(vcpu, "pre pte write");
1421 if (gfn == vcpu->arch.last_pt_write_gfn
1422 && !last_updated_pte_accessed(vcpu)) {
1423 ++vcpu->arch.last_pt_write_count;
1424 if (vcpu->arch.last_pt_write_count >= 3)
1425 flooded = 1;
1426 } else {
1427 vcpu->arch.last_pt_write_gfn = gfn;
1428 vcpu->arch.last_pt_write_count = 1;
1429 vcpu->arch.last_pte_updated = NULL;
1430 }
1431 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1432 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1433 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1434 if (sp->gfn != gfn || sp->role.metaphysical)
1435 continue;
1436 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1437 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1438 misaligned |= bytes < 4;
1439 if (misaligned || flooded) {
1440 /*
1441 * Misaligned accesses are too much trouble to fix
1442 * up; also, they usually indicate a page is not used
1443 * as a page table.
1444 *
1445 * If we're seeing too many writes to a page,
1446 * it may no longer be a page table, or we may be
1447 * forking, in which case it is better to unmap the
1448 * page.
1449 */
1450 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1451 gpa, bytes, sp->role.word);
1452 kvm_mmu_zap_page(vcpu->kvm, sp);
1453 ++vcpu->kvm->stat.mmu_flooded;
1454 continue;
1455 }
1456 page_offset = offset;
1457 level = sp->role.level;
1458 npte = 1;
1459 if (sp->role.glevels == PT32_ROOT_LEVEL) {
1460 page_offset <<= 1; /* 32->64 */
1461 /*
1462 * A 32-bit pde maps 4MB while the shadow pdes map
1463 * only 2MB. So we need to double the offset again
1464 * and zap two pdes instead of one.
1465 */
1466 if (level == PT32_ROOT_LEVEL) {
1467 page_offset &= ~7; /* kill rounding error */
1468 page_offset <<= 1;
1469 npte = 2;
1470 }
1471 quadrant = page_offset >> PAGE_SHIFT;
1472 page_offset &= ~PAGE_MASK;
1473 if (quadrant != sp->role.quadrant)
1474 continue;
1475 }
1476 spte = &sp->spt[page_offset / sizeof(*spte)];
1477 while (npte--) {
1478 entry = *spte;
1479 mmu_pte_write_zap_pte(vcpu, sp, spte);
1480 mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
1481 page_offset & (pte_size - 1));
1482 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
1483 ++spte;
1484 }
1485 }
1486 kvm_mmu_audit(vcpu, "post pte write");
1487 spin_unlock(&vcpu->kvm->mmu_lock);
1488 if (vcpu->arch.update_pte.page) {
1489 kvm_release_page_clean(vcpu->arch.update_pte.page);
1490 vcpu->arch.update_pte.page = NULL;
1491 }
1492}
1493
1494int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1495{
1496 gpa_t gpa;
1497 int r;
1498
1499 down_read(&current->mm->mmap_sem);
1500 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1501 up_read(&current->mm->mmap_sem);
1502
1503 spin_lock(&vcpu->kvm->mmu_lock);
1504 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1505 spin_unlock(&vcpu->kvm->mmu_lock);
1506 return r;
1507}
1508
1509void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1510{
1511 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
1512 struct kvm_mmu_page *sp;
1513
1514 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
1515 struct kvm_mmu_page, link);
1516 kvm_mmu_zap_page(vcpu->kvm, sp);
1517 ++vcpu->kvm->stat.mmu_recycled;
1518 }
1519}
1520
1521int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
1522{
1523 int r;
1524 enum emulation_result er;
1525
1526 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
1527 if (r < 0)
1528 goto out;
1529
1530 if (!r) {
1531 r = 1;
1532 goto out;
1533 }
1534
1535 r = mmu_topup_memory_caches(vcpu);
1536 if (r)
1537 goto out;
1538
1539 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
1540
1541 switch (er) {
1542 case EMULATE_DONE:
1543 return 1;
1544 case EMULATE_DO_MMIO:
1545 ++vcpu->stat.mmio_exits;
1546 return 0;
1547 case EMULATE_FAIL:
1548 kvm_report_emulation_failure(vcpu, "pagetable");
1549 return 1;
1550 default:
1551 BUG();
1552 }
1553out:
1554 return r;
1555}
1556EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1557
1558static void free_mmu_pages(struct kvm_vcpu *vcpu)
1559{
1560 struct kvm_mmu_page *sp;
1561
1562 while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
1563 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
1564 struct kvm_mmu_page, link);
1565 kvm_mmu_zap_page(vcpu->kvm, sp);
1566 }
1567 free_page((unsigned long)vcpu->arch.mmu.pae_root);
1568}
1569
1570static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1571{
1572 struct page *page;
1573 int i;
1574
1575 ASSERT(vcpu);
1576
1577 if (vcpu->kvm->arch.n_requested_mmu_pages)
1578 vcpu->kvm->arch.n_free_mmu_pages =
1579 vcpu->kvm->arch.n_requested_mmu_pages;
1580 else
1581 vcpu->kvm->arch.n_free_mmu_pages =
1582 vcpu->kvm->arch.n_alloc_mmu_pages;
1583 /*
1584 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1585 * Therefore we need to allocate shadow page tables in the first
1586 * 4GB of memory, which happens to fit the DMA32 zone.
1587 */
1588 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1589 if (!page)
1590 goto error_1;
1591 vcpu->arch.mmu.pae_root = page_address(page);
1592 for (i = 0; i < 4; ++i)
1593 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1594
1595 return 0;
1596
1597error_1:
1598 free_mmu_pages(vcpu);
1599 return -ENOMEM;
1600}
1601
1602int kvm_mmu_create(struct kvm_vcpu *vcpu)
1603{
1604 ASSERT(vcpu);
1605 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1606
1607 return alloc_mmu_pages(vcpu);
1608}
1609
1610int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1611{
1612 ASSERT(vcpu);
1613 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1614
1615 return init_kvm_mmu(vcpu);
1616}
1617
1618void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1619{
1620 ASSERT(vcpu);
1621
1622 destroy_kvm_mmu(vcpu);
1623 free_mmu_pages(vcpu);
1624 mmu_free_memory_caches(vcpu);
1625}
1626
1627void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1628{
1629 struct kvm_mmu_page *sp;
1630
1631 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
1632 int i;
1633 u64 *pt;
1634
1635 if (!test_bit(slot, &sp->slot_bitmap))
1636 continue;
1637
1638 pt = sp->spt;
1639 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1640 /* avoid RMW */
1641 if (pt[i] & PT_WRITABLE_MASK)
1642 pt[i] &= ~PT_WRITABLE_MASK;
1643 }
1644}
1645
1646void kvm_mmu_zap_all(struct kvm *kvm)
1647{
1648 struct kvm_mmu_page *sp, *node;
1649
1650 spin_lock(&kvm->mmu_lock);
1651 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
1652 kvm_mmu_zap_page(kvm, sp);
1653 spin_unlock(&kvm->mmu_lock);
1654
1655 kvm_flush_remote_tlbs(kvm);
1656}
1657
1658void kvm_mmu_module_exit(void)
1659{
1660 if (pte_chain_cache)
1661 kmem_cache_destroy(pte_chain_cache);
1662 if (rmap_desc_cache)
1663 kmem_cache_destroy(rmap_desc_cache);
1664 if (mmu_page_header_cache)
1665 kmem_cache_destroy(mmu_page_header_cache);
1666}
1667
1668int kvm_mmu_module_init(void)
1669{
1670 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1671 sizeof(struct kvm_pte_chain),
1672 0, 0, NULL);
1673 if (!pte_chain_cache)
1674 goto nomem;
1675 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1676 sizeof(struct kvm_rmap_desc),
1677 0, 0, NULL);
1678 if (!rmap_desc_cache)
1679 goto nomem;
1680
1681 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1682 sizeof(struct kvm_mmu_page),
1683 0, 0, NULL);
1684 if (!mmu_page_header_cache)
1685 goto nomem;
1686
1687 return 0;
1688
1689nomem:
1690 kvm_mmu_module_exit();
1691 return -ENOMEM;
1692}
1693
1694/*
 1695 * Calculate the number of mmu pages needed for kvm.
1696 */
1697unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
1698{
1699 int i;
1700 unsigned int nr_mmu_pages;
1701 unsigned int nr_pages = 0;
1702
1703 for (i = 0; i < kvm->nmemslots; i++)
1704 nr_pages += kvm->memslots[i].npages;
1705
1706 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
1707 nr_mmu_pages = max(nr_mmu_pages,
1708 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
1709
1710 return nr_mmu_pages;
1711}
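A worked example of the sizing rule, assuming KVM_PERMILLE_MMU_PAGES is 20 and KVM_MIN_ALLOC_MMU_PAGES is 64 (both constants are defined outside this diff, so treat the values as illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int nr_pages = 262144;		/* 1 GiB guest, 4 KiB pages */
	unsigned int permille = 20, floor = 64;	/* assumed constants */
	unsigned int n = nr_pages * permille / 1000;

	if (n < floor)
		n = floor;
	printf("%u shadow pages budgeted\n", n);	/* 5242 */
	return 0;
}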
1712
1713#ifdef AUDIT
1714
1715static const char *audit_msg;
1716
1717static gva_t canonicalize(gva_t gva)
1718{
1719#ifdef CONFIG_X86_64
1720 gva = (long long)(gva << 16) >> 16;
1721#endif
1722 return gva;
1723}
1724
1725static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1726 gva_t va, int level)
1727{
1728 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1729 int i;
1730 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1731
1732 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1733 u64 ent = pt[i];
1734
1735 if (ent == shadow_trap_nonpresent_pte)
1736 continue;
1737
1738 va = canonicalize(va);
1739 if (level > 1) {
1740 if (ent == shadow_notrap_nonpresent_pte)
1741 printk(KERN_ERR "audit: (%s) nontrapping pte"
1742 " in nonleaf level: levels %d gva %lx"
1743 " level %d pte %llx\n", audit_msg,
1744 vcpu->arch.mmu.root_level, va, level, ent);
1745
1746 audit_mappings_page(vcpu, ent, va, level - 1);
1747 } else {
1748 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
1749 struct page *page = gpa_to_page(vcpu, gpa);
1750 hpa_t hpa = page_to_phys(page);
1751
1752 if (is_shadow_present_pte(ent)
1753 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1754 printk(KERN_ERR "xx audit error: (%s) levels %d"
1755 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
1756 audit_msg, vcpu->arch.mmu.root_level,
1757 va, gpa, hpa, ent,
1758 is_shadow_present_pte(ent));
1759 else if (ent == shadow_notrap_nonpresent_pte
1760 && !is_error_hpa(hpa))
1761 printk(KERN_ERR "audit: (%s) notrap shadow,"
1762 " valid guest gva %lx\n", audit_msg, va);
1763 kvm_release_page_clean(page);
1764
1765 }
1766 }
1767}
1768
1769static void audit_mappings(struct kvm_vcpu *vcpu)
1770{
1771 unsigned i;
1772
1773 if (vcpu->arch.mmu.root_level == 4)
1774 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
1775 else
1776 for (i = 0; i < 4; ++i)
1777 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
1778 audit_mappings_page(vcpu,
1779 vcpu->arch.mmu.pae_root[i],
1780 i << 30,
1781 2);
1782}
1783
1784static int count_rmaps(struct kvm_vcpu *vcpu)
1785{
1786 int nmaps = 0;
1787 int i, j, k;
1788
1789 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1790 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1791 struct kvm_rmap_desc *d;
1792
1793 for (j = 0; j < m->npages; ++j) {
1794 unsigned long *rmapp = &m->rmap[j];
1795
1796 if (!*rmapp)
1797 continue;
1798 if (!(*rmapp & 1)) {
1799 ++nmaps;
1800 continue;
1801 }
1802 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
1803 while (d) {
1804 for (k = 0; k < RMAP_EXT; ++k)
1805 if (d->shadow_ptes[k])
1806 ++nmaps;
1807 else
1808 break;
1809 d = d->more;
1810 }
1811 }
1812 }
1813 return nmaps;
1814}
1815
1816static int count_writable_mappings(struct kvm_vcpu *vcpu)
1817{
1818 int nmaps = 0;
1819 struct kvm_mmu_page *sp;
1820 int i;
1821
1822 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1823 u64 *pt = sp->spt;
1824
1825 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
1826 continue;
1827
1828 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1829 u64 ent = pt[i];
1830
1831 if (!(ent & PT_PRESENT_MASK))
1832 continue;
1833 if (!(ent & PT_WRITABLE_MASK))
1834 continue;
1835 ++nmaps;
1836 }
1837 }
1838 return nmaps;
1839}
1840
1841static void audit_rmap(struct kvm_vcpu *vcpu)
1842{
1843 int n_rmap = count_rmaps(vcpu);
1844 int n_actual = count_writable_mappings(vcpu);
1845
1846 if (n_rmap != n_actual)
1847 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1848 __FUNCTION__, audit_msg, n_rmap, n_actual);
1849}
1850
1851static void audit_write_protection(struct kvm_vcpu *vcpu)
1852{
1853 struct kvm_mmu_page *sp;
1854 struct kvm_memory_slot *slot;
1855 unsigned long *rmapp;
1856 gfn_t gfn;
1857
1858 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1859 if (sp->role.metaphysical)
1860 continue;
1861
1862 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
1863 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
1864 rmapp = &slot->rmap[gfn - slot->base_gfn];
1865 if (*rmapp)
1866 printk(KERN_ERR "%s: (%s) shadow page has writable"
1867 " mappings: gfn %lx role %x\n",
1868 __FUNCTION__, audit_msg, sp->gfn,
1869 sp->role.word);
1870 }
1871}
1872
1873static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1874{
1875 int olddbg = dbg;
1876
1877 dbg = 0;
1878 audit_msg = msg;
1879 audit_rmap(vcpu);
1880 audit_write_protection(vcpu);
1881 audit_mappings(vcpu);
1882 dbg = olddbg;
1883}
1884
1885#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644
index 000000000000..1fce19ec7a23
--- /dev/null
+++ b/arch/x86/kvm/mmu.h
@@ -0,0 +1,44 @@
1#ifndef __KVM_X86_MMU_H
2#define __KVM_X86_MMU_H
3
4#include <linux/kvm_host.h>
5
6static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
7{
8 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
9 __kvm_mmu_free_some_pages(vcpu);
10}
11
12static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
13{
14 if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
15 return 0;
16
17 return kvm_mmu_load(vcpu);
18}
19
20static inline int is_long_mode(struct kvm_vcpu *vcpu)
21{
22#ifdef CONFIG_X86_64
23 return vcpu->arch.shadow_efer & EFER_LME;
24#else
25 return 0;
26#endif
27}
28
29static inline int is_pae(struct kvm_vcpu *vcpu)
30{
31 return vcpu->arch.cr4 & X86_CR4_PAE;
32}
33
34static inline int is_pse(struct kvm_vcpu *vcpu)
35{
36 return vcpu->arch.cr4 & X86_CR4_PSE;
37}
38
39static inline int is_paging(struct kvm_vcpu *vcpu)
40{
41 return vcpu->arch.cr0 & X86_CR0_PG;
42}
43
44#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
new file mode 100644
index 000000000000..03ba8608fe0f
--- /dev/null
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -0,0 +1,484 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20/*
21 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
22 * so the code in this file is compiled twice, once per pte size.
23 */
24
25#if PTTYPE == 64
26 #define pt_element_t u64
27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
35 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4
37 #define CMPXCHG cmpxchg
38 #else
39 #define CMPXCHG cmpxchg64
40 #define PT_MAX_FULL_LEVELS 2
41 #endif
42#elif PTTYPE == 32
43 #define pt_element_t u32
44 #define guest_walker guest_walker32
45 #define FNAME(name) paging##32_##name
46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
47 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
48 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
49 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
52 #define PT_MAX_FULL_LEVELS 2
53 #define CMPXCHG cmpxchg
54#else
55 #error Invalid PTTYPE value
56#endif
57
58#define gpte_to_gfn FNAME(gpte_to_gfn)
59#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
60
61/*
62 * The guest_walker structure emulates the behavior of the hardware page
63 * table walker.
64 */
65struct guest_walker {
66 int level;
67 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
68 pt_element_t ptes[PT_MAX_FULL_LEVELS];
69 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
70 unsigned pt_access;
71 unsigned pte_access;
72 gfn_t gfn;
73 u32 error_code;
74};
75
76static gfn_t gpte_to_gfn(pt_element_t gpte)
77{
78 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
79}
80
81static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
82{
83 return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
84}
85
86static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
87 gfn_t table_gfn, unsigned index,
88 pt_element_t orig_pte, pt_element_t new_pte)
89{
90 pt_element_t ret;
91 pt_element_t *table;
92 struct page *page;
93
94 page = gfn_to_page(kvm, table_gfn);
95 table = kmap_atomic(page, KM_USER0);
96
97 ret = CMPXCHG(&table[index], orig_pte, new_pte);
98
99 kunmap_atomic(table, KM_USER0);
100
101 kvm_release_page_dirty(page);
102
103 return (ret != orig_pte);
104}
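Note the return convention: cmpxchg_gpte() returns true when the guest pte changed under us, which is why walk_addr() below restarts the whole walk on a true return. A user-space analogue of the same set-bit-only-if-unchanged step, using the GCC compare-exchange builtin as a stand-in (opposite return sense: true here means success):

#include <stdbool.h>
#include <stdint.h>

static bool set_bit_if_unchanged(uint64_t *pte, uint64_t expected, uint64_t bit)
{
	uint64_t desired = expected | bit;

	/* true on success; on failure someone else modified the pte first */
	return __atomic_compare_exchange_n(pte, &expected, desired, false,
					   __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}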
105
106static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
107{
108 unsigned access;
109
110 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
111#if PTTYPE == 64
112 if (is_nx(vcpu))
113 access &= ~(gpte >> PT64_NX_SHIFT);
114#endif
115 return access;
116}
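gpte_access() folds the guest pte's writable/user bits and (for 64-bit ptes) the NX bit into the ACC_* mask, where exec is bit 0, write bit 1 and user bit 2. A standalone restatement of the 64-bit case, with the usual x86 bit positions restated locally (illustration only, not taken from this file):

#include <stdint.h>

#define EX_PT_WRITABLE_MASK (1u << 1)
#define EX_PT_USER_MASK     (1u << 2)
#define EX_PT64_NX_SHIFT    63
#define EX_ACC_EXEC_MASK    1u

static unsigned gpte_access64(uint64_t gpte)
{
	unsigned access;

	access = (gpte & (EX_PT_WRITABLE_MASK | EX_PT_USER_MASK)) | EX_ACC_EXEC_MASK;
	/* gpte >> 63 is the NX bit: if set, this clears the exec bit */
	access &= ~(unsigned)(gpte >> EX_PT64_NX_SHIFT);
	return access;
}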
117
118/*
119 * Fetch a guest pte for a guest virtual address
120 */
121static int FNAME(walk_addr)(struct guest_walker *walker,
122 struct kvm_vcpu *vcpu, gva_t addr,
123 int write_fault, int user_fault, int fetch_fault)
124{
125 pt_element_t pte;
126 gfn_t table_gfn;
127 unsigned index, pt_access, pte_access;
128 gpa_t pte_gpa;
129
130 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
131walk:
132 walker->level = vcpu->arch.mmu.root_level;
133 pte = vcpu->arch.cr3;
134#if PTTYPE == 64
135 if (!is_long_mode(vcpu)) {
136 pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
137 if (!is_present_pte(pte))
138 goto not_present;
139 --walker->level;
140 }
141#endif
142 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
143 (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
144
145 pt_access = ACC_ALL;
146
147 for (;;) {
148 index = PT_INDEX(addr, walker->level);
149
150 table_gfn = gpte_to_gfn(pte);
151 pte_gpa = gfn_to_gpa(table_gfn);
152 pte_gpa += index * sizeof(pt_element_t);
153 walker->table_gfn[walker->level - 1] = table_gfn;
154 walker->pte_gpa[walker->level - 1] = pte_gpa;
155 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
156 walker->level - 1, table_gfn);
157
158 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
159
160 if (!is_present_pte(pte))
161 goto not_present;
162
163 if (write_fault && !is_writeble_pte(pte))
164 if (user_fault || is_write_protection(vcpu))
165 goto access_error;
166
167 if (user_fault && !(pte & PT_USER_MASK))
168 goto access_error;
169
170#if PTTYPE == 64
171 if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
172 goto access_error;
173#endif
174
175 if (!(pte & PT_ACCESSED_MASK)) {
176 mark_page_dirty(vcpu->kvm, table_gfn);
177 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
178 index, pte, pte|PT_ACCESSED_MASK))
179 goto walk;
180 pte |= PT_ACCESSED_MASK;
181 }
182
183 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
184
185 walker->ptes[walker->level - 1] = pte;
186
187 if (walker->level == PT_PAGE_TABLE_LEVEL) {
188 walker->gfn = gpte_to_gfn(pte);
189 break;
190 }
191
192 if (walker->level == PT_DIRECTORY_LEVEL
193 && (pte & PT_PAGE_SIZE_MASK)
194 && (PTTYPE == 64 || is_pse(vcpu))) {
195 walker->gfn = gpte_to_gfn_pde(pte);
196 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
197 if (PTTYPE == 32 && is_cpuid_PSE36())
198 walker->gfn += pse36_gfn_delta(pte);
199 break;
200 }
201
202 pt_access = pte_access;
203 --walker->level;
204 }
205
206 if (write_fault && !is_dirty_pte(pte)) {
207 bool ret;
208
209 mark_page_dirty(vcpu->kvm, table_gfn);
210 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
211 pte|PT_DIRTY_MASK);
212 if (ret)
213 goto walk;
214 pte |= PT_DIRTY_MASK;
215 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
216 walker->ptes[walker->level - 1] = pte;
217 }
218
219 walker->pt_access = pt_access;
220 walker->pte_access = pte_access;
221 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
222 __FUNCTION__, (u64)pte, pt_access, pte_access);
223 return 1;
224
225not_present:
226 walker->error_code = 0;
227 goto err;
228
229access_error:
230 walker->error_code = PFERR_PRESENT_MASK;
231
232err:
233 if (write_fault)
234 walker->error_code |= PFERR_WRITE_MASK;
235 if (user_fault)
236 walker->error_code |= PFERR_USER_MASK;
237 if (fetch_fault)
238 walker->error_code |= PFERR_FETCH_MASK;
239 return 0;
240}
241
242static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
243 u64 *spte, const void *pte, int bytes,
244 int offset_in_pte)
245{
246 pt_element_t gpte;
247 unsigned pte_access;
248 struct page *npage;
249
250 gpte = *(const pt_element_t *)pte;
251 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
252 if (!offset_in_pte && !is_present_pte(gpte))
253 set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
254 return;
255 }
256 if (bytes < sizeof(pt_element_t))
257 return;
258 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
259 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
260 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
261 return;
262 npage = vcpu->arch.update_pte.page;
263 if (!npage)
264 return;
265 get_page(npage);
266 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
267 gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
268}
269
270/*
271 * Fetch a shadow pte for a specific level in the paging hierarchy.
272 */
273static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
274 struct guest_walker *walker,
275 int user_fault, int write_fault, int *ptwrite,
276 struct page *page)
277{
278 hpa_t shadow_addr;
279 int level;
280 u64 *shadow_ent;
281 unsigned access = walker->pt_access;
282
283 if (!is_present_pte(walker->ptes[walker->level - 1]))
284 return NULL;
285
286 shadow_addr = vcpu->arch.mmu.root_hpa;
287 level = vcpu->arch.mmu.shadow_root_level;
288 if (level == PT32E_ROOT_LEVEL) {
289 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
290 shadow_addr &= PT64_BASE_ADDR_MASK;
291 --level;
292 }
293
294 for (; ; level--) {
295 u32 index = SHADOW_PT_INDEX(addr, level);
296 struct kvm_mmu_page *shadow_page;
297 u64 shadow_pte;
298 int metaphysical;
299 gfn_t table_gfn;
300 bool new_page = 0;
301
302 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
303 if (level == PT_PAGE_TABLE_LEVEL)
304 break;
305 if (is_shadow_present_pte(*shadow_ent)) {
306 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
307 continue;
308 }
309
310 if (level - 1 == PT_PAGE_TABLE_LEVEL
311 && walker->level == PT_DIRECTORY_LEVEL) {
312 metaphysical = 1;
313 if (!is_dirty_pte(walker->ptes[level - 1]))
314 access &= ~ACC_WRITE_MASK;
315 table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
316 } else {
317 metaphysical = 0;
318 table_gfn = walker->table_gfn[level - 2];
319 }
320 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
321 metaphysical, access,
322 shadow_ent, &new_page);
323 if (new_page && !metaphysical) {
324 int r;
325 pt_element_t curr_pte;
326 r = kvm_read_guest_atomic(vcpu->kvm,
327 walker->pte_gpa[level - 2],
328 &curr_pte, sizeof(curr_pte));
329 if (r || curr_pte != walker->ptes[level - 2]) {
330 kvm_release_page_clean(page);
331 return NULL;
332 }
333 }
334 shadow_addr = __pa(shadow_page->spt);
335 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
336 | PT_WRITABLE_MASK | PT_USER_MASK;
337 *shadow_ent = shadow_pte;
338 }
339
340 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
341 user_fault, write_fault,
342 walker->ptes[walker->level-1] & PT_DIRTY_MASK,
343 ptwrite, walker->gfn, page);
344
345 return shadow_ent;
346}
347
348/*
349 * Page fault handler. There are several causes for a page fault:
350 * - there is no shadow pte for the guest pte
351 * - write access through a shadow pte marked read only so that we can set
352 * the dirty bit
353 * - write access to a shadow pte marked read only so we can update the page
354 * dirty bitmap, when userspace requests it
355 * - mmio access; in this case we will never install a present shadow pte
356 * - normal guest page fault due to the guest pte marked not present, not
357 * writable, or not executable
358 *
359 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
360 * a negative value on error.
361 */
362static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
363 u32 error_code)
364{
365 int write_fault = error_code & PFERR_WRITE_MASK;
366 int user_fault = error_code & PFERR_USER_MASK;
367 int fetch_fault = error_code & PFERR_FETCH_MASK;
368 struct guest_walker walker;
369 u64 *shadow_pte;
370 int write_pt = 0;
371 int r;
372 struct page *page;
373
374 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
375 kvm_mmu_audit(vcpu, "pre page fault");
376
377 r = mmu_topup_memory_caches(vcpu);
378 if (r)
379 return r;
380
381 down_read(&current->mm->mmap_sem);
382 /*
383 * Look up the shadow pte for the faulting address.
384 */
385 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
386 fetch_fault);
387
388 /*
389 * The page is not mapped by the guest. Let the guest handle it.
390 */
391 if (!r) {
392 pgprintk("%s: guest page fault\n", __FUNCTION__);
393 inject_page_fault(vcpu, addr, walker.error_code);
394 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
395 up_read(&current->mm->mmap_sem);
396 return 0;
397 }
398
399 page = gfn_to_page(vcpu->kvm, walker.gfn);
400
401 spin_lock(&vcpu->kvm->mmu_lock);
402 kvm_mmu_free_some_pages(vcpu);
403 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
404 &write_pt, page);
405 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
406 shadow_pte, *shadow_pte, write_pt);
407
408 if (!write_pt)
409 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
410
411 /*
412 * mmio: emulate if accessible, otherwise it's a guest fault.
413 */
414 if (shadow_pte && is_io_pte(*shadow_pte)) {
415 spin_unlock(&vcpu->kvm->mmu_lock);
416 up_read(&current->mm->mmap_sem);
417 return 1;
418 }
419
420 ++vcpu->stat.pf_fixed;
421 kvm_mmu_audit(vcpu, "post page fault (fixed)");
422 spin_unlock(&vcpu->kvm->mmu_lock);
423 up_read(&current->mm->mmap_sem);
424
425 return write_pt;
426}
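
FNAME(page_fault) above starts by splitting the hardware error code into its write/user/fetch components before walking the guest tables. A standalone sketch of that decoding, using the architectural x86 page-fault error-code layout (the mask values below are the standard bit positions, written out here for illustration):

#include <stdio.h>

/* x86 page-fault error-code bits (architectural layout). */
#define PFERR_PRESENT_MASK  (1U << 0)   /* fault on a present page */
#define PFERR_WRITE_MASK    (1U << 1)   /* write access */
#define PFERR_USER_MASK     (1U << 2)   /* user-mode access */
#define PFERR_FETCH_MASK    (1U << 4)   /* instruction fetch (with NX) */

int main(void)
{
        unsigned int error_code = PFERR_WRITE_MASK | PFERR_USER_MASK;

        int write_fault = error_code & PFERR_WRITE_MASK;
        int user_fault  = error_code & PFERR_USER_MASK;
        int fetch_fault = error_code & PFERR_FETCH_MASK;

        printf("write=%d user=%d fetch=%d\n",
               !!write_fault, !!user_fault, !!fetch_fault);
        return 0;
}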
427
428static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
429{
430 struct guest_walker walker;
431 gpa_t gpa = UNMAPPED_GVA;
432 int r;
433
434 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
435
436 if (r) {
437 gpa = gfn_to_gpa(walker.gfn);
438 gpa |= vaddr & ~PAGE_MASK;
439 }
440
441 return gpa;
442}
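
FNAME(gva_to_gpa) composes its result from the frame number returned by the walker plus the page offset of the original virtual address. A small worked example of that composition, with gfn_to_gpa modelled as a shift by the 4 KiB page size and purely illustrative values:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((1ULL << PAGE_SHIFT) - 1))

int main(void)
{
        unsigned long long gfn   = 0x1a2b;              /* from the walker */
        unsigned long long vaddr = 0x00007ffd1234ULL;   /* faulting gva */

        /* gfn_to_gpa(gfn) | (vaddr & ~PAGE_MASK) */
        unsigned long long gpa = (gfn << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);

        printf("gpa = 0x%llx\n", gpa);  /* 0x1a2b000 | 0x234 = 0x1a2b234 */
        return 0;
}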
443
444static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
445 struct kvm_mmu_page *sp)
446{
447 int i, offset = 0, r = 0;
448 pt_element_t pt;
449
450 if (sp->role.metaphysical
451 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
452 nonpaging_prefetch_page(vcpu, sp);
453 return;
454 }
455
456 if (PTTYPE == 32)
457 offset = sp->role.quadrant << PT64_LEVEL_BITS;
458
459 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
460 gpa_t pte_gpa = gfn_to_gpa(sp->gfn);
461 pte_gpa += (i+offset) * sizeof(pt_element_t);
462
463 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt,
464 sizeof(pt_element_t));
465 if (r || is_present_pte(pt))
466 sp->spt[i] = shadow_trap_nonpresent_pte;
467 else
468 sp->spt[i] = shadow_notrap_nonpresent_pte;
469 }
470}
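
For 32-bit guests a shadow page covers only half of a 1024-entry guest page table, so the loop above biases the guest pte address by the page's quadrant. A worked example of the pte_gpa arithmetic with made-up numbers (PT64_LEVEL_BITS taken as 9 and 4-byte guest ptes, as the PTTYPE == 32 case implies):

#include <stdio.h>

int main(void)
{
        unsigned long long gfn = 0x100;   /* guest frame of the page table */
        unsigned int quadrant  = 1;       /* second half of the 1024 entries */
        unsigned int i         = 3;       /* index within the shadow page */
        unsigned int pte_size  = 4;       /* PTTYPE == 32 -> 4-byte gptes */

        unsigned int offset = quadrant << 9;              /* 512 entries */
        unsigned long long pte_gpa =
                (gfn << 12) + (unsigned long long)(i + offset) * pte_size;

        printf("pte_gpa = 0x%llx\n", pte_gpa);  /* 0x100000 + 515*4 = 0x10080c */
        return 0;
}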
471
472#undef pt_element_t
473#undef guest_walker
474#undef FNAME
475#undef PT_BASE_ADDR_MASK
476#undef PT_INDEX
477#undef SHADOW_PT_INDEX
478#undef PT_LEVEL_MASK
479#undef PT_DIR_BASE_ADDR_MASK
480#undef PT_LEVEL_BITS
481#undef PT_MAX_FULL_LEVELS
482#undef gpte_to_gfn
483#undef gpte_to_gfn_pde
484#undef CMPXCHG
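
The #undef block closes the template: as the surrounding mmu code suggests, this header is meant to be included more than once with a different PTTYPE each time, so every instantiation-specific macro has to be cleaned up before the next pass. A hypothetical standalone sketch of the same include-twice trick (file and function names are mine, not the kernel's):

/* walker_tmpl.h -- hypothetical template, included once per PTTYPE */
#if PTTYPE == 64
 #define FNAME(name)  paging64_##name
 #define pt_element_t unsigned long long
#elif PTTYPE == 32
 #define FNAME(name)  paging32_##name
 #define pt_element_t unsigned int
#endif

static int FNAME(entry_size)(void)
{
        return (int)sizeof(pt_element_t);
}

#undef pt_element_t
#undef FNAME

/* main.c -- instantiate the template twice, then use both versions */
#include <stdio.h>

#define PTTYPE 64
#include "walker_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "walker_tmpl.h"
#undef PTTYPE

int main(void)
{
        printf("%d %d\n", paging64_entry_size(), paging32_entry_size());
        return 0;
}

After both inclusions the translation unit ends up with paging64_entry_size() and paging32_entry_size(), which is essentially how the 32-bit and 64-bit guest walkers are generated from one template.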
diff --git a/drivers/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
index 71fdf458619a..56fc4c873389 100644
--- a/drivers/kvm/segment_descriptor.h
+++ b/arch/x86/kvm/segment_descriptor.h
@@ -1,3 +1,6 @@
1#ifndef __SEGMENT_DESCRIPTOR_H
2#define __SEGMENT_DESCRIPTOR_H
3
1struct segment_descriptor { 4struct segment_descriptor {
2 u16 limit_low; 5 u16 limit_low;
3 u16 base_low; 6 u16 base_low;
@@ -14,4 +17,13 @@ struct segment_descriptor {
14 u8 base_high; 17 u8 base_high;
15} __attribute__((packed)); 18} __attribute__((packed));
16 19
20#ifdef CONFIG_X86_64
21/* LDT or TSS descriptor in the GDT. 16 bytes. */
22struct segment_descriptor_64 {
23 struct segment_descriptor s;
24 u32 base_higher;
25 u32 pad_zero;
26};
17 27
28#endif
29#endif
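
The new segment_descriptor_64 wraps the 8-byte legacy descriptor and appends base_higher plus a zero pad, giving the 16-byte LDT/TSS descriptor format used in the 64-bit GDT. A quick standalone size check (the legacy descriptor body is elided in the hunk above, so it is stood in for by an opaque 8-byte block here):

#include <stdio.h>
#include <stdint.h>

/* Stand-in for the 8-byte legacy descriptor; the real struct spells out
 * the base/limit/type bit-fields omitted from the hunk above. */
struct segment_descriptor {
        uint8_t bytes[8];
} __attribute__((packed));

/* LDT or TSS descriptor in the GDT: 16 bytes, as in the new header. */
struct segment_descriptor_64 {
        struct segment_descriptor s;
        uint32_t base_higher;
        uint32_t pad_zero;
};

int main(void)
{
        printf("legacy=%zu extended=%zu\n",
               sizeof(struct segment_descriptor),
               sizeof(struct segment_descriptor_64));   /* 8 and 16 */
        return 0;
}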
diff --git a/drivers/kvm/svm.c b/arch/x86/kvm/svm.c
index ced4ac1955db..de755cb1431d 100644
--- a/drivers/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -13,10 +13,11 @@
13 * the COPYING file in the top-level directory. 13 * the COPYING file in the top-level directory.
14 * 14 *
15 */ 15 */
16#include <linux/kvm_host.h>
16 17
17#include "kvm_svm.h" 18#include "kvm_svm.h"
18#include "x86_emulate.h"
19#include "irq.h" 19#include "irq.h"
20#include "mmu.h"
20 21
21#include <linux/module.h> 22#include <linux/module.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
@@ -42,9 +43,6 @@ MODULE_LICENSE("GPL");
42#define SEG_TYPE_LDT 2 43#define SEG_TYPE_LDT 2
43#define SEG_TYPE_BUSY_TSS16 3 44#define SEG_TYPE_BUSY_TSS16 3
44 45
45#define KVM_EFER_LMA (1 << 10)
46#define KVM_EFER_LME (1 << 8)
47
48#define SVM_FEATURE_NPT (1 << 0) 46#define SVM_FEATURE_NPT (1 << 0)
49#define SVM_FEATURE_LBRV (1 << 1) 47#define SVM_FEATURE_LBRV (1 << 1)
50#define SVM_DEATURE_SVML (1 << 2) 48#define SVM_DEATURE_SVML (1 << 2)
@@ -102,20 +100,20 @@ static inline u32 svm_has(u32 feat)
102 100
103static inline u8 pop_irq(struct kvm_vcpu *vcpu) 101static inline u8 pop_irq(struct kvm_vcpu *vcpu)
104{ 102{
105 int word_index = __ffs(vcpu->irq_summary); 103 int word_index = __ffs(vcpu->arch.irq_summary);
106 int bit_index = __ffs(vcpu->irq_pending[word_index]); 104 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
107 int irq = word_index * BITS_PER_LONG + bit_index; 105 int irq = word_index * BITS_PER_LONG + bit_index;
108 106
109 clear_bit(bit_index, &vcpu->irq_pending[word_index]); 107 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
110 if (!vcpu->irq_pending[word_index]) 108 if (!vcpu->arch.irq_pending[word_index])
111 clear_bit(word_index, &vcpu->irq_summary); 109 clear_bit(word_index, &vcpu->arch.irq_summary);
112 return irq; 110 return irq;
113} 111}
114 112
115static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) 113static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
116{ 114{
117 set_bit(irq, vcpu->irq_pending); 115 set_bit(irq, vcpu->arch.irq_pending);
118 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); 116 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
119} 117}
120 118
121static inline void clgi(void) 119static inline void clgi(void)
@@ -184,35 +182,30 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
184 182
185static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 183static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
186{ 184{
187 if (!(efer & KVM_EFER_LMA)) 185 if (!(efer & EFER_LMA))
188 efer &= ~KVM_EFER_LME; 186 efer &= ~EFER_LME;
189 187
190 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; 188 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
191 vcpu->shadow_efer = efer; 189 vcpu->arch.shadow_efer = efer;
192} 190}
193 191
194static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) 192static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
193 bool has_error_code, u32 error_code)
195{ 194{
196 struct vcpu_svm *svm = to_svm(vcpu); 195 struct vcpu_svm *svm = to_svm(vcpu);
197 196
198 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | 197 svm->vmcb->control.event_inj = nr
199 SVM_EVTINJ_VALID_ERR | 198 | SVM_EVTINJ_VALID
200 SVM_EVTINJ_TYPE_EXEPT | 199 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
201 GP_VECTOR; 200 | SVM_EVTINJ_TYPE_EXEPT;
202 svm->vmcb->control.event_inj_err = error_code; 201 svm->vmcb->control.event_inj_err = error_code;
203} 202}
204 203
205static void inject_ud(struct kvm_vcpu *vcpu) 204static bool svm_exception_injected(struct kvm_vcpu *vcpu)
206{ 205{
207 to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID | 206 struct vcpu_svm *svm = to_svm(vcpu);
208 SVM_EVTINJ_TYPE_EXEPT |
209 UD_VECTOR;
210}
211 207
212static int is_page_fault(uint32_t info) 208 return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
213{
214 info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
215 return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT);
216} 209}
217 210
218static int is_external_interrupt(u32 info) 211static int is_external_interrupt(u32 info)
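
svm_queue_exception() replaces the separate #GP and #UD injectors with a single helper that assembles the VMCB event-injection word from the vector, the exception type and an optional error code. A sketch of that bit assembly, with the field layout assumed from the AMD SVM event-injection format (the constants below are illustrative stand-ins for the svm.h definitions):

#include <stdio.h>
#include <stdint.h>

/* Assumed SVM event-injection encoding: bits 7:0 vector, bits 10:8 type
 * (3 = exception), bit 11 "error code valid", bit 31 "valid". */
#define EVTINJ_TYPE_EXEPT  (3u << 8)
#define EVTINJ_VALID_ERR   (1u << 11)
#define EVTINJ_VALID       (1u << 31)

static uint32_t build_event_inj(unsigned vector, int has_error_code)
{
        return vector
               | EVTINJ_VALID
               | (has_error_code ? EVTINJ_VALID_ERR : 0)
               | EVTINJ_TYPE_EXEPT;
}

int main(void)
{
        /* #GP (vector 13) with an error code; the code itself goes into
         * the separate event_inj_err field, as in the hunk above. */
        printf("event_inj = 0x%08x\n", build_event_inj(13, 1));
        return 0;
}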
@@ -229,17 +222,16 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
229 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); 222 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
230 return; 223 return;
231 } 224 }
232 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) { 225 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
233 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", 226 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
234 __FUNCTION__, 227 __FUNCTION__,
235 svm->vmcb->save.rip, 228 svm->vmcb->save.rip,
236 svm->next_rip); 229 svm->next_rip);
237 }
238 230
239 vcpu->rip = svm->vmcb->save.rip = svm->next_rip; 231 vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
240 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 232 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
241 233
242 vcpu->interrupt_window_open = 1; 234 vcpu->arch.interrupt_window_open = 1;
243} 235}
244 236
245static int has_svm(void) 237static int has_svm(void)
@@ -312,7 +304,7 @@ static void svm_hardware_enable(void *garbage)
312 svm_data->next_asid = svm_data->max_asid + 1; 304 svm_data->next_asid = svm_data->max_asid + 1;
313 svm_features = cpuid_edx(SVM_CPUID_FUNC); 305 svm_features = cpuid_edx(SVM_CPUID_FUNC);
314 306
315 asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); 307 asm volatile ("sgdt %0" : "=m"(gdt_descr));
316 gdt = (struct desc_struct *)gdt_descr.address; 308 gdt = (struct desc_struct *)gdt_descr.address;
317 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 309 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
318 310
@@ -458,11 +450,13 @@ static void init_vmcb(struct vmcb *vmcb)
458 450
459 control->intercept_cr_read = INTERCEPT_CR0_MASK | 451 control->intercept_cr_read = INTERCEPT_CR0_MASK |
460 INTERCEPT_CR3_MASK | 452 INTERCEPT_CR3_MASK |
461 INTERCEPT_CR4_MASK; 453 INTERCEPT_CR4_MASK |
454 INTERCEPT_CR8_MASK;
462 455
463 control->intercept_cr_write = INTERCEPT_CR0_MASK | 456 control->intercept_cr_write = INTERCEPT_CR0_MASK |
464 INTERCEPT_CR3_MASK | 457 INTERCEPT_CR3_MASK |
465 INTERCEPT_CR4_MASK; 458 INTERCEPT_CR4_MASK |
459 INTERCEPT_CR8_MASK;
466 460
467 control->intercept_dr_read = INTERCEPT_DR0_MASK | 461 control->intercept_dr_read = INTERCEPT_DR0_MASK |
468 INTERCEPT_DR1_MASK | 462 INTERCEPT_DR1_MASK |
@@ -476,7 +470,8 @@ static void init_vmcb(struct vmcb *vmcb)
476 INTERCEPT_DR5_MASK | 470 INTERCEPT_DR5_MASK |
477 INTERCEPT_DR7_MASK; 471 INTERCEPT_DR7_MASK;
478 472
479 control->intercept_exceptions = 1 << PF_VECTOR; 473 control->intercept_exceptions = (1 << PF_VECTOR) |
474 (1 << UD_VECTOR);
480 475
481 476
482 control->intercept = (1ULL << INTERCEPT_INTR) | 477 control->intercept = (1ULL << INTERCEPT_INTR) |
@@ -543,8 +538,7 @@ static void init_vmcb(struct vmcb *vmcb)
543 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 538 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
544 539
545 save->efer = MSR_EFER_SVME_MASK; 540 save->efer = MSR_EFER_SVME_MASK;
546 541 save->dr6 = 0xffff0ff0;
547 save->dr6 = 0xffff0ff0;
548 save->dr7 = 0x400; 542 save->dr7 = 0x400;
549 save->rflags = 2; 543 save->rflags = 2;
550 save->rip = 0x0000fff0; 544 save->rip = 0x0000fff0;
@@ -558,7 +552,7 @@ static void init_vmcb(struct vmcb *vmcb)
558 /* rdx = ?? */ 552 /* rdx = ?? */
559} 553}
560 554
561static void svm_vcpu_reset(struct kvm_vcpu *vcpu) 555static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
562{ 556{
563 struct vcpu_svm *svm = to_svm(vcpu); 557 struct vcpu_svm *svm = to_svm(vcpu);
564 558
@@ -566,9 +560,11 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
566 560
567 if (vcpu->vcpu_id != 0) { 561 if (vcpu->vcpu_id != 0) {
568 svm->vmcb->save.rip = 0; 562 svm->vmcb->save.rip = 0;
569 svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12; 563 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
570 svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8; 564 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
571 } 565 }
566
567 return 0;
572} 568}
573 569
574static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) 570static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
@@ -587,12 +583,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
587 if (err) 583 if (err)
588 goto free_svm; 584 goto free_svm;
589 585
590 if (irqchip_in_kernel(kvm)) {
591 err = kvm_create_lapic(&svm->vcpu);
592 if (err < 0)
593 goto free_svm;
594 }
595
596 page = alloc_page(GFP_KERNEL); 586 page = alloc_page(GFP_KERNEL);
597 if (!page) { 587 if (!page) {
598 err = -ENOMEM; 588 err = -ENOMEM;
@@ -608,9 +598,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
608 598
609 fx_init(&svm->vcpu); 599 fx_init(&svm->vcpu);
610 svm->vcpu.fpu_active = 1; 600 svm->vcpu.fpu_active = 1;
611 svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 601 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
612 if (svm->vcpu.vcpu_id == 0) 602 if (svm->vcpu.vcpu_id == 0)
613 svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP; 603 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
614 604
615 return &svm->vcpu; 605 return &svm->vcpu;
616 606
@@ -644,7 +634,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
644 * increasing TSC. 634 * increasing TSC.
645 */ 635 */
646 rdtscll(tsc_this); 636 rdtscll(tsc_this);
647 delta = vcpu->host_tsc - tsc_this; 637 delta = vcpu->arch.host_tsc - tsc_this;
648 svm->vmcb->control.tsc_offset += delta; 638 svm->vmcb->control.tsc_offset += delta;
649 vcpu->cpu = cpu; 639 vcpu->cpu = cpu;
650 kvm_migrate_apic_timer(vcpu); 640 kvm_migrate_apic_timer(vcpu);
@@ -659,11 +649,11 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
659 struct vcpu_svm *svm = to_svm(vcpu); 649 struct vcpu_svm *svm = to_svm(vcpu);
660 int i; 650 int i;
661 651
652 ++vcpu->stat.host_state_reload;
662 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 653 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
663 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 654 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
664 655
665 rdtscll(vcpu->host_tsc); 656 rdtscll(vcpu->arch.host_tsc);
666 kvm_put_guest_fpu(vcpu);
667} 657}
668 658
669static void svm_vcpu_decache(struct kvm_vcpu *vcpu) 659static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
@@ -674,17 +664,17 @@ static void svm_cache_regs(struct kvm_vcpu *vcpu)
674{ 664{
675 struct vcpu_svm *svm = to_svm(vcpu); 665 struct vcpu_svm *svm = to_svm(vcpu);
676 666
677 vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 667 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
678 vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 668 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
679 vcpu->rip = svm->vmcb->save.rip; 669 vcpu->arch.rip = svm->vmcb->save.rip;
680} 670}
681 671
682static void svm_decache_regs(struct kvm_vcpu *vcpu) 672static void svm_decache_regs(struct kvm_vcpu *vcpu)
683{ 673{
684 struct vcpu_svm *svm = to_svm(vcpu); 674 struct vcpu_svm *svm = to_svm(vcpu);
685 svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; 675 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
686 svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; 676 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
687 svm->vmcb->save.rip = vcpu->rip; 677 svm->vmcb->save.rip = vcpu->arch.rip;
688} 678}
689 679
690static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 680static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -782,24 +772,24 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
782 struct vcpu_svm *svm = to_svm(vcpu); 772 struct vcpu_svm *svm = to_svm(vcpu);
783 773
784#ifdef CONFIG_X86_64 774#ifdef CONFIG_X86_64
785 if (vcpu->shadow_efer & KVM_EFER_LME) { 775 if (vcpu->arch.shadow_efer & EFER_LME) {
786 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 776 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
787 vcpu->shadow_efer |= KVM_EFER_LMA; 777 vcpu->arch.shadow_efer |= EFER_LMA;
788 svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; 778 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
789 } 779 }
790 780
791 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) { 781 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
792 vcpu->shadow_efer &= ~KVM_EFER_LMA; 782 vcpu->arch.shadow_efer &= ~EFER_LMA;
793 svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); 783 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
794 } 784 }
795 } 785 }
796#endif 786#endif
797 if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { 787 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
798 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 788 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
799 vcpu->fpu_active = 1; 789 vcpu->fpu_active = 1;
800 } 790 }
801 791
802 vcpu->cr0 = cr0; 792 vcpu->arch.cr0 = cr0;
803 cr0 |= X86_CR0_PG | X86_CR0_WP; 793 cr0 |= X86_CR0_PG | X86_CR0_WP;
804 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 794 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
805 svm->vmcb->save.cr0 = cr0; 795 svm->vmcb->save.cr0 = cr0;
@@ -807,7 +797,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
807 797
808static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 798static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
809{ 799{
810 vcpu->cr4 = cr4; 800 vcpu->arch.cr4 = cr4;
811 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; 801 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
812} 802}
813 803
@@ -912,7 +902,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
912 svm->db_regs[dr] = value; 902 svm->db_regs[dr] = value;
913 return; 903 return;
914 case 4 ... 5: 904 case 4 ... 5:
915 if (vcpu->cr4 & X86_CR4_DE) { 905 if (vcpu->arch.cr4 & X86_CR4_DE) {
916 *exception = UD_VECTOR; 906 *exception = UD_VECTOR;
917 return; 907 return;
918 } 908 }
@@ -938,51 +928,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
938 struct kvm *kvm = svm->vcpu.kvm; 928 struct kvm *kvm = svm->vcpu.kvm;
939 u64 fault_address; 929 u64 fault_address;
940 u32 error_code; 930 u32 error_code;
941 enum emulation_result er;
942 int r;
943 931
944 if (!irqchip_in_kernel(kvm) && 932 if (!irqchip_in_kernel(kvm) &&
945 is_external_interrupt(exit_int_info)) 933 is_external_interrupt(exit_int_info))
946 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); 934 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
947 935
948 mutex_lock(&kvm->lock);
949
950 fault_address = svm->vmcb->control.exit_info_2; 936 fault_address = svm->vmcb->control.exit_info_2;
951 error_code = svm->vmcb->control.exit_info_1; 937 error_code = svm->vmcb->control.exit_info_1;
952 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 938 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
953 if (r < 0) { 939}
954 mutex_unlock(&kvm->lock);
955 return r;
956 }
957 if (!r) {
958 mutex_unlock(&kvm->lock);
959 return 1;
960 }
961 er = emulate_instruction(&svm->vcpu, kvm_run, fault_address,
962 error_code);
963 mutex_unlock(&kvm->lock);
964 940
965 switch (er) { 941static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
966 case EMULATE_DONE: 942{
967 return 1; 943 int er;
968 case EMULATE_DO_MMIO:
969 ++svm->vcpu.stat.mmio_exits;
970 return 0;
971 case EMULATE_FAIL:
972 kvm_report_emulation_failure(&svm->vcpu, "pagetable");
973 break;
974 default:
975 BUG();
976 }
977 944
978 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 945 er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
979 return 0; 946 if (er != EMULATE_DONE)
947 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
948 return 1;
980} 949}
981 950
982static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 951static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
983{ 952{
984 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 953 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
985 if (!(svm->vcpu.cr0 & X86_CR0_TS)) 954 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
986 svm->vmcb->save.cr0 &= ~X86_CR0_TS; 955 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
987 svm->vcpu.fpu_active = 1; 956 svm->vcpu.fpu_active = 1;
988 957
@@ -1004,7 +973,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1004 973
1005static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 974static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1006{ 975{
1007 u32 io_info = svm->vmcb->control.exit_info_1; //address size bug? 976 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1008 int size, down, in, string, rep; 977 int size, down, in, string, rep;
1009 unsigned port; 978 unsigned port;
1010 979
@@ -1015,7 +984,8 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1015 string = (io_info & SVM_IOIO_STR_MASK) != 0; 984 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1016 985
1017 if (string) { 986 if (string) {
1018 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) 987 if (emulate_instruction(&svm->vcpu,
988 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
1019 return 0; 989 return 0;
1020 return 1; 990 return 1;
1021 } 991 }
@@ -1045,13 +1015,14 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1045{ 1015{
1046 svm->next_rip = svm->vmcb->save.rip + 3; 1016 svm->next_rip = svm->vmcb->save.rip + 3;
1047 skip_emulated_instruction(&svm->vcpu); 1017 skip_emulated_instruction(&svm->vcpu);
1048 return kvm_hypercall(&svm->vcpu, kvm_run); 1018 kvm_emulate_hypercall(&svm->vcpu);
1019 return 1;
1049} 1020}
1050 1021
1051static int invalid_op_interception(struct vcpu_svm *svm, 1022static int invalid_op_interception(struct vcpu_svm *svm,
1052 struct kvm_run *kvm_run) 1023 struct kvm_run *kvm_run)
1053{ 1024{
1054 inject_ud(&svm->vcpu); 1025 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1055 return 1; 1026 return 1;
1056} 1027}
1057 1028
@@ -1073,11 +1044,20 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1073static int emulate_on_interception(struct vcpu_svm *svm, 1044static int emulate_on_interception(struct vcpu_svm *svm,
1074 struct kvm_run *kvm_run) 1045 struct kvm_run *kvm_run)
1075{ 1046{
1076 if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE) 1047 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
1077 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); 1048 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
1078 return 1; 1049 return 1;
1079} 1050}
1080 1051
1052static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1053{
1054 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
1055 if (irqchip_in_kernel(svm->vcpu.kvm))
1056 return 1;
1057 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1058 return 0;
1059}
1060
1081static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) 1061static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1082{ 1062{
1083 struct vcpu_svm *svm = to_svm(vcpu); 1063 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1124,14 +1104,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1124 1104
1125static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1105static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1126{ 1106{
1127 u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; 1107 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1128 u64 data; 1108 u64 data;
1129 1109
1130 if (svm_get_msr(&svm->vcpu, ecx, &data)) 1110 if (svm_get_msr(&svm->vcpu, ecx, &data))
1131 svm_inject_gp(&svm->vcpu, 0); 1111 kvm_inject_gp(&svm->vcpu, 0);
1132 else { 1112 else {
1133 svm->vmcb->save.rax = data & 0xffffffff; 1113 svm->vmcb->save.rax = data & 0xffffffff;
1134 svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32; 1114 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1135 svm->next_rip = svm->vmcb->save.rip + 2; 1115 svm->next_rip = svm->vmcb->save.rip + 2;
1136 skip_emulated_instruction(&svm->vcpu); 1116 skip_emulated_instruction(&svm->vcpu);
1137 } 1117 }
@@ -1176,7 +1156,20 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1176 case MSR_IA32_SYSENTER_ESP: 1156 case MSR_IA32_SYSENTER_ESP:
1177 svm->vmcb->save.sysenter_esp = data; 1157 svm->vmcb->save.sysenter_esp = data;
1178 break; 1158 break;
1159 case MSR_K7_EVNTSEL0:
1160 case MSR_K7_EVNTSEL1:
1161 case MSR_K7_EVNTSEL2:
1162 case MSR_K7_EVNTSEL3:
1163 /*
1164 * only support writing 0 to the performance counters for now
1165 * to make Windows happy. Should be replaced by a real
1166 * performance counter emulation later.
1167 */
1168 if (data != 0)
1169 goto unhandled;
1170 break;
1179 default: 1171 default:
1172 unhandled:
1180 return kvm_set_msr_common(vcpu, ecx, data); 1173 return kvm_set_msr_common(vcpu, ecx, data);
1181 } 1174 }
1182 return 0; 1175 return 0;
@@ -1184,12 +1177,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1184 1177
1185static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1178static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1186{ 1179{
1187 u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; 1180 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1188 u64 data = (svm->vmcb->save.rax & -1u) 1181 u64 data = (svm->vmcb->save.rax & -1u)
1189 | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32); 1182 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1190 svm->next_rip = svm->vmcb->save.rip + 2; 1183 svm->next_rip = svm->vmcb->save.rip + 2;
1191 if (svm_set_msr(&svm->vcpu, ecx, data)) 1184 if (svm_set_msr(&svm->vcpu, ecx, data))
1192 svm_inject_gp(&svm->vcpu, 0); 1185 kvm_inject_gp(&svm->vcpu, 0);
1193 else 1186 else
1194 skip_emulated_instruction(&svm->vcpu); 1187 skip_emulated_instruction(&svm->vcpu);
1195 return 1; 1188 return 1;
@@ -1213,7 +1206,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
1213 * possible 1206 * possible
1214 */ 1207 */
1215 if (kvm_run->request_interrupt_window && 1208 if (kvm_run->request_interrupt_window &&
1216 !svm->vcpu.irq_summary) { 1209 !svm->vcpu.arch.irq_summary) {
1217 ++svm->vcpu.stat.irq_window_exits; 1210 ++svm->vcpu.stat.irq_window_exits;
1218 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 1211 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1219 return 0; 1212 return 0;
@@ -1227,10 +1220,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1227 [SVM_EXIT_READ_CR0] = emulate_on_interception, 1220 [SVM_EXIT_READ_CR0] = emulate_on_interception,
1228 [SVM_EXIT_READ_CR3] = emulate_on_interception, 1221 [SVM_EXIT_READ_CR3] = emulate_on_interception,
1229 [SVM_EXIT_READ_CR4] = emulate_on_interception, 1222 [SVM_EXIT_READ_CR4] = emulate_on_interception,
1223 [SVM_EXIT_READ_CR8] = emulate_on_interception,
1230 /* for now: */ 1224 /* for now: */
1231 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 1225 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
1232 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 1226 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
1233 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 1227 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
1228 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
1234 [SVM_EXIT_READ_DR0] = emulate_on_interception, 1229 [SVM_EXIT_READ_DR0] = emulate_on_interception,
1235 [SVM_EXIT_READ_DR1] = emulate_on_interception, 1230 [SVM_EXIT_READ_DR1] = emulate_on_interception,
1236 [SVM_EXIT_READ_DR2] = emulate_on_interception, 1231 [SVM_EXIT_READ_DR2] = emulate_on_interception,
@@ -1241,6 +1236,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1241 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 1236 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
1242 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 1237 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
1243 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 1238 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
1239 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
1244 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 1240 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1245 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 1241 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1246 [SVM_EXIT_INTR] = nop_on_interception, 1242 [SVM_EXIT_INTR] = nop_on_interception,
@@ -1293,7 +1289,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1293 exit_code); 1289 exit_code);
1294 1290
1295 if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 1291 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
1296 || svm_exit_handlers[exit_code] == 0) { 1292 || !svm_exit_handlers[exit_code]) {
1297 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 1293 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1298 kvm_run->hw.hardware_exit_reason = exit_code; 1294 kvm_run->hw.hardware_exit_reason = exit_code;
1299 return 0; 1295 return 0;
@@ -1307,7 +1303,7 @@ static void reload_tss(struct kvm_vcpu *vcpu)
1307 int cpu = raw_smp_processor_id(); 1303 int cpu = raw_smp_processor_id();
1308 1304
1309 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); 1305 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1310 svm_data->tss_desc->type = 9; //available 32/64-bit TSS 1306 svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
1311 load_TR_desc(); 1307 load_TR_desc();
1312} 1308}
1313 1309
@@ -1348,7 +1344,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
1348 struct vmcb *vmcb = svm->vmcb; 1344 struct vmcb *vmcb = svm->vmcb;
1349 int intr_vector = -1; 1345 int intr_vector = -1;
1350 1346
1351 kvm_inject_pending_timer_irqs(vcpu);
1352 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && 1347 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
1353 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { 1348 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
1354 intr_vector = vmcb->control.exit_int_info & 1349 intr_vector = vmcb->control.exit_int_info &
@@ -1388,20 +1383,20 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
1388 push_irq(&svm->vcpu, control->int_vector); 1383 push_irq(&svm->vcpu, control->int_vector);
1389 } 1384 }
1390 1385
1391 svm->vcpu.interrupt_window_open = 1386 svm->vcpu.arch.interrupt_window_open =
1392 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); 1387 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
1393} 1388}
1394 1389
1395static void svm_do_inject_vector(struct vcpu_svm *svm) 1390static void svm_do_inject_vector(struct vcpu_svm *svm)
1396{ 1391{
1397 struct kvm_vcpu *vcpu = &svm->vcpu; 1392 struct kvm_vcpu *vcpu = &svm->vcpu;
1398 int word_index = __ffs(vcpu->irq_summary); 1393 int word_index = __ffs(vcpu->arch.irq_summary);
1399 int bit_index = __ffs(vcpu->irq_pending[word_index]); 1394 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1400 int irq = word_index * BITS_PER_LONG + bit_index; 1395 int irq = word_index * BITS_PER_LONG + bit_index;
1401 1396
1402 clear_bit(bit_index, &vcpu->irq_pending[word_index]); 1397 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1403 if (!vcpu->irq_pending[word_index]) 1398 if (!vcpu->arch.irq_pending[word_index])
1404 clear_bit(word_index, &vcpu->irq_summary); 1399 clear_bit(word_index, &vcpu->arch.irq_summary);
1405 svm_inject_irq(svm, irq); 1400 svm_inject_irq(svm, irq);
1406} 1401}
1407 1402
@@ -1411,11 +1406,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1411 struct vcpu_svm *svm = to_svm(vcpu); 1406 struct vcpu_svm *svm = to_svm(vcpu);
1412 struct vmcb_control_area *control = &svm->vmcb->control; 1407 struct vmcb_control_area *control = &svm->vmcb->control;
1413 1408
1414 svm->vcpu.interrupt_window_open = 1409 svm->vcpu.arch.interrupt_window_open =
1415 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && 1410 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
1416 (svm->vmcb->save.rflags & X86_EFLAGS_IF)); 1411 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
1417 1412
1418 if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary) 1413 if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
1419 /* 1414 /*
1420 * If interrupts enabled, and not blocked by sti or mov ss. Good. 1415 * If interrupts enabled, and not blocked by sti or mov ss. Good.
1421 */ 1416 */
@@ -1424,13 +1419,18 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1424 /* 1419 /*
1425 * Interrupts blocked. Wait for unblock. 1420 * Interrupts blocked. Wait for unblock.
1426 */ 1421 */
1427 if (!svm->vcpu.interrupt_window_open && 1422 if (!svm->vcpu.arch.interrupt_window_open &&
1428 (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) { 1423 (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
1429 control->intercept |= 1ULL << INTERCEPT_VINTR; 1424 control->intercept |= 1ULL << INTERCEPT_VINTR;
1430 } else 1425 else
1431 control->intercept &= ~(1ULL << INTERCEPT_VINTR); 1426 control->intercept &= ~(1ULL << INTERCEPT_VINTR);
1432} 1427}
1433 1428
1429static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
1430{
1431 return 0;
1432}
1433
1434static void save_db_regs(unsigned long *db_regs) 1434static void save_db_regs(unsigned long *db_regs)
1435{ 1435{
1436 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); 1436 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
@@ -1472,7 +1472,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1472 svm->host_cr2 = kvm_read_cr2(); 1472 svm->host_cr2 = kvm_read_cr2();
1473 svm->host_dr6 = read_dr6(); 1473 svm->host_dr6 = read_dr6();
1474 svm->host_dr7 = read_dr7(); 1474 svm->host_dr7 = read_dr7();
1475 svm->vmcb->save.cr2 = vcpu->cr2; 1475 svm->vmcb->save.cr2 = vcpu->arch.cr2;
1476 1476
1477 if (svm->vmcb->save.dr7 & 0xff) { 1477 if (svm->vmcb->save.dr7 & 0xff) {
1478 write_dr7(0); 1478 write_dr7(0);
@@ -1486,13 +1486,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1486 1486
1487 asm volatile ( 1487 asm volatile (
1488#ifdef CONFIG_X86_64 1488#ifdef CONFIG_X86_64
1489 "push %%rbx; push %%rcx; push %%rdx;" 1489 "push %%rbp; \n\t"
1490 "push %%rsi; push %%rdi; push %%rbp;"
1491 "push %%r8; push %%r9; push %%r10; push %%r11;"
1492 "push %%r12; push %%r13; push %%r14; push %%r15;"
1493#else 1490#else
1494 "push %%ebx; push %%ecx; push %%edx;" 1491 "push %%ebp; \n\t"
1495 "push %%esi; push %%edi; push %%ebp;"
1496#endif 1492#endif
1497 1493
1498#ifdef CONFIG_X86_64 1494#ifdef CONFIG_X86_64
@@ -1554,10 +1550,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1554 "mov %%r14, %c[r14](%[svm]) \n\t" 1550 "mov %%r14, %c[r14](%[svm]) \n\t"
1555 "mov %%r15, %c[r15](%[svm]) \n\t" 1551 "mov %%r15, %c[r15](%[svm]) \n\t"
1556 1552
1557 "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" 1553 "pop %%rbp; \n\t"
1558 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
1559 "pop %%rbp; pop %%rdi; pop %%rsi;"
1560 "pop %%rdx; pop %%rcx; pop %%rbx; \n\t"
1561#else 1554#else
1562 "mov %%ebx, %c[rbx](%[svm]) \n\t" 1555 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1563 "mov %%ecx, %c[rcx](%[svm]) \n\t" 1556 "mov %%ecx, %c[rcx](%[svm]) \n\t"
@@ -1566,34 +1559,40 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1566 "mov %%edi, %c[rdi](%[svm]) \n\t" 1559 "mov %%edi, %c[rdi](%[svm]) \n\t"
1567 "mov %%ebp, %c[rbp](%[svm]) \n\t" 1560 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1568 1561
1569 "pop %%ebp; pop %%edi; pop %%esi;" 1562 "pop %%ebp; \n\t"
1570 "pop %%edx; pop %%ecx; pop %%ebx; \n\t"
1571#endif 1563#endif
1572 : 1564 :
1573 : [svm]"a"(svm), 1565 : [svm]"a"(svm),
1574 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 1566 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
1575 [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])), 1567 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
1576 [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])), 1568 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
1577 [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])), 1569 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
1578 [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])), 1570 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
1579 [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])), 1571 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
1580 [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP])) 1572 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
1581#ifdef CONFIG_X86_64 1573#ifdef CONFIG_X86_64
1582 ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])), 1574 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
1583 [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])), 1575 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
1584 [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])), 1576 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
1585 [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])), 1577 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
1586 [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])), 1578 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
1587 [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])), 1579 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
1588 [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])), 1580 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
1589 [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15])) 1581 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
1590#endif 1582#endif
1591 : "cc", "memory" ); 1583 : "cc", "memory"
1584#ifdef CONFIG_X86_64
1585 , "rbx", "rcx", "rdx", "rsi", "rdi"
1586 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
1587#else
1588 , "ebx", "ecx", "edx" , "esi", "edi"
1589#endif
1590 );
1592 1591
1593 if ((svm->vmcb->save.dr7 & 0xff)) 1592 if ((svm->vmcb->save.dr7 & 0xff))
1594 load_db_regs(svm->host_db_regs); 1593 load_db_regs(svm->host_db_regs);
1595 1594
1596 vcpu->cr2 = svm->vmcb->save.cr2; 1595 vcpu->arch.cr2 = svm->vmcb->save.cr2;
1597 1596
1598 write_dr6(svm->host_dr6); 1597 write_dr6(svm->host_dr6);
1599 write_dr7(svm->host_dr7); 1598 write_dr7(svm->host_dr7);
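
The hunk above drops the long hand-written push/pop sequences around the world switch and instead names the scratch registers in the asm clobber list, leaving only %rbp/%ebp to be saved explicitly. A minimal, self-contained illustration of the clobber mechanism on x86-64 (nothing in this snippet comes from the patch itself):

#include <stdio.h>

int main(void)
{
        unsigned long x = 5;

        /* Naming "rbx" in the clobber list tells the compiler this asm
         * statement overwrites %rbx, so it keeps no live value there and
         * saves/restores it as needed -- the same idea the patch relies
         * on instead of explicit push/pop sequences. */
        asm volatile("movq %1, %%rbx\n\t"
                     "addq %%rbx, %0"
                     : "+r"(x)
                     : "r"(x)
                     : "rbx", "cc");

        printf("%lu\n", x);     /* 5 + 5 = 10 */
        return 0;
}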
@@ -1627,34 +1626,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1627 } 1626 }
1628} 1627}
1629 1628
1630static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
1631 unsigned long addr,
1632 uint32_t err_code)
1633{
1634 struct vcpu_svm *svm = to_svm(vcpu);
1635 uint32_t exit_int_info = svm->vmcb->control.exit_int_info;
1636
1637 ++vcpu->stat.pf_guest;
1638
1639 if (is_page_fault(exit_int_info)) {
1640
1641 svm->vmcb->control.event_inj_err = 0;
1642 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1643 SVM_EVTINJ_VALID_ERR |
1644 SVM_EVTINJ_TYPE_EXEPT |
1645 DF_VECTOR;
1646 return;
1647 }
1648 vcpu->cr2 = addr;
1649 svm->vmcb->save.cr2 = addr;
1650 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1651 SVM_EVTINJ_VALID_ERR |
1652 SVM_EVTINJ_TYPE_EXEPT |
1653 PF_VECTOR;
1654 svm->vmcb->control.event_inj_err = err_code;
1655}
1656
1657
1658static int is_disabled(void) 1629static int is_disabled(void)
1659{ 1630{
1660 u64 vm_cr; 1631 u64 vm_cr;
@@ -1675,7 +1646,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1675 hypercall[0] = 0x0f; 1646 hypercall[0] = 0x0f;
1676 hypercall[1] = 0x01; 1647 hypercall[1] = 0x01;
1677 hypercall[2] = 0xd9; 1648 hypercall[2] = 0xd9;
1678 hypercall[3] = 0xc3;
1679} 1649}
1680 1650
1681static void svm_check_processor_compat(void *rtn) 1651static void svm_check_processor_compat(void *rtn)
@@ -1683,6 +1653,11 @@ static void svm_check_processor_compat(void *rtn)
1683 *(int *)rtn = 0; 1653 *(int *)rtn = 0;
1684} 1654}
1685 1655
1656static bool svm_cpu_has_accelerated_tpr(void)
1657{
1658 return false;
1659}
1660
1686static struct kvm_x86_ops svm_x86_ops = { 1661static struct kvm_x86_ops svm_x86_ops = {
1687 .cpu_has_kvm_support = has_svm, 1662 .cpu_has_kvm_support = has_svm,
1688 .disabled_by_bios = is_disabled, 1663 .disabled_by_bios = is_disabled,
@@ -1691,6 +1666,7 @@ static struct kvm_x86_ops svm_x86_ops = {
1691 .check_processor_compatibility = svm_check_processor_compat, 1666 .check_processor_compatibility = svm_check_processor_compat,
1692 .hardware_enable = svm_hardware_enable, 1667 .hardware_enable = svm_hardware_enable,
1693 .hardware_disable = svm_hardware_disable, 1668 .hardware_disable = svm_hardware_disable,
1669 .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
1694 1670
1695 .vcpu_create = svm_create_vcpu, 1671 .vcpu_create = svm_create_vcpu,
1696 .vcpu_free = svm_free_vcpu, 1672 .vcpu_free = svm_free_vcpu,
@@ -1725,9 +1701,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1725 .set_rflags = svm_set_rflags, 1701 .set_rflags = svm_set_rflags,
1726 1702
1727 .tlb_flush = svm_flush_tlb, 1703 .tlb_flush = svm_flush_tlb,
1728 .inject_page_fault = svm_inject_page_fault,
1729
1730 .inject_gp = svm_inject_gp,
1731 1704
1732 .run = svm_vcpu_run, 1705 .run = svm_vcpu_run,
1733 .handle_exit = handle_exit, 1706 .handle_exit = handle_exit,
@@ -1735,19 +1708,23 @@ static struct kvm_x86_ops svm_x86_ops = {
1735 .patch_hypercall = svm_patch_hypercall, 1708 .patch_hypercall = svm_patch_hypercall,
1736 .get_irq = svm_get_irq, 1709 .get_irq = svm_get_irq,
1737 .set_irq = svm_set_irq, 1710 .set_irq = svm_set_irq,
1711 .queue_exception = svm_queue_exception,
1712 .exception_injected = svm_exception_injected,
1738 .inject_pending_irq = svm_intr_assist, 1713 .inject_pending_irq = svm_intr_assist,
1739 .inject_pending_vectors = do_interrupt_requests, 1714 .inject_pending_vectors = do_interrupt_requests,
1715
1716 .set_tss_addr = svm_set_tss_addr,
1740}; 1717};
1741 1718
1742static int __init svm_init(void) 1719static int __init svm_init(void)
1743{ 1720{
1744 return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm), 1721 return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
1745 THIS_MODULE); 1722 THIS_MODULE);
1746} 1723}
1747 1724
1748static void __exit svm_exit(void) 1725static void __exit svm_exit(void)
1749{ 1726{
1750 kvm_exit_x86(); 1727 kvm_exit();
1751} 1728}
1752 1729
1753module_init(svm_init) 1730module_init(svm_init)
diff --git a/drivers/kvm/svm.h b/arch/x86/kvm/svm.h
index 3b1b0f35b6cb..5fd50491b555 100644
--- a/drivers/kvm/svm.h
+++ b/arch/x86/kvm/svm.h
@@ -204,6 +204,7 @@ struct __attribute__ ((__packed__)) vmcb {
204#define INTERCEPT_CR0_MASK 1 204#define INTERCEPT_CR0_MASK 1
205#define INTERCEPT_CR3_MASK (1 << 3) 205#define INTERCEPT_CR3_MASK (1 << 3)
206#define INTERCEPT_CR4_MASK (1 << 4) 206#define INTERCEPT_CR4_MASK (1 << 4)
207#define INTERCEPT_CR8_MASK (1 << 8)
207 208
208#define INTERCEPT_DR0_MASK 1 209#define INTERCEPT_DR0_MASK 1
209#define INTERCEPT_DR1_MASK (1 << 1) 210#define INTERCEPT_DR1_MASK (1 << 1)
@@ -311,7 +312,7 @@ struct __attribute__ ((__packed__)) vmcb {
311 312
312#define SVM_EXIT_ERR -1 313#define SVM_EXIT_ERR -1
313 314
314#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP 315#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
315 316
316#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" 317#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
317#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" 318#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
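
INTERCEPT_CR8_MASK slots into the existing CR intercept bitmaps, where, as the neighbouring definitions indicate, control register N is tracked by bit N. A tiny sketch of how init_vmcb() composes such a mask (values mirror the header; the program itself is illustrative only):

#include <stdio.h>
#include <stdint.h>

#define INTERCEPT_CR0_MASK  (1u << 0)
#define INTERCEPT_CR3_MASK  (1u << 3)
#define INTERCEPT_CR4_MASK  (1u << 4)
#define INTERCEPT_CR8_MASK  (1u << 8)

int main(void)
{
        uint32_t intercept_cr_write = INTERCEPT_CR0_MASK |
                                      INTERCEPT_CR3_MASK |
                                      INTERCEPT_CR4_MASK |
                                      INTERCEPT_CR8_MASK;

        printf("cr write intercepts = 0x%03x\n", intercept_cr_write); /* 0x119 */
        return 0;
}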
diff --git a/drivers/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5b397b6c9f93..ad36447e696e 100644
--- a/drivers/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -15,17 +15,18 @@
15 * 15 *
16 */ 16 */
17 17
18#include "kvm.h"
19#include "x86_emulate.h"
20#include "irq.h" 18#include "irq.h"
21#include "vmx.h" 19#include "vmx.h"
22#include "segment_descriptor.h" 20#include "segment_descriptor.h"
21#include "mmu.h"
23 22
23#include <linux/kvm_host.h>
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/moduleparam.h>
29 30
30#include <asm/io.h> 31#include <asm/io.h>
31#include <asm/desc.h> 32#include <asm/desc.h>
@@ -33,6 +34,9 @@
33MODULE_AUTHOR("Qumranet"); 34MODULE_AUTHOR("Qumranet");
34MODULE_LICENSE("GPL"); 35MODULE_LICENSE("GPL");
35 36
37static int bypass_guest_pf = 1;
38module_param(bypass_guest_pf, bool, 0);
39
36struct vmcs { 40struct vmcs {
37 u32 revision_id; 41 u32 revision_id;
38 u32 abort; 42 u32 abort;
@@ -43,6 +47,7 @@ struct vcpu_vmx {
43 struct kvm_vcpu vcpu; 47 struct kvm_vcpu vcpu;
44 int launched; 48 int launched;
45 u8 fail; 49 u8 fail;
50 u32 idt_vectoring_info;
46 struct kvm_msr_entry *guest_msrs; 51 struct kvm_msr_entry *guest_msrs;
47 struct kvm_msr_entry *host_msrs; 52 struct kvm_msr_entry *host_msrs;
48 int nmsrs; 53 int nmsrs;
@@ -57,8 +62,15 @@ struct vcpu_vmx {
57 u16 fs_sel, gs_sel, ldt_sel; 62 u16 fs_sel, gs_sel, ldt_sel;
58 int gs_ldt_reload_needed; 63 int gs_ldt_reload_needed;
59 int fs_reload_needed; 64 int fs_reload_needed;
60 }host_state; 65 int guest_efer_loaded;
61 66 } host_state;
67 struct {
68 struct {
69 bool pending;
70 u8 vector;
71 unsigned rip;
72 } irq;
73 } rmode;
62}; 74};
63 75
64static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 76static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -74,14 +86,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
74static struct page *vmx_io_bitmap_a; 86static struct page *vmx_io_bitmap_a;
75static struct page *vmx_io_bitmap_b; 87static struct page *vmx_io_bitmap_b;
76 88
77#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
78
79static struct vmcs_config { 89static struct vmcs_config {
80 int size; 90 int size;
81 int order; 91 int order;
82 u32 revision_id; 92 u32 revision_id;
83 u32 pin_based_exec_ctrl; 93 u32 pin_based_exec_ctrl;
84 u32 cpu_based_exec_ctrl; 94 u32 cpu_based_exec_ctrl;
95 u32 cpu_based_2nd_exec_ctrl;
85 u32 vmexit_ctrl; 96 u32 vmexit_ctrl;
86 u32 vmentry_ctrl; 97 u32 vmentry_ctrl;
87} vmcs_config; 98} vmcs_config;
@@ -138,18 +149,6 @@ static void save_msrs(struct kvm_msr_entry *e, int n)
138 rdmsrl(e[i].index, e[i].data); 149 rdmsrl(e[i].index, e[i].data);
139} 150}
140 151
141static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr)
142{
143 return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
144}
145
146static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx)
147{
148 int efer_offset = vmx->msr_offset_efer;
149 return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) !=
150 msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
151}
152
153static inline int is_page_fault(u32 intr_info) 152static inline int is_page_fault(u32 intr_info)
154{ 153{
155 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 154 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -164,6 +163,13 @@ static inline int is_no_device(u32 intr_info)
164 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); 163 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
165} 164}
166 165
166static inline int is_invalid_opcode(u32 intr_info)
167{
168 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
169 INTR_INFO_VALID_MASK)) ==
170 (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
171}
172
167static inline int is_external_interrupt(u32 intr_info) 173static inline int is_external_interrupt(u32 intr_info)
168{ 174{
169 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 175 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -180,6 +186,24 @@ static inline int vm_need_tpr_shadow(struct kvm *kvm)
180 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); 186 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
181} 187}
182 188
189static inline int cpu_has_secondary_exec_ctrls(void)
190{
191 return (vmcs_config.cpu_based_exec_ctrl &
192 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
193}
194
195static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
196{
197 return (vmcs_config.cpu_based_2nd_exec_ctrl &
198 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
199}
200
201static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
202{
203 return ((cpu_has_vmx_virtualize_apic_accesses()) &&
204 (irqchip_in_kernel(kvm)));
205}
206
183static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 207static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
184{ 208{
185 int i; 209 int i;
@@ -222,16 +246,14 @@ static void __vcpu_clear(void *arg)
222 vmcs_clear(vmx->vmcs); 246 vmcs_clear(vmx->vmcs);
223 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 247 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
224 per_cpu(current_vmcs, cpu) = NULL; 248 per_cpu(current_vmcs, cpu) = NULL;
225 rdtscll(vmx->vcpu.host_tsc); 249 rdtscll(vmx->vcpu.arch.host_tsc);
226} 250}
227 251
228static void vcpu_clear(struct vcpu_vmx *vmx) 252static void vcpu_clear(struct vcpu_vmx *vmx)
229{ 253{
230 if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1) 254 if (vmx->vcpu.cpu == -1)
231 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, 255 return;
232 vmx, 0, 1); 256 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
233 else
234 __vcpu_clear(vmx);
235 vmx->launched = 0; 257 vmx->launched = 0;
236} 258}
237 259
@@ -275,7 +297,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
275 u8 error; 297 u8 error;
276 298
277 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" 299 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
278 : "=q"(error) : "a"(value), "d"(field) : "cc" ); 300 : "=q"(error) : "a"(value), "d"(field) : "cc");
279 if (unlikely(error)) 301 if (unlikely(error))
280 vmwrite_error(field, value); 302 vmwrite_error(field, value);
281} 303}
@@ -315,12 +337,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
315{ 337{
316 u32 eb; 338 u32 eb;
317 339
318 eb = 1u << PF_VECTOR; 340 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
319 if (!vcpu->fpu_active) 341 if (!vcpu->fpu_active)
320 eb |= 1u << NM_VECTOR; 342 eb |= 1u << NM_VECTOR;
321 if (vcpu->guest_debug.enabled) 343 if (vcpu->guest_debug.enabled)
322 eb |= 1u << 1; 344 eb |= 1u << 1;
323 if (vcpu->rmode.active) 345 if (vcpu->arch.rmode.active)
324 eb = ~0; 346 eb = ~0;
325 vmcs_write32(EXCEPTION_BITMAP, eb); 347 vmcs_write32(EXCEPTION_BITMAP, eb);
326} 348}
@@ -344,16 +366,42 @@ static void reload_tss(void)
344 366
345static void load_transition_efer(struct vcpu_vmx *vmx) 367static void load_transition_efer(struct vcpu_vmx *vmx)
346{ 368{
347 u64 trans_efer;
348 int efer_offset = vmx->msr_offset_efer; 369 int efer_offset = vmx->msr_offset_efer;
370 u64 host_efer = vmx->host_msrs[efer_offset].data;
371 u64 guest_efer = vmx->guest_msrs[efer_offset].data;
372 u64 ignore_bits;
349 373
350 trans_efer = vmx->host_msrs[efer_offset].data; 374 if (efer_offset < 0)
351 trans_efer &= ~EFER_SAVE_RESTORE_BITS; 375 return;
352 trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); 376 /*
353 wrmsrl(MSR_EFER, trans_efer); 377 * NX is emulated; LMA and LME handled by hardware; SCE meaninless
378 * outside long mode
379 */
380 ignore_bits = EFER_NX | EFER_SCE;
381#ifdef CONFIG_X86_64
382 ignore_bits |= EFER_LMA | EFER_LME;
383 /* SCE is meaningful only in long mode on Intel */
384 if (guest_efer & EFER_LMA)
385 ignore_bits &= ~(u64)EFER_SCE;
386#endif
387 if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
388 return;
389
390 vmx->host_state.guest_efer_loaded = 1;
391 guest_efer &= ~ignore_bits;
392 guest_efer |= host_efer & ignore_bits;
393 wrmsrl(MSR_EFER, guest_efer);
354 vmx->vcpu.stat.efer_reload++; 394 vmx->vcpu.stat.efer_reload++;
355} 395}
356 396
397static void reload_host_efer(struct vcpu_vmx *vmx)
398{
399 if (vmx->host_state.guest_efer_loaded) {
400 vmx->host_state.guest_efer_loaded = 0;
401 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
402 }
403}
404
357static void vmx_save_host_state(struct kvm_vcpu *vcpu) 405static void vmx_save_host_state(struct kvm_vcpu *vcpu)
358{ 406{
359 struct vcpu_vmx *vmx = to_vmx(vcpu); 407 struct vcpu_vmx *vmx = to_vmx(vcpu);
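
In the hunk above, load_transition_efer() now skips the expensive MSR_EFER write whenever guest and host EFER differ only in bits that hardware or the emulator already handles. A condensed, 64-bit-only sketch of that decision (EFER bit values follow the architectural layout; this is an illustration, not the kernel's exact code):

#include <stdio.h>
#include <stdint.h>

#define EFER_SCE  (1ULL << 0)    /* syscall enable */
#define EFER_LME  (1ULL << 8)    /* long mode enable */
#define EFER_LMA  (1ULL << 10)   /* long mode active */
#define EFER_NX   (1ULL << 11)   /* no-execute enable */

/* Returns 1 when a real MSR_EFER switch is needed on guest entry. */
static int need_efer_switch(uint64_t host_efer, uint64_t guest_efer)
{
        uint64_t ignore_bits = EFER_NX | EFER_SCE;   /* NX emulated */

        ignore_bits |= EFER_LMA | EFER_LME;          /* handled by hardware */
        if (guest_efer & EFER_LMA)
                ignore_bits &= ~EFER_SCE;            /* SCE matters in long mode */

        return (guest_efer & ~ignore_bits) != (host_efer & ~ignore_bits);
}

int main(void)
{
        uint64_t host  = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
        uint64_t guest = EFER_LME | EFER_LMA | EFER_NX;   /* guest clears SCE */

        printf("switch needed: %d\n", need_efer_switch(host, guest)); /* 1 */
        return 0;
}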
@@ -393,14 +441,13 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
393#endif 441#endif
394 442
395#ifdef CONFIG_X86_64 443#ifdef CONFIG_X86_64
396 if (is_long_mode(&vmx->vcpu)) { 444 if (is_long_mode(&vmx->vcpu))
397 save_msrs(vmx->host_msrs + 445 save_msrs(vmx->host_msrs +
398 vmx->msr_offset_kernel_gs_base, 1); 446 vmx->msr_offset_kernel_gs_base, 1);
399 } 447
400#endif 448#endif
401 load_msrs(vmx->guest_msrs, vmx->save_nmsrs); 449 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
402 if (msr_efer_need_save_restore(vmx)) 450 load_transition_efer(vmx);
403 load_transition_efer(vmx);
404} 451}
405 452
406static void vmx_load_host_state(struct vcpu_vmx *vmx) 453static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -410,6 +457,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
410 if (!vmx->host_state.loaded) 457 if (!vmx->host_state.loaded)
411 return; 458 return;
412 459
460 ++vmx->vcpu.stat.host_state_reload;
413 vmx->host_state.loaded = 0; 461 vmx->host_state.loaded = 0;
414 if (vmx->host_state.fs_reload_needed) 462 if (vmx->host_state.fs_reload_needed)
415 load_fs(vmx->host_state.fs_sel); 463 load_fs(vmx->host_state.fs_sel);
@@ -429,8 +477,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
429 reload_tss(); 477 reload_tss();
430 save_msrs(vmx->guest_msrs, vmx->save_nmsrs); 478 save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
431 load_msrs(vmx->host_msrs, vmx->save_nmsrs); 479 load_msrs(vmx->host_msrs, vmx->save_nmsrs);
432 if (msr_efer_need_save_restore(vmx)) 480 reload_host_efer(vmx);
433 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
434} 481}
435 482
436/* 483/*
@@ -480,7 +527,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
480 * Make sure the time stamp counter is monotonous. 527 * Make sure the time stamp counter is monotonous.
481 */ 528 */
482 rdtscll(tsc_this); 529 rdtscll(tsc_this);
483 delta = vcpu->host_tsc - tsc_this; 530 delta = vcpu->arch.host_tsc - tsc_this;
484 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); 531 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
485 } 532 }
486} 533}
@@ -488,7 +535,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
488static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 535static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
489{ 536{
490 vmx_load_host_state(to_vmx(vcpu)); 537 vmx_load_host_state(to_vmx(vcpu));
491 kvm_put_guest_fpu(vcpu);
492} 538}
493 539
494static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 540static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -497,7 +543,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
497 return; 543 return;
498 vcpu->fpu_active = 1; 544 vcpu->fpu_active = 1;
499 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); 545 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
500 if (vcpu->cr0 & X86_CR0_TS) 546 if (vcpu->arch.cr0 & X86_CR0_TS)
501 vmcs_set_bits(GUEST_CR0, X86_CR0_TS); 547 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
502 update_exception_bitmap(vcpu); 548 update_exception_bitmap(vcpu);
503} 549}
@@ -523,7 +569,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
523 569
524static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 570static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
525{ 571{
526 if (vcpu->rmode.active) 572 if (vcpu->arch.rmode.active)
527 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 573 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
528 vmcs_writel(GUEST_RFLAGS, rflags); 574 vmcs_writel(GUEST_RFLAGS, rflags);
529} 575}
@@ -545,19 +591,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
545 if (interruptibility & 3) 591 if (interruptibility & 3)
546 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 592 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
547 interruptibility & ~3); 593 interruptibility & ~3);
548 vcpu->interrupt_window_open = 1; 594 vcpu->arch.interrupt_window_open = 1;
549} 595}
550 596
551static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) 597static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
598 bool has_error_code, u32 error_code)
552{ 599{
553 printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
554 vmcs_readl(GUEST_RIP));
555 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
556 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 600 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
557 GP_VECTOR | 601 nr | INTR_TYPE_EXCEPTION
558 INTR_TYPE_EXCEPTION | 602 | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
559 INTR_INFO_DELIEVER_CODE_MASK | 603 | INTR_INFO_VALID_MASK);
560 INTR_INFO_VALID_MASK); 604 if (has_error_code)
605 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
606}
607
608static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
609{
610 struct vcpu_vmx *vmx = to_vmx(vcpu);
611
612 return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
561} 613}
562 614
563/* 615/*
@@ -608,7 +660,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
608 * if efer.sce is enabled. 660 * if efer.sce is enabled.
609 */ 661 */
610 index = __find_msr_index(vmx, MSR_K6_STAR); 662 index = __find_msr_index(vmx, MSR_K6_STAR);
611 if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE)) 663 if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
612 move_msr_up(vmx, index, save_nmsrs++); 664 move_msr_up(vmx, index, save_nmsrs++);
613 } 665 }
614#endif 666#endif
@@ -712,8 +764,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
712#ifdef CONFIG_X86_64 764#ifdef CONFIG_X86_64
713 case MSR_EFER: 765 case MSR_EFER:
714 ret = kvm_set_msr_common(vcpu, msr_index, data); 766 ret = kvm_set_msr_common(vcpu, msr_index, data);
715 if (vmx->host_state.loaded) 767 if (vmx->host_state.loaded) {
768 reload_host_efer(vmx);
716 load_transition_efer(vmx); 769 load_transition_efer(vmx);
770 }
717 break; 771 break;
718 case MSR_FS_BASE: 772 case MSR_FS_BASE:
719 vmcs_writel(GUEST_FS_BASE, data); 773 vmcs_writel(GUEST_FS_BASE, data);
@@ -750,12 +804,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
750 804
751/* 805/*
752 * Sync the rsp and rip registers into the vcpu structure. This allows 806 * Sync the rsp and rip registers into the vcpu structure. This allows
753 * registers to be accessed by indexing vcpu->regs. 807 * registers to be accessed by indexing vcpu->arch.regs.
754 */ 808 */
755static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) 809static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
756{ 810{
757 vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 811 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
758 vcpu->rip = vmcs_readl(GUEST_RIP); 812 vcpu->arch.rip = vmcs_readl(GUEST_RIP);
759} 813}
760 814
761/* 815/*
@@ -764,8 +818,8 @@ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
764 */ 818 */
765static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) 819static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
766{ 820{
767 vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); 821 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
768 vmcs_writel(GUEST_RIP, vcpu->rip); 822 vmcs_writel(GUEST_RIP, vcpu->arch.rip);
769} 823}
770 824
771static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) 825static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -808,14 +862,15 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
808 862
809static int vmx_get_irq(struct kvm_vcpu *vcpu) 863static int vmx_get_irq(struct kvm_vcpu *vcpu)
810{ 864{
865 struct vcpu_vmx *vmx = to_vmx(vcpu);
811 u32 idtv_info_field; 866 u32 idtv_info_field;
812 867
813 idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); 868 idtv_info_field = vmx->idt_vectoring_info;
814 if (idtv_info_field & INTR_INFO_VALID_MASK) { 869 if (idtv_info_field & INTR_INFO_VALID_MASK) {
815 if (is_external_interrupt(idtv_info_field)) 870 if (is_external_interrupt(idtv_info_field))
816 return idtv_info_field & VECTORING_INFO_VECTOR_MASK; 871 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
817 else 872 else
818 printk("pending exception: not handled yet\n"); 873 printk(KERN_DEBUG "pending exception: not handled yet\n");
819 } 874 }
820 return -1; 875 return -1;
821} 876}
@@ -863,7 +918,7 @@ static void hardware_disable(void *garbage)
863} 918}
864 919
865static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 920static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
866 u32 msr, u32* result) 921 u32 msr, u32 *result)
867{ 922{
868 u32 vmx_msr_low, vmx_msr_high; 923 u32 vmx_msr_low, vmx_msr_high;
869 u32 ctl = ctl_min | ctl_opt; 924 u32 ctl = ctl_min | ctl_opt;
@@ -887,6 +942,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
887 u32 min, opt; 942 u32 min, opt;
888 u32 _pin_based_exec_control = 0; 943 u32 _pin_based_exec_control = 0;
889 u32 _cpu_based_exec_control = 0; 944 u32 _cpu_based_exec_control = 0;
945 u32 _cpu_based_2nd_exec_control = 0;
890 u32 _vmexit_control = 0; 946 u32 _vmexit_control = 0;
891 u32 _vmentry_control = 0; 947 u32 _vmentry_control = 0;
892 948
@@ -904,11 +960,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
904 CPU_BASED_USE_IO_BITMAPS | 960 CPU_BASED_USE_IO_BITMAPS |
905 CPU_BASED_MOV_DR_EXITING | 961 CPU_BASED_MOV_DR_EXITING |
906 CPU_BASED_USE_TSC_OFFSETING; 962 CPU_BASED_USE_TSC_OFFSETING;
907#ifdef CONFIG_X86_64 963 opt = CPU_BASED_TPR_SHADOW |
908 opt = CPU_BASED_TPR_SHADOW; 964 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
909#else
910 opt = 0;
911#endif
912 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 965 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
913 &_cpu_based_exec_control) < 0) 966 &_cpu_based_exec_control) < 0)
914 return -EIO; 967 return -EIO;
@@ -917,6 +970,19 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
917 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & 970 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
918 ~CPU_BASED_CR8_STORE_EXITING; 971 ~CPU_BASED_CR8_STORE_EXITING;
919#endif 972#endif
973 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
974 min = 0;
975 opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
976 SECONDARY_EXEC_WBINVD_EXITING;
977 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
978 &_cpu_based_2nd_exec_control) < 0)
979 return -EIO;
980 }
981#ifndef CONFIG_X86_64
982 if (!(_cpu_based_2nd_exec_control &
983 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
984 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
985#endif
920 986
921 min = 0; 987 min = 0;
922#ifdef CONFIG_X86_64 988#ifdef CONFIG_X86_64
@@ -954,6 +1020,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
954 1020
955 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 1021 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
956 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 1022 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
1023 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
957 vmcs_conf->vmexit_ctrl = _vmexit_control; 1024 vmcs_conf->vmexit_ctrl = _vmexit_control;
958 vmcs_conf->vmentry_ctrl = _vmentry_control; 1025 vmcs_conf->vmentry_ctrl = _vmentry_control;
959 1026
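
The hunks above only show the tidied signature of adjust_vmx_controls() and its new callers; what it does with the capability MSR is the usual VMX pattern, where the low half of the MSR lists bits that must be 1 and the high half lists bits that may be 1. A sketch under that assumption, with the MSR read stubbed out:

#include <stdint.h>

typedef uint32_t u32;

/* Hypothetical stand-in for rdmsr() on an MSR_IA32_VMX_*_CTLS register:
 * the low half reports bits that must be 1, the high half bits that may be 1. */
static void read_cap_msr(u32 msr, u32 *low, u32 *high)
{
	(void)msr;
	*low  = 0x00000016;	/* example: a few bits are forced on */
	*high = 0xfff9fffe;	/* example: most optional bits are allowed */
}

static int adjust_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
{
	u32 low, high;
	u32 ctl = ctl_min | ctl_opt;

	read_cap_msr(msr, &low, &high);
	ctl &= high;			/* drop optional bits the CPU refuses */
	ctl |= low;			/* force on the bits the CPU requires */
	if ((ctl & ctl_min) != ctl_min)	/* a mandatory bit was refused */
		return -5;		/* -EIO in the kernel proper */
	*result = ctl;
	return 0;
}
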
@@ -1043,15 +1110,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1043{ 1110{
1044 unsigned long flags; 1111 unsigned long flags;
1045 1112
1046 vcpu->rmode.active = 0; 1113 vcpu->arch.rmode.active = 0;
1047 1114
1048 vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); 1115 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1049 vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); 1116 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
1050 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); 1117 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
1051 1118
1052 flags = vmcs_readl(GUEST_RFLAGS); 1119 flags = vmcs_readl(GUEST_RFLAGS);
1053 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 1120 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1054 flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); 1121 flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
1055 vmcs_writel(GUEST_RFLAGS, flags); 1122 vmcs_writel(GUEST_RFLAGS, flags);
1056 1123
1057 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 1124 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1059,10 +1126,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1059 1126
1060 update_exception_bitmap(vcpu); 1127 update_exception_bitmap(vcpu);
1061 1128
1062 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es); 1129 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1063 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds); 1130 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1064 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs); 1131 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1065 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs); 1132 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1066 1133
1067 vmcs_write16(GUEST_SS_SELECTOR, 0); 1134 vmcs_write16(GUEST_SS_SELECTOR, 0);
1068 vmcs_write32(GUEST_SS_AR_BYTES, 0x93); 1135 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1072,10 +1139,14 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1072 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); 1139 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1073} 1140}
1074 1141
1075static gva_t rmode_tss_base(struct kvm* kvm) 1142static gva_t rmode_tss_base(struct kvm *kvm)
1076{ 1143{
1077 gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; 1144 if (!kvm->arch.tss_addr) {
1078 return base_gfn << PAGE_SHIFT; 1145 gfn_t base_gfn = kvm->memslots[0].base_gfn +
1146 kvm->memslots[0].npages - 3;
1147 return base_gfn << PAGE_SHIFT;
1148 }
1149 return kvm->arch.tss_addr;
1079} 1150}
1080 1151
1081static void fix_rmode_seg(int seg, struct kvm_save_segment *save) 1152static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
@@ -1086,7 +1157,8 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1086 save->base = vmcs_readl(sf->base); 1157 save->base = vmcs_readl(sf->base);
1087 save->limit = vmcs_read32(sf->limit); 1158 save->limit = vmcs_read32(sf->limit);
1088 save->ar = vmcs_read32(sf->ar_bytes); 1159 save->ar = vmcs_read32(sf->ar_bytes);
1089 vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4); 1160 vmcs_write16(sf->selector, save->base >> 4);
1161 vmcs_write32(sf->base, save->base & 0xfffff);
1090 vmcs_write32(sf->limit, 0xffff); 1162 vmcs_write32(sf->limit, 0xffff);
1091 vmcs_write32(sf->ar_bytes, 0xf3); 1163 vmcs_write32(sf->ar_bytes, 0xf3);
1092} 1164}
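
fix_rmode_seg() now derives the vm86 selector from the cached base instead of re-reading it from the VMCS, and clips the base to the 20 bits real mode can address. The relationship it relies on is simply base = selector << 4; a quick worked check:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t base = 0xf0000;		/* e.g. the BIOS code segment */
	uint16_t selector = base >> 4;		/* 0xf000 */

	assert(selector == 0xf000);
	assert((uint32_t)selector << 4 == (base & 0xfffff));
	return 0;
}
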
@@ -1095,19 +1167,20 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1095{ 1167{
1096 unsigned long flags; 1168 unsigned long flags;
1097 1169
1098 vcpu->rmode.active = 1; 1170 vcpu->arch.rmode.active = 1;
1099 1171
1100 vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1172 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1101 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1173 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1102 1174
1103 vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); 1175 vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1104 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 1176 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1105 1177
1106 vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); 1178 vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1107 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 1179 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1108 1180
1109 flags = vmcs_readl(GUEST_RFLAGS); 1181 flags = vmcs_readl(GUEST_RFLAGS);
1110 vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1182 vcpu->arch.rmode.save_iopl
1183 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1111 1184
1112 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1185 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1113 1186
@@ -1125,10 +1198,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1125 vmcs_writel(GUEST_CS_BASE, 0xf0000); 1198 vmcs_writel(GUEST_CS_BASE, 0xf0000);
1126 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); 1199 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1127 1200
1128 fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); 1201 fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1129 fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); 1202 fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1130 fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); 1203 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1131 fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); 1204 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1132 1205
1133 kvm_mmu_reset_context(vcpu); 1206 kvm_mmu_reset_context(vcpu);
1134 init_rmode_tss(vcpu->kvm); 1207 init_rmode_tss(vcpu->kvm);
@@ -1149,7 +1222,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1149 | AR_TYPE_BUSY_64_TSS); 1222 | AR_TYPE_BUSY_64_TSS);
1150 } 1223 }
1151 1224
1152 vcpu->shadow_efer |= EFER_LMA; 1225 vcpu->arch.shadow_efer |= EFER_LMA;
1153 1226
1154 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; 1227 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
1155 vmcs_write32(VM_ENTRY_CONTROLS, 1228 vmcs_write32(VM_ENTRY_CONTROLS,
@@ -1159,7 +1232,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1159 1232
1160static void exit_lmode(struct kvm_vcpu *vcpu) 1233static void exit_lmode(struct kvm_vcpu *vcpu)
1161{ 1234{
1162 vcpu->shadow_efer &= ~EFER_LMA; 1235 vcpu->arch.shadow_efer &= ~EFER_LMA;
1163 1236
1164 vmcs_write32(VM_ENTRY_CONTROLS, 1237 vmcs_write32(VM_ENTRY_CONTROLS,
1165 vmcs_read32(VM_ENTRY_CONTROLS) 1238 vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1170,22 +1243,22 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1170 1243
1171static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1244static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1172{ 1245{
1173 vcpu->cr4 &= KVM_GUEST_CR4_MASK; 1246 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
1174 vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; 1247 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
1175} 1248}
1176 1249
1177static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1250static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1178{ 1251{
1179 vmx_fpu_deactivate(vcpu); 1252 vmx_fpu_deactivate(vcpu);
1180 1253
1181 if (vcpu->rmode.active && (cr0 & X86_CR0_PE)) 1254 if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
1182 enter_pmode(vcpu); 1255 enter_pmode(vcpu);
1183 1256
1184 if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE)) 1257 if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
1185 enter_rmode(vcpu); 1258 enter_rmode(vcpu);
1186 1259
1187#ifdef CONFIG_X86_64 1260#ifdef CONFIG_X86_64
1188 if (vcpu->shadow_efer & EFER_LME) { 1261 if (vcpu->arch.shadow_efer & EFER_LME) {
1189 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 1262 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1190 enter_lmode(vcpu); 1263 enter_lmode(vcpu);
1191 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 1264 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1196,7 +1269,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1196 vmcs_writel(CR0_READ_SHADOW, cr0); 1269 vmcs_writel(CR0_READ_SHADOW, cr0);
1197 vmcs_writel(GUEST_CR0, 1270 vmcs_writel(GUEST_CR0,
1198 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); 1271 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
1199 vcpu->cr0 = cr0; 1272 vcpu->arch.cr0 = cr0;
1200 1273
1201 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) 1274 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1202 vmx_fpu_activate(vcpu); 1275 vmx_fpu_activate(vcpu);
@@ -1205,16 +1278,16 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1205static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 1278static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1206{ 1279{
1207 vmcs_writel(GUEST_CR3, cr3); 1280 vmcs_writel(GUEST_CR3, cr3);
1208 if (vcpu->cr0 & X86_CR0_PE) 1281 if (vcpu->arch.cr0 & X86_CR0_PE)
1209 vmx_fpu_deactivate(vcpu); 1282 vmx_fpu_deactivate(vcpu);
1210} 1283}
1211 1284
1212static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1285static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1213{ 1286{
1214 vmcs_writel(CR4_READ_SHADOW, cr4); 1287 vmcs_writel(CR4_READ_SHADOW, cr4);
1215 vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? 1288 vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
1216 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); 1289 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
1217 vcpu->cr4 = cr4; 1290 vcpu->arch.cr4 = cr4;
1218} 1291}
1219 1292
1220#ifdef CONFIG_X86_64 1293#ifdef CONFIG_X86_64
@@ -1224,7 +1297,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1224 struct vcpu_vmx *vmx = to_vmx(vcpu); 1297 struct vcpu_vmx *vmx = to_vmx(vcpu);
1225 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 1298 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1226 1299
1227 vcpu->shadow_efer = efer; 1300 vcpu->arch.shadow_efer = efer;
1228 if (efer & EFER_LMA) { 1301 if (efer & EFER_LMA) {
1229 vmcs_write32(VM_ENTRY_CONTROLS, 1302 vmcs_write32(VM_ENTRY_CONTROLS,
1230 vmcs_read32(VM_ENTRY_CONTROLS) | 1303 vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1301,17 +1374,17 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
1301 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 1374 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1302 u32 ar; 1375 u32 ar;
1303 1376
1304 if (vcpu->rmode.active && seg == VCPU_SREG_TR) { 1377 if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
1305 vcpu->rmode.tr.selector = var->selector; 1378 vcpu->arch.rmode.tr.selector = var->selector;
1306 vcpu->rmode.tr.base = var->base; 1379 vcpu->arch.rmode.tr.base = var->base;
1307 vcpu->rmode.tr.limit = var->limit; 1380 vcpu->arch.rmode.tr.limit = var->limit;
1308 vcpu->rmode.tr.ar = vmx_segment_access_rights(var); 1381 vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
1309 return; 1382 return;
1310 } 1383 }
1311 vmcs_writel(sf->base, var->base); 1384 vmcs_writel(sf->base, var->base);
1312 vmcs_write32(sf->limit, var->limit); 1385 vmcs_write32(sf->limit, var->limit);
1313 vmcs_write16(sf->selector, var->selector); 1386 vmcs_write16(sf->selector, var->selector);
1314 if (vcpu->rmode.active && var->s) { 1387 if (vcpu->arch.rmode.active && var->s) {
1315 /* 1388 /*
1316 * Hack real-mode segments into vm86 compatibility. 1389 * Hack real-mode segments into vm86 compatibility.
1317 */ 1390 */
@@ -1355,36 +1428,38 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1355 vmcs_writel(GUEST_GDTR_BASE, dt->base); 1428 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1356} 1429}
1357 1430
1358static int init_rmode_tss(struct kvm* kvm) 1431static int init_rmode_tss(struct kvm *kvm)
1359{ 1432{
1360 struct page *p1, *p2, *p3;
1361 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 1433 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
1362 char *page; 1434 u16 data = 0;
1363 1435 int ret = 0;
1364 p1 = gfn_to_page(kvm, fn++); 1436 int r;
1365 p2 = gfn_to_page(kvm, fn++);
1366 p3 = gfn_to_page(kvm, fn);
1367
1368 if (!p1 || !p2 || !p3) {
1369 kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
1370 return 0;
1371 }
1372
1373 page = kmap_atomic(p1, KM_USER0);
1374 clear_page(page);
1375 *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1376 kunmap_atomic(page, KM_USER0);
1377
1378 page = kmap_atomic(p2, KM_USER0);
1379 clear_page(page);
1380 kunmap_atomic(page, KM_USER0);
1381 1437
1382 page = kmap_atomic(p3, KM_USER0); 1438 down_read(&current->mm->mmap_sem);
1383 clear_page(page); 1439 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1384 *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; 1440 if (r < 0)
1385 kunmap_atomic(page, KM_USER0); 1441 goto out;
1442 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1443 r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
1444 if (r < 0)
1445 goto out;
1446 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
1447 if (r < 0)
1448 goto out;
1449 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1450 if (r < 0)
1451 goto out;
1452 data = ~0;
1453 r = kvm_write_guest_page(kvm, fn, &data,
1454 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
1455 sizeof(u8));
1456 if (r < 0)
1457 goto out;
1386 1458
1387 return 1; 1459 ret = 1;
1460out:
1461 up_read(&current->mm->mmap_sem);
1462 return ret;
1388} 1463}
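
The rewritten init_rmode_tss() writes through the guest-page helpers rather than mapping the pages itself, but the layout is unchanged: three cleared pages, the 16-bit I/O-map base stored at offset 0x66 of the TSS, and an all-ones terminator byte at the end of the bitmap area. A local-buffer sketch of that layout, assuming the usual 0x68-byte TSS base, 32-byte interrupt-redirection map and 8 KiB I/O bitmap behind RMODE_TSS_SIZE:

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE	4096
/* Sizes assumed for illustration: 0x68-byte TSS, 32-byte redirection map,
 * 8 KiB I/O bitmap plus one terminator byte -- what RMODE_TSS_SIZE covers. */
#define TSS_BASE	0x68
#define REDIR_SIZE	(256 / 8)
#define IOPB_SIZE	(65536 / 8)
#define RM_TSS_SIZE	(TSS_BASE + REDIR_SIZE + IOPB_SIZE + 1)

static uint8_t tss[3 * PAGE_SIZE];	/* the three guest pages */

static void build_rmode_tss(void)
{
	memset(tss, 0, sizeof(tss));
	/* 16-bit I/O-map base at offset 0x66, pointing past TSS + redirection map. */
	tss[0x66] = (TSS_BASE + REDIR_SIZE) & 0xff;
	tss[0x67] = (TSS_BASE + REDIR_SIZE) >> 8;
	/* All-ones terminator at the very end of the bitmap area. */
	tss[RM_TSS_SIZE - 1] = 0xff;
}
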
1389 1464
1390static void seg_setup(int seg) 1465static void seg_setup(int seg)
@@ -1397,6 +1472,27 @@ static void seg_setup(int seg)
1397 vmcs_write32(sf->ar_bytes, 0x93); 1472 vmcs_write32(sf->ar_bytes, 0x93);
1398} 1473}
1399 1474
1475static int alloc_apic_access_page(struct kvm *kvm)
1476{
1477 struct kvm_userspace_memory_region kvm_userspace_mem;
1478 int r = 0;
1479
1480 down_write(&current->mm->mmap_sem);
1481 if (kvm->arch.apic_access_page)
1482 goto out;
1483 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
1484 kvm_userspace_mem.flags = 0;
1485 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
1486 kvm_userspace_mem.memory_size = PAGE_SIZE;
1487 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
1488 if (r)
1489 goto out;
1490 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
1491out:
1492 up_write(&current->mm->mmap_sem);
1493 return r;
1494}
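
alloc_apic_access_page() registers a one-page private memslot at the xAPIC MMIO base and then pins the backing page; the gfn handed to gfn_to_page() is just that base divided by the 4 KiB page size:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint64_t apic_base = 0xfee00000ULL;	/* guest_phys_addr above */
	const unsigned page_shift = 12;			/* 4 KiB pages */

	assert((apic_base >> page_shift) == 0xfee00);	/* gfn_to_page(kvm, 0xfee00) */
	return 0;
}
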
1495
1400/* 1496/*
1401 * Sets up the vmcs for emulated real mode. 1497 * Sets up the vmcs for emulated real mode.
1402 */ 1498 */
@@ -1407,92 +1503,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1407 unsigned long a; 1503 unsigned long a;
1408 struct descriptor_table dt; 1504 struct descriptor_table dt;
1409 int i; 1505 int i;
1410 int ret = 0;
1411 unsigned long kvm_vmx_return; 1506 unsigned long kvm_vmx_return;
1412 u64 msr;
1413 u32 exec_control; 1507 u32 exec_control;
1414 1508
1415 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1416 ret = -ENOMEM;
1417 goto out;
1418 }
1419
1420 vmx->vcpu.rmode.active = 0;
1421
1422 vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1423 set_cr8(&vmx->vcpu, 0);
1424 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1425 if (vmx->vcpu.vcpu_id == 0)
1426 msr |= MSR_IA32_APICBASE_BSP;
1427 kvm_set_apic_base(&vmx->vcpu, msr);
1428
1429 fx_init(&vmx->vcpu);
1430
1431 /*
1432 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1433 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1434 */
1435 if (vmx->vcpu.vcpu_id == 0) {
1436 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1437 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1438 } else {
1439 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
1440 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
1441 }
1442 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1443 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1444
1445 seg_setup(VCPU_SREG_DS);
1446 seg_setup(VCPU_SREG_ES);
1447 seg_setup(VCPU_SREG_FS);
1448 seg_setup(VCPU_SREG_GS);
1449 seg_setup(VCPU_SREG_SS);
1450
1451 vmcs_write16(GUEST_TR_SELECTOR, 0);
1452 vmcs_writel(GUEST_TR_BASE, 0);
1453 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1454 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1455
1456 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1457 vmcs_writel(GUEST_LDTR_BASE, 0);
1458 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1459 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1460
1461 vmcs_write32(GUEST_SYSENTER_CS, 0);
1462 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1463 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1464
1465 vmcs_writel(GUEST_RFLAGS, 0x02);
1466 if (vmx->vcpu.vcpu_id == 0)
1467 vmcs_writel(GUEST_RIP, 0xfff0);
1468 else
1469 vmcs_writel(GUEST_RIP, 0);
1470 vmcs_writel(GUEST_RSP, 0);
1471
1472 //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1473 vmcs_writel(GUEST_DR7, 0x400);
1474
1475 vmcs_writel(GUEST_GDTR_BASE, 0);
1476 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1477
1478 vmcs_writel(GUEST_IDTR_BASE, 0);
1479 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1480
1481 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1482 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1483 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1484
1485 /* I/O */ 1509 /* I/O */
1486 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); 1510 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
1487 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); 1511 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
1488 1512
1489 guest_write_tsc(0);
1490
1491 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 1513 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1492 1514
1493 /* Special registers */
1494 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1495
1496 /* Control */ 1515 /* Control */
1497 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, 1516 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1498 vmcs_config.pin_based_exec_ctrl); 1517 vmcs_config.pin_based_exec_ctrl);
@@ -1507,8 +1526,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1507 } 1526 }
1508 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 1527 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1509 1528
1510 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 1529 if (cpu_has_secondary_exec_ctrls()) {
1511 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 1530 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
1531 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1532 exec_control &=
1533 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1534 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
1535 }
1536
1537 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
1538 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
1512 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 1539 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1513 1540
1514 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ 1541 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
@@ -1536,7 +1563,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1536 get_idt(&dt); 1563 get_idt(&dt);
1537 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 1564 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1538 1565
1539 asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); 1566 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
1540 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ 1567 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
1541 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 1568 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1542 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 1569 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
@@ -1567,97 +1594,145 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1567 ++vmx->nmsrs; 1594 ++vmx->nmsrs;
1568 } 1595 }
1569 1596
1570 setup_msrs(vmx);
1571
1572 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); 1597 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1573 1598
1574 /* 22.2.1, 20.8.1 */ 1599 /* 22.2.1, 20.8.1 */
1575 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); 1600 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1576 1601
1577 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1578
1579#ifdef CONFIG_X86_64
1580 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
1581 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
1582 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
1583 page_to_phys(vmx->vcpu.apic->regs_page));
1584 vmcs_write32(TPR_THRESHOLD, 0);
1585#endif
1586
1587 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 1602 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
1588 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); 1603 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1589 1604
1590 vmx->vcpu.cr0 = 0x60000010; 1605 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1591 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode 1606 if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
1592 vmx_set_cr4(&vmx->vcpu, 0); 1607 return -ENOMEM;
1593#ifdef CONFIG_X86_64
1594 vmx_set_efer(&vmx->vcpu, 0);
1595#endif
1596 vmx_fpu_activate(&vmx->vcpu);
1597 update_exception_bitmap(&vmx->vcpu);
1598 1608
1599 return 0; 1609 return 0;
1600
1601out:
1602 return ret;
1603} 1610}
1604 1611
1605static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) 1612static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1606{ 1613{
1607 struct vcpu_vmx *vmx = to_vmx(vcpu); 1614 struct vcpu_vmx *vmx = to_vmx(vcpu);
1615 u64 msr;
1616 int ret;
1608 1617
1609 vmx_vcpu_setup(vmx); 1618 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1610} 1619 ret = -ENOMEM;
1611 1620 goto out;
1612static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1613{
1614 u16 ent[2];
1615 u16 cs;
1616 u16 ip;
1617 unsigned long flags;
1618 unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
1619 u16 sp = vmcs_readl(GUEST_RSP);
1620 u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
1621
1622 if (sp > ss_limit || sp < 6 ) {
1623 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
1624 __FUNCTION__,
1625 vmcs_readl(GUEST_RSP),
1626 vmcs_readl(GUEST_SS_BASE),
1627 vmcs_read32(GUEST_SS_LIMIT));
1628 return;
1629 } 1621 }
1630 1622
1631 if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) != 1623 vmx->vcpu.arch.rmode.active = 0;
1632 X86EMUL_CONTINUE) { 1624
1633 vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); 1625 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1634 return; 1626 set_cr8(&vmx->vcpu, 0);
1627 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1628 if (vmx->vcpu.vcpu_id == 0)
1629 msr |= MSR_IA32_APICBASE_BSP;
1630 kvm_set_apic_base(&vmx->vcpu, msr);
1631
1632 fx_init(&vmx->vcpu);
1633
1634 /*
1635 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1636 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1637 */
1638 if (vmx->vcpu.vcpu_id == 0) {
1639 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1640 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1641 } else {
1642 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
1643 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
1635 } 1644 }
1645 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1646 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1647
1648 seg_setup(VCPU_SREG_DS);
1649 seg_setup(VCPU_SREG_ES);
1650 seg_setup(VCPU_SREG_FS);
1651 seg_setup(VCPU_SREG_GS);
1652 seg_setup(VCPU_SREG_SS);
1653
1654 vmcs_write16(GUEST_TR_SELECTOR, 0);
1655 vmcs_writel(GUEST_TR_BASE, 0);
1656 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1657 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1636 1658
1637 flags = vmcs_readl(GUEST_RFLAGS); 1659 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1638 cs = vmcs_readl(GUEST_CS_BASE) >> 4; 1660 vmcs_writel(GUEST_LDTR_BASE, 0);
1639 ip = vmcs_readl(GUEST_RIP); 1661 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1662 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1640 1663
1664 vmcs_write32(GUEST_SYSENTER_CS, 0);
1665 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1666 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1641 1667
1642 if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE || 1668 vmcs_writel(GUEST_RFLAGS, 0x02);
1643 emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE || 1669 if (vmx->vcpu.vcpu_id == 0)
1644 emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) { 1670 vmcs_writel(GUEST_RIP, 0xfff0);
1645 vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); 1671 else
1646 return; 1672 vmcs_writel(GUEST_RIP, 0);
1673 vmcs_writel(GUEST_RSP, 0);
1674
1675 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
1676 vmcs_writel(GUEST_DR7, 0x400);
1677
1678 vmcs_writel(GUEST_GDTR_BASE, 0);
1679 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1680
1681 vmcs_writel(GUEST_IDTR_BASE, 0);
1682 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1683
1684 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1685 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1686 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1687
1688 guest_write_tsc(0);
1689
1690 /* Special registers */
1691 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1692
1693 setup_msrs(vmx);
1694
1695 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1696
1697 if (cpu_has_vmx_tpr_shadow()) {
1698 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
1699 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
1700 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
1701 page_to_phys(vmx->vcpu.arch.apic->regs_page));
1702 vmcs_write32(TPR_THRESHOLD, 0);
1647 } 1703 }
1648 1704
1649 vmcs_writel(GUEST_RFLAGS, flags & 1705 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1650 ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); 1706 vmcs_write64(APIC_ACCESS_ADDR,
1651 vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ; 1707 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
1652 vmcs_writel(GUEST_CS_BASE, ent[1] << 4); 1708
1653 vmcs_writel(GUEST_RIP, ent[0]); 1709 vmx->vcpu.arch.cr0 = 0x60000010;
1654 vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); 1710 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
1711 vmx_set_cr4(&vmx->vcpu, 0);
1712#ifdef CONFIG_X86_64
1713 vmx_set_efer(&vmx->vcpu, 0);
1714#endif
1715 vmx_fpu_activate(&vmx->vcpu);
1716 update_exception_bitmap(&vmx->vcpu);
1717
1718 return 0;
1719
1720out:
1721 return ret;
1655} 1722}
1656 1723
1657static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) 1724static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1658{ 1725{
1659 if (vcpu->rmode.active) { 1726 struct vcpu_vmx *vmx = to_vmx(vcpu);
1660 inject_rmode_irq(vcpu, irq); 1727
1728 if (vcpu->arch.rmode.active) {
1729 vmx->rmode.irq.pending = true;
1730 vmx->rmode.irq.vector = irq;
1731 vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
1732 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1733 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
1734 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
1735 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
1661 return; 1736 return;
1662 } 1737 }
1663 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 1738 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -1666,13 +1741,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1666 1741
1667static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 1742static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1668{ 1743{
1669 int word_index = __ffs(vcpu->irq_summary); 1744 int word_index = __ffs(vcpu->arch.irq_summary);
1670 int bit_index = __ffs(vcpu->irq_pending[word_index]); 1745 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1671 int irq = word_index * BITS_PER_LONG + bit_index; 1746 int irq = word_index * BITS_PER_LONG + bit_index;
1672 1747
1673 clear_bit(bit_index, &vcpu->irq_pending[word_index]); 1748 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1674 if (!vcpu->irq_pending[word_index]) 1749 if (!vcpu->arch.irq_pending[word_index])
1675 clear_bit(word_index, &vcpu->irq_summary); 1750 clear_bit(word_index, &vcpu->arch.irq_summary);
1676 vmx_inject_irq(vcpu, irq); 1751 vmx_inject_irq(vcpu, irq);
1677} 1752}
1678 1753
@@ -1682,12 +1757,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1682{ 1757{
1683 u32 cpu_based_vm_exec_control; 1758 u32 cpu_based_vm_exec_control;
1684 1759
1685 vcpu->interrupt_window_open = 1760 vcpu->arch.interrupt_window_open =
1686 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 1761 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
1687 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); 1762 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
1688 1763
1689 if (vcpu->interrupt_window_open && 1764 if (vcpu->arch.interrupt_window_open &&
1690 vcpu->irq_summary && 1765 vcpu->arch.irq_summary &&
1691 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) 1766 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1692 /* 1767 /*
1693 * If interrupts enabled, and not blocked by sti or mov ss. Good. 1768 * If interrupts enabled, and not blocked by sti or mov ss. Good.
@@ -1695,8 +1770,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1695 kvm_do_inject_irq(vcpu); 1770 kvm_do_inject_irq(vcpu);
1696 1771
1697 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 1772 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
1698 if (!vcpu->interrupt_window_open && 1773 if (!vcpu->arch.interrupt_window_open &&
1699 (vcpu->irq_summary || kvm_run->request_interrupt_window)) 1774 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
1700 /* 1775 /*
1701 * Interrupts blocked. Wait for unblock. 1776 * Interrupts blocked. Wait for unblock.
1702 */ 1777 */
@@ -1706,6 +1781,23 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1706 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 1781 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
1707} 1782}
1708 1783
1784static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
1785{
1786 int ret;
1787 struct kvm_userspace_memory_region tss_mem = {
1788 .slot = 8,
1789 .guest_phys_addr = addr,
1790 .memory_size = PAGE_SIZE * 3,
1791 .flags = 0,
1792 };
1793
1794 ret = kvm_set_memory_region(kvm, &tss_mem, 0);
1795 if (ret)
1796 return ret;
1797 kvm->arch.tss_addr = addr;
1798 return 0;
1799}
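
vmx_set_tss_addr() supplies the kernel half of the KVM_SET_TSS_ADDR vm ioctl: userspace reserves a three-page hole in guest physical memory for the real-mode TSS instead of relying on the top of slot 0. A minimal userspace sketch, assuming the standard /dev/kvm flow and an address (0xfffbd000 here) that the guest does not otherwise use:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);

	if (kvm < 0 || vm < 0) {
		perror("kvm setup");
		return 1;
	}
	/* Three pages from this address become the private real-mode TSS slot. */
	if (ioctl(vm, KVM_SET_TSS_ADDR, 0xfffbd000UL) < 0)
		perror("KVM_SET_TSS_ADDR");
	return 0;
}
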
1800
1709static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) 1801static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1710{ 1802{
1711 struct kvm_guest_debug *dbg = &vcpu->guest_debug; 1803 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
@@ -1727,7 +1819,7 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1727static int handle_rmode_exception(struct kvm_vcpu *vcpu, 1819static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1728 int vec, u32 err_code) 1820 int vec, u32 err_code)
1729{ 1821{
1730 if (!vcpu->rmode.active) 1822 if (!vcpu->arch.rmode.active)
1731 return 0; 1823 return 0;
1732 1824
1733 /* 1825 /*
@@ -1735,32 +1827,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1735 * Cause the #SS fault with 0 error code in VM86 mode. 1827 * Cause the #SS fault with 0 error code in VM86 mode.
1736 */ 1828 */
1737 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 1829 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
1738 if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) 1830 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
1739 return 1; 1831 return 1;
1740 return 0; 1832 return 0;
1741} 1833}
1742 1834
1743static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1835static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1744{ 1836{
1837 struct vcpu_vmx *vmx = to_vmx(vcpu);
1745 u32 intr_info, error_code; 1838 u32 intr_info, error_code;
1746 unsigned long cr2, rip; 1839 unsigned long cr2, rip;
1747 u32 vect_info; 1840 u32 vect_info;
1748 enum emulation_result er; 1841 enum emulation_result er;
1749 int r;
1750 1842
1751 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 1843 vect_info = vmx->idt_vectoring_info;
1752 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 1844 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1753 1845
1754 if ((vect_info & VECTORING_INFO_VALID_MASK) && 1846 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1755 !is_page_fault(intr_info)) { 1847 !is_page_fault(intr_info))
1756 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 1848 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1757 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); 1849 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1758 }
1759 1850
1760 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { 1851 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
1761 int irq = vect_info & VECTORING_INFO_VECTOR_MASK; 1852 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1762 set_bit(irq, vcpu->irq_pending); 1853 set_bit(irq, vcpu->arch.irq_pending);
1763 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); 1854 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1764 } 1855 }
1765 1856
1766 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ 1857 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
@@ -1771,52 +1862,34 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1771 return 1; 1862 return 1;
1772 } 1863 }
1773 1864
1865 if (is_invalid_opcode(intr_info)) {
1866 er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
1867 if (er != EMULATE_DONE)
1868 kvm_queue_exception(vcpu, UD_VECTOR);
1869 return 1;
1870 }
1871
1774 error_code = 0; 1872 error_code = 0;
1775 rip = vmcs_readl(GUEST_RIP); 1873 rip = vmcs_readl(GUEST_RIP);
1776 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) 1874 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1777 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 1875 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1778 if (is_page_fault(intr_info)) { 1876 if (is_page_fault(intr_info)) {
1779 cr2 = vmcs_readl(EXIT_QUALIFICATION); 1877 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1780 1878 return kvm_mmu_page_fault(vcpu, cr2, error_code);
1781 mutex_lock(&vcpu->kvm->lock);
1782 r = kvm_mmu_page_fault(vcpu, cr2, error_code);
1783 if (r < 0) {
1784 mutex_unlock(&vcpu->kvm->lock);
1785 return r;
1786 }
1787 if (!r) {
1788 mutex_unlock(&vcpu->kvm->lock);
1789 return 1;
1790 }
1791
1792 er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
1793 mutex_unlock(&vcpu->kvm->lock);
1794
1795 switch (er) {
1796 case EMULATE_DONE:
1797 return 1;
1798 case EMULATE_DO_MMIO:
1799 ++vcpu->stat.mmio_exits;
1800 return 0;
1801 case EMULATE_FAIL:
1802 kvm_report_emulation_failure(vcpu, "pagetable");
1803 break;
1804 default:
1805 BUG();
1806 }
1807 } 1879 }
1808 1880
1809 if (vcpu->rmode.active && 1881 if (vcpu->arch.rmode.active &&
1810 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, 1882 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1811 error_code)) { 1883 error_code)) {
1812 if (vcpu->halt_request) { 1884 if (vcpu->arch.halt_request) {
1813 vcpu->halt_request = 0; 1885 vcpu->arch.halt_request = 0;
1814 return kvm_emulate_halt(vcpu); 1886 return kvm_emulate_halt(vcpu);
1815 } 1887 }
1816 return 1; 1888 return 1;
1817 } 1889 }
1818 1890
1819 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { 1891 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
1892 (INTR_TYPE_EXCEPTION | 1)) {
1820 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1893 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1821 return 0; 1894 return 0;
1822 } 1895 }
@@ -1850,7 +1923,8 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1850 string = (exit_qualification & 16) != 0; 1923 string = (exit_qualification & 16) != 0;
1851 1924
1852 if (string) { 1925 if (string) {
1853 if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) 1926 if (emulate_instruction(vcpu,
1927 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
1854 return 0; 1928 return 0;
1855 return 1; 1929 return 1;
1856 } 1930 }
@@ -1873,7 +1947,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1873 hypercall[0] = 0x0f; 1947 hypercall[0] = 0x0f;
1874 hypercall[1] = 0x01; 1948 hypercall[1] = 0x01;
1875 hypercall[2] = 0xc1; 1949 hypercall[2] = 0xc1;
1876 hypercall[3] = 0xc3;
1877} 1950}
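
The hypercall patching above now emits only the three-byte VMCALL opcode; the fourth byte it used to append, 0xc3, encodes a near RET and is no longer written. For reference:

#include <stdint.h>

/* Instruction bytes emitted by vmx_patch_hypercall() after this change. */
static const uint8_t vmcall_insn[] = { 0x0f, 0x01, 0xc1 };	/* vmcall */
/* The dropped fourth byte was 0xc3 (ret). */
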
1878 1951
1879static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1952static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1890,23 +1963,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1890 switch (cr) { 1963 switch (cr) {
1891 case 0: 1964 case 0:
1892 vcpu_load_rsp_rip(vcpu); 1965 vcpu_load_rsp_rip(vcpu);
1893 set_cr0(vcpu, vcpu->regs[reg]); 1966 set_cr0(vcpu, vcpu->arch.regs[reg]);
1894 skip_emulated_instruction(vcpu); 1967 skip_emulated_instruction(vcpu);
1895 return 1; 1968 return 1;
1896 case 3: 1969 case 3:
1897 vcpu_load_rsp_rip(vcpu); 1970 vcpu_load_rsp_rip(vcpu);
1898 set_cr3(vcpu, vcpu->regs[reg]); 1971 set_cr3(vcpu, vcpu->arch.regs[reg]);
1899 skip_emulated_instruction(vcpu); 1972 skip_emulated_instruction(vcpu);
1900 return 1; 1973 return 1;
1901 case 4: 1974 case 4:
1902 vcpu_load_rsp_rip(vcpu); 1975 vcpu_load_rsp_rip(vcpu);
1903 set_cr4(vcpu, vcpu->regs[reg]); 1976 set_cr4(vcpu, vcpu->arch.regs[reg]);
1904 skip_emulated_instruction(vcpu); 1977 skip_emulated_instruction(vcpu);
1905 return 1; 1978 return 1;
1906 case 8: 1979 case 8:
1907 vcpu_load_rsp_rip(vcpu); 1980 vcpu_load_rsp_rip(vcpu);
1908 set_cr8(vcpu, vcpu->regs[reg]); 1981 set_cr8(vcpu, vcpu->arch.regs[reg]);
1909 skip_emulated_instruction(vcpu); 1982 skip_emulated_instruction(vcpu);
1983 if (irqchip_in_kernel(vcpu->kvm))
1984 return 1;
1910 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 1985 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1911 return 0; 1986 return 0;
1912 }; 1987 };
@@ -1914,8 +1989,8 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1914 case 2: /* clts */ 1989 case 2: /* clts */
1915 vcpu_load_rsp_rip(vcpu); 1990 vcpu_load_rsp_rip(vcpu);
1916 vmx_fpu_deactivate(vcpu); 1991 vmx_fpu_deactivate(vcpu);
1917 vcpu->cr0 &= ~X86_CR0_TS; 1992 vcpu->arch.cr0 &= ~X86_CR0_TS;
1918 vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); 1993 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1919 vmx_fpu_activate(vcpu); 1994 vmx_fpu_activate(vcpu);
1920 skip_emulated_instruction(vcpu); 1995 skip_emulated_instruction(vcpu);
1921 return 1; 1996 return 1;
@@ -1923,13 +1998,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1923 switch (cr) { 1998 switch (cr) {
1924 case 3: 1999 case 3:
1925 vcpu_load_rsp_rip(vcpu); 2000 vcpu_load_rsp_rip(vcpu);
1926 vcpu->regs[reg] = vcpu->cr3; 2001 vcpu->arch.regs[reg] = vcpu->arch.cr3;
1927 vcpu_put_rsp_rip(vcpu); 2002 vcpu_put_rsp_rip(vcpu);
1928 skip_emulated_instruction(vcpu); 2003 skip_emulated_instruction(vcpu);
1929 return 1; 2004 return 1;
1930 case 8: 2005 case 8:
1931 vcpu_load_rsp_rip(vcpu); 2006 vcpu_load_rsp_rip(vcpu);
1932 vcpu->regs[reg] = get_cr8(vcpu); 2007 vcpu->arch.regs[reg] = get_cr8(vcpu);
1933 vcpu_put_rsp_rip(vcpu); 2008 vcpu_put_rsp_rip(vcpu);
1934 skip_emulated_instruction(vcpu); 2009 skip_emulated_instruction(vcpu);
1935 return 1; 2010 return 1;
@@ -1975,7 +2050,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1975 default: 2050 default:
1976 val = 0; 2051 val = 0;
1977 } 2052 }
1978 vcpu->regs[reg] = val; 2053 vcpu->arch.regs[reg] = val;
1979 } else { 2054 } else {
1980 /* mov to dr */ 2055 /* mov to dr */
1981 } 2056 }
@@ -1992,29 +2067,29 @@ static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1992 2067
1993static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2068static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1994{ 2069{
1995 u32 ecx = vcpu->regs[VCPU_REGS_RCX]; 2070 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
1996 u64 data; 2071 u64 data;
1997 2072
1998 if (vmx_get_msr(vcpu, ecx, &data)) { 2073 if (vmx_get_msr(vcpu, ecx, &data)) {
1999 vmx_inject_gp(vcpu, 0); 2074 kvm_inject_gp(vcpu, 0);
2000 return 1; 2075 return 1;
2001 } 2076 }
2002 2077
2003 /* FIXME: handling of bits 32:63 of rax, rdx */ 2078 /* FIXME: handling of bits 32:63 of rax, rdx */
2004 vcpu->regs[VCPU_REGS_RAX] = data & -1u; 2079 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
2005 vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; 2080 vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2006 skip_emulated_instruction(vcpu); 2081 skip_emulated_instruction(vcpu);
2007 return 1; 2082 return 1;
2008} 2083}
2009 2084
2010static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2085static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2011{ 2086{
2012 u32 ecx = vcpu->regs[VCPU_REGS_RCX]; 2087 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2013 u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u) 2088 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
2014 | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); 2089 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2015 2090
2016 if (vmx_set_msr(vcpu, ecx, data) != 0) { 2091 if (vmx_set_msr(vcpu, ecx, data) != 0) {
2017 vmx_inject_gp(vcpu, 0); 2092 kvm_inject_gp(vcpu, 0);
2018 return 1; 2093 return 1;
2019 } 2094 }
2020 2095
@@ -2042,7 +2117,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2042 * possible 2117 * possible
2043 */ 2118 */
2044 if (kvm_run->request_interrupt_window && 2119 if (kvm_run->request_interrupt_window &&
2045 !vcpu->irq_summary) { 2120 !vcpu->arch.irq_summary) {
2046 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2121 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2047 ++vcpu->stat.irq_window_exits; 2122 ++vcpu->stat.irq_window_exits;
2048 return 0; 2123 return 0;
@@ -2059,7 +2134,35 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2059static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2134static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2060{ 2135{
2061 skip_emulated_instruction(vcpu); 2136 skip_emulated_instruction(vcpu);
2062 return kvm_hypercall(vcpu, kvm_run); 2137 kvm_emulate_hypercall(vcpu);
2138 return 1;
2139}
2140
2141static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2142{
2143 skip_emulated_instruction(vcpu);
2144 /* TODO: Add support for VT-d/pass-through device */
2145 return 1;
2146}
2147
2148static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2149{
2150 u64 exit_qualification;
2151 enum emulation_result er;
2152 unsigned long offset;
2153
2154 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2155 offset = exit_qualification & 0xffful;
2156
2157 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2158
2159 if (er != EMULATE_DONE) {
2160 printk(KERN_ERR
2161 "Fail to handle apic access vmexit! Offset is 0x%lx\n",
2162 offset);
2163 return -ENOTSUPP;
2164 }
2165 return 1;
2063} 2166}
2064 2167
2065/* 2168/*
@@ -2081,7 +2184,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2081 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 2184 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2082 [EXIT_REASON_HLT] = handle_halt, 2185 [EXIT_REASON_HLT] = handle_halt,
2083 [EXIT_REASON_VMCALL] = handle_vmcall, 2186 [EXIT_REASON_VMCALL] = handle_vmcall,
2084 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold 2187 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
2188 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
2189 [EXIT_REASON_WBINVD] = handle_wbinvd,
2085}; 2190};
2086 2191
2087static const int kvm_vmx_max_exit_handlers = 2192static const int kvm_vmx_max_exit_handlers =
@@ -2093,9 +2198,9 @@ static const int kvm_vmx_max_exit_handlers =
2093 */ 2198 */
2094static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 2199static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2095{ 2200{
2096 u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2097 u32 exit_reason = vmcs_read32(VM_EXIT_REASON); 2201 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
2098 struct vcpu_vmx *vmx = to_vmx(vcpu); 2202 struct vcpu_vmx *vmx = to_vmx(vcpu);
2203 u32 vectoring_info = vmx->idt_vectoring_info;
2099 2204
2100 if (unlikely(vmx->fail)) { 2205 if (unlikely(vmx->fail)) {
2101 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2206 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2104,8 +2209,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2104 return 0; 2209 return 0;
2105 } 2210 }
2106 2211
2107 if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && 2212 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2108 exit_reason != EXIT_REASON_EXCEPTION_NMI ) 2213 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2109 printk(KERN_WARNING "%s: unexpected, valid vectoring info and " 2214 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
2110 "exit reason is 0x%x\n", __FUNCTION__, exit_reason); 2215 "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2111 if (exit_reason < kvm_vmx_max_exit_handlers 2216 if (exit_reason < kvm_vmx_max_exit_handlers
@@ -2150,26 +2255,38 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
2150 2255
2151static void vmx_intr_assist(struct kvm_vcpu *vcpu) 2256static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2152{ 2257{
2258 struct vcpu_vmx *vmx = to_vmx(vcpu);
2153 u32 idtv_info_field, intr_info_field; 2259 u32 idtv_info_field, intr_info_field;
2154 int has_ext_irq, interrupt_window_open; 2260 int has_ext_irq, interrupt_window_open;
2155 int vector; 2261 int vector;
2156 2262
2157 kvm_inject_pending_timer_irqs(vcpu);
2158 update_tpr_threshold(vcpu); 2263 update_tpr_threshold(vcpu);
2159 2264
2160 has_ext_irq = kvm_cpu_has_interrupt(vcpu); 2265 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2161 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 2266 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2162 idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); 2267 idtv_info_field = vmx->idt_vectoring_info;
2163 if (intr_info_field & INTR_INFO_VALID_MASK) { 2268 if (intr_info_field & INTR_INFO_VALID_MASK) {
2164 if (idtv_info_field & INTR_INFO_VALID_MASK) { 2269 if (idtv_info_field & INTR_INFO_VALID_MASK) {
2165 /* TODO: fault when IDT_Vectoring */ 2270 /* TODO: fault when IDT_Vectoring */
2166 printk(KERN_ERR "Fault when IDT_Vectoring\n"); 2271 if (printk_ratelimit())
2272 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2167 } 2273 }
2168 if (has_ext_irq) 2274 if (has_ext_irq)
2169 enable_irq_window(vcpu); 2275 enable_irq_window(vcpu);
2170 return; 2276 return;
2171 } 2277 }
2172 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { 2278 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2279 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2280 == INTR_TYPE_EXT_INTR
2281 && vcpu->arch.rmode.active) {
2282 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2283
2284 vmx_inject_irq(vcpu, vect);
2285 if (unlikely(has_ext_irq))
2286 enable_irq_window(vcpu);
2287 return;
2288 }
2289
2173 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); 2290 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
2174 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2291 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2175 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 2292 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
@@ -2194,6 +2311,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2194 enable_irq_window(vcpu); 2311 enable_irq_window(vcpu);
2195} 2312}
2196 2313
2314/*
2315 * Failure to inject an interrupt should give us the information
2316 * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
2317 * when fetching the interrupt redirection bitmap in the real-mode
2318 * tss, this doesn't happen. So we do it ourselves.
2319 */
2320static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2321{
2322 vmx->rmode.irq.pending = 0;
2323 if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
2324 return;
2325 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
2326 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
2327 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
2328 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
2329 return;
2330 }
2331 vmx->idt_vectoring_info =
2332 VECTORING_INFO_VALID_MASK
2333 | INTR_TYPE_EXT_INTR
2334 | vmx->rmode.irq.vector;
2335}
2336
2197static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2337static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2198{ 2338{
2199 struct vcpu_vmx *vmx = to_vmx(vcpu); 2339 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2204,50 +2344,47 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2204 */ 2344 */
2205 vmcs_writel(HOST_CR0, read_cr0()); 2345 vmcs_writel(HOST_CR0, read_cr0());
2206 2346
2207 asm ( 2347 asm(
2208 /* Store host registers */ 2348 /* Store host registers */
2209#ifdef CONFIG_X86_64 2349#ifdef CONFIG_X86_64
2210 "push %%rax; push %%rbx; push %%rdx;" 2350 "push %%rdx; push %%rbp;"
2211 "push %%rsi; push %%rdi; push %%rbp;"
2212 "push %%r8; push %%r9; push %%r10; push %%r11;"
2213 "push %%r12; push %%r13; push %%r14; push %%r15;"
2214 "push %%rcx \n\t" 2351 "push %%rcx \n\t"
2215 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2216#else 2352#else
2217 "pusha; push %%ecx \n\t" 2353 "push %%edx; push %%ebp;"
2218 ASM_VMX_VMWRITE_RSP_RDX "\n\t" 2354 "push %%ecx \n\t"
2219#endif 2355#endif
2356 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2220 /* Check if vmlaunch of vmresume is needed */ 2357 /* Check if vmlaunch of vmresume is needed */
2221 "cmp $0, %1 \n\t" 2358 "cmpl $0, %c[launched](%0) \n\t"
2222 /* Load guest registers. Don't clobber flags. */ 2359 /* Load guest registers. Don't clobber flags. */
2223#ifdef CONFIG_X86_64 2360#ifdef CONFIG_X86_64
2224 "mov %c[cr2](%3), %%rax \n\t" 2361 "mov %c[cr2](%0), %%rax \n\t"
2225 "mov %%rax, %%cr2 \n\t" 2362 "mov %%rax, %%cr2 \n\t"
2226 "mov %c[rax](%3), %%rax \n\t" 2363 "mov %c[rax](%0), %%rax \n\t"
2227 "mov %c[rbx](%3), %%rbx \n\t" 2364 "mov %c[rbx](%0), %%rbx \n\t"
2228 "mov %c[rdx](%3), %%rdx \n\t" 2365 "mov %c[rdx](%0), %%rdx \n\t"
2229 "mov %c[rsi](%3), %%rsi \n\t" 2366 "mov %c[rsi](%0), %%rsi \n\t"
2230 "mov %c[rdi](%3), %%rdi \n\t" 2367 "mov %c[rdi](%0), %%rdi \n\t"
2231 "mov %c[rbp](%3), %%rbp \n\t" 2368 "mov %c[rbp](%0), %%rbp \n\t"
2232 "mov %c[r8](%3), %%r8 \n\t" 2369 "mov %c[r8](%0), %%r8 \n\t"
2233 "mov %c[r9](%3), %%r9 \n\t" 2370 "mov %c[r9](%0), %%r9 \n\t"
2234 "mov %c[r10](%3), %%r10 \n\t" 2371 "mov %c[r10](%0), %%r10 \n\t"
2235 "mov %c[r11](%3), %%r11 \n\t" 2372 "mov %c[r11](%0), %%r11 \n\t"
2236 "mov %c[r12](%3), %%r12 \n\t" 2373 "mov %c[r12](%0), %%r12 \n\t"
2237 "mov %c[r13](%3), %%r13 \n\t" 2374 "mov %c[r13](%0), %%r13 \n\t"
2238 "mov %c[r14](%3), %%r14 \n\t" 2375 "mov %c[r14](%0), %%r14 \n\t"
2239 "mov %c[r15](%3), %%r15 \n\t" 2376 "mov %c[r15](%0), %%r15 \n\t"
2240 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ 2377 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
2241#else 2378#else
2242 "mov %c[cr2](%3), %%eax \n\t" 2379 "mov %c[cr2](%0), %%eax \n\t"
2243 "mov %%eax, %%cr2 \n\t" 2380 "mov %%eax, %%cr2 \n\t"
2244 "mov %c[rax](%3), %%eax \n\t" 2381 "mov %c[rax](%0), %%eax \n\t"
2245 "mov %c[rbx](%3), %%ebx \n\t" 2382 "mov %c[rbx](%0), %%ebx \n\t"
2246 "mov %c[rdx](%3), %%edx \n\t" 2383 "mov %c[rdx](%0), %%edx \n\t"
2247 "mov %c[rsi](%3), %%esi \n\t" 2384 "mov %c[rsi](%0), %%esi \n\t"
2248 "mov %c[rdi](%3), %%edi \n\t" 2385 "mov %c[rdi](%0), %%edi \n\t"
2249 "mov %c[rbp](%3), %%ebp \n\t" 2386 "mov %c[rbp](%0), %%ebp \n\t"
2250 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ 2387 "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
2251#endif 2388#endif
2252 /* Enter guest mode */ 2389 /* Enter guest mode */
2253 "jne .Llaunched \n\t" 2390 "jne .Llaunched \n\t"
@@ -2257,72 +2394,79 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2257 ".Lkvm_vmx_return: " 2394 ".Lkvm_vmx_return: "
2258 /* Save guest registers, load host registers, keep flags */ 2395 /* Save guest registers, load host registers, keep flags */
2259#ifdef CONFIG_X86_64 2396#ifdef CONFIG_X86_64
2260 "xchg %3, (%%rsp) \n\t" 2397 "xchg %0, (%%rsp) \n\t"
2261 "mov %%rax, %c[rax](%3) \n\t" 2398 "mov %%rax, %c[rax](%0) \n\t"
2262 "mov %%rbx, %c[rbx](%3) \n\t" 2399 "mov %%rbx, %c[rbx](%0) \n\t"
2263 "pushq (%%rsp); popq %c[rcx](%3) \n\t" 2400 "pushq (%%rsp); popq %c[rcx](%0) \n\t"
2264 "mov %%rdx, %c[rdx](%3) \n\t" 2401 "mov %%rdx, %c[rdx](%0) \n\t"
2265 "mov %%rsi, %c[rsi](%3) \n\t" 2402 "mov %%rsi, %c[rsi](%0) \n\t"
2266 "mov %%rdi, %c[rdi](%3) \n\t" 2403 "mov %%rdi, %c[rdi](%0) \n\t"
2267 "mov %%rbp, %c[rbp](%3) \n\t" 2404 "mov %%rbp, %c[rbp](%0) \n\t"
2268 "mov %%r8, %c[r8](%3) \n\t" 2405 "mov %%r8, %c[r8](%0) \n\t"
2269 "mov %%r9, %c[r9](%3) \n\t" 2406 "mov %%r9, %c[r9](%0) \n\t"
2270 "mov %%r10, %c[r10](%3) \n\t" 2407 "mov %%r10, %c[r10](%0) \n\t"
2271 "mov %%r11, %c[r11](%3) \n\t" 2408 "mov %%r11, %c[r11](%0) \n\t"
2272 "mov %%r12, %c[r12](%3) \n\t" 2409 "mov %%r12, %c[r12](%0) \n\t"
2273 "mov %%r13, %c[r13](%3) \n\t" 2410 "mov %%r13, %c[r13](%0) \n\t"
2274 "mov %%r14, %c[r14](%3) \n\t" 2411 "mov %%r14, %c[r14](%0) \n\t"
2275 "mov %%r15, %c[r15](%3) \n\t" 2412 "mov %%r15, %c[r15](%0) \n\t"
2276 "mov %%cr2, %%rax \n\t" 2413 "mov %%cr2, %%rax \n\t"
2277 "mov %%rax, %c[cr2](%3) \n\t" 2414 "mov %%rax, %c[cr2](%0) \n\t"
2278 "mov (%%rsp), %3 \n\t"
2279 2415
2280 "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" 2416 "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
2281 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
2282 "pop %%rbp; pop %%rdi; pop %%rsi;"
2283 "pop %%rdx; pop %%rbx; pop %%rax \n\t"
2284#else 2417#else
2285 "xchg %3, (%%esp) \n\t" 2418 "xchg %0, (%%esp) \n\t"
2286 "mov %%eax, %c[rax](%3) \n\t" 2419 "mov %%eax, %c[rax](%0) \n\t"
2287 "mov %%ebx, %c[rbx](%3) \n\t" 2420 "mov %%ebx, %c[rbx](%0) \n\t"
2288 "pushl (%%esp); popl %c[rcx](%3) \n\t" 2421 "pushl (%%esp); popl %c[rcx](%0) \n\t"
2289 "mov %%edx, %c[rdx](%3) \n\t" 2422 "mov %%edx, %c[rdx](%0) \n\t"
2290 "mov %%esi, %c[rsi](%3) \n\t" 2423 "mov %%esi, %c[rsi](%0) \n\t"
2291 "mov %%edi, %c[rdi](%3) \n\t" 2424 "mov %%edi, %c[rdi](%0) \n\t"
2292 "mov %%ebp, %c[rbp](%3) \n\t" 2425 "mov %%ebp, %c[rbp](%0) \n\t"
2293 "mov %%cr2, %%eax \n\t" 2426 "mov %%cr2, %%eax \n\t"
2294 "mov %%eax, %c[cr2](%3) \n\t" 2427 "mov %%eax, %c[cr2](%0) \n\t"
2295 "mov (%%esp), %3 \n\t"
2296 2428
2297 "pop %%ecx; popa \n\t" 2429 "pop %%ebp; pop %%ebp; pop %%edx \n\t"
2430#endif
2431 "setbe %c[fail](%0) \n\t"
2432 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
2433 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
2434 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
2435 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
2436 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
2437 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
2438 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
2439 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
2440 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
2441 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
2442#ifdef CONFIG_X86_64
2443 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
2444 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
2445 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
2446 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
2447 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
2448 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
2449 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
2450 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
2298#endif 2451#endif
2299 "setbe %0 \n\t" 2452 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
2300 : "=q" (vmx->fail) 2453 : "cc", "memory"
2301 : "r"(vmx->launched), "d"((unsigned long)HOST_RSP),
2302 "c"(vcpu),
2303 [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
2304 [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
2305 [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
2306 [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
2307 [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
2308 [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
2309 [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
2310#ifdef CONFIG_X86_64 2454#ifdef CONFIG_X86_64
2311 [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), 2455 , "rbx", "rdi", "rsi"
2312 [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), 2456 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
2313 [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), 2457#else
2314 [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), 2458 , "ebx", "edi", "rsi"
2315 [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
2316 [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
2317 [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
2318 [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
2319#endif 2459#endif
2320 [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) 2460 );
2321 : "cc", "memory" ); 2461
2462 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2463 if (vmx->rmode.irq.pending)
2464 fixup_rmode_irq(vmx);
2322 2465
2323 vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; 2466 vcpu->arch.interrupt_window_open =
2467 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
2324 2468
2325 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 2469 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2326 vmx->launched = 1; 2470 vmx->launched = 1;
2327 2471
2328 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2472 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
@@ -2332,36 +2476,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2332 asm("int $2"); 2476 asm("int $2");
2333} 2477}
2334 2478
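The rewritten vmx_vcpu_run() asm above stops pushing and popping every general-purpose register by hand: guest state is now addressed through a single base pointer (%0, the vcpu_vmx pointer passed in %rcx/%ecx), the offsets are fed in as "i" (immediate) operands built with offsetof(), and the remaining registers are simply listed as clobbers so the compiler saves them. The %c operand modifier is what turns those immediates into bare displacements. A minimal sketch of that operand pattern, using a made-up structure purely for illustration (not kernel code):

    #include <stddef.h>

    struct demo {
            unsigned long pad;
            unsigned long val;
    };

    /* Load d->val with one base+displacement mov, the same way the
     * vmx asm above indexes struct vcpu_vmx through %0. */
    static unsigned long load_val(struct demo *d)
    {
            unsigned long out;

            /* %c[val] prints the offsetof() constant without the '$',
             * so this assembles to e.g. "mov 8(%rdi), %rax". */
            asm("mov %c[val](%1), %0"
                : "=r"(out)
                : "r"(d), [val]"i"(offsetof(struct demo, val))
                : "memory");
            return out;
    }

The same trick is why the operand list ends with the offsetof() table for rax..r15 and cr2, and why "setbe %c[fail](%0)" can record a failed VM entry directly into vmx->fail without a spare register.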
2335static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
2336 unsigned long addr,
2337 u32 err_code)
2338{
2339 u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2340
2341 ++vcpu->stat.pf_guest;
2342
2343 if (is_page_fault(vect_info)) {
2344 printk(KERN_DEBUG "inject_page_fault: "
2345 "double fault 0x%lx @ 0x%lx\n",
2346 addr, vmcs_readl(GUEST_RIP));
2347 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
2348 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2349 DF_VECTOR |
2350 INTR_TYPE_EXCEPTION |
2351 INTR_INFO_DELIEVER_CODE_MASK |
2352 INTR_INFO_VALID_MASK);
2353 return;
2354 }
2355 vcpu->cr2 = addr;
2356 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
2357 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2358 PF_VECTOR |
2359 INTR_TYPE_EXCEPTION |
2360 INTR_INFO_DELIEVER_CODE_MASK |
2361 INTR_INFO_VALID_MASK);
2362
2363}
2364
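With vmx_inject_page_fault() gone, page faults are no longer written into the VMCS directly by the backend: the common code added later in this patch (kvm_inject_page_fault()/kvm_queue_exception_e() in x86.c) records the pending exception in vcpu->arch.exception, and the backend's new queue_exception hook programs it at entry time. For reference, a condensed sketch of what such a hook ends up encoding, built only from the constants visible in the removed function above; treat it as an illustration of the VM-entry interruption-information layout, not the new vmx_queue_exception():

    /* Illustrative only: encoding a #PF with an error code. */
    static void sketch_inject_pf(u32 error_code)
    {
            vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
            vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                         PF_VECTOR |                    /* bits 7:0   vector          */
                         INTR_TYPE_EXCEPTION |          /* bits 10:8  event type      */
                         INTR_INFO_DELIEVER_CODE_MASK | /* bit 11     push error code */
                         INTR_INFO_VALID_MASK);         /* bit 31     valid           */
    }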
2365static void vmx_free_vmcs(struct kvm_vcpu *vcpu) 2479static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2366{ 2480{
2367 struct vcpu_vmx *vmx = to_vmx(vcpu); 2481 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2397,12 +2511,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2397 if (err) 2511 if (err)
2398 goto free_vcpu; 2512 goto free_vcpu;
2399 2513
2400 if (irqchip_in_kernel(kvm)) {
2401 err = kvm_create_lapic(&vmx->vcpu);
2402 if (err < 0)
2403 goto free_vcpu;
2404 }
2405
2406 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 2514 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2407 if (!vmx->guest_msrs) { 2515 if (!vmx->guest_msrs) {
2408 err = -ENOMEM; 2516 err = -ENOMEM;
@@ -2464,6 +2572,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
2464 .check_processor_compatibility = vmx_check_processor_compat, 2572 .check_processor_compatibility = vmx_check_processor_compat,
2465 .hardware_enable = hardware_enable, 2573 .hardware_enable = hardware_enable,
2466 .hardware_disable = hardware_disable, 2574 .hardware_disable = hardware_disable,
2575 .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
2467 2576
2468 .vcpu_create = vmx_create_vcpu, 2577 .vcpu_create = vmx_create_vcpu,
2469 .vcpu_free = vmx_free_vcpu, 2578 .vcpu_free = vmx_free_vcpu,
@@ -2499,9 +2608,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
2499 .set_rflags = vmx_set_rflags, 2608 .set_rflags = vmx_set_rflags,
2500 2609
2501 .tlb_flush = vmx_flush_tlb, 2610 .tlb_flush = vmx_flush_tlb,
2502 .inject_page_fault = vmx_inject_page_fault,
2503
2504 .inject_gp = vmx_inject_gp,
2505 2611
2506 .run = vmx_vcpu_run, 2612 .run = vmx_vcpu_run,
2507 .handle_exit = kvm_handle_exit, 2613 .handle_exit = kvm_handle_exit,
@@ -2509,8 +2615,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
2509 .patch_hypercall = vmx_patch_hypercall, 2615 .patch_hypercall = vmx_patch_hypercall,
2510 .get_irq = vmx_get_irq, 2616 .get_irq = vmx_get_irq,
2511 .set_irq = vmx_inject_irq, 2617 .set_irq = vmx_inject_irq,
2618 .queue_exception = vmx_queue_exception,
2619 .exception_injected = vmx_exception_injected,
2512 .inject_pending_irq = vmx_intr_assist, 2620 .inject_pending_irq = vmx_intr_assist,
2513 .inject_pending_vectors = do_interrupt_requests, 2621 .inject_pending_vectors = do_interrupt_requests,
2622
2623 .set_tss_addr = vmx_set_tss_addr,
2514}; 2624};
2515 2625
2516static int __init vmx_init(void) 2626static int __init vmx_init(void)
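The kvm_x86_ops table is how the now-generic arch/x86/kvm/x86.c reaches back into the vendor module: vmx.c (and svm.c) fill in one instance each and the common code only ever calls through the registered pointer. The hooks added here, cpu_has_accelerated_tpr, queue_exception/exception_injected and set_tss_addr, follow the existing pattern. A sketch of a generic-side caller; it is essentially the shape of __queue_exception() shown later in this patch:

    /* Generic code never calls vmx_* or svm_* directly; it dispatches
     * through the module-wide kvm_x86_ops pointer set up by kvm_init(). */
    static void queue_pending_exception(struct kvm_vcpu *vcpu)
    {
            kvm_x86_ops->queue_exception(vcpu,
                                         vcpu->arch.exception.nr,
                                         vcpu->arch.exception.has_error_code,
                                         vcpu->arch.exception.error_code);
    }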
@@ -2541,10 +2651,13 @@ static int __init vmx_init(void)
2541 memset(iova, 0xff, PAGE_SIZE); 2651 memset(iova, 0xff, PAGE_SIZE);
2542 kunmap(vmx_io_bitmap_b); 2652 kunmap(vmx_io_bitmap_b);
2543 2653
2544 r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); 2654 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
2545 if (r) 2655 if (r)
2546 goto out1; 2656 goto out1;
2547 2657
2658 if (bypass_guest_pf)
2659 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
2660
2548 return 0; 2661 return 0;
2549 2662
2550out1: 2663out1:
@@ -2559,7 +2672,7 @@ static void __exit vmx_exit(void)
2559 __free_page(vmx_io_bitmap_b); 2672 __free_page(vmx_io_bitmap_b);
2560 __free_page(vmx_io_bitmap_a); 2673 __free_page(vmx_io_bitmap_a);
2561 2674
2562 kvm_exit_x86(); 2675 kvm_exit();
2563} 2676}
2564 2677
2565module_init(vmx_init) 2678module_init(vmx_init)
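Module init now registers with the renamed generic entry points: kvm_init()/kvm_exit() replace kvm_init_x86()/kvm_exit_x86(), taking the ops table, the vcpu structure size and THIS_MODULE. The bypass_guest_pf knob additionally asks the MMU to tag not-present shadow PTEs with a recognizable pattern (~0xffeull) so guest page faults can be filtered cheaply. Roughly, the registration boilerplate a backend needs looks like the sketch below; backend_init/backend_exit are placeholder names and the real vmx_init() also sets up the I/O bitmaps first:

    static int __init backend_init(void)
    {
            int r;

            r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
            if (r)
                    return r;

            if (bypass_guest_pf)
                    kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);

            return 0;
    }

    static void __exit backend_exit(void)
    {
            kvm_exit();
    }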
diff --git a/drivers/kvm/vmx.h b/arch/x86/kvm/vmx.h
index fd4e14666088..d52ae8d7303d 100644
--- a/drivers/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -25,6 +25,9 @@
25 * 25 *
26 */ 26 */
27 27
28/*
29 * Definitions of Primary Processor-Based VM-Execution Controls.
30 */
28#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 31#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
29#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 32#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
30#define CPU_BASED_HLT_EXITING 0x00000080 33#define CPU_BASED_HLT_EXITING 0x00000080
@@ -42,6 +45,12 @@
42#define CPU_BASED_MONITOR_EXITING 0x20000000 45#define CPU_BASED_MONITOR_EXITING 0x20000000
43#define CPU_BASED_PAUSE_EXITING 0x40000000 46#define CPU_BASED_PAUSE_EXITING 0x40000000
44#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 47#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
48/*
49 * Definitions of Secondary Processor-Based VM-Execution Controls.
50 */
51#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
52#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
53
45 54
46#define PIN_BASED_EXT_INTR_MASK 0x00000001 55#define PIN_BASED_EXT_INTR_MASK 0x00000001
47#define PIN_BASED_NMI_EXITING 0x00000008 56#define PIN_BASED_NMI_EXITING 0x00000008
@@ -54,8 +63,6 @@
54#define VM_ENTRY_SMM 0x00000400 63#define VM_ENTRY_SMM 0x00000400
55#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 64#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
56 65
57#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
58
59/* VMCS Encodings */ 66/* VMCS Encodings */
60enum vmcs_field { 67enum vmcs_field {
61 GUEST_ES_SELECTOR = 0x00000800, 68 GUEST_ES_SELECTOR = 0x00000800,
@@ -89,6 +96,8 @@ enum vmcs_field {
89 TSC_OFFSET_HIGH = 0x00002011, 96 TSC_OFFSET_HIGH = 0x00002011,
90 VIRTUAL_APIC_PAGE_ADDR = 0x00002012, 97 VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
91 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, 98 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
99 APIC_ACCESS_ADDR = 0x00002014,
100 APIC_ACCESS_ADDR_HIGH = 0x00002015,
92 VMCS_LINK_POINTER = 0x00002800, 101 VMCS_LINK_POINTER = 0x00002800,
93 VMCS_LINK_POINTER_HIGH = 0x00002801, 102 VMCS_LINK_POINTER_HIGH = 0x00002801,
94 GUEST_IA32_DEBUGCTL = 0x00002802, 103 GUEST_IA32_DEBUGCTL = 0x00002802,
@@ -214,6 +223,8 @@ enum vmcs_field {
214#define EXIT_REASON_MSR_WRITE 32 223#define EXIT_REASON_MSR_WRITE 32
215#define EXIT_REASON_MWAIT_INSTRUCTION 36 224#define EXIT_REASON_MWAIT_INSTRUCTION 36
216#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 225#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
226#define EXIT_REASON_APIC_ACCESS 44
227#define EXIT_REASON_WBINVD 54
217 228
218/* 229/*
219 * Interruption-information format 230 * Interruption-information format
@@ -230,13 +241,14 @@ enum vmcs_field {
230 241
231#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ 242#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
232#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ 243#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
244#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
233 245
234/* 246/*
235 * Exit Qualifications for MOV for Control Register Access 247 * Exit Qualifications for MOV for Control Register Access
236 */ 248 */
237#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */ 249#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/
238#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ 250#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
239#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */ 251#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */
240#define LMSW_SOURCE_DATA_SHIFT 16 252#define LMSW_SOURCE_DATA_SHIFT 16
241#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ 253#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
242#define REG_EAX (0 << 8) 254#define REG_EAX (0 << 8)
@@ -259,11 +271,11 @@ enum vmcs_field {
259/* 271/*
260 * Exit Qualifications for MOV for Debug Register Access 272 * Exit Qualifications for MOV for Debug Register Access
261 */ 273 */
262#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */ 274#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */
263#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ 275#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
264#define TYPE_MOV_TO_DR (0 << 4) 276#define TYPE_MOV_TO_DR (0 << 4)
265#define TYPE_MOV_FROM_DR (1 << 4) 277#define TYPE_MOV_FROM_DR (1 << 4)
266#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */ 278#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */
267 279
268 280
269/* segment AR */ 281/* segment AR */
@@ -307,4 +319,6 @@ enum vmcs_field {
307#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 319#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
308#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 320#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
309 321
322#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
323
310#endif 324#endif
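The header gains the secondary execution-control bits (APIC-access virtualization, WBINVD exiting), the APIC_ACCESS_ADDR VMCS fields, the matching exit reasons, and a private memslot number for the APIC-access page; the CONTROL_REG_ACCESS_* and DEBUG_REG_ACCESS_* comments are only reworded. As a reminder of how the access masks in this header are used, here is a hedged sketch of decoding a control-register exit qualification; the helper name is invented, and "qual" would typically come from vmcs_readl(EXIT_QUALIFICATION):

    /* Illustrative decode of a CR-access exit qualification. */
    static void decode_cr_access(unsigned long qual)
    {
            int cr   = qual & CONTROL_REG_ACCESS_NUM;          /* bits 2:0, which CRn */
            int type = (qual & CONTROL_REG_ACCESS_TYPE) >> 4;  /* 0=mov to, 1=mov from, 2=clts, 3=lmsw */
            int reg  = (qual & CONTROL_REG_ACCESS_REG) >> 8;   /* bits 10:8, GP register index */

            printk(KERN_DEBUG "cr%d access type %d via reg %d\n", cr, type, reg);
    }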
diff --git a/drivers/kvm/kvm_main.c b/arch/x86/kvm/x86.c
index c0f372f1d761..8f94a0b89dff 100644
--- a/drivers/kvm/kvm_main.c
+++ b/arch/x86/kvm/x86.c
@@ -1,8 +1,7 @@
1/* 1/*
2 * Kernel-based Virtual Machine driver for Linux 2 * Kernel-based Virtual Machine driver for Linux
3 * 3 *
4 * This module enables machines with Intel VT-x extensions to run virtual 4 * derived from drivers/kvm/kvm_main.c
5 * machines without emulation or binary translation.
6 * 5 *
7 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
8 * 7 *
@@ -15,80 +14,22 @@
15 * 14 *
16 */ 15 */
17 16
18#include "kvm.h" 17#include <linux/kvm_host.h>
19#include "x86_emulate.h"
20#include "segment_descriptor.h" 18#include "segment_descriptor.h"
21#include "irq.h" 19#include "irq.h"
20#include "mmu.h"
22 21
23#include <linux/kvm.h> 22#include <linux/kvm.h>
24#include <linux/module.h> 23#include <linux/fs.h>
25#include <linux/errno.h>
26#include <linux/percpu.h>
27#include <linux/gfp.h>
28#include <linux/mm.h>
29#include <linux/miscdevice.h>
30#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
31#include <linux/reboot.h> 25#include <linux/module.h>
32#include <linux/debugfs.h> 26#include <linux/mman.h>
33#include <linux/highmem.h> 27#include <linux/highmem.h>
34#include <linux/file.h>
35#include <linux/sysdev.h>
36#include <linux/cpu.h>
37#include <linux/sched.h>
38#include <linux/cpumask.h>
39#include <linux/smp.h>
40#include <linux/anon_inodes.h>
41#include <linux/profile.h>
42
43#include <asm/processor.h>
44#include <asm/msr.h>
45#include <asm/io.h>
46#include <asm/uaccess.h>
47#include <asm/desc.h>
48
49MODULE_AUTHOR("Qumranet");
50MODULE_LICENSE("GPL");
51 28
52static DEFINE_SPINLOCK(kvm_lock); 29#include <asm/uaccess.h>
53static LIST_HEAD(vm_list); 30#include <asm/msr.h>
54
55static cpumask_t cpus_hardware_enabled;
56
57struct kvm_x86_ops *kvm_x86_ops;
58struct kmem_cache *kvm_vcpu_cache;
59EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
60
61static __read_mostly struct preempt_ops kvm_preempt_ops;
62
63#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
64
65static struct kvm_stats_debugfs_item {
66 const char *name;
67 int offset;
68 struct dentry *dentry;
69} debugfs_entries[] = {
70 { "pf_fixed", STAT_OFFSET(pf_fixed) },
71 { "pf_guest", STAT_OFFSET(pf_guest) },
72 { "tlb_flush", STAT_OFFSET(tlb_flush) },
73 { "invlpg", STAT_OFFSET(invlpg) },
74 { "exits", STAT_OFFSET(exits) },
75 { "io_exits", STAT_OFFSET(io_exits) },
76 { "mmio_exits", STAT_OFFSET(mmio_exits) },
77 { "signal_exits", STAT_OFFSET(signal_exits) },
78 { "irq_window", STAT_OFFSET(irq_window_exits) },
79 { "halt_exits", STAT_OFFSET(halt_exits) },
80 { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
81 { "request_irq", STAT_OFFSET(request_irq_exits) },
82 { "irq_exits", STAT_OFFSET(irq_exits) },
83 { "light_exits", STAT_OFFSET(light_exits) },
84 { "efer_reload", STAT_OFFSET(efer_reload) },
85 { NULL }
86};
87
88static struct dentry *debugfs_dir;
89 31
90#define MAX_IO_MSRS 256 32#define MAX_IO_MSRS 256
91
92#define CR0_RESERVED_BITS \ 33#define CR0_RESERVED_BITS \
93 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 34 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
94 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 35 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
@@ -102,317 +43,151 @@ static struct dentry *debugfs_dir;
102#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 43#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
103#define EFER_RESERVED_BITS 0xfffffffffffff2fe 44#define EFER_RESERVED_BITS 0xfffffffffffff2fe
104 45
105#ifdef CONFIG_X86_64 46#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
106// LDT or TSS descriptor in the GDT. 16 bytes. 47#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
107struct segment_descriptor_64 {
108 struct segment_descriptor s;
109 u32 base_higher;
110 u32 pad_zero;
111};
112 48
113#endif 49struct kvm_x86_ops *kvm_x86_ops;
50
51struct kvm_stats_debugfs_item debugfs_entries[] = {
52 { "pf_fixed", VCPU_STAT(pf_fixed) },
53 { "pf_guest", VCPU_STAT(pf_guest) },
54 { "tlb_flush", VCPU_STAT(tlb_flush) },
55 { "invlpg", VCPU_STAT(invlpg) },
56 { "exits", VCPU_STAT(exits) },
57 { "io_exits", VCPU_STAT(io_exits) },
58 { "mmio_exits", VCPU_STAT(mmio_exits) },
59 { "signal_exits", VCPU_STAT(signal_exits) },
60 { "irq_window", VCPU_STAT(irq_window_exits) },
61 { "halt_exits", VCPU_STAT(halt_exits) },
62 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
63 { "request_irq", VCPU_STAT(request_irq_exits) },
64 { "irq_exits", VCPU_STAT(irq_exits) },
65 { "host_state_reload", VCPU_STAT(host_state_reload) },
66 { "efer_reload", VCPU_STAT(efer_reload) },
67 { "fpu_reload", VCPU_STAT(fpu_reload) },
68 { "insn_emulation", VCPU_STAT(insn_emulation) },
69 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
70 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
71 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
72 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
73 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
74 { "mmu_flooded", VM_STAT(mmu_flooded) },
75 { "mmu_recycled", VM_STAT(mmu_recycled) },
76 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
77 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
78 { NULL }
79};
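The debugfs table now distinguishes per-VM from per-vcpu counters: VM_STAT()/VCPU_STAT() expand to an offset plus a KVM_STAT_VM or KVM_STAT_VCPU tag, so the generic reader knows which structure the offset indexes. A sketch of how one entry is consumed; the member name "kind" and the u32 counter width are assumptions here, and it is simplified to a single kvm/vcpu pair where the real reader walks vm_list:

    static u32 read_stat(struct kvm_stats_debugfs_item *item,
                         struct kvm *kvm, struct kvm_vcpu *vcpu)
    {
            void *base = (item->kind == KVM_STAT_VM) ? (void *)kvm
                                                     : (void *)vcpu;

            return *(u32 *)(base + item->offset);
    }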
114 80
115static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
116 unsigned long arg);
117 81
118unsigned long segment_base(u16 selector) 82unsigned long segment_base(u16 selector)
119{ 83{
120 struct descriptor_table gdt; 84 struct descriptor_table gdt;
121 struct segment_descriptor *d; 85 struct segment_descriptor *d;
122 unsigned long table_base; 86 unsigned long table_base;
123 typedef unsigned long ul;
124 unsigned long v; 87 unsigned long v;
125 88
126 if (selector == 0) 89 if (selector == 0)
127 return 0; 90 return 0;
128 91
129 asm ("sgdt %0" : "=m"(gdt)); 92 asm("sgdt %0" : "=m"(gdt));
130 table_base = gdt.base; 93 table_base = gdt.base;
131 94
132 if (selector & 4) { /* from ldt */ 95 if (selector & 4) { /* from ldt */
133 u16 ldt_selector; 96 u16 ldt_selector;
134 97
135 asm ("sldt %0" : "=g"(ldt_selector)); 98 asm("sldt %0" : "=g"(ldt_selector));
136 table_base = segment_base(ldt_selector); 99 table_base = segment_base(ldt_selector);
137 } 100 }
138 d = (struct segment_descriptor *)(table_base + (selector & ~7)); 101 d = (struct segment_descriptor *)(table_base + (selector & ~7));
139 v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); 102 v = d->base_low | ((unsigned long)d->base_mid << 16) |
103 ((unsigned long)d->base_high << 24);
140#ifdef CONFIG_X86_64 104#ifdef CONFIG_X86_64
141 if (d->system == 0 105 if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
142 && (d->type == 2 || d->type == 9 || d->type == 11)) 106 v |= ((unsigned long) \
143 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; 107 ((struct segment_descriptor_64 *)d)->base_higher) << 32;
144#endif 108#endif
145 return v; 109 return v;
146} 110}
147EXPORT_SYMBOL_GPL(segment_base); 111EXPORT_SYMBOL_GPL(segment_base);
148 112
149static inline int valid_vcpu(int n) 113u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
150{
151 return likely(n >= 0 && n < KVM_MAX_VCPUS);
152}
153
154void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
155{
156 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
157 return;
158
159 vcpu->guest_fpu_loaded = 1;
160 fx_save(&vcpu->host_fx_image);
161 fx_restore(&vcpu->guest_fx_image);
162}
163EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
164
165void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
166{
167 if (!vcpu->guest_fpu_loaded)
168 return;
169
170 vcpu->guest_fpu_loaded = 0;
171 fx_save(&vcpu->guest_fx_image);
172 fx_restore(&vcpu->host_fx_image);
173}
174EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
175
176/*
177 * Switches to specified vcpu, until a matching vcpu_put()
178 */
179static void vcpu_load(struct kvm_vcpu *vcpu)
180{
181 int cpu;
182
183 mutex_lock(&vcpu->mutex);
184 cpu = get_cpu();
185 preempt_notifier_register(&vcpu->preempt_notifier);
186 kvm_x86_ops->vcpu_load(vcpu, cpu);
187 put_cpu();
188}
189
190static void vcpu_put(struct kvm_vcpu *vcpu)
191{
192 preempt_disable();
193 kvm_x86_ops->vcpu_put(vcpu);
194 preempt_notifier_unregister(&vcpu->preempt_notifier);
195 preempt_enable();
196 mutex_unlock(&vcpu->mutex);
197}
198
199static void ack_flush(void *_completed)
200{
201}
202
203void kvm_flush_remote_tlbs(struct kvm *kvm)
204{
205 int i, cpu;
206 cpumask_t cpus;
207 struct kvm_vcpu *vcpu;
208
209 cpus_clear(cpus);
210 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
211 vcpu = kvm->vcpus[i];
212 if (!vcpu)
213 continue;
214 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
215 continue;
216 cpu = vcpu->cpu;
217 if (cpu != -1 && cpu != raw_smp_processor_id())
218 cpu_set(cpu, cpus);
219 }
220 smp_call_function_mask(cpus, ack_flush, NULL, 1);
221}
222
223int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
224{ 114{
225 struct page *page; 115 if (irqchip_in_kernel(vcpu->kvm))
226 int r; 116 return vcpu->arch.apic_base;
227
228 mutex_init(&vcpu->mutex);
229 vcpu->cpu = -1;
230 vcpu->mmu.root_hpa = INVALID_PAGE;
231 vcpu->kvm = kvm;
232 vcpu->vcpu_id = id;
233 if (!irqchip_in_kernel(kvm) || id == 0)
234 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
235 else 117 else
236 vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED; 118 return vcpu->arch.apic_base;
237 init_waitqueue_head(&vcpu->wq);
238
239 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
240 if (!page) {
241 r = -ENOMEM;
242 goto fail;
243 }
244 vcpu->run = page_address(page);
245
246 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
247 if (!page) {
248 r = -ENOMEM;
249 goto fail_free_run;
250 }
251 vcpu->pio_data = page_address(page);
252
253 r = kvm_mmu_create(vcpu);
254 if (r < 0)
255 goto fail_free_pio_data;
256
257 return 0;
258
259fail_free_pio_data:
260 free_page((unsigned long)vcpu->pio_data);
261fail_free_run:
262 free_page((unsigned long)vcpu->run);
263fail:
264 return -ENOMEM;
265}
266EXPORT_SYMBOL_GPL(kvm_vcpu_init);
267
268void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
269{
270 kvm_mmu_destroy(vcpu);
271 if (vcpu->apic)
272 hrtimer_cancel(&vcpu->apic->timer.dev);
273 kvm_free_apic(vcpu->apic);
274 free_page((unsigned long)vcpu->pio_data);
275 free_page((unsigned long)vcpu->run);
276}
277EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
278
279static struct kvm *kvm_create_vm(void)
280{
281 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
282
283 if (!kvm)
284 return ERR_PTR(-ENOMEM);
285
286 kvm_io_bus_init(&kvm->pio_bus);
287 mutex_init(&kvm->lock);
288 INIT_LIST_HEAD(&kvm->active_mmu_pages);
289 kvm_io_bus_init(&kvm->mmio_bus);
290 spin_lock(&kvm_lock);
291 list_add(&kvm->vm_list, &vm_list);
292 spin_unlock(&kvm_lock);
293 return kvm;
294}
295
296/*
297 * Free any memory in @free but not in @dont.
298 */
299static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
300 struct kvm_memory_slot *dont)
301{
302 int i;
303
304 if (!dont || free->phys_mem != dont->phys_mem)
305 if (free->phys_mem) {
306 for (i = 0; i < free->npages; ++i)
307 if (free->phys_mem[i])
308 __free_page(free->phys_mem[i]);
309 vfree(free->phys_mem);
310 }
311
312 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
313 vfree(free->dirty_bitmap);
314
315 free->phys_mem = NULL;
316 free->npages = 0;
317 free->dirty_bitmap = NULL;
318}
319
320static void kvm_free_physmem(struct kvm *kvm)
321{
322 int i;
323
324 for (i = 0; i < kvm->nmemslots; ++i)
325 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
326} 119}
120EXPORT_SYMBOL_GPL(kvm_get_apic_base);
327 121
328static void free_pio_guest_pages(struct kvm_vcpu *vcpu) 122void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
329{ 123{
330 int i; 124 /* TODO: reserve bits check */
331 125 if (irqchip_in_kernel(vcpu->kvm))
332 for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i) 126 kvm_lapic_set_base(vcpu, data);
333 if (vcpu->pio.guest_pages[i]) { 127 else
334 __free_page(vcpu->pio.guest_pages[i]); 128 vcpu->arch.apic_base = data;
335 vcpu->pio.guest_pages[i] = NULL;
336 }
337} 129}
130EXPORT_SYMBOL_GPL(kvm_set_apic_base);
338 131
339static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 132void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
340{ 133{
341 vcpu_load(vcpu); 134 WARN_ON(vcpu->arch.exception.pending);
342 kvm_mmu_unload(vcpu); 135 vcpu->arch.exception.pending = true;
343 vcpu_put(vcpu); 136 vcpu->arch.exception.has_error_code = false;
137 vcpu->arch.exception.nr = nr;
344} 138}
139EXPORT_SYMBOL_GPL(kvm_queue_exception);
345 140
346static void kvm_free_vcpus(struct kvm *kvm) 141void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
142 u32 error_code)
347{ 143{
348 unsigned int i; 144 ++vcpu->stat.pf_guest;
349 145 if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
350 /* 146 printk(KERN_DEBUG "kvm: inject_page_fault:"
351 * Unpin any mmu pages first. 147 " double fault 0x%lx\n", addr);
352 */ 148 vcpu->arch.exception.nr = DF_VECTOR;
353 for (i = 0; i < KVM_MAX_VCPUS; ++i) 149 vcpu->arch.exception.error_code = 0;
354 if (kvm->vcpus[i]) 150 return;
355 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
356 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
357 if (kvm->vcpus[i]) {
358 kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
359 kvm->vcpus[i] = NULL;
360 }
361 } 151 }
362 152 vcpu->arch.cr2 = addr;
153 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
363} 154}
364 155
365static void kvm_destroy_vm(struct kvm *kvm) 156void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
366{ 157{
367 spin_lock(&kvm_lock); 158 WARN_ON(vcpu->arch.exception.pending);
368 list_del(&kvm->vm_list); 159 vcpu->arch.exception.pending = true;
369 spin_unlock(&kvm_lock); 160 vcpu->arch.exception.has_error_code = true;
370 kvm_io_bus_destroy(&kvm->pio_bus); 161 vcpu->arch.exception.nr = nr;
371 kvm_io_bus_destroy(&kvm->mmio_bus); 162 vcpu->arch.exception.error_code = error_code;
372 kfree(kvm->vpic);
373 kfree(kvm->vioapic);
374 kvm_free_vcpus(kvm);
375 kvm_free_physmem(kvm);
376 kfree(kvm);
377} 163}
164EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
378 165
379static int kvm_vm_release(struct inode *inode, struct file *filp) 166static void __queue_exception(struct kvm_vcpu *vcpu)
380{ 167{
381 struct kvm *kvm = filp->private_data; 168 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
382 169 vcpu->arch.exception.has_error_code,
383 kvm_destroy_vm(kvm); 170 vcpu->arch.exception.error_code);
384 return 0;
385}
386
387static void inject_gp(struct kvm_vcpu *vcpu)
388{
389 kvm_x86_ops->inject_gp(vcpu, 0);
390} 171}
391 172
392/* 173/*
393 * Load the pae pdptrs. Return true is they are all valid. 174 * Load the pae pdptrs. Return true is they are all valid.
394 */ 175 */
395static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 176int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
396{ 177{
397 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 178 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
398 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 179 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
399 int i; 180 int i;
400 u64 *pdpt;
401 int ret; 181 int ret;
402 struct page *page; 182 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
403 u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
404 183
405 mutex_lock(&vcpu->kvm->lock); 184 down_read(&current->mm->mmap_sem);
406 page = gfn_to_page(vcpu->kvm, pdpt_gfn); 185 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
407 if (!page) { 186 offset * sizeof(u64), sizeof(pdpte));
187 if (ret < 0) {
408 ret = 0; 188 ret = 0;
409 goto out; 189 goto out;
410 } 190 }
411
412 pdpt = kmap_atomic(page, KM_USER0);
413 memcpy(pdpte, pdpt+offset, sizeof(pdpte));
414 kunmap_atomic(pdpt, KM_USER0);
415
416 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 191 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
417 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { 192 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
418 ret = 0; 193 ret = 0;
@@ -421,78 +196,96 @@ static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
421 } 196 }
422 ret = 1; 197 ret = 1;
423 198
424 memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs)); 199 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
425out: 200out:
426 mutex_unlock(&vcpu->kvm->lock); 201 up_read(&current->mm->mmap_sem);
427 202
428 return ret; 203 return ret;
429} 204}
430 205
206static bool pdptrs_changed(struct kvm_vcpu *vcpu)
207{
208 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
209 bool changed = true;
210 int r;
211
212 if (is_long_mode(vcpu) || !is_pae(vcpu))
213 return false;
214
215 down_read(&current->mm->mmap_sem);
216 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
217 if (r < 0)
218 goto out;
219 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
220out:
221 up_read(&current->mm->mmap_sem);
222
223 return changed;
224}
225
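load_pdptrs() now pulls the four PAE PDPTEs with kvm_read_guest_page() instead of mapping the guest page by hand, and the new pdptrs_changed() helper lets set_cr3() skip a full MMU reload when the guest rewrites CR3 with an unchanged directory. The validity rule itself is unchanged: a present PDPTE must have all reserved bits clear. Restated compactly (mask taken from the check above; the helper is illustrative, not part of the patch):

    static int pdpte_valid(u64 pdpte)
    {
            if (!(pdpte & 1))
                    return 1;       /* not present: nothing to validate */
            return (pdpte & 0xfffffff0000001e6ull) == 0;
    }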
431void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 226void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
432{ 227{
433 if (cr0 & CR0_RESERVED_BITS) { 228 if (cr0 & CR0_RESERVED_BITS) {
434 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 229 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
435 cr0, vcpu->cr0); 230 cr0, vcpu->arch.cr0);
436 inject_gp(vcpu); 231 kvm_inject_gp(vcpu, 0);
437 return; 232 return;
438 } 233 }
439 234
440 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 235 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
441 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 236 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
442 inject_gp(vcpu); 237 kvm_inject_gp(vcpu, 0);
443 return; 238 return;
444 } 239 }
445 240
446 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 241 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
447 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 242 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
448 "and a clear PE flag\n"); 243 "and a clear PE flag\n");
449 inject_gp(vcpu); 244 kvm_inject_gp(vcpu, 0);
450 return; 245 return;
451 } 246 }
452 247
453 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 248 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
454#ifdef CONFIG_X86_64 249#ifdef CONFIG_X86_64
455 if ((vcpu->shadow_efer & EFER_LME)) { 250 if ((vcpu->arch.shadow_efer & EFER_LME)) {
456 int cs_db, cs_l; 251 int cs_db, cs_l;
457 252
458 if (!is_pae(vcpu)) { 253 if (!is_pae(vcpu)) {
459 printk(KERN_DEBUG "set_cr0: #GP, start paging " 254 printk(KERN_DEBUG "set_cr0: #GP, start paging "
460 "in long mode while PAE is disabled\n"); 255 "in long mode while PAE is disabled\n");
461 inject_gp(vcpu); 256 kvm_inject_gp(vcpu, 0);
462 return; 257 return;
463 } 258 }
464 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 259 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
465 if (cs_l) { 260 if (cs_l) {
466 printk(KERN_DEBUG "set_cr0: #GP, start paging " 261 printk(KERN_DEBUG "set_cr0: #GP, start paging "
467 "in long mode while CS.L == 1\n"); 262 "in long mode while CS.L == 1\n");
468 inject_gp(vcpu); 263 kvm_inject_gp(vcpu, 0);
469 return; 264 return;
470 265
471 } 266 }
472 } else 267 } else
473#endif 268#endif
474 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) { 269 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
475 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 270 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
476 "reserved bits\n"); 271 "reserved bits\n");
477 inject_gp(vcpu); 272 kvm_inject_gp(vcpu, 0);
478 return; 273 return;
479 } 274 }
480 275
481 } 276 }
482 277
483 kvm_x86_ops->set_cr0(vcpu, cr0); 278 kvm_x86_ops->set_cr0(vcpu, cr0);
484 vcpu->cr0 = cr0; 279 vcpu->arch.cr0 = cr0;
485 280
486 mutex_lock(&vcpu->kvm->lock);
487 kvm_mmu_reset_context(vcpu); 281 kvm_mmu_reset_context(vcpu);
488 mutex_unlock(&vcpu->kvm->lock);
489 return; 282 return;
490} 283}
491EXPORT_SYMBOL_GPL(set_cr0); 284EXPORT_SYMBOL_GPL(set_cr0);
492 285
493void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 286void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
494{ 287{
495 set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); 288 set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
496} 289}
497EXPORT_SYMBOL_GPL(lmsw); 290EXPORT_SYMBOL_GPL(lmsw);
498 291
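set_cr0() keeps the same architectural consistency checks; only the failure path changes from inject_gp() to kvm_inject_gp(vcpu, 0) and the kvm->lock around the MMU reset goes away. Since the checks are easy to lose in the field renames, here they are as one condensed predicate; this is illustrative only and omits the EFER_LME/CS.L long-mode branch shown above:

    static int cr0_write_faults(struct kvm_vcpu *vcpu, unsigned long cr0)
    {
            if (cr0 & CR0_RESERVED_BITS)
                    return 1;
            if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
                    return 1;
            if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
                    return 1;
            if (!is_paging(vcpu) && (cr0 & X86_CR0_PG) &&
                is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
                    return 1;       /* enabling PAE paging with bad PDPTEs */
            return 0;
    }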
@@ -500,7 +293,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
500{ 293{
501 if (cr4 & CR4_RESERVED_BITS) { 294 if (cr4 & CR4_RESERVED_BITS) {
502 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 295 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
503 inject_gp(vcpu); 296 kvm_inject_gp(vcpu, 0);
504 return; 297 return;
505 } 298 }
506 299
@@ -508,35 +301,38 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
508 if (!(cr4 & X86_CR4_PAE)) { 301 if (!(cr4 & X86_CR4_PAE)) {
509 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 302 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
510 "in long mode\n"); 303 "in long mode\n");
511 inject_gp(vcpu); 304 kvm_inject_gp(vcpu, 0);
512 return; 305 return;
513 } 306 }
514 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) 307 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
515 && !load_pdptrs(vcpu, vcpu->cr3)) { 308 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
516 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 309 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
517 inject_gp(vcpu); 310 kvm_inject_gp(vcpu, 0);
518 return; 311 return;
519 } 312 }
520 313
521 if (cr4 & X86_CR4_VMXE) { 314 if (cr4 & X86_CR4_VMXE) {
522 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 315 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
523 inject_gp(vcpu); 316 kvm_inject_gp(vcpu, 0);
524 return; 317 return;
525 } 318 }
526 kvm_x86_ops->set_cr4(vcpu, cr4); 319 kvm_x86_ops->set_cr4(vcpu, cr4);
527 vcpu->cr4 = cr4; 320 vcpu->arch.cr4 = cr4;
528 mutex_lock(&vcpu->kvm->lock);
529 kvm_mmu_reset_context(vcpu); 321 kvm_mmu_reset_context(vcpu);
530 mutex_unlock(&vcpu->kvm->lock);
531} 322}
532EXPORT_SYMBOL_GPL(set_cr4); 323EXPORT_SYMBOL_GPL(set_cr4);
533 324
534void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 325void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
535{ 326{
327 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
328 kvm_mmu_flush_tlb(vcpu);
329 return;
330 }
331
536 if (is_long_mode(vcpu)) { 332 if (is_long_mode(vcpu)) {
537 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 333 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
538 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 334 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
539 inject_gp(vcpu); 335 kvm_inject_gp(vcpu, 0);
540 return; 336 return;
541 } 337 }
542 } else { 338 } else {
@@ -544,26 +340,23 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
544 if (cr3 & CR3_PAE_RESERVED_BITS) { 340 if (cr3 & CR3_PAE_RESERVED_BITS) {
545 printk(KERN_DEBUG 341 printk(KERN_DEBUG
546 "set_cr3: #GP, reserved bits\n"); 342 "set_cr3: #GP, reserved bits\n");
547 inject_gp(vcpu); 343 kvm_inject_gp(vcpu, 0);
548 return; 344 return;
549 } 345 }
550 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 346 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
551 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 347 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
552 "reserved bits\n"); 348 "reserved bits\n");
553 inject_gp(vcpu); 349 kvm_inject_gp(vcpu, 0);
554 return;
555 }
556 } else {
557 if (cr3 & CR3_NONPAE_RESERVED_BITS) {
558 printk(KERN_DEBUG
559 "set_cr3: #GP, reserved bits\n");
560 inject_gp(vcpu);
561 return; 350 return;
562 } 351 }
563 } 352 }
353 /*
354 * We don't check reserved bits in nonpae mode, because
355 * this isn't enforced, and VMware depends on this.
356 */
564 } 357 }
565 358
566 mutex_lock(&vcpu->kvm->lock); 359 down_read(&current->mm->mmap_sem);
567 /* 360 /*
568 * Does the new cr3 value map to physical memory? (Note, we 361 * Does the new cr3 value map to physical memory? (Note, we
569 * catch an invalid cr3 even in real-mode, because it would 362 * catch an invalid cr3 even in real-mode, because it would
@@ -574,12 +367,12 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
574 * to debug) behavior on the guest side. 367 * to debug) behavior on the guest side.
575 */ 368 */
576 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 369 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
577 inject_gp(vcpu); 370 kvm_inject_gp(vcpu, 0);
578 else { 371 else {
579 vcpu->cr3 = cr3; 372 vcpu->arch.cr3 = cr3;
580 vcpu->mmu.new_cr3(vcpu); 373 vcpu->arch.mmu.new_cr3(vcpu);
581 } 374 }
582 mutex_unlock(&vcpu->kvm->lock); 375 up_read(&current->mm->mmap_sem);
583} 376}
584EXPORT_SYMBOL_GPL(set_cr3); 377EXPORT_SYMBOL_GPL(set_cr3);
585 378
@@ -587,13 +380,13 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
587{ 380{
588 if (cr8 & CR8_RESERVED_BITS) { 381 if (cr8 & CR8_RESERVED_BITS) {
589 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 382 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
590 inject_gp(vcpu); 383 kvm_inject_gp(vcpu, 0);
591 return; 384 return;
592 } 385 }
593 if (irqchip_in_kernel(vcpu->kvm)) 386 if (irqchip_in_kernel(vcpu->kvm))
594 kvm_lapic_set_tpr(vcpu, cr8); 387 kvm_lapic_set_tpr(vcpu, cr8);
595 else 388 else
596 vcpu->cr8 = cr8; 389 vcpu->arch.cr8 = cr8;
597} 390}
598EXPORT_SYMBOL_GPL(set_cr8); 391EXPORT_SYMBOL_GPL(set_cr8);
599 392
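CR8 is the 64-bit alias of the local APIC task-priority register, which is why set_cr8() above (and get_cr8() just below) forward to kvm_lapic_set_tpr()/kvm_lapic_get_cr8() whenever the interrupt chip is emulated in the kernel, and only fall back to the vcpu->arch.cr8 shadow otherwise. Callers are expected to go through the accessors rather than the field; a trivial illustrative example:

    /* The vcpu->arch.cr8 shadow is stale when the in-kernel APIC owns
     * the TPR, so always read through the accessor. */
    static unsigned long guest_task_priority(struct kvm_vcpu *vcpu)
    {
            return get_cr8(vcpu);   /* low 4 bits; higher bits #GP in set_cr8() */
    }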
@@ -602,210 +395,846 @@ unsigned long get_cr8(struct kvm_vcpu *vcpu)
602 if (irqchip_in_kernel(vcpu->kvm)) 395 if (irqchip_in_kernel(vcpu->kvm))
603 return kvm_lapic_get_cr8(vcpu); 396 return kvm_lapic_get_cr8(vcpu);
604 else 397 else
605 return vcpu->cr8; 398 return vcpu->arch.cr8;
606} 399}
607EXPORT_SYMBOL_GPL(get_cr8); 400EXPORT_SYMBOL_GPL(get_cr8);
608 401
609u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 402/*
403 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
404 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
405 *
406 * This list is modified at module load time to reflect the
407 * capabilities of the host cpu.
408 */
409static u32 msrs_to_save[] = {
410 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
411 MSR_K6_STAR,
412#ifdef CONFIG_X86_64
413 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
414#endif
415 MSR_IA32_TIME_STAMP_COUNTER,
416};
417
418static unsigned num_msrs_to_save;
419
420static u32 emulated_msrs[] = {
421 MSR_IA32_MISC_ENABLE,
422};
423
424#ifdef CONFIG_X86_64
425
426static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
610{ 427{
611 if (irqchip_in_kernel(vcpu->kvm)) 428 if (efer & EFER_RESERVED_BITS) {
612 return vcpu->apic_base; 429 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
613 else 430 efer);
614 return vcpu->apic_base; 431 kvm_inject_gp(vcpu, 0);
432 return;
433 }
434
435 if (is_paging(vcpu)
436 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
437 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
438 kvm_inject_gp(vcpu, 0);
439 return;
440 }
441
442 kvm_x86_ops->set_efer(vcpu, efer);
443
444 efer &= ~EFER_LMA;
445 efer |= vcpu->arch.shadow_efer & EFER_LMA;
446
447 vcpu->arch.shadow_efer = efer;
615} 448}
616EXPORT_SYMBOL_GPL(kvm_get_apic_base);
617 449
618void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 450#endif
451
452/*
453 * Writes msr value into into the appropriate "register".
454 * Returns 0 on success, non-0 otherwise.
455 * Assumes vcpu_load() was already called.
456 */
457int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
619{ 458{
620 /* TODO: reserve bits check */ 459 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
621 if (irqchip_in_kernel(vcpu->kvm))
622 kvm_lapic_set_base(vcpu, data);
623 else
624 vcpu->apic_base = data;
625} 460}
626EXPORT_SYMBOL_GPL(kvm_set_apic_base);
627 461
628void fx_init(struct kvm_vcpu *vcpu) 462/*
463 * Adapt set_msr() to msr_io()'s calling convention
464 */
465static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
629{ 466{
630 unsigned after_mxcsr_mask; 467 return kvm_set_msr(vcpu, index, *data);
468}
631 469
632 /* Initialize guest FPU by resetting ours and saving into guest's */
633 preempt_disable();
634 fx_save(&vcpu->host_fx_image);
635 fpu_init();
636 fx_save(&vcpu->guest_fx_image);
637 fx_restore(&vcpu->host_fx_image);
638 preempt_enable();
639 470
640 vcpu->cr0 |= X86_CR0_ET; 471int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
641 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 472{
642 vcpu->guest_fx_image.mxcsr = 0x1f80; 473 switch (msr) {
643 memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask, 474#ifdef CONFIG_X86_64
644 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); 475 case MSR_EFER:
476 set_efer(vcpu, data);
477 break;
478#endif
479 case MSR_IA32_MC0_STATUS:
480 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
481 __FUNCTION__, data);
482 break;
483 case MSR_IA32_MCG_STATUS:
484 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
485 __FUNCTION__, data);
486 break;
487 case MSR_IA32_UCODE_REV:
488 case MSR_IA32_UCODE_WRITE:
489 case 0x200 ... 0x2ff: /* MTRRs */
490 break;
491 case MSR_IA32_APICBASE:
492 kvm_set_apic_base(vcpu, data);
493 break;
494 case MSR_IA32_MISC_ENABLE:
495 vcpu->arch.ia32_misc_enable_msr = data;
496 break;
497 default:
498 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
499 return 1;
500 }
501 return 0;
645} 502}
646EXPORT_SYMBOL_GPL(fx_init); 503EXPORT_SYMBOL_GPL(kvm_set_msr_common);
504
505
506/*
507 * Reads an msr value (of 'msr_index') into 'pdata'.
508 * Returns 0 on success, non-0 otherwise.
509 * Assumes vcpu_load() was already called.
510 */
511int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
512{
513 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
514}
515
516int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
517{
518 u64 data;
519
520 switch (msr) {
521 case 0xc0010010: /* SYSCFG */
522 case 0xc0010015: /* HWCR */
523 case MSR_IA32_PLATFORM_ID:
524 case MSR_IA32_P5_MC_ADDR:
525 case MSR_IA32_P5_MC_TYPE:
526 case MSR_IA32_MC0_CTL:
527 case MSR_IA32_MCG_STATUS:
528 case MSR_IA32_MCG_CAP:
529 case MSR_IA32_MC0_MISC:
530 case MSR_IA32_MC0_MISC+4:
531 case MSR_IA32_MC0_MISC+8:
532 case MSR_IA32_MC0_MISC+12:
533 case MSR_IA32_MC0_MISC+16:
534 case MSR_IA32_UCODE_REV:
535 case MSR_IA32_PERF_STATUS:
536 case MSR_IA32_EBL_CR_POWERON:
537 /* MTRR registers */
538 case 0xfe:
539 case 0x200 ... 0x2ff:
540 data = 0;
541 break;
542 case 0xcd: /* fsb frequency */
543 data = 3;
544 break;
545 case MSR_IA32_APICBASE:
546 data = kvm_get_apic_base(vcpu);
547 break;
548 case MSR_IA32_MISC_ENABLE:
549 data = vcpu->arch.ia32_misc_enable_msr;
550 break;
551#ifdef CONFIG_X86_64
552 case MSR_EFER:
553 data = vcpu->arch.shadow_efer;
554 break;
555#endif
556 default:
557 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
558 return 1;
559 }
560 *pdata = data;
561 return 0;
562}
563EXPORT_SYMBOL_GPL(kvm_get_msr_common);
647 564
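kvm_set_msr_common()/kvm_get_msr_common() collect the MSRs that need no vendor-specific handling; the backend's set_msr/get_msr hooks deal with their own registers and delegate everything else here, while msrs_to_save[] (filtered at module load) is what KVM_GET_MSR_INDEX_LIST reports to userspace. A hedged sketch of the backend side of that split; the function name is invented and this is not the actual vmx.c handler:

    /* Illustrative backend wrmsr hook: vendor-specific MSRs first,
     * then fall through to the common helper above. */
    static int backend_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
            switch (msr) {
            case MSR_IA32_SYSENTER_CS:
                    vmcs_write32(GUEST_SYSENTER_CS, data);
                    return 0;
            default:
                    return kvm_set_msr_common(vcpu, msr, data);
            }
    }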
648/* 565/*
649 * Allocate some memory and give it an address in the guest physical address 566 * Read or write a bunch of msrs. All parameters are kernel addresses.
650 * space.
651 * 567 *
652 * Discontiguous memory is allowed, mostly for framebuffers. 568 * @return number of msrs set successfully.
653 */ 569 */
654static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 570static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
655 struct kvm_memory_region *mem) 571 struct kvm_msr_entry *entries,
572 int (*do_msr)(struct kvm_vcpu *vcpu,
573 unsigned index, u64 *data))
656{ 574{
657 int r; 575 int i;
658 gfn_t base_gfn;
659 unsigned long npages;
660 unsigned long i;
661 struct kvm_memory_slot *memslot;
662 struct kvm_memory_slot old, new;
663 576
664 r = -EINVAL; 577 vcpu_load(vcpu);
665 /* General sanity checks */ 578
666 if (mem->memory_size & (PAGE_SIZE - 1)) 579 for (i = 0; i < msrs->nmsrs; ++i)
667 goto out; 580 if (do_msr(vcpu, entries[i].index, &entries[i].data))
668 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 581 break;
582
583 vcpu_put(vcpu);
584
585 return i;
586}
587
588/*
589 * Read or write a bunch of msrs. Parameters are user addresses.
590 *
591 * @return number of msrs set successfully.
592 */
593static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
594 int (*do_msr)(struct kvm_vcpu *vcpu,
595 unsigned index, u64 *data),
596 int writeback)
597{
598 struct kvm_msrs msrs;
599 struct kvm_msr_entry *entries;
600 int r, n;
601 unsigned size;
602
603 r = -EFAULT;
604 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
669 goto out; 605 goto out;
670 if (mem->slot >= KVM_MEMORY_SLOTS) 606
607 r = -E2BIG;
608 if (msrs.nmsrs >= MAX_IO_MSRS)
671 goto out; 609 goto out;
672 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 610
611 r = -ENOMEM;
612 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
613 entries = vmalloc(size);
614 if (!entries)
673 goto out; 615 goto out;
674 616
675 memslot = &kvm->memslots[mem->slot]; 617 r = -EFAULT;
676 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 618 if (copy_from_user(entries, user_msrs->entries, size))
677 npages = mem->memory_size >> PAGE_SHIFT; 619 goto out_free;
678 620
679 if (!npages) 621 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
680 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 622 if (r < 0)
623 goto out_free;
681 624
682 mutex_lock(&kvm->lock); 625 r = -EFAULT;
626 if (writeback && copy_to_user(user_msrs->entries, entries, size))
627 goto out_free;
683 628
684 new = old = *memslot; 629 r = n;
685 630
686 new.base_gfn = base_gfn; 631out_free:
687 new.npages = npages; 632 vfree(entries);
688 new.flags = mem->flags; 633out:
634 return r;
635}
689 636
690 /* Disallow changing a memory slot's size. */ 637/*
691 r = -EINVAL; 638 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
692 if (npages && old.npages && npages != old.npages) 639 * cached on it.
693 goto out_unlock; 640 */
641void decache_vcpus_on_cpu(int cpu)
642{
643 struct kvm *vm;
644 struct kvm_vcpu *vcpu;
645 int i;
694 646
695 /* Check for overlaps */ 647 spin_lock(&kvm_lock);
696 r = -EEXIST; 648 list_for_each_entry(vm, &vm_list, vm_list)
697 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 649 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
698 struct kvm_memory_slot *s = &kvm->memslots[i]; 650 vcpu = vm->vcpus[i];
651 if (!vcpu)
652 continue;
653 /*
654 * If the vcpu is locked, then it is running on some
655 * other cpu and therefore it is not cached on the
656 * cpu in question.
657 *
658 * If it's not locked, check the last cpu it executed
659 * on.
660 */
661 if (mutex_trylock(&vcpu->mutex)) {
662 if (vcpu->cpu == cpu) {
663 kvm_x86_ops->vcpu_decache(vcpu);
664 vcpu->cpu = -1;
665 }
666 mutex_unlock(&vcpu->mutex);
667 }
668 }
669 spin_unlock(&kvm_lock);
670}
699 671
700 if (s == memslot) 672int kvm_dev_ioctl_check_extension(long ext)
701 continue; 673{
702 if (!((base_gfn + npages <= s->base_gfn) || 674 int r;
703 (base_gfn >= s->base_gfn + s->npages))) 675
704 goto out_unlock; 676 switch (ext) {
677 case KVM_CAP_IRQCHIP:
678 case KVM_CAP_HLT:
679 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
680 case KVM_CAP_USER_MEMORY:
681 case KVM_CAP_SET_TSS_ADDR:
682 case KVM_CAP_EXT_CPUID:
683 r = 1;
684 break;
685 case KVM_CAP_VAPIC:
686 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
687 break;
688 default:
689 r = 0;
690 break;
705 } 691 }
692 return r;
706 693
707 /* Deallocate if slot is being removed */ 694}
708 if (!npages)
709 new.phys_mem = NULL;
710 695
711 /* Free page dirty bitmap if unneeded */ 696long kvm_arch_dev_ioctl(struct file *filp,
712 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 697 unsigned int ioctl, unsigned long arg)
713 new.dirty_bitmap = NULL; 698{
699 void __user *argp = (void __user *)arg;
700 long r;
714 701
715 r = -ENOMEM; 702 switch (ioctl) {
703 case KVM_GET_MSR_INDEX_LIST: {
704 struct kvm_msr_list __user *user_msr_list = argp;
705 struct kvm_msr_list msr_list;
706 unsigned n;
716 707
717 /* Allocate if a slot is being created */ 708 r = -EFAULT;
718 if (npages && !new.phys_mem) { 709 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
719 new.phys_mem = vmalloc(npages * sizeof(struct page *)); 710 goto out;
711 n = msr_list.nmsrs;
712 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
713 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
714 goto out;
715 r = -E2BIG;
716 if (n < num_msrs_to_save)
717 goto out;
718 r = -EFAULT;
719 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
720 num_msrs_to_save * sizeof(u32)))
721 goto out;
722 if (copy_to_user(user_msr_list->indices
723 + num_msrs_to_save * sizeof(u32),
724 &emulated_msrs,
725 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
726 goto out;
727 r = 0;
728 break;
729 }
730 default:
731 r = -EINVAL;
732 }
733out:
734 return r;
735}
736
737void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
738{
739 kvm_x86_ops->vcpu_load(vcpu, cpu);
740}
720 741
721 if (!new.phys_mem) 742void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
722 goto out_unlock; 743{
744 kvm_x86_ops->vcpu_put(vcpu);
745 kvm_put_guest_fpu(vcpu);
746}
723 747
724 memset(new.phys_mem, 0, npages * sizeof(struct page *)); 748static int is_efer_nx(void)
725 for (i = 0; i < npages; ++i) { 749{
726 new.phys_mem[i] = alloc_page(GFP_HIGHUSER 750 u64 efer;
727 | __GFP_ZERO); 751
728 if (!new.phys_mem[i]) 752 rdmsrl(MSR_EFER, efer);
729 goto out_unlock; 753 return efer & EFER_NX;
730 set_page_private(new.phys_mem[i],0); 754}
731 }
732 }
733 755
734 /* Allocate page dirty bitmap if needed */ 756static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
735 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 757{
736 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; 758 int i;
759 struct kvm_cpuid_entry2 *e, *entry;
737 760
738 new.dirty_bitmap = vmalloc(dirty_bytes); 761 entry = NULL;
739 if (!new.dirty_bitmap) 762 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
740 goto out_unlock; 763 e = &vcpu->arch.cpuid_entries[i];
741 memset(new.dirty_bitmap, 0, dirty_bytes); 764 if (e->function == 0x80000001) {
765 entry = e;
766 break;
767 }
742 } 768 }
769 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
770 entry->edx &= ~(1 << 20);
771 printk(KERN_INFO "kvm: guest NX capability removed\n");
772 }
773}
743 774
744 if (mem->slot >= kvm->nmemslots) 775/* when an old userspace process fills a new kernel module */
745 kvm->nmemslots = mem->slot + 1; 776static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
777 struct kvm_cpuid *cpuid,
778 struct kvm_cpuid_entry __user *entries)
779{
780 int r, i;
781 struct kvm_cpuid_entry *cpuid_entries;
746 782
747 *memslot = new; 783 r = -E2BIG;
784 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
785 goto out;
786 r = -ENOMEM;
787 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
788 if (!cpuid_entries)
789 goto out;
790 r = -EFAULT;
791 if (copy_from_user(cpuid_entries, entries,
792 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
793 goto out_free;
794 for (i = 0; i < cpuid->nent; i++) {
795 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
796 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
797 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
798 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
799 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
800 vcpu->arch.cpuid_entries[i].index = 0;
801 vcpu->arch.cpuid_entries[i].flags = 0;
802 vcpu->arch.cpuid_entries[i].padding[0] = 0;
803 vcpu->arch.cpuid_entries[i].padding[1] = 0;
804 vcpu->arch.cpuid_entries[i].padding[2] = 0;
805 }
806 vcpu->arch.cpuid_nent = cpuid->nent;
807 cpuid_fix_nx_cap(vcpu);
808 r = 0;
748 809
749 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 810out_free:
750 kvm_flush_remote_tlbs(kvm); 811 vfree(cpuid_entries);
812out:
813 return r;
814}
751 815
752 mutex_unlock(&kvm->lock); 816static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
817 struct kvm_cpuid2 *cpuid,
818 struct kvm_cpuid_entry2 __user *entries)
819{
820 int r;
753 821
754 kvm_free_physmem_slot(&old, &new); 822 r = -E2BIG;
823 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
824 goto out;
825 r = -EFAULT;
826 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
827 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
828 goto out;
829 vcpu->arch.cpuid_nent = cpuid->nent;
755 return 0; 830 return 0;
756 831
757out_unlock:
758 mutex_unlock(&kvm->lock);
759 kvm_free_physmem_slot(&new, &old);
760out: 832out:
761 return r; 833 return r;
762} 834}
763 835
764/* 836static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
765 * Get (and clear) the dirty memory log for a memory slot. 837 struct kvm_cpuid2 *cpuid,
766 */ 838 struct kvm_cpuid_entry2 __user *entries)
767static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
768 struct kvm_dirty_log *log)
769{ 839{
770 struct kvm_memory_slot *memslot; 840 int r;
771 int r, i;
772 int n;
773 unsigned long any = 0;
774
775 mutex_lock(&kvm->lock);
776 841
777 r = -EINVAL; 842 r = -E2BIG;
778 if (log->slot >= KVM_MEMORY_SLOTS) 843 if (cpuid->nent < vcpu->arch.cpuid_nent)
779 goto out; 844 goto out;
780 845 r = -EFAULT;
781 memslot = &kvm->memslots[log->slot]; 846 if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
782 r = -ENOENT; 847 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
783 if (!memslot->dirty_bitmap)
784 goto out; 848 goto out;
849 return 0;
850
851out:
852 cpuid->nent = vcpu->arch.cpuid_nent;
853 return r;
854}
855
856static inline u32 bit(int bitno)
857{
858 return 1 << (bitno & 31);
859}
860
861static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
862 u32 index)
863{
864 entry->function = function;
865 entry->index = index;
866 cpuid_count(entry->function, entry->index,
867 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
868 entry->flags = 0;
869}
870
871static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
872 u32 index, int *nent, int maxnent)
873{
874 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
875 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
876 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
877 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
878 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
879 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
880 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
881 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
882 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
883 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
884 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
885 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
886 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
887 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
888 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
889 bit(X86_FEATURE_PGE) |
890 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
891 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
892 bit(X86_FEATURE_SYSCALL) |
893 (bit(X86_FEATURE_NX) && is_efer_nx()) |
894#ifdef CONFIG_X86_64
895 bit(X86_FEATURE_LM) |
896#endif
897 bit(X86_FEATURE_MMXEXT) |
898 bit(X86_FEATURE_3DNOWEXT) |
899 bit(X86_FEATURE_3DNOW);
900 const u32 kvm_supported_word3_x86_features =
901 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
902 const u32 kvm_supported_word6_x86_features =
903 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
904
905 /* all func 2 cpuid_count() should be called on the same cpu */
906 get_cpu();
907 do_cpuid_1_ent(entry, function, index);
908 ++*nent;
909
910 switch (function) {
911 case 0:
912 entry->eax = min(entry->eax, (u32)0xb);
913 break;
914 case 1:
915 entry->edx &= kvm_supported_word0_x86_features;
916 entry->ecx &= kvm_supported_word3_x86_features;
917 break;
918 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
919 * may return different values. This forces us to get_cpu() before
920 * issuing the first command, and also to emulate this annoying behavior
921 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
922 case 2: {
923 int t, times = entry->eax & 0xff;
924
925 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
926 for (t = 1; t < times && *nent < maxnent; ++t) {
927 do_cpuid_1_ent(&entry[t], function, 0);
928 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
929 ++*nent;
930 }
931 break;
932 }
933 /* function 4 and 0xb have additional index. */
934 case 4: {
935 int index, cache_type;
936
937 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
938 /* read more entries until cache_type is zero */
939 for (index = 1; *nent < maxnent; ++index) {
940 cache_type = entry[index - 1].eax & 0x1f;
941 if (!cache_type)
942 break;
943 do_cpuid_1_ent(&entry[index], function, index);
944 entry[index].flags |=
945 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
946 ++*nent;
947 }
948 break;
949 }
950 case 0xb: {
951 int index, level_type;
952
953 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
954 /* read more entries until level_type is zero */
955 for (index = 1; *nent < maxnent; ++index) {
956 level_type = entry[index - 1].ecx & 0xff;
957 if (!level_type)
958 break;
959 do_cpuid_1_ent(&entry[index], function, index);
960 entry[index].flags |=
961 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
962 ++*nent;
963 }
964 break;
965 }
966 case 0x80000000:
967 entry->eax = min(entry->eax, 0x8000001a);
968 break;
969 case 0x80000001:
970 entry->edx &= kvm_supported_word1_x86_features;
971 entry->ecx &= kvm_supported_word6_x86_features;
972 break;
973 }
974 put_cpu();
975}
785 976
786 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 977static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
978 struct kvm_cpuid2 *cpuid,
979 struct kvm_cpuid_entry2 __user *entries)
980{
981 struct kvm_cpuid_entry2 *cpuid_entries;
982 int limit, nent = 0, r = -E2BIG;
983 u32 func;
787 984
788 for (i = 0; !any && i < n/sizeof(long); ++i) 985 if (cpuid->nent < 1)
789 any = memslot->dirty_bitmap[i]; 986 goto out;
987 r = -ENOMEM;
988 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
989 if (!cpuid_entries)
990 goto out;
790 991
992 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
993 limit = cpuid_entries[0].eax;
994 for (func = 1; func <= limit && nent < cpuid->nent; ++func)
995 do_cpuid_ent(&cpuid_entries[nent], func, 0,
996 &nent, cpuid->nent);
997 r = -E2BIG;
998 if (nent >= cpuid->nent)
999 goto out_free;
1000
1001 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1002 limit = cpuid_entries[nent - 1].eax;
1003 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1004 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1005 &nent, cpuid->nent);
791 r = -EFAULT; 1006 r = -EFAULT;
792 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1007 if (copy_to_user(entries, cpuid_entries,
793 goto out; 1008 nent * sizeof(struct kvm_cpuid_entry2)))
1009 goto out_free;
1010 cpuid->nent = nent;
1011 r = 0;
794 1012
795 /* If nothing is dirty, don't bother messing with page tables. */ 1013out_free:
796 if (any) { 1014 vfree(cpuid_entries);
797 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1015out:
798 kvm_flush_remote_tlbs(kvm); 1016 return r;
799 memset(memslot->dirty_bitmap, 0, n); 1017}
1018
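In this tree KVM_GET_SUPPORTED_CPUID is still a VM ioctl (see the kvm_arch_vm_ioctl() case further down), and -E2BIG from kvm_vm_ioctl_get_supported_cpuid() means the caller's entry array was too small. A minimal userspace sketch of the intended flow; vm_fd, vcpu_fd and the helper name get_and_set_cpuid() are assumptions, not part of this patch:

#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Query the supported CPUID set from the VM fd, growing the buffer on
 * E2BIG, then program the result into one vcpu via KVM_SET_CPUID2. */
static struct kvm_cpuid2 *get_and_set_cpuid(int vm_fd, int vcpu_fd)
{
        struct kvm_cpuid2 *cpuid;
        int nent = 8;

        for (;;) {
                cpuid = calloc(1, sizeof(*cpuid) +
                                  nent * sizeof(struct kvm_cpuid_entry2));
                if (!cpuid)
                        return NULL;
                cpuid->nent = nent;
                if (ioctl(vm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) == 0)
                        break;
                free(cpuid);
                if (errno != E2BIG)
                        return NULL;
                nent *= 2;                      /* array too small, retry */
        }

        if (ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid) < 0) {
                free(cpuid);
                return NULL;
        }
        return cpuid;
}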
1019static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1020 struct kvm_lapic_state *s)
1021{
1022 vcpu_load(vcpu);
1023 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1024 vcpu_put(vcpu);
1025
1026 return 0;
1027}
1028
1029static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1030 struct kvm_lapic_state *s)
1031{
1032 vcpu_load(vcpu);
1033 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1034 kvm_apic_post_state_restore(vcpu);
1035 vcpu_put(vcpu);
1036
1037 return 0;
1038}
1039
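The two helpers above simply copy the in-kernel APIC register page to and from a struct kvm_lapic_state, so userspace can save and restore local APIC state, e.g. across migration. A sketch, assuming an in-kernel irqchip and a hypothetical vcpu_fd:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Round-trip the vcpu's local APIC registers through userspace. */
static int save_restore_lapic(int vcpu_fd)
{
        struct kvm_lapic_state lapic;

        if (ioctl(vcpu_fd, KVM_GET_LAPIC, &lapic) < 0)
                return -1;
        /* ... the register page could be serialized here ... */
        return ioctl(vcpu_fd, KVM_SET_LAPIC, &lapic);
}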
1040static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1041 struct kvm_interrupt *irq)
1042{
1043 if (irq->irq < 0 || irq->irq >= 256)
1044 return -EINVAL;
1045 if (irqchip_in_kernel(vcpu->kvm))
1046 return -ENXIO;
1047 vcpu_load(vcpu);
1048
1049 set_bit(irq->irq, vcpu->arch.irq_pending);
1050 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1051
1052 vcpu_put(vcpu);
1053
1054 return 0;
1055}
1056
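kvm_vcpu_ioctl_interrupt() only queues an interrupt when the interrupt controller is emulated in userspace; with an in-kernel irqchip it fails with -ENXIO and KVM_IRQ_LINE must be used instead. A caller sketch with a hypothetical vcpu_fd:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Inject an external interrupt vector into a vcpu whose PIC/APIC is
 * emulated in userspace (returns ENXIO after KVM_CREATE_IRQCHIP). */
static int inject_irq(int vcpu_fd, unsigned int vector)
{
        struct kvm_interrupt irq = { .irq = vector };

        return ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
}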
1057static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1058 struct kvm_tpr_access_ctl *tac)
1059{
1060 if (tac->flags)
1061 return -EINVAL;
1062 vcpu->arch.tpr_access_reporting = !!tac->enabled;
1063 return 0;
1064}
1065
1066long kvm_arch_vcpu_ioctl(struct file *filp,
1067 unsigned int ioctl, unsigned long arg)
1068{
1069 struct kvm_vcpu *vcpu = filp->private_data;
1070 void __user *argp = (void __user *)arg;
1071 int r;
1072
1073 switch (ioctl) {
1074 case KVM_GET_LAPIC: {
1075 struct kvm_lapic_state lapic;
1076
1077 memset(&lapic, 0, sizeof lapic);
1078 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
1079 if (r)
1080 goto out;
1081 r = -EFAULT;
1082 if (copy_to_user(argp, &lapic, sizeof lapic))
1083 goto out;
1084 r = 0;
1085 break;
800 } 1086 }
1087 case KVM_SET_LAPIC: {
1088 struct kvm_lapic_state lapic;
801 1089
802 r = 0; 1090 r = -EFAULT;
1091 if (copy_from_user(&lapic, argp, sizeof lapic))
1092 goto out;
1093 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
 1093 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
1094 if (r)
1095 goto out;
1096 r = 0;
1097 break;
1098 }
1099 case KVM_INTERRUPT: {
1100 struct kvm_interrupt irq;
1101
1102 r = -EFAULT;
1103 if (copy_from_user(&irq, argp, sizeof irq))
1104 goto out;
1105 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1106 if (r)
1107 goto out;
1108 r = 0;
1109 break;
1110 }
1111 case KVM_SET_CPUID: {
1112 struct kvm_cpuid __user *cpuid_arg = argp;
1113 struct kvm_cpuid cpuid;
803 1114
1115 r = -EFAULT;
1116 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1117 goto out;
1118 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1119 if (r)
1120 goto out;
1121 break;
1122 }
1123 case KVM_SET_CPUID2: {
1124 struct kvm_cpuid2 __user *cpuid_arg = argp;
1125 struct kvm_cpuid2 cpuid;
1126
1127 r = -EFAULT;
1128 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1129 goto out;
1130 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1131 cpuid_arg->entries);
1132 if (r)
1133 goto out;
1134 break;
1135 }
1136 case KVM_GET_CPUID2: {
1137 struct kvm_cpuid2 __user *cpuid_arg = argp;
1138 struct kvm_cpuid2 cpuid;
1139
1140 r = -EFAULT;
1141 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1142 goto out;
1143 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1144 cpuid_arg->entries);
1145 if (r)
1146 goto out;
1147 r = -EFAULT;
1148 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1149 goto out;
1150 r = 0;
1151 break;
1152 }
1153 case KVM_GET_MSRS:
1154 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1155 break;
1156 case KVM_SET_MSRS:
1157 r = msr_io(vcpu, argp, do_set_msr, 0);
1158 break;
1159 case KVM_TPR_ACCESS_REPORTING: {
1160 struct kvm_tpr_access_ctl tac;
1161
1162 r = -EFAULT;
1163 if (copy_from_user(&tac, argp, sizeof tac))
1164 goto out;
1165 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1166 if (r)
1167 goto out;
1168 r = -EFAULT;
1169 if (copy_to_user(argp, &tac, sizeof tac))
1170 goto out;
1171 r = 0;
1172 break;
 1173 }
1174 case KVM_SET_VAPIC_ADDR: {
1175 struct kvm_vapic_addr va;
1176
1177 r = -EINVAL;
1178 if (!irqchip_in_kernel(vcpu->kvm))
1179 goto out;
1180 r = -EFAULT;
1181 if (copy_from_user(&va, argp, sizeof va))
1182 goto out;
1183 r = 0;
1184 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1185 break;
1186 }
1187 default:
1188 r = -EINVAL;
1189 }
804out: 1190out:
805 mutex_unlock(&kvm->lock);
806 return r; 1191 return r;
807} 1192}
808 1193
1194static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1195{
1196 int ret;
1197
1198 if (addr > (unsigned int)(-3 * PAGE_SIZE))
1199 return -1;
1200 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1201 return ret;
1202}
1203
1204static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1205 u32 kvm_nr_mmu_pages)
1206{
1207 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1208 return -EINVAL;
1209
1210 down_write(&current->mm->mmap_sem);
1211
1212 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1213 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1214
1215 up_write(&current->mm->mmap_sem);
1216 return 0;
1217}
1218
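Both VM ioctls above take their value directly as the ioctl argument rather than through a pointer, as the kvm_arch_vm_ioctl() dispatch below shows. A sketch with a hypothetical vm_fd; the TSS address and page count are illustrative values only:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Place the three private TSS pages just below the 4G hole and size the
 * shadow MMU cache; both figures are examples, not requirements. */
static int tune_vm(int vm_fd)
{
        if (ioctl(vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000UL) < 0)
                return -1;
        return ioctl(vm_fd, KVM_SET_NR_MMU_PAGES, 256UL);
}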
1219static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1220{
1221 return kvm->arch.n_alloc_mmu_pages;
1222}
1223
1224gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1225{
1226 int i;
1227 struct kvm_mem_alias *alias;
1228
1229 for (i = 0; i < kvm->arch.naliases; ++i) {
1230 alias = &kvm->arch.aliases[i];
1231 if (gfn >= alias->base_gfn
1232 && gfn < alias->base_gfn + alias->npages)
1233 return alias->target_gfn + gfn - alias->base_gfn;
1234 }
1235 return gfn;
1236}
1237
809/* 1238/*
810 * Set a new alias region. Aliases map a portion of physical memory into 1239 * Set a new alias region. Aliases map a portion of physical memory into
811 * another portion. This is useful for memory windows, for example the PC 1240 * another portion. This is useful for memory windows, for example the PC
@@ -832,21 +1261,21 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
832 < alias->target_phys_addr) 1261 < alias->target_phys_addr)
833 goto out; 1262 goto out;
834 1263
835 mutex_lock(&kvm->lock); 1264 down_write(&current->mm->mmap_sem);
836 1265
837 p = &kvm->aliases[alias->slot]; 1266 p = &kvm->arch.aliases[alias->slot];
838 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1267 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
839 p->npages = alias->memory_size >> PAGE_SHIFT; 1268 p->npages = alias->memory_size >> PAGE_SHIFT;
840 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 1269 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
841 1270
842 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 1271 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
843 if (kvm->aliases[n - 1].npages) 1272 if (kvm->arch.aliases[n - 1].npages)
844 break; 1273 break;
845 kvm->naliases = n; 1274 kvm->arch.naliases = n;
846 1275
847 kvm_mmu_zap_all(kvm); 1276 kvm_mmu_zap_all(kvm);
848 1277
849 mutex_unlock(&kvm->lock); 1278 up_write(&current->mm->mmap_sem);
850 1279
851 return 0; 1280 return 0;
852 1281
@@ -861,17 +1290,17 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
861 r = 0; 1290 r = 0;
862 switch (chip->chip_id) { 1291 switch (chip->chip_id) {
863 case KVM_IRQCHIP_PIC_MASTER: 1292 case KVM_IRQCHIP_PIC_MASTER:
864 memcpy (&chip->chip.pic, 1293 memcpy(&chip->chip.pic,
865 &pic_irqchip(kvm)->pics[0], 1294 &pic_irqchip(kvm)->pics[0],
866 sizeof(struct kvm_pic_state)); 1295 sizeof(struct kvm_pic_state));
867 break; 1296 break;
868 case KVM_IRQCHIP_PIC_SLAVE: 1297 case KVM_IRQCHIP_PIC_SLAVE:
869 memcpy (&chip->chip.pic, 1298 memcpy(&chip->chip.pic,
870 &pic_irqchip(kvm)->pics[1], 1299 &pic_irqchip(kvm)->pics[1],
871 sizeof(struct kvm_pic_state)); 1300 sizeof(struct kvm_pic_state));
872 break; 1301 break;
873 case KVM_IRQCHIP_IOAPIC: 1302 case KVM_IRQCHIP_IOAPIC:
874 memcpy (&chip->chip.ioapic, 1303 memcpy(&chip->chip.ioapic,
875 ioapic_irqchip(kvm), 1304 ioapic_irqchip(kvm),
876 sizeof(struct kvm_ioapic_state)); 1305 sizeof(struct kvm_ioapic_state));
877 break; 1306 break;
@@ -889,17 +1318,17 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
889 r = 0; 1318 r = 0;
890 switch (chip->chip_id) { 1319 switch (chip->chip_id) {
891 case KVM_IRQCHIP_PIC_MASTER: 1320 case KVM_IRQCHIP_PIC_MASTER:
892 memcpy (&pic_irqchip(kvm)->pics[0], 1321 memcpy(&pic_irqchip(kvm)->pics[0],
893 &chip->chip.pic, 1322 &chip->chip.pic,
894 sizeof(struct kvm_pic_state)); 1323 sizeof(struct kvm_pic_state));
895 break; 1324 break;
896 case KVM_IRQCHIP_PIC_SLAVE: 1325 case KVM_IRQCHIP_PIC_SLAVE:
897 memcpy (&pic_irqchip(kvm)->pics[1], 1326 memcpy(&pic_irqchip(kvm)->pics[1],
898 &chip->chip.pic, 1327 &chip->chip.pic,
899 sizeof(struct kvm_pic_state)); 1328 sizeof(struct kvm_pic_state));
900 break; 1329 break;
901 case KVM_IRQCHIP_IOAPIC: 1330 case KVM_IRQCHIP_IOAPIC:
902 memcpy (ioapic_irqchip(kvm), 1331 memcpy(ioapic_irqchip(kvm),
903 &chip->chip.ioapic, 1332 &chip->chip.ioapic,
904 sizeof(struct kvm_ioapic_state)); 1333 sizeof(struct kvm_ioapic_state));
905 break; 1334 break;
@@ -911,110 +1340,191 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
911 return r; 1340 return r;
912} 1341}
913 1342
914static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 1343/*
1344 * Get (and clear) the dirty memory log for a memory slot.
1345 */
1346int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1347 struct kvm_dirty_log *log)
915{ 1348{
916 int i; 1349 int r;
917 struct kvm_mem_alias *alias; 1350 int n;
918 1351 struct kvm_memory_slot *memslot;
919 for (i = 0; i < kvm->naliases; ++i) { 1352 int is_dirty = 0;
920 alias = &kvm->aliases[i];
921 if (gfn >= alias->base_gfn
922 && gfn < alias->base_gfn + alias->npages)
923 return alias->target_gfn + gfn - alias->base_gfn;
924 }
925 return gfn;
926}
927 1353
928static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1354 down_write(&current->mm->mmap_sem);
929{
930 int i;
931 1355
932 for (i = 0; i < kvm->nmemslots; ++i) { 1356 r = kvm_get_dirty_log(kvm, log, &is_dirty);
933 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 1357 if (r)
1358 goto out;
934 1359
935 if (gfn >= memslot->base_gfn 1360 /* If nothing is dirty, don't bother messing with page tables. */
936 && gfn < memslot->base_gfn + memslot->npages) 1361 if (is_dirty) {
937 return memslot; 1362 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1363 kvm_flush_remote_tlbs(kvm);
1364 memslot = &kvm->memslots[log->slot];
1365 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1366 memset(memslot->dirty_bitmap, 0, n);
938 } 1367 }
939 return NULL; 1368 r = 0;
940} 1369out:
941 1370 up_write(&current->mm->mmap_sem);
942struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1371 return r;
943{
944 gfn = unalias_gfn(kvm, gfn);
945 return __gfn_to_memslot(kvm, gfn);
946}
947
948struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
949{
950 struct kvm_memory_slot *slot;
951
952 gfn = unalias_gfn(kvm, gfn);
953 slot = __gfn_to_memslot(kvm, gfn);
954 if (!slot)
955 return NULL;
956 return slot->phys_mem[gfn - slot->base_gfn];
957} 1372}
958EXPORT_SYMBOL_GPL(gfn_to_page);
959 1373
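The rewritten kvm_vm_ioctl_get_dirty_log() defers the bitmap copy to the generic kvm_get_dirty_log() and only write-protects the slot and flushes TLBs when something was actually dirty. From userspace the ioctl fills a caller-supplied bitmap, one bit per page of the slot. A sketch assuming the slot was registered with dirty logging enabled, a 64-bit host, and hypothetical vm_fd/slot/npages parameters:

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Fetch-and-clear the dirty bitmap for one memory slot.  The size mirrors
 * the kernel's ALIGN(npages, BITS_PER_LONG) / 8 (BITS_PER_LONG == 64). */
static int sync_dirty_log(int vm_fd, unsigned int slot,
                          unsigned long npages, unsigned long **bitmap)
{
        struct kvm_dirty_log log;
        size_t len = ((npages + 63) / 64) * 8;

        *bitmap = calloc(1, len);
        if (!*bitmap)
                return -1;

        memset(&log, 0, sizeof(log));
        log.slot = slot;
        log.dirty_bitmap = *bitmap;
        return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}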
960/* WARNING: Does not work on aliased pages. */ 1374long kvm_arch_vm_ioctl(struct file *filp,
961void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1375 unsigned int ioctl, unsigned long arg)
962{ 1376{
963 struct kvm_memory_slot *memslot; 1377 struct kvm *kvm = filp->private_data;
1378 void __user *argp = (void __user *)arg;
1379 int r = -EINVAL;
964 1380
965 memslot = __gfn_to_memslot(kvm, gfn); 1381 switch (ioctl) {
966 if (memslot && memslot->dirty_bitmap) { 1382 case KVM_SET_TSS_ADDR:
967 unsigned long rel_gfn = gfn - memslot->base_gfn; 1383 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1384 if (r < 0)
1385 goto out;
1386 break;
1387 case KVM_SET_MEMORY_REGION: {
1388 struct kvm_memory_region kvm_mem;
1389 struct kvm_userspace_memory_region kvm_userspace_mem;
968 1390
969 /* avoid RMW */ 1391 r = -EFAULT;
970 if (!test_bit(rel_gfn, memslot->dirty_bitmap)) 1392 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
971 set_bit(rel_gfn, memslot->dirty_bitmap); 1393 goto out;
1394 kvm_userspace_mem.slot = kvm_mem.slot;
1395 kvm_userspace_mem.flags = kvm_mem.flags;
1396 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1397 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1398 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1399 if (r)
1400 goto out;
1401 break;
972 } 1402 }
973} 1403 case KVM_SET_NR_MMU_PAGES:
1404 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1405 if (r)
1406 goto out;
1407 break;
1408 case KVM_GET_NR_MMU_PAGES:
1409 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1410 break;
1411 case KVM_SET_MEMORY_ALIAS: {
1412 struct kvm_memory_alias alias;
974 1413
975int emulator_read_std(unsigned long addr, 1414 r = -EFAULT;
976 void *val, 1415 if (copy_from_user(&alias, argp, sizeof alias))
977 unsigned int bytes, 1416 goto out;
978 struct kvm_vcpu *vcpu) 1417 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
979{ 1418 if (r)
980 void *data = val; 1419 goto out;
1420 break;
1421 }
1422 case KVM_CREATE_IRQCHIP:
1423 r = -ENOMEM;
1424 kvm->arch.vpic = kvm_create_pic(kvm);
1425 if (kvm->arch.vpic) {
1426 r = kvm_ioapic_init(kvm);
1427 if (r) {
1428 kfree(kvm->arch.vpic);
1429 kvm->arch.vpic = NULL;
1430 goto out;
1431 }
1432 } else
1433 goto out;
1434 break;
1435 case KVM_IRQ_LINE: {
1436 struct kvm_irq_level irq_event;
981 1437
982 while (bytes) { 1438 r = -EFAULT;
983 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1439 if (copy_from_user(&irq_event, argp, sizeof irq_event))
984 unsigned offset = addr & (PAGE_SIZE-1); 1440 goto out;
985 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); 1441 if (irqchip_in_kernel(kvm)) {
986 unsigned long pfn; 1442 mutex_lock(&kvm->lock);
987 struct page *page; 1443 if (irq_event.irq < 16)
988 void *page_virt; 1444 kvm_pic_set_irq(pic_irqchip(kvm),
1445 irq_event.irq,
1446 irq_event.level);
1447 kvm_ioapic_set_irq(kvm->arch.vioapic,
1448 irq_event.irq,
1449 irq_event.level);
1450 mutex_unlock(&kvm->lock);
1451 r = 0;
1452 }
1453 break;
1454 }
1455 case KVM_GET_IRQCHIP: {
1456 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1457 struct kvm_irqchip chip;
989 1458
990 if (gpa == UNMAPPED_GVA) 1459 r = -EFAULT;
991 return X86EMUL_PROPAGATE_FAULT; 1460 if (copy_from_user(&chip, argp, sizeof chip))
992 pfn = gpa >> PAGE_SHIFT; 1461 goto out;
993 page = gfn_to_page(vcpu->kvm, pfn); 1462 r = -ENXIO;
994 if (!page) 1463 if (!irqchip_in_kernel(kvm))
995 return X86EMUL_UNHANDLEABLE; 1464 goto out;
996 page_virt = kmap_atomic(page, KM_USER0); 1465 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
1466 if (r)
1467 goto out;
1468 r = -EFAULT;
1469 if (copy_to_user(argp, &chip, sizeof chip))
1470 goto out;
1471 r = 0;
1472 break;
1473 }
1474 case KVM_SET_IRQCHIP: {
1475 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1476 struct kvm_irqchip chip;
997 1477
998 memcpy(data, page_virt + offset, tocopy); 1478 r = -EFAULT;
1479 if (copy_from_user(&chip, argp, sizeof chip))
1480 goto out;
1481 r = -ENXIO;
1482 if (!irqchip_in_kernel(kvm))
1483 goto out;
1484 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
1485 if (r)
1486 goto out;
1487 r = 0;
1488 break;
1489 }
1490 case KVM_GET_SUPPORTED_CPUID: {
1491 struct kvm_cpuid2 __user *cpuid_arg = argp;
1492 struct kvm_cpuid2 cpuid;
999 1493
1000 kunmap_atomic(page_virt, KM_USER0); 1494 r = -EFAULT;
1495 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1496 goto out;
1497 r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
1498 cpuid_arg->entries);
1499 if (r)
1500 goto out;
1001 1501
1002 bytes -= tocopy; 1502 r = -EFAULT;
1003 data += tocopy; 1503 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1004 addr += tocopy; 1504 goto out;
1505 r = 0;
1506 break;
1005 } 1507 }
1006 1508 default:
1007 return X86EMUL_CONTINUE; 1509 ;
1510 }
1511out:
1512 return r;
1008} 1513}
1009EXPORT_SYMBOL_GPL(emulator_read_std);
1010 1514
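Once KVM_CREATE_IRQCHIP has instantiated the in-kernel PIC and IOAPIC, interrupts are raised through KVM_IRQ_LINE; as the handler above shows, GSIs below 16 are fed to the PIC as well as the IOAPIC. A sketch that pulses one line, with a hypothetical vm_fd:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Assert and then de-assert a legacy interrupt line via the in-kernel
 * irqchip (requires a prior successful KVM_CREATE_IRQCHIP). */
static int pulse_irq(int vm_fd, unsigned int gsi)
{
        struct kvm_irq_level ev = { .irq = gsi, .level = 1 };

        if (ioctl(vm_fd, KVM_IRQ_LINE, &ev) < 0)
                return -1;
        ev.level = 0;
        return ioctl(vm_fd, KVM_IRQ_LINE, &ev);
}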
1011static int emulator_write_std(unsigned long addr, 1515static void kvm_init_msr_list(void)
1012 const void *val,
1013 unsigned int bytes,
1014 struct kvm_vcpu *vcpu)
1015{ 1516{
1016 pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes); 1517 u32 dummy[2];
1017 return X86EMUL_UNHANDLEABLE; 1518 unsigned i, j;
1519
1520 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1521 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1522 continue;
1523 if (j < i)
1524 msrs_to_save[j] = msrs_to_save[i];
1525 j++;
1526 }
1527 num_msrs_to_save = j;
1018} 1528}
1019 1529
1020/* 1530/*
@@ -1025,14 +1535,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1025{ 1535{
1026 struct kvm_io_device *dev; 1536 struct kvm_io_device *dev;
1027 1537
1028 if (vcpu->apic) { 1538 if (vcpu->arch.apic) {
1029 dev = &vcpu->apic->dev; 1539 dev = &vcpu->arch.apic->dev;
1030 if (dev->in_range(dev, addr)) 1540 if (dev->in_range(dev, addr))
1031 return dev; 1541 return dev;
1032 } 1542 }
1033 return NULL; 1543 return NULL;
1034} 1544}
1035 1545
1546
1036static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 1547static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1037 gpa_t addr) 1548 gpa_t addr)
1038{ 1549{
@@ -1044,11 +1555,40 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1044 return dev; 1555 return dev;
1045} 1556}
1046 1557
1047static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 1558int emulator_read_std(unsigned long addr,
1048 gpa_t addr) 1559 void *val,
1560 unsigned int bytes,
1561 struct kvm_vcpu *vcpu)
1049{ 1562{
1050 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); 1563 void *data = val;
1564 int r = X86EMUL_CONTINUE;
1565
1566 down_read(&current->mm->mmap_sem);
1567 while (bytes) {
1568 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1569 unsigned offset = addr & (PAGE_SIZE-1);
1570 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1571 int ret;
1572
1573 if (gpa == UNMAPPED_GVA) {
1574 r = X86EMUL_PROPAGATE_FAULT;
1575 goto out;
1576 }
1577 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1578 if (ret < 0) {
1579 r = X86EMUL_UNHANDLEABLE;
1580 goto out;
1581 }
1582
1583 bytes -= tocopy;
1584 data += tocopy;
1585 addr += tocopy;
1586 }
1587out:
1588 up_read(&current->mm->mmap_sem);
1589 return r;
1051} 1590}
1591EXPORT_SYMBOL_GPL(emulator_read_std);
1052 1592
1053static int emulator_read_emulated(unsigned long addr, 1593static int emulator_read_emulated(unsigned long addr,
1054 void *val, 1594 void *val,
@@ -1062,22 +1602,34 @@ static int emulator_read_emulated(unsigned long addr,
1062 memcpy(val, vcpu->mmio_data, bytes); 1602 memcpy(val, vcpu->mmio_data, bytes);
1063 vcpu->mmio_read_completed = 0; 1603 vcpu->mmio_read_completed = 0;
1064 return X86EMUL_CONTINUE; 1604 return X86EMUL_CONTINUE;
1065 } else if (emulator_read_std(addr, val, bytes, vcpu) 1605 }
1066 == X86EMUL_CONTINUE) 1606
1067 return X86EMUL_CONTINUE; 1607 down_read(&current->mm->mmap_sem);
1608 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1609 up_read(&current->mm->mmap_sem);
1610
1611 /* For APIC access vmexit */
1612 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1613 goto mmio;
1068 1614
1069 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1615 if (emulator_read_std(addr, val, bytes, vcpu)
1616 == X86EMUL_CONTINUE)
1617 return X86EMUL_CONTINUE;
1070 if (gpa == UNMAPPED_GVA) 1618 if (gpa == UNMAPPED_GVA)
1071 return X86EMUL_PROPAGATE_FAULT; 1619 return X86EMUL_PROPAGATE_FAULT;
1072 1620
1621mmio:
1073 /* 1622 /*
1074 * Is this MMIO handled locally? 1623 * Is this MMIO handled locally?
1075 */ 1624 */
1625 mutex_lock(&vcpu->kvm->lock);
1076 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1626 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1077 if (mmio_dev) { 1627 if (mmio_dev) {
1078 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 1628 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1629 mutex_unlock(&vcpu->kvm->lock);
1079 return X86EMUL_CONTINUE; 1630 return X86EMUL_CONTINUE;
1080 } 1631 }
1632 mutex_unlock(&vcpu->kvm->lock);
1081 1633
1082 vcpu->mmio_needed = 1; 1634 vcpu->mmio_needed = 1;
1083 vcpu->mmio_phys_addr = gpa; 1635 vcpu->mmio_phys_addr = gpa;
@@ -1090,19 +1642,16 @@ static int emulator_read_emulated(unsigned long addr,
1090static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 1642static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1091 const void *val, int bytes) 1643 const void *val, int bytes)
1092{ 1644{
1093 struct page *page; 1645 int ret;
1094 void *virt;
1095 1646
1096 if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) 1647 down_read(&current->mm->mmap_sem);
1648 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1649 if (ret < 0) {
1650 up_read(&current->mm->mmap_sem);
1097 return 0; 1651 return 0;
1098 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 1652 }
1099 if (!page)
1100 return 0;
1101 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1102 virt = kmap_atomic(page, KM_USER0);
1103 kvm_mmu_pte_write(vcpu, gpa, val, bytes); 1653 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1104 memcpy(virt + offset_in_page(gpa), val, bytes); 1654 up_read(&current->mm->mmap_sem);
1105 kunmap_atomic(virt, KM_USER0);
1106 return 1; 1655 return 1;
1107} 1656}
1108 1657
@@ -1112,24 +1661,36 @@ static int emulator_write_emulated_onepage(unsigned long addr,
1112 struct kvm_vcpu *vcpu) 1661 struct kvm_vcpu *vcpu)
1113{ 1662{
1114 struct kvm_io_device *mmio_dev; 1663 struct kvm_io_device *mmio_dev;
1115 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1664 gpa_t gpa;
1665
1666 down_read(&current->mm->mmap_sem);
1667 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1668 up_read(&current->mm->mmap_sem);
1116 1669
1117 if (gpa == UNMAPPED_GVA) { 1670 if (gpa == UNMAPPED_GVA) {
1118 kvm_x86_ops->inject_page_fault(vcpu, addr, 2); 1671 kvm_inject_page_fault(vcpu, addr, 2);
1119 return X86EMUL_PROPAGATE_FAULT; 1672 return X86EMUL_PROPAGATE_FAULT;
1120 } 1673 }
1121 1674
1675 /* For APIC access vmexit */
1676 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1677 goto mmio;
1678
1122 if (emulator_write_phys(vcpu, gpa, val, bytes)) 1679 if (emulator_write_phys(vcpu, gpa, val, bytes))
1123 return X86EMUL_CONTINUE; 1680 return X86EMUL_CONTINUE;
1124 1681
1682mmio:
1125 /* 1683 /*
1126 * Is this MMIO handled locally? 1684 * Is this MMIO handled locally?
1127 */ 1685 */
1686 mutex_lock(&vcpu->kvm->lock);
1128 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1687 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1129 if (mmio_dev) { 1688 if (mmio_dev) {
1130 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 1689 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1690 mutex_unlock(&vcpu->kvm->lock);
1131 return X86EMUL_CONTINUE; 1691 return X86EMUL_CONTINUE;
1132 } 1692 }
1693 mutex_unlock(&vcpu->kvm->lock);
1133 1694
1134 vcpu->mmio_needed = 1; 1695 vcpu->mmio_needed = 1;
1135 vcpu->mmio_phys_addr = gpa; 1696 vcpu->mmio_phys_addr = gpa;
@@ -1173,6 +1734,35 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
1173 reported = 1; 1734 reported = 1;
1174 printk(KERN_WARNING "kvm: emulating exchange as write\n"); 1735 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1175 } 1736 }
1737#ifndef CONFIG_X86_64
1738 /* guests cmpxchg8b have to be emulated atomically */
1739 if (bytes == 8) {
1740 gpa_t gpa;
1741 struct page *page;
1742 char *addr;
1743 u64 val;
1744
1745 down_read(&current->mm->mmap_sem);
1746 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1747
1748 if (gpa == UNMAPPED_GVA ||
1749 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1750 goto emul_write;
1751
1752 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
1753 goto emul_write;
1754
1755 val = *(u64 *)new;
1756 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1757 addr = kmap_atomic(page, KM_USER0);
1758 set_64bit((u64 *)(addr + offset_in_page(gpa)), val);
1759 kunmap_atomic(addr, KM_USER0);
1760 kvm_release_page_dirty(page);
1761 emul_write:
1762 up_read(&current->mm->mmap_sem);
1763 }
1764#endif
1765
1176 return emulator_write_emulated(addr, new, bytes, vcpu); 1766 return emulator_write_emulated(addr, new, bytes, vcpu);
1177} 1767}
1178 1768
@@ -1188,11 +1778,11 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1188 1778
1189int emulate_clts(struct kvm_vcpu *vcpu) 1779int emulate_clts(struct kvm_vcpu *vcpu)
1190{ 1780{
1191 kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS); 1781 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
1192 return X86EMUL_CONTINUE; 1782 return X86EMUL_CONTINUE;
1193} 1783}
1194 1784
1195int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) 1785int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1196{ 1786{
1197 struct kvm_vcpu *vcpu = ctxt->vcpu; 1787 struct kvm_vcpu *vcpu = ctxt->vcpu;
1198 1788
@@ -1223,7 +1813,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1223{ 1813{
1224 static int reported; 1814 static int reported;
1225 u8 opcodes[4]; 1815 u8 opcodes[4];
1226 unsigned long rip = vcpu->rip; 1816 unsigned long rip = vcpu->arch.rip;
1227 unsigned long rip_linear; 1817 unsigned long rip_linear;
1228 1818
1229 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 1819 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
@@ -1241,7 +1831,6 @@ EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1241 1831
1242struct x86_emulate_ops emulate_ops = { 1832struct x86_emulate_ops emulate_ops = {
1243 .read_std = emulator_read_std, 1833 .read_std = emulator_read_std,
1244 .write_std = emulator_write_std,
1245 .read_emulated = emulator_read_emulated, 1834 .read_emulated = emulator_read_emulated,
1246 .write_emulated = emulator_write_emulated, 1835 .write_emulated = emulator_write_emulated,
1247 .cmpxchg_emulated = emulator_cmpxchg_emulated, 1836 .cmpxchg_emulated = emulator_cmpxchg_emulated,
@@ -1250,44 +1839,74 @@ struct x86_emulate_ops emulate_ops = {
1250int emulate_instruction(struct kvm_vcpu *vcpu, 1839int emulate_instruction(struct kvm_vcpu *vcpu,
1251 struct kvm_run *run, 1840 struct kvm_run *run,
1252 unsigned long cr2, 1841 unsigned long cr2,
1253 u16 error_code) 1842 u16 error_code,
1843 int emulation_type)
1254{ 1844{
1255 struct x86_emulate_ctxt emulate_ctxt;
1256 int r; 1845 int r;
1257 int cs_db, cs_l; 1846 struct decode_cache *c;
1258 1847
1259 vcpu->mmio_fault_cr2 = cr2; 1848 vcpu->arch.mmio_fault_cr2 = cr2;
1260 kvm_x86_ops->cache_regs(vcpu); 1849 kvm_x86_ops->cache_regs(vcpu);
1261 1850
1262 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 1851 vcpu->mmio_is_write = 0;
1263 1852 vcpu->arch.pio.string = 0;
1264 emulate_ctxt.vcpu = vcpu; 1853
1265 emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 1854 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
1266 emulate_ctxt.cr2 = cr2; 1855 int cs_db, cs_l;
1267 emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) 1856 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1268 ? X86EMUL_MODE_REAL : cs_l 1857
1269 ? X86EMUL_MODE_PROT64 : cs_db 1858 vcpu->arch.emulate_ctxt.vcpu = vcpu;
1270 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 1859 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1271 1860 vcpu->arch.emulate_ctxt.mode =
1272 if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { 1861 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
1273 emulate_ctxt.cs_base = 0; 1862 ? X86EMUL_MODE_REAL : cs_l
1274 emulate_ctxt.ds_base = 0; 1863 ? X86EMUL_MODE_PROT64 : cs_db
1275 emulate_ctxt.es_base = 0; 1864 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1276 emulate_ctxt.ss_base = 0; 1865
1277 } else { 1866 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1278 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS); 1867 vcpu->arch.emulate_ctxt.cs_base = 0;
1279 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS); 1868 vcpu->arch.emulate_ctxt.ds_base = 0;
1280 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES); 1869 vcpu->arch.emulate_ctxt.es_base = 0;
1281 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS); 1870 vcpu->arch.emulate_ctxt.ss_base = 0;
1871 } else {
1872 vcpu->arch.emulate_ctxt.cs_base =
1873 get_segment_base(vcpu, VCPU_SREG_CS);
1874 vcpu->arch.emulate_ctxt.ds_base =
1875 get_segment_base(vcpu, VCPU_SREG_DS);
1876 vcpu->arch.emulate_ctxt.es_base =
1877 get_segment_base(vcpu, VCPU_SREG_ES);
1878 vcpu->arch.emulate_ctxt.ss_base =
1879 get_segment_base(vcpu, VCPU_SREG_SS);
1880 }
1881
1882 vcpu->arch.emulate_ctxt.gs_base =
1883 get_segment_base(vcpu, VCPU_SREG_GS);
1884 vcpu->arch.emulate_ctxt.fs_base =
1885 get_segment_base(vcpu, VCPU_SREG_FS);
1886
1887 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1888
1889 /* Reject the instructions other than VMCALL/VMMCALL when
1890 * try to emulate invalid opcode */
1891 c = &vcpu->arch.emulate_ctxt.decode;
1892 if ((emulation_type & EMULTYPE_TRAP_UD) &&
1893 (!(c->twobyte && c->b == 0x01 &&
1894 (c->modrm_reg == 0 || c->modrm_reg == 3) &&
1895 c->modrm_mod == 3 && c->modrm_rm == 1)))
1896 return EMULATE_FAIL;
1897
1898 ++vcpu->stat.insn_emulation;
1899 if (r) {
1900 ++vcpu->stat.insn_emulation_fail;
1901 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1902 return EMULATE_DONE;
1903 return EMULATE_FAIL;
1904 }
1282 } 1905 }
1283 1906
1284 emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS); 1907 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1285 emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1286 1908
1287 vcpu->mmio_is_write = 0; 1909 if (vcpu->arch.pio.string)
1288 vcpu->pio.string = 0;
1289 r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1290 if (vcpu->pio.string)
1291 return EMULATE_DO_MMIO; 1910 return EMULATE_DO_MMIO;
1292 1911
1293 if ((r || vcpu->mmio_is_write) && run) { 1912 if ((r || vcpu->mmio_is_write) && run) {
@@ -1309,7 +1928,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1309 } 1928 }
1310 1929
1311 kvm_x86_ops->decache_regs(vcpu); 1930 kvm_x86_ops->decache_regs(vcpu);
1312 kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags); 1931 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
1313 1932
1314 if (vcpu->mmio_is_write) { 1933 if (vcpu->mmio_is_write) {
1315 vcpu->mmio_needed = 0; 1934 vcpu->mmio_needed = 0;
@@ -1320,439 +1939,45 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1320} 1939}
1321EXPORT_SYMBOL_GPL(emulate_instruction); 1940EXPORT_SYMBOL_GPL(emulate_instruction);
1322 1941
1323/* 1942static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
1324 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1325 */
1326static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1327{
1328 DECLARE_WAITQUEUE(wait, current);
1329
1330 add_wait_queue(&vcpu->wq, &wait);
1331
1332 /*
1333 * We will block until either an interrupt or a signal wakes us up
1334 */
1335 while (!kvm_cpu_has_interrupt(vcpu)
1336 && !signal_pending(current)
1337 && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
1338 && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
1339 set_current_state(TASK_INTERRUPTIBLE);
1340 vcpu_put(vcpu);
1341 schedule();
1342 vcpu_load(vcpu);
1343 }
1344
1345 __set_current_state(TASK_RUNNING);
1346 remove_wait_queue(&vcpu->wq, &wait);
1347}
1348
1349int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1350{
1351 ++vcpu->stat.halt_exits;
1352 if (irqchip_in_kernel(vcpu->kvm)) {
1353 vcpu->mp_state = VCPU_MP_STATE_HALTED;
1354 kvm_vcpu_block(vcpu);
1355 if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
1356 return -EINTR;
1357 return 1;
1358 } else {
1359 vcpu->run->exit_reason = KVM_EXIT_HLT;
1360 return 0;
1361 }
1362}
1363EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1364
1365int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1366{
1367 unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1368
1369 kvm_x86_ops->cache_regs(vcpu);
1370 ret = -KVM_EINVAL;
1371#ifdef CONFIG_X86_64
1372 if (is_long_mode(vcpu)) {
1373 nr = vcpu->regs[VCPU_REGS_RAX];
1374 a0 = vcpu->regs[VCPU_REGS_RDI];
1375 a1 = vcpu->regs[VCPU_REGS_RSI];
1376 a2 = vcpu->regs[VCPU_REGS_RDX];
1377 a3 = vcpu->regs[VCPU_REGS_RCX];
1378 a4 = vcpu->regs[VCPU_REGS_R8];
1379 a5 = vcpu->regs[VCPU_REGS_R9];
1380 } else
1381#endif
1382 {
1383 nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1384 a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1385 a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1386 a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1387 a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1388 a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1389 a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1390 }
1391 switch (nr) {
1392 default:
1393 run->hypercall.nr = nr;
1394 run->hypercall.args[0] = a0;
1395 run->hypercall.args[1] = a1;
1396 run->hypercall.args[2] = a2;
1397 run->hypercall.args[3] = a3;
1398 run->hypercall.args[4] = a4;
1399 run->hypercall.args[5] = a5;
1400 run->hypercall.ret = ret;
1401 run->hypercall.longmode = is_long_mode(vcpu);
1402 kvm_x86_ops->decache_regs(vcpu);
1403 return 0;
1404 }
1405 vcpu->regs[VCPU_REGS_RAX] = ret;
1406 kvm_x86_ops->decache_regs(vcpu);
1407 return 1;
1408}
1409EXPORT_SYMBOL_GPL(kvm_hypercall);
1410
1411static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1412{
1413 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1414}
1415
1416void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1417{
1418 struct descriptor_table dt = { limit, base };
1419
1420 kvm_x86_ops->set_gdt(vcpu, &dt);
1421}
1422
1423void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1424{
1425 struct descriptor_table dt = { limit, base };
1426
1427 kvm_x86_ops->set_idt(vcpu, &dt);
1428}
1429
1430void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1431 unsigned long *rflags)
1432{
1433 lmsw(vcpu, msw);
1434 *rflags = kvm_x86_ops->get_rflags(vcpu);
1435}
1436
1437unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1438{
1439 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1440 switch (cr) {
1441 case 0:
1442 return vcpu->cr0;
1443 case 2:
1444 return vcpu->cr2;
1445 case 3:
1446 return vcpu->cr3;
1447 case 4:
1448 return vcpu->cr4;
1449 default:
1450 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1451 return 0;
1452 }
1453}
1454
1455void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1456 unsigned long *rflags)
1457{
1458 switch (cr) {
1459 case 0:
1460 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1461 *rflags = kvm_x86_ops->get_rflags(vcpu);
1462 break;
1463 case 2:
1464 vcpu->cr2 = val;
1465 break;
1466 case 3:
1467 set_cr3(vcpu, val);
1468 break;
1469 case 4:
1470 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1471 break;
1472 default:
1473 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1474 }
1475}
1476
1477/*
1478 * Register the para guest with the host:
1479 */
1480static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1481{
1482 struct kvm_vcpu_para_state *para_state;
1483 hpa_t para_state_hpa, hypercall_hpa;
1484 struct page *para_state_page;
1485 unsigned char *hypercall;
1486 gpa_t hypercall_gpa;
1487
1488 printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1489 printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1490
1491 /*
1492 * Needs to be page aligned:
1493 */
1494 if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1495 goto err_gp;
1496
1497 para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1498 printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1499 if (is_error_hpa(para_state_hpa))
1500 goto err_gp;
1501
1502 mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1503 para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1504 para_state = kmap(para_state_page);
1505
1506 printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version);
1507 printk(KERN_DEBUG ".... size: %d\n", para_state->size);
1508
1509 para_state->host_version = KVM_PARA_API_VERSION;
1510 /*
1511 * We cannot support guests that try to register themselves
1512 * with a newer API version than the host supports:
1513 */
1514 if (para_state->guest_version > KVM_PARA_API_VERSION) {
1515 para_state->ret = -KVM_EINVAL;
1516 goto err_kunmap_skip;
1517 }
1518
1519 hypercall_gpa = para_state->hypercall_gpa;
1520 hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1521 printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1522 if (is_error_hpa(hypercall_hpa)) {
1523 para_state->ret = -KVM_EINVAL;
1524 goto err_kunmap_skip;
1525 }
1526
1527 printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1528 vcpu->para_state_page = para_state_page;
1529 vcpu->para_state_gpa = para_state_gpa;
1530 vcpu->hypercall_gpa = hypercall_gpa;
1531
1532 mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1533 hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1534 KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1535 kvm_x86_ops->patch_hypercall(vcpu, hypercall);
1536 kunmap_atomic(hypercall, KM_USER1);
1537
1538 para_state->ret = 0;
1539err_kunmap_skip:
1540 kunmap(para_state_page);
1541 return 0;
1542err_gp:
1543 return 1;
1544}
1545
1546int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1547{
1548 u64 data;
1549
1550 switch (msr) {
1551 case 0xc0010010: /* SYSCFG */
1552 case 0xc0010015: /* HWCR */
1553 case MSR_IA32_PLATFORM_ID:
1554 case MSR_IA32_P5_MC_ADDR:
1555 case MSR_IA32_P5_MC_TYPE:
1556 case MSR_IA32_MC0_CTL:
1557 case MSR_IA32_MCG_STATUS:
1558 case MSR_IA32_MCG_CAP:
1559 case MSR_IA32_MC0_MISC:
1560 case MSR_IA32_MC0_MISC+4:
1561 case MSR_IA32_MC0_MISC+8:
1562 case MSR_IA32_MC0_MISC+12:
1563 case MSR_IA32_MC0_MISC+16:
1564 case MSR_IA32_UCODE_REV:
1565 case MSR_IA32_PERF_STATUS:
1566 case MSR_IA32_EBL_CR_POWERON:
1567 /* MTRR registers */
1568 case 0xfe:
1569 case 0x200 ... 0x2ff:
1570 data = 0;
1571 break;
1572 case 0xcd: /* fsb frequency */
1573 data = 3;
1574 break;
1575 case MSR_IA32_APICBASE:
1576 data = kvm_get_apic_base(vcpu);
1577 break;
1578 case MSR_IA32_MISC_ENABLE:
1579 data = vcpu->ia32_misc_enable_msr;
1580 break;
1581#ifdef CONFIG_X86_64
1582 case MSR_EFER:
1583 data = vcpu->shadow_efer;
1584 break;
1585#endif
1586 default:
1587 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1588 return 1;
1589 }
1590 *pdata = data;
1591 return 0;
1592}
1593EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1594
1595/*
1596 * Reads an msr value (of 'msr_index') into 'pdata'.
1597 * Returns 0 on success, non-0 otherwise.
1598 * Assumes vcpu_load() was already called.
1599 */
1600int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1601{
1602 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1603}
1604
1605#ifdef CONFIG_X86_64
1606
1607static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1608{
1609 if (efer & EFER_RESERVED_BITS) {
1610 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1611 efer);
1612 inject_gp(vcpu);
1613 return;
1614 }
1615
1616 if (is_paging(vcpu)
1617 && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1618 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1619 inject_gp(vcpu);
1620 return;
1621 }
1622
1623 kvm_x86_ops->set_efer(vcpu, efer);
1624
1625 efer &= ~EFER_LMA;
1626 efer |= vcpu->shadow_efer & EFER_LMA;
1627
1628 vcpu->shadow_efer = efer;
1629}
1630
1631#endif
1632
1633int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1634{
1635 switch (msr) {
1636#ifdef CONFIG_X86_64
1637 case MSR_EFER:
1638 set_efer(vcpu, data);
1639 break;
1640#endif
1641 case MSR_IA32_MC0_STATUS:
1642 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1643 __FUNCTION__, data);
1644 break;
1645 case MSR_IA32_MCG_STATUS:
1646 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1647 __FUNCTION__, data);
1648 break;
1649 case MSR_IA32_UCODE_REV:
1650 case MSR_IA32_UCODE_WRITE:
1651 case 0x200 ... 0x2ff: /* MTRRs */
1652 break;
1653 case MSR_IA32_APICBASE:
1654 kvm_set_apic_base(vcpu, data);
1655 break;
1656 case MSR_IA32_MISC_ENABLE:
1657 vcpu->ia32_misc_enable_msr = data;
1658 break;
1659 /*
1660 * This is the 'probe whether the host is KVM' logic:
1661 */
1662 case MSR_KVM_API_MAGIC:
1663 return vcpu_register_para(vcpu, data);
1664
1665 default:
1666 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
1667 return 1;
1668 }
1669 return 0;
1670}
1671EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1672
1673/*
1674 * Writes msr value into into the appropriate "register".
1675 * Returns 0 on success, non-0 otherwise.
1676 * Assumes vcpu_load() was already called.
1677 */
1678int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1679{
1680 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
1681}
1682
1683void kvm_resched(struct kvm_vcpu *vcpu)
1684{
1685 if (!need_resched())
1686 return;
1687 cond_resched();
1688}
1689EXPORT_SYMBOL_GPL(kvm_resched);
1690
1691void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1692{ 1943{
1693 int i; 1944 int i;
1694 u32 function;
1695 struct kvm_cpuid_entry *e, *best;
1696 1945
1697 kvm_x86_ops->cache_regs(vcpu); 1946 for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
1698 function = vcpu->regs[VCPU_REGS_RAX]; 1947 if (vcpu->arch.pio.guest_pages[i]) {
1699 vcpu->regs[VCPU_REGS_RAX] = 0; 1948 kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
1700 vcpu->regs[VCPU_REGS_RBX] = 0; 1949 vcpu->arch.pio.guest_pages[i] = NULL;
1701 vcpu->regs[VCPU_REGS_RCX] = 0;
1702 vcpu->regs[VCPU_REGS_RDX] = 0;
1703 best = NULL;
1704 for (i = 0; i < vcpu->cpuid_nent; ++i) {
1705 e = &vcpu->cpuid_entries[i];
1706 if (e->function == function) {
1707 best = e;
1708 break;
1709 } 1950 }
1710 /*
1711 * Both basic or both extended?
1712 */
1713 if (((e->function ^ function) & 0x80000000) == 0)
1714 if (!best || e->function > best->function)
1715 best = e;
1716 }
1717 if (best) {
1718 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1719 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1720 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1721 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1722 }
1723 kvm_x86_ops->decache_regs(vcpu);
1724 kvm_x86_ops->skip_emulated_instruction(vcpu);
1725} 1951}
1726EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1727 1952
1728static int pio_copy_data(struct kvm_vcpu *vcpu) 1953static int pio_copy_data(struct kvm_vcpu *vcpu)
1729{ 1954{
1730 void *p = vcpu->pio_data; 1955 void *p = vcpu->arch.pio_data;
1731 void *q; 1956 void *q;
1732 unsigned bytes; 1957 unsigned bytes;
1733 int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; 1958 int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
1734 1959
1735 q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, 1960 q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1736 PAGE_KERNEL); 1961 PAGE_KERNEL);
1737 if (!q) { 1962 if (!q) {
1738 free_pio_guest_pages(vcpu); 1963 free_pio_guest_pages(vcpu);
1739 return -ENOMEM; 1964 return -ENOMEM;
1740 } 1965 }
1741 q += vcpu->pio.guest_page_offset; 1966 q += vcpu->arch.pio.guest_page_offset;
1742 bytes = vcpu->pio.size * vcpu->pio.cur_count; 1967 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
1743 if (vcpu->pio.in) 1968 if (vcpu->arch.pio.in)
1744 memcpy(q, p, bytes); 1969 memcpy(q, p, bytes);
1745 else 1970 else
1746 memcpy(p, q, bytes); 1971 memcpy(p, q, bytes);
1747 q -= vcpu->pio.guest_page_offset; 1972 q -= vcpu->arch.pio.guest_page_offset;
1748 vunmap(q); 1973 vunmap(q);
1749 free_pio_guest_pages(vcpu); 1974 free_pio_guest_pages(vcpu);
1750 return 0; 1975 return 0;
1751} 1976}
1752 1977
1753static int complete_pio(struct kvm_vcpu *vcpu) 1978int complete_pio(struct kvm_vcpu *vcpu)
1754{ 1979{
1755 struct kvm_pio_request *io = &vcpu->pio; 1980 struct kvm_pio_request *io = &vcpu->arch.pio;
1756 long delta; 1981 long delta;
1757 int r; 1982 int r;
1758 1983
@@ -1760,7 +1985,7 @@ static int complete_pio(struct kvm_vcpu *vcpu)
1760 1985
1761 if (!io->string) { 1986 if (!io->string) {
1762 if (io->in) 1987 if (io->in)
1763 memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data, 1988 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
1764 io->size); 1989 io->size);
1765 } else { 1990 } else {
1766 if (io->in) { 1991 if (io->in) {
@@ -1778,15 +2003,15 @@ static int complete_pio(struct kvm_vcpu *vcpu)
1778 * The size of the register should really depend on 2003 * The size of the register should really depend on
1779 * current address size. 2004 * current address size.
1780 */ 2005 */
1781 vcpu->regs[VCPU_REGS_RCX] -= delta; 2006 vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
1782 } 2007 }
1783 if (io->down) 2008 if (io->down)
1784 delta = -delta; 2009 delta = -delta;
1785 delta *= io->size; 2010 delta *= io->size;
1786 if (io->in) 2011 if (io->in)
1787 vcpu->regs[VCPU_REGS_RDI] += delta; 2012 vcpu->arch.regs[VCPU_REGS_RDI] += delta;
1788 else 2013 else
1789 vcpu->regs[VCPU_REGS_RSI] += delta; 2014 vcpu->arch.regs[VCPU_REGS_RSI] += delta;
1790 } 2015 }
1791 2016
1792 kvm_x86_ops->decache_regs(vcpu); 2017 kvm_x86_ops->decache_regs(vcpu);
@@ -1804,13 +2029,13 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
1804 /* TODO: String I/O for in kernel device */ 2029 /* TODO: String I/O for in kernel device */
1805 2030
1806 mutex_lock(&vcpu->kvm->lock); 2031 mutex_lock(&vcpu->kvm->lock);
1807 if (vcpu->pio.in) 2032 if (vcpu->arch.pio.in)
1808 kvm_iodevice_read(pio_dev, vcpu->pio.port, 2033 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
1809 vcpu->pio.size, 2034 vcpu->arch.pio.size,
1810 pd); 2035 pd);
1811 else 2036 else
1812 kvm_iodevice_write(pio_dev, vcpu->pio.port, 2037 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
1813 vcpu->pio.size, 2038 vcpu->arch.pio.size,
1814 pd); 2039 pd);
1815 mutex_unlock(&vcpu->kvm->lock); 2040 mutex_unlock(&vcpu->kvm->lock);
1816} 2041}
@@ -1818,8 +2043,8 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
1818static void pio_string_write(struct kvm_io_device *pio_dev, 2043static void pio_string_write(struct kvm_io_device *pio_dev,
1819 struct kvm_vcpu *vcpu) 2044 struct kvm_vcpu *vcpu)
1820{ 2045{
1821 struct kvm_pio_request *io = &vcpu->pio; 2046 struct kvm_pio_request *io = &vcpu->arch.pio;
1822 void *pd = vcpu->pio_data; 2047 void *pd = vcpu->arch.pio_data;
1823 int i; 2048 int i;
1824 2049
1825 mutex_lock(&vcpu->kvm->lock); 2050 mutex_lock(&vcpu->kvm->lock);
@@ -1832,32 +2057,38 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
1832 mutex_unlock(&vcpu->kvm->lock); 2057 mutex_unlock(&vcpu->kvm->lock);
1833} 2058}
1834 2059
1835int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2060static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2061 gpa_t addr)
2062{
2063 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
2064}
2065
2066int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1836 int size, unsigned port) 2067 int size, unsigned port)
1837{ 2068{
1838 struct kvm_io_device *pio_dev; 2069 struct kvm_io_device *pio_dev;
1839 2070
1840 vcpu->run->exit_reason = KVM_EXIT_IO; 2071 vcpu->run->exit_reason = KVM_EXIT_IO;
1841 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2072 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1842 vcpu->run->io.size = vcpu->pio.size = size; 2073 vcpu->run->io.size = vcpu->arch.pio.size = size;
1843 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2074 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1844 vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1; 2075 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
1845 vcpu->run->io.port = vcpu->pio.port = port; 2076 vcpu->run->io.port = vcpu->arch.pio.port = port;
1846 vcpu->pio.in = in; 2077 vcpu->arch.pio.in = in;
1847 vcpu->pio.string = 0; 2078 vcpu->arch.pio.string = 0;
1848 vcpu->pio.down = 0; 2079 vcpu->arch.pio.down = 0;
1849 vcpu->pio.guest_page_offset = 0; 2080 vcpu->arch.pio.guest_page_offset = 0;
1850 vcpu->pio.rep = 0; 2081 vcpu->arch.pio.rep = 0;
1851 2082
1852 kvm_x86_ops->cache_regs(vcpu); 2083 kvm_x86_ops->cache_regs(vcpu);
1853 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); 2084 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
1854 kvm_x86_ops->decache_regs(vcpu); 2085 kvm_x86_ops->decache_regs(vcpu);
1855 2086
1856 kvm_x86_ops->skip_emulated_instruction(vcpu); 2087 kvm_x86_ops->skip_emulated_instruction(vcpu);
1857 2088
1858 pio_dev = vcpu_find_pio_dev(vcpu, port); 2089 pio_dev = vcpu_find_pio_dev(vcpu, port);
1859 if (pio_dev) { 2090 if (pio_dev) {
1860 kernel_pio(pio_dev, vcpu, vcpu->pio_data); 2091 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
1861 complete_pio(vcpu); 2092 complete_pio(vcpu);
1862 return 1; 2093 return 1;
1863 } 2094 }
@@ -1877,15 +2108,15 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1877 2108
1878 vcpu->run->exit_reason = KVM_EXIT_IO; 2109 vcpu->run->exit_reason = KVM_EXIT_IO;
1879 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2110 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1880 vcpu->run->io.size = vcpu->pio.size = size; 2111 vcpu->run->io.size = vcpu->arch.pio.size = size;
1881 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2112 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1882 vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count; 2113 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
1883 vcpu->run->io.port = vcpu->pio.port = port; 2114 vcpu->run->io.port = vcpu->arch.pio.port = port;
1884 vcpu->pio.in = in; 2115 vcpu->arch.pio.in = in;
1885 vcpu->pio.string = 1; 2116 vcpu->arch.pio.string = 1;
1886 vcpu->pio.down = down; 2117 vcpu->arch.pio.down = down;
1887 vcpu->pio.guest_page_offset = offset_in_page(address); 2118 vcpu->arch.pio.guest_page_offset = offset_in_page(address);
1888 vcpu->pio.rep = rep; 2119 vcpu->arch.pio.rep = rep;
1889 2120
1890 if (!count) { 2121 if (!count) {
1891 kvm_x86_ops->skip_emulated_instruction(vcpu); 2122 kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -1911,37 +2142,35 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1911 * String I/O in reverse. Yuck. Kill the guest, fix later. 2142 * String I/O in reverse. Yuck. Kill the guest, fix later.
1912 */ 2143 */
1913 pr_unimpl(vcpu, "guest string pio down\n"); 2144 pr_unimpl(vcpu, "guest string pio down\n");
1914 inject_gp(vcpu); 2145 kvm_inject_gp(vcpu, 0);
1915 return 1; 2146 return 1;
1916 } 2147 }
1917 vcpu->run->io.count = now; 2148 vcpu->run->io.count = now;
1918 vcpu->pio.cur_count = now; 2149 vcpu->arch.pio.cur_count = now;
1919 2150
1920 if (vcpu->pio.cur_count == vcpu->pio.count) 2151 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
1921 kvm_x86_ops->skip_emulated_instruction(vcpu); 2152 kvm_x86_ops->skip_emulated_instruction(vcpu);
1922 2153
1923 for (i = 0; i < nr_pages; ++i) { 2154 for (i = 0; i < nr_pages; ++i) {
1924 mutex_lock(&vcpu->kvm->lock); 2155 down_read(&current->mm->mmap_sem);
1925 page = gva_to_page(vcpu, address + i * PAGE_SIZE); 2156 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1926 if (page) 2157 vcpu->arch.pio.guest_pages[i] = page;
1927 get_page(page); 2158 up_read(&current->mm->mmap_sem);
1928 vcpu->pio.guest_pages[i] = page;
1929 mutex_unlock(&vcpu->kvm->lock);
1930 if (!page) { 2159 if (!page) {
1931 inject_gp(vcpu); 2160 kvm_inject_gp(vcpu, 0);
1932 free_pio_guest_pages(vcpu); 2161 free_pio_guest_pages(vcpu);
1933 return 1; 2162 return 1;
1934 } 2163 }
1935 } 2164 }
1936 2165
1937 pio_dev = vcpu_find_pio_dev(vcpu, port); 2166 pio_dev = vcpu_find_pio_dev(vcpu, port);
1938 if (!vcpu->pio.in) { 2167 if (!vcpu->arch.pio.in) {
1939 /* string PIO write */ 2168 /* string PIO write */
1940 ret = pio_copy_data(vcpu); 2169 ret = pio_copy_data(vcpu);
1941 if (ret >= 0 && pio_dev) { 2170 if (ret >= 0 && pio_dev) {
1942 pio_string_write(pio_dev, vcpu); 2171 pio_string_write(pio_dev, vcpu);
1943 complete_pio(vcpu); 2172 complete_pio(vcpu);
1944 if (vcpu->pio.count == 0) 2173 if (vcpu->arch.pio.count == 0)
1945 ret = 1; 2174 ret = 1;
1946 } 2175 }
1947 } else if (pio_dev) 2176 } else if (pio_dev)
@@ -1953,6 +2182,263 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1953} 2182}
1954EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 2183EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
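When no in-kernel device claims the port, kvm_emulate_pio() and kvm_emulate_pio_string() leave the request in the shared kvm_run area and exit to userspace with KVM_EXIT_IO; the data lives at run->io.data_offset inside the mmap'ed run region. A run-loop sketch that handles only byte-wide OUT; vcpu_fd and the mmap'ed run pointer (KVM_GET_VCPU_MMAP_SIZE bytes) are assumed to exist already:

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Run the vcpu once and print any byte-wide port writes it produced. */
static int run_once(int vcpu_fd, struct kvm_run *run)
{
        if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                return -1;

        if (run->exit_reason == KVM_EXIT_IO &&
            run->io.direction == KVM_EXIT_IO_OUT && run->io.size == 1) {
                const uint8_t *p = (const uint8_t *)run + run->io.data_offset;
                uint32_t i;

                for (i = 0; i < run->io.count; i++)
                        printf("OUT 0x%x: 0x%02x\n", run->io.port, p[i]);
        }
        return 0;
}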
1955 2184
2185int kvm_arch_init(void *opaque)
2186{
2187 int r;
2188 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2189
2190 if (kvm_x86_ops) {
2191 printk(KERN_ERR "kvm: already loaded the other module\n");
2192 r = -EEXIST;
2193 goto out;
2194 }
2195
2196 if (!ops->cpu_has_kvm_support()) {
2197 printk(KERN_ERR "kvm: no hardware support\n");
2198 r = -EOPNOTSUPP;
2199 goto out;
2200 }
2201 if (ops->disabled_by_bios()) {
2202 printk(KERN_ERR "kvm: disabled by bios\n");
2203 r = -EOPNOTSUPP;
2204 goto out;
2205 }
2206
2207 r = kvm_mmu_module_init();
2208 if (r)
2209 goto out;
2210
2211 kvm_init_msr_list();
2212
2213 kvm_x86_ops = ops;
2214 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2215 return 0;
2216
2217out:
2218 return r;
2219}
2220
2221void kvm_arch_exit(void)
2222{
2223 kvm_x86_ops = NULL;
2224 kvm_mmu_module_exit();
2225}
2226
2227int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2228{
2229 ++vcpu->stat.halt_exits;
2230 if (irqchip_in_kernel(vcpu->kvm)) {
2231 vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
2232 kvm_vcpu_block(vcpu);
2233 if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
2234 return -EINTR;
2235 return 1;
2236 } else {
2237 vcpu->run->exit_reason = KVM_EXIT_HLT;
2238 return 0;
2239 }
2240}
2241EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2242
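kvm_emulate_halt() above either blocks the vcpu in the kernel (in-kernel irqchip) or bounces out to userspace with KVM_EXIT_HLT. A minimal userspace sketch of the dispatch side, assuming a hypothetical vcpu_fd and an already mmap'ed struct kvm_run set up elsewhere:

/* Userspace sketch of the exit dispatch implied by kvm_emulate_halt():
 * with the irqchip emulated in userspace, HLT surfaces as KVM_EXIT_HLT. */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int run_once(int vcpu_fd, struct kvm_run *run)
{
        if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                return -1;

        switch (run->exit_reason) {
        case KVM_EXIT_HLT:
                /* guest executed HLT with no interrupt pending */
                return 0;
        case KVM_EXIT_IO:
        case KVM_EXIT_MMIO:
                /* device emulation would go here */
                return 1;
        default:
                fprintf(stderr, "unhandled exit %u\n", run->exit_reason);
                return -1;
        }
}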
2243int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2244{
2245 unsigned long nr, a0, a1, a2, a3, ret;
2246
2247 kvm_x86_ops->cache_regs(vcpu);
2248
2249 nr = vcpu->arch.regs[VCPU_REGS_RAX];
2250 a0 = vcpu->arch.regs[VCPU_REGS_RBX];
2251 a1 = vcpu->arch.regs[VCPU_REGS_RCX];
2252 a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2253 a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2254
2255 if (!is_long_mode(vcpu)) {
2256 nr &= 0xFFFFFFFF;
2257 a0 &= 0xFFFFFFFF;
2258 a1 &= 0xFFFFFFFF;
2259 a2 &= 0xFFFFFFFF;
2260 a3 &= 0xFFFFFFFF;
2261 }
2262
2263 switch (nr) {
2264 case KVM_HC_VAPIC_POLL_IRQ:
2265 ret = 0;
2266 break;
2267 default:
2268 ret = -KVM_ENOSYS;
2269 break;
2270 }
2271 vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2272 kvm_x86_ops->decache_regs(vcpu);
2273 return 0;
2274}
2275EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2276
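The hypercall handler above reads the call number from RAX and the arguments from RBX/RCX/RDX/RSI, returning the result in RAX (truncated to 32 bits outside long mode). A guest-side sketch of that convention, assuming the Intel "vmcall" encoding; the kvm_fix_hypercall() path that follows patches in whatever instruction the host CPU actually uses:

/* Guest-side sketch of the hypercall ABI: number in RAX, result in RAX. */
static inline long kvm_hypercall0(unsigned int nr)
{
        long ret;

        asm volatile("vmcall"
                     : "=a"(ret)
                     : "a"(nr)
                     : "memory");
        return ret;
}

/* e.g. kvm_hypercall0(KVM_HC_VAPIC_POLL_IRQ) returns 0; any other number
 * comes back as -KVM_ENOSYS, per the switch above. */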
2277int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2278{
2279 char instruction[3];
2280 int ret = 0;
2281
2282
2283 /*
 2284 * Blow out the MMU so that no other VCPU has an active mapping; this
 2285 * ensures that the updated hypercall appears atomically across all
 2286 * VCPUs.
2287 */
2288 kvm_mmu_zap_all(vcpu->kvm);
2289
2290 kvm_x86_ops->cache_regs(vcpu);
2291 kvm_x86_ops->patch_hypercall(vcpu, instruction);
2292 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
2293 != X86EMUL_CONTINUE)
2294 ret = -EFAULT;
2295
2296 return ret;
2297}
2298
2299static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2300{
2301 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2302}
2303
2304void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2305{
2306 struct descriptor_table dt = { limit, base };
2307
2308 kvm_x86_ops->set_gdt(vcpu, &dt);
2309}
2310
2311void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2312{
2313 struct descriptor_table dt = { limit, base };
2314
2315 kvm_x86_ops->set_idt(vcpu, &dt);
2316}
2317
2318void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2319 unsigned long *rflags)
2320{
2321 lmsw(vcpu, msw);
2322 *rflags = kvm_x86_ops->get_rflags(vcpu);
2323}
2324
2325unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2326{
2327 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2328 switch (cr) {
2329 case 0:
2330 return vcpu->arch.cr0;
2331 case 2:
2332 return vcpu->arch.cr2;
2333 case 3:
2334 return vcpu->arch.cr3;
2335 case 4:
2336 return vcpu->arch.cr4;
2337 case 8:
2338 return get_cr8(vcpu);
2339 default:
2340 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2341 return 0;
2342 }
2343}
2344
2345void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2346 unsigned long *rflags)
2347{
2348 switch (cr) {
2349 case 0:
2350 set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2351 *rflags = kvm_x86_ops->get_rflags(vcpu);
2352 break;
2353 case 2:
2354 vcpu->arch.cr2 = val;
2355 break;
2356 case 3:
2357 set_cr3(vcpu, val);
2358 break;
2359 case 4:
2360 set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2361 break;
2362 case 8:
2363 set_cr8(vcpu, val & 0xfUL);
2364 break;
2365 default:
2366 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2367 }
2368}
2369
2370static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2371{
2372 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2373 int j, nent = vcpu->arch.cpuid_nent;
2374
2375 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2376 /* when no next entry is found, the current entry[i] is reselected */
2377 for (j = i + 1; j == i; j = (j + 1) % nent) {
2378 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2379 if (ej->function == e->function) {
2380 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2381 return j;
2382 }
2383 }
2384 return 0; /* silence gcc, even though control never reaches here */
2385}
2386
2387/* find an entry with matching function, matching index (if needed), and that
2388 * should be read next (if it's stateful) */
2389static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2390 u32 function, u32 index)
2391{
2392 if (e->function != function)
2393 return 0;
2394 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2395 return 0;
2396 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2397 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2398 return 0;
2399 return 1;
2400}
2401
2402void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2403{
2404 int i;
2405 u32 function, index;
2406 struct kvm_cpuid_entry2 *e, *best;
2407
2408 kvm_x86_ops->cache_regs(vcpu);
2409 function = vcpu->arch.regs[VCPU_REGS_RAX];
2410 index = vcpu->arch.regs[VCPU_REGS_RCX];
2411 vcpu->arch.regs[VCPU_REGS_RAX] = 0;
2412 vcpu->arch.regs[VCPU_REGS_RBX] = 0;
2413 vcpu->arch.regs[VCPU_REGS_RCX] = 0;
2414 vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2415 best = NULL;
2416 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2417 e = &vcpu->arch.cpuid_entries[i];
2418 if (is_matching_cpuid_entry(e, function, index)) {
2419 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2420 move_to_next_stateful_cpuid_entry(vcpu, i);
2421 best = e;
2422 break;
2423 }
2424 /*
2425 * Both basic or both extended?
2426 */
2427 if (((e->function ^ function) & 0x80000000) == 0)
2428 if (!best || e->function > best->function)
2429 best = e;
2430 }
2431 if (best) {
2432 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
2433 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
2434 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
2435 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
2436 }
2437 kvm_x86_ops->decache_regs(vcpu);
2438 kvm_x86_ops->skip_emulated_instruction(vcpu);
2439}
2440EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2441
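kvm_emulate_cpuid() above only reflects whatever table userspace installed for the vcpu. A hedged sketch of filling one entry of that table, assuming the KVM_SET_CPUID2 ioctl that pairs with the kvm_cpuid_entry2 layout used here; vcpu_fd is a hypothetical, already-created vcpu descriptor:

/* Userspace sketch: populate the per-vcpu cpuid table with leaf 0. */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_vendor_leaf(int vcpu_fd)
{
        struct kvm_cpuid2 *cpuid;
        int ret;

        cpuid = calloc(1, sizeof(*cpuid) + sizeof(struct kvm_cpuid_entry2));
        if (!cpuid)
                return -1;

        cpuid->nent = 1;
        cpuid->entries[0].function = 0;         /* leaf 0: vendor string */
        cpuid->entries[0].eax = 1;              /* highest basic leaf */
        cpuid->entries[0].ebx = 0x756e6547;     /* "Genu" */
        cpuid->entries[0].edx = 0x49656e69;     /* "ineI" */
        cpuid->entries[0].ecx = 0x6c65746e;     /* "ntel" */

        ret = ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid);
        free(cpuid);
        return ret;
}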
1956/* 2442/*
1957 * Check if userspace requested an interrupt window, and that the 2443 * Check if userspace requested an interrupt window, and that the
1958 * interrupt window is open. 2444 * interrupt window is open.
@@ -1962,9 +2448,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
1962static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 2448static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1963 struct kvm_run *kvm_run) 2449 struct kvm_run *kvm_run)
1964{ 2450{
1965 return (!vcpu->irq_summary && 2451 return (!vcpu->arch.irq_summary &&
1966 kvm_run->request_interrupt_window && 2452 kvm_run->request_interrupt_window &&
1967 vcpu->interrupt_window_open && 2453 vcpu->arch.interrupt_window_open &&
1968 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); 2454 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
1969} 2455}
1970 2456
@@ -1978,22 +2464,51 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1978 kvm_run->ready_for_interrupt_injection = 1; 2464 kvm_run->ready_for_interrupt_injection = 1;
1979 else 2465 else
1980 kvm_run->ready_for_interrupt_injection = 2466 kvm_run->ready_for_interrupt_injection =
1981 (vcpu->interrupt_window_open && 2467 (vcpu->arch.interrupt_window_open &&
1982 vcpu->irq_summary == 0); 2468 vcpu->arch.irq_summary == 0);
2469}
2470
2471static void vapic_enter(struct kvm_vcpu *vcpu)
2472{
2473 struct kvm_lapic *apic = vcpu->arch.apic;
2474 struct page *page;
2475
2476 if (!apic || !apic->vapic_addr)
2477 return;
2478
2479 down_read(&current->mm->mmap_sem);
2480 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2481 vcpu->arch.apic->vapic_page = page;
2482 up_read(&current->mm->mmap_sem);
2483}
2484
2485static void vapic_exit(struct kvm_vcpu *vcpu)
2486{
2487 struct kvm_lapic *apic = vcpu->arch.apic;
2488
2489 if (!apic || !apic->vapic_addr)
2490 return;
2491
2492 kvm_release_page_dirty(apic->vapic_page);
2493 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
1983} 2494}
1984 2495
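vapic_enter()/vapic_exit() above pin and release the guest page that mirrors the local APIC state. A sketch of the userspace side that announces that page, assuming the KVM_SET_VAPIC_ADDR ioctl and struct kvm_vapic_addr; vcpu_fd and vapic_gpa (a guest-physical address backed by guest memory) are hypothetical:

/* Userspace sketch: tell KVM which guest page to use as the vapic page. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_vapic(int vcpu_fd, __u64 vapic_gpa)
{
        struct kvm_vapic_addr va = { .vapic_addr = vapic_gpa };

        return ioctl(vcpu_fd, KVM_SET_VAPIC_ADDR, &va);
}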
1985static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2496static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1986{ 2497{
1987 int r; 2498 int r;
1988 2499
1989 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { 2500 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
1990 printk("vcpu %d received sipi with vector # %x\n", 2501 pr_debug("vcpu %d received sipi with vector # %x\n",
1991 vcpu->vcpu_id, vcpu->sipi_vector); 2502 vcpu->vcpu_id, vcpu->arch.sipi_vector);
1992 kvm_lapic_reset(vcpu); 2503 kvm_lapic_reset(vcpu);
1993 kvm_x86_ops->vcpu_reset(vcpu); 2504 r = kvm_x86_ops->vcpu_reset(vcpu);
1994 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; 2505 if (r)
2506 return r;
2507 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
1995 } 2508 }
1996 2509
2510 vapic_enter(vcpu);
2511
1997preempted: 2512preempted:
1998 if (vcpu->guest_debug.enabled) 2513 if (vcpu->guest_debug.enabled)
1999 kvm_x86_ops->guest_debug_pre(vcpu); 2514 kvm_x86_ops->guest_debug_pre(vcpu);
@@ -2003,6 +2518,19 @@ again:
2003 if (unlikely(r)) 2518 if (unlikely(r))
2004 goto out; 2519 goto out;
2005 2520
2521 if (vcpu->requests) {
2522 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2523 __kvm_migrate_apic_timer(vcpu);
2524 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2525 &vcpu->requests)) {
2526 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
2527 r = 0;
2528 goto out;
2529 }
2530 }
2531
2532 kvm_inject_pending_timer_irqs(vcpu);
2533
2006 preempt_disable(); 2534 preempt_disable();
2007 2535
2008 kvm_x86_ops->prepare_guest_switch(vcpu); 2536 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -2010,6 +2538,13 @@ again:
2010 2538
2011 local_irq_disable(); 2539 local_irq_disable();
2012 2540
2541 if (need_resched()) {
2542 local_irq_enable();
2543 preempt_enable();
2544 r = 1;
2545 goto out;
2546 }
2547
2013 if (signal_pending(current)) { 2548 if (signal_pending(current)) {
2014 local_irq_enable(); 2549 local_irq_enable();
2015 preempt_enable(); 2550 preempt_enable();
@@ -2019,16 +2554,20 @@ again:
2019 goto out; 2554 goto out;
2020 } 2555 }
2021 2556
2022 if (irqchip_in_kernel(vcpu->kvm)) 2557 if (vcpu->arch.exception.pending)
2558 __queue_exception(vcpu);
2559 else if (irqchip_in_kernel(vcpu->kvm))
2023 kvm_x86_ops->inject_pending_irq(vcpu); 2560 kvm_x86_ops->inject_pending_irq(vcpu);
2024 else if (!vcpu->mmio_read_completed) 2561 else
2025 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); 2562 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2026 2563
2564 kvm_lapic_sync_to_vapic(vcpu);
2565
2027 vcpu->guest_mode = 1; 2566 vcpu->guest_mode = 1;
2028 kvm_guest_enter(); 2567 kvm_guest_enter();
2029 2568
2030 if (vcpu->requests) 2569 if (vcpu->requests)
2031 if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) 2570 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2032 kvm_x86_ops->tlb_flush(vcpu); 2571 kvm_x86_ops->tlb_flush(vcpu);
2033 2572
2034 kvm_x86_ops->run(vcpu, kvm_run); 2573 kvm_x86_ops->run(vcpu, kvm_run);
@@ -2055,9 +2594,14 @@ again:
2055 */ 2594 */
2056 if (unlikely(prof_on == KVM_PROFILING)) { 2595 if (unlikely(prof_on == KVM_PROFILING)) {
2057 kvm_x86_ops->cache_regs(vcpu); 2596 kvm_x86_ops->cache_regs(vcpu);
2058 profile_hit(KVM_PROFILING, (void *)vcpu->rip); 2597 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
2059 } 2598 }
2060 2599
2600 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
2601 vcpu->arch.exception.pending = false;
2602
2603 kvm_lapic_sync_from_vapic(vcpu);
2604
2061 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 2605 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2062 2606
2063 if (r > 0) { 2607 if (r > 0) {
@@ -2067,10 +2611,8 @@ again:
2067 ++vcpu->stat.request_irq_exits; 2611 ++vcpu->stat.request_irq_exits;
2068 goto out; 2612 goto out;
2069 } 2613 }
2070 if (!need_resched()) { 2614 if (!need_resched())
2071 ++vcpu->stat.light_exits;
2072 goto again; 2615 goto again;
2073 }
2074 } 2616 }
2075 2617
2076out: 2618out:
@@ -2081,18 +2623,19 @@ out:
2081 2623
2082 post_kvm_run_save(vcpu, kvm_run); 2624 post_kvm_run_save(vcpu, kvm_run);
2083 2625
2626 vapic_exit(vcpu);
2627
2084 return r; 2628 return r;
2085} 2629}
2086 2630
2087 2631int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2088static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2089{ 2632{
2090 int r; 2633 int r;
2091 sigset_t sigsaved; 2634 sigset_t sigsaved;
2092 2635
2093 vcpu_load(vcpu); 2636 vcpu_load(vcpu);
2094 2637
2095 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) { 2638 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2096 kvm_vcpu_block(vcpu); 2639 kvm_vcpu_block(vcpu);
2097 vcpu_put(vcpu); 2640 vcpu_put(vcpu);
2098 return -EAGAIN; 2641 return -EAGAIN;
@@ -2105,18 +2648,19 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2105 if (!irqchip_in_kernel(vcpu->kvm)) 2648 if (!irqchip_in_kernel(vcpu->kvm))
2106 set_cr8(vcpu, kvm_run->cr8); 2649 set_cr8(vcpu, kvm_run->cr8);
2107 2650
2108 if (vcpu->pio.cur_count) { 2651 if (vcpu->arch.pio.cur_count) {
2109 r = complete_pio(vcpu); 2652 r = complete_pio(vcpu);
2110 if (r) 2653 if (r)
2111 goto out; 2654 goto out;
2112 } 2655 }
2113 2656#if CONFIG_HAS_IOMEM
2114 if (vcpu->mmio_needed) { 2657 if (vcpu->mmio_needed) {
2115 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 2658 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2116 vcpu->mmio_read_completed = 1; 2659 vcpu->mmio_read_completed = 1;
2117 vcpu->mmio_needed = 0; 2660 vcpu->mmio_needed = 0;
2118 r = emulate_instruction(vcpu, kvm_run, 2661 r = emulate_instruction(vcpu, kvm_run,
2119 vcpu->mmio_fault_cr2, 0); 2662 vcpu->arch.mmio_fault_cr2, 0,
2663 EMULTYPE_NO_DECODE);
2120 if (r == EMULATE_DO_MMIO) { 2664 if (r == EMULATE_DO_MMIO) {
2121 /* 2665 /*
2122 * Read-modify-write. Back to userspace. 2666 * Read-modify-write. Back to userspace.
@@ -2125,10 +2669,10 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2125 goto out; 2669 goto out;
2126 } 2670 }
2127 } 2671 }
2128 2672#endif
2129 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { 2673 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2130 kvm_x86_ops->cache_regs(vcpu); 2674 kvm_x86_ops->cache_regs(vcpu);
2131 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; 2675 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2132 kvm_x86_ops->decache_regs(vcpu); 2676 kvm_x86_ops->decache_regs(vcpu);
2133 } 2677 }
2134 2678
@@ -2142,33 +2686,32 @@ out:
2142 return r; 2686 return r;
2143} 2687}
2144 2688
2145static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, 2689int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2146 struct kvm_regs *regs)
2147{ 2690{
2148 vcpu_load(vcpu); 2691 vcpu_load(vcpu);
2149 2692
2150 kvm_x86_ops->cache_regs(vcpu); 2693 kvm_x86_ops->cache_regs(vcpu);
2151 2694
2152 regs->rax = vcpu->regs[VCPU_REGS_RAX]; 2695 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
2153 regs->rbx = vcpu->regs[VCPU_REGS_RBX]; 2696 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
2154 regs->rcx = vcpu->regs[VCPU_REGS_RCX]; 2697 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
2155 regs->rdx = vcpu->regs[VCPU_REGS_RDX]; 2698 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
2156 regs->rsi = vcpu->regs[VCPU_REGS_RSI]; 2699 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
2157 regs->rdi = vcpu->regs[VCPU_REGS_RDI]; 2700 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
2158 regs->rsp = vcpu->regs[VCPU_REGS_RSP]; 2701 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2159 regs->rbp = vcpu->regs[VCPU_REGS_RBP]; 2702 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
2160#ifdef CONFIG_X86_64 2703#ifdef CONFIG_X86_64
2161 regs->r8 = vcpu->regs[VCPU_REGS_R8]; 2704 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
2162 regs->r9 = vcpu->regs[VCPU_REGS_R9]; 2705 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
2163 regs->r10 = vcpu->regs[VCPU_REGS_R10]; 2706 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
2164 regs->r11 = vcpu->regs[VCPU_REGS_R11]; 2707 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
2165 regs->r12 = vcpu->regs[VCPU_REGS_R12]; 2708 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
2166 regs->r13 = vcpu->regs[VCPU_REGS_R13]; 2709 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
2167 regs->r14 = vcpu->regs[VCPU_REGS_R14]; 2710 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
2168 regs->r15 = vcpu->regs[VCPU_REGS_R15]; 2711 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
2169#endif 2712#endif
2170 2713
2171 regs->rip = vcpu->rip; 2714 regs->rip = vcpu->arch.rip;
2172 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 2715 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2173 2716
2174 /* 2717 /*
@@ -2182,31 +2725,30 @@ static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
2182 return 0; 2725 return 0;
2183} 2726}
2184 2727
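These accessors back the KVM_GET_REGS/KVM_SET_REGS vcpu ioctls. A minimal userspace sketch that uses the pair to point a vcpu at its entry code; vcpu_fd is hypothetical:

/* Userspace sketch: read-modify-write the general-purpose register file. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_entry_point(int vcpu_fd, __u64 rip)
{
        struct kvm_regs regs;

        if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0)
                return -1;
        regs.rip = rip;
        regs.rflags = 0x2;      /* reserved bit 1 must stay set */
        return ioctl(vcpu_fd, KVM_SET_REGS, &regs);
}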
2185static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 2728int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2186 struct kvm_regs *regs)
2187{ 2729{
2188 vcpu_load(vcpu); 2730 vcpu_load(vcpu);
2189 2731
2190 vcpu->regs[VCPU_REGS_RAX] = regs->rax; 2732 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
2191 vcpu->regs[VCPU_REGS_RBX] = regs->rbx; 2733 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
2192 vcpu->regs[VCPU_REGS_RCX] = regs->rcx; 2734 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
2193 vcpu->regs[VCPU_REGS_RDX] = regs->rdx; 2735 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
2194 vcpu->regs[VCPU_REGS_RSI] = regs->rsi; 2736 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
2195 vcpu->regs[VCPU_REGS_RDI] = regs->rdi; 2737 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
2196 vcpu->regs[VCPU_REGS_RSP] = regs->rsp; 2738 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
2197 vcpu->regs[VCPU_REGS_RBP] = regs->rbp; 2739 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
2198#ifdef CONFIG_X86_64 2740#ifdef CONFIG_X86_64
2199 vcpu->regs[VCPU_REGS_R8] = regs->r8; 2741 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
2200 vcpu->regs[VCPU_REGS_R9] = regs->r9; 2742 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
2201 vcpu->regs[VCPU_REGS_R10] = regs->r10; 2743 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
2202 vcpu->regs[VCPU_REGS_R11] = regs->r11; 2744 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
2203 vcpu->regs[VCPU_REGS_R12] = regs->r12; 2745 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
2204 vcpu->regs[VCPU_REGS_R13] = regs->r13; 2746 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
2205 vcpu->regs[VCPU_REGS_R14] = regs->r14; 2747 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
2206 vcpu->regs[VCPU_REGS_R15] = regs->r15; 2748 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
2207#endif 2749#endif
2208 2750
2209 vcpu->rip = regs->rip; 2751 vcpu->arch.rip = regs->rip;
2210 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 2752 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2211 2753
2212 kvm_x86_ops->decache_regs(vcpu); 2754 kvm_x86_ops->decache_regs(vcpu);
@@ -2222,8 +2764,18 @@ static void get_segment(struct kvm_vcpu *vcpu,
2222 return kvm_x86_ops->get_segment(vcpu, var, seg); 2764 return kvm_x86_ops->get_segment(vcpu, var, seg);
2223} 2765}
2224 2766
2225static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 2767void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2226 struct kvm_sregs *sregs) 2768{
2769 struct kvm_segment cs;
2770
2771 get_segment(vcpu, &cs, VCPU_SREG_CS);
2772 *db = cs.db;
2773 *l = cs.l;
2774}
2775EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2776
2777int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2778 struct kvm_sregs *sregs)
2227{ 2779{
2228 struct descriptor_table dt; 2780 struct descriptor_table dt;
2229 int pending_vec; 2781 int pending_vec;
@@ -2248,12 +2800,12 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2248 sregs->gdt.base = dt.base; 2800 sregs->gdt.base = dt.base;
2249 2801
2250 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2802 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2251 sregs->cr0 = vcpu->cr0; 2803 sregs->cr0 = vcpu->arch.cr0;
2252 sregs->cr2 = vcpu->cr2; 2804 sregs->cr2 = vcpu->arch.cr2;
2253 sregs->cr3 = vcpu->cr3; 2805 sregs->cr3 = vcpu->arch.cr3;
2254 sregs->cr4 = vcpu->cr4; 2806 sregs->cr4 = vcpu->arch.cr4;
2255 sregs->cr8 = get_cr8(vcpu); 2807 sregs->cr8 = get_cr8(vcpu);
2256 sregs->efer = vcpu->shadow_efer; 2808 sregs->efer = vcpu->arch.shadow_efer;
2257 sregs->apic_base = kvm_get_apic_base(vcpu); 2809 sregs->apic_base = kvm_get_apic_base(vcpu);
2258 2810
2259 if (irqchip_in_kernel(vcpu->kvm)) { 2811 if (irqchip_in_kernel(vcpu->kvm)) {
@@ -2261,9 +2813,10 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2261 sizeof sregs->interrupt_bitmap); 2813 sizeof sregs->interrupt_bitmap);
2262 pending_vec = kvm_x86_ops->get_irq(vcpu); 2814 pending_vec = kvm_x86_ops->get_irq(vcpu);
2263 if (pending_vec >= 0) 2815 if (pending_vec >= 0)
2264 set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap); 2816 set_bit(pending_vec,
2817 (unsigned long *)sregs->interrupt_bitmap);
2265 } else 2818 } else
2266 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, 2819 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
2267 sizeof sregs->interrupt_bitmap); 2820 sizeof sregs->interrupt_bitmap);
2268 2821
2269 vcpu_put(vcpu); 2822 vcpu_put(vcpu);
@@ -2277,8 +2830,8 @@ static void set_segment(struct kvm_vcpu *vcpu,
2277 return kvm_x86_ops->set_segment(vcpu, var, seg); 2830 return kvm_x86_ops->set_segment(vcpu, var, seg);
2278} 2831}
2279 2832
2280static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 2833int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2281 struct kvm_sregs *sregs) 2834 struct kvm_sregs *sregs)
2282{ 2835{
2283 int mmu_reset_needed = 0; 2836 int mmu_reset_needed = 0;
2284 int i, pending_vec, max_bits; 2837 int i, pending_vec, max_bits;
@@ -2293,13 +2846,13 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2293 dt.base = sregs->gdt.base; 2846 dt.base = sregs->gdt.base;
2294 kvm_x86_ops->set_gdt(vcpu, &dt); 2847 kvm_x86_ops->set_gdt(vcpu, &dt);
2295 2848
2296 vcpu->cr2 = sregs->cr2; 2849 vcpu->arch.cr2 = sregs->cr2;
2297 mmu_reset_needed |= vcpu->cr3 != sregs->cr3; 2850 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
2298 vcpu->cr3 = sregs->cr3; 2851 vcpu->arch.cr3 = sregs->cr3;
2299 2852
2300 set_cr8(vcpu, sregs->cr8); 2853 set_cr8(vcpu, sregs->cr8);
2301 2854
2302 mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; 2855 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
2303#ifdef CONFIG_X86_64 2856#ifdef CONFIG_X86_64
2304 kvm_x86_ops->set_efer(vcpu, sregs->efer); 2857 kvm_x86_ops->set_efer(vcpu, sregs->efer);
2305#endif 2858#endif
@@ -2307,25 +2860,25 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2307 2860
2308 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2861 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2309 2862
2310 mmu_reset_needed |= vcpu->cr0 != sregs->cr0; 2863 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
2311 vcpu->cr0 = sregs->cr0; 2864 vcpu->arch.cr0 = sregs->cr0;
2312 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 2865 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2313 2866
2314 mmu_reset_needed |= vcpu->cr4 != sregs->cr4; 2867 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
2315 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 2868 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2316 if (!is_long_mode(vcpu) && is_pae(vcpu)) 2869 if (!is_long_mode(vcpu) && is_pae(vcpu))
2317 load_pdptrs(vcpu, vcpu->cr3); 2870 load_pdptrs(vcpu, vcpu->arch.cr3);
2318 2871
2319 if (mmu_reset_needed) 2872 if (mmu_reset_needed)
2320 kvm_mmu_reset_context(vcpu); 2873 kvm_mmu_reset_context(vcpu);
2321 2874
2322 if (!irqchip_in_kernel(vcpu->kvm)) { 2875 if (!irqchip_in_kernel(vcpu->kvm)) {
2323 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, 2876 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
2324 sizeof vcpu->irq_pending); 2877 sizeof vcpu->arch.irq_pending);
2325 vcpu->irq_summary = 0; 2878 vcpu->arch.irq_summary = 0;
2326 for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i) 2879 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
2327 if (vcpu->irq_pending[i]) 2880 if (vcpu->arch.irq_pending[i])
2328 __set_bit(i, &vcpu->irq_summary); 2881 __set_bit(i, &vcpu->arch.irq_summary);
2329 } else { 2882 } else {
2330 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 2883 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2331 pending_vec = find_first_bit( 2884 pending_vec = find_first_bit(
@@ -2334,7 +2887,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2334 /* Only pending external irq is handled here */ 2887 /* Only pending external irq is handled here */
2335 if (pending_vec < max_bits) { 2888 if (pending_vec < max_bits) {
2336 kvm_x86_ops->set_irq(vcpu, pending_vec); 2889 kvm_x86_ops->set_irq(vcpu, pending_vec);
2337 printk("Set back pending irq %d\n", pending_vec); 2890 pr_debug("Set back pending irq %d\n",
2891 pending_vec);
2338 } 2892 }
2339 } 2893 }
2340 2894
@@ -2353,174 +2907,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2353 return 0; 2907 return 0;
2354} 2908}
2355 2909
2356void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 2910int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2357{ 2911 struct kvm_debug_guest *dbg)
2358 struct kvm_segment cs;
2359
2360 get_segment(vcpu, &cs, VCPU_SREG_CS);
2361 *db = cs.db;
2362 *l = cs.l;
2363}
2364EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2365
2366/*
2367 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
2368 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
2369 *
2370 * This list is modified at module load time to reflect the
2371 * capabilities of the host cpu.
2372 */
2373static u32 msrs_to_save[] = {
2374 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2375 MSR_K6_STAR,
2376#ifdef CONFIG_X86_64
2377 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2378#endif
2379 MSR_IA32_TIME_STAMP_COUNTER,
2380};
2381
2382static unsigned num_msrs_to_save;
2383
2384static u32 emulated_msrs[] = {
2385 MSR_IA32_MISC_ENABLE,
2386};
2387
2388static __init void kvm_init_msr_list(void)
2389{
2390 u32 dummy[2];
2391 unsigned i, j;
2392
2393 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2394 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2395 continue;
2396 if (j < i)
2397 msrs_to_save[j] = msrs_to_save[i];
2398 j++;
2399 }
2400 num_msrs_to_save = j;
2401}
2402
2403/*
2404 * Adapt set_msr() to msr_io()'s calling convention
2405 */
2406static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2407{
2408 return kvm_set_msr(vcpu, index, *data);
2409}
2410
2411/*
2412 * Read or write a bunch of msrs. All parameters are kernel addresses.
2413 *
2414 * @return number of msrs set successfully.
2415 */
2416static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2417 struct kvm_msr_entry *entries,
2418 int (*do_msr)(struct kvm_vcpu *vcpu,
2419 unsigned index, u64 *data))
2420{
2421 int i;
2422
2423 vcpu_load(vcpu);
2424
2425 for (i = 0; i < msrs->nmsrs; ++i)
2426 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2427 break;
2428
2429 vcpu_put(vcpu);
2430
2431 return i;
2432}
2433
2434/*
2435 * Read or write a bunch of msrs. Parameters are user addresses.
2436 *
2437 * @return number of msrs set successfully.
2438 */
2439static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2440 int (*do_msr)(struct kvm_vcpu *vcpu,
2441 unsigned index, u64 *data),
2442 int writeback)
2443{
2444 struct kvm_msrs msrs;
2445 struct kvm_msr_entry *entries;
2446 int r, n;
2447 unsigned size;
2448
2449 r = -EFAULT;
2450 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2451 goto out;
2452
2453 r = -E2BIG;
2454 if (msrs.nmsrs >= MAX_IO_MSRS)
2455 goto out;
2456
2457 r = -ENOMEM;
2458 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2459 entries = vmalloc(size);
2460 if (!entries)
2461 goto out;
2462
2463 r = -EFAULT;
2464 if (copy_from_user(entries, user_msrs->entries, size))
2465 goto out_free;
2466
2467 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2468 if (r < 0)
2469 goto out_free;
2470
2471 r = -EFAULT;
2472 if (writeback && copy_to_user(user_msrs->entries, entries, size))
2473 goto out_free;
2474
2475 r = n;
2476
2477out_free:
2478 vfree(entries);
2479out:
2480 return r;
2481}
2482
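msr_io() above is the handler behind the KVM_GET_MSRS/KVM_SET_MSRS vcpu ioctls (see the ioctl switch further down) and returns the number of entries it processed. A userspace sketch of the batch format, with a hypothetical vcpu_fd; MSR 0x10 is the architectural TSC:

/* Userspace sketch: kvm_msrs header followed by kvm_msr_entry slots. */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int read_guest_tsc(int vcpu_fd, __u64 *tsc)
{
        struct kvm_msrs *msrs;
        int ret;

        msrs = calloc(1, sizeof(*msrs) + sizeof(struct kvm_msr_entry));
        if (!msrs)
                return -1;

        msrs->nmsrs = 1;
        msrs->entries[0].index = 0x10;  /* MSR_IA32_TIME_STAMP_COUNTER */

        ret = ioctl(vcpu_fd, KVM_GET_MSRS, msrs);
        if (ret == 1)
                *tsc = msrs->entries[0].data;
        free(msrs);
        return ret == 1 ? 0 : -1;
}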
2483/*
2484 * Translate a guest virtual address to a guest physical address.
2485 */
2486static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2487 struct kvm_translation *tr)
2488{
2489 unsigned long vaddr = tr->linear_address;
2490 gpa_t gpa;
2491
2492 vcpu_load(vcpu);
2493 mutex_lock(&vcpu->kvm->lock);
2494 gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2495 tr->physical_address = gpa;
2496 tr->valid = gpa != UNMAPPED_GVA;
2497 tr->writeable = 1;
2498 tr->usermode = 0;
2499 mutex_unlock(&vcpu->kvm->lock);
2500 vcpu_put(vcpu);
2501
2502 return 0;
2503}
2504
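The helper above serves the KVM_TRANSLATE vcpu ioctl: userspace supplies a guest-virtual address and gets back the guest-physical address plus a validity flag. A small userspace sketch, with a hypothetical vcpu_fd:

/* Userspace sketch of KVM_TRANSLATE. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int translate_gva(int vcpu_fd, __u64 gva, __u64 *gpa)
{
        struct kvm_translation tr = { .linear_address = gva };

        if (ioctl(vcpu_fd, KVM_TRANSLATE, &tr) < 0 || !tr.valid)
                return -1;
        *gpa = tr.physical_address;
        return 0;
}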
2505static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2506 struct kvm_interrupt *irq)
2507{
2508 if (irq->irq < 0 || irq->irq >= 256)
2509 return -EINVAL;
2510 if (irqchip_in_kernel(vcpu->kvm))
2511 return -ENXIO;
2512 vcpu_load(vcpu);
2513
2514 set_bit(irq->irq, vcpu->irq_pending);
2515 set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2516
2517 vcpu_put(vcpu);
2518
2519 return 0;
2520}
2521
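With the irqchip emulated in userspace, the handler above is how individual vectors reach the vcpu's irq_pending bitmap, one KVM_INTERRUPT call per vector. A userspace sketch, with a hypothetical vcpu_fd:

/* Userspace sketch: queue one external interrupt vector. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int queue_irq(int vcpu_fd, unsigned int vector)
{
        struct kvm_interrupt irq = { .irq = vector };

        /* fails with ENXIO when the in-kernel irqchip is in use,
         * and with EINVAL for vectors outside 0..255 */
        return ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
}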
2522static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2523 struct kvm_debug_guest *dbg)
2524{ 2912{
2525 int r; 2913 int r;
2526 2914
@@ -2533,179 +2921,6 @@ static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2533 return r; 2921 return r;
2534} 2922}
2535 2923
2536static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2537 unsigned long address,
2538 int *type)
2539{
2540 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2541 unsigned long pgoff;
2542 struct page *page;
2543
2544 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2545 if (pgoff == 0)
2546 page = virt_to_page(vcpu->run);
2547 else if (pgoff == KVM_PIO_PAGE_OFFSET)
2548 page = virt_to_page(vcpu->pio_data);
2549 else
2550 return NOPAGE_SIGBUS;
2551 get_page(page);
2552 if (type != NULL)
2553 *type = VM_FAULT_MINOR;
2554
2555 return page;
2556}
2557
2558static struct vm_operations_struct kvm_vcpu_vm_ops = {
2559 .nopage = kvm_vcpu_nopage,
2560};
2561
2562static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2563{
2564 vma->vm_ops = &kvm_vcpu_vm_ops;
2565 return 0;
2566}
2567
2568static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2569{
2570 struct kvm_vcpu *vcpu = filp->private_data;
2571
2572 fput(vcpu->kvm->filp);
2573 return 0;
2574}
2575
2576static struct file_operations kvm_vcpu_fops = {
2577 .release = kvm_vcpu_release,
2578 .unlocked_ioctl = kvm_vcpu_ioctl,
2579 .compat_ioctl = kvm_vcpu_ioctl,
2580 .mmap = kvm_vcpu_mmap,
2581};
2582
2583/*
2584 * Allocates an inode for the vcpu.
2585 */
2586static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2587{
2588 int fd, r;
2589 struct inode *inode;
2590 struct file *file;
2591
2592 r = anon_inode_getfd(&fd, &inode, &file,
2593 "kvm-vcpu", &kvm_vcpu_fops, vcpu);
2594 if (r)
2595 return r;
2596 atomic_inc(&vcpu->kvm->filp->f_count);
2597 return fd;
2598}
2599
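create_vcpu_fd() above hands userspace an anonymous-inode fd whose mmap is served by kvm_vcpu_nopage(): page 0 is the kvm_run area and page 1 the PIO data page, which is why KVM_GET_VCPU_MMAP_SIZE (later in this file) reports 2 * PAGE_SIZE. A bring-up sketch of that plumbing, with error handling trimmed for brevity:

/* Userspace sketch: /dev/kvm -> VM fd -> vcpu fd -> mmap'ed kvm_run. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

static struct kvm_run *create_vcpu(int *vm_fdp, int *vcpu_fdp)
{
        int kvm_fd, vm_fd, vcpu_fd, map_size;

        kvm_fd = open("/dev/kvm", O_RDWR);
        if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
                return MAP_FAILED;

        vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
        vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
        map_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);

        *vm_fdp = vm_fd;
        *vcpu_fdp = vcpu_fd;
        return mmap(NULL, map_size, PROT_READ | PROT_WRITE,
                    MAP_SHARED, vcpu_fd, 0);
}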
2600/*
2601 * Creates some virtual cpus. Good luck creating more than one.
2602 */
2603static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2604{
2605 int r;
2606 struct kvm_vcpu *vcpu;
2607
2608 if (!valid_vcpu(n))
2609 return -EINVAL;
2610
2611 vcpu = kvm_x86_ops->vcpu_create(kvm, n);
2612 if (IS_ERR(vcpu))
2613 return PTR_ERR(vcpu);
2614
2615 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
2616
2617 /* We do fxsave: this must be aligned. */
2618 BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
2619
2620 vcpu_load(vcpu);
2621 r = kvm_mmu_setup(vcpu);
2622 vcpu_put(vcpu);
2623 if (r < 0)
2624 goto free_vcpu;
2625
2626 mutex_lock(&kvm->lock);
2627 if (kvm->vcpus[n]) {
2628 r = -EEXIST;
2629 mutex_unlock(&kvm->lock);
2630 goto mmu_unload;
2631 }
2632 kvm->vcpus[n] = vcpu;
2633 mutex_unlock(&kvm->lock);
2634
2635 /* Now it's all set up, let userspace reach it */
2636 r = create_vcpu_fd(vcpu);
2637 if (r < 0)
2638 goto unlink;
2639 return r;
2640
2641unlink:
2642 mutex_lock(&kvm->lock);
2643 kvm->vcpus[n] = NULL;
2644 mutex_unlock(&kvm->lock);
2645
2646mmu_unload:
2647 vcpu_load(vcpu);
2648 kvm_mmu_unload(vcpu);
2649 vcpu_put(vcpu);
2650
2651free_vcpu:
2652 kvm_x86_ops->vcpu_free(vcpu);
2653 return r;
2654}
2655
2656static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2657{
2658 u64 efer;
2659 int i;
2660 struct kvm_cpuid_entry *e, *entry;
2661
2662 rdmsrl(MSR_EFER, efer);
2663 entry = NULL;
2664 for (i = 0; i < vcpu->cpuid_nent; ++i) {
2665 e = &vcpu->cpuid_entries[i];
2666 if (e->function == 0x80000001) {
2667 entry = e;
2668 break;
2669 }
2670 }
2671 if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
2672 entry->edx &= ~(1 << 20);
2673 printk(KERN_INFO "kvm: guest NX capability removed\n");
2674 }
2675}
2676
2677static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2678 struct kvm_cpuid *cpuid,
2679 struct kvm_cpuid_entry __user *entries)
2680{
2681 int r;
2682
2683 r = -E2BIG;
2684 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2685 goto out;
2686 r = -EFAULT;
2687 if (copy_from_user(&vcpu->cpuid_entries, entries,
2688 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2689 goto out;
2690 vcpu->cpuid_nent = cpuid->nent;
2691 cpuid_fix_nx_cap(vcpu);
2692 return 0;
2693
2694out:
2695 return r;
2696}
2697
2698static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2699{
2700 if (sigset) {
2701 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2702 vcpu->sigset_active = 1;
2703 vcpu->sigset = *sigset;
2704 } else
2705 vcpu->sigset_active = 0;
2706 return 0;
2707}
2708
2709/* 2924/*
2710 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when 2925 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
2711 * we have asm/x86/processor.h 2926 * we have asm/x86/processor.h
@@ -2727,9 +2942,31 @@ struct fxsave {
2727#endif 2942#endif
2728}; 2943};
2729 2944
2730static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 2945/*
2946 * Translate a guest virtual address to a guest physical address.
2947 */
2948int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2949 struct kvm_translation *tr)
2731{ 2950{
2732 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; 2951 unsigned long vaddr = tr->linear_address;
2952 gpa_t gpa;
2953
2954 vcpu_load(vcpu);
2955 down_read(&current->mm->mmap_sem);
2956 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
2957 up_read(&current->mm->mmap_sem);
2958 tr->physical_address = gpa;
2959 tr->valid = gpa != UNMAPPED_GVA;
2960 tr->writeable = 1;
2961 tr->usermode = 0;
2962 vcpu_put(vcpu);
2963
2964 return 0;
2965}
2966
2967int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2968{
2969 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2733 2970
2734 vcpu_load(vcpu); 2971 vcpu_load(vcpu);
2735 2972
@@ -2747,9 +2984,9 @@ static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2747 return 0; 2984 return 0;
2748} 2985}
2749 2986
2750static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 2987int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2751{ 2988{
2752 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; 2989 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2753 2990
2754 vcpu_load(vcpu); 2991 vcpu_load(vcpu);
2755 2992
@@ -2767,862 +3004,284 @@ static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2767 return 0; 3004 return 0;
2768} 3005}
2769 3006
2770static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 3007void fx_init(struct kvm_vcpu *vcpu)
2771 struct kvm_lapic_state *s)
2772{ 3008{
2773 vcpu_load(vcpu); 3009 unsigned after_mxcsr_mask;
2774 memcpy(s->regs, vcpu->apic->regs, sizeof *s);
2775 vcpu_put(vcpu);
2776
2777 return 0;
2778}
2779 3010
2780static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 3011 /* Initialize guest FPU by resetting ours and saving into guest's */
2781 struct kvm_lapic_state *s) 3012 preempt_disable();
2782{ 3013 fx_save(&vcpu->arch.host_fx_image);
2783 vcpu_load(vcpu); 3014 fpu_init();
2784 memcpy(vcpu->apic->regs, s->regs, sizeof *s); 3015 fx_save(&vcpu->arch.guest_fx_image);
2785 kvm_apic_post_state_restore(vcpu); 3016 fx_restore(&vcpu->arch.host_fx_image);
2786 vcpu_put(vcpu); 3017 preempt_enable();
2787 3018
2788 return 0; 3019 vcpu->arch.cr0 |= X86_CR0_ET;
3020 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
3021 vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
3022 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
3023 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
2789} 3024}
3025EXPORT_SYMBOL_GPL(fx_init);
2790 3026
2791static long kvm_vcpu_ioctl(struct file *filp, 3027void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
2792 unsigned int ioctl, unsigned long arg)
2793{ 3028{
2794 struct kvm_vcpu *vcpu = filp->private_data; 3029 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
2795 void __user *argp = (void __user *)arg; 3030 return;
2796 int r = -EINVAL;
2797
2798 switch (ioctl) {
2799 case KVM_RUN:
2800 r = -EINVAL;
2801 if (arg)
2802 goto out;
2803 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2804 break;
2805 case KVM_GET_REGS: {
2806 struct kvm_regs kvm_regs;
2807
2808 memset(&kvm_regs, 0, sizeof kvm_regs);
2809 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2810 if (r)
2811 goto out;
2812 r = -EFAULT;
2813 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2814 goto out;
2815 r = 0;
2816 break;
2817 }
2818 case KVM_SET_REGS: {
2819 struct kvm_regs kvm_regs;
2820
2821 r = -EFAULT;
2822 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2823 goto out;
2824 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2825 if (r)
2826 goto out;
2827 r = 0;
2828 break;
2829 }
2830 case KVM_GET_SREGS: {
2831 struct kvm_sregs kvm_sregs;
2832
2833 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2834 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2835 if (r)
2836 goto out;
2837 r = -EFAULT;
2838 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2839 goto out;
2840 r = 0;
2841 break;
2842 }
2843 case KVM_SET_SREGS: {
2844 struct kvm_sregs kvm_sregs;
2845
2846 r = -EFAULT;
2847 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2848 goto out;
2849 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2850 if (r)
2851 goto out;
2852 r = 0;
2853 break;
2854 }
2855 case KVM_TRANSLATE: {
2856 struct kvm_translation tr;
2857
2858 r = -EFAULT;
2859 if (copy_from_user(&tr, argp, sizeof tr))
2860 goto out;
2861 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2862 if (r)
2863 goto out;
2864 r = -EFAULT;
2865 if (copy_to_user(argp, &tr, sizeof tr))
2866 goto out;
2867 r = 0;
2868 break;
2869 }
2870 case KVM_INTERRUPT: {
2871 struct kvm_interrupt irq;
2872
2873 r = -EFAULT;
2874 if (copy_from_user(&irq, argp, sizeof irq))
2875 goto out;
2876 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2877 if (r)
2878 goto out;
2879 r = 0;
2880 break;
2881 }
2882 case KVM_DEBUG_GUEST: {
2883 struct kvm_debug_guest dbg;
2884
2885 r = -EFAULT;
2886 if (copy_from_user(&dbg, argp, sizeof dbg))
2887 goto out;
2888 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2889 if (r)
2890 goto out;
2891 r = 0;
2892 break;
2893 }
2894 case KVM_GET_MSRS:
2895 r = msr_io(vcpu, argp, kvm_get_msr, 1);
2896 break;
2897 case KVM_SET_MSRS:
2898 r = msr_io(vcpu, argp, do_set_msr, 0);
2899 break;
2900 case KVM_SET_CPUID: {
2901 struct kvm_cpuid __user *cpuid_arg = argp;
2902 struct kvm_cpuid cpuid;
2903
2904 r = -EFAULT;
2905 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2906 goto out;
2907 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2908 if (r)
2909 goto out;
2910 break;
2911 }
2912 case KVM_SET_SIGNAL_MASK: {
2913 struct kvm_signal_mask __user *sigmask_arg = argp;
2914 struct kvm_signal_mask kvm_sigmask;
2915 sigset_t sigset, *p;
2916
2917 p = NULL;
2918 if (argp) {
2919 r = -EFAULT;
2920 if (copy_from_user(&kvm_sigmask, argp,
2921 sizeof kvm_sigmask))
2922 goto out;
2923 r = -EINVAL;
2924 if (kvm_sigmask.len != sizeof sigset)
2925 goto out;
2926 r = -EFAULT;
2927 if (copy_from_user(&sigset, sigmask_arg->sigset,
2928 sizeof sigset))
2929 goto out;
2930 p = &sigset;
2931 }
2932 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
2933 break;
2934 }
2935 case KVM_GET_FPU: {
2936 struct kvm_fpu fpu;
2937
2938 memset(&fpu, 0, sizeof fpu);
2939 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2940 if (r)
2941 goto out;
2942 r = -EFAULT;
2943 if (copy_to_user(argp, &fpu, sizeof fpu))
2944 goto out;
2945 r = 0;
2946 break;
2947 }
2948 case KVM_SET_FPU: {
2949 struct kvm_fpu fpu;
2950
2951 r = -EFAULT;
2952 if (copy_from_user(&fpu, argp, sizeof fpu))
2953 goto out;
2954 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2955 if (r)
2956 goto out;
2957 r = 0;
2958 break;
2959 }
2960 case KVM_GET_LAPIC: {
2961 struct kvm_lapic_state lapic;
2962
2963 memset(&lapic, 0, sizeof lapic);
2964 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
2965 if (r)
2966 goto out;
2967 r = -EFAULT;
2968 if (copy_to_user(argp, &lapic, sizeof lapic))
2969 goto out;
2970 r = 0;
2971 break;
2972 }
2973 case KVM_SET_LAPIC: {
2974 struct kvm_lapic_state lapic;
2975 3031
2976 r = -EFAULT; 3032 vcpu->guest_fpu_loaded = 1;
2977 if (copy_from_user(&lapic, argp, sizeof lapic)) 3033 fx_save(&vcpu->arch.host_fx_image);
2978 goto out; 3034 fx_restore(&vcpu->arch.guest_fx_image);
2979 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
2980 if (r)
2981 goto out;
2982 r = 0;
2983 break;
2984 }
2985 default:
2986 ;
2987 }
2988out:
2989 return r;
2990} 3035}
3036EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
2991 3037
2992static long kvm_vm_ioctl(struct file *filp, 3038void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
2993 unsigned int ioctl, unsigned long arg)
2994{ 3039{
2995 struct kvm *kvm = filp->private_data; 3040 if (!vcpu->guest_fpu_loaded)
2996 void __user *argp = (void __user *)arg; 3041 return;
2997 int r = -EINVAL;
2998
2999 switch (ioctl) {
3000 case KVM_CREATE_VCPU:
3001 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
3002 if (r < 0)
3003 goto out;
3004 break;
3005 case KVM_SET_MEMORY_REGION: {
3006 struct kvm_memory_region kvm_mem;
3007
3008 r = -EFAULT;
3009 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
3010 goto out;
3011 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
3012 if (r)
3013 goto out;
3014 break;
3015 }
3016 case KVM_GET_DIRTY_LOG: {
3017 struct kvm_dirty_log log;
3018
3019 r = -EFAULT;
3020 if (copy_from_user(&log, argp, sizeof log))
3021 goto out;
3022 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3023 if (r)
3024 goto out;
3025 break;
3026 }
3027 case KVM_SET_MEMORY_ALIAS: {
3028 struct kvm_memory_alias alias;
3029
3030 r = -EFAULT;
3031 if (copy_from_user(&alias, argp, sizeof alias))
3032 goto out;
3033 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
3034 if (r)
3035 goto out;
3036 break;
3037 }
3038 case KVM_CREATE_IRQCHIP:
3039 r = -ENOMEM;
3040 kvm->vpic = kvm_create_pic(kvm);
3041 if (kvm->vpic) {
3042 r = kvm_ioapic_init(kvm);
3043 if (r) {
3044 kfree(kvm->vpic);
3045 kvm->vpic = NULL;
3046 goto out;
3047 }
3048 }
3049 else
3050 goto out;
3051 break;
3052 case KVM_IRQ_LINE: {
3053 struct kvm_irq_level irq_event;
3054
3055 r = -EFAULT;
3056 if (copy_from_user(&irq_event, argp, sizeof irq_event))
3057 goto out;
3058 if (irqchip_in_kernel(kvm)) {
3059 mutex_lock(&kvm->lock);
3060 if (irq_event.irq < 16)
3061 kvm_pic_set_irq(pic_irqchip(kvm),
3062 irq_event.irq,
3063 irq_event.level);
3064 kvm_ioapic_set_irq(kvm->vioapic,
3065 irq_event.irq,
3066 irq_event.level);
3067 mutex_unlock(&kvm->lock);
3068 r = 0;
3069 }
3070 break;
3071 }
3072 case KVM_GET_IRQCHIP: {
3073 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3074 struct kvm_irqchip chip;
3075
3076 r = -EFAULT;
3077 if (copy_from_user(&chip, argp, sizeof chip))
3078 goto out;
3079 r = -ENXIO;
3080 if (!irqchip_in_kernel(kvm))
3081 goto out;
3082 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
3083 if (r)
3084 goto out;
3085 r = -EFAULT;
3086 if (copy_to_user(argp, &chip, sizeof chip))
3087 goto out;
3088 r = 0;
3089 break;
3090 }
3091 case KVM_SET_IRQCHIP: {
3092 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3093 struct kvm_irqchip chip;
3094 3042
3095 r = -EFAULT; 3043 vcpu->guest_fpu_loaded = 0;
3096 if (copy_from_user(&chip, argp, sizeof chip)) 3044 fx_save(&vcpu->arch.guest_fx_image);
3097 goto out; 3045 fx_restore(&vcpu->arch.host_fx_image);
3098 r = -ENXIO; 3046 ++vcpu->stat.fpu_reload;
3099 if (!irqchip_in_kernel(kvm))
3100 goto out;
3101 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
3102 if (r)
3103 goto out;
3104 r = 0;
3105 break;
3106 }
3107 default:
3108 ;
3109 }
3110out:
3111 return r;
3112} 3047}
3048EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
3113 3049
3114static struct page *kvm_vm_nopage(struct vm_area_struct *vma, 3050void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
3115 unsigned long address,
3116 int *type)
3117{ 3051{
3118 struct kvm *kvm = vma->vm_file->private_data; 3052 kvm_x86_ops->vcpu_free(vcpu);
3119 unsigned long pgoff;
3120 struct page *page;
3121
3122 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3123 page = gfn_to_page(kvm, pgoff);
3124 if (!page)
3125 return NOPAGE_SIGBUS;
3126 get_page(page);
3127 if (type != NULL)
3128 *type = VM_FAULT_MINOR;
3129
3130 return page;
3131} 3053}
3132 3054
3133static struct vm_operations_struct kvm_vm_vm_ops = { 3055struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
3134 .nopage = kvm_vm_nopage, 3056 unsigned int id)
3135};
3136
3137static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
3138{ 3057{
3139 vma->vm_ops = &kvm_vm_vm_ops; 3058 return kvm_x86_ops->vcpu_create(kvm, id);
3140 return 0;
3141} 3059}
3142 3060
3143static struct file_operations kvm_vm_fops = { 3061int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
3144 .release = kvm_vm_release,
3145 .unlocked_ioctl = kvm_vm_ioctl,
3146 .compat_ioctl = kvm_vm_ioctl,
3147 .mmap = kvm_vm_mmap,
3148};
3149
3150static int kvm_dev_ioctl_create_vm(void)
3151{ 3062{
3152 int fd, r; 3063 int r;
3153 struct inode *inode;
3154 struct file *file;
3155 struct kvm *kvm;
3156 3064
3157 kvm = kvm_create_vm(); 3065 /* We do fxsave: this must be aligned. */
3158 if (IS_ERR(kvm)) 3066 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
3159 return PTR_ERR(kvm);
3160 r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
3161 if (r) {
3162 kvm_destroy_vm(kvm);
3163 return r;
3164 }
3165 3067
3166 kvm->filp = file; 3068 vcpu_load(vcpu);
3069 r = kvm_arch_vcpu_reset(vcpu);
3070 if (r == 0)
3071 r = kvm_mmu_setup(vcpu);
3072 vcpu_put(vcpu);
3073 if (r < 0)
3074 goto free_vcpu;
3167 3075
3168 return fd; 3076 return 0;
3077free_vcpu:
3078 kvm_x86_ops->vcpu_free(vcpu);
3079 return r;
3169} 3080}
3170 3081
3171static long kvm_dev_ioctl(struct file *filp, 3082void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
3172 unsigned int ioctl, unsigned long arg)
3173{ 3083{
3174 void __user *argp = (void __user *)arg; 3084 vcpu_load(vcpu);
3175 long r = -EINVAL; 3085 kvm_mmu_unload(vcpu);
3176 3086 vcpu_put(vcpu);
3177 switch (ioctl) {
3178 case KVM_GET_API_VERSION:
3179 r = -EINVAL;
3180 if (arg)
3181 goto out;
3182 r = KVM_API_VERSION;
3183 break;
3184 case KVM_CREATE_VM:
3185 r = -EINVAL;
3186 if (arg)
3187 goto out;
3188 r = kvm_dev_ioctl_create_vm();
3189 break;
3190 case KVM_GET_MSR_INDEX_LIST: {
3191 struct kvm_msr_list __user *user_msr_list = argp;
3192 struct kvm_msr_list msr_list;
3193 unsigned n;
3194
3195 r = -EFAULT;
3196 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
3197 goto out;
3198 n = msr_list.nmsrs;
3199 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
3200 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
3201 goto out;
3202 r = -E2BIG;
3203 if (n < num_msrs_to_save)
3204 goto out;
3205 r = -EFAULT;
3206 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3207 num_msrs_to_save * sizeof(u32)))
3208 goto out;
3209 if (copy_to_user(user_msr_list->indices
3210 + num_msrs_to_save * sizeof(u32),
3211 &emulated_msrs,
3212 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
3213 goto out;
3214 r = 0;
3215 break;
3216 }
3217 case KVM_CHECK_EXTENSION: {
3218 int ext = (long)argp;
3219 3087
3220 switch (ext) { 3088 kvm_x86_ops->vcpu_free(vcpu);
3221 case KVM_CAP_IRQCHIP:
3222 case KVM_CAP_HLT:
3223 r = 1;
3224 break;
3225 default:
3226 r = 0;
3227 break;
3228 }
3229 break;
3230 }
3231 case KVM_GET_VCPU_MMAP_SIZE:
3232 r = -EINVAL;
3233 if (arg)
3234 goto out;
3235 r = 2 * PAGE_SIZE;
3236 break;
3237 default:
3238 ;
3239 }
3240out:
3241 return r;
3242} 3089}
3243 3090
3244static struct file_operations kvm_chardev_ops = { 3091int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
3245 .unlocked_ioctl = kvm_dev_ioctl,
3246 .compat_ioctl = kvm_dev_ioctl,
3247};
3248
3249static struct miscdevice kvm_dev = {
3250 KVM_MINOR,
3251 "kvm",
3252 &kvm_chardev_ops,
3253};
3254
3255/*
3256 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
3257 * cached on it.
3258 */
3259static void decache_vcpus_on_cpu(int cpu)
3260{ 3092{
3261 struct kvm *vm; 3093 return kvm_x86_ops->vcpu_reset(vcpu);
3262 struct kvm_vcpu *vcpu;
3263 int i;
3264
3265 spin_lock(&kvm_lock);
3266 list_for_each_entry(vm, &vm_list, vm_list)
3267 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3268 vcpu = vm->vcpus[i];
3269 if (!vcpu)
3270 continue;
3271 /*
3272 * If the vcpu is locked, then it is running on some
3273 * other cpu and therefore it is not cached on the
3274 * cpu in question.
3275 *
3276 * If it's not locked, check the last cpu it executed
3277 * on.
3278 */
3279 if (mutex_trylock(&vcpu->mutex)) {
3280 if (vcpu->cpu == cpu) {
3281 kvm_x86_ops->vcpu_decache(vcpu);
3282 vcpu->cpu = -1;
3283 }
3284 mutex_unlock(&vcpu->mutex);
3285 }
3286 }
3287 spin_unlock(&kvm_lock);
3288} 3094}
3289 3095
3290static void hardware_enable(void *junk) 3096void kvm_arch_hardware_enable(void *garbage)
3291{ 3097{
3292 int cpu = raw_smp_processor_id(); 3098 kvm_x86_ops->hardware_enable(garbage);
3293
3294 if (cpu_isset(cpu, cpus_hardware_enabled))
3295 return;
3296 cpu_set(cpu, cpus_hardware_enabled);
3297 kvm_x86_ops->hardware_enable(NULL);
3298} 3099}
3299 3100
3300static void hardware_disable(void *junk) 3101void kvm_arch_hardware_disable(void *garbage)
3301{ 3102{
3302 int cpu = raw_smp_processor_id(); 3103 kvm_x86_ops->hardware_disable(garbage);
3303
3304 if (!cpu_isset(cpu, cpus_hardware_enabled))
3305 return;
3306 cpu_clear(cpu, cpus_hardware_enabled);
3307 decache_vcpus_on_cpu(cpu);
3308 kvm_x86_ops->hardware_disable(NULL);
3309} 3104}
3310 3105
3311static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 3106int kvm_arch_hardware_setup(void)
3312 void *v)
3313{ 3107{
3314 int cpu = (long)v; 3108 return kvm_x86_ops->hardware_setup();
3315
3316 switch (val) {
3317 case CPU_DYING:
3318 case CPU_DYING_FROZEN:
3319 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3320 cpu);
3321 hardware_disable(NULL);
3322 break;
3323 case CPU_UP_CANCELED:
3324 case CPU_UP_CANCELED_FROZEN:
3325 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3326 cpu);
3327 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
3328 break;
3329 case CPU_ONLINE:
3330 case CPU_ONLINE_FROZEN:
3331 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3332 cpu);
3333 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
3334 break;
3335 }
3336 return NOTIFY_OK;
3337} 3109}
3338 3110
3339static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 3111void kvm_arch_hardware_unsetup(void)
3340 void *v)
3341{ 3112{
3342 if (val == SYS_RESTART) { 3113 kvm_x86_ops->hardware_unsetup();
3343 /*
3344 * Some (well, at least mine) BIOSes hang on reboot if
3345 * in vmx root mode.
3346 */
3347 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
3348 on_each_cpu(hardware_disable, NULL, 0, 1);
3349 }
3350 return NOTIFY_OK;
3351} 3114}
3352 3115
3353static struct notifier_block kvm_reboot_notifier = { 3116void kvm_arch_check_processor_compat(void *rtn)
3354 .notifier_call = kvm_reboot,
3355 .priority = 0,
3356};
3357
3358void kvm_io_bus_init(struct kvm_io_bus *bus)
3359{ 3117{
3360 memset(bus, 0, sizeof(*bus)); 3118 kvm_x86_ops->check_processor_compatibility(rtn);
3361} 3119}
3362 3120
3363void kvm_io_bus_destroy(struct kvm_io_bus *bus) 3121int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
3364{ 3122{
3365 int i; 3123 struct page *page;
3124 struct kvm *kvm;
3125 int r;
3366 3126
3367 for (i = 0; i < bus->dev_count; i++) { 3127 BUG_ON(vcpu->kvm == NULL);
3368 struct kvm_io_device *pos = bus->devs[i]; 3128 kvm = vcpu->kvm;
3369 3129
3370 kvm_iodevice_destructor(pos); 3130 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3371 } 3131 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
3372} 3132 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
3133 else
3134 vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
3373 3135
drivers/kvm/kvm_main.c (removed, old lines 3374-3628):

3374struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
3375{
3376 int i;
3377
3378 for (i = 0; i < bus->dev_count; i++) {
3379 struct kvm_io_device *pos = bus->devs[i];
3380
3381 if (pos->in_range(pos, addr))
3382 return pos;
3383 }
3384
3385 return NULL;
3386}
3387
3388void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3389{
3390 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3391
3392 bus->devs[bus->dev_count++] = dev;
3393}
3394
3395static struct notifier_block kvm_cpu_notifier = {
3396 .notifier_call = kvm_cpu_hotplug,
3397 .priority = 20, /* must be > scheduler priority */
3398};
3399
3400static u64 stat_get(void *_offset)
3401{
3402 unsigned offset = (long)_offset;
3403 u64 total = 0;
3404 struct kvm *kvm;
3405 struct kvm_vcpu *vcpu;
3406 int i;
3407
3408 spin_lock(&kvm_lock);
3409 list_for_each_entry(kvm, &vm_list, vm_list)
3410 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3411 vcpu = kvm->vcpus[i];
3412 if (vcpu)
3413 total += *(u32 *)((void *)vcpu + offset);
3414 }
3415 spin_unlock(&kvm_lock);
3416 return total;
3417}
3418
3419DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
3420
3421static __init void kvm_init_debug(void)
3422{
3423 struct kvm_stats_debugfs_item *p;
3424
3425 debugfs_dir = debugfs_create_dir("kvm", NULL);
3426 for (p = debugfs_entries; p->name; ++p)
3427 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3428 (void *)(long)p->offset,
3429 &stat_fops);
3430}
3431
3432static void kvm_exit_debug(void)
3433{
3434 struct kvm_stats_debugfs_item *p;
3435
3436 for (p = debugfs_entries; p->name; ++p)
3437 debugfs_remove(p->dentry);
3438 debugfs_remove(debugfs_dir);
3439}
3440
3441static int kvm_suspend(struct sys_device *dev, pm_message_t state)
3442{
3443 hardware_disable(NULL);
3444 return 0;
3445}
3446
3447static int kvm_resume(struct sys_device *dev)
3448{
3449 hardware_enable(NULL);
3450 return 0;
3451}
3452
3453static struct sysdev_class kvm_sysdev_class = {
3454 .name = "kvm",
3455 .suspend = kvm_suspend,
3456 .resume = kvm_resume,
3457};
3458
3459static struct sys_device kvm_sysdev = {
3460 .id = 0,
3461 .cls = &kvm_sysdev_class,
3462};
3463
3464hpa_t bad_page_address;
3465
3466static inline
3467struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
3468{
3469 return container_of(pn, struct kvm_vcpu, preempt_notifier);
3470}
3471
3472static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
3473{
3474 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3475
3476 kvm_x86_ops->vcpu_load(vcpu, cpu);
3477}
3478
3479static void kvm_sched_out(struct preempt_notifier *pn,
3480 struct task_struct *next)
3481{
3482 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3483
3484 kvm_x86_ops->vcpu_put(vcpu);
3485}
3486
3487int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
3488 struct module *module)
3489{
3490 int r;
3491 int cpu;
3492
3493 if (kvm_x86_ops) {
3494 printk(KERN_ERR "kvm: already loaded the other module\n");
3495 return -EEXIST;
3496 }
3497
3498 if (!ops->cpu_has_kvm_support()) {
3499 printk(KERN_ERR "kvm: no hardware support\n");
3500 return -EOPNOTSUPP;
3501 }
3502 if (ops->disabled_by_bios()) {
3503 printk(KERN_ERR "kvm: disabled by bios\n");
3504 return -EOPNOTSUPP;
3505 }
3506
3507 kvm_x86_ops = ops;
3508
3509 r = kvm_x86_ops->hardware_setup();
3510 if (r < 0)
3511 goto out;
3512
3513 for_each_online_cpu(cpu) {
3514 smp_call_function_single(cpu,
3515 kvm_x86_ops->check_processor_compatibility,
3516 &r, 0, 1);
3517 if (r < 0)
3518 goto out_free_0;
3519 }
3520
3521 on_each_cpu(hardware_enable, NULL, 0, 1);
3522 r = register_cpu_notifier(&kvm_cpu_notifier);
3523 if (r)
3524 goto out_free_1;
3525 register_reboot_notifier(&kvm_reboot_notifier);
3526
3527 r = sysdev_class_register(&kvm_sysdev_class);
3528 if (r)
3529 goto out_free_2;
3530
3531 r = sysdev_register(&kvm_sysdev);
3532 if (r)
3533 goto out_free_3;
3534
3535 /* A kmem cache lets us meet the alignment requirements of fx_save. */
3536 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
3537 __alignof__(struct kvm_vcpu), 0, 0);
3538 if (!kvm_vcpu_cache) {
3539 r = -ENOMEM;
3540 goto out_free_4;
3541 }
3542
3543 kvm_chardev_ops.owner = module;
3544
3545 r = misc_register(&kvm_dev);
3546 if (r) {
3547 printk (KERN_ERR "kvm: misc device register failed\n");
3548 goto out_free;
3549 }
3550
3551 kvm_preempt_ops.sched_in = kvm_sched_in;
3552 kvm_preempt_ops.sched_out = kvm_sched_out;
3553
3554 return r;
3555
3556out_free:
3557 kmem_cache_destroy(kvm_vcpu_cache);
3558out_free_4:
3559 sysdev_unregister(&kvm_sysdev);
3560out_free_3:
3561 sysdev_class_unregister(&kvm_sysdev_class);
3562out_free_2:
3563 unregister_reboot_notifier(&kvm_reboot_notifier);
3564 unregister_cpu_notifier(&kvm_cpu_notifier);
3565out_free_1:
3566 on_each_cpu(hardware_disable, NULL, 0, 1);
3567out_free_0:
3568 kvm_x86_ops->hardware_unsetup();
3569out:
3570 kvm_x86_ops = NULL;
3571 return r;
3572}
3573
3574void kvm_exit_x86(void)
3575{
3576 misc_deregister(&kvm_dev);
3577 kmem_cache_destroy(kvm_vcpu_cache);
3578 sysdev_unregister(&kvm_sysdev);
3579 sysdev_class_unregister(&kvm_sysdev_class);
3580 unregister_reboot_notifier(&kvm_reboot_notifier);
3581 unregister_cpu_notifier(&kvm_cpu_notifier);
3582 on_each_cpu(hardware_disable, NULL, 0, 1);
3583 kvm_x86_ops->hardware_unsetup();
3584 kvm_x86_ops = NULL;
3585}
3586
3587static __init int kvm_init(void)
3588{
3589 static struct page *bad_page;
3590 int r;
3591
3592 r = kvm_mmu_module_init();
3593 if (r)
3594 goto out4;
3595
3596 kvm_init_debug();
3597
3598 kvm_init_msr_list();
3599
3600 if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3601 r = -ENOMEM;
3602 goto out;
3603 }
3604
3605 bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3606 memset(__va(bad_page_address), 0, PAGE_SIZE);
3607
3608 return 0;
3609
3610out:
3611 kvm_exit_debug();
3612 kvm_mmu_module_exit();
3613out4:
3614 return r;
3615}
3616
3617static __exit void kvm_exit(void)
3618{
3619 kvm_exit_debug();
3620 __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3621 kvm_mmu_module_exit();
3622}
3623
3624module_init(kvm_init)
3625module_exit(kvm_exit)
3626
3627EXPORT_SYMBOL_GPL(kvm_init_x86);
3628EXPORT_SYMBOL_GPL(kvm_exit_x86);

arch/x86/kvm/x86.c (added, new lines 3136-3287):

3136 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3137 if (!page) {
3138 r = -ENOMEM;
3139 goto fail;
3140 }
3141 vcpu->arch.pio_data = page_address(page);
3142
3143 r = kvm_mmu_create(vcpu);
3144 if (r < 0)
3145 goto fail_free_pio_data;
3146
3147 if (irqchip_in_kernel(kvm)) {
3148 r = kvm_create_lapic(vcpu);
3149 if (r < 0)
3150 goto fail_mmu_destroy;
3151 }
3152
3153 return 0;
3154
3155fail_mmu_destroy:
3156 kvm_mmu_destroy(vcpu);
3157fail_free_pio_data:
3158 free_page((unsigned long)vcpu->arch.pio_data);
3159fail:
3160 return r;
3161}
3162
3163void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3164{
3165 kvm_free_lapic(vcpu);
3166 kvm_mmu_destroy(vcpu);
3167 free_page((unsigned long)vcpu->arch.pio_data);
3168}
3169
3170struct kvm *kvm_arch_create_vm(void)
3171{
3172 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
3173
3174 if (!kvm)
3175 return ERR_PTR(-ENOMEM);
3176
3177 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
3178
3179 return kvm;
3180}
3181
3182static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
3183{
3184 vcpu_load(vcpu);
3185 kvm_mmu_unload(vcpu);
3186 vcpu_put(vcpu);
3187}
3188
3189static void kvm_free_vcpus(struct kvm *kvm)
3190{
3191 unsigned int i;
3192
3193 /*
3194 * Unpin any mmu pages first.
3195 */
3196 for (i = 0; i < KVM_MAX_VCPUS; ++i)
3197 if (kvm->vcpus[i])
3198 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
3199 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3200 if (kvm->vcpus[i]) {
3201 kvm_arch_vcpu_free(kvm->vcpus[i]);
3202 kvm->vcpus[i] = NULL;
3203 }
3204 }
3205
3206}
3207
3208void kvm_arch_destroy_vm(struct kvm *kvm)
3209{
3210 kfree(kvm->arch.vpic);
3211 kfree(kvm->arch.vioapic);
3212 kvm_free_vcpus(kvm);
3213 kvm_free_physmem(kvm);
3214 kfree(kvm);
3215}
3216
3217int kvm_arch_set_memory_region(struct kvm *kvm,
3218 struct kvm_userspace_memory_region *mem,
3219 struct kvm_memory_slot old,
3220 int user_alloc)
3221{
3222 int npages = mem->memory_size >> PAGE_SHIFT;
3223 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
3224
3225 /*To keep backward compatibility with older userspace,
3226 *x86 needs to hanlde !user_alloc case.
3227 */
3228 if (!user_alloc) {
3229 if (npages && !old.rmap) {
3230 memslot->userspace_addr = do_mmap(NULL, 0,
3231 npages * PAGE_SIZE,
3232 PROT_READ | PROT_WRITE,
3233 MAP_SHARED | MAP_ANONYMOUS,
3234 0);
3235
3236 if (IS_ERR((void *)memslot->userspace_addr))
3237 return PTR_ERR((void *)memslot->userspace_addr);
3238 } else {
3239 if (!old.user_alloc && old.rmap) {
3240 int ret;
3241
3242 ret = do_munmap(current->mm, old.userspace_addr,
3243 old.npages * PAGE_SIZE);
3244 if (ret < 0)
3245 printk(KERN_WARNING
3246 "kvm_vm_ioctl_set_memory_region: "
3247 "failed to munmap memory\n");
3248 }
3249 }
3250 }
3251
3252 if (!kvm->arch.n_requested_mmu_pages) {
3253 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
3254 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
3255 }
3256
3257 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
3258 kvm_flush_remote_tlbs(kvm);
3259
3260 return 0;
3261}
3262
3263int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
3264{
3265 return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
3266 || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
3267}
3268
3269static void vcpu_kick_intr(void *info)
3270{
3271#ifdef DEBUG
3272 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
3273 printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
3274#endif
3275}
3276
3277void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3278{
3279 int ipi_pcpu = vcpu->cpu;
3280
3281 if (waitqueue_active(&vcpu->wq)) {
3282 wake_up_interruptible(&vcpu->wq);
3283 ++vcpu->stat.halt_wakeup;
3284 }
3285 if (vcpu->guest_mode)
3286 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
3287}
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
new file mode 100644
index 000000000000..79586003397a
--- /dev/null
+++ b/arch/x86/kvm/x86_emulate.c
@@ -0,0 +1,1912 @@
1/******************************************************************************
2 * x86_emulate.c
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privileged instructions:
10 *
11 * Copyright (C) 2006 Qumranet
12 *
13 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
20 */
21
22#ifndef __KERNEL__
23#include <stdio.h>
24#include <stdint.h>
25#include <public/xen.h>
26#define DPRINTF(_f, _a ...) printf(_f , ## _a)
27#else
28#include <linux/kvm_host.h>
29#define DPRINTF(x...) do {} while (0)
30#endif
31#include <linux/module.h>
32#include <asm/kvm_x86_emulate.h>
33
34/*
35 * Opcode effective-address decode tables.
36 * Note that we only emulate instructions that have at least one memory
37 * operand (excluding implicit stack references). We assume that stack
38 * references and instruction fetches will never occur in special memory
39 * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
40 * not be handled.
41 */
42
43/* Operand sizes: 8-bit operands or specified/overridden size. */
44#define ByteOp (1<<0) /* 8-bit operands. */
45/* Destination operand type. */
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1)
50/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3)
60/* Generic ModRM decode. */
61#define ModRM (1<<6)
62/* Destination is only written; never read. */
63#define Mov (1<<7)
64#define BitOp (1<<8)
65#define MemAbs (1<<9) /* Memory operand is absolute displacement */
66#define String (1<<10) /* String instruction (rep capable) */
67#define Stack (1<<11) /* Stack instruction (push/pop) */
68
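
The flag values above are bit fields that get OR'ed together into the 16-bit opcode descriptors below; DstMask and SrcMask carve out the destination and source classes while ByteOp, ModRM, Mov, BitOp, MemAbs, String and Stack are independent single bits. A minimal standalone sketch of pulling those fields back out of a descriptor (the sample descriptor value and the program itself are illustrative, not part of this file):

#include <stdio.h>
#include <stdint.h>

/* Same bit layout as the #defines above. */
#define ByteOp  (1 << 0)
#define DstReg  (2 << 1)
#define DstMask (3 << 1)
#define SrcImm  (5 << 3)
#define SrcMask (7 << 3)
#define ModRM   (1 << 6)

int main(void)
{
	uint16_t d = ByteOp | DstReg | SrcImm | ModRM;	/* hypothetical descriptor */

	printf("byte-sized operands: %s\n", (d & ByteOp) ? "yes" : "no");
	printf("destination class:   %#x (DstReg)\n", (unsigned)(d & DstMask));
	printf("source class:        %#x (SrcImm)\n", (unsigned)(d & SrcMask));
	printf("needs a ModRM byte:  %s\n", (d & ModRM) ? "yes" : "no");
	return 0;
}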
69static u16 opcode_table[256] = {
70 /* 0x00 - 0x07 */
71 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
72 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
73 0, 0, 0, 0,
74 /* 0x08 - 0x0F */
75 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
76 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
77 0, 0, 0, 0,
78 /* 0x10 - 0x17 */
79 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
80 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
81 0, 0, 0, 0,
82 /* 0x18 - 0x1F */
83 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
84 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
85 0, 0, 0, 0,
86 /* 0x20 - 0x27 */
87 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
88 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
89 SrcImmByte, SrcImm, 0, 0,
90 /* 0x28 - 0x2F */
91 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
92 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
93 0, 0, 0, 0,
94 /* 0x30 - 0x37 */
95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
97 0, 0, 0, 0,
98 /* 0x38 - 0x3F */
99 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
100 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
101 0, 0, 0, 0,
102 /* 0x40 - 0x47 */
103 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
104 /* 0x48 - 0x4F */
105 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
106 /* 0x50 - 0x57 */
107 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
108 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
109 /* 0x58 - 0x5F */
110 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
111 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
112 /* 0x60 - 0x67 */
113 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
114 0, 0, 0, 0,
115 /* 0x68 - 0x6F */
116 0, 0, ImplicitOps | Mov | Stack, 0,
117 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
118 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
119 /* 0x70 - 0x77 */
120 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
121 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
122 /* 0x78 - 0x7F */
123 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
124 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
125 /* 0x80 - 0x87 */
126 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
127 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
128 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
129 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
130 /* 0x88 - 0x8F */
131 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
132 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
133 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
134 /* 0x90 - 0x9F */
135 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
137 /* 0xA0 - 0xA7 */
138 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
139 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
140 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
141 ByteOp | ImplicitOps | String, ImplicitOps | String,
142 /* 0xA8 - 0xAF */
143 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
144 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
145 ByteOp | ImplicitOps | String, ImplicitOps | String,
146 /* 0xB0 - 0xBF */
147 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
148 /* 0xC0 - 0xC7 */
149 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
150 0, ImplicitOps | Stack, 0, 0,
151 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
152 /* 0xC8 - 0xCF */
153 0, 0, 0, 0, 0, 0, 0, 0,
154 /* 0xD0 - 0xD7 */
155 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
156 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
157 0, 0, 0, 0,
158 /* 0xD8 - 0xDF */
159 0, 0, 0, 0, 0, 0, 0, 0,
160 /* 0xE0 - 0xE7 */
161 0, 0, 0, 0, 0, 0, 0, 0,
162 /* 0xE8 - 0xEF */
163 ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
164 0, 0, 0, 0,
165 /* 0xF0 - 0xF7 */
166 0, 0, 0, 0,
167 ImplicitOps, ImplicitOps,
168 ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
169 /* 0xF8 - 0xFF */
170 ImplicitOps, 0, ImplicitOps, ImplicitOps,
171 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
172};
173
174static u16 twobyte_table[256] = {
175 /* 0x00 - 0x0F */
176 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
177 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
178 /* 0x10 - 0x1F */
179 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
180 /* 0x20 - 0x2F */
181 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
182 0, 0, 0, 0, 0, 0, 0, 0,
183 /* 0x30 - 0x3F */
184 ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185 /* 0x40 - 0x47 */
186 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
187 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
188 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
189 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
190 /* 0x48 - 0x4F */
191 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
192 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
193 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
194 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
195 /* 0x50 - 0x5F */
196 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197 /* 0x60 - 0x6F */
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 /* 0x70 - 0x7F */
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 0x80 - 0x8F */
202 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
203 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
204 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
205 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
206 /* 0x90 - 0x9F */
207 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
208 /* 0xA0 - 0xA7 */
209 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
210 /* 0xA8 - 0xAF */
211 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
212 /* 0xB0 - 0xB7 */
213 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
214 DstMem | SrcReg | ModRM | BitOp,
215 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
216 DstReg | SrcMem16 | ModRM | Mov,
217 /* 0xB8 - 0xBF */
218 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
219 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
220 DstReg | SrcMem16 | ModRM | Mov,
221 /* 0xC0 - 0xCF */
222 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
223 0, 0, 0, 0, 0, 0, 0, 0,
224 /* 0xD0 - 0xDF */
225 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
226 /* 0xE0 - 0xEF */
227 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
228 /* 0xF0 - 0xFF */
229 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
230};
231
232/* EFLAGS bit definitions. */
233#define EFLG_OF (1<<11)
234#define EFLG_DF (1<<10)
235#define EFLG_SF (1<<7)
236#define EFLG_ZF (1<<6)
237#define EFLG_AF (1<<4)
238#define EFLG_PF (1<<2)
239#define EFLG_CF (1<<0)
240
241/*
242 * Instruction emulation:
243 * Most instructions are emulated directly via a fragment of inline assembly
244 * code. This allows us to save/restore EFLAGS and thus very easily pick up
245 * any modified flags.
246 */
247
248#if defined(CONFIG_X86_64)
249#define _LO32 "k" /* force 32-bit operand */
250#define _STK "%%rsp" /* stack pointer */
251#elif defined(__i386__)
252#define _LO32 "" /* force 32-bit operand */
253#define _STK "%%esp" /* stack pointer */
254#endif
255
256/*
257 * These EFLAGS bits are restored from saved value during emulation, and
258 * any changes are written back to the saved value after emulation.
259 */
260#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
261
262/* Before executing instruction: restore necessary bits in EFLAGS. */
263#define _PRE_EFLAGS(_sav, _msk, _tmp) \
264 /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
265 "movl %"_sav",%"_LO32 _tmp"; " \
266 "push %"_tmp"; " \
267 "push %"_tmp"; " \
268 "movl %"_msk",%"_LO32 _tmp"; " \
269 "andl %"_LO32 _tmp",("_STK"); " \
270 "pushf; " \
271 "notl %"_LO32 _tmp"; " \
272 "andl %"_LO32 _tmp",("_STK"); " \
273 "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
274 "pop %"_tmp"; " \
275 "orl %"_LO32 _tmp",("_STK"); " \
276 "popf; " \
277 "pop %"_sav"; "
278
279/* After executing instruction: write-back necessary bits in EFLAGS. */
280#define _POST_EFLAGS(_sav, _msk, _tmp) \
281 /* _sav |= EFLAGS & _msk; */ \
282 "pushf; " \
283 "pop %"_tmp"; " \
284 "andl %"_msk",%"_LO32 _tmp"; " \
285 "orl %"_LO32 _tmp",%"_sav"; "
286
287/* Raw emulation: instruction has two explicit operands. */
288#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
289 do { \
290 unsigned long _tmp; \
291 \
292 switch ((_dst).bytes) { \
293 case 2: \
294 __asm__ __volatile__ ( \
295 _PRE_EFLAGS("0", "4", "2") \
296 _op"w %"_wx"3,%1; " \
297 _POST_EFLAGS("0", "4", "2") \
298 : "=m" (_eflags), "=m" ((_dst).val), \
299 "=&r" (_tmp) \
300 : _wy ((_src).val), "i" (EFLAGS_MASK)); \
301 break; \
302 case 4: \
303 __asm__ __volatile__ ( \
304 _PRE_EFLAGS("0", "4", "2") \
305 _op"l %"_lx"3,%1; " \
306 _POST_EFLAGS("0", "4", "2") \
307 : "=m" (_eflags), "=m" ((_dst).val), \
308 "=&r" (_tmp) \
309 : _ly ((_src).val), "i" (EFLAGS_MASK)); \
310 break; \
311 case 8: \
312 __emulate_2op_8byte(_op, _src, _dst, \
313 _eflags, _qx, _qy); \
314 break; \
315 } \
316 } while (0)
317
318#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
319 do { \
320 unsigned long _tmp; \
321 switch ((_dst).bytes) { \
322 case 1: \
323 __asm__ __volatile__ ( \
324 _PRE_EFLAGS("0", "4", "2") \
325 _op"b %"_bx"3,%1; " \
326 _POST_EFLAGS("0", "4", "2") \
327 : "=m" (_eflags), "=m" ((_dst).val), \
328 "=&r" (_tmp) \
329 : _by ((_src).val), "i" (EFLAGS_MASK)); \
330 break; \
331 default: \
332 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
333 _wx, _wy, _lx, _ly, _qx, _qy); \
334 break; \
335 } \
336 } while (0)
337
338/* Source operand is byte-sized and may be restricted to just %cl. */
339#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
340 __emulate_2op(_op, _src, _dst, _eflags, \
341 "b", "c", "b", "c", "b", "c", "b", "c")
342
343/* Source operand is byte, word, long or quad sized. */
344#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
345 __emulate_2op(_op, _src, _dst, _eflags, \
346 "b", "q", "w", "r", _LO32, "r", "", "r")
347
348/* Source operand is word, long or quad sized. */
349#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
350 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
351 "w", "r", _LO32, "r", "", "r")
352
353/* Instruction has only one explicit operand (no source operand). */
354#define emulate_1op(_op, _dst, _eflags) \
355 do { \
356 unsigned long _tmp; \
357 \
358 switch ((_dst).bytes) { \
359 case 1: \
360 __asm__ __volatile__ ( \
361 _PRE_EFLAGS("0", "3", "2") \
362 _op"b %1; " \
363 _POST_EFLAGS("0", "3", "2") \
364 : "=m" (_eflags), "=m" ((_dst).val), \
365 "=&r" (_tmp) \
366 : "i" (EFLAGS_MASK)); \
367 break; \
368 case 2: \
369 __asm__ __volatile__ ( \
370 _PRE_EFLAGS("0", "3", "2") \
371 _op"w %1; " \
372 _POST_EFLAGS("0", "3", "2") \
373 : "=m" (_eflags), "=m" ((_dst).val), \
374 "=&r" (_tmp) \
375 : "i" (EFLAGS_MASK)); \
376 break; \
377 case 4: \
378 __asm__ __volatile__ ( \
379 _PRE_EFLAGS("0", "3", "2") \
380 _op"l %1; " \
381 _POST_EFLAGS("0", "3", "2") \
382 : "=m" (_eflags), "=m" ((_dst).val), \
383 "=&r" (_tmp) \
384 : "i" (EFLAGS_MASK)); \
385 break; \
386 case 8: \
387 __emulate_1op_8byte(_op, _dst, _eflags); \
388 break; \
389 } \
390 } while (0)
391
392/* Emulate an instruction with quadword operands (x86/64 only). */
393#if defined(CONFIG_X86_64)
394#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
395 do { \
396 __asm__ __volatile__ ( \
397 _PRE_EFLAGS("0", "4", "2") \
398 _op"q %"_qx"3,%1; " \
399 _POST_EFLAGS("0", "4", "2") \
400 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
401 : _qy ((_src).val), "i" (EFLAGS_MASK)); \
402 } while (0)
403
404#define __emulate_1op_8byte(_op, _dst, _eflags) \
405 do { \
406 __asm__ __volatile__ ( \
407 _PRE_EFLAGS("0", "3", "2") \
408 _op"q %1; " \
409 _POST_EFLAGS("0", "3", "2") \
410 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
411 : "i" (EFLAGS_MASK)); \
412 } while (0)
413
414#elif defined(__i386__)
415#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
416#define __emulate_1op_8byte(_op, _dst, _eflags)
417#endif /* __i386__ */
418
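
As the comment above notes, the emulate_1op()/emulate_2op_*() macros run the guest operation as a real host instruction, restoring the interesting EFLAGS bits beforehand (_PRE_EFLAGS) and harvesting them afterwards (_POST_EFLAGS). The sketch below is only a portable model of the net effect for a byte-sized add, with the flag computation spelled out in C instead of being executed by the CPU; emulate_add8 is a made-up name for the illustration:

#include <stdio.h>
#include <stdint.h>

#define EFLG_SF (1 << 7)
#define EFLG_ZF (1 << 6)
#define EFLG_CF (1 << 0)

/* Model of what emulate_2op_SrcV("add", ...) achieves for 1-byte operands:
 * perform the operation and fold the resulting flags back into the saved
 * guest EFLAGS image (the real code picks them up with pushf/popf around
 * the host "add" instruction).
 */
static unsigned long emulate_add8(uint8_t src, uint8_t *dst, unsigned long eflags)
{
	unsigned int wide = (unsigned int)*dst + src;
	uint8_t res = (uint8_t)wide;

	eflags &= ~(EFLG_CF | EFLG_ZF | EFLG_SF);
	if (wide > 0xff)
		eflags |= EFLG_CF;
	if (res == 0)
		eflags |= EFLG_ZF;
	if (res & 0x80)
		eflags |= EFLG_SF;
	*dst = res;
	return eflags;
}

int main(void)
{
	uint8_t dst = 0xf0;
	unsigned long eflags = emulate_add8(0x20, &dst, 0);

	/* 0xf0 + 0x20 = 0x110: result 0x10, carry out of bit 7 -> CF set */
	printf("result=%#x eflags=%#lx\n", (unsigned)dst, eflags);
	return 0;
}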
419/* Fetch next part of the instruction being emulated. */
420#define insn_fetch(_type, _size, _eip) \
421({ unsigned long _x; \
422 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
423 if (rc != 0) \
424 goto done; \
425 (_eip) += (_size); \
426 (_type)_x; \
427})
428
429/* Access/update address held in a register, based on addressing mode. */
430#define address_mask(reg) \
431 ((c->ad_bytes == sizeof(unsigned long)) ? \
432 (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
433#define register_address(base, reg) \
434 ((base) + address_mask(reg))
435#define register_address_increment(reg, inc) \
436 do { \
437 /* signed type ensures sign extension to long */ \
438 int _inc = (inc); \
439 if (c->ad_bytes == sizeof(unsigned long)) \
440 (reg) += _inc; \
441 else \
442 (reg) = ((reg) & \
443 ~((1UL << (c->ad_bytes << 3)) - 1)) | \
444 (((reg) + _inc) & \
445 ((1UL << (c->ad_bytes << 3)) - 1)); \
446 } while (0)
447
448#define JMP_REL(rel) \
449 do { \
450 register_address_increment(c->eip, rel); \
451 } while (0)
452
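
address_mask()/register_address() truncate register-held addresses to the decoded address size, so with 2-byte addressing a string access through SI or DI wraps inside 64 KiB instead of using the full register. A self-contained rendering of the same arithmetic (the fixed ad_bytes parameter and the sample values are illustrative):

#include <stdio.h>

static unsigned long address_mask(unsigned long reg, int ad_bytes)
{
	if (ad_bytes == sizeof(unsigned long))
		return reg;				/* native width: no masking */
	return reg & ((1UL << (ad_bytes << 3)) - 1);	/* keep low 16/32 bits */
}

int main(void)
{
	unsigned long base = 0x10000ul;	/* hypothetical segment base */
	unsigned long si   = 0x1fffful;	/* hypothetical guest RSI */

	/* With ad_bytes == 2 only the low 16 bits of SI contribute. */
	printf("effective address: %#lx\n", base + address_mask(si, 2));
	return 0;
}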
453static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
454 struct x86_emulate_ops *ops,
455 unsigned long linear, u8 *dest)
456{
457 struct fetch_cache *fc = &ctxt->decode.fetch;
458 int rc;
459 int size;
460
461 if (linear < fc->start || linear >= fc->end) {
462 size = min(15UL, PAGE_SIZE - offset_in_page(linear));
463 rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
464 if (rc)
465 return rc;
466 fc->start = linear;
467 fc->end = linear + size;
468 }
469 *dest = fc->data[linear - fc->start];
470 return 0;
471}
472
473static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
474 struct x86_emulate_ops *ops,
475 unsigned long eip, void *dest, unsigned size)
476{
477 int rc = 0;
478
479 eip += ctxt->cs_base;
480 while (size--) {
481 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
482 if (rc)
483 return rc;
484 }
485 return 0;
486}
487
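
do_fetch_insn_byte() keeps a small window of guest code (at most 15 bytes, never crossing a page) in decode.fetch, so the insn_fetch() calls for prefixes, the opcode and immediates only go out through ops->read_std() when the window misses. The stand-alone sketch below models just the caching part with a flat buffer in place of guest memory; fetch_byte/read_guest and the buffer contents are illustrative names and data:

#include <stdio.h>
#include <string.h>

#define FETCH_WINDOW 15

struct fetch_cache {
	unsigned char data[FETCH_WINDOW];
	unsigned long start, end;
};

/* Stand-in for ops->read_std(): copy from a flat buffer. */
static void read_guest(unsigned long lin, void *dst, unsigned int n,
		       const unsigned char *mem)
{
	memcpy(dst, mem + lin, n);
}

static unsigned char fetch_byte(struct fetch_cache *fc, unsigned long lin,
				const unsigned char *mem)
{
	if (lin < fc->start || lin >= fc->end) {
		/* Refill; the real code also clips the window at the end
		 * of the page containing 'lin'. */
		read_guest(lin, fc->data, FETCH_WINDOW, mem);
		fc->start = lin;
		fc->end = lin + FETCH_WINDOW;
	}
	return fc->data[lin - fc->start];
}

int main(void)
{
	unsigned char code[FETCH_WINDOW] = { 0x66, 0x83, 0xc0, 0x01 }; /* add ax,1 */
	struct fetch_cache fc = { .start = 0, .end = 0 };		/* empty cache */
	unsigned long i;

	for (i = 0; i < 4; i++)		/* only the first fetch hits read_guest() */
		printf("%02x ", fetch_byte(&fc, i, code));
	printf("\n");
	return 0;
}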
488/*
489 * Given the 'reg' portion of a ModRM byte, and a register block, return a
490 * pointer into the block that addresses the relevant register.
491 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
492 */
493static void *decode_register(u8 modrm_reg, unsigned long *regs,
494 int highbyte_regs)
495{
496 void *p;
497
498 p = &regs[modrm_reg];
499 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
500 p = (unsigned char *)&regs[modrm_reg & 3] + 1;
501 return p;
502}
503
504static int read_descriptor(struct x86_emulate_ctxt *ctxt,
505 struct x86_emulate_ops *ops,
506 void *ptr,
507 u16 *size, unsigned long *address, int op_bytes)
508{
509 int rc;
510
511 if (op_bytes == 2)
512 op_bytes = 3;
513 *address = 0;
514 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
515 ctxt->vcpu);
516 if (rc)
517 return rc;
518 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
519 ctxt->vcpu);
520 return rc;
521}
522
523static int test_cc(unsigned int condition, unsigned int flags)
524{
525 int rc = 0;
526
527 switch ((condition & 15) >> 1) {
528 case 0: /* o */
529 rc |= (flags & EFLG_OF);
530 break;
531 case 1: /* b/c/nae */
532 rc |= (flags & EFLG_CF);
533 break;
534 case 2: /* z/e */
535 rc |= (flags & EFLG_ZF);
536 break;
537 case 3: /* be/na */
538 rc |= (flags & (EFLG_CF|EFLG_ZF));
539 break;
540 case 4: /* s */
541 rc |= (flags & EFLG_SF);
542 break;
543 case 5: /* p/pe */
544 rc |= (flags & EFLG_PF);
545 break;
546 case 7: /* le/ng */
547 rc |= (flags & EFLG_ZF);
548 /* fall through */
549 case 6: /* l/nge */
550 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
551 break;
552 }
553
554 /* Odd condition identifiers (lsb == 1) have inverted sense. */
555 return (!!rc ^ (condition & 1));
556}
557
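
test_cc() turns the low four bits of a Jcc/SETcc/CMOVcc opcode into the corresponding EFLAGS predicate, with odd condition numbers meaning the inverted test. A reduced stand-alone check of the z/e group, which is what the jz/jnz handling in the 0x70-0x7f and 0x0f 0x80-0x8f cases relies on (cc_zero is an illustrative cut-down of the function above):

#include <stdio.h>

#define EFLG_ZF (1 << 6)

/* Cut-down test_cc() covering only condition group 2 (z/e); the full
 * function above handles all sixteen encodings the same way.
 */
static int cc_zero(unsigned int condition, unsigned int flags)
{
	int rc = !!(flags & EFLG_ZF);

	return rc ^ (condition & 1);	/* odd encodings invert the sense */
}

int main(void)
{
	unsigned int eflags = EFLG_ZF;

	printf("jz  (0x74) taken: %d\n", cc_zero(0x74, eflags));	/* 1 */
	printf("jnz (0x75) taken: %d\n", cc_zero(0x75, eflags));	/* 0 */
	return 0;
}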
558static void decode_register_operand(struct operand *op,
559 struct decode_cache *c,
560 int inhibit_bytereg)
561{
562 unsigned reg = c->modrm_reg;
563 int highbyte_regs = c->rex_prefix == 0;
564
565 if (!(c->d & ModRM))
566 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
567 op->type = OP_REG;
568 if ((c->d & ByteOp) && !inhibit_bytereg) {
569 op->ptr = decode_register(reg, c->regs, highbyte_regs);
570 op->val = *(u8 *)op->ptr;
571 op->bytes = 1;
572 } else {
573 op->ptr = decode_register(reg, c->regs, 0);
574 op->bytes = c->op_bytes;
575 switch (op->bytes) {
576 case 2:
577 op->val = *(u16 *)op->ptr;
578 break;
579 case 4:
580 op->val = *(u32 *)op->ptr;
581 break;
582 case 8:
583 op->val = *(u64 *) op->ptr;
584 break;
585 }
586 }
587 op->orig_val = op->val;
588}
589
590static int decode_modrm(struct x86_emulate_ctxt *ctxt,
591 struct x86_emulate_ops *ops)
592{
593 struct decode_cache *c = &ctxt->decode;
594 u8 sib;
595 int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
596 int rc = 0;
597
598 if (c->rex_prefix) {
599 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
600 index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
601 c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */
602 }
603
604 c->modrm = insn_fetch(u8, 1, c->eip);
605 c->modrm_mod |= (c->modrm & 0xc0) >> 6;
606 c->modrm_reg |= (c->modrm & 0x38) >> 3;
607 c->modrm_rm |= (c->modrm & 0x07);
608 c->modrm_ea = 0;
609 c->use_modrm_ea = 1;
610
611 if (c->modrm_mod == 3) {
612 c->modrm_val = *(unsigned long *)
613 decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
614 return rc;
615 }
616
617 if (c->ad_bytes == 2) {
618 unsigned bx = c->regs[VCPU_REGS_RBX];
619 unsigned bp = c->regs[VCPU_REGS_RBP];
620 unsigned si = c->regs[VCPU_REGS_RSI];
621 unsigned di = c->regs[VCPU_REGS_RDI];
622
623 /* 16-bit ModR/M decode. */
624 switch (c->modrm_mod) {
625 case 0:
626 if (c->modrm_rm == 6)
627 c->modrm_ea += insn_fetch(u16, 2, c->eip);
628 break;
629 case 1:
630 c->modrm_ea += insn_fetch(s8, 1, c->eip);
631 break;
632 case 2:
633 c->modrm_ea += insn_fetch(u16, 2, c->eip);
634 break;
635 }
636 switch (c->modrm_rm) {
637 case 0:
638 c->modrm_ea += bx + si;
639 break;
640 case 1:
641 c->modrm_ea += bx + di;
642 break;
643 case 2:
644 c->modrm_ea += bp + si;
645 break;
646 case 3:
647 c->modrm_ea += bp + di;
648 break;
649 case 4:
650 c->modrm_ea += si;
651 break;
652 case 5:
653 c->modrm_ea += di;
654 break;
655 case 6:
656 if (c->modrm_mod != 0)
657 c->modrm_ea += bp;
658 break;
659 case 7:
660 c->modrm_ea += bx;
661 break;
662 }
663 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
664 (c->modrm_rm == 6 && c->modrm_mod != 0))
665 if (!c->override_base)
666 c->override_base = &ctxt->ss_base;
667 c->modrm_ea = (u16)c->modrm_ea;
668 } else {
669 /* 32/64-bit ModR/M decode. */
670 switch (c->modrm_rm) {
671 case 4:
672 case 12:
673 sib = insn_fetch(u8, 1, c->eip);
674 index_reg |= (sib >> 3) & 7;
675 base_reg |= sib & 7;
676 scale = sib >> 6;
677
678 switch (base_reg) {
679 case 5:
680 if (c->modrm_mod != 0)
681 c->modrm_ea += c->regs[base_reg];
682 else
683 c->modrm_ea +=
684 insn_fetch(s32, 4, c->eip);
685 break;
686 default:
687 c->modrm_ea += c->regs[base_reg];
688 }
689 switch (index_reg) {
690 case 4:
691 break;
692 default:
693 c->modrm_ea += c->regs[index_reg] << scale;
694 }
695 break;
696 case 5:
697 if (c->modrm_mod != 0)
698 c->modrm_ea += c->regs[c->modrm_rm];
699 else if (ctxt->mode == X86EMUL_MODE_PROT64)
700 rip_relative = 1;
701 break;
702 default:
703 c->modrm_ea += c->regs[c->modrm_rm];
704 break;
705 }
706 switch (c->modrm_mod) {
707 case 0:
708 if (c->modrm_rm == 5)
709 c->modrm_ea += insn_fetch(s32, 4, c->eip);
710 break;
711 case 1:
712 c->modrm_ea += insn_fetch(s8, 1, c->eip);
713 break;
714 case 2:
715 c->modrm_ea += insn_fetch(s32, 4, c->eip);
716 break;
717 }
718 }
719 if (rip_relative) {
720 c->modrm_ea += c->eip;
721 switch (c->d & SrcMask) {
722 case SrcImmByte:
723 c->modrm_ea += 1;
724 break;
725 case SrcImm:
726 if (c->d & ByteOp)
727 c->modrm_ea += 1;
728 else
729 if (c->op_bytes == 8)
730 c->modrm_ea += 4;
731 else
732 c->modrm_ea += c->op_bytes;
733 }
734 }
735done:
736 return rc;
737}
738
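
For 2-byte addressing the effective address is one of eight fixed BX/BP/SI/DI combinations selected by modrm_rm plus an 8- or 16-bit displacement selected by modrm_mod, and the BP-based forms default to the stack segment (the override_base handling above). A worked standalone example for a hypothetical ModRM byte of 0x42 (mod=1, rm=2, i.e. [BP+SI+disp8]); the register values are made up:

#include <stdio.h>

int main(void)
{
	unsigned char modrm = 0x42;		/* mod=01 reg=000 rm=010 */
	unsigned int mod = (modrm & 0xc0) >> 6;
	unsigned int rm  =  modrm & 0x07;
	unsigned short bp = 0x2000, si = 0x0010;
	signed char disp8 = 0x08;		/* next byte of the instruction */
	unsigned short ea = 0;

	if (mod == 1 && rm == 2)		/* rm=2 selects BP + SI */
		ea = (unsigned short)(bp + si + disp8);

	/* BP-based forms use SS unless a segment override prefix was seen. */
	printf("mod=%u rm=%u -> ea = ss:%#x\n", mod, rm, (unsigned)ea);
	return 0;
}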
739static int decode_abs(struct x86_emulate_ctxt *ctxt,
740 struct x86_emulate_ops *ops)
741{
742 struct decode_cache *c = &ctxt->decode;
743 int rc = 0;
744
745 switch (c->ad_bytes) {
746 case 2:
747 c->modrm_ea = insn_fetch(u16, 2, c->eip);
748 break;
749 case 4:
750 c->modrm_ea = insn_fetch(u32, 4, c->eip);
751 break;
752 case 8:
753 c->modrm_ea = insn_fetch(u64, 8, c->eip);
754 break;
755 }
756done:
757 return rc;
758}
759
760int
761x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
762{
763 struct decode_cache *c = &ctxt->decode;
764 int rc = 0;
765 int mode = ctxt->mode;
766 int def_op_bytes, def_ad_bytes;
767
768 /* Shadow copy of register state. Committed on successful emulation. */
769
770 memset(c, 0, sizeof(struct decode_cache));
771 c->eip = ctxt->vcpu->arch.rip;
772 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
773
774 switch (mode) {
775 case X86EMUL_MODE_REAL:
776 case X86EMUL_MODE_PROT16:
777 def_op_bytes = def_ad_bytes = 2;
778 break;
779 case X86EMUL_MODE_PROT32:
780 def_op_bytes = def_ad_bytes = 4;
781 break;
782#ifdef CONFIG_X86_64
783 case X86EMUL_MODE_PROT64:
784 def_op_bytes = 4;
785 def_ad_bytes = 8;
786 break;
787#endif
788 default:
789 return -1;
790 }
791
792 c->op_bytes = def_op_bytes;
793 c->ad_bytes = def_ad_bytes;
794
795 /* Legacy prefixes. */
796 for (;;) {
797 switch (c->b = insn_fetch(u8, 1, c->eip)) {
798 case 0x66: /* operand-size override */
799 /* switch between 2/4 bytes */
800 c->op_bytes = def_op_bytes ^ 6;
801 break;
802 case 0x67: /* address-size override */
803 if (mode == X86EMUL_MODE_PROT64)
804 /* switch between 4/8 bytes */
805 c->ad_bytes = def_ad_bytes ^ 12;
806 else
807 /* switch between 2/4 bytes */
808 c->ad_bytes = def_ad_bytes ^ 6;
809 break;
810 case 0x2e: /* CS override */
811 c->override_base = &ctxt->cs_base;
812 break;
813 case 0x3e: /* DS override */
814 c->override_base = &ctxt->ds_base;
815 break;
816 case 0x26: /* ES override */
817 c->override_base = &ctxt->es_base;
818 break;
819 case 0x64: /* FS override */
820 c->override_base = &ctxt->fs_base;
821 break;
822 case 0x65: /* GS override */
823 c->override_base = &ctxt->gs_base;
824 break;
825 case 0x36: /* SS override */
826 c->override_base = &ctxt->ss_base;
827 break;
828 case 0x40 ... 0x4f: /* REX */
829 if (mode != X86EMUL_MODE_PROT64)
830 goto done_prefixes;
831 c->rex_prefix = c->b;
832 continue;
833 case 0xf0: /* LOCK */
834 c->lock_prefix = 1;
835 break;
836 case 0xf2: /* REPNE/REPNZ */
837 c->rep_prefix = REPNE_PREFIX;
838 break;
839 case 0xf3: /* REP/REPE/REPZ */
840 c->rep_prefix = REPE_PREFIX;
841 break;
842 default:
843 goto done_prefixes;
844 }
845
846 /* Any legacy prefix after a REX prefix nullifies its effect. */
847
848 c->rex_prefix = 0;
849 }
850
851done_prefixes:
852
853 /* REX prefix. */
854 if (c->rex_prefix)
855 if (c->rex_prefix & 8)
856 c->op_bytes = 8; /* REX.W */
857
858 /* Opcode byte(s). */
859 c->d = opcode_table[c->b];
860 if (c->d == 0) {
861 /* Two-byte opcode? */
862 if (c->b == 0x0f) {
863 c->twobyte = 1;
864 c->b = insn_fetch(u8, 1, c->eip);
865 c->d = twobyte_table[c->b];
866 }
867
868 /* Unrecognised? */
869 if (c->d == 0) {
870 DPRINTF("Cannot emulate %02x\n", c->b);
871 return -1;
872 }
873 }
874
875 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
876 c->op_bytes = 8;
877
878 /* ModRM and SIB bytes. */
879 if (c->d & ModRM)
880 rc = decode_modrm(ctxt, ops);
881 else if (c->d & MemAbs)
882 rc = decode_abs(ctxt, ops);
883 if (rc)
884 goto done;
885
886 if (!c->override_base)
887 c->override_base = &ctxt->ds_base;
888 if (mode == X86EMUL_MODE_PROT64 &&
889 c->override_base != &ctxt->fs_base &&
890 c->override_base != &ctxt->gs_base)
891 c->override_base = NULL;
892
893 if (c->override_base)
894 c->modrm_ea += *c->override_base;
895
896 if (c->ad_bytes != 8)
897 c->modrm_ea = (u32)c->modrm_ea;
898 /*
899 * Decode and fetch the source operand: register, memory
900 * or immediate.
901 */
902 switch (c->d & SrcMask) {
903 case SrcNone:
904 break;
905 case SrcReg:
906 decode_register_operand(&c->src, c, 0);
907 break;
908 case SrcMem16:
909 c->src.bytes = 2;
910 goto srcmem_common;
911 case SrcMem32:
912 c->src.bytes = 4;
913 goto srcmem_common;
914 case SrcMem:
915 c->src.bytes = (c->d & ByteOp) ? 1 :
916 c->op_bytes;
917 /* Don't fetch the address for invlpg: it could be unmapped. */
918 if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
919 break;
920 srcmem_common:
921 /*
922 * For instructions with a ModR/M byte, switch to register
923 * access if Mod = 3.
924 */
925 if ((c->d & ModRM) && c->modrm_mod == 3) {
926 c->src.type = OP_REG;
927 break;
928 }
929 c->src.type = OP_MEM;
930 break;
931 case SrcImm:
932 c->src.type = OP_IMM;
933 c->src.ptr = (unsigned long *)c->eip;
934 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
935 if (c->src.bytes == 8)
936 c->src.bytes = 4;
937 /* NB. Immediates are sign-extended as necessary. */
938 switch (c->src.bytes) {
939 case 1:
940 c->src.val = insn_fetch(s8, 1, c->eip);
941 break;
942 case 2:
943 c->src.val = insn_fetch(s16, 2, c->eip);
944 break;
945 case 4:
946 c->src.val = insn_fetch(s32, 4, c->eip);
947 break;
948 }
949 break;
950 case SrcImmByte:
951 c->src.type = OP_IMM;
952 c->src.ptr = (unsigned long *)c->eip;
953 c->src.bytes = 1;
954 c->src.val = insn_fetch(s8, 1, c->eip);
955 break;
956 }
957
958 /* Decode and fetch the destination operand: register or memory. */
959 switch (c->d & DstMask) {
960 case ImplicitOps:
961 /* Special instructions do their own operand decoding. */
962 return 0;
963 case DstReg:
964 decode_register_operand(&c->dst, c,
965 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
966 break;
967 case DstMem:
968 if ((c->d & ModRM) && c->modrm_mod == 3) {
969 c->dst.type = OP_REG;
970 break;
971 }
972 c->dst.type = OP_MEM;
973 break;
974 }
975
976done:
977 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
978}
979
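
x86_decode_insn() only fills in ctxt->decode; actually performing the instruction is x86_emulate_insn() further down, which works on a shadow copy of the registers and commits it (plus rip) only when emulation succeeds, so a failed or unhandleable instruction leaves the vcpu untouched. A toy, self-contained model of that decode-then-commit contract (the struct name and the toy_emulate helper are invented for the illustration):

#include <stdio.h>
#include <string.h>

#define NR_REGS 4

struct vcpu_model { unsigned long regs[NR_REGS]; unsigned long rip; };

/* Work on a shadow copy; commit only on success, mirroring the
 * memcpy()/rip assignment at the end of x86_emulate_insn().
 */
static int toy_emulate(struct vcpu_model *vcpu, int should_fail)
{
	struct vcpu_model c = *vcpu;		/* shadow register state */

	c.regs[0] += 1;				/* the emulated work */
	c.rip += 2;

	if (should_fail)
		return -1;			/* vcpu state untouched */

	*vcpu = c;				/* commit shadow state */
	return 0;
}

int main(void)
{
	struct vcpu_model v = { .regs = { 5 }, .rip = 0x100 };

	toy_emulate(&v, 1);
	printf("after failed emulation:     rax=%lu rip=%#lx\n", v.regs[0], v.rip);
	toy_emulate(&v, 0);
	printf("after successful emulation: rax=%lu rip=%#lx\n", v.regs[0], v.rip);
	return 0;
}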
980static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
981{
982 struct decode_cache *c = &ctxt->decode;
983
984 c->dst.type = OP_MEM;
985 c->dst.bytes = c->op_bytes;
986 c->dst.val = c->src.val;
987 register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
988 c->dst.ptr = (void *) register_address(ctxt->ss_base,
989 c->regs[VCPU_REGS_RSP]);
990}
991
992static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
993 struct x86_emulate_ops *ops)
994{
995 struct decode_cache *c = &ctxt->decode;
996 int rc;
997
998 rc = ops->read_std(register_address(ctxt->ss_base,
999 c->regs[VCPU_REGS_RSP]),
1000 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1001 if (rc != 0)
1002 return rc;
1003
1004 register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
1005
1006 return 0;
1007}
1008
1009static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
1010{
1011 struct decode_cache *c = &ctxt->decode;
1012 switch (c->modrm_reg) {
1013 case 0: /* rol */
1014 emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
1015 break;
1016 case 1: /* ror */
1017 emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
1018 break;
1019 case 2: /* rcl */
1020 emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
1021 break;
1022 case 3: /* rcr */
1023 emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
1024 break;
1025 case 4: /* sal/shl */
1026 case 6: /* sal/shl */
1027 emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
1028 break;
1029 case 5: /* shr */
1030 emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
1031 break;
1032 case 7: /* sar */
1033 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
1034 break;
1035 }
1036}
1037
1038static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1039 struct x86_emulate_ops *ops)
1040{
1041 struct decode_cache *c = &ctxt->decode;
1042 int rc = 0;
1043
1044 switch (c->modrm_reg) {
1045 case 0 ... 1: /* test */
1046 /*
1047 * Special case in Grp3: test has an immediate
1048 * source operand.
1049 */
1050 c->src.type = OP_IMM;
1051 c->src.ptr = (unsigned long *)c->eip;
1052 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1053 if (c->src.bytes == 8)
1054 c->src.bytes = 4;
1055 switch (c->src.bytes) {
1056 case 1:
1057 c->src.val = insn_fetch(s8, 1, c->eip);
1058 break;
1059 case 2:
1060 c->src.val = insn_fetch(s16, 2, c->eip);
1061 break;
1062 case 4:
1063 c->src.val = insn_fetch(s32, 4, c->eip);
1064 break;
1065 }
1066 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1067 break;
1068 case 2: /* not */
1069 c->dst.val = ~c->dst.val;
1070 break;
1071 case 3: /* neg */
1072 emulate_1op("neg", c->dst, ctxt->eflags);
1073 break;
1074 default:
1075 DPRINTF("Cannot emulate %02x\n", c->b);
1076 rc = X86EMUL_UNHANDLEABLE;
1077 break;
1078 }
1079done:
1080 return rc;
1081}
1082
1083static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1084 struct x86_emulate_ops *ops)
1085{
1086 struct decode_cache *c = &ctxt->decode;
1087 int rc;
1088
1089 switch (c->modrm_reg) {
1090 case 0: /* inc */
1091 emulate_1op("inc", c->dst, ctxt->eflags);
1092 break;
1093 case 1: /* dec */
1094 emulate_1op("dec", c->dst, ctxt->eflags);
1095 break;
1096 case 4: /* jmp abs */
1097 if (c->b == 0xff)
1098 c->eip = c->dst.val;
1099 else {
1100 DPRINTF("Cannot emulate %02x\n", c->b);
1101 return X86EMUL_UNHANDLEABLE;
1102 }
1103 break;
1104 case 6: /* push */
1105
1106 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1107
1108 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1109 c->dst.bytes = 8;
1110 rc = ops->read_std((unsigned long)c->dst.ptr,
1111 &c->dst.val, 8, ctxt->vcpu);
1112 if (rc != 0)
1113 return rc;
1114 }
1115 register_address_increment(c->regs[VCPU_REGS_RSP],
1116 -c->dst.bytes);
1117 rc = ops->write_emulated(register_address(ctxt->ss_base,
1118 c->regs[VCPU_REGS_RSP]), &c->dst.val,
1119 c->dst.bytes, ctxt->vcpu);
1120 if (rc != 0)
1121 return rc;
1122 c->dst.type = OP_NONE;
1123 break;
1124 default:
1125 DPRINTF("Cannot emulate %02x\n", c->b);
1126 return X86EMUL_UNHANDLEABLE;
1127 }
1128 return 0;
1129}
1130
1131static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1132 struct x86_emulate_ops *ops,
1133 unsigned long memop)
1134{
1135 struct decode_cache *c = &ctxt->decode;
1136 u64 old, new;
1137 int rc;
1138
1139 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1140 if (rc != 0)
1141 return rc;
1142
1143 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
1144 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
1145
1146 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1147 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1148 ctxt->eflags &= ~EFLG_ZF;
1149
1150 } else {
1151 new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
1152 (u32) c->regs[VCPU_REGS_RBX];
1153
1154 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1155 if (rc != 0)
1156 return rc;
1157 ctxt->eflags |= EFLG_ZF;
1158 }
1159 return 0;
1160}
1161
1162static inline int writeback(struct x86_emulate_ctxt *ctxt,
1163 struct x86_emulate_ops *ops)
1164{
1165 int rc;
1166 struct decode_cache *c = &ctxt->decode;
1167
1168 switch (c->dst.type) {
1169 case OP_REG:
1170 /* The 4-byte case *is* correct:
1171 * in 64-bit mode we zero-extend.
1172 */
1173 switch (c->dst.bytes) {
1174 case 1:
1175 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1176 break;
1177 case 2:
1178 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1179 break;
1180 case 4:
1181 *c->dst.ptr = (u32)c->dst.val;
1182 break; /* 64b: zero-ext */
1183 case 8:
1184 *c->dst.ptr = c->dst.val;
1185 break;
1186 }
1187 break;
1188 case OP_MEM:
1189 if (c->lock_prefix)
1190 rc = ops->cmpxchg_emulated(
1191 (unsigned long)c->dst.ptr,
1192 &c->dst.orig_val,
1193 &c->dst.val,
1194 c->dst.bytes,
1195 ctxt->vcpu);
1196 else
1197 rc = ops->write_emulated(
1198 (unsigned long)c->dst.ptr,
1199 &c->dst.val,
1200 c->dst.bytes,
1201 ctxt->vcpu);
1202 if (rc != 0)
1203 return rc;
1204 break;
1205 case OP_NONE:
1206 /* no writeback */
1207 break;
1208 default:
1209 break;
1210 }
1211 return 0;
1212}
1213
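
writeback() is the single commit point for the destination operand: register destinations are patched directly into the shadow register file, while memory destinations go through write_emulated(), or through cmpxchg_emulated() when a LOCK prefix is in effect (xchg sets lock_prefix implicitly) so the update is atomic with respect to other vcpus. A condensed user-space model of that choice; the two write_* helpers merely stand in for the ops callbacks:

#include <stdio.h>

/* Stand-ins for ops->write_emulated() / ops->cmpxchg_emulated(). */
static void write_plain(unsigned long *mem, unsigned long val)
{
	*mem = val;
}

static int write_cmpxchg(unsigned long *mem, unsigned long old_val,
			 unsigned long new_val)
{
	/* Atomic compare-and-exchange, as the locked path requires. */
	return __sync_bool_compare_and_swap(mem, old_val, new_val) ? 0 : -1;
}

int main(void)
{
	unsigned long guest_word = 41;	/* plays the role of dst.orig_val */
	int lock_prefix = 1;

	if (lock_prefix)
		write_cmpxchg(&guest_word, 41, 42);	/* orig_val -> new val */
	else
		write_plain(&guest_word, 42);

	printf("guest_word=%lu\n", guest_word);
	return 0;
}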
1214int
1215x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1216{
1217 unsigned long memop = 0;
1218 u64 msr_data;
1219 unsigned long saved_eip = 0;
1220 struct decode_cache *c = &ctxt->decode;
1221 int rc = 0;
1222
1223 /* Shadow copy of register state. Committed on successful emulation.
1224 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
1225 * modify them.
1226 */
1227
1228 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1229 saved_eip = c->eip;
1230
1231 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1232 memop = c->modrm_ea;
1233
1234 if (c->rep_prefix && (c->d & String)) {
1235 /* All REP prefixes have the same first termination condition */
1236 if (c->regs[VCPU_REGS_RCX] == 0) {
1237 ctxt->vcpu->arch.rip = c->eip;
1238 goto done;
1239 }
1240 /* The second termination condition only applies for REPE
1241 * and REPNE. Test if the repeat string operation prefix is
1242 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
1243 * corresponding termination condition according to:
1244 * - if REPE/REPZ and ZF = 0 then done
1245 * - if REPNE/REPNZ and ZF = 1 then done
1246 */
1247 if ((c->b == 0xa6) || (c->b == 0xa7) ||
1248 (c->b == 0xae) || (c->b == 0xaf)) {
1249 if ((c->rep_prefix == REPE_PREFIX) &&
1250 ((ctxt->eflags & EFLG_ZF) == 0)) {
1251 ctxt->vcpu->arch.rip = c->eip;
1252 goto done;
1253 }
1254 if ((c->rep_prefix == REPNE_PREFIX) &&
1255 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
1256 ctxt->vcpu->arch.rip = c->eip;
1257 goto done;
1258 }
1259 }
1260 c->regs[VCPU_REGS_RCX]--;
1261 c->eip = ctxt->vcpu->arch.rip;
1262 }
1263
1264 if (c->src.type == OP_MEM) {
1265 c->src.ptr = (unsigned long *)memop;
1266 c->src.val = 0;
1267 rc = ops->read_emulated((unsigned long)c->src.ptr,
1268 &c->src.val,
1269 c->src.bytes,
1270 ctxt->vcpu);
1271 if (rc != 0)
1272 goto done;
1273 c->src.orig_val = c->src.val;
1274 }
1275
1276 if ((c->d & DstMask) == ImplicitOps)
1277 goto special_insn;
1278
1279
1280 if (c->dst.type == OP_MEM) {
1281 c->dst.ptr = (unsigned long *)memop;
1282 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1283 c->dst.val = 0;
1284 if (c->d & BitOp) {
1285 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1286
1287 c->dst.ptr = (void *)c->dst.ptr +
1288 (c->src.val & mask) / 8;
1289 }
1290 if (!(c->d & Mov) &&
1291 /* optimisation - avoid slow emulated read */
1292 ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1293 &c->dst.val,
1294 c->dst.bytes, ctxt->vcpu)) != 0))
1295 goto done;
1296 }
1297 c->dst.orig_val = c->dst.val;
1298
1299special_insn:
1300
1301 if (c->twobyte)
1302 goto twobyte_insn;
1303
1304 switch (c->b) {
1305 case 0x00 ... 0x05:
1306 add: /* add */
1307 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1308 break;
1309 case 0x08 ... 0x0d:
1310 or: /* or */
1311 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1312 break;
1313 case 0x10 ... 0x15:
1314 adc: /* adc */
1315 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1316 break;
1317 case 0x18 ... 0x1d:
1318 sbb: /* sbb */
1319 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1320 break;
1321 case 0x20 ... 0x23:
1322 and: /* and */
1323 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
1324 break;
1325 case 0x24: /* and al imm8 */
1326 c->dst.type = OP_REG;
1327 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1328 c->dst.val = *(u8 *)c->dst.ptr;
1329 c->dst.bytes = 1;
1330 c->dst.orig_val = c->dst.val;
1331 goto and;
1332 case 0x25: /* and ax imm16, or eax imm32 */
1333 c->dst.type = OP_REG;
1334 c->dst.bytes = c->op_bytes;
1335 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1336 if (c->op_bytes == 2)
1337 c->dst.val = *(u16 *)c->dst.ptr;
1338 else
1339 c->dst.val = *(u32 *)c->dst.ptr;
1340 c->dst.orig_val = c->dst.val;
1341 goto and;
1342 case 0x28 ... 0x2d:
1343 sub: /* sub */
1344 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
1345 break;
1346 case 0x30 ... 0x35:
1347 xor: /* xor */
1348 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
1349 break;
1350 case 0x38 ... 0x3d:
1351 cmp: /* cmp */
1352 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1353 break;
1354 case 0x40 ... 0x47: /* inc r16/r32 */
1355 emulate_1op("inc", c->dst, ctxt->eflags);
1356 break;
1357 case 0x48 ... 0x4f: /* dec r16/r32 */
1358 emulate_1op("dec", c->dst, ctxt->eflags);
1359 break;
1360 case 0x50 ... 0x57: /* push reg */
1361 c->dst.type = OP_MEM;
1362 c->dst.bytes = c->op_bytes;
1363 c->dst.val = c->src.val;
1364 register_address_increment(c->regs[VCPU_REGS_RSP],
1365 -c->op_bytes);
1366 c->dst.ptr = (void *) register_address(
1367 ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
1368 break;
1369 case 0x58 ... 0x5f: /* pop reg */
1370 pop_instruction:
1371 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1372 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1373 c->op_bytes, ctxt->vcpu)) != 0)
1374 goto done;
1375
1376 register_address_increment(c->regs[VCPU_REGS_RSP],
1377 c->op_bytes);
1378 c->dst.type = OP_NONE; /* Disable writeback. */
1379 break;
1380 case 0x63: /* movsxd */
1381 if (ctxt->mode != X86EMUL_MODE_PROT64)
1382 goto cannot_emulate;
1383 c->dst.val = (s32) c->src.val;
1384 break;
1385 case 0x6a: /* push imm8 */
1386 c->src.val = 0L;
1387 c->src.val = insn_fetch(s8, 1, c->eip);
1388 emulate_push(ctxt);
1389 break;
1390 case 0x6c: /* insb */
1391 case 0x6d: /* insw/insd */
1392 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1393 1,
1394 (c->d & ByteOp) ? 1 : c->op_bytes,
1395 c->rep_prefix ?
1396 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1397 (ctxt->eflags & EFLG_DF),
1398 register_address(ctxt->es_base,
1399 c->regs[VCPU_REGS_RDI]),
1400 c->rep_prefix,
1401 c->regs[VCPU_REGS_RDX]) == 0) {
1402 c->eip = saved_eip;
1403 return -1;
1404 }
1405 return 0;
1406 case 0x6e: /* outsb */
1407 case 0x6f: /* outsw/outsd */
1408 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1409 0,
1410 (c->d & ByteOp) ? 1 : c->op_bytes,
1411 c->rep_prefix ?
1412 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1413 (ctxt->eflags & EFLG_DF),
1414 register_address(c->override_base ?
1415 *c->override_base :
1416 ctxt->ds_base,
1417 c->regs[VCPU_REGS_RSI]),
1418 c->rep_prefix,
1419 c->regs[VCPU_REGS_RDX]) == 0) {
1420 c->eip = saved_eip;
1421 return -1;
1422 }
1423 return 0;
1424 case 0x70 ... 0x7f: /* jcc (short) */ {
1425 int rel = insn_fetch(s8, 1, c->eip);
1426
1427 if (test_cc(c->b, ctxt->eflags))
1428 JMP_REL(rel);
1429 break;
1430 }
1431 case 0x80 ... 0x83: /* Grp1 */
1432 switch (c->modrm_reg) {
1433 case 0:
1434 goto add;
1435 case 1:
1436 goto or;
1437 case 2:
1438 goto adc;
1439 case 3:
1440 goto sbb;
1441 case 4:
1442 goto and;
1443 case 5:
1444 goto sub;
1445 case 6:
1446 goto xor;
1447 case 7:
1448 goto cmp;
1449 }
1450 break;
1451 case 0x84 ... 0x85:
1452 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1453 break;
1454 case 0x86 ... 0x87: /* xchg */
1455 /* Write back the register source. */
1456 switch (c->dst.bytes) {
1457 case 1:
1458 *(u8 *) c->src.ptr = (u8) c->dst.val;
1459 break;
1460 case 2:
1461 *(u16 *) c->src.ptr = (u16) c->dst.val;
1462 break;
1463 case 4:
1464 *c->src.ptr = (u32) c->dst.val;
1465 break; /* 64b reg: zero-extend */
1466 case 8:
1467 *c->src.ptr = c->dst.val;
1468 break;
1469 }
1470 /*
1471 * Write back the memory destination with implicit LOCK
1472 * prefix.
1473 */
1474 c->dst.val = c->src.val;
1475 c->lock_prefix = 1;
1476 break;
1477 case 0x88 ... 0x8b: /* mov */
1478 goto mov;
1479 case 0x8d: /* lea r16/r32, m */
1480 c->dst.val = c->modrm_val;
1481 break;
1482 case 0x8f: /* pop (sole member of Grp1a) */
1483 rc = emulate_grp1a(ctxt, ops);
1484 if (rc != 0)
1485 goto done;
1486 break;
1487 case 0x9c: /* pushf */
1488 c->src.val = (unsigned long) ctxt->eflags;
1489 emulate_push(ctxt);
1490 break;
1491 case 0x9d: /* popf */
1492 c->dst.ptr = (unsigned long *) &ctxt->eflags;
1493 goto pop_instruction;
1494 case 0xa0 ... 0xa1: /* mov */
1495 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1496 c->dst.val = c->src.val;
1497 break;
1498 case 0xa2 ... 0xa3: /* mov */
1499 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
1500 break;
1501 case 0xa4 ... 0xa5: /* movs */
1502 c->dst.type = OP_MEM;
1503 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1504 c->dst.ptr = (unsigned long *)register_address(
1505 ctxt->es_base,
1506 c->regs[VCPU_REGS_RDI]);
1507 if ((rc = ops->read_emulated(register_address(
1508 c->override_base ? *c->override_base :
1509 ctxt->ds_base,
1510 c->regs[VCPU_REGS_RSI]),
1511 &c->dst.val,
1512 c->dst.bytes, ctxt->vcpu)) != 0)
1513 goto done;
1514 register_address_increment(c->regs[VCPU_REGS_RSI],
1515 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1516 : c->dst.bytes);
1517 register_address_increment(c->regs[VCPU_REGS_RDI],
1518 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1519 : c->dst.bytes);
1520 break;
1521 case 0xa6 ... 0xa7: /* cmps */
1522 c->src.type = OP_NONE; /* Disable writeback. */
1523 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1524 c->src.ptr = (unsigned long *)register_address(
1525 c->override_base ? *c->override_base :
1526 ctxt->ds_base,
1527 c->regs[VCPU_REGS_RSI]);
1528 if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
1529 &c->src.val,
1530 c->src.bytes,
1531 ctxt->vcpu)) != 0)
1532 goto done;
1533
1534 c->dst.type = OP_NONE; /* Disable writeback. */
1535 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1536 c->dst.ptr = (unsigned long *)register_address(
1537 ctxt->es_base,
1538 c->regs[VCPU_REGS_RDI]);
1539 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1540 &c->dst.val,
1541 c->dst.bytes,
1542 ctxt->vcpu)) != 0)
1543 goto done;
1544
1545 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
1546
1547 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1548
1549 register_address_increment(c->regs[VCPU_REGS_RSI],
1550 (ctxt->eflags & EFLG_DF) ? -c->src.bytes
1551 : c->src.bytes);
1552 register_address_increment(c->regs[VCPU_REGS_RDI],
1553 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1554 : c->dst.bytes);
1555
1556 break;
1557 case 0xaa ... 0xab: /* stos */
1558 c->dst.type = OP_MEM;
1559 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1560 c->dst.ptr = (unsigned long *)register_address(
1561 ctxt->es_base,
1562 c->regs[VCPU_REGS_RDI]);
1563 c->dst.val = c->regs[VCPU_REGS_RAX];
1564 register_address_increment(c->regs[VCPU_REGS_RDI],
1565 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1566 : c->dst.bytes);
1567 break;
1568 case 0xac ... 0xad: /* lods */
1569 c->dst.type = OP_REG;
1570 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1571 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1572 if ((rc = ops->read_emulated(register_address(
1573 c->override_base ? *c->override_base :
1574 ctxt->ds_base,
1575 c->regs[VCPU_REGS_RSI]),
1576 &c->dst.val,
1577 c->dst.bytes,
1578 ctxt->vcpu)) != 0)
1579 goto done;
1580 register_address_increment(c->regs[VCPU_REGS_RSI],
1581 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1582 : c->dst.bytes);
1583 break;
1584 case 0xae ... 0xaf: /* scas */
1585 DPRINTF("Urk! I don't handle SCAS.\n");
1586 goto cannot_emulate;
1587 case 0xc0 ... 0xc1:
1588 emulate_grp2(ctxt);
1589 break;
1590 case 0xc3: /* ret */
1591 c->dst.ptr = &c->eip;
1592 goto pop_instruction;
1593 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1594 mov:
1595 c->dst.val = c->src.val;
1596 break;
1597 case 0xd0 ... 0xd1: /* Grp2 */
1598 c->src.val = 1;
1599 emulate_grp2(ctxt);
1600 break;
1601 case 0xd2 ... 0xd3: /* Grp2 */
1602 c->src.val = c->regs[VCPU_REGS_RCX];
1603 emulate_grp2(ctxt);
1604 break;
1605 case 0xe8: /* call (near) */ {
1606 long int rel;
1607 switch (c->op_bytes) {
1608 case 2:
1609 rel = insn_fetch(s16, 2, c->eip);
1610 break;
1611 case 4:
1612 rel = insn_fetch(s32, 4, c->eip);
1613 break;
1614 default:
1615 DPRINTF("Call: Invalid op_bytes\n");
1616 goto cannot_emulate;
1617 }
1618 c->src.val = (unsigned long) c->eip;
1619 JMP_REL(rel);
1620 c->op_bytes = c->ad_bytes;
1621 emulate_push(ctxt);
1622 break;
1623 }
1624 case 0xe9: /* jmp rel */
1625 case 0xeb: /* jmp rel short */
1626 JMP_REL(c->src.val);
1627 c->dst.type = OP_NONE; /* Disable writeback. */
1628 break;
1629 case 0xf4: /* hlt */
1630 ctxt->vcpu->arch.halt_request = 1;
1631 goto done;
1632 case 0xf5: /* cmc */
1633 /* complement carry flag from eflags reg */
1634 ctxt->eflags ^= EFLG_CF;
1635 c->dst.type = OP_NONE; /* Disable writeback. */
1636 break;
1637 case 0xf6 ... 0xf7: /* Grp3 */
1638 rc = emulate_grp3(ctxt, ops);
1639 if (rc != 0)
1640 goto done;
1641 break;
1642 case 0xf8: /* clc */
1643 ctxt->eflags &= ~EFLG_CF;
1644 c->dst.type = OP_NONE; /* Disable writeback. */
1645 break;
1646 case 0xfa: /* cli */
1647 ctxt->eflags &= ~X86_EFLAGS_IF;
1648 c->dst.type = OP_NONE; /* Disable writeback. */
1649 break;
1650 case 0xfb: /* sti */
1651 ctxt->eflags |= X86_EFLAGS_IF;
1652 c->dst.type = OP_NONE; /* Disable writeback. */
1653 break;
1654 case 0xfe ... 0xff: /* Grp4/Grp5 */
1655 rc = emulate_grp45(ctxt, ops);
1656 if (rc != 0)
1657 goto done;
1658 break;
1659 }
1660
1661writeback:
1662 rc = writeback(ctxt, ops);
1663 if (rc != 0)
1664 goto done;
1665
1666 /* Commit shadow register state. */
1667 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
1668 ctxt->vcpu->arch.rip = c->eip;
1669
1670done:
1671 if (rc == X86EMUL_UNHANDLEABLE) {
1672 c->eip = saved_eip;
1673 return -1;
1674 }
1675 return 0;
1676
1677twobyte_insn:
1678 switch (c->b) {
1679 case 0x01: /* lgdt, lidt, lmsw */
1680 switch (c->modrm_reg) {
1681 u16 size;
1682 unsigned long address;
1683
1684 case 0: /* vmcall */
1685 if (c->modrm_mod != 3 || c->modrm_rm != 1)
1686 goto cannot_emulate;
1687
1688 rc = kvm_fix_hypercall(ctxt->vcpu);
1689 if (rc)
1690 goto done;
1691
1692 kvm_emulate_hypercall(ctxt->vcpu);
1693 break;
1694 case 2: /* lgdt */
1695 rc = read_descriptor(ctxt, ops, c->src.ptr,
1696 &size, &address, c->op_bytes);
1697 if (rc)
1698 goto done;
1699 realmode_lgdt(ctxt->vcpu, size, address);
1700 break;
1701 case 3: /* lidt/vmmcall */
1702 if (c->modrm_mod == 3 && c->modrm_rm == 1) {
1703 rc = kvm_fix_hypercall(ctxt->vcpu);
1704 if (rc)
1705 goto done;
1706 kvm_emulate_hypercall(ctxt->vcpu);
1707 } else {
1708 rc = read_descriptor(ctxt, ops, c->src.ptr,
1709 &size, &address,
1710 c->op_bytes);
1711 if (rc)
1712 goto done;
1713 realmode_lidt(ctxt->vcpu, size, address);
1714 }
1715 break;
1716 case 4: /* smsw */
1717 if (c->modrm_mod != 3)
1718 goto cannot_emulate;
1719 *(u16 *)&c->regs[c->modrm_rm]
1720 = realmode_get_cr(ctxt->vcpu, 0);
1721 break;
1722 case 6: /* lmsw */
1723 if (c->modrm_mod != 3)
1724 goto cannot_emulate;
1725 realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
1726 &ctxt->eflags);
1727 break;
1728 case 7: /* invlpg*/
1729 emulate_invlpg(ctxt->vcpu, memop);
1730 break;
1731 default:
1732 goto cannot_emulate;
1733 }
1734 /* Disable writeback. */
1735 c->dst.type = OP_NONE;
1736 break;
1737 case 0x06:
1738 emulate_clts(ctxt->vcpu);
1739 c->dst.type = OP_NONE;
1740 break;
1741 case 0x08: /* invd */
1742 case 0x09: /* wbinvd */
1743 case 0x0d: /* GrpP (prefetch) */
1744 case 0x18: /* Grp16 (prefetch/nop) */
1745 c->dst.type = OP_NONE;
1746 break;
1747 case 0x20: /* mov cr, reg */
1748 if (c->modrm_mod != 3)
1749 goto cannot_emulate;
1750 c->regs[c->modrm_rm] =
1751 realmode_get_cr(ctxt->vcpu, c->modrm_reg);
1752 c->dst.type = OP_NONE; /* no writeback */
1753 break;
1754 case 0x21: /* mov from dr to reg */
1755 if (c->modrm_mod != 3)
1756 goto cannot_emulate;
1757 rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
1758 if (rc)
1759 goto cannot_emulate;
1760 c->dst.type = OP_NONE; /* no writeback */
1761 break;
1762 case 0x22: /* mov reg, cr */
1763 if (c->modrm_mod != 3)
1764 goto cannot_emulate;
1765 realmode_set_cr(ctxt->vcpu,
1766 c->modrm_reg, c->modrm_val, &ctxt->eflags);
1767 c->dst.type = OP_NONE;
1768 break;
1769 case 0x23: /* mov from reg to dr */
1770 if (c->modrm_mod != 3)
1771 goto cannot_emulate;
1772 rc = emulator_set_dr(ctxt, c->modrm_reg,
1773 c->regs[c->modrm_rm]);
1774 if (rc)
1775 goto cannot_emulate;
1776 c->dst.type = OP_NONE; /* no writeback */
1777 break;
1778 case 0x30:
1779 /* wrmsr */
1780 msr_data = (u32)c->regs[VCPU_REGS_RAX]
1781 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
1782 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
1783 if (rc) {
1784 kvm_inject_gp(ctxt->vcpu, 0);
1785 c->eip = ctxt->vcpu->arch.rip;
1786 }
1787 rc = X86EMUL_CONTINUE;
1788 c->dst.type = OP_NONE;
1789 break;
1790 case 0x32:
1791 /* rdmsr */
1792 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
1793 if (rc) {
1794 kvm_inject_gp(ctxt->vcpu, 0);
1795 c->eip = ctxt->vcpu->arch.rip;
1796 } else {
1797 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
1798 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
1799 }
1800 rc = X86EMUL_CONTINUE;
1801 c->dst.type = OP_NONE;
1802 break;
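	/*
	 * Illustrative note (not in the original source): both MSR cases
	 * above treat the 64-bit MSR value as an EDX:EAX pair.  For example,
	 * with RAX = 0x89abcdef and RDX = 0x01234567, the wrmsr path
	 * assembles msr_data = 0x0123456789abcdef; the rdmsr path performs
	 * the inverse split, returning the low 32 bits in RAX and the high
	 * 32 bits in RDX.
	 */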
1803 case 0x40 ... 0x4f: /* cmov */
1804 c->dst.val = c->dst.orig_val = c->src.val;
1805 if (!test_cc(c->b, ctxt->eflags))
1806 c->dst.type = OP_NONE; /* no writeback */
1807 break;
1808 case 0x80 ... 0x8f: /* jnz rel, etc*/ {
1809 long int rel;
1810
1811 switch (c->op_bytes) {
1812 case 2:
1813 rel = insn_fetch(s16, 2, c->eip);
1814 break;
1815 case 4:
1816 rel = insn_fetch(s32, 4, c->eip);
1817 break;
1818 case 8:
1819 rel = insn_fetch(s64, 8, c->eip);
1820 break;
1821 default:
1822 DPRINTF("jnz: Invalid op_bytes\n");
1823 goto cannot_emulate;
1824 }
1825 if (test_cc(c->b, ctxt->eflags))
1826 JMP_REL(rel);
1827 c->dst.type = OP_NONE;
1828 break;
1829 }
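	/*
	 * Sketch of the arithmetic above (illustrative, assuming JMP_REL()
	 * adds the sign-extended displacement to c->eip): with op_bytes == 2
	 * and displacement bytes 0xfe 0xff, insn_fetch(s16, ...) yields
	 * rel = -2, so a taken branch moves eip back two bytes from the end
	 * of the instruction.
	 */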
1830 case 0xa3:
1831 bt: /* bt */
1832 c->dst.type = OP_NONE;
1833 /* only subword offset */
1834 c->src.val &= (c->dst.bytes << 3) - 1;
1835 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
1836 break;
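	/*
	 * Worked example for the "subword offset" mask used by the bit
	 * instructions (illustrative): with a 4-byte destination,
	 * (c->dst.bytes << 3) - 1 == 31, so a source bit offset of 35 is
	 * reduced to 3 before the flags are computed.
	 */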
1837 case 0xab:
1838 bts: /* bts */
1839 /* only subword offset */
1840 c->src.val &= (c->dst.bytes << 3) - 1;
1841 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
1842 break;
1843 case 0xb0 ... 0xb1: /* cmpxchg */
1844 /*
1845 * Save real source value, then compare EAX against
1846 * destination.
1847 */
1848 c->src.orig_val = c->src.val;
1849 c->src.val = c->regs[VCPU_REGS_RAX];
1850 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1851 if (ctxt->eflags & EFLG_ZF) {
1852 /* Success: write back to memory. */
1853 c->dst.val = c->src.orig_val;
1854 } else {
1855 /* Failure: write the value we saw to EAX. */
1856 c->dst.type = OP_REG;
1857 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1858 }
1859 break;
1860 case 0xb3:
1861 btr: /* btr */
1862 /* only subword offset */
1863 c->src.val &= (c->dst.bytes << 3) - 1;
1864 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
1865 break;
1866 case 0xb6 ... 0xb7: /* movzx */
1867 c->dst.bytes = c->op_bytes;
1868 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
1869 : (u16) c->src.val;
1870 break;
1871 case 0xba: /* Grp8 */
1872 switch (c->modrm_reg & 3) {
1873 case 0:
1874 goto bt;
1875 case 1:
1876 goto bts;
1877 case 2:
1878 goto btr;
1879 case 3:
1880 goto btc;
1881 }
1882 break;
1883 case 0xbb:
1884 btc: /* btc */
1885 /* only subword offset */
1886 c->src.val &= (c->dst.bytes << 3) - 1;
1887 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
1888 break;
1889 case 0xbe ... 0xbf: /* movsx */
1890 c->dst.bytes = c->op_bytes;
1891 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
1892 (s16) c->src.val;
1893 break;
1894 case 0xc3: /* movnti */
1895 c->dst.bytes = c->op_bytes;
1896 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
1897 (u64) c->src.val;
1898 break;
1899 case 0xc7: /* Grp9 (cmpxchg8b) */
1900 rc = emulate_grp9(ctxt, ops, memop);
1901 if (rc != 0)
1902 goto done;
1903 c->dst.type = OP_NONE;
1904 break;
1905 }
1906 goto writeback;
1907
1908cannot_emulate:
1909 DPRINTF("Cannot emulate %02x\n", c->b);
1910 c->eip = saved_eip;
1911 return -1;
1912}
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a63373759f08..5afdde4895dc 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -67,6 +67,7 @@
 #include <asm/mce.h>
 #include <asm/io.h>
 #include <asm/i387.h>
+#include <asm/reboot.h> /* for struct machine_ops */
 
 /*G:010 Welcome to the Guest!
  *
@@ -813,7 +814,7 @@ static void lguest_safe_halt(void)
  * rather than virtual addresses, so we use __pa() here. */
 static void lguest_power_off(void)
 {
-	hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
+	hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0);
 }
 
 /*
@@ -823,7 +824,7 @@ static void lguest_power_off(void)
  */
 static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
 {
-	hcall(LHCALL_CRASH, __pa(p), 0, 0);
+	hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0);
 	/* The hcall won't return, but to keep gcc happy, we're "done". */
 	return NOTIFY_DONE;
 }
@@ -927,6 +928,11 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
 	return insn_len;
 }
 
+static void lguest_restart(char *reason)
+{
+	hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0);
+}
+
 /*G:030 Once we get to lguest_init(), we know we're a Guest.  The pv_ops
  * structures in the kernel provide points for (almost) every routine we have
  * to override to avoid privileged instructions. */
@@ -1060,6 +1066,7 @@ __init void lguest_init(void)
  * the Guest routine to power off. */
 	pm_power_off = lguest_power_off;
 
+	machine_ops.restart = lguest_restart;
 	/* Now we're set up, call start_kernel() in init/main.c and we proceed
 	 * to boot as normal.  It never returns. */
 	start_kernel();
diff --git a/block/bsg.c b/block/bsg.c
index 69b0a9d33306..8917c5174dc2 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -279,6 +279,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr)
 		goto out;
 	}
 	rq->next_rq = next_rq;
+	next_rq->cmd_type = rq->cmd_type;
 
 	dxferp = (void*)(unsigned long)hdr->din_xferp;
 	ret = blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len);
diff --git a/drivers/Kconfig b/drivers/Kconfig
index f4076d9e9902..08d4ae201597 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
 
 source "drivers/auxdisplay/Kconfig"
 
-source "drivers/kvm/Kconfig"
-
 source "drivers/uio/Kconfig"
 
 source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index d92d4d82d001..0ee9a8a4095e 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/
 obj-$(CONFIG_PCCARD) += pcmcia/
 obj-$(CONFIG_DIO) += dio/
 obj-$(CONFIG_SBUS) += sbus/
-obj-$(CONFIG_KVM) += kvm/
 obj-$(CONFIG_ZORRO) += zorro/
 obj-$(CONFIG_MAC) += macintosh/
 obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/
@@ -73,7 +72,7 @@ obj-$(CONFIG_ISDN) += isdn/
 obj-$(CONFIG_EDAC) += edac/
 obj-$(CONFIG_MCA) += mca/
 obj-$(CONFIG_EISA) += eisa/
-obj-$(CONFIG_LGUEST_GUEST) += lguest/
+obj-y += lguest/
 obj-$(CONFIG_CPU_FREQ) += cpufreq/
 obj-$(CONFIG_CPU_IDLE) += cpuidle/
 obj-$(CONFIG_MMC) += mmc/
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index f484495b2ad1..055989e94799 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -163,15 +163,6 @@ static struct kset *bus_kset;
 
 #ifdef CONFIG_HOTPLUG
 /* Manually detach a device from its associated driver. */
-static int driver_helper(struct device *dev, void *data)
-{
-	const char *name = data;
-
-	if (strcmp(name, dev->bus_id) == 0)
-		return 1;
-	return 0;
-}
-
 static ssize_t driver_unbind(struct device_driver *drv,
 			     const char *buf, size_t count)
 {
@@ -179,7 +170,7 @@ static ssize_t driver_unbind(struct device_driver *drv,
 	struct device *dev;
 	int err = -ENODEV;
 
-	dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+	dev = bus_find_device_by_name(bus, NULL, buf);
 	if (dev && dev->driver == drv) {
 		if (dev->parent) /* Needed for USB */
 			down(&dev->parent->sem);
@@ -206,7 +197,7 @@ static ssize_t driver_bind(struct device_driver *drv,
 	struct device *dev;
 	int err = -ENODEV;
 
-	dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+	dev = bus_find_device_by_name(bus, NULL, buf);
 	if (dev && dev->driver == NULL) {
 		if (dev->parent) /* Needed for USB */
 			down(&dev->parent->sem);
@@ -250,7 +241,7 @@ static ssize_t store_drivers_probe(struct bus_type *bus,
 {
 	struct device *dev;
 
-	dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+	dev = bus_find_device_by_name(bus, NULL, buf);
 	if (!dev)
 		return -ENODEV;
 	if (bus_rescan_devices_helper(dev, NULL) != 0)
@@ -338,6 +329,32 @@ struct device *bus_find_device(struct bus_type *bus,
 }
 EXPORT_SYMBOL_GPL(bus_find_device);
 
+static int match_name(struct device *dev, void *data)
+{
+	const char *name = data;
+
+	if (strcmp(name, dev->bus_id) == 0)
+		return 1;
+	return 0;
+}
+
+/**
+ * bus_find_device_by_name - device iterator for locating a particular device of a specific name
+ * @bus: bus type
+ * @start: Device to begin with
+ * @name: name of the device to match
+ *
+ * This is similar to the bus_find_device() function above, but it handles
+ * searching by a name automatically, no need to write another strcmp matching
+ * function.
+ */
+struct device *bus_find_device_by_name(struct bus_type *bus,
+				       struct device *start, const char *name)
+{
+	return bus_find_device(bus, start, (void *)name, match_name);
+}
+EXPORT_SYMBOL_GPL(bus_find_device_by_name);
+
 static struct device_driver *next_driver(struct klist_iter *i)
 {
 	struct klist_node *n = klist_next(i);
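For orientation, a minimal caller of the helper added above might look like the sketch below. Only the bus_find_device_by_name() signature comes from this patch; the bus pointer and the "mydev0" name are hypothetical placeholders, and the put_device() call follows the usual driver-core convention of dropping the reference taken while searching.

/* Hypothetical example: report whether a device named "mydev0" is on @bus. */
static bool my_bus_has_mydev0(struct bus_type *bus)
{
	struct device *dev = bus_find_device_by_name(bus, NULL, "mydev0");

	if (!dev)
		return false;
	put_device(dev);	/* drop the reference taken during the search */
	return true;
}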
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 59cf35894cfc..9d915376c313 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -149,7 +149,7 @@ int class_register(struct class *cls)
 	if (error)
 		return error;
 
-#ifdef CONFIG_SYSFS_DEPRECATED
+#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK)
 	/* let the block class directory show up in the root of sysfs */
 	if (cls != &block_class)
 		cls->subsys.kobj.kset = class_kset;
@@ -863,7 +863,7 @@ EXPORT_SYMBOL_GPL(class_for_each_device);
  * The callback should return 0 if the device doesn't match and non-zero
  * if it does.  If the callback returns non-zero, this function will
  * return to the caller and not iterate over any more devices.
-
+ *
  * Note, you will need to drop the reference with put_device() after use.
  *
  * We hold class->sem in this function, so it can not be
diff --git a/drivers/base/core.c b/drivers/base/core.c
index edf3bbeb8d6a..b1727876182c 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -27,9 +27,17 @@
 int (*platform_notify)(struct device *dev) = NULL;
 int (*platform_notify_remove)(struct device *dev) = NULL;
 
-/*
- * sysfs bindings for devices.
- */
+#ifdef CONFIG_BLOCK
+static inline int device_is_not_partition(struct device *dev)
+{
+	return !(dev->type == &part_type);
+}
+#else
+static inline int device_is_not_partition(struct device *dev)
+{
+	return 1;
+}
+#endif
 
 /**
  * dev_driver_string - Return a device's driver name, if at all possible
@@ -652,14 +660,14 @@ static int device_add_class_symlinks(struct device *dev)
 #ifdef CONFIG_SYSFS_DEPRECATED
 	/* stacked class devices need a symlink in the class directory */
 	if (dev->kobj.parent != &dev->class->subsys.kobj &&
-	    dev->type != &part_type) {
+	    device_is_not_partition(dev)) {
 		error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj,
 					  dev->bus_id);
 		if (error)
 			goto out_subsys;
 	}
 
-	if (dev->parent && dev->type != &part_type) {
+	if (dev->parent && device_is_not_partition(dev)) {
 		struct device *parent = dev->parent;
 		char *class_name;
 
@@ -688,11 +696,11 @@ static int device_add_class_symlinks(struct device *dev)
 	return 0;
 
 out_device:
-	if (dev->parent && dev->type != &part_type)
+	if (dev->parent && device_is_not_partition(dev))
 		sysfs_remove_link(&dev->kobj, "device");
 out_busid:
 	if (dev->kobj.parent != &dev->class->subsys.kobj &&
-	    dev->type != &part_type)
+	    device_is_not_partition(dev))
 		sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
 #else
 	/* link in the class directory pointing to the device */
@@ -701,7 +709,7 @@ out_busid:
 	if (error)
 		goto out_subsys;
 
-	if (dev->parent && dev->type != &part_type) {
+	if (dev->parent && device_is_not_partition(dev)) {
 		error = sysfs_create_link(&dev->kobj, &dev->parent->kobj,
 					  "device");
 		if (error)
@@ -725,7 +733,7 @@ static void device_remove_class_symlinks(struct device *dev)
 		return;
 
 #ifdef CONFIG_SYSFS_DEPRECATED
-	if (dev->parent && dev->type != &part_type) {
+	if (dev->parent && device_is_not_partition(dev)) {
 		char *class_name;
 
 		class_name = make_class_name(dev->class->name, &dev->kobj);
@@ -737,10 +745,10 @@ static void device_remove_class_symlinks(struct device *dev)
 	}
 
 	if (dev->kobj.parent != &dev->class->subsys.kobj &&
-	    dev->type != &part_type)
+	    device_is_not_partition(dev))
 		sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
 #else
-	if (dev->parent && dev->type != &part_type)
+	if (dev->parent && device_is_not_partition(dev))
 		sysfs_remove_link(&dev->kobj, "device");
 
 	sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index f2d2c7e2c76b..195ce7c12319 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1571,7 +1571,6 @@ static struct scsi_host_template srp_template = {
 	.this_id = -1,
 	.cmd_per_lun = SRP_SQ_SIZE,
 	.use_clustering = ENABLE_CLUSTERING,
-	.use_sg_chaining = ENABLE_SG_CHAINING,
 	.shost_attrs = srp_host_attrs
 };
 
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
deleted file mode 100644
index 11fc014e2b30..000000000000
--- a/drivers/kvm/irq.h
+++ /dev/null
@@ -1,165 +0,0 @@
1/*
2 * irq.h: in kernel interrupt controller related definitions
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#ifndef __IRQ_H
23#define __IRQ_H
24
25#include "kvm.h"
26
27typedef void irq_request_func(void *opaque, int level);
28
29struct kvm_kpic_state {
30 u8 last_irr; /* edge detection */
31 u8 irr; /* interrupt request register */
32 u8 imr; /* interrupt mask register */
33 u8 isr; /* interrupt service register */
34 u8 priority_add; /* highest irq priority */
35 u8 irq_base;
36 u8 read_reg_select;
37 u8 poll;
38 u8 special_mask;
39 u8 init_state;
40 u8 auto_eoi;
41 u8 rotate_on_auto_eoi;
42 u8 special_fully_nested_mode;
43 u8 init4; /* true if 4 byte init */
44 u8 elcr; /* PIIX edge/trigger selection */
45 u8 elcr_mask;
46 struct kvm_pic *pics_state;
47};
48
49struct kvm_pic {
50 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
51 irq_request_func *irq_request;
52 void *irq_request_opaque;
53 int output; /* intr from master PIC */
54 struct kvm_io_device dev;
55};
56
57struct kvm_pic *kvm_create_pic(struct kvm *kvm);
58void kvm_pic_set_irq(void *opaque, int irq, int level);
59int kvm_pic_read_irq(struct kvm_pic *s);
60int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
61int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
62void kvm_pic_update_irq(struct kvm_pic *s);
63
64#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
65#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
66#define IOAPIC_EDGE_TRIG 0
67#define IOAPIC_LEVEL_TRIG 1
68
69#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
70#define IOAPIC_MEM_LENGTH 0x100
71
72/* Direct registers. */
73#define IOAPIC_REG_SELECT 0x00
74#define IOAPIC_REG_WINDOW 0x10
75#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
76
77/* Indirect registers. */
78#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
79#define IOAPIC_REG_VERSION 0x01
80#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
81
82struct kvm_ioapic {
83 u64 base_address;
84 u32 ioregsel;
85 u32 id;
86 u32 irr;
87 u32 pad;
88 union ioapic_redir_entry {
89 u64 bits;
90 struct {
91 u8 vector;
92 u8 delivery_mode:3;
93 u8 dest_mode:1;
94 u8 delivery_status:1;
95 u8 polarity:1;
96 u8 remote_irr:1;
97 u8 trig_mode:1;
98 u8 mask:1;
99 u8 reserve:7;
100 u8 reserved[4];
101 u8 dest_id;
102 } fields;
103 } redirtbl[IOAPIC_NUM_PINS];
104 struct kvm_io_device dev;
105 struct kvm *kvm;
106};
107
108struct kvm_lapic {
109 unsigned long base_address;
110 struct kvm_io_device dev;
111 struct {
112 atomic_t pending;
113 s64 period; /* unit: ns */
114 u32 divide_count;
115 ktime_t last_update;
116 struct hrtimer dev;
117 } timer;
118 struct kvm_vcpu *vcpu;
119 struct page *regs_page;
120 void *regs;
121};
122
123#ifdef DEBUG
124#define ASSERT(x) \
125do { \
126 if (!(x)) { \
127 printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
128 __FILE__, __LINE__, #x); \
129 BUG(); \
130 } \
131} while (0)
132#else
133#define ASSERT(x) do { } while (0)
134#endif
135
136void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
137int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
138int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
139int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
140int kvm_create_lapic(struct kvm_vcpu *vcpu);
141void kvm_lapic_reset(struct kvm_vcpu *vcpu);
142void kvm_free_apic(struct kvm_lapic *apic);
143u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
144void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
145void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
146struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
147 unsigned long bitmap);
148u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
149void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
150int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
151void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
152int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
153int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig);
154void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
155int kvm_ioapic_init(struct kvm *kvm);
156void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
157int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
158int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
159void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
160void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
161void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
162void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
163void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
164
165#endif
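As a reading aid for the redirection-table union above, the following snippet is an illustrative sketch (not code from the tree) of filling an entry through the bitfields; it assumes, as the in-kernel code does, that the compiler lays the bitfields out to match the hardware register format.

	/* Illustrative only: a masked, level-triggered entry for vector 0x30. */
	union ioapic_redir_entry e = { .bits = 0 };

	e.fields.vector    = 0x30;
	e.fields.trig_mode = IOAPIC_LEVEL_TRIG;
	e.fields.mask      = 1;	/* leave it masked until the guest unmasks it */
	/* e.bits now holds the 64-bit value a guest would program, one
	 * 32-bit half at a time, via IOAPIC_REG_SELECT/IOAPIC_REG_WINDOW. */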
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
deleted file mode 100644
index feb5ac986c5d..000000000000
--- a/drivers/kvm/mmu.c
+++ /dev/null
@@ -1,1498 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include "vmx.h"
21#include "kvm.h"
22
23#include <linux/types.h>
24#include <linux/string.h>
25#include <linux/mm.h>
26#include <linux/highmem.h>
27#include <linux/module.h>
28
29#include <asm/page.h>
30#include <asm/cmpxchg.h>
31
32#undef MMU_DEBUG
33
34#undef AUDIT
35
36#ifdef AUDIT
37static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
38#else
39static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
40#endif
41
42#ifdef MMU_DEBUG
43
44#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
45#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
46
47#else
48
49#define pgprintk(x...) do { } while (0)
50#define rmap_printk(x...) do { } while (0)
51
52#endif
53
54#if defined(MMU_DEBUG) || defined(AUDIT)
55static int dbg = 1;
56#endif
57
58#ifndef MMU_DEBUG
59#define ASSERT(x) do { } while (0)
60#else
61#define ASSERT(x) \
62 if (!(x)) { \
63 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
64 __FILE__, __LINE__, #x); \
65 }
66#endif
67
68#define PT64_PT_BITS 9
69#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
70#define PT32_PT_BITS 10
71#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
72
73#define PT_WRITABLE_SHIFT 1
74
75#define PT_PRESENT_MASK (1ULL << 0)
76#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
77#define PT_USER_MASK (1ULL << 2)
78#define PT_PWT_MASK (1ULL << 3)
79#define PT_PCD_MASK (1ULL << 4)
80#define PT_ACCESSED_MASK (1ULL << 5)
81#define PT_DIRTY_MASK (1ULL << 6)
82#define PT_PAGE_SIZE_MASK (1ULL << 7)
83#define PT_PAT_MASK (1ULL << 7)
84#define PT_GLOBAL_MASK (1ULL << 8)
85#define PT64_NX_MASK (1ULL << 63)
86
87#define PT_PAT_SHIFT 7
88#define PT_DIR_PAT_SHIFT 12
89#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
90
91#define PT32_DIR_PSE36_SIZE 4
92#define PT32_DIR_PSE36_SHIFT 13
93#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
94
95
96#define PT_FIRST_AVAIL_BITS_SHIFT 9
97#define PT64_SECOND_AVAIL_BITS_SHIFT 52
98
99#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
100
101#define VALID_PAGE(x) ((x) != INVALID_PAGE)
102
103#define PT64_LEVEL_BITS 9
104
105#define PT64_LEVEL_SHIFT(level) \
106 ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
107
108#define PT64_LEVEL_MASK(level) \
109 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
110
111#define PT64_INDEX(address, level)\
112 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
113
114
115#define PT32_LEVEL_BITS 10
116
117#define PT32_LEVEL_SHIFT(level) \
118 ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
119
120#define PT32_LEVEL_MASK(level) \
121 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
122
123#define PT32_INDEX(address, level)\
124 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
125
126
127#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
128#define PT64_DIR_BASE_ADDR_MASK \
129 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
130
131#define PT32_BASE_ADDR_MASK PAGE_MASK
132#define PT32_DIR_BASE_ADDR_MASK \
133 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
134
135
136#define PFERR_PRESENT_MASK (1U << 0)
137#define PFERR_WRITE_MASK (1U << 1)
138#define PFERR_USER_MASK (1U << 2)
139#define PFERR_FETCH_MASK (1U << 4)
140
141#define PT64_ROOT_LEVEL 4
142#define PT32_ROOT_LEVEL 2
143#define PT32E_ROOT_LEVEL 3
144
145#define PT_DIRECTORY_LEVEL 2
146#define PT_PAGE_TABLE_LEVEL 1
147
148#define RMAP_EXT 4
149
150struct kvm_rmap_desc {
151 u64 *shadow_ptes[RMAP_EXT];
152 struct kvm_rmap_desc *more;
153};
154
155static struct kmem_cache *pte_chain_cache;
156static struct kmem_cache *rmap_desc_cache;
157static struct kmem_cache *mmu_page_header_cache;
158
159static int is_write_protection(struct kvm_vcpu *vcpu)
160{
161 return vcpu->cr0 & X86_CR0_WP;
162}
163
164static int is_cpuid_PSE36(void)
165{
166 return 1;
167}
168
169static int is_nx(struct kvm_vcpu *vcpu)
170{
171 return vcpu->shadow_efer & EFER_NX;
172}
173
174static int is_present_pte(unsigned long pte)
175{
176 return pte & PT_PRESENT_MASK;
177}
178
179static int is_writeble_pte(unsigned long pte)
180{
181 return pte & PT_WRITABLE_MASK;
182}
183
184static int is_io_pte(unsigned long pte)
185{
186 return pte & PT_SHADOW_IO_MARK;
187}
188
189static int is_rmap_pte(u64 pte)
190{
191 return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
192 == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
193}
194
195static void set_shadow_pte(u64 *sptep, u64 spte)
196{
197#ifdef CONFIG_X86_64
198 set_64bit((unsigned long *)sptep, spte);
199#else
200 set_64bit((unsigned long long *)sptep, spte);
201#endif
202}
203
204static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
205 struct kmem_cache *base_cache, int min)
206{
207 void *obj;
208
209 if (cache->nobjs >= min)
210 return 0;
211 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
212 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
213 if (!obj)
214 return -ENOMEM;
215 cache->objects[cache->nobjs++] = obj;
216 }
217 return 0;
218}
219
220static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
221{
222 while (mc->nobjs)
223 kfree(mc->objects[--mc->nobjs]);
224}
225
226static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
227 int min)
228{
229 struct page *page;
230
231 if (cache->nobjs >= min)
232 return 0;
233 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
234 page = alloc_page(GFP_KERNEL);
235 if (!page)
236 return -ENOMEM;
237 set_page_private(page, 0);
238 cache->objects[cache->nobjs++] = page_address(page);
239 }
240 return 0;
241}
242
243static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
244{
245 while (mc->nobjs)
246 free_page((unsigned long)mc->objects[--mc->nobjs]);
247}
248
249static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
250{
251 int r;
252
253 kvm_mmu_free_some_pages(vcpu);
254 r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
255 pte_chain_cache, 4);
256 if (r)
257 goto out;
258 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
259 rmap_desc_cache, 1);
260 if (r)
261 goto out;
262 r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4);
263 if (r)
264 goto out;
265 r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
266 mmu_page_header_cache, 4);
267out:
268 return r;
269}
270
271static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
272{
273 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
274 mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
275 mmu_free_memory_cache_page(&vcpu->mmu_page_cache);
276 mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
277}
278
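/*
 * Added note (not in the original file): the caches consumed below are
 * filled in advance by mmu_topup_memory_caches(), so this helper can hand
 * out an object without allocating in the middle of shadow page table
 * manipulation; the BUG_ON() documents that callers must top up first.
 */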
279static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
280 size_t size)
281{
282 void *p;
283
284 BUG_ON(!mc->nobjs);
285 p = mc->objects[--mc->nobjs];
286 memset(p, 0, size);
287 return p;
288}
289
290static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
291{
292 return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
293 sizeof(struct kvm_pte_chain));
294}
295
296static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
297{
298 kfree(pc);
299}
300
301static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
302{
303 return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
304 sizeof(struct kvm_rmap_desc));
305}
306
307static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
308{
309 kfree(rd);
310}
311
312/*
313 * Reverse mapping data structures:
314 *
315 * If page->private bit zero is zero, then page->private points to the
316 * shadow page table entry that points to page_address(page).
317 *
318 * If page->private bit zero is one, (then page->private & ~1) points
319 * to a struct kvm_rmap_desc containing more mappings.
320 */
321static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
322{
323 struct page *page;
324 struct kvm_rmap_desc *desc;
325 int i;
326
327 if (!is_rmap_pte(*spte))
328 return;
329 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
330 if (!page_private(page)) {
331 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
332 set_page_private(page,(unsigned long)spte);
333 } else if (!(page_private(page) & 1)) {
334 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
335 desc = mmu_alloc_rmap_desc(vcpu);
336 desc->shadow_ptes[0] = (u64 *)page_private(page);
337 desc->shadow_ptes[1] = spte;
338 set_page_private(page,(unsigned long)desc | 1);
339 } else {
340 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
341 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
342 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
343 desc = desc->more;
344 if (desc->shadow_ptes[RMAP_EXT-1]) {
345 desc->more = mmu_alloc_rmap_desc(vcpu);
346 desc = desc->more;
347 }
348 for (i = 0; desc->shadow_ptes[i]; ++i)
349 ;
350 desc->shadow_ptes[i] = spte;
351 }
352}
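/*
 * Illustrative sketch of the page->private encoding described in the
 * comment above rmap_add() (these helpers are not part of the original
 * file): bit 0 acts as a tag that distinguishes "single spte pointer"
 * from "pointer to a kvm_rmap_desc chain".
 */
static inline int rmap_is_desc_chain(unsigned long rmap)
{
	return rmap & 1;			/* tag bit set: descriptor chain */
}

static inline struct kvm_rmap_desc *rmap_to_desc(unsigned long rmap)
{
	return (struct kvm_rmap_desc *)(rmap & ~1ul);	/* strip the tag bit */
}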
353
354static void rmap_desc_remove_entry(struct page *page,
355 struct kvm_rmap_desc *desc,
356 int i,
357 struct kvm_rmap_desc *prev_desc)
358{
359 int j;
360
361 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
362 ;
363 desc->shadow_ptes[i] = desc->shadow_ptes[j];
364 desc->shadow_ptes[j] = NULL;
365 if (j != 0)
366 return;
367 if (!prev_desc && !desc->more)
368 set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
369 else
370 if (prev_desc)
371 prev_desc->more = desc->more;
372 else
373 set_page_private(page,(unsigned long)desc->more | 1);
374 mmu_free_rmap_desc(desc);
375}
376
377static void rmap_remove(u64 *spte)
378{
379 struct page *page;
380 struct kvm_rmap_desc *desc;
381 struct kvm_rmap_desc *prev_desc;
382 int i;
383
384 if (!is_rmap_pte(*spte))
385 return;
386 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
387 if (!page_private(page)) {
388 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
389 BUG();
390 } else if (!(page_private(page) & 1)) {
391 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
392 if ((u64 *)page_private(page) != spte) {
393 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
394 spte, *spte);
395 BUG();
396 }
397 set_page_private(page,0);
398 } else {
399 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
400 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
401 prev_desc = NULL;
402 while (desc) {
403 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
404 if (desc->shadow_ptes[i] == spte) {
405 rmap_desc_remove_entry(page,
406 desc, i,
407 prev_desc);
408 return;
409 }
410 prev_desc = desc;
411 desc = desc->more;
412 }
413 BUG();
414 }
415}
416
417static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
418{
419 struct kvm *kvm = vcpu->kvm;
420 struct page *page;
421 struct kvm_rmap_desc *desc;
422 u64 *spte;
423
424 page = gfn_to_page(kvm, gfn);
425 BUG_ON(!page);
426
427 while (page_private(page)) {
428 if (!(page_private(page) & 1))
429 spte = (u64 *)page_private(page);
430 else {
431 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
432 spte = desc->shadow_ptes[0];
433 }
434 BUG_ON(!spte);
435 BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
436 != page_to_pfn(page));
437 BUG_ON(!(*spte & PT_PRESENT_MASK));
438 BUG_ON(!(*spte & PT_WRITABLE_MASK));
439 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
440 rmap_remove(spte);
441 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
442 kvm_flush_remote_tlbs(vcpu->kvm);
443 }
444}
445
446#ifdef MMU_DEBUG
447static int is_empty_shadow_page(u64 *spt)
448{
449 u64 *pos;
450 u64 *end;
451
452 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
453 if (*pos != 0) {
454 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
455 pos, *pos);
456 return 0;
457 }
458 return 1;
459}
460#endif
461
462static void kvm_mmu_free_page(struct kvm *kvm,
463 struct kvm_mmu_page *page_head)
464{
465 ASSERT(is_empty_shadow_page(page_head->spt));
466 list_del(&page_head->link);
467 __free_page(virt_to_page(page_head->spt));
468 kfree(page_head);
469 ++kvm->n_free_mmu_pages;
470}
471
472static unsigned kvm_page_table_hashfn(gfn_t gfn)
473{
474 return gfn;
475}
476
477static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
478 u64 *parent_pte)
479{
480 struct kvm_mmu_page *page;
481
482 if (!vcpu->kvm->n_free_mmu_pages)
483 return NULL;
484
485 page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
486 sizeof *page);
487 page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
488 set_page_private(virt_to_page(page->spt), (unsigned long)page);
489 list_add(&page->link, &vcpu->kvm->active_mmu_pages);
490 ASSERT(is_empty_shadow_page(page->spt));
491 page->slot_bitmap = 0;
492 page->multimapped = 0;
493 page->parent_pte = parent_pte;
494 --vcpu->kvm->n_free_mmu_pages;
495 return page;
496}
497
498static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
499 struct kvm_mmu_page *page, u64 *parent_pte)
500{
501 struct kvm_pte_chain *pte_chain;
502 struct hlist_node *node;
503 int i;
504
505 if (!parent_pte)
506 return;
507 if (!page->multimapped) {
508 u64 *old = page->parent_pte;
509
510 if (!old) {
511 page->parent_pte = parent_pte;
512 return;
513 }
514 page->multimapped = 1;
515 pte_chain = mmu_alloc_pte_chain(vcpu);
516 INIT_HLIST_HEAD(&page->parent_ptes);
517 hlist_add_head(&pte_chain->link, &page->parent_ptes);
518 pte_chain->parent_ptes[0] = old;
519 }
520 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
521 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
522 continue;
523 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
524 if (!pte_chain->parent_ptes[i]) {
525 pte_chain->parent_ptes[i] = parent_pte;
526 return;
527 }
528 }
529 pte_chain = mmu_alloc_pte_chain(vcpu);
530 BUG_ON(!pte_chain);
531 hlist_add_head(&pte_chain->link, &page->parent_ptes);
532 pte_chain->parent_ptes[0] = parent_pte;
533}
534
535static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
536 u64 *parent_pte)
537{
538 struct kvm_pte_chain *pte_chain;
539 struct hlist_node *node;
540 int i;
541
542 if (!page->multimapped) {
543 BUG_ON(page->parent_pte != parent_pte);
544 page->parent_pte = NULL;
545 return;
546 }
547 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
548 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
549 if (!pte_chain->parent_ptes[i])
550 break;
551 if (pte_chain->parent_ptes[i] != parent_pte)
552 continue;
553 while (i + 1 < NR_PTE_CHAIN_ENTRIES
554 && pte_chain->parent_ptes[i + 1]) {
555 pte_chain->parent_ptes[i]
556 = pte_chain->parent_ptes[i + 1];
557 ++i;
558 }
559 pte_chain->parent_ptes[i] = NULL;
560 if (i == 0) {
561 hlist_del(&pte_chain->link);
562 mmu_free_pte_chain(pte_chain);
563 if (hlist_empty(&page->parent_ptes)) {
564 page->multimapped = 0;
565 page->parent_pte = NULL;
566 }
567 }
568 return;
569 }
570 BUG();
571}
572
573static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
574 gfn_t gfn)
575{
576 unsigned index;
577 struct hlist_head *bucket;
578 struct kvm_mmu_page *page;
579 struct hlist_node *node;
580
581 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
582 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
583 bucket = &vcpu->kvm->mmu_page_hash[index];
584 hlist_for_each_entry(page, node, bucket, hash_link)
585 if (page->gfn == gfn && !page->role.metaphysical) {
586 pgprintk("%s: found role %x\n",
587 __FUNCTION__, page->role.word);
588 return page;
589 }
590 return NULL;
591}
592
593static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
594 gfn_t gfn,
595 gva_t gaddr,
596 unsigned level,
597 int metaphysical,
598 unsigned hugepage_access,
599 u64 *parent_pte)
600{
601 union kvm_mmu_page_role role;
602 unsigned index;
603 unsigned quadrant;
604 struct hlist_head *bucket;
605 struct kvm_mmu_page *page;
606 struct hlist_node *node;
607
608 role.word = 0;
609 role.glevels = vcpu->mmu.root_level;
610 role.level = level;
611 role.metaphysical = metaphysical;
612 role.hugepage_access = hugepage_access;
613 if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
614 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
615 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
616 role.quadrant = quadrant;
617 }
618 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
619 gfn, role.word);
620 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
621 bucket = &vcpu->kvm->mmu_page_hash[index];
622 hlist_for_each_entry(page, node, bucket, hash_link)
623 if (page->gfn == gfn && page->role.word == role.word) {
624 mmu_page_add_parent_pte(vcpu, page, parent_pte);
625 pgprintk("%s: found\n", __FUNCTION__);
626 return page;
627 }
628 page = kvm_mmu_alloc_page(vcpu, parent_pte);
629 if (!page)
630 return page;
631 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
632 page->gfn = gfn;
633 page->role = role;
634 hlist_add_head(&page->hash_link, bucket);
635 if (!metaphysical)
636 rmap_write_protect(vcpu, gfn);
637 return page;
638}
639
640static void kvm_mmu_page_unlink_children(struct kvm *kvm,
641 struct kvm_mmu_page *page)
642{
643 unsigned i;
644 u64 *pt;
645 u64 ent;
646
647 pt = page->spt;
648
649 if (page->role.level == PT_PAGE_TABLE_LEVEL) {
650 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
651 if (pt[i] & PT_PRESENT_MASK)
652 rmap_remove(&pt[i]);
653 pt[i] = 0;
654 }
655 kvm_flush_remote_tlbs(kvm);
656 return;
657 }
658
659 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
660 ent = pt[i];
661
662 pt[i] = 0;
663 if (!(ent & PT_PRESENT_MASK))
664 continue;
665 ent &= PT64_BASE_ADDR_MASK;
666 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
667 }
668 kvm_flush_remote_tlbs(kvm);
669}
670
671static void kvm_mmu_put_page(struct kvm_mmu_page *page,
672 u64 *parent_pte)
673{
674 mmu_page_remove_parent_pte(page, parent_pte);
675}
676
677static void kvm_mmu_zap_page(struct kvm *kvm,
678 struct kvm_mmu_page *page)
679{
680 u64 *parent_pte;
681
682 while (page->multimapped || page->parent_pte) {
683 if (!page->multimapped)
684 parent_pte = page->parent_pte;
685 else {
686 struct kvm_pte_chain *chain;
687
688 chain = container_of(page->parent_ptes.first,
689 struct kvm_pte_chain, link);
690 parent_pte = chain->parent_ptes[0];
691 }
692 BUG_ON(!parent_pte);
693 kvm_mmu_put_page(page, parent_pte);
694 set_shadow_pte(parent_pte, 0);
695 }
696 kvm_mmu_page_unlink_children(kvm, page);
697 if (!page->root_count) {
698 hlist_del(&page->hash_link);
699 kvm_mmu_free_page(kvm, page);
700 } else
701 list_move(&page->link, &kvm->active_mmu_pages);
702}
703
704static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
705{
706 unsigned index;
707 struct hlist_head *bucket;
708 struct kvm_mmu_page *page;
709 struct hlist_node *node, *n;
710 int r;
711
712 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
713 r = 0;
714 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
715 bucket = &vcpu->kvm->mmu_page_hash[index];
716 hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
717 if (page->gfn == gfn && !page->role.metaphysical) {
718 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
719 page->role.word);
720 kvm_mmu_zap_page(vcpu->kvm, page);
721 r = 1;
722 }
723 return r;
724}
725
726static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
727{
728 struct kvm_mmu_page *page;
729
730 while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
731 pgprintk("%s: zap %lx %x\n",
732 __FUNCTION__, gfn, page->role.word);
733 kvm_mmu_zap_page(vcpu->kvm, page);
734 }
735}
736
737static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
738{
739 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
740 struct kvm_mmu_page *page_head = page_header(__pa(pte));
741
742 __set_bit(slot, &page_head->slot_bitmap);
743}
744
745hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
746{
747 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
748
749 return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
750}
751
752hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
753{
754 struct page *page;
755
756 ASSERT((gpa & HPA_ERR_MASK) == 0);
757 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
758 if (!page)
759 return gpa | HPA_ERR_MASK;
760 return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
761 | (gpa & (PAGE_SIZE-1));
762}
763
764hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
765{
766 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
767
768 if (gpa == UNMAPPED_GVA)
769 return UNMAPPED_GVA;
770 return gpa_to_hpa(vcpu, gpa);
771}
772
773struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
774{
775 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
776
777 if (gpa == UNMAPPED_GVA)
778 return NULL;
779 return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
780}
781
782static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
783{
784}
785
786static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
787{
788 int level = PT32E_ROOT_LEVEL;
789 hpa_t table_addr = vcpu->mmu.root_hpa;
790
791 for (; ; level--) {
792 u32 index = PT64_INDEX(v, level);
793 u64 *table;
794 u64 pte;
795
796 ASSERT(VALID_PAGE(table_addr));
797 table = __va(table_addr);
798
799 if (level == 1) {
800 pte = table[index];
801 if (is_present_pte(pte) && is_writeble_pte(pte))
802 return 0;
803 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
804 page_header_update_slot(vcpu->kvm, table, v);
805 table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
806 PT_USER_MASK;
807 rmap_add(vcpu, &table[index]);
808 return 0;
809 }
810
811 if (table[index] == 0) {
812 struct kvm_mmu_page *new_table;
813 gfn_t pseudo_gfn;
814
815 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
816 >> PAGE_SHIFT;
817 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
818 v, level - 1,
819 1, 0, &table[index]);
820 if (!new_table) {
821 pgprintk("nonpaging_map: ENOMEM\n");
822 return -ENOMEM;
823 }
824
825 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
826 | PT_WRITABLE_MASK | PT_USER_MASK;
827 }
828 table_addr = table[index] & PT64_BASE_ADDR_MASK;
829 }
830}
831
832static void mmu_free_roots(struct kvm_vcpu *vcpu)
833{
834 int i;
835 struct kvm_mmu_page *page;
836
837 if (!VALID_PAGE(vcpu->mmu.root_hpa))
838 return;
839#ifdef CONFIG_X86_64
840 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
841 hpa_t root = vcpu->mmu.root_hpa;
842
843 page = page_header(root);
844 --page->root_count;
845 vcpu->mmu.root_hpa = INVALID_PAGE;
846 return;
847 }
848#endif
849 for (i = 0; i < 4; ++i) {
850 hpa_t root = vcpu->mmu.pae_root[i];
851
852 if (root) {
853 root &= PT64_BASE_ADDR_MASK;
854 page = page_header(root);
855 --page->root_count;
856 }
857 vcpu->mmu.pae_root[i] = INVALID_PAGE;
858 }
859 vcpu->mmu.root_hpa = INVALID_PAGE;
860}
861
862static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
863{
864 int i;
865 gfn_t root_gfn;
866 struct kvm_mmu_page *page;
867
868 root_gfn = vcpu->cr3 >> PAGE_SHIFT;
869
870#ifdef CONFIG_X86_64
871 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
872 hpa_t root = vcpu->mmu.root_hpa;
873
874 ASSERT(!VALID_PAGE(root));
875 page = kvm_mmu_get_page(vcpu, root_gfn, 0,
876 PT64_ROOT_LEVEL, 0, 0, NULL);
877 root = __pa(page->spt);
878 ++page->root_count;
879 vcpu->mmu.root_hpa = root;
880 return;
881 }
882#endif
883 for (i = 0; i < 4; ++i) {
884 hpa_t root = vcpu->mmu.pae_root[i];
885
886 ASSERT(!VALID_PAGE(root));
887 if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
888 if (!is_present_pte(vcpu->pdptrs[i])) {
889 vcpu->mmu.pae_root[i] = 0;
890 continue;
891 }
892 root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
893 } else if (vcpu->mmu.root_level == 0)
894 root_gfn = 0;
895 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
896 PT32_ROOT_LEVEL, !is_paging(vcpu),
897 0, NULL);
898 root = __pa(page->spt);
899 ++page->root_count;
900 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
901 }
902 vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
903}
904
905static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
906{
907 return vaddr;
908}
909
910static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
911 u32 error_code)
912{
913 gpa_t addr = gva;
914 hpa_t paddr;
915 int r;
916
917 r = mmu_topup_memory_caches(vcpu);
918 if (r)
919 return r;
920
921 ASSERT(vcpu);
922 ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
923
924
925 paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
926
927 if (is_error_hpa(paddr))
928 return 1;
929
930 return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
931}
932
933static void nonpaging_free(struct kvm_vcpu *vcpu)
934{
935 mmu_free_roots(vcpu);
936}
937
938static int nonpaging_init_context(struct kvm_vcpu *vcpu)
939{
940 struct kvm_mmu *context = &vcpu->mmu;
941
942 context->new_cr3 = nonpaging_new_cr3;
943 context->page_fault = nonpaging_page_fault;
944 context->gva_to_gpa = nonpaging_gva_to_gpa;
945 context->free = nonpaging_free;
946 context->root_level = 0;
947 context->shadow_root_level = PT32E_ROOT_LEVEL;
948 context->root_hpa = INVALID_PAGE;
949 return 0;
950}
951
952static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
953{
954 ++vcpu->stat.tlb_flush;
955 kvm_x86_ops->tlb_flush(vcpu);
956}
957
958static void paging_new_cr3(struct kvm_vcpu *vcpu)
959{
960 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
961 mmu_free_roots(vcpu);
962}
963
964static void inject_page_fault(struct kvm_vcpu *vcpu,
965 u64 addr,
966 u32 err_code)
967{
968 kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
969}
970
971static void paging_free(struct kvm_vcpu *vcpu)
972{
973 nonpaging_free(vcpu);
974}
975
976#define PTTYPE 64
977#include "paging_tmpl.h"
978#undef PTTYPE
979
980#define PTTYPE 32
981#include "paging_tmpl.h"
982#undef PTTYPE
983
984static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
985{
986 struct kvm_mmu *context = &vcpu->mmu;
987
988 ASSERT(is_pae(vcpu));
989 context->new_cr3 = paging_new_cr3;
990 context->page_fault = paging64_page_fault;
991 context->gva_to_gpa = paging64_gva_to_gpa;
992 context->free = paging_free;
993 context->root_level = level;
994 context->shadow_root_level = level;
995 context->root_hpa = INVALID_PAGE;
996 return 0;
997}
998
999static int paging64_init_context(struct kvm_vcpu *vcpu)
1000{
1001 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1002}
1003
1004static int paging32_init_context(struct kvm_vcpu *vcpu)
1005{
1006 struct kvm_mmu *context = &vcpu->mmu;
1007
1008 context->new_cr3 = paging_new_cr3;
1009 context->page_fault = paging32_page_fault;
1010 context->gva_to_gpa = paging32_gva_to_gpa;
1011 context->free = paging_free;
1012 context->root_level = PT32_ROOT_LEVEL;
1013 context->shadow_root_level = PT32E_ROOT_LEVEL;
1014 context->root_hpa = INVALID_PAGE;
1015 return 0;
1016}
1017
1018static int paging32E_init_context(struct kvm_vcpu *vcpu)
1019{
1020 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1021}
1022
1023static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1024{
1025 ASSERT(vcpu);
1026 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1027
1028 if (!is_paging(vcpu))
1029 return nonpaging_init_context(vcpu);
1030 else if (is_long_mode(vcpu))
1031 return paging64_init_context(vcpu);
1032 else if (is_pae(vcpu))
1033 return paging32E_init_context(vcpu);
1034 else
1035 return paging32_init_context(vcpu);
1036}
1037
1038static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1039{
1040 ASSERT(vcpu);
1041 if (VALID_PAGE(vcpu->mmu.root_hpa)) {
1042 vcpu->mmu.free(vcpu);
1043 vcpu->mmu.root_hpa = INVALID_PAGE;
1044 }
1045}
1046
1047int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1048{
1049 destroy_kvm_mmu(vcpu);
1050 return init_kvm_mmu(vcpu);
1051}
1052EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1053
1054int kvm_mmu_load(struct kvm_vcpu *vcpu)
1055{
1056 int r;
1057
1058 mutex_lock(&vcpu->kvm->lock);
1059 r = mmu_topup_memory_caches(vcpu);
1060 if (r)
1061 goto out;
1062 mmu_alloc_roots(vcpu);
1063 kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
1064 kvm_mmu_flush_tlb(vcpu);
1065out:
1066 mutex_unlock(&vcpu->kvm->lock);
1067 return r;
1068}
1069EXPORT_SYMBOL_GPL(kvm_mmu_load);
1070
1071void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1072{
1073 mmu_free_roots(vcpu);
1074}
1075
1076static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1077 struct kvm_mmu_page *page,
1078 u64 *spte)
1079{
1080 u64 pte;
1081 struct kvm_mmu_page *child;
1082
1083 pte = *spte;
1084 if (is_present_pte(pte)) {
1085 if (page->role.level == PT_PAGE_TABLE_LEVEL)
1086 rmap_remove(spte);
1087 else {
1088 child = page_header(pte & PT64_BASE_ADDR_MASK);
1089 mmu_page_remove_parent_pte(child, spte);
1090 }
1091 }
1092 set_shadow_pte(spte, 0);
1093 kvm_flush_remote_tlbs(vcpu->kvm);
1094}
1095
1096static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1097 struct kvm_mmu_page *page,
1098 u64 *spte,
1099 const void *new, int bytes)
1100{
1101 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1102 return;
1103
1104 if (page->role.glevels == PT32_ROOT_LEVEL)
1105 paging32_update_pte(vcpu, page, spte, new, bytes);
1106 else
1107 paging64_update_pte(vcpu, page, spte, new, bytes);
1108}
1109
1110void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1111 const u8 *new, int bytes)
1112{
1113 gfn_t gfn = gpa >> PAGE_SHIFT;
1114 struct kvm_mmu_page *page;
1115 struct hlist_node *node, *n;
1116 struct hlist_head *bucket;
1117 unsigned index;
1118 u64 *spte;
1119 unsigned offset = offset_in_page(gpa);
1120 unsigned pte_size;
1121 unsigned page_offset;
1122 unsigned misaligned;
1123 unsigned quadrant;
1124 int level;
1125 int flooded = 0;
1126 int npte;
1127
1128 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1129 if (gfn == vcpu->last_pt_write_gfn) {
1130 ++vcpu->last_pt_write_count;
1131 if (vcpu->last_pt_write_count >= 3)
1132 flooded = 1;
1133 } else {
1134 vcpu->last_pt_write_gfn = gfn;
1135 vcpu->last_pt_write_count = 1;
1136 }
1137 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1138 bucket = &vcpu->kvm->mmu_page_hash[index];
1139 hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
1140 if (page->gfn != gfn || page->role.metaphysical)
1141 continue;
1142 pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1143 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1144 misaligned |= bytes < 4;
1145 if (misaligned || flooded) {
1146 /*
1147 * Misaligned accesses are too much trouble to fix
1148 * up; also, they usually indicate a page is not used
1149 * as a page table.
1150 *
1151 * If we're seeing too many writes to a page,
1152 * it may no longer be a page table, or we may be
1153 * forking, in which case it is better to unmap the
1154 * page.
1155 */
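			/*
			 * Worked example for the misaligned test above
			 * (illustrative): with 8-byte shadow ptes, a 4-byte
			 * write at offset 6 gives (6 ^ 9) & ~7 == 8, i.e. the
			 * write straddles two ptes, so the page is zapped
			 * rather than patched.
			 */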
1156 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1157 gpa, bytes, page->role.word);
1158 kvm_mmu_zap_page(vcpu->kvm, page);
1159 continue;
1160 }
1161 page_offset = offset;
1162 level = page->role.level;
1163 npte = 1;
1164 if (page->role.glevels == PT32_ROOT_LEVEL) {
1165 page_offset <<= 1; /* 32->64 */
1166 /*
1167 * A 32-bit pde maps 4MB while the shadow pdes map
1168 * only 2MB. So we need to double the offset again
1169 * and zap two pdes instead of one.
1170 */
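			/*
			 * Illustrative example: a 4-byte guest write at
			 * offset 0x100 of a 32-bit page directory (guest pde
			 * 0x40, covering 0x10000000-0x103fffff) ends up as
			 * page_offset 0x400 after the two shifts, so shadow
			 * pdes 0x80 and 0x81 (the two 2MB entries covering
			 * that same 4MB range) are the ones zapped, npte = 2.
			 */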
1171 if (level == PT32_ROOT_LEVEL) {
1172 page_offset &= ~7; /* kill rounding error */
1173 page_offset <<= 1;
1174 npte = 2;
1175 }
1176 quadrant = page_offset >> PAGE_SHIFT;
1177 page_offset &= ~PAGE_MASK;
1178 if (quadrant != page->role.quadrant)
1179 continue;
1180 }
1181 spte = &page->spt[page_offset / sizeof(*spte)];
1182 while (npte--) {
1183 mmu_pte_write_zap_pte(vcpu, page, spte);
1184 mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
1185 ++spte;
1186 }
1187 }
1188}
1189
1190int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1191{
1192 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
1193
1194 return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
1195}
1196
1197void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1198{
1199 while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
1200 struct kvm_mmu_page *page;
1201
1202 page = container_of(vcpu->kvm->active_mmu_pages.prev,
1203 struct kvm_mmu_page, link);
1204 kvm_mmu_zap_page(vcpu->kvm, page);
1205 }
1206}
1207
1208static void free_mmu_pages(struct kvm_vcpu *vcpu)
1209{
1210 struct kvm_mmu_page *page;
1211
1212 while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1213 page = container_of(vcpu->kvm->active_mmu_pages.next,
1214 struct kvm_mmu_page, link);
1215 kvm_mmu_zap_page(vcpu->kvm, page);
1216 }
1217 free_page((unsigned long)vcpu->mmu.pae_root);
1218}
1219
1220static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1221{
1222 struct page *page;
1223 int i;
1224
1225 ASSERT(vcpu);
1226
1227 vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
1228
1229 /*
1230 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1231 * Therefore we need to allocate shadow page tables in the first
1232 * 4GB of memory, which happens to fit the DMA32 zone.
1233 */
1234 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1235 if (!page)
1236 goto error_1;
1237 vcpu->mmu.pae_root = page_address(page);
1238 for (i = 0; i < 4; ++i)
1239 vcpu->mmu.pae_root[i] = INVALID_PAGE;
1240
1241 return 0;
1242
1243error_1:
1244 free_mmu_pages(vcpu);
1245 return -ENOMEM;
1246}
1247
1248int kvm_mmu_create(struct kvm_vcpu *vcpu)
1249{
1250 ASSERT(vcpu);
1251 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1252
1253 return alloc_mmu_pages(vcpu);
1254}
1255
1256int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1257{
1258 ASSERT(vcpu);
1259 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1260
1261 return init_kvm_mmu(vcpu);
1262}
1263
1264void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1265{
1266 ASSERT(vcpu);
1267
1268 destroy_kvm_mmu(vcpu);
1269 free_mmu_pages(vcpu);
1270 mmu_free_memory_caches(vcpu);
1271}
1272
1273void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1274{
1275 struct kvm_mmu_page *page;
1276
1277 list_for_each_entry(page, &kvm->active_mmu_pages, link) {
1278 int i;
1279 u64 *pt;
1280
1281 if (!test_bit(slot, &page->slot_bitmap))
1282 continue;
1283
1284 pt = page->spt;
1285 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1286 /* avoid RMW */
1287 if (pt[i] & PT_WRITABLE_MASK) {
1288 rmap_remove(&pt[i]);
1289 pt[i] &= ~PT_WRITABLE_MASK;
1290 }
1291 }
1292}
1293
1294void kvm_mmu_zap_all(struct kvm *kvm)
1295{
1296 struct kvm_mmu_page *page, *node;
1297
1298 list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link)
1299 kvm_mmu_zap_page(kvm, page);
1300
1301 kvm_flush_remote_tlbs(kvm);
1302}
1303
1304void kvm_mmu_module_exit(void)
1305{
1306 if (pte_chain_cache)
1307 kmem_cache_destroy(pte_chain_cache);
1308 if (rmap_desc_cache)
1309 kmem_cache_destroy(rmap_desc_cache);
1310 if (mmu_page_header_cache)
1311 kmem_cache_destroy(mmu_page_header_cache);
1312}
1313
1314int kvm_mmu_module_init(void)
1315{
1316 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1317 sizeof(struct kvm_pte_chain),
1318 0, 0, NULL);
1319 if (!pte_chain_cache)
1320 goto nomem;
1321 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1322 sizeof(struct kvm_rmap_desc),
1323 0, 0, NULL);
1324 if (!rmap_desc_cache)
1325 goto nomem;
1326
1327 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1328 sizeof(struct kvm_mmu_page),
1329 0, 0, NULL);
1330 if (!mmu_page_header_cache)
1331 goto nomem;
1332
1333 return 0;
1334
1335nomem:
1336 kvm_mmu_module_exit();
1337 return -ENOMEM;
1338}
1339
1340#ifdef AUDIT
1341
1342static const char *audit_msg;
1343
1344static gva_t canonicalize(gva_t gva)
1345{
1346#ifdef CONFIG_X86_64
1347 gva = (long long)(gva << 16) >> 16;
1348#endif
1349 return gva;
1350}
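
/*
 * A worked example of the sign extension above, assuming 48-bit canonical
 * addresses on x86_64 (bit 47 is replicated into bits 48-63):
 *
 *   gva                      = 0x0000800000000000
 *   gva << 16                = 0x8000000000000000
 *   arithmetic shift >> 16   = 0xffff800000000000   (canonical form)
 */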
1351
1352static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1353 gva_t va, int level)
1354{
1355 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1356 int i;
1357 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1358
1359 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1360 u64 ent = pt[i];
1361
1362 if (!(ent & PT_PRESENT_MASK))
1363 continue;
1364
1365 va = canonicalize(va);
1366 if (level > 1)
1367 audit_mappings_page(vcpu, ent, va, level - 1);
1368 else {
1369 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
1370 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
1371
1372 if ((ent & PT_PRESENT_MASK)
1373 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1374 printk(KERN_ERR "audit error: (%s) levels %d"
1375 " gva %lx gpa %llx hpa %llx ent %llx\n",
1376 audit_msg, vcpu->mmu.root_level,
1377 va, gpa, hpa, ent);
1378 }
1379 }
1380}
1381
1382static void audit_mappings(struct kvm_vcpu *vcpu)
1383{
1384 unsigned i;
1385
1386 if (vcpu->mmu.root_level == 4)
1387 audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
1388 else
1389 for (i = 0; i < 4; ++i)
1390 if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
1391 audit_mappings_page(vcpu,
1392 vcpu->mmu.pae_root[i],
1393 i << 30,
1394 2);
1395}
1396
1397static int count_rmaps(struct kvm_vcpu *vcpu)
1398{
1399 int nmaps = 0;
1400 int i, j, k;
1401
1402 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1403 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1404 struct kvm_rmap_desc *d;
1405
1406 for (j = 0; j < m->npages; ++j) {
1407 struct page *page = m->phys_mem[j];
1408
1409 if (!page->private)
1410 continue;
1411 if (!(page->private & 1)) {
1412 ++nmaps;
1413 continue;
1414 }
1415 d = (struct kvm_rmap_desc *)(page->private & ~1ul);
1416 while (d) {
1417 for (k = 0; k < RMAP_EXT; ++k)
1418 if (d->shadow_ptes[k])
1419 ++nmaps;
1420 else
1421 break;
1422 d = d->more;
1423 }
1424 }
1425 }
1426 return nmaps;
1427}
1428
1429static int count_writable_mappings(struct kvm_vcpu *vcpu)
1430{
1431 int nmaps = 0;
1432 struct kvm_mmu_page *page;
1433 int i;
1434
1435 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1436 u64 *pt = page->spt;
1437
1438 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1439 continue;
1440
1441 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1442 u64 ent = pt[i];
1443
1444 if (!(ent & PT_PRESENT_MASK))
1445 continue;
1446 if (!(ent & PT_WRITABLE_MASK))
1447 continue;
1448 ++nmaps;
1449 }
1450 }
1451 return nmaps;
1452}
1453
1454static void audit_rmap(struct kvm_vcpu *vcpu)
1455{
1456 int n_rmap = count_rmaps(vcpu);
1457 int n_actual = count_writable_mappings(vcpu);
1458
1459 if (n_rmap != n_actual)
1460 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1461 __FUNCTION__, audit_msg, n_rmap, n_actual);
1462}
1463
1464static void audit_write_protection(struct kvm_vcpu *vcpu)
1465{
1466 struct kvm_mmu_page *page;
1467
1468 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1469 hfn_t hfn;
1470 struct page *pg;
1471
1472 if (page->role.metaphysical)
1473 continue;
1474
1475 hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
1476 >> PAGE_SHIFT;
1477 pg = pfn_to_page(hfn);
1478 if (pg->private)
1479 printk(KERN_ERR "%s: (%s) shadow page has writable"
1480 " mappings: gfn %lx role %x\n",
1481 __FUNCTION__, audit_msg, page->gfn,
1482 page->role.word);
1483 }
1484}
1485
1486static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1487{
1488 int olddbg = dbg;
1489
1490 dbg = 0;
1491 audit_msg = msg;
1492 audit_rmap(vcpu);
1493 audit_write_protection(vcpu);
1494 audit_mappings(vcpu);
1495 dbg = olddbg;
1496}
1497
1498#endif
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
deleted file mode 100644
index 6b094b44f8fb..000000000000
--- a/drivers/kvm/paging_tmpl.h
+++ /dev/null
@@ -1,511 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20/*
21 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
22 * so the code in this file is compiled twice, once per pte size.
23 */
24
25#if PTTYPE == 64
26 #define pt_element_t u64
27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #ifdef CONFIG_X86_64
35 #define PT_MAX_FULL_LEVELS 4
36 #else
37 #define PT_MAX_FULL_LEVELS 2
38 #endif
39#elif PTTYPE == 32
40 #define pt_element_t u32
41 #define guest_walker guest_walker32
42 #define FNAME(name) paging##32_##name
43 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
44 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
45 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
46 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
47 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
48 #define PT_MAX_FULL_LEVELS 2
49#else
50 #error Invalid PTTYPE value
51#endif
52
53/*
54 * The guest_walker structure emulates the behavior of the hardware page
55 * table walker.
56 */
57struct guest_walker {
58 int level;
59 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
60 pt_element_t *table;
61 pt_element_t pte;
62 pt_element_t *ptep;
63 struct page *page;
64 int index;
65 pt_element_t inherited_ar;
66 gfn_t gfn;
67 u32 error_code;
68};
69
70/*
71 * Fetch a guest pte for a guest virtual address
72 */
73static int FNAME(walk_addr)(struct guest_walker *walker,
74 struct kvm_vcpu *vcpu, gva_t addr,
75 int write_fault, int user_fault, int fetch_fault)
76{
77 hpa_t hpa;
78 struct kvm_memory_slot *slot;
79 pt_element_t *ptep;
80 pt_element_t root;
81 gfn_t table_gfn;
82
83 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
84 walker->level = vcpu->mmu.root_level;
85 walker->table = NULL;
86 walker->page = NULL;
87 walker->ptep = NULL;
88 root = vcpu->cr3;
89#if PTTYPE == 64
90 if (!is_long_mode(vcpu)) {
91 walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
92 root = *walker->ptep;
93 walker->pte = root;
94 if (!(root & PT_PRESENT_MASK))
95 goto not_present;
96 --walker->level;
97 }
98#endif
99 table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
100 walker->table_gfn[walker->level - 1] = table_gfn;
101 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
102 walker->level - 1, table_gfn);
103 slot = gfn_to_memslot(vcpu->kvm, table_gfn);
104 hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
105 walker->page = pfn_to_page(hpa >> PAGE_SHIFT);
106 walker->table = kmap_atomic(walker->page, KM_USER0);
107
108 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
109 (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
110
111 walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
112
113 for (;;) {
114 int index = PT_INDEX(addr, walker->level);
115 hpa_t paddr;
116
117 ptep = &walker->table[index];
118 walker->index = index;
119 ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
120 ((unsigned long)ptep & PAGE_MASK));
121
122 if (!is_present_pte(*ptep))
123 goto not_present;
124
125 if (write_fault && !is_writeble_pte(*ptep))
126 if (user_fault || is_write_protection(vcpu))
127 goto access_error;
128
129 if (user_fault && !(*ptep & PT_USER_MASK))
130 goto access_error;
131
132#if PTTYPE == 64
133 if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK))
134 goto access_error;
135#endif
136
137 if (!(*ptep & PT_ACCESSED_MASK)) {
138 mark_page_dirty(vcpu->kvm, table_gfn);
139 *ptep |= PT_ACCESSED_MASK;
140 }
141
142 if (walker->level == PT_PAGE_TABLE_LEVEL) {
143 walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
144 >> PAGE_SHIFT;
145 break;
146 }
147
148 if (walker->level == PT_DIRECTORY_LEVEL
149 && (*ptep & PT_PAGE_SIZE_MASK)
150 && (PTTYPE == 64 || is_pse(vcpu))) {
151 walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
152 >> PAGE_SHIFT;
153 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
154 break;
155 }
156
157 walker->inherited_ar &= walker->table[index];
158 table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
159 kunmap_atomic(walker->table, KM_USER0);
160 paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT);
161 walker->page = pfn_to_page(paddr >> PAGE_SHIFT);
162 walker->table = kmap_atomic(walker->page, KM_USER0);
163 --walker->level;
164 walker->table_gfn[walker->level - 1 ] = table_gfn;
165 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
166 walker->level - 1, table_gfn);
167 }
168 walker->pte = *ptep;
169 if (walker->page)
170 walker->ptep = NULL;
171 if (walker->table)
172 kunmap_atomic(walker->table, KM_USER0);
173 pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
174 return 1;
175
176not_present:
177 walker->error_code = 0;
178 goto err;
179
180access_error:
181 walker->error_code = PFERR_PRESENT_MASK;
182
183err:
184 if (write_fault)
185 walker->error_code |= PFERR_WRITE_MASK;
186 if (user_fault)
187 walker->error_code |= PFERR_USER_MASK;
188 if (fetch_fault)
189 walker->error_code |= PFERR_FETCH_MASK;
190 if (walker->table)
191 kunmap_atomic(walker->table, KM_USER0);
192 return 0;
193}
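
/*
 * A minimal sketch of how the walk above slices a virtual address, assuming
 * 64-bit 4-level paging (PTTYPE == 64) with 4KB pages: each level consumes
 * nine index bits starting at bit 12, which is what PT_INDEX() amounts to
 * for this configuration.  The helper name is illustrative only.
 */
static unsigned sketch_pt_index(u64 addr, int level)
{
	return (addr >> (12 + 9 * (level - 1))) & 0x1ff;
}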
194
195static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
196 struct guest_walker *walker)
197{
198 mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
199}
200
201static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
202 u64 *shadow_pte,
203 gpa_t gaddr,
204 pt_element_t gpte,
205 u64 access_bits,
206 int user_fault,
207 int write_fault,
208 int *ptwrite,
209 struct guest_walker *walker,
210 gfn_t gfn)
211{
212 hpa_t paddr;
213 int dirty = gpte & PT_DIRTY_MASK;
214 u64 spte = *shadow_pte;
215 int was_rmapped = is_rmap_pte(spte);
216
217 pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
218 " user_fault %d gfn %lx\n",
219 __FUNCTION__, spte, (u64)gpte, access_bits,
220 write_fault, user_fault, gfn);
221
222 if (write_fault && !dirty) {
223 pt_element_t *guest_ent, *tmp = NULL;
224
225 if (walker->ptep)
226 guest_ent = walker->ptep;
227 else {
228 tmp = kmap_atomic(walker->page, KM_USER0);
229 guest_ent = &tmp[walker->index];
230 }
231
232 *guest_ent |= PT_DIRTY_MASK;
233 if (!walker->ptep)
234 kunmap_atomic(tmp, KM_USER0);
235 dirty = 1;
236 FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
237 }
238
239 spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
240 spte |= gpte & PT64_NX_MASK;
241 if (!dirty)
242 access_bits &= ~PT_WRITABLE_MASK;
243
244 paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
245
246 spte |= PT_PRESENT_MASK;
247 if (access_bits & PT_USER_MASK)
248 spte |= PT_USER_MASK;
249
250 if (is_error_hpa(paddr)) {
251 spte |= gaddr;
252 spte |= PT_SHADOW_IO_MARK;
253 spte &= ~PT_PRESENT_MASK;
254 set_shadow_pte(shadow_pte, spte);
255 return;
256 }
257
258 spte |= paddr;
259
260 if ((access_bits & PT_WRITABLE_MASK)
261 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
262 struct kvm_mmu_page *shadow;
263
264 spte |= PT_WRITABLE_MASK;
265 if (user_fault) {
266 mmu_unshadow(vcpu, gfn);
267 goto unshadowed;
268 }
269
270 shadow = kvm_mmu_lookup_page(vcpu, gfn);
271 if (shadow) {
272 pgprintk("%s: found shadow page for %lx, marking ro\n",
273 __FUNCTION__, gfn);
274 access_bits &= ~PT_WRITABLE_MASK;
275 if (is_writeble_pte(spte)) {
276 spte &= ~PT_WRITABLE_MASK;
277 kvm_x86_ops->tlb_flush(vcpu);
278 }
279 if (write_fault)
280 *ptwrite = 1;
281 }
282 }
283
284unshadowed:
285
286 if (access_bits & PT_WRITABLE_MASK)
287 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
288
289 set_shadow_pte(shadow_pte, spte);
290 page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
291 if (!was_rmapped)
292 rmap_add(vcpu, shadow_pte);
293}
294
295static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
296 u64 *shadow_pte, u64 access_bits,
297 int user_fault, int write_fault, int *ptwrite,
298 struct guest_walker *walker, gfn_t gfn)
299{
300 access_bits &= gpte;
301 FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK,
302 gpte, access_bits, user_fault, write_fault,
303 ptwrite, walker, gfn);
304}
305
306static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
307 u64 *spte, const void *pte, int bytes)
308{
309 pt_element_t gpte;
310
311 if (bytes < sizeof(pt_element_t))
312 return;
313 gpte = *(const pt_element_t *)pte;
314 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
315 return;
316 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
317 FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
318 0, NULL, NULL,
319 (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
320}
321
322static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
323 u64 *shadow_pte, u64 access_bits,
324 int user_fault, int write_fault, int *ptwrite,
325 struct guest_walker *walker, gfn_t gfn)
326{
327 gpa_t gaddr;
328
329 access_bits &= gpde;
330 gaddr = (gpa_t)gfn << PAGE_SHIFT;
331 if (PTTYPE == 32 && is_cpuid_PSE36())
332 gaddr |= (gpde & PT32_DIR_PSE36_MASK) <<
333 (32 - PT32_DIR_PSE36_SHIFT);
334 FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
335 gpde, access_bits, user_fault, write_fault,
336 ptwrite, walker, gfn);
337}
338
339/*
340 * Fetch a shadow pte for a specific level in the paging hierarchy.
341 */
342static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
343 struct guest_walker *walker,
344 int user_fault, int write_fault, int *ptwrite)
345{
346 hpa_t shadow_addr;
347 int level;
348 u64 *shadow_ent;
349 u64 *prev_shadow_ent = NULL;
350
351 if (!is_present_pte(walker->pte))
352 return NULL;
353
354 shadow_addr = vcpu->mmu.root_hpa;
355 level = vcpu->mmu.shadow_root_level;
356 if (level == PT32E_ROOT_LEVEL) {
357 shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
358 shadow_addr &= PT64_BASE_ADDR_MASK;
359 --level;
360 }
361
362 for (; ; level--) {
363 u32 index = SHADOW_PT_INDEX(addr, level);
364 struct kvm_mmu_page *shadow_page;
365 u64 shadow_pte;
366 int metaphysical;
367 gfn_t table_gfn;
368 unsigned hugepage_access = 0;
369
370 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
371 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
372 if (level == PT_PAGE_TABLE_LEVEL)
373 break;
374 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
375 prev_shadow_ent = shadow_ent;
376 continue;
377 }
378
379 if (level == PT_PAGE_TABLE_LEVEL)
380 break;
381
382 if (level - 1 == PT_PAGE_TABLE_LEVEL
383 && walker->level == PT_DIRECTORY_LEVEL) {
384 metaphysical = 1;
385 hugepage_access = walker->pte;
386 hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
387 if (walker->pte & PT64_NX_MASK)
388 hugepage_access |= (1 << 2);
389 hugepage_access >>= PT_WRITABLE_SHIFT;
390 table_gfn = (walker->pte & PT_BASE_ADDR_MASK)
391 >> PAGE_SHIFT;
392 } else {
393 metaphysical = 0;
394 table_gfn = walker->table_gfn[level - 2];
395 }
396 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
397 metaphysical, hugepage_access,
398 shadow_ent);
399 shadow_addr = __pa(shadow_page->spt);
400 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
401 | PT_WRITABLE_MASK | PT_USER_MASK;
402 *shadow_ent = shadow_pte;
403 prev_shadow_ent = shadow_ent;
404 }
405
406 if (walker->level == PT_DIRECTORY_LEVEL) {
407 FNAME(set_pde)(vcpu, walker->pte, shadow_ent,
408 walker->inherited_ar, user_fault, write_fault,
409 ptwrite, walker, walker->gfn);
410 } else {
411 ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
412 FNAME(set_pte)(vcpu, walker->pte, shadow_ent,
413 walker->inherited_ar, user_fault, write_fault,
414 ptwrite, walker, walker->gfn);
415 }
416 return shadow_ent;
417}
418
419/*
420 * Page fault handler. There are several causes for a page fault:
421 * - there is no shadow pte for the guest pte
422 * - write access through a shadow pte marked read only so that we can set
423 * the dirty bit
424 * - write access to a shadow pte marked read only so we can update the page
425 * dirty bitmap when userspace requests it
426 * - mmio access; in this case we will never install a present shadow pte
427 * - normal guest page fault due to the guest pte marked not present, not
428 * writable, or not executable
429 *
430 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
431 * a negative value on error.
432 */
433static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
434 u32 error_code)
435{
436 int write_fault = error_code & PFERR_WRITE_MASK;
437 int user_fault = error_code & PFERR_USER_MASK;
438 int fetch_fault = error_code & PFERR_FETCH_MASK;
439 struct guest_walker walker;
440 u64 *shadow_pte;
441 int write_pt = 0;
442 int r;
443
444 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
445 kvm_mmu_audit(vcpu, "pre page fault");
446
447 r = mmu_topup_memory_caches(vcpu);
448 if (r)
449 return r;
450
451 /*
452 * Look up the shadow pte for the faulting address.
453 */
454 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
455 fetch_fault);
456
457 /*
458 * The page is not mapped by the guest. Let the guest handle it.
459 */
460 if (!r) {
461 pgprintk("%s: guest page fault\n", __FUNCTION__);
462 inject_page_fault(vcpu, addr, walker.error_code);
463 vcpu->last_pt_write_count = 0; /* reset fork detector */
464 return 0;
465 }
466
467 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
468 &write_pt);
469 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
470 shadow_pte, *shadow_pte, write_pt);
471
472 if (!write_pt)
473 vcpu->last_pt_write_count = 0; /* reset fork detector */
474
475 /*
476 * mmio: emulate if accessible, otherwise it's a guest fault.
477 */
478 if (is_io_pte(*shadow_pte))
479 return 1;
480
481 ++vcpu->stat.pf_fixed;
482 kvm_mmu_audit(vcpu, "post page fault (fixed)");
483
484 return write_pt;
485}
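
/*
 * A minimal sketch (hypothetical caller, not taken from this file) of how
 * the return value documented above is meant to be consumed: positive means
 * fall back to the instruction emulator, zero means the fault was fixed up
 * or injected into the guest, negative is an internal error such as -ENOMEM.
 */
static int sketch_handle_fault_result(int r)
{
	if (r < 0)
		return r;	/* propagate the error */
	if (r == 0)
		return 0;	/* nothing more to do */
	return 1;		/* emulate the faulting instruction */
}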
486
487static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
488{
489 struct guest_walker walker;
490 gpa_t gpa = UNMAPPED_GVA;
491 int r;
492
493 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
494
495 if (r) {
496 gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
497 gpa |= vaddr & ~PAGE_MASK;
498 }
499
500 return gpa;
501}
502
503#undef pt_element_t
504#undef guest_walker
505#undef FNAME
506#undef PT_BASE_ADDR_MASK
507#undef PT_INDEX
508#undef SHADOW_PT_INDEX
509#undef PT_LEVEL_MASK
510#undef PT_DIR_BASE_ADDR_MASK
511#undef PT_MAX_FULL_LEVELS
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
deleted file mode 100644
index bd46de6bf891..000000000000
--- a/drivers/kvm/x86_emulate.c
+++ /dev/null
@@ -1,1662 +0,0 @@
1/******************************************************************************
2 * x86_emulate.c
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privileged instructions:
10 *
11 * Copyright (C) 2006 Qumranet
12 *
13 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
20 */
21
22#ifndef __KERNEL__
23#include <stdio.h>
24#include <stdint.h>
25#include <public/xen.h>
26#define DPRINTF(_f, _a ...) printf( _f , ## _a )
27#else
28#include "kvm.h"
29#define DPRINTF(x...) do {} while (0)
30#endif
31#include "x86_emulate.h"
32#include <linux/module.h>
33
34/*
35 * Opcode effective-address decode tables.
36 * Note that we only emulate instructions that have at least one memory
37 * operand (excluding implicit stack references). We assume that stack
38 * references and instruction fetches will never occur in special memory
39 * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
40 * not be handled.
41 */
42
43/* Operand sizes: 8-bit operands or specified/overridden size. */
44#define ByteOp (1<<0) /* 8-bit operands. */
45/* Destination operand type. */
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1)
50/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3)
60/* Generic ModRM decode. */
61#define ModRM (1<<6)
62/* Destination is only written; never read. */
63#define Mov (1<<7)
64#define BitOp (1<<8)
65
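/*
 * A small sketch of how the flag bits above are meant to be unpacked once
 * d = opcode_table[b] has been fetched (the helper is illustrative only).
 * For example, opcode 0x00 (add r/m8, r8) is ByteOp | DstMem | SrcReg |
 * ModRM, so all three tests below are true for it.
 */
static int sketch_decodes_like_add(unsigned d)
{
	return (d & DstMask) == DstMem &&
	       (d & SrcMask) == SrcReg &&
	       (d & ModRM);
}
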
66static u8 opcode_table[256] = {
67 /* 0x00 - 0x07 */
68 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
69 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
70 0, 0, 0, 0,
71 /* 0x08 - 0x0F */
72 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
73 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
74 0, 0, 0, 0,
75 /* 0x10 - 0x17 */
76 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
77 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
78 0, 0, 0, 0,
79 /* 0x18 - 0x1F */
80 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
81 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
82 0, 0, 0, 0,
83 /* 0x20 - 0x27 */
84 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
85 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
86 SrcImmByte, SrcImm, 0, 0,
87 /* 0x28 - 0x2F */
88 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
89 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
90 0, 0, 0, 0,
91 /* 0x30 - 0x37 */
92 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
93 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
94 0, 0, 0, 0,
95 /* 0x38 - 0x3F */
96 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
97 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
98 0, 0, 0, 0,
99 /* 0x40 - 0x4F */
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 /* 0x50 - 0x57 */
102 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
103 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
104 /* 0x58 - 0x5F */
105 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
106 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
107 /* 0x60 - 0x67 */
108 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
109 0, 0, 0, 0,
110 /* 0x68 - 0x6F */
111 0, 0, ImplicitOps|Mov, 0,
112 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
113 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
114 /* 0x70 - 0x77 */
115 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
116 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
117 /* 0x78 - 0x7F */
118 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
119 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
120 /* 0x80 - 0x87 */
121 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
122 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
123 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
124 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
125 /* 0x88 - 0x8F */
126 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
127 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
128 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
129 /* 0x90 - 0x9F */
130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
131 /* 0xA0 - 0xA7 */
132 ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
133 ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
134 ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
135 ByteOp | ImplicitOps, ImplicitOps,
136 /* 0xA8 - 0xAF */
137 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
138 ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
139 ByteOp | ImplicitOps, ImplicitOps,
140 /* 0xB0 - 0xBF */
141 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
142 /* 0xC0 - 0xC7 */
143 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
144 0, ImplicitOps, 0, 0,
145 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
146 /* 0xC8 - 0xCF */
147 0, 0, 0, 0, 0, 0, 0, 0,
148 /* 0xD0 - 0xD7 */
149 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
150 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
151 0, 0, 0, 0,
152 /* 0xD8 - 0xDF */
153 0, 0, 0, 0, 0, 0, 0, 0,
154 /* 0xE0 - 0xE7 */
155 0, 0, 0, 0, 0, 0, 0, 0,
156 /* 0xE8 - 0xEF */
157 ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
158 /* 0xF0 - 0xF7 */
159 0, 0, 0, 0,
160 ImplicitOps, 0,
161 ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
162 /* 0xF8 - 0xFF */
163 0, 0, 0, 0,
164 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
165};
166
167static u16 twobyte_table[256] = {
168 /* 0x00 - 0x0F */
169 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
170 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
171 /* 0x10 - 0x1F */
172 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
173 /* 0x20 - 0x2F */
174 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0,
176 /* 0x30 - 0x3F */
177 ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178 /* 0x40 - 0x47 */
179 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
180 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
181 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
182 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
183 /* 0x48 - 0x4F */
184 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
185 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
186 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
187 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
188 /* 0x50 - 0x5F */
189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 /* 0x60 - 0x6F */
191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
192 /* 0x70 - 0x7F */
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 /* 0x80 - 0x8F */
195 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
196 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
197 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
198 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
199 /* 0x90 - 0x9F */
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 0xA0 - 0xA7 */
202 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
203 /* 0xA8 - 0xAF */
204 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
205 /* 0xB0 - 0xB7 */
206 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
207 DstMem | SrcReg | ModRM | BitOp,
208 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
209 DstReg | SrcMem16 | ModRM | Mov,
210 /* 0xB8 - 0xBF */
211 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
212 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
213 DstReg | SrcMem16 | ModRM | Mov,
214 /* 0xC0 - 0xCF */
215 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
216 0, 0, 0, 0, 0, 0, 0, 0,
217 /* 0xD0 - 0xDF */
218 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
219 /* 0xE0 - 0xEF */
220 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
221 /* 0xF0 - 0xFF */
222 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
223};
224
225/* Type, address-of, and value of an instruction's operand. */
226struct operand {
227 enum { OP_REG, OP_MEM, OP_IMM } type;
228 unsigned int bytes;
229 unsigned long val, orig_val, *ptr;
230};
231
232/* EFLAGS bit definitions. */
233#define EFLG_OF (1<<11)
234#define EFLG_DF (1<<10)
235#define EFLG_SF (1<<7)
236#define EFLG_ZF (1<<6)
237#define EFLG_AF (1<<4)
238#define EFLG_PF (1<<2)
239#define EFLG_CF (1<<0)
240
241/*
242 * Instruction emulation:
243 * Most instructions are emulated directly via a fragment of inline assembly
244 * code. This allows us to save/restore EFLAGS and thus very easily pick up
245 * any modified flags.
246 */
247
248#if defined(CONFIG_X86_64)
249#define _LO32 "k" /* force 32-bit operand */
250#define _STK "%%rsp" /* stack pointer */
251#elif defined(__i386__)
252#define _LO32 "" /* force 32-bit operand */
253#define _STK "%%esp" /* stack pointer */
254#endif
255
256/*
257 * These EFLAGS bits are restored from saved value during emulation, and
258 * any changes are written back to the saved value after emulation.
259 */
260#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
261
262/* Before executing instruction: restore necessary bits in EFLAGS. */
263#define _PRE_EFLAGS(_sav, _msk, _tmp) \
264 /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \
265 "push %"_sav"; " \
266 "movl %"_msk",%"_LO32 _tmp"; " \
267 "andl %"_LO32 _tmp",("_STK"); " \
268 "pushf; " \
269 "notl %"_LO32 _tmp"; " \
270 "andl %"_LO32 _tmp",("_STK"); " \
271 "pop %"_tmp"; " \
272 "orl %"_LO32 _tmp",("_STK"); " \
273 "popf; " \
274 /* _sav &= ~msk; */ \
275 "movl %"_msk",%"_LO32 _tmp"; " \
276 "notl %"_LO32 _tmp"; " \
277 "andl %"_LO32 _tmp",%"_sav"; "
278
279/* After executing instruction: write-back necessary bits in EFLAGS. */
280#define _POST_EFLAGS(_sav, _msk, _tmp) \
281 /* _sav |= EFLAGS & _msk; */ \
282 "pushf; " \
283 "pop %"_tmp"; " \
284 "andl %"_msk",%"_LO32 _tmp"; " \
285 "orl %"_LO32 _tmp",%"_sav"; "
286
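/*
 * A C-level restatement of what the _PRE_EFLAGS/_POST_EFLAGS fragments above
 * accomplish, as a sketch only: the real code has to do this in assembly so
 * that the masked bits actually sit in the hardware EFLAGS register around
 * the emulated instruction.
 */
static unsigned long sketch_merge_eflags(unsigned long saved, unsigned long hw)
{
	/* keep the guest's non-arithmetic bits, take the arithmetic bits
	 * produced by the emulated instruction */
	return (saved & ~EFLAGS_MASK) | (hw & EFLAGS_MASK);
}
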
287/* Raw emulation: instruction has two explicit operands. */
288#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
289 do { \
290 unsigned long _tmp; \
291 \
292 switch ((_dst).bytes) { \
293 case 2: \
294 __asm__ __volatile__ ( \
295 _PRE_EFLAGS("0","4","2") \
296 _op"w %"_wx"3,%1; " \
297 _POST_EFLAGS("0","4","2") \
298 : "=m" (_eflags), "=m" ((_dst).val), \
299 "=&r" (_tmp) \
300 : _wy ((_src).val), "i" (EFLAGS_MASK) ); \
301 break; \
302 case 4: \
303 __asm__ __volatile__ ( \
304 _PRE_EFLAGS("0","4","2") \
305 _op"l %"_lx"3,%1; " \
306 _POST_EFLAGS("0","4","2") \
307 : "=m" (_eflags), "=m" ((_dst).val), \
308 "=&r" (_tmp) \
309 : _ly ((_src).val), "i" (EFLAGS_MASK) ); \
310 break; \
311 case 8: \
312 __emulate_2op_8byte(_op, _src, _dst, \
313 _eflags, _qx, _qy); \
314 break; \
315 } \
316 } while (0)
317
318#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
319 do { \
320 unsigned long _tmp; \
321 switch ( (_dst).bytes ) \
322 { \
323 case 1: \
324 __asm__ __volatile__ ( \
325 _PRE_EFLAGS("0","4","2") \
326 _op"b %"_bx"3,%1; " \
327 _POST_EFLAGS("0","4","2") \
328 : "=m" (_eflags), "=m" ((_dst).val), \
329 "=&r" (_tmp) \
330 : _by ((_src).val), "i" (EFLAGS_MASK) ); \
331 break; \
332 default: \
333 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
334 _wx, _wy, _lx, _ly, _qx, _qy); \
335 break; \
336 } \
337 } while (0)
338
339/* Source operand is byte-sized and may be restricted to just %cl. */
340#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
341 __emulate_2op(_op, _src, _dst, _eflags, \
342 "b", "c", "b", "c", "b", "c", "b", "c")
343
344/* Source operand is byte, word, long or quad sized. */
345#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
346 __emulate_2op(_op, _src, _dst, _eflags, \
347 "b", "q", "w", "r", _LO32, "r", "", "r")
348
349/* Source operand is word, long or quad sized. */
350#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
351 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
352 "w", "r", _LO32, "r", "", "r")
353
354/* Instruction has only one explicit operand (no source operand). */
355#define emulate_1op(_op, _dst, _eflags) \
356 do { \
357 unsigned long _tmp; \
358 \
359 switch ( (_dst).bytes ) \
360 { \
361 case 1: \
362 __asm__ __volatile__ ( \
363 _PRE_EFLAGS("0","3","2") \
364 _op"b %1; " \
365 _POST_EFLAGS("0","3","2") \
366 : "=m" (_eflags), "=m" ((_dst).val), \
367 "=&r" (_tmp) \
368 : "i" (EFLAGS_MASK) ); \
369 break; \
370 case 2: \
371 __asm__ __volatile__ ( \
372 _PRE_EFLAGS("0","3","2") \
373 _op"w %1; " \
374 _POST_EFLAGS("0","3","2") \
375 : "=m" (_eflags), "=m" ((_dst).val), \
376 "=&r" (_tmp) \
377 : "i" (EFLAGS_MASK) ); \
378 break; \
379 case 4: \
380 __asm__ __volatile__ ( \
381 _PRE_EFLAGS("0","3","2") \
382 _op"l %1; " \
383 _POST_EFLAGS("0","3","2") \
384 : "=m" (_eflags), "=m" ((_dst).val), \
385 "=&r" (_tmp) \
386 : "i" (EFLAGS_MASK) ); \
387 break; \
388 case 8: \
389 __emulate_1op_8byte(_op, _dst, _eflags); \
390 break; \
391 } \
392 } while (0)
393
394/* Emulate an instruction with quadword operands (x86/64 only). */
395#if defined(CONFIG_X86_64)
396#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
397 do { \
398 __asm__ __volatile__ ( \
399 _PRE_EFLAGS("0","4","2") \
400 _op"q %"_qx"3,%1; " \
401 _POST_EFLAGS("0","4","2") \
402 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
403 : _qy ((_src).val), "i" (EFLAGS_MASK) ); \
404 } while (0)
405
406#define __emulate_1op_8byte(_op, _dst, _eflags) \
407 do { \
408 __asm__ __volatile__ ( \
409 _PRE_EFLAGS("0","3","2") \
410 _op"q %1; " \
411 _POST_EFLAGS("0","3","2") \
412 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
413 : "i" (EFLAGS_MASK) ); \
414 } while (0)
415
416#elif defined(__i386__)
417#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
418#define __emulate_1op_8byte(_op, _dst, _eflags)
419#endif /* __i386__ */
420
421/* Fetch next part of the instruction being emulated. */
422#define insn_fetch(_type, _size, _eip) \
423({ unsigned long _x; \
424 rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \
425 (_size), ctxt->vcpu); \
426 if ( rc != 0 ) \
427 goto done; \
428 (_eip) += (_size); \
429 (_type)_x; \
430})
431
432/* Access/update address held in a register, based on addressing mode. */
433#define address_mask(reg) \
434 ((ad_bytes == sizeof(unsigned long)) ? \
435 (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1)))
436#define register_address(base, reg) \
437 ((base) + address_mask(reg))
438#define register_address_increment(reg, inc) \
439 do { \
440 /* signed type ensures sign extension to long */ \
441 int _inc = (inc); \
442 if ( ad_bytes == sizeof(unsigned long) ) \
443 (reg) += _inc; \
444 else \
445 (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \
446 (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
447 } while (0)
448
449#define JMP_REL(rel) \
450 do { \
451 register_address_increment(_eip, rel); \
452 } while (0)
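
/*
 * A worked example of the wrap handled by address_mask() and
 * register_address_increment() above, assuming ad_bytes == 2 (16-bit
 * addressing): the low 16 bits wrap while the rest of the register is
 * preserved.  Illustrative helper only.
 */
static unsigned long sketch_register_wrap(unsigned long reg, int inc,
					  unsigned ad_bytes)
{
	unsigned long mask = (ad_bytes == sizeof(unsigned long)) ?
				~0ul : (1ul << (ad_bytes << 3)) - 1;

	/* e.g. reg = 0x1234ffff, inc = 1, ad_bytes = 2 -> 0x12340000 */
	return (reg & ~mask) | ((reg + inc) & mask);
}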
453
454/*
455 * Given the 'reg' portion of a ModRM byte, and a register block, return a
456 * pointer into the block that addresses the relevant register.
457 * @highbyte_regs specifies whether to decode AH, CH, DH, BH.
458 */
459static void *decode_register(u8 modrm_reg, unsigned long *regs,
460 int highbyte_regs)
461{
462 void *p;
463
464 p = &regs[modrm_reg];
465 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
466 p = (unsigned char *)&regs[modrm_reg & 3] + 1;
467 return p;
468}
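
/*
 * A brief example of the high-byte case above (illustrative, and assuming
 * the RAX slot is element 0 of the register block, with x86's little-endian
 * layout): with highbyte_regs set, modrm_reg values 4..7 select AH/CH/DH/BH,
 * i.e. byte 1 of the RAX/RCX/RDX/RBX slots instead of the SP/BP/SI/DI slots.
 */
static u8 sketch_read_ah(unsigned long *regs)
{
	return *(u8 *)decode_register(4, regs, 1);
}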
469
470static int read_descriptor(struct x86_emulate_ctxt *ctxt,
471 struct x86_emulate_ops *ops,
472 void *ptr,
473 u16 *size, unsigned long *address, int op_bytes)
474{
475 int rc;
476
477 if (op_bytes == 2)
478 op_bytes = 3;
479 *address = 0;
480 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
481 ctxt->vcpu);
482 if (rc)
483 return rc;
484 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
485 ctxt->vcpu);
486 return rc;
487}
488
489static int test_cc(unsigned int condition, unsigned int flags)
490{
491 int rc = 0;
492
493 switch ((condition & 15) >> 1) {
494 case 0: /* o */
495 rc |= (flags & EFLG_OF);
496 break;
497 case 1: /* b/c/nae */
498 rc |= (flags & EFLG_CF);
499 break;
500 case 2: /* z/e */
501 rc |= (flags & EFLG_ZF);
502 break;
503 case 3: /* be/na */
504 rc |= (flags & (EFLG_CF|EFLG_ZF));
505 break;
506 case 4: /* s */
507 rc |= (flags & EFLG_SF);
508 break;
509 case 5: /* p/pe */
510 rc |= (flags & EFLG_PF);
511 break;
512 case 7: /* le/ng */
513 rc |= (flags & EFLG_ZF);
514 /* fall through */
515 case 6: /* l/nge */
516 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
517 break;
518 }
519
520 /* Odd condition identifiers (lsb == 1) have inverted sense. */
521 return (!!rc ^ (condition & 1));
522}
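
/*
 * A short example of the mapping above: the jcc handler further down passes
 * the opcode byte itself, so for a short je/jz (opcode 0x74) we get
 * (0x74 & 15) >> 1 == 2, i.e. the ZF test, and the clear low bit leaves the
 * sense uninverted.  Hypothetical helper for illustration.
 */
static int sketch_je_taken(unsigned int flags)
{
	return test_cc(0x74, flags);	/* non-zero exactly when ZF is set */
}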
523
524int
525x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
526{
527 unsigned d;
528 u8 b, sib, twobyte = 0, rex_prefix = 0;
529 u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
530 unsigned long *override_base = NULL;
531 unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
532 int rc = 0;
533 struct operand src, dst;
534 unsigned long cr2 = ctxt->cr2;
535 int mode = ctxt->mode;
536 unsigned long modrm_ea;
537 int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
538 int no_wb = 0;
539 u64 msr_data;
540
541 /* Shadow copy of register state. Committed on successful emulation. */
542 unsigned long _regs[NR_VCPU_REGS];
543 unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
544 unsigned long modrm_val = 0;
545
546 memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
547
548 switch (mode) {
549 case X86EMUL_MODE_REAL:
550 case X86EMUL_MODE_PROT16:
551 op_bytes = ad_bytes = 2;
552 break;
553 case X86EMUL_MODE_PROT32:
554 op_bytes = ad_bytes = 4;
555 break;
556#ifdef CONFIG_X86_64
557 case X86EMUL_MODE_PROT64:
558 op_bytes = 4;
559 ad_bytes = 8;
560 break;
561#endif
562 default:
563 return -1;
564 }
565
566 /* Legacy prefixes. */
567 for (i = 0; i < 8; i++) {
568 switch (b = insn_fetch(u8, 1, _eip)) {
569 case 0x66: /* operand-size override */
570 op_bytes ^= 6; /* switch between 2/4 bytes */
571 break;
572 case 0x67: /* address-size override */
573 if (mode == X86EMUL_MODE_PROT64)
574 ad_bytes ^= 12; /* switch between 4/8 bytes */
575 else
576 ad_bytes ^= 6; /* switch between 2/4 bytes */
577 break;
578 case 0x2e: /* CS override */
579 override_base = &ctxt->cs_base;
580 break;
581 case 0x3e: /* DS override */
582 override_base = &ctxt->ds_base;
583 break;
584 case 0x26: /* ES override */
585 override_base = &ctxt->es_base;
586 break;
587 case 0x64: /* FS override */
588 override_base = &ctxt->fs_base;
589 break;
590 case 0x65: /* GS override */
591 override_base = &ctxt->gs_base;
592 break;
593 case 0x36: /* SS override */
594 override_base = &ctxt->ss_base;
595 break;
596 case 0xf0: /* LOCK */
597 lock_prefix = 1;
598 break;
599 case 0xf2: /* REPNE/REPNZ */
600 case 0xf3: /* REP/REPE/REPZ */
601 rep_prefix = 1;
602 break;
603 default:
604 goto done_prefixes;
605 }
606 }
607
608done_prefixes:
609
610 /* REX prefix. */
611 if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) {
612 rex_prefix = b;
613 if (b & 8)
614 op_bytes = 8; /* REX.W */
615 modrm_reg = (b & 4) << 1; /* REX.R */
616 index_reg = (b & 2) << 2; /* REX.X */
617 modrm_rm = base_reg = (b & 1) << 3; /* REX.B */
618 b = insn_fetch(u8, 1, _eip);
619 }
620
621 /* Opcode byte(s). */
622 d = opcode_table[b];
623 if (d == 0) {
624 /* Two-byte opcode? */
625 if (b == 0x0f) {
626 twobyte = 1;
627 b = insn_fetch(u8, 1, _eip);
628 d = twobyte_table[b];
629 }
630
631 /* Unrecognised? */
632 if (d == 0)
633 goto cannot_emulate;
634 }
635
636 /* ModRM and SIB bytes. */
637 if (d & ModRM) {
638 modrm = insn_fetch(u8, 1, _eip);
639 modrm_mod |= (modrm & 0xc0) >> 6;
640 modrm_reg |= (modrm & 0x38) >> 3;
641 modrm_rm |= (modrm & 0x07);
642 modrm_ea = 0;
643 use_modrm_ea = 1;
644
645 if (modrm_mod == 3) {
646 modrm_val = *(unsigned long *)
647 decode_register(modrm_rm, _regs, d & ByteOp);
648 goto modrm_done;
649 }
650
651 if (ad_bytes == 2) {
652 unsigned bx = _regs[VCPU_REGS_RBX];
653 unsigned bp = _regs[VCPU_REGS_RBP];
654 unsigned si = _regs[VCPU_REGS_RSI];
655 unsigned di = _regs[VCPU_REGS_RDI];
656
657 /* 16-bit ModR/M decode. */
658 switch (modrm_mod) {
659 case 0:
660 if (modrm_rm == 6)
661 modrm_ea += insn_fetch(u16, 2, _eip);
662 break;
663 case 1:
664 modrm_ea += insn_fetch(s8, 1, _eip);
665 break;
666 case 2:
667 modrm_ea += insn_fetch(u16, 2, _eip);
668 break;
669 }
670 switch (modrm_rm) {
671 case 0:
672 modrm_ea += bx + si;
673 break;
674 case 1:
675 modrm_ea += bx + di;
676 break;
677 case 2:
678 modrm_ea += bp + si;
679 break;
680 case 3:
681 modrm_ea += bp + di;
682 break;
683 case 4:
684 modrm_ea += si;
685 break;
686 case 5:
687 modrm_ea += di;
688 break;
689 case 6:
690 if (modrm_mod != 0)
691 modrm_ea += bp;
692 break;
693 case 7:
694 modrm_ea += bx;
695 break;
696 }
697 if (modrm_rm == 2 || modrm_rm == 3 ||
698 (modrm_rm == 6 && modrm_mod != 0))
699 if (!override_base)
700 override_base = &ctxt->ss_base;
701 modrm_ea = (u16)modrm_ea;
702 } else {
703 /* 32/64-bit ModR/M decode. */
704 switch (modrm_rm) {
705 case 4:
706 case 12:
707 sib = insn_fetch(u8, 1, _eip);
708 index_reg |= (sib >> 3) & 7;
709 base_reg |= sib & 7;
710 scale = sib >> 6;
711
712 switch (base_reg) {
713 case 5:
714 if (modrm_mod != 0)
715 modrm_ea += _regs[base_reg];
716 else
717 modrm_ea += insn_fetch(s32, 4, _eip);
718 break;
719 default:
720 modrm_ea += _regs[base_reg];
721 }
722 switch (index_reg) {
723 case 4:
724 break;
725 default:
726 modrm_ea += _regs[index_reg] << scale;
727
728 }
729 break;
730 case 5:
731 if (modrm_mod != 0)
732 modrm_ea += _regs[modrm_rm];
733 else if (mode == X86EMUL_MODE_PROT64)
734 rip_relative = 1;
735 break;
736 default:
737 modrm_ea += _regs[modrm_rm];
738 break;
739 }
740 switch (modrm_mod) {
741 case 0:
742 if (modrm_rm == 5)
743 modrm_ea += insn_fetch(s32, 4, _eip);
744 break;
745 case 1:
746 modrm_ea += insn_fetch(s8, 1, _eip);
747 break;
748 case 2:
749 modrm_ea += insn_fetch(s32, 4, _eip);
750 break;
751 }
752 }
753 if (!override_base)
754 override_base = &ctxt->ds_base;
755 if (mode == X86EMUL_MODE_PROT64 &&
756 override_base != &ctxt->fs_base &&
757 override_base != &ctxt->gs_base)
758 override_base = NULL;
759
760 if (override_base)
761 modrm_ea += *override_base;
762
763 if (rip_relative) {
764 modrm_ea += _eip;
765 switch (d & SrcMask) {
766 case SrcImmByte:
767 modrm_ea += 1;
768 break;
769 case SrcImm:
770 if (d & ByteOp)
771 modrm_ea += 1;
772 else
773 if (op_bytes == 8)
774 modrm_ea += 4;
775 else
776 modrm_ea += op_bytes;
777 }
778 }
779 if (ad_bytes != 8)
780 modrm_ea = (u32)modrm_ea;
781 cr2 = modrm_ea;
782 modrm_done:
783 ;
784 }
785
786 /*
787 * Decode and fetch the source operand: register, memory
788 * or immediate.
789 */
790 switch (d & SrcMask) {
791 case SrcNone:
792 break;
793 case SrcReg:
794 src.type = OP_REG;
795 if (d & ByteOp) {
796 src.ptr = decode_register(modrm_reg, _regs,
797 (rex_prefix == 0));
798 src.val = src.orig_val = *(u8 *) src.ptr;
799 src.bytes = 1;
800 } else {
801 src.ptr = decode_register(modrm_reg, _regs, 0);
802 switch ((src.bytes = op_bytes)) {
803 case 2:
804 src.val = src.orig_val = *(u16 *) src.ptr;
805 break;
806 case 4:
807 src.val = src.orig_val = *(u32 *) src.ptr;
808 break;
809 case 8:
810 src.val = src.orig_val = *(u64 *) src.ptr;
811 break;
812 }
813 }
814 break;
815 case SrcMem16:
816 src.bytes = 2;
817 goto srcmem_common;
818 case SrcMem32:
819 src.bytes = 4;
820 goto srcmem_common;
821 case SrcMem:
822 src.bytes = (d & ByteOp) ? 1 : op_bytes;
823 /* Don't fetch the address for invlpg: it could be unmapped. */
824 if (twobyte && b == 0x01 && modrm_reg == 7)
825 break;
826 srcmem_common:
827 /*
828 * For instructions with a ModR/M byte, switch to register
829 * access if Mod = 3.
830 */
831 if ((d & ModRM) && modrm_mod == 3) {
832 src.type = OP_REG;
833 break;
834 }
835 src.type = OP_MEM;
836 src.ptr = (unsigned long *)cr2;
837 src.val = 0;
838 if ((rc = ops->read_emulated((unsigned long)src.ptr,
839 &src.val, src.bytes, ctxt->vcpu)) != 0)
840 goto done;
841 src.orig_val = src.val;
842 break;
843 case SrcImm:
844 src.type = OP_IMM;
845 src.ptr = (unsigned long *)_eip;
846 src.bytes = (d & ByteOp) ? 1 : op_bytes;
847 if (src.bytes == 8)
848 src.bytes = 4;
849 /* NB. Immediates are sign-extended as necessary. */
850 switch (src.bytes) {
851 case 1:
852 src.val = insn_fetch(s8, 1, _eip);
853 break;
854 case 2:
855 src.val = insn_fetch(s16, 2, _eip);
856 break;
857 case 4:
858 src.val = insn_fetch(s32, 4, _eip);
859 break;
860 }
861 break;
862 case SrcImmByte:
863 src.type = OP_IMM;
864 src.ptr = (unsigned long *)_eip;
865 src.bytes = 1;
866 src.val = insn_fetch(s8, 1, _eip);
867 break;
868 }
869
870 /* Decode and fetch the destination operand: register or memory. */
871 switch (d & DstMask) {
872 case ImplicitOps:
873 /* Special instructions do their own operand decoding. */
874 goto special_insn;
875 case DstReg:
876 dst.type = OP_REG;
877 if ((d & ByteOp)
878 && !(twobyte && (b == 0xb6 || b == 0xb7))) {
879 dst.ptr = decode_register(modrm_reg, _regs,
880 (rex_prefix == 0));
881 dst.val = *(u8 *) dst.ptr;
882 dst.bytes = 1;
883 } else {
884 dst.ptr = decode_register(modrm_reg, _regs, 0);
885 switch ((dst.bytes = op_bytes)) {
886 case 2:
887 dst.val = *(u16 *)dst.ptr;
888 break;
889 case 4:
890 dst.val = *(u32 *)dst.ptr;
891 break;
892 case 8:
893 dst.val = *(u64 *)dst.ptr;
894 break;
895 }
896 }
897 break;
898 case DstMem:
899 dst.type = OP_MEM;
900 dst.ptr = (unsigned long *)cr2;
901 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
902 dst.val = 0;
903 /*
904 * For instructions with a ModR/M byte, switch to register
905 * access if Mod = 3.
906 */
907 if ((d & ModRM) && modrm_mod == 3) {
908 dst.type = OP_REG;
909 break;
910 }
911 if (d & BitOp) {
912 unsigned long mask = ~(dst.bytes * 8 - 1);
913
914 dst.ptr = (void *)dst.ptr + (src.val & mask) / 8;
915 }
916 if (!(d & Mov) && /* optimisation - avoid slow emulated read */
917 ((rc = ops->read_emulated((unsigned long)dst.ptr,
918 &dst.val, dst.bytes, ctxt->vcpu)) != 0))
919 goto done;
920 break;
921 }
922 dst.orig_val = dst.val;
923
924 if (twobyte)
925 goto twobyte_insn;
926
927 switch (b) {
928 case 0x00 ... 0x05:
929 add: /* add */
930 emulate_2op_SrcV("add", src, dst, _eflags);
931 break;
932 case 0x08 ... 0x0d:
933 or: /* or */
934 emulate_2op_SrcV("or", src, dst, _eflags);
935 break;
936 case 0x10 ... 0x15:
937 adc: /* adc */
938 emulate_2op_SrcV("adc", src, dst, _eflags);
939 break;
940 case 0x18 ... 0x1d:
941 sbb: /* sbb */
942 emulate_2op_SrcV("sbb", src, dst, _eflags);
943 break;
944 case 0x20 ... 0x23:
945 and: /* and */
946 emulate_2op_SrcV("and", src, dst, _eflags);
947 break;
948 case 0x24: /* and al imm8 */
949 dst.type = OP_REG;
950 dst.ptr = &_regs[VCPU_REGS_RAX];
951 dst.val = *(u8 *)dst.ptr;
952 dst.bytes = 1;
953 dst.orig_val = dst.val;
954 goto and;
955 case 0x25: /* and ax imm16, or eax imm32 */
956 dst.type = OP_REG;
957 dst.bytes = op_bytes;
958 dst.ptr = &_regs[VCPU_REGS_RAX];
959 if (op_bytes == 2)
960 dst.val = *(u16 *)dst.ptr;
961 else
962 dst.val = *(u32 *)dst.ptr;
963 dst.orig_val = dst.val;
964 goto and;
965 case 0x28 ... 0x2d:
966 sub: /* sub */
967 emulate_2op_SrcV("sub", src, dst, _eflags);
968 break;
969 case 0x30 ... 0x35:
970 xor: /* xor */
971 emulate_2op_SrcV("xor", src, dst, _eflags);
972 break;
973 case 0x38 ... 0x3d:
974 cmp: /* cmp */
975 emulate_2op_SrcV("cmp", src, dst, _eflags);
976 break;
977 case 0x63: /* movsxd */
978 if (mode != X86EMUL_MODE_PROT64)
979 goto cannot_emulate;
980 dst.val = (s32) src.val;
981 break;
982 case 0x80 ... 0x83: /* Grp1 */
983 switch (modrm_reg) {
984 case 0:
985 goto add;
986 case 1:
987 goto or;
988 case 2:
989 goto adc;
990 case 3:
991 goto sbb;
992 case 4:
993 goto and;
994 case 5:
995 goto sub;
996 case 6:
997 goto xor;
998 case 7:
999 goto cmp;
1000 }
1001 break;
1002 case 0x84 ... 0x85:
1003 test: /* test */
1004 emulate_2op_SrcV("test", src, dst, _eflags);
1005 break;
1006 case 0x86 ... 0x87: /* xchg */
1007 /* Write back the register source. */
1008 switch (dst.bytes) {
1009 case 1:
1010 *(u8 *) src.ptr = (u8) dst.val;
1011 break;
1012 case 2:
1013 *(u16 *) src.ptr = (u16) dst.val;
1014 break;
1015 case 4:
1016 *src.ptr = (u32) dst.val;
1017 break; /* 64b reg: zero-extend */
1018 case 8:
1019 *src.ptr = dst.val;
1020 break;
1021 }
1022 /*
1023 * Write back the memory destination with implicit LOCK
1024 * prefix.
1025 */
1026 dst.val = src.val;
1027 lock_prefix = 1;
1028 break;
1029 case 0x88 ... 0x8b: /* mov */
1030 goto mov;
1031 case 0x8d: /* lea r16/r32, m */
1032 dst.val = modrm_val;
1033 break;
1034 case 0x8f: /* pop (sole member of Grp1a) */
1035 /* 64-bit mode: POP always pops a 64-bit operand. */
1036 if (mode == X86EMUL_MODE_PROT64)
1037 dst.bytes = 8;
1038 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1039 _regs[VCPU_REGS_RSP]),
1040 &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1041 goto done;
1042 register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
1043 break;
1044 case 0xa0 ... 0xa1: /* mov */
1045 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1046 dst.val = src.val;
1047 _eip += ad_bytes; /* skip src displacement */
1048 break;
1049 case 0xa2 ... 0xa3: /* mov */
1050 dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
1051 _eip += ad_bytes; /* skip dst displacement */
1052 break;
1053 case 0xc0 ... 0xc1:
1054 grp2: /* Grp2 */
1055 switch (modrm_reg) {
1056 case 0: /* rol */
1057 emulate_2op_SrcB("rol", src, dst, _eflags);
1058 break;
1059 case 1: /* ror */
1060 emulate_2op_SrcB("ror", src, dst, _eflags);
1061 break;
1062 case 2: /* rcl */
1063 emulate_2op_SrcB("rcl", src, dst, _eflags);
1064 break;
1065 case 3: /* rcr */
1066 emulate_2op_SrcB("rcr", src, dst, _eflags);
1067 break;
1068 case 4: /* sal/shl */
1069 case 6: /* sal/shl */
1070 emulate_2op_SrcB("sal", src, dst, _eflags);
1071 break;
1072 case 5: /* shr */
1073 emulate_2op_SrcB("shr", src, dst, _eflags);
1074 break;
1075 case 7: /* sar */
1076 emulate_2op_SrcB("sar", src, dst, _eflags);
1077 break;
1078 }
1079 break;
1080 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1081 mov:
1082 dst.val = src.val;
1083 break;
1084 case 0xd0 ... 0xd1: /* Grp2 */
1085 src.val = 1;
1086 goto grp2;
1087 case 0xd2 ... 0xd3: /* Grp2 */
1088 src.val = _regs[VCPU_REGS_RCX];
1089 goto grp2;
1090 case 0xf6 ... 0xf7: /* Grp3 */
1091 switch (modrm_reg) {
1092 case 0 ... 1: /* test */
1093 /*
1094 * Special case in Grp3: test has an immediate
1095 * source operand.
1096 */
1097 src.type = OP_IMM;
1098 src.ptr = (unsigned long *)_eip;
1099 src.bytes = (d & ByteOp) ? 1 : op_bytes;
1100 if (src.bytes == 8)
1101 src.bytes = 4;
1102 switch (src.bytes) {
1103 case 1:
1104 src.val = insn_fetch(s8, 1, _eip);
1105 break;
1106 case 2:
1107 src.val = insn_fetch(s16, 2, _eip);
1108 break;
1109 case 4:
1110 src.val = insn_fetch(s32, 4, _eip);
1111 break;
1112 }
1113 goto test;
1114 case 2: /* not */
1115 dst.val = ~dst.val;
1116 break;
1117 case 3: /* neg */
1118 emulate_1op("neg", dst, _eflags);
1119 break;
1120 default:
1121 goto cannot_emulate;
1122 }
1123 break;
1124 case 0xfe ... 0xff: /* Grp4/Grp5 */
1125 switch (modrm_reg) {
1126 case 0: /* inc */
1127 emulate_1op("inc", dst, _eflags);
1128 break;
1129 case 1: /* dec */
1130 emulate_1op("dec", dst, _eflags);
1131 break;
1132 case 4: /* jmp abs */
1133 if (b == 0xff)
1134 _eip = dst.val;
1135 else
1136 goto cannot_emulate;
1137 break;
1138 case 6: /* push */
1139 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1140 if (mode == X86EMUL_MODE_PROT64) {
1141 dst.bytes = 8;
1142 if ((rc = ops->read_std((unsigned long)dst.ptr,
1143 &dst.val, 8,
1144 ctxt->vcpu)) != 0)
1145 goto done;
1146 }
1147 register_address_increment(_regs[VCPU_REGS_RSP],
1148 -dst.bytes);
1149 if ((rc = ops->write_emulated(
1150 register_address(ctxt->ss_base,
1151 _regs[VCPU_REGS_RSP]),
1152 &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1153 goto done;
1154 no_wb = 1;
1155 break;
1156 default:
1157 goto cannot_emulate;
1158 }
1159 break;
1160 }
1161
1162writeback:
1163 if (!no_wb) {
1164 switch (dst.type) {
1165 case OP_REG:
1166 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
1167 switch (dst.bytes) {
1168 case 1:
1169 *(u8 *)dst.ptr = (u8)dst.val;
1170 break;
1171 case 2:
1172 *(u16 *)dst.ptr = (u16)dst.val;
1173 break;
1174 case 4:
1175 *dst.ptr = (u32)dst.val;
1176 break; /* 64b: zero-ext */
1177 case 8:
1178 *dst.ptr = dst.val;
1179 break;
1180 }
1181 break;
1182 case OP_MEM:
1183 if (lock_prefix)
1184 rc = ops->cmpxchg_emulated((unsigned long)dst.
1185 ptr, &dst.orig_val,
1186 &dst.val, dst.bytes,
1187 ctxt->vcpu);
1188 else
1189 rc = ops->write_emulated((unsigned long)dst.ptr,
1190 &dst.val, dst.bytes,
1191 ctxt->vcpu);
1192 if (rc != 0)
1193 goto done;
1194 default:
1195 break;
1196 }
1197 }
1198
1199 /* Commit shadow register state. */
1200 memcpy(ctxt->vcpu->regs, _regs, sizeof _regs);
1201 ctxt->eflags = _eflags;
1202 ctxt->vcpu->rip = _eip;
1203
1204done:
1205 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1206
1207special_insn:
1208 if (twobyte)
1209 goto twobyte_special_insn;
1210 switch(b) {
1211 case 0x50 ... 0x57: /* push reg */
1212 if (op_bytes == 2)
1213 src.val = (u16) _regs[b & 0x7];
1214 else
1215 src.val = (u32) _regs[b & 0x7];
1216 dst.type = OP_MEM;
1217 dst.bytes = op_bytes;
1218 dst.val = src.val;
1219 register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
1220 dst.ptr = (void *) register_address(
1221 ctxt->ss_base, _regs[VCPU_REGS_RSP]);
1222 break;
1223 case 0x58 ... 0x5f: /* pop reg */
1224 dst.ptr = (unsigned long *)&_regs[b & 0x7];
1225 pop_instruction:
1226 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1227 _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
1228 != 0)
1229 goto done;
1230
1231 register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
1232 no_wb = 1; /* Disable writeback. */
1233 break;
1234 case 0x6a: /* push imm8 */
1235 src.val = 0L;
1236 src.val = insn_fetch(s8, 1, _eip);
1237 push:
1238 dst.type = OP_MEM;
1239 dst.bytes = op_bytes;
1240 dst.val = src.val;
1241 register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
1242 dst.ptr = (void *) register_address(ctxt->ss_base,
1243 _regs[VCPU_REGS_RSP]);
1244 break;
1245 case 0x6c: /* insb */
1246 case 0x6d: /* insw/insd */
1247 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1248 1, /* in */
1249 (d & ByteOp) ? 1 : op_bytes, /* size */
1250 rep_prefix ?
1251 address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
1252 (_eflags & EFLG_DF), /* down */
1253 register_address(ctxt->es_base,
1254 _regs[VCPU_REGS_RDI]), /* address */
1255 rep_prefix,
1256 _regs[VCPU_REGS_RDX] /* port */
1257 ) == 0)
1258 return -1;
1259 return 0;
1260 case 0x6e: /* outsb */
1261 case 0x6f: /* outsw/outsd */
1262 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1263 0, /* in */
1264 (d & ByteOp) ? 1 : op_bytes, /* size */
1265 rep_prefix ?
1266 address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
1267 (_eflags & EFLG_DF), /* down */
1268 register_address(override_base ?
1269 *override_base : ctxt->ds_base,
1270 _regs[VCPU_REGS_RSI]), /* address */
1271 rep_prefix,
1272 _regs[VCPU_REGS_RDX] /* port */
1273 ) == 0)
1274 return -1;
1275 return 0;
1276 case 0x70 ... 0x7f: /* jcc (short) */ {
1277 int rel = insn_fetch(s8, 1, _eip);
1278
1279 if (test_cc(b, _eflags))
1280 JMP_REL(rel);
1281 break;
1282 }
1283 case 0x9c: /* pushf */
1284 src.val = (unsigned long) _eflags;
1285 goto push;
1286 case 0x9d: /* popf */
1287 dst.ptr = (unsigned long *) &_eflags;
1288 goto pop_instruction;
1289 case 0xc3: /* ret */
1290 dst.ptr = &_eip;
1291 goto pop_instruction;
1292 case 0xf4: /* hlt */
1293 ctxt->vcpu->halt_request = 1;
1294 goto done;
1295 }
1296 if (rep_prefix) {
1297 if (_regs[VCPU_REGS_RCX] == 0) {
1298 ctxt->vcpu->rip = _eip;
1299 goto done;
1300 }
1301 _regs[VCPU_REGS_RCX]--;
1302 _eip = ctxt->vcpu->rip;
1303 }
1304 switch (b) {
1305 case 0xa4 ... 0xa5: /* movs */
1306 dst.type = OP_MEM;
1307 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1308 dst.ptr = (unsigned long *)register_address(ctxt->es_base,
1309 _regs[VCPU_REGS_RDI]);
1310 if ((rc = ops->read_emulated(register_address(
1311 override_base ? *override_base : ctxt->ds_base,
1312 _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1313 goto done;
1314 register_address_increment(_regs[VCPU_REGS_RSI],
1315 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1316 register_address_increment(_regs[VCPU_REGS_RDI],
1317 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1318 break;
1319 case 0xa6 ... 0xa7: /* cmps */
1320 DPRINTF("Urk! I don't handle CMPS.\n");
1321 goto cannot_emulate;
1322 case 0xaa ... 0xab: /* stos */
1323 dst.type = OP_MEM;
1324 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1325 dst.ptr = (unsigned long *)cr2;
1326 dst.val = _regs[VCPU_REGS_RAX];
1327 register_address_increment(_regs[VCPU_REGS_RDI],
1328 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1329 break;
1330 case 0xac ... 0xad: /* lods */
1331 dst.type = OP_REG;
1332 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1333 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1334 if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
1335 ctxt->vcpu)) != 0)
1336 goto done;
1337 register_address_increment(_regs[VCPU_REGS_RSI],
1338 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1339 break;
1340 case 0xae ... 0xaf: /* scas */
1341 DPRINTF("Urk! I don't handle SCAS.\n");
1342 goto cannot_emulate;
1343 case 0xe8: /* call (near) */ {
1344 long int rel;
1345 switch (op_bytes) {
1346 case 2:
1347 rel = insn_fetch(s16, 2, _eip);
1348 break;
1349 case 4:
1350 rel = insn_fetch(s32, 4, _eip);
1351 break;
1352 case 8:
1353 rel = insn_fetch(s64, 8, _eip);
1354 break;
1355 default:
1356 DPRINTF("Call: Invalid op_bytes\n");
1357 goto cannot_emulate;
1358 }
1359 src.val = (unsigned long) _eip;
1360 JMP_REL(rel);
1361 op_bytes = ad_bytes;
1362 goto push;
1363 }
1364 case 0xe9: /* jmp rel */
1365 case 0xeb: /* jmp rel short */
1366 JMP_REL(src.val);
1367 no_wb = 1; /* Disable writeback. */
1368 break;
1369
1370
1371 }
1372 goto writeback;
1373
1374twobyte_insn:
1375 switch (b) {
1376 case 0x01: /* lgdt, lidt, lmsw */
1377 /* Disable writeback. */
1378 no_wb = 1;
1379 switch (modrm_reg) {
1380 u16 size;
1381 unsigned long address;
1382
1383 case 2: /* lgdt */
1384 rc = read_descriptor(ctxt, ops, src.ptr,
1385 &size, &address, op_bytes);
1386 if (rc)
1387 goto done;
1388 realmode_lgdt(ctxt->vcpu, size, address);
1389 break;
1390 case 3: /* lidt */
1391 rc = read_descriptor(ctxt, ops, src.ptr,
1392 &size, &address, op_bytes);
1393 if (rc)
1394 goto done;
1395 realmode_lidt(ctxt->vcpu, size, address);
1396 break;
1397 case 4: /* smsw */
1398 if (modrm_mod != 3)
1399 goto cannot_emulate;
1400 *(u16 *)&_regs[modrm_rm]
1401 = realmode_get_cr(ctxt->vcpu, 0);
1402 break;
1403 case 6: /* lmsw */
1404 if (modrm_mod != 3)
1405 goto cannot_emulate;
1406 realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags);
1407 break;
1408 case 7: /* invlpg*/
1409 emulate_invlpg(ctxt->vcpu, cr2);
1410 break;
1411 default:
1412 goto cannot_emulate;
1413 }
1414 break;
1415 case 0x21: /* mov from dr to reg */
1416 no_wb = 1;
1417 if (modrm_mod != 3)
1418 goto cannot_emulate;
1419 rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]);
1420 break;
1421 case 0x23: /* mov from reg to dr */
1422 no_wb = 1;
1423 if (modrm_mod != 3)
1424 goto cannot_emulate;
1425 rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]);
1426 break;
1427 case 0x40 ... 0x4f: /* cmov */
1428 dst.val = dst.orig_val = src.val;
1429 no_wb = 1;
1430 /*
1431 * First, assume we're decoding an even cmov opcode
1432 * (lsb == 0).
1433 */
1434 switch ((b & 15) >> 1) {
1435 case 0: /* cmovo */
1436 no_wb = (_eflags & EFLG_OF) ? 0 : 1;
1437 break;
1438 case 1: /* cmovb/cmovc/cmovnae */
1439 no_wb = (_eflags & EFLG_CF) ? 0 : 1;
1440 break;
1441 case 2: /* cmovz/cmove */
1442 no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
1443 break;
1444 case 3: /* cmovbe/cmovna */
1445 no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
1446 break;
1447 case 4: /* cmovs */
1448 no_wb = (_eflags & EFLG_SF) ? 0 : 1;
1449 break;
1450 case 5: /* cmovp/cmovpe */
1451 no_wb = (_eflags & EFLG_PF) ? 0 : 1;
1452 break;
1453 case 7: /* cmovle/cmovng */
1454 no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
1455 /* fall through */
1456 case 6: /* cmovl/cmovnge */
1457 no_wb &= (!(_eflags & EFLG_SF) !=
1458 !(_eflags & EFLG_OF)) ? 0 : 1;
1459 break;
1460 }
1461 /* Odd cmov opcodes (lsb == 1) have inverted sense. */
1462 no_wb ^= b & 1;
1463 break;
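The cmov case above avoids sixteen separate branches: it evaluates the condition for the even opcode of each pair, then flips the answer when the low opcode bit is set, because 0x41, 0x43, ... are the negated forms of 0x40, 0x42, ... The same trick written as a free-standing sketch (the flag masks are the standard EFLAGS bit positions; the function name is illustrative):

#include <stdint.h>

#define FLG_CF 0x001    /* standard EFLAGS bit positions */
#define FLG_PF 0x004
#define FLG_ZF 0x040
#define FLG_SF 0x080
#define FLG_OF 0x800

/* Evaluate the condition for the even opcode of each pair, then invert the
 * answer for the odd (negated) opcode. */
static int cmov_should_move(uint8_t opcode, unsigned long eflags)
{
        int cond;

        switch ((opcode & 15) >> 1) {
        case 0: cond = !!(eflags & FLG_OF); break;                /* cmovo  */
        case 1: cond = !!(eflags & FLG_CF); break;                /* cmovb  */
        case 2: cond = !!(eflags & FLG_ZF); break;                /* cmovz  */
        case 3: cond = !!(eflags & (FLG_CF | FLG_ZF)); break;     /* cmovbe */
        case 4: cond = !!(eflags & FLG_SF); break;                /* cmovs  */
        case 5: cond = !!(eflags & FLG_PF); break;                /* cmovp  */
        case 6: cond = !(eflags & FLG_SF) != !(eflags & FLG_OF); break; /* cmovl */
        default: cond = (eflags & FLG_ZF) ||
                        (!(eflags & FLG_SF) != !(eflags & FLG_OF)); break; /* cmovle */
        }
        return cond ^ (opcode & 1);     /* odd opcodes: inverted sense */
}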
1464 case 0xa3:
1465 bt: /* bt */
1466 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1467 emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
1468 break;
1469 case 0xab:
1470 bts: /* bts */
1471 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1472 emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
1473 break;
1474 case 0xb0 ... 0xb1: /* cmpxchg */
1475 /*
1476 * Save real source value, then compare EAX against
1477 * destination.
1478 */
1479 src.orig_val = src.val;
1480 src.val = _regs[VCPU_REGS_RAX];
1481 emulate_2op_SrcV("cmp", src, dst, _eflags);
1482 if (_eflags & EFLG_ZF) {
1483 /* Success: write back to memory. */
1484 dst.val = src.orig_val;
1485 } else {
1486 /* Failure: write the value we saw to EAX. */
1487 dst.type = OP_REG;
1488 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1489 }
1490 break;
1491 case 0xb3:
1492 btr: /* btr */
1493 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1494 emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
1495 break;
1496 case 0xb6 ... 0xb7: /* movzx */
1497 dst.bytes = op_bytes;
1498 dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
1499 break;
1500 case 0xba: /* Grp8 */
1501 switch (modrm_reg & 3) {
1502 case 0:
1503 goto bt;
1504 case 1:
1505 goto bts;
1506 case 2:
1507 goto btr;
1508 case 3:
1509 goto btc;
1510 }
1511 break;
1512 case 0xbb:
1513 btc: /* btc */
1514 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1515 emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
1516 break;
1517 case 0xbe ... 0xbf: /* movsx */
1518 dst.bytes = op_bytes;
1519 dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
1520 break;
1521 case 0xc3: /* movnti */
1522 dst.bytes = op_bytes;
1523 dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
1524 break;
1525 }
1526 goto writeback;
1527
1528twobyte_special_insn:
1529 /* Disable writeback. */
1530 no_wb = 1;
1531 switch (b) {
1532 case 0x06:
1533 emulate_clts(ctxt->vcpu);
1534 break;
1535 case 0x08: /* invd */
1536 break;
1537 case 0x09: /* wbinvd */
1538 break;
1539 case 0x0d: /* GrpP (prefetch) */
1540 case 0x18: /* Grp16 (prefetch/nop) */
1541 break;
1542 case 0x20: /* mov cr, reg */
1543 if (modrm_mod != 3)
1544 goto cannot_emulate;
1545 _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg);
1546 break;
1547 case 0x22: /* mov reg, cr */
1548 if (modrm_mod != 3)
1549 goto cannot_emulate;
1550 realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
1551 break;
1552 case 0x30:
1553 /* wrmsr */
1554 msr_data = (u32)_regs[VCPU_REGS_RAX]
1555 | ((u64)_regs[VCPU_REGS_RDX] << 32);
1556 rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
1557 if (rc) {
1558 kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
1559 _eip = ctxt->vcpu->rip;
1560 }
1561 rc = X86EMUL_CONTINUE;
1562 break;
1563 case 0x32:
1564 /* rdmsr */
1565 rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
1566 if (rc) {
1567 kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
1568 _eip = ctxt->vcpu->rip;
1569 } else {
1570 _regs[VCPU_REGS_RAX] = (u32)msr_data;
1571 _regs[VCPU_REGS_RDX] = msr_data >> 32;
1572 }
1573 rc = X86EMUL_CONTINUE;
1574 break;
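WRMSR takes its 64-bit value in EDX:EAX (EDX holds the high half, EAX the low half) with the MSR index in ECX, and RDMSR hands the result back the same way, which is exactly the shifting done in the two cases above. The packing in isolation, as a sketch only:

#include <stdint.h>

/* EDX:EAX packing used by RDMSR/WRMSR (EDX = high half, EAX = low half). */
static uint64_t msr_from_edx_eax(uint32_t edx, uint32_t eax)
{
        return ((uint64_t)edx << 32) | eax;
}

static void msr_to_edx_eax(uint64_t val, uint32_t *edx, uint32_t *eax)
{
        *eax = (uint32_t)val;
        *edx = (uint32_t)(val >> 32);
}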
1575 case 0x80 ... 0x8f: /* jnz rel, etc*/ {
1576 long int rel;
1577
1578 switch (op_bytes) {
1579 case 2:
1580 rel = insn_fetch(s16, 2, _eip);
1581 break;
1582 case 4:
1583 rel = insn_fetch(s32, 4, _eip);
1584 break;
1585 case 8:
1586 rel = insn_fetch(s64, 8, _eip);
1587 break;
1588 default:
1589 DPRINTF("jnz: Invalid op_bytes\n");
1590 goto cannot_emulate;
1591 }
1592 if (test_cc(b, _eflags))
1593 JMP_REL(rel);
1594 break;
1595 }
1596 case 0xc7: /* Grp9 (cmpxchg8b) */
1597 {
1598 u64 old, new;
1599 if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
1600 != 0)
1601 goto done;
1602 if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
1603 ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
1604 _regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1605 _regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1606 _eflags &= ~EFLG_ZF;
1607 } else {
1608 new = ((u64)_regs[VCPU_REGS_RCX] << 32)
1609 | (u32) _regs[VCPU_REGS_RBX];
1610 if ((rc = ops->cmpxchg_emulated(cr2, &old,
1611 &new, 8, ctxt->vcpu)) != 0)
1612 goto done;
1613 _eflags |= EFLG_ZF;
1614 }
1615 break;
1616 }
1617 }
1618 goto writeback;
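CMPXCHG8B compares the 64-bit memory operand against EDX:EAX; on a match it stores ECX:EBX and sets ZF, otherwise it loads the memory value into EDX:EAX and clears ZF — the two branches of the Grp9 case above. A non-atomic software model, for illustration only (the real emulation defers the swap to cmpxchg_emulated()):

#include <stdint.h>

/* Non-atomic software model of CMPXCHG8B (illustration only). */
static int cmpxchg8b_model(uint64_t *mem, uint32_t *edx, uint32_t *eax,
                           uint32_t ecx, uint32_t ebx)
{
        uint64_t expected = ((uint64_t)*edx << 32) | *eax;

        if (*mem == expected) {
                *mem = ((uint64_t)ecx << 32) | ebx;     /* store ECX:EBX */
                return 1;                               /* ZF set */
        }
        *eax = (uint32_t)*mem;                          /* load memory into EDX:EAX */
        *edx = (uint32_t)(*mem >> 32);
        return 0;                                       /* ZF clear */
}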
1619
1620cannot_emulate:
1621 DPRINTF("Cannot emulate %02x\n", b);
1622 return -1;
1623}
1624
1625#ifdef __XEN__
1626
1627#include <asm/mm.h>
1628#include <asm/uaccess.h>
1629
1630int
1631x86_emulate_read_std(unsigned long addr,
1632 unsigned long *val,
1633 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1634{
1635 unsigned int rc;
1636
1637 *val = 0;
1638
1639 if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) {
1640 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
1641 return X86EMUL_PROPAGATE_FAULT;
1642 }
1643
1644 return X86EMUL_CONTINUE;
1645}
1646
1647int
1648x86_emulate_write_std(unsigned long addr,
1649 unsigned long val,
1650 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1651{
1652 unsigned int rc;
1653
1654 if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) {
1655 propagate_page_fault(addr + bytes - rc, PGERR_write_access);
1656 return X86EMUL_PROPAGATE_FAULT;
1657 }
1658
1659 return X86EMUL_CONTINUE;
1660}
1661
1662#endif
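The Xen-only wrappers above rely on copy_from_user()/copy_to_user() returning the number of bytes that could not be copied, so addr + bytes - rc is the first address that faulted, and that is what gets handed to propagate_page_fault(). The arithmetic on its own (the function name here is illustrative):

/* copy_from_user()/copy_to_user() return the number of bytes NOT copied, so
 * if an 8-byte access at addr faults with 5 bytes left, the first faulting
 * address is addr + 8 - 5. */
static unsigned long first_faulting_address(unsigned long addr,
                                            unsigned int bytes,
                                            unsigned int not_copied)
{
        return addr + bytes - not_copied;
}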
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index cb4c67025d52..7743d73768df 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -151,43 +151,43 @@ int lguest_address_ok(const struct lguest *lg,
151/* This routine copies memory from the Guest. Here we can see how useful the 151/* This routine copies memory from the Guest. Here we can see how useful the
152 * kill_lguest() routine we met in the Launcher can be: we return a random 152 * kill_lguest() routine we met in the Launcher can be: we return a random
153 * value (all zeroes) instead of needing to return an error. */ 153 * value (all zeroes) instead of needing to return an error. */
154void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) 154void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
155{ 155{
156 if (!lguest_address_ok(lg, addr, bytes) 156 if (!lguest_address_ok(cpu->lg, addr, bytes)
157 || copy_from_user(b, lg->mem_base + addr, bytes) != 0) { 157 || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
158 /* copy_from_user should do this, but as we rely on it... */ 158 /* copy_from_user should do this, but as we rely on it... */
159 memset(b, 0, bytes); 159 memset(b, 0, bytes);
160 kill_guest(lg, "bad read address %#lx len %u", addr, bytes); 160 kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
161 } 161 }
162} 162}
163 163
164/* This is the write (copy into guest) version. */ 164/* This is the write (copy into guest) version. */
165void __lgwrite(struct lguest *lg, unsigned long addr, const void *b, 165void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
166 unsigned bytes) 166 unsigned bytes)
167{ 167{
168 if (!lguest_address_ok(lg, addr, bytes) 168 if (!lguest_address_ok(cpu->lg, addr, bytes)
169 || copy_to_user(lg->mem_base + addr, b, bytes) != 0) 169 || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
170 kill_guest(lg, "bad write address %#lx len %u", addr, bytes); 170 kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
171} 171}
172/*:*/ 172/*:*/
173 173
174/*H:030 Let's jump straight to the main loop which runs the Guest. 174/*H:030 Let's jump straight to the main loop which runs the Guest.
175 * Remember, this is called by the Launcher reading /dev/lguest, and we keep 175 * Remember, this is called by the Launcher reading /dev/lguest, and we keep
176 * going around and around until something interesting happens. */ 176 * going around and around until something interesting happens. */
177int run_guest(struct lguest *lg, unsigned long __user *user) 177int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
178{ 178{
179 /* We stop running once the Guest is dead. */ 179 /* We stop running once the Guest is dead. */
180 while (!lg->dead) { 180 while (!cpu->lg->dead) {
181 /* First we run any hypercalls the Guest wants done. */ 181 /* First we run any hypercalls the Guest wants done. */
182 if (lg->hcall) 182 if (cpu->hcall)
183 do_hypercalls(lg); 183 do_hypercalls(cpu);
184 184
185 /* It's possible the Guest did a NOTIFY hypercall to the 185 /* It's possible the Guest did a NOTIFY hypercall to the
186 * Launcher, in which case we return from the read() now. */ 186 * Launcher, in which case we return from the read() now. */
187 if (lg->pending_notify) { 187 if (cpu->pending_notify) {
188 if (put_user(lg->pending_notify, user)) 188 if (put_user(cpu->pending_notify, user))
189 return -EFAULT; 189 return -EFAULT;
190 return sizeof(lg->pending_notify); 190 return sizeof(cpu->pending_notify);
191 } 191 }
192 192
193 /* Check for signals */ 193 /* Check for signals */
@@ -195,13 +195,13 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
195 return -ERESTARTSYS; 195 return -ERESTARTSYS;
196 196
197 /* If Waker set break_out, return to Launcher. */ 197 /* If Waker set break_out, return to Launcher. */
198 if (lg->break_out) 198 if (cpu->break_out)
199 return -EAGAIN; 199 return -EAGAIN;
200 200
201 /* Check if there are any interrupts which can be delivered 201 /* Check if there are any interrupts which can be delivered
202 * now: if so, this sets up the handler to be executed when we 202 * now: if so, this sets up the handler to be executed when we
203 * next run the Guest. */ 203 * next run the Guest. */
204 maybe_do_interrupt(lg); 204 maybe_do_interrupt(cpu);
205 205
206 /* All long-lived kernel loops need to check with this horrible 206 /* All long-lived kernel loops need to check with this horrible
207 * thing called the freezer. If the Host is trying to suspend, 207 * thing called the freezer. If the Host is trying to suspend,
@@ -210,12 +210,12 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
210 210
211 /* Just make absolutely sure the Guest is still alive. One of 211 /* Just make absolutely sure the Guest is still alive. One of
212 * those hypercalls could have been fatal, for example. */ 212 * those hypercalls could have been fatal, for example. */
213 if (lg->dead) 213 if (cpu->lg->dead)
214 break; 214 break;
215 215
216 /* If the Guest asked to be stopped, we sleep. The Guest's 216 /* If the Guest asked to be stopped, we sleep. The Guest's
217 * clock timer or LHCALL_BREAK from the Waker will wake us. */ 217 * clock timer or LHCALL_BREAK from the Waker will wake us. */
218 if (lg->halted) { 218 if (cpu->halted) {
219 set_current_state(TASK_INTERRUPTIBLE); 219 set_current_state(TASK_INTERRUPTIBLE);
220 schedule(); 220 schedule();
221 continue; 221 continue;
@@ -226,15 +226,17 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
226 local_irq_disable(); 226 local_irq_disable();
227 227
228 /* Actually run the Guest until something happens. */ 228 /* Actually run the Guest until something happens. */
229 lguest_arch_run_guest(lg); 229 lguest_arch_run_guest(cpu);
230 230
231 /* Now we're ready to be interrupted or moved to other CPUs */ 231 /* Now we're ready to be interrupted or moved to other CPUs */
232 local_irq_enable(); 232 local_irq_enable();
233 233
234 /* Now we deal with whatever happened to the Guest. */ 234 /* Now we deal with whatever happened to the Guest. */
235 lguest_arch_handle_trap(lg); 235 lguest_arch_handle_trap(cpu);
236 } 236 }
237 237
238 if (cpu->lg->dead == ERR_PTR(-ERESTART))
239 return -ERESTART;
238 /* The Guest is dead => "No such file or directory" */ 240 /* The Guest is dead => "No such file or directory" */
239 return -ENOENT; 241 return -ENOENT;
240} 242}
@@ -253,7 +255,7 @@ static int __init init(void)
253 255
254 /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */ 256 /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
255 if (paravirt_enabled()) { 257 if (paravirt_enabled()) {
256 printk("lguest is afraid of %s\n", pv_info.name); 258 printk("lguest is afraid of being a guest\n");
257 return -EPERM; 259 return -EPERM;
258 } 260 }
259 261
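run_guest() above is what the Launcher's read() on /dev/lguest ends up in: a full-sized read means the Guest issued LHCALL_NOTIFY and the value read is the notification address, -EAGAIN means the Waker set break_out, the new -ERESTART return asks the Launcher to reboot the Guest, and -ENOENT means it is dead. A rough userspace sketch of that contract (the descriptor name and the handling of each case are assumptions, not taken from the real Launcher source):

#include <errno.h>
#include <unistd.h>

/* Rough shape of a Launcher main loop around read() on /dev/lguest
 * (illustration only; the real Launcher's error handling differs). */
static void launcher_loop(int lguest_fd)
{
        unsigned long notify_addr;

        for (;;) {
                ssize_t r = read(lguest_fd, &notify_addr, sizeof(notify_addr));

                if (r == (ssize_t)sizeof(notify_addr)) {
                        /* Guest did LHCALL_NOTIFY: service notify_addr, loop. */
                        continue;
                }
                if (r < 0 && errno == EAGAIN)
                        continue;       /* Waker set break_out: check devices, retry */
                if (r < 0 && errno == ERESTART)
                        break;          /* Guest asked to be rebooted */
                break;                  /* ENOENT etc.: the Guest is dead */
        }
}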
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index b478affe8f91..0f2cb4fd7c69 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -23,13 +23,14 @@
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/ktime.h>
26#include <asm/page.h> 27#include <asm/page.h>
27#include <asm/pgtable.h> 28#include <asm/pgtable.h>
28#include "lg.h" 29#include "lg.h"
29 30
30/*H:120 This is the core hypercall routine: where the Guest gets what it wants. 31/*H:120 This is the core hypercall routine: where the Guest gets what it wants.
31 * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ 32 * Or gets killed. Or, in the case of LHCALL_CRASH, both. */
32static void do_hcall(struct lguest *lg, struct hcall_args *args) 33static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
33{ 34{
34 switch (args->arg0) { 35 switch (args->arg0) {
35 case LHCALL_FLUSH_ASYNC: 36 case LHCALL_FLUSH_ASYNC:
@@ -39,60 +40,62 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args)
39 case LHCALL_LGUEST_INIT: 40 case LHCALL_LGUEST_INIT:
40 /* You can't get here unless you're already initialized. Don't 41 /* You can't get here unless you're already initialized. Don't
41 * do that. */ 42 * do that. */
42 kill_guest(lg, "already have lguest_data"); 43 kill_guest(cpu, "already have lguest_data");
43 break; 44 break;
44 case LHCALL_CRASH: { 45 case LHCALL_SHUTDOWN: {
45 /* Crash is such a trivial hypercall that we do it in four 46 /* Shutdown is such a trivial hypercall that we do it in four
46 * lines right here. */ 47 * lines right here. */
47 char msg[128]; 48 char msg[128];
48 /* If the lgread fails, it will call kill_guest() itself; the 49 /* If the lgread fails, it will call kill_guest() itself; the
49 * kill_guest() with the message will be ignored. */ 50 * kill_guest() with the message will be ignored. */
50 __lgread(lg, msg, args->arg1, sizeof(msg)); 51 __lgread(cpu, msg, args->arg1, sizeof(msg));
51 msg[sizeof(msg)-1] = '\0'; 52 msg[sizeof(msg)-1] = '\0';
52 kill_guest(lg, "CRASH: %s", msg); 53 kill_guest(cpu, "CRASH: %s", msg);
54 if (args->arg2 == LGUEST_SHUTDOWN_RESTART)
55 cpu->lg->dead = ERR_PTR(-ERESTART);
53 break; 56 break;
54 } 57 }
55 case LHCALL_FLUSH_TLB: 58 case LHCALL_FLUSH_TLB:
56 /* FLUSH_TLB comes in two flavors, depending on the 59 /* FLUSH_TLB comes in two flavors, depending on the
57 * argument: */ 60 * argument: */
58 if (args->arg1) 61 if (args->arg1)
59 guest_pagetable_clear_all(lg); 62 guest_pagetable_clear_all(cpu);
60 else 63 else
61 guest_pagetable_flush_user(lg); 64 guest_pagetable_flush_user(cpu);
62 break; 65 break;
63 66
64 /* All these calls simply pass the arguments through to the right 67 /* All these calls simply pass the arguments through to the right
65 * routines. */ 68 * routines. */
66 case LHCALL_NEW_PGTABLE: 69 case LHCALL_NEW_PGTABLE:
67 guest_new_pagetable(lg, args->arg1); 70 guest_new_pagetable(cpu, args->arg1);
68 break; 71 break;
69 case LHCALL_SET_STACK: 72 case LHCALL_SET_STACK:
70 guest_set_stack(lg, args->arg1, args->arg2, args->arg3); 73 guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
71 break; 74 break;
72 case LHCALL_SET_PTE: 75 case LHCALL_SET_PTE:
73 guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3)); 76 guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
74 break; 77 break;
75 case LHCALL_SET_PMD: 78 case LHCALL_SET_PMD:
76 guest_set_pmd(lg, args->arg1, args->arg2); 79 guest_set_pmd(cpu->lg, args->arg1, args->arg2);
77 break; 80 break;
78 case LHCALL_SET_CLOCKEVENT: 81 case LHCALL_SET_CLOCKEVENT:
79 guest_set_clockevent(lg, args->arg1); 82 guest_set_clockevent(cpu, args->arg1);
80 break; 83 break;
81 case LHCALL_TS: 84 case LHCALL_TS:
82 /* This sets the TS flag, as we saw used in run_guest(). */ 85 /* This sets the TS flag, as we saw used in run_guest(). */
83 lg->ts = args->arg1; 86 cpu->ts = args->arg1;
84 break; 87 break;
85 case LHCALL_HALT: 88 case LHCALL_HALT:
86 /* Similarly, this sets the halted flag for run_guest(). */ 89 /* Similarly, this sets the halted flag for run_guest(). */
87 lg->halted = 1; 90 cpu->halted = 1;
88 break; 91 break;
89 case LHCALL_NOTIFY: 92 case LHCALL_NOTIFY:
90 lg->pending_notify = args->arg1; 93 cpu->pending_notify = args->arg1;
91 break; 94 break;
92 default: 95 default:
93 /* It should be an architecture-specific hypercall. */ 96 /* It should be an architecture-specific hypercall. */
94 if (lguest_arch_do_hcall(lg, args)) 97 if (lguest_arch_do_hcall(cpu, args))
95 kill_guest(lg, "Bad hypercall %li\n", args->arg0); 98 kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
96 } 99 }
97} 100}
98/*:*/ 101/*:*/
@@ -104,13 +107,13 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args)
104 * Guest put them in the ring, but we also promise the Guest that they will 107 * Guest put them in the ring, but we also promise the Guest that they will
105 * happen before any normal hypercall (which is why we check this before 108 * happen before any normal hypercall (which is why we check this before
106 * checking for a normal hcall). */ 109 * checking for a normal hcall). */
107static void do_async_hcalls(struct lguest *lg) 110static void do_async_hcalls(struct lg_cpu *cpu)
108{ 111{
109 unsigned int i; 112 unsigned int i;
110 u8 st[LHCALL_RING_SIZE]; 113 u8 st[LHCALL_RING_SIZE];
111 114
112 /* For simplicity, we copy the entire call status array in at once. */ 115 /* For simplicity, we copy the entire call status array in at once. */
113 if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) 116 if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st)))
114 return; 117 return;
115 118
116 /* We process "struct lguest_data"s hcalls[] ring once. */ 119 /* We process "struct lguest_data"s hcalls[] ring once. */
@@ -119,7 +122,7 @@ static void do_async_hcalls(struct lguest *lg)
119 /* We remember where we were up to from last time. This makes 122 /* We remember where we were up to from last time. This makes
120 * sure that the hypercalls are done in the order the Guest 123 * sure that the hypercalls are done in the order the Guest
121 * places them in the ring. */ 124 * places them in the ring. */
122 unsigned int n = lg->next_hcall; 125 unsigned int n = cpu->next_hcall;
123 126
124 /* 0xFF means there's no call here (yet). */ 127 /* 0xFF means there's no call here (yet). */
125 if (st[n] == 0xFF) 128 if (st[n] == 0xFF)
@@ -127,65 +130,65 @@ static void do_async_hcalls(struct lguest *lg)
127 130
128 /* OK, we have a hypercall. Increment the "next_hcall" cursor, 131 /* OK, we have a hypercall. Increment the "next_hcall" cursor,
129 * and wrap back to 0 if we reach the end. */ 132 * and wrap back to 0 if we reach the end. */
130 if (++lg->next_hcall == LHCALL_RING_SIZE) 133 if (++cpu->next_hcall == LHCALL_RING_SIZE)
131 lg->next_hcall = 0; 134 cpu->next_hcall = 0;
132 135
133 /* Copy the hypercall arguments into a local copy of 136 /* Copy the hypercall arguments into a local copy of
134 * the hcall_args struct. */ 137 * the hcall_args struct. */
135 if (copy_from_user(&args, &lg->lguest_data->hcalls[n], 138 if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
136 sizeof(struct hcall_args))) { 139 sizeof(struct hcall_args))) {
137 kill_guest(lg, "Fetching async hypercalls"); 140 kill_guest(cpu, "Fetching async hypercalls");
138 break; 141 break;
139 } 142 }
140 143
141 /* Do the hypercall, same as a normal one. */ 144 /* Do the hypercall, same as a normal one. */
142 do_hcall(lg, &args); 145 do_hcall(cpu, &args);
143 146
144 /* Mark the hypercall done. */ 147 /* Mark the hypercall done. */
145 if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { 148 if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) {
146 kill_guest(lg, "Writing result for async hypercall"); 149 kill_guest(cpu, "Writing result for async hypercall");
147 break; 150 break;
148 } 151 }
149 152
150 /* Stop doing hypercalls if they want to notify the Launcher: 153 /* Stop doing hypercalls if they want to notify the Launcher:
151 * it needs to service this first. */ 154 * it needs to service this first. */
152 if (lg->pending_notify) 155 if (cpu->pending_notify)
153 break; 156 break;
154 } 157 }
155} 158}
156 159
157/* Last of all, we look at what happens first of all. The very first time the 160/* Last of all, we look at what happens first of all. The very first time the
158 * Guest makes a hypercall, we end up here to set things up: */ 161 * Guest makes a hypercall, we end up here to set things up: */
159static void initialize(struct lguest *lg) 162static void initialize(struct lg_cpu *cpu)
160{ 163{
161 /* You can't do anything until you're initialized. The Guest knows the 164 /* You can't do anything until you're initialized. The Guest knows the
162 * rules, so we're unforgiving here. */ 165 * rules, so we're unforgiving here. */
163 if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) { 166 if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
164 kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0); 167 kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
165 return; 168 return;
166 } 169 }
167 170
168 if (lguest_arch_init_hypercalls(lg)) 171 if (lguest_arch_init_hypercalls(cpu))
169 kill_guest(lg, "bad guest page %p", lg->lguest_data); 172 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
170 173
171 /* The Guest tells us where we're not to deliver interrupts by putting 174 /* The Guest tells us where we're not to deliver interrupts by putting
172 * the range of addresses into "struct lguest_data". */ 175 * the range of addresses into "struct lguest_data". */
173 if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) 176 if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start)
174 || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) 177 || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end))
175 kill_guest(lg, "bad guest page %p", lg->lguest_data); 178 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
176 179
177 /* We write the current time into the Guest's data page once so it can 180 /* We write the current time into the Guest's data page once so it can
178 * set its clock. */ 181 * set its clock. */
179 write_timestamp(lg); 182 write_timestamp(cpu);
180 183
181 /* page_tables.c will also do some setup. */ 184 /* page_tables.c will also do some setup. */
182 page_table_guest_data_init(lg); 185 page_table_guest_data_init(cpu);
183 186
184 /* This is the one case where the above accesses might have been the 187 /* This is the one case where the above accesses might have been the
185 * first write to a Guest page. This may have caused a copy-on-write 188 * first write to a Guest page. This may have caused a copy-on-write
186 * fault, but the old page might be (read-only) in the Guest 189 * fault, but the old page might be (read-only) in the Guest
187 * pagetable. */ 190 * pagetable. */
188 guest_pagetable_clear_all(lg); 191 guest_pagetable_clear_all(cpu);
189} 192}
190 193
191/*H:100 194/*H:100
@@ -194,27 +197,27 @@ static void initialize(struct lguest *lg)
194 * Remember from the Guest, hypercalls come in two flavors: normal and 197 * Remember from the Guest, hypercalls come in two flavors: normal and
195 * asynchronous. This file handles both types. 198 * asynchronous. This file handles both types.
196 */ 199 */
197void do_hypercalls(struct lguest *lg) 200void do_hypercalls(struct lg_cpu *cpu)
198{ 201{
199 /* Not initialized yet? This hypercall must do it. */ 202 /* Not initialized yet? This hypercall must do it. */
200 if (unlikely(!lg->lguest_data)) { 203 if (unlikely(!cpu->lg->lguest_data)) {
201 /* Set up the "struct lguest_data" */ 204 /* Set up the "struct lguest_data" */
202 initialize(lg); 205 initialize(cpu);
203 /* Hcall is done. */ 206 /* Hcall is done. */
204 lg->hcall = NULL; 207 cpu->hcall = NULL;
205 return; 208 return;
206 } 209 }
207 210
208 /* The Guest has initialized. 211 /* The Guest has initialized.
209 * 212 *
210 * Look in the hypercall ring for the async hypercalls: */ 213 * Look in the hypercall ring for the async hypercalls: */
211 do_async_hcalls(lg); 214 do_async_hcalls(cpu);
212 215
213 /* If we stopped reading the hypercall ring because the Guest did a 216 /* If we stopped reading the hypercall ring because the Guest did a
214 * NOTIFY to the Launcher, we want to return now. Otherwise we do 217 * NOTIFY to the Launcher, we want to return now. Otherwise we do
215 * the hypercall. */ 218 * the hypercall. */
216 if (!lg->pending_notify) { 219 if (!cpu->pending_notify) {
217 do_hcall(lg, lg->hcall); 220 do_hcall(cpu, cpu->hcall);
218 /* Tricky point: we reset the hcall pointer to mark the 221 /* Tricky point: we reset the hcall pointer to mark the
219 * hypercall as "done". We use the hcall pointer rather than 222 * hypercall as "done". We use the hcall pointer rather than
220 * the trap number to indicate a hypercall is pending. 223 * the trap number to indicate a hypercall is pending.
@@ -225,16 +228,17 @@ void do_hypercalls(struct lguest *lg)
225 * Launcher, the run_guest() loop will exit without running the 228 * Launcher, the run_guest() loop will exit without running the
226 * Guest. When it comes back it would try to re-run the 229 * Guest. When it comes back it would try to re-run the
227 * hypercall. */ 230 * hypercall. */
228 lg->hcall = NULL; 231 cpu->hcall = NULL;
229 } 232 }
230} 233}
231 234
232/* This routine supplies the Guest with time: it's used for wallclock time at 235/* This routine supplies the Guest with time: it's used for wallclock time at
233 * initial boot and as a rough time source if the TSC isn't available. */ 236 * initial boot and as a rough time source if the TSC isn't available. */
234void write_timestamp(struct lguest *lg) 237void write_timestamp(struct lg_cpu *cpu)
235{ 238{
236 struct timespec now; 239 struct timespec now;
237 ktime_get_real_ts(&now); 240 ktime_get_real_ts(&now);
238 if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec))) 241 if (copy_to_user(&cpu->lg->lguest_data->time,
239 kill_guest(lg, "Writing timestamp"); 242 &now, sizeof(struct timespec)))
243 kill_guest(cpu, "Writing timestamp");
240} 244}
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 2b66f79c208b..32e97c1858e5 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -41,11 +41,11 @@ static int idt_present(u32 lo, u32 hi)
41 41
42/* We need a helper to "push" a value onto the Guest's stack, since that's a 42/* We need a helper to "push" a value onto the Guest's stack, since that's a
43 * big part of what delivering an interrupt does. */ 43 * big part of what delivering an interrupt does. */
44static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) 44static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
45{ 45{
46 /* Stack grows upwards: move stack then write value. */ 46 /* Stack grows upwards: move stack then write value. */
47 *gstack -= 4; 47 *gstack -= 4;
48 lgwrite(lg, *gstack, u32, val); 48 lgwrite(cpu, *gstack, u32, val);
49} 49}
50 50
51/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or 51/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or
@@ -60,7 +60,7 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
60 * We set up the stack just like the CPU does for a real interrupt, so it's 60 * We set up the stack just like the CPU does for a real interrupt, so it's
61 * identical for the Guest (and the standard "iret" instruction will undo 61 * identical for the Guest (and the standard "iret" instruction will undo
62 * it). */ 62 * it). */
63static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) 63static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err)
64{ 64{
65 unsigned long gstack, origstack; 65 unsigned long gstack, origstack;
66 u32 eflags, ss, irq_enable; 66 u32 eflags, ss, irq_enable;
@@ -69,59 +69,59 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
69 /* There are two cases for interrupts: one where the Guest is already 69 /* There are two cases for interrupts: one where the Guest is already
70 * in the kernel, and a more complex one where the Guest is in 70 * in the kernel, and a more complex one where the Guest is in
71 * userspace. We check the privilege level to find out. */ 71 * userspace. We check the privilege level to find out. */
72 if ((lg->regs->ss&0x3) != GUEST_PL) { 72 if ((cpu->regs->ss&0x3) != GUEST_PL) {
73 /* The Guest told us their kernel stack with the SET_STACK 73 /* The Guest told us their kernel stack with the SET_STACK
74 * hypercall: both the virtual address and the segment */ 74 * hypercall: both the virtual address and the segment */
75 virtstack = lg->esp1; 75 virtstack = cpu->esp1;
76 ss = lg->ss1; 76 ss = cpu->ss1;
77 77
78 origstack = gstack = guest_pa(lg, virtstack); 78 origstack = gstack = guest_pa(cpu, virtstack);
79 /* We push the old stack segment and pointer onto the new 79 /* We push the old stack segment and pointer onto the new
80 * stack: when the Guest does an "iret" back from the interrupt 80 * stack: when the Guest does an "iret" back from the interrupt
81 * handler the CPU will notice they're dropping privilege 81 * handler the CPU will notice they're dropping privilege
82 * levels and expect these here. */ 82 * levels and expect these here. */
83 push_guest_stack(lg, &gstack, lg->regs->ss); 83 push_guest_stack(cpu, &gstack, cpu->regs->ss);
84 push_guest_stack(lg, &gstack, lg->regs->esp); 84 push_guest_stack(cpu, &gstack, cpu->regs->esp);
85 } else { 85 } else {
86 /* We're staying on the same Guest (kernel) stack. */ 86 /* We're staying on the same Guest (kernel) stack. */
87 virtstack = lg->regs->esp; 87 virtstack = cpu->regs->esp;
88 ss = lg->regs->ss; 88 ss = cpu->regs->ss;
89 89
90 origstack = gstack = guest_pa(lg, virtstack); 90 origstack = gstack = guest_pa(cpu, virtstack);
91 } 91 }
92 92
93 /* Remember that we never let the Guest actually disable interrupts, so 93 /* Remember that we never let the Guest actually disable interrupts, so
94 * the "Interrupt Flag" bit is always set. We copy that bit from the 94 * the "Interrupt Flag" bit is always set. We copy that bit from the
95 * Guest's "irq_enabled" field into the eflags word: we saw the Guest 95 * Guest's "irq_enabled" field into the eflags word: we saw the Guest
96 * copy it back in "lguest_iret". */ 96 * copy it back in "lguest_iret". */
97 eflags = lg->regs->eflags; 97 eflags = cpu->regs->eflags;
98 if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 98 if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0
99 && !(irq_enable & X86_EFLAGS_IF)) 99 && !(irq_enable & X86_EFLAGS_IF))
100 eflags &= ~X86_EFLAGS_IF; 100 eflags &= ~X86_EFLAGS_IF;
101 101
102 /* An interrupt is expected to push three things on the stack: the old 102 /* An interrupt is expected to push three things on the stack: the old
103 * "eflags" word, the old code segment, and the old instruction 103 * "eflags" word, the old code segment, and the old instruction
104 * pointer. */ 104 * pointer. */
105 push_guest_stack(lg, &gstack, eflags); 105 push_guest_stack(cpu, &gstack, eflags);
106 push_guest_stack(lg, &gstack, lg->regs->cs); 106 push_guest_stack(cpu, &gstack, cpu->regs->cs);
107 push_guest_stack(lg, &gstack, lg->regs->eip); 107 push_guest_stack(cpu, &gstack, cpu->regs->eip);
108 108
109 /* For the six traps which supply an error code, we push that, too. */ 109 /* For the six traps which supply an error code, we push that, too. */
110 if (has_err) 110 if (has_err)
111 push_guest_stack(lg, &gstack, lg->regs->errcode); 111 push_guest_stack(cpu, &gstack, cpu->regs->errcode);
112 112
113 /* Now we've pushed all the old state, we change the stack, the code 113 /* Now we've pushed all the old state, we change the stack, the code
114 * segment and the address to execute. */ 114 * segment and the address to execute. */
115 lg->regs->ss = ss; 115 cpu->regs->ss = ss;
116 lg->regs->esp = virtstack + (gstack - origstack); 116 cpu->regs->esp = virtstack + (gstack - origstack);
117 lg->regs->cs = (__KERNEL_CS|GUEST_PL); 117 cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
118 lg->regs->eip = idt_address(lo, hi); 118 cpu->regs->eip = idt_address(lo, hi);
119 119
120 /* There are two kinds of interrupt handlers: 0xE is an "interrupt 120 /* There are two kinds of interrupt handlers: 0xE is an "interrupt
121 * gate" which expects interrupts to be disabled on entry. */ 121 * gate" which expects interrupts to be disabled on entry. */
122 if (idt_type(lo, hi) == 0xE) 122 if (idt_type(lo, hi) == 0xE)
123 if (put_user(0, &lg->lguest_data->irq_enabled)) 123 if (put_user(0, &cpu->lg->lguest_data->irq_enabled))
124 kill_guest(lg, "Disabling interrupts"); 124 kill_guest(cpu, "Disabling interrupts");
125} 125}
126 126
127/*H:205 127/*H:205
@@ -129,23 +129,23 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
129 * 129 *
130 * maybe_do_interrupt() gets called before every entry to the Guest, to see if 130 * maybe_do_interrupt() gets called before every entry to the Guest, to see if
131 * we should divert the Guest to running an interrupt handler. */ 131 * we should divert the Guest to running an interrupt handler. */
132void maybe_do_interrupt(struct lguest *lg) 132void maybe_do_interrupt(struct lg_cpu *cpu)
133{ 133{
134 unsigned int irq; 134 unsigned int irq;
135 DECLARE_BITMAP(blk, LGUEST_IRQS); 135 DECLARE_BITMAP(blk, LGUEST_IRQS);
136 struct desc_struct *idt; 136 struct desc_struct *idt;
137 137
138 /* If the Guest hasn't even initialized yet, we can do nothing. */ 138 /* If the Guest hasn't even initialized yet, we can do nothing. */
139 if (!lg->lguest_data) 139 if (!cpu->lg->lguest_data)
140 return; 140 return;
141 141
142 /* Take our "irqs_pending" array and remove any interrupts the Guest 142 /* Take our "irqs_pending" array and remove any interrupts the Guest
143 * wants blocked: the result ends up in "blk". */ 143 * wants blocked: the result ends up in "blk". */
144 if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts, 144 if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
145 sizeof(blk))) 145 sizeof(blk)))
146 return; 146 return;
147 147
148 bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); 148 bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
149 149
150 /* Find the first interrupt. */ 150 /* Find the first interrupt. */
151 irq = find_first_bit(blk, LGUEST_IRQS); 151 irq = find_first_bit(blk, LGUEST_IRQS);
@@ -155,19 +155,20 @@ void maybe_do_interrupt(struct lguest *lg)
155 155
156 /* They may be in the middle of an iret, where they asked us never to 156 /* They may be in the middle of an iret, where they asked us never to
157 * deliver interrupts. */ 157 * deliver interrupts. */
158 if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) 158 if (cpu->regs->eip >= cpu->lg->noirq_start &&
159 (cpu->regs->eip < cpu->lg->noirq_end))
159 return; 160 return;
160 161
161 /* If they're halted, interrupts restart them. */ 162 /* If they're halted, interrupts restart them. */
162 if (lg->halted) { 163 if (cpu->halted) {
163 /* Re-enable interrupts. */ 164 /* Re-enable interrupts. */
164 if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled)) 165 if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled))
165 kill_guest(lg, "Re-enabling interrupts"); 166 kill_guest(cpu, "Re-enabling interrupts");
166 lg->halted = 0; 167 cpu->halted = 0;
167 } else { 168 } else {
168 /* Otherwise we check if they have interrupts disabled. */ 169 /* Otherwise we check if they have interrupts disabled. */
169 u32 irq_enabled; 170 u32 irq_enabled;
170 if (get_user(irq_enabled, &lg->lguest_data->irq_enabled)) 171 if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
171 irq_enabled = 0; 172 irq_enabled = 0;
172 if (!irq_enabled) 173 if (!irq_enabled)
173 return; 174 return;
@@ -176,15 +177,15 @@ void maybe_do_interrupt(struct lguest *lg)
176 /* Look at the IDT entry the Guest gave us for this interrupt. The 177 /* Look at the IDT entry the Guest gave us for this interrupt. The
177 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip 178 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
178 * over them. */ 179 * over them. */
179 idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; 180 idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
180 /* If they don't have a handler (yet?), we just ignore it */ 181 /* If they don't have a handler (yet?), we just ignore it */
181 if (idt_present(idt->a, idt->b)) { 182 if (idt_present(idt->a, idt->b)) {
182 /* OK, mark it no longer pending and deliver it. */ 183 /* OK, mark it no longer pending and deliver it. */
183 clear_bit(irq, lg->irqs_pending); 184 clear_bit(irq, cpu->irqs_pending);
184 /* set_guest_interrupt() takes the interrupt descriptor and a 185 /* set_guest_interrupt() takes the interrupt descriptor and a
185 * flag to say whether this interrupt pushes an error code onto 186 * flag to say whether this interrupt pushes an error code onto
186 * the stack as well: virtual interrupts never do. */ 187 * the stack as well: virtual interrupts never do. */
187 set_guest_interrupt(lg, idt->a, idt->b, 0); 188 set_guest_interrupt(cpu, idt->a, idt->b, 0);
188 } 189 }
189 190
190 /* Every time we deliver an interrupt, we update the timestamp in the 191 /* Every time we deliver an interrupt, we update the timestamp in the
@@ -192,7 +193,7 @@ void maybe_do_interrupt(struct lguest *lg)
192 * did this more often, but it can actually be quite slow: doing it 193 * did this more often, but it can actually be quite slow: doing it
193 * here is a compromise which means at least it gets updated every 194 * here is a compromise which means at least it gets updated every
194 * timer interrupt. */ 195 * timer interrupt. */
195 write_timestamp(lg); 196 write_timestamp(cpu);
196} 197}
197/*:*/ 198/*:*/
198 199
@@ -245,19 +246,19 @@ static int has_err(unsigned int trap)
245} 246}
246 247
247/* deliver_trap() returns true if it could deliver the trap. */ 248/* deliver_trap() returns true if it could deliver the trap. */
248int deliver_trap(struct lguest *lg, unsigned int num) 249int deliver_trap(struct lg_cpu *cpu, unsigned int num)
249{ 250{
250 /* Trap numbers are always 8 bit, but we set an impossible trap number 251 /* Trap numbers are always 8 bit, but we set an impossible trap number
251 * for traps inside the Switcher, so check that here. */ 252 * for traps inside the Switcher, so check that here. */
252 if (num >= ARRAY_SIZE(lg->arch.idt)) 253 if (num >= ARRAY_SIZE(cpu->arch.idt))
253 return 0; 254 return 0;
254 255
255 /* Early on the Guest hasn't set the IDT entries (or maybe it put a 256 /* Early on the Guest hasn't set the IDT entries (or maybe it put a
256 * bogus one in): if we fail here, the Guest will be killed. */ 257 * bogus one in): if we fail here, the Guest will be killed. */
257 if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b)) 258 if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
258 return 0; 259 return 0;
259 set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, 260 set_guest_interrupt(cpu, cpu->arch.idt[num].a,
260 has_err(num)); 261 cpu->arch.idt[num].b, has_err(num));
261 return 1; 262 return 1;
262} 263}
263 264
@@ -309,18 +310,18 @@ static int direct_trap(unsigned int num)
309 * the Guest. 310 * the Guest.
310 * 311 *
311 * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ 312 * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */
312void pin_stack_pages(struct lguest *lg) 313void pin_stack_pages(struct lg_cpu *cpu)
313{ 314{
314 unsigned int i; 315 unsigned int i;
315 316
316 /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or 317 /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or
317 * two pages of stack space. */ 318 * two pages of stack space. */
318 for (i = 0; i < lg->stack_pages; i++) 319 for (i = 0; i < cpu->lg->stack_pages; i++)
319 /* The stack grows *upwards*, so the address we're given is the 320 /* The stack grows *upwards*, so the address we're given is the
320 * start of the page after the kernel stack. Subtract one to 321 * start of the page after the kernel stack. Subtract one to
321 * get back onto the first stack page, and keep subtracting to 322 * get back onto the first stack page, and keep subtracting to
322 * get to the rest of the stack pages. */ 323 * get to the rest of the stack pages. */
323 pin_page(lg, lg->esp1 - 1 - i * PAGE_SIZE); 324 pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE);
324} 325}
325 326
326/* Direct traps also mean that we need to know whenever the Guest wants to use 327/* Direct traps also mean that we need to know whenever the Guest wants to use
@@ -331,21 +332,21 @@ void pin_stack_pages(struct lguest *lg)
331 * 332 *
332 * In Linux each process has its own kernel stack, so this happens a lot: we 333 * In Linux each process has its own kernel stack, so this happens a lot: we
333 * change stacks on each context switch. */ 334 * change stacks on each context switch. */
334void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) 335void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
335{ 336{
336 /* You are not allowed to have a stack segment with privilege level 0: bad 337 /* You are not allowed to have a stack segment with privilege level 0: bad
337 * Guest! */ 338 * Guest! */
338 if ((seg & 0x3) != GUEST_PL) 339 if ((seg & 0x3) != GUEST_PL)
339 kill_guest(lg, "bad stack segment %i", seg); 340 kill_guest(cpu, "bad stack segment %i", seg);
340 /* We only expect one or two stack pages. */ 341 /* We only expect one or two stack pages. */
341 if (pages > 2) 342 if (pages > 2)
342 kill_guest(lg, "bad stack pages %u", pages); 343 kill_guest(cpu, "bad stack pages %u", pages);
343 /* Save where the stack is, and how many pages */ 344 /* Save where the stack is, and how many pages */
344 lg->ss1 = seg; 345 cpu->ss1 = seg;
345 lg->esp1 = esp; 346 cpu->esp1 = esp;
346 lg->stack_pages = pages; 347 cpu->lg->stack_pages = pages;
347 /* Make sure the new stack pages are mapped */ 348 /* Make sure the new stack pages are mapped */
348 pin_stack_pages(lg); 349 pin_stack_pages(cpu);
349} 350}
350 351
351/* All this reference to mapping stacks leads us neatly into the other complex 352/* All this reference to mapping stacks leads us neatly into the other complex
@@ -353,7 +354,7 @@ void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
353 354
354/*H:235 This is the routine which actually checks the Guest's IDT entry and 355/*H:235 This is the routine which actually checks the Guest's IDT entry and
355 * transfers it into the entry in "struct lguest": */ 356 * transfers it into the entry in "struct lguest": */
356static void set_trap(struct lguest *lg, struct desc_struct *trap, 357static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
357 unsigned int num, u32 lo, u32 hi) 358 unsigned int num, u32 lo, u32 hi)
358{ 359{
359 u8 type = idt_type(lo, hi); 360 u8 type = idt_type(lo, hi);
@@ -366,7 +367,7 @@ static void set_trap(struct lguest *lg, struct desc_struct *trap,
366 367
367 /* We only support interrupt and trap gates. */ 368 /* We only support interrupt and trap gates. */
368 if (type != 0xE && type != 0xF) 369 if (type != 0xE && type != 0xF)
369 kill_guest(lg, "bad IDT type %i", type); 370 kill_guest(cpu, "bad IDT type %i", type);
370 371
371 /* We only copy the handler address, present bit, privilege level and 372 /* We only copy the handler address, present bit, privilege level and
372 * type. The privilege level controls where the trap can be triggered 373 * type. The privilege level controls where the trap can be triggered
@@ -383,7 +384,7 @@ static void set_trap(struct lguest *lg, struct desc_struct *trap,
383 * 384 *
384 * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the 385 * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
385 * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ 386 * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */
386void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) 387void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
387{ 388{
388 /* Guest never handles: NMI, doublefault, spurious interrupt or 389 /* Guest never handles: NMI, doublefault, spurious interrupt or
389 * hypercall. We ignore when it tries to set them. */ 390 * hypercall. We ignore when it tries to set them. */
@@ -392,13 +393,13 @@ void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
392 393
393 /* Mark the IDT as changed: next time the Guest runs we'll know we have 394 /* Mark the IDT as changed: next time the Guest runs we'll know we have
394 * to copy this again. */ 395 * to copy this again. */
395 lg->changed |= CHANGED_IDT; 396 cpu->changed |= CHANGED_IDT;
396 397
397 /* Check that the Guest doesn't try to step outside the bounds. */ 398 /* Check that the Guest doesn't try to step outside the bounds. */
398 if (num >= ARRAY_SIZE(lg->arch.idt)) 399 if (num >= ARRAY_SIZE(cpu->arch.idt))
399 kill_guest(lg, "Setting idt entry %u", num); 400 kill_guest(cpu, "Setting idt entry %u", num);
400 else 401 else
401 set_trap(lg, &lg->arch.idt[num], num, lo, hi); 402 set_trap(cpu, &cpu->arch.idt[num], num, lo, hi);
402} 403}
403 404
404/* The default entry for each interrupt points into the Switcher routines which 405/* The default entry for each interrupt points into the Switcher routines which
@@ -434,14 +435,14 @@ void setup_default_idt_entries(struct lguest_ro_state *state,
434/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead 435/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead
435 * we copy them into the IDT which we've set up for Guests on this CPU, just 436 * we copy them into the IDT which we've set up for Guests on this CPU, just
436 * before we run the Guest. This routine does that copy. */ 437 * before we run the Guest. This routine does that copy. */
437void copy_traps(const struct lguest *lg, struct desc_struct *idt, 438void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
438 const unsigned long *def) 439 const unsigned long *def)
439{ 440{
440 unsigned int i; 441 unsigned int i;
441 442
442 /* We can simply copy the direct traps, otherwise we use the default 443 /* We can simply copy the direct traps, otherwise we use the default
443 * ones in the Switcher: they will return to the Host. */ 444 * ones in the Switcher: they will return to the Host. */
444 for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) { 445 for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) {
445 /* If no Guest can ever override this trap, leave it alone. */ 446 /* If no Guest can ever override this trap, leave it alone. */
446 if (!direct_trap(i)) 447 if (!direct_trap(i))
447 continue; 448 continue;
@@ -450,8 +451,8 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
450 * Interrupt gates (type 14) disable interrupts as they are 451 * Interrupt gates (type 14) disable interrupts as they are
451 * entered, which we never let the Guest do. Not present 452 * entered, which we never let the Guest do. Not present
452 * entries (type 0x0) also can't go direct, of course. */ 453 * entries (type 0x0) also can't go direct, of course. */
453 if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF) 454 if (idt_type(cpu->arch.idt[i].a, cpu->arch.idt[i].b) == 0xF)
454 idt[i] = lg->arch.idt[i]; 455 idt[i] = cpu->arch.idt[i];
455 else 456 else
456 /* Reset it to the default. */ 457 /* Reset it to the default. */
457 default_idt_entry(&idt[i], i, def[i]); 458 default_idt_entry(&idt[i], i, def[i]);
@@ -470,13 +471,13 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
470 * infrastructure to set a callback at that time. 471 * infrastructure to set a callback at that time.
471 * 472 *
472 * 0 means "turn off the clock". */ 473 * 0 means "turn off the clock". */
473void guest_set_clockevent(struct lguest *lg, unsigned long delta) 474void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
474{ 475{
475 ktime_t expires; 476 ktime_t expires;
476 477
477 if (unlikely(delta == 0)) { 478 if (unlikely(delta == 0)) {
478 /* Clock event device is shutting down. */ 479 /* Clock event device is shutting down. */
479 hrtimer_cancel(&lg->hrt); 480 hrtimer_cancel(&cpu->hrt);
480 return; 481 return;
481 } 482 }
482 483
@@ -484,25 +485,25 @@ void guest_set_clockevent(struct lguest *lg, unsigned long delta)
484 * all the time between now and the timer interrupt it asked for. This 485 * all the time between now and the timer interrupt it asked for. This
485 * is almost always the right thing to do. */ 486 * is almost always the right thing to do. */
486 expires = ktime_add_ns(ktime_get_real(), delta); 487 expires = ktime_add_ns(ktime_get_real(), delta);
487 hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS); 488 hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS);
488} 489}
489 490
490/* This is the function called when the Guest's timer expires. */ 491/* This is the function called when the Guest's timer expires. */
491static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) 492static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
492{ 493{
493 struct lguest *lg = container_of(timer, struct lguest, hrt); 494 struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
494 495
495 /* Remember the first interrupt is the timer interrupt. */ 496 /* Remember the first interrupt is the timer interrupt. */
496 set_bit(0, lg->irqs_pending); 497 set_bit(0, cpu->irqs_pending);
497 /* If the Guest is actually stopped, we need to wake it up. */ 498 /* If the Guest is actually stopped, we need to wake it up. */
498 if (lg->halted) 499 if (cpu->halted)
499 wake_up_process(lg->tsk); 500 wake_up_process(cpu->tsk);
500 return HRTIMER_NORESTART; 501 return HRTIMER_NORESTART;
501} 502}
502 503
503/* This sets up the timer for this Guest. */ 504/* This sets up the timer for this Guest. */
504void init_clockdev(struct lguest *lg) 505void init_clockdev(struct lg_cpu *cpu)
505{ 506{
506 hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); 507 hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
507 lg->hrt.function = clockdev_fn; 508 cpu->hrt.function = clockdev_fn;
508} 509}
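
For illustration only (not part of this patch): with the timer now living in struct lg_cpu, the hypercall path can hand the Guest's requested delta straight to guest_set_clockevent() with the vCPU in hand. A minimal sketch, assuming the x86 hcall_args layout carries the delta in arg1; the function name is made up:

/* Sketch: driving the per-vCPU clock from a hypercall handler. */
static void set_clockevent_hcall_sketch(struct lg_cpu *cpu,
                                        struct hcall_args *args)
{
        /* arg1 is the nanosecond delta; 0 means "turn off the clock". */
        guest_set_clockevent(cpu, args->arg1);
}
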
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 86924891b5eb..2337e1a06f02 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -8,6 +8,7 @@
8#include <linux/lguest.h> 8#include <linux/lguest.h>
9#include <linux/lguest_launcher.h> 9#include <linux/lguest_launcher.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hrtimer.h>
11#include <linux/err.h> 12#include <linux/err.h>
12#include <asm/semaphore.h> 13#include <asm/semaphore.h>
13 14
@@ -38,58 +39,72 @@ struct lguest_pages
38#define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */ 39#define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */
39#define CHANGED_ALL 3 40#define CHANGED_ALL 3
40 41
41/* The private info the thread maintains about the guest. */ 42struct lguest;
42struct lguest 43
43{ 44struct lg_cpu {
44 /* At end of a page shared mapped over lguest_pages in guest. */ 45 unsigned int id;
45 unsigned long regs_page; 46 struct lguest *lg;
46 struct lguest_regs *regs;
47 struct lguest_data __user *lguest_data;
48 struct task_struct *tsk; 47 struct task_struct *tsk;
49 struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ 48 struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
50 u32 pfn_limit; 49
51 /* This provides the offset to the base of guest-physical
52 * memory in the Launcher. */
53 void __user *mem_base;
54 unsigned long kernel_address;
55 u32 cr2; 50 u32 cr2;
56 int halted;
57 int ts; 51 int ts;
58 u32 next_hcall;
59 u32 esp1; 52 u32 esp1;
60 u8 ss1; 53 u8 ss1;
61 54
55 /* Bitmap of what has changed: see CHANGED_* above. */
56 int changed;
57
58 unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
59
60 /* At end of a page shared mapped over lguest_pages in guest. */
61 unsigned long regs_page;
62 struct lguest_regs *regs;
63
64 struct lguest_pages *last_pages;
65
66 int cpu_pgd; /* which pgd this cpu is currently using */
67
62 /* If a hypercall was asked for, this points to the arguments. */ 68 /* If a hypercall was asked for, this points to the arguments. */
63 struct hcall_args *hcall; 69 struct hcall_args *hcall;
70 u32 next_hcall;
71
72 /* Virtual clock device */
73 struct hrtimer hrt;
64 74
65 /* Do we need to stop what we're doing and return to userspace? */ 75 /* Do we need to stop what we're doing and return to userspace? */
66 int break_out; 76 int break_out;
67 wait_queue_head_t break_wq; 77 wait_queue_head_t break_wq;
78 int halted;
68 79
69 /* Bitmap of what has changed: see CHANGED_* above. */ 80 /* Pending virtual interrupts */
70 int changed; 81 DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
71 struct lguest_pages *last_pages; 82
83 struct lg_cpu_arch arch;
84};
85
86/* The private info the thread maintains about the guest. */
87struct lguest
88{
89 struct lguest_data __user *lguest_data;
90 struct lg_cpu cpus[NR_CPUS];
91 unsigned int nr_cpus;
92
93 u32 pfn_limit;
94 /* This provides the offset to the base of guest-physical
95 * memory in the Launcher. */
96 void __user *mem_base;
97 unsigned long kernel_address;
72 98
73 /* We keep a small number of these. */
74 u32 pgdidx;
75 struct pgdir pgdirs[4]; 99 struct pgdir pgdirs[4];
76 100
77 unsigned long noirq_start, noirq_end; 101 unsigned long noirq_start, noirq_end;
78 unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
79 102
80 unsigned int stack_pages; 103 unsigned int stack_pages;
81 u32 tsc_khz; 104 u32 tsc_khz;
82 105
83 /* Dead? */ 106 /* Dead? */
84 const char *dead; 107 const char *dead;
85
86 struct lguest_arch arch;
87
88 /* Virtual clock device */
89 struct hrtimer hrt;
90
91 /* Pending virtual interrupts */
92 DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
93}; 108};
94 109
95extern struct mutex lguest_lock; 110extern struct mutex lguest_lock;
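
To make the split above concrete, here is a minimal sketch (not part of the patch) of how code now reaches per-vCPU state directly and Guest-wide state through cpu->lg; the helper name is illustrative only:

/* Sketch: per-vCPU state lives in lg_cpu, shared state sits behind cpu->lg. */
static bool cpu_needs_attention_sketch(const struct lg_cpu *cpu)
{
        /* The halt flag and interrupt bitmap are now per vCPU... */
        if (cpu->halted || !bitmap_empty(cpu->irqs_pending, LGUEST_IRQS))
                return true;
        /* ...while the "dead" marker still belongs to the whole Guest. */
        return cpu->lg->dead != NULL;
}
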
@@ -97,26 +112,26 @@ extern struct mutex lguest_lock;
97/* core.c: */ 112/* core.c: */
98int lguest_address_ok(const struct lguest *lg, 113int lguest_address_ok(const struct lguest *lg,
99 unsigned long addr, unsigned long len); 114 unsigned long addr, unsigned long len);
100void __lgread(struct lguest *, void *, unsigned long, unsigned); 115void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
101void __lgwrite(struct lguest *, unsigned long, const void *, unsigned); 116void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
102 117
103/*H:035 Using memory-copy operations like that is usually inconvenient, so we 118/*H:035 Using memory-copy operations like that is usually inconvenient, so we
104 * have the following helper macros which read and write a specific type (often 119 * have the following helper macros which read and write a specific type (often
105 * an unsigned long). 120 * an unsigned long).
106 * 121 *
107 * This reads into a variable of the given type then returns that. */ 122 * This reads into a variable of the given type then returns that. */
108#define lgread(lg, addr, type) \ 123#define lgread(cpu, addr, type) \
109 ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; }) 124 ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
110 125
111/* This checks that the variable is of the given type, then writes it out. */ 126/* This checks that the variable is of the given type, then writes it out. */
112#define lgwrite(lg, addr, type, val) \ 127#define lgwrite(cpu, addr, type, val) \
113 do { \ 128 do { \
114 typecheck(type, val); \ 129 typecheck(type, val); \
115 __lgwrite((lg), (addr), &(val), sizeof(val)); \ 130 __lgwrite((cpu), (addr), &(val), sizeof(val)); \
116 } while(0) 131 } while(0)
117/* (end of memory access helper routines) :*/ 132/* (end of memory access helper routines) :*/
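
A usage sketch (not from the patch) of the re-typed helpers: the vCPU is passed in and the Guest's memory base is found through it, so a read-modify-write of Guest page-table entries now looks like this; the function name is illustrative only:

/* Sketch: read a Guest pgd entry, then write a pte back, per-vCPU style. */
static void rmw_guest_pte_sketch(struct lg_cpu *cpu, unsigned long gpgd_ptr,
                                 unsigned long gpte_ptr, pte_t gpte)
{
        pgd_t gpgd = lgread(cpu, gpgd_ptr, pgd_t);

        if (pgd_flags(gpgd) & _PAGE_PRESENT)
                lgwrite(cpu, gpte_ptr, pte_t, gpte);
}
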
118 133
119int run_guest(struct lguest *lg, unsigned long __user *user); 134int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
120 135
121/* Helper macros to obtain the first 12 or the last 20 bits, this is only the 136/* Helper macros to obtain the first 12 or the last 20 bits, this is only the
122 * first step in the migration to the kernel types. pte_pfn is already defined 137 * first step in the migration to the kernel types. pte_pfn is already defined
@@ -126,52 +141,53 @@ int run_guest(struct lguest *lg, unsigned long __user *user);
126#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) 141#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
127 142
128/* interrupts_and_traps.c: */ 143/* interrupts_and_traps.c: */
129void maybe_do_interrupt(struct lguest *lg); 144void maybe_do_interrupt(struct lg_cpu *cpu);
130int deliver_trap(struct lguest *lg, unsigned int num); 145int deliver_trap(struct lg_cpu *cpu, unsigned int num);
131void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi); 146void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
132void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages); 147 u32 low, u32 hi);
133void pin_stack_pages(struct lguest *lg); 148void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
149void pin_stack_pages(struct lg_cpu *cpu);
134void setup_default_idt_entries(struct lguest_ro_state *state, 150void setup_default_idt_entries(struct lguest_ro_state *state,
135 const unsigned long *def); 151 const unsigned long *def);
136void copy_traps(const struct lguest *lg, struct desc_struct *idt, 152void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
137 const unsigned long *def); 153 const unsigned long *def);
138void guest_set_clockevent(struct lguest *lg, unsigned long delta); 154void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
139void init_clockdev(struct lguest *lg); 155void init_clockdev(struct lg_cpu *cpu);
140bool check_syscall_vector(struct lguest *lg); 156bool check_syscall_vector(struct lguest *lg);
141int init_interrupts(void); 157int init_interrupts(void);
142void free_interrupts(void); 158void free_interrupts(void);
143 159
144/* segments.c: */ 160/* segments.c: */
145void setup_default_gdt_entries(struct lguest_ro_state *state); 161void setup_default_gdt_entries(struct lguest_ro_state *state);
146void setup_guest_gdt(struct lguest *lg); 162void setup_guest_gdt(struct lg_cpu *cpu);
147void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num); 163void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num);
148void guest_load_tls(struct lguest *lg, unsigned long tls_array); 164void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
149void copy_gdt(const struct lguest *lg, struct desc_struct *gdt); 165void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
150void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt); 166void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
151 167
152/* page_tables.c: */ 168/* page_tables.c: */
153int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); 169int init_guest_pagetable(struct lguest *lg, unsigned long pgtable);
154void free_guest_pagetable(struct lguest *lg); 170void free_guest_pagetable(struct lguest *lg);
155void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); 171void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
156void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); 172void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
157void guest_pagetable_clear_all(struct lguest *lg); 173void guest_pagetable_clear_all(struct lg_cpu *cpu);
158void guest_pagetable_flush_user(struct lguest *lg); 174void guest_pagetable_flush_user(struct lg_cpu *cpu);
159void guest_set_pte(struct lguest *lg, unsigned long gpgdir, 175void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
160 unsigned long vaddr, pte_t val); 176 unsigned long vaddr, pte_t val);
161void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); 177void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
162int demand_page(struct lguest *info, unsigned long cr2, int errcode); 178int demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode);
163void pin_page(struct lguest *lg, unsigned long vaddr); 179void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
164unsigned long guest_pa(struct lguest *lg, unsigned long vaddr); 180unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
165void page_table_guest_data_init(struct lguest *lg); 181void page_table_guest_data_init(struct lg_cpu *cpu);
166 182
167/* <arch>/core.c: */ 183/* <arch>/core.c: */
168void lguest_arch_host_init(void); 184void lguest_arch_host_init(void);
169void lguest_arch_host_fini(void); 185void lguest_arch_host_fini(void);
170void lguest_arch_run_guest(struct lguest *lg); 186void lguest_arch_run_guest(struct lg_cpu *cpu);
171void lguest_arch_handle_trap(struct lguest *lg); 187void lguest_arch_handle_trap(struct lg_cpu *cpu);
172int lguest_arch_init_hypercalls(struct lguest *lg); 188int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
173int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args); 189int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
174void lguest_arch_setup_regs(struct lguest *lg, unsigned long start); 190void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
175 191
176/* <arch>/switcher.S: */ 192/* <arch>/switcher.S: */
177extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; 193extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
@@ -181,8 +197,8 @@ int lguest_device_init(void);
181void lguest_device_remove(void); 197void lguest_device_remove(void);
182 198
183/* hypercalls.c: */ 199/* hypercalls.c: */
184void do_hypercalls(struct lguest *lg); 200void do_hypercalls(struct lg_cpu *cpu);
185void write_timestamp(struct lguest *lg); 201void write_timestamp(struct lg_cpu *cpu);
186 202
187/*L:035 203/*L:035
188 * Let's step aside for the moment, to study one important routine that's used 204 * Let's step aside for the moment, to study one important routine that's used
@@ -208,12 +224,12 @@ void write_timestamp(struct lguest *lg);
208 * Like any macro which uses an "if", it is safely wrapped in a run-once "do { 224 * Like any macro which uses an "if", it is safely wrapped in a run-once "do {
209 * } while(0)". 225 * } while(0)".
210 */ 226 */
211#define kill_guest(lg, fmt...) \ 227#define kill_guest(cpu, fmt...) \
212do { \ 228do { \
213 if (!(lg)->dead) { \ 229 if (!(cpu)->lg->dead) { \
214 (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \ 230 (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \
215 if (!(lg)->dead) \ 231 if (!(cpu)->lg->dead) \
216 (lg)->dead = ERR_PTR(-ENOMEM); \ 232 (cpu)->lg->dead = ERR_PTR(-ENOMEM); \
217 } \ 233 } \
218} while(0) 234} while(0)
219/* (End of aside) :*/ 235/* (End of aside) :*/
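
As a quick usage sketch (not part of the patch): callers now pass the vCPU and the macro reaches the shared "dead" field through cpu->lg, so a typical range check reads:

/* Sketch: killing the Guest from a per-vCPU context. */
static void check_guest_range_sketch(struct lg_cpu *cpu,
                                     unsigned long addr, unsigned long len)
{
        if (!lguest_address_ok(cpu->lg, addr, len))
                kill_guest(cpu, "bad guest range %#lx+%lu", addr, len);
}
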
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 3b92a61ba8d2..85d42d3d01a9 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -6,6 +6,7 @@
6#include <linux/uaccess.h> 6#include <linux/uaccess.h>
7#include <linux/miscdevice.h> 7#include <linux/miscdevice.h>
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/sched.h>
9#include "lg.h" 10#include "lg.h"
10 11
11/*L:055 When something happens, the Waker process needs a way to stop the 12/*L:055 When something happens, the Waker process needs a way to stop the
@@ -13,7 +14,7 @@
13 * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher 14 * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher
14 * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release 15 * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
15 * the Waker. */ 16 * the Waker. */
16static int break_guest_out(struct lguest *lg, const unsigned long __user *input) 17static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
17{ 18{
18 unsigned long on; 19 unsigned long on;
19 20
@@ -22,21 +23,21 @@ static int break_guest_out(struct lguest *lg, const unsigned long __user *input)
22 return -EFAULT; 23 return -EFAULT;
23 24
24 if (on) { 25 if (on) {
25 lg->break_out = 1; 26 cpu->break_out = 1;
26 /* Pop it out of the Guest (may be running on different CPU) */ 27 /* Pop it out of the Guest (may be running on different CPU) */
27 wake_up_process(lg->tsk); 28 wake_up_process(cpu->tsk);
28 /* Wait for them to reset it */ 29 /* Wait for them to reset it */
29 return wait_event_interruptible(lg->break_wq, !lg->break_out); 30 return wait_event_interruptible(cpu->break_wq, !cpu->break_out);
30 } else { 31 } else {
31 lg->break_out = 0; 32 cpu->break_out = 0;
32 wake_up(&lg->break_wq); 33 wake_up(&cpu->break_wq);
33 return 0; 34 return 0;
34 } 35 }
35} 36}
36 37
37/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 38/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
38 * number to /dev/lguest. */ 39 * number to /dev/lguest. */
39static int user_send_irq(struct lguest *lg, const unsigned long __user *input) 40static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
40{ 41{
41 unsigned long irq; 42 unsigned long irq;
42 43
@@ -46,7 +47,7 @@ static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
46 return -EINVAL; 47 return -EINVAL;
47 /* Next time the Guest runs, the core code will see if it can deliver 48 /* Next time the Guest runs, the core code will see if it can deliver
48 * this interrupt. */ 49 * this interrupt. */
49 set_bit(irq, lg->irqs_pending); 50 set_bit(irq, cpu->irqs_pending);
50 return 0; 51 return 0;
51} 52}
52 53
@@ -55,13 +56,21 @@ static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
55static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) 56static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
56{ 57{
57 struct lguest *lg = file->private_data; 58 struct lguest *lg = file->private_data;
59 struct lg_cpu *cpu;
60 unsigned int cpu_id = *o;
58 61
59 /* You must write LHREQ_INITIALIZE first! */ 62 /* You must write LHREQ_INITIALIZE first! */
60 if (!lg) 63 if (!lg)
61 return -EINVAL; 64 return -EINVAL;
62 65
66 /* Watch out for arbitrary vcpu indexes! */
67 if (cpu_id >= lg->nr_cpus)
68 return -EINVAL;
69
70 cpu = &lg->cpus[cpu_id];
71
63 /* If you're not the task which owns the Guest, go away. */ 72 /* If you're not the task which owns the Guest, go away. */
64 if (current != lg->tsk) 73 if (current != cpu->tsk)
65 return -EPERM; 74 return -EPERM;
66 75
67 /* If the guest is already dead, we indicate why */ 76 /* If the guest is already dead, we indicate why */
@@ -81,11 +90,53 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
81 90
82 /* If we returned from read() last time because the Guest notified, 91 /* If we returned from read() last time because the Guest notified,
83 * clear the flag. */ 92 * clear the flag. */
84 if (lg->pending_notify) 93 if (cpu->pending_notify)
85 lg->pending_notify = 0; 94 cpu->pending_notify = 0;
86 95
87 /* Run the Guest until something interesting happens. */ 96 /* Run the Guest until something interesting happens. */
88 return run_guest(lg, (unsigned long __user *)user); 97 return run_guest(cpu, (unsigned long __user *)user);
98}
99
100static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
101{
102 if (id >= NR_CPUS)
103 return -EINVAL;
104
105 cpu->id = id;
106 cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
107 cpu->lg->nr_cpus++;
108 init_clockdev(cpu);
109
110 /* We need a complete page for the Guest registers: they are accessible
111 * to the Guest and we can only grant it access to whole pages. */
112 cpu->regs_page = get_zeroed_page(GFP_KERNEL);
113 if (!cpu->regs_page)
114 return -ENOMEM;
115
116 /* We actually put the registers at the bottom of the page. */
117 cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
118
119 /* Now we initialize the Guest's registers, handing it the start
120 * address. */
121 lguest_arch_setup_regs(cpu, start_ip);
122
123 /* Initialize the queue for the waker to wait on */
124 init_waitqueue_head(&cpu->break_wq);
125
126 /* We keep a pointer to the Launcher task (ie. current task) for when
127 * other Guests want to wake this one (inter-Guest I/O). */
128 cpu->tsk = current;
129
130 /* We need to keep a pointer to the Launcher's memory map, because if
131 * the Launcher dies we need to clean it up. If we don't keep a
132 * reference, it is destroyed before close() is called. */
133 cpu->mm = get_task_mm(cpu->tsk);
134
135 /* We remember which CPU's pages this Guest used last, for optimization
136 * when the same Guest runs on the same CPU twice. */
137 cpu->last_pages = NULL;
138
139 return 0;
89} 140}
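
A hypothetical follow-on sketch (this patch itself only ever starts cpu 0 from initialize()): bringing up an additional vCPU would amount to picking the next free slot and handing it to lg_cpu_start() with the Guest's entry point:

/* Sketch only: what a future "add vcpu" request could boil down to. */
static int start_secondary_vcpu_sketch(struct lguest *lg,
                                       unsigned long start_ip)
{
        unsigned int id = lg->nr_cpus;

        if (id >= NR_CPUS)
                return -EINVAL;
        return lg_cpu_start(&lg->cpus[id], id, start_ip);
}
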
90 141
91/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit) 142/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit)
@@ -134,15 +185,10 @@ static int initialize(struct file *file, const unsigned long __user *input)
134 lg->mem_base = (void __user *)(long)args[0]; 185 lg->mem_base = (void __user *)(long)args[0];
135 lg->pfn_limit = args[1]; 186 lg->pfn_limit = args[1];
136 187
137 /* We need a complete page for the Guest registers: they are accessible 188 /* This is the first cpu */
138 * to the Guest and we can only grant it access to whole pages. */ 189 err = lg_cpu_start(&lg->cpus[0], 0, args[3]);
139 lg->regs_page = get_zeroed_page(GFP_KERNEL); 190 if (err)
140 if (!lg->regs_page) {
141 err = -ENOMEM;
142 goto release_guest; 191 goto release_guest;
143 }
144 /* We actually put the registers at the bottom of the page. */
145 lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs);
146 192
147 /* Initialize the Guest's shadow page tables, using the toplevel 193 /* Initialize the Guest's shadow page tables, using the toplevel
148 * address the Launcher gave us. This allocates memory, so can 194 * address the Launcher gave us. This allocates memory, so can
@@ -151,28 +197,6 @@ static int initialize(struct file *file, const unsigned long __user *input)
151 if (err) 197 if (err)
152 goto free_regs; 198 goto free_regs;
153 199
154 /* Now we initialize the Guest's registers, handing it the start
155 * address. */
156 lguest_arch_setup_regs(lg, args[3]);
157
158 /* The timer for lguest's clock needs initialization. */
159 init_clockdev(lg);
160
161 /* We keep a pointer to the Launcher task (ie. current task) for when
162 * other Guests want to wake this one (inter-Guest I/O). */
163 lg->tsk = current;
164 /* We need to keep a pointer to the Launcher's memory map, because if
165 * the Launcher dies we need to clean it up. If we don't keep a
166 * reference, it is destroyed before close() is called. */
167 lg->mm = get_task_mm(lg->tsk);
168
169 /* Initialize the queue for the waker to wait on */
170 init_waitqueue_head(&lg->break_wq);
171
172 /* We remember which CPU's pages this Guest used last, for optimization
173 * when the same Guest runs on the same CPU twice. */
174 lg->last_pages = NULL;
175
176 /* We keep our "struct lguest" in the file's private_data. */ 200 /* We keep our "struct lguest" in the file's private_data. */
177 file->private_data = lg; 201 file->private_data = lg;
178 202
@@ -182,7 +206,8 @@ static int initialize(struct file *file, const unsigned long __user *input)
182 return sizeof(args); 206 return sizeof(args);
183 207
184free_regs: 208free_regs:
185 free_page(lg->regs_page); 209 /* FIXME: This should be in free_vcpu */
210 free_page(lg->cpus[0].regs_page);
186release_guest: 211release_guest:
187 kfree(lg); 212 kfree(lg);
188unlock: 213unlock:
@@ -202,30 +227,37 @@ static ssize_t write(struct file *file, const char __user *in,
202 struct lguest *lg = file->private_data; 227 struct lguest *lg = file->private_data;
203 const unsigned long __user *input = (const unsigned long __user *)in; 228 const unsigned long __user *input = (const unsigned long __user *)in;
204 unsigned long req; 229 unsigned long req;
230 struct lg_cpu *uninitialized_var(cpu);
231 unsigned int cpu_id = *off;
205 232
206 if (get_user(req, input) != 0) 233 if (get_user(req, input) != 0)
207 return -EFAULT; 234 return -EFAULT;
208 input++; 235 input++;
209 236
210 /* If you haven't initialized, you must do that first. */ 237 /* If you haven't initialized, you must do that first. */
211 if (req != LHREQ_INITIALIZE && !lg) 238 if (req != LHREQ_INITIALIZE) {
212 return -EINVAL; 239 if (!lg || (cpu_id >= lg->nr_cpus))
240 return -EINVAL;
241 cpu = &lg->cpus[cpu_id];
242 if (!cpu)
243 return -EINVAL;
244 }
213 245
214 /* Once the Guest is dead, all you can do is read() why it died. */ 246 /* Once the Guest is dead, all you can do is read() why it died. */
215 if (lg && lg->dead) 247 if (lg && lg->dead)
216 return -ENOENT; 248 return -ENOENT;
217 249
218 /* If you're not the task which owns the Guest, you can only break */ 250 /* If you're not the task which owns the Guest, you can only break */
219 if (lg && current != lg->tsk && req != LHREQ_BREAK) 251 if (lg && current != cpu->tsk && req != LHREQ_BREAK)
220 return -EPERM; 252 return -EPERM;
221 253
222 switch (req) { 254 switch (req) {
223 case LHREQ_INITIALIZE: 255 case LHREQ_INITIALIZE:
224 return initialize(file, input); 256 return initialize(file, input);
225 case LHREQ_IRQ: 257 case LHREQ_IRQ:
226 return user_send_irq(lg, input); 258 return user_send_irq(cpu, input);
227 case LHREQ_BREAK: 259 case LHREQ_BREAK:
228 return break_guest_out(lg, input); 260 return break_guest_out(cpu, input);
229 default: 261 default:
230 return -EINVAL; 262 return -EINVAL;
231 } 263 }
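
From the Launcher's side (a hypothetical user-space sketch, not from this patch): the vCPU index now rides in the file offset, so targeting vCPU 0 with an interrupt is simply a pwrite() at offset 0:

/* Sketch: inject an interrupt into vCPU 0 of a running Guest. */
#include <unistd.h>
#include <linux/lguest_launcher.h>

static int send_irq_to_vcpu0(int lguest_fd, unsigned long irq)
{
        unsigned long cmd[2] = { LHREQ_IRQ, irq };

        if (pwrite(lguest_fd, cmd, sizeof(cmd), 0 /* cpu_id */) != sizeof(cmd))
                return -1;
        return 0;
}
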
@@ -241,6 +273,7 @@ static ssize_t write(struct file *file, const char __user *in,
241static int close(struct inode *inode, struct file *file) 273static int close(struct inode *inode, struct file *file)
242{ 274{
243 struct lguest *lg = file->private_data; 275 struct lguest *lg = file->private_data;
276 unsigned int i;
244 277
245 /* If we never successfully initialized, there's nothing to clean up */ 278 /* If we never successfully initialized, there's nothing to clean up */
246 if (!lg) 279 if (!lg)
@@ -249,19 +282,23 @@ static int close(struct inode *inode, struct file *file)
249 /* We need the big lock, to protect from inter-guest I/O and other 282 /* We need the big lock, to protect from inter-guest I/O and other
250 * Launchers initializing guests. */ 283 * Launchers initializing guests. */
251 mutex_lock(&lguest_lock); 284 mutex_lock(&lguest_lock);
252 /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ 285
253 hrtimer_cancel(&lg->hrt);
254 /* Free up the shadow page tables for the Guest. */ 286 /* Free up the shadow page tables for the Guest. */
255 free_guest_pagetable(lg); 287 free_guest_pagetable(lg);
256 /* Now all the memory cleanups are done, it's safe to release the 288
257 * Launcher's memory management structure. */ 289 for (i = 0; i < lg->nr_cpus; i++) {
258 mmput(lg->mm); 290 /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
291 hrtimer_cancel(&lg->cpus[i].hrt);
292 /* We can free up the register page we allocated. */
293 free_page(lg->cpus[i].regs_page);
294 /* Now all the memory cleanups are done, it's safe to release
295 * the Launcher's memory management structure. */
296 mmput(lg->cpus[i].mm);
297 }
259 /* If lg->dead doesn't contain an error code it will be NULL or a 298 /* If lg->dead doesn't contain an error code it will be NULL or a
260 * kmalloc()ed string, either of which is ok to hand to kfree(). */ 299 * kmalloc()ed string, either of which is ok to hand to kfree(). */
261 if (!IS_ERR(lg->dead)) 300 if (!IS_ERR(lg->dead))
262 kfree(lg->dead); 301 kfree(lg->dead);
263 /* We can free up the register page we allocated. */
264 free_page(lg->regs_page);
265 /* We clear the entire structure, which also marks it as free for the 302 /* We clear the entire structure, which also marks it as free for the
266 * next user. */ 303 * next user. */
267 memset(lg, 0, sizeof(*lg)); 304 memset(lg, 0, sizeof(*lg));
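
The FIXME in the error path above hints at a dedicated per-vCPU teardown helper; a sketch of what such a helper could look like, mirroring the loop in close() (hypothetical, not in the patch):

/* Sketch: per-vCPU cleanup, the counterpart of lg_cpu_start(). */
static void lg_cpu_free_sketch(struct lg_cpu *cpu)
{
        /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
        hrtimer_cancel(&cpu->hrt);
        /* Release the register page and the Launcher's mm reference. */
        free_page(cpu->regs_page);
        if (cpu->mm)
                mmput(cpu->mm);
}
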
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index fffabb327157..74b4cf2a6c41 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -68,23 +68,23 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
68 * page directory entry (PGD) for that address. Since we keep track of several 68 * page directory entry (PGD) for that address. Since we keep track of several
69 * page tables, the "i" argument tells us which one we're interested in (it's 69 * page tables, the "i" argument tells us which one we're interested in (it's
70 * usually the current one). */ 70 * usually the current one). */
71static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) 71static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
72{ 72{
73 unsigned int index = pgd_index(vaddr); 73 unsigned int index = pgd_index(vaddr);
74 74
75 /* We kill any Guest trying to touch the Switcher addresses. */ 75 /* We kill any Guest trying to touch the Switcher addresses. */
76 if (index >= SWITCHER_PGD_INDEX) { 76 if (index >= SWITCHER_PGD_INDEX) {
77 kill_guest(lg, "attempt to access switcher pages"); 77 kill_guest(cpu, "attempt to access switcher pages");
78 index = 0; 78 index = 0;
79 } 79 }
 80 /* Return a pointer to the index'th pgd entry for the i'th page table. */ 80 /* Return a pointer to the index'th pgd entry for the i'th page table. */
81 return &lg->pgdirs[i].pgdir[index]; 81 return &cpu->lg->pgdirs[i].pgdir[index];
82} 82}
83 83
84/* This routine then takes the page directory entry returned above, which 84/* This routine then takes the page directory entry returned above, which
85 * contains the address of the page table entry (PTE) page. It then returns a 85 * contains the address of the page table entry (PTE) page. It then returns a
86 * pointer to the PTE entry for the given address. */ 86 * pointer to the PTE entry for the given address. */
87static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) 87static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr)
88{ 88{
89 pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); 89 pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
90 /* You should never call this if the PGD entry wasn't valid */ 90 /* You should never call this if the PGD entry wasn't valid */
@@ -94,14 +94,13 @@ static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
94 94
 95/* These two functions are just like the above two, except they access the Guest 95/* These two functions are just like the above two, except they access the Guest
96 * page tables. Hence they return a Guest address. */ 96 * page tables. Hence they return a Guest address. */
97static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) 97static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
98{ 98{
99 unsigned int index = vaddr >> (PGDIR_SHIFT); 99 unsigned int index = vaddr >> (PGDIR_SHIFT);
100 return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t); 100 return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
101} 101}
102 102
103static unsigned long gpte_addr(struct lguest *lg, 103static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
104 pgd_t gpgd, unsigned long vaddr)
105{ 104{
106 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; 105 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
107 BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); 106 BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
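
To make the arithmetic above concrete, a worked sketch assuming i386 without PAE (PGDIR_SHIFT 22, PAGE_SHIFT 12, PTRS_PER_PTE 1024): vaddr 0xc0101234 indexes pgd slot 0x300 and pte slot 0x101.

/* Sketch: the index math behind spgd_addr()/gpgd_addr()/gpte_addr(). */
static void index_math_sketch(void)
{
        unsigned long vaddr = 0xc0101234UL;
        unsigned int pgd_idx = vaddr >> PGDIR_SHIFT;                  /* 0x300 */
        unsigned int pte_idx = (vaddr >> PAGE_SHIFT) % PTRS_PER_PTE;  /* 0x101 */

        (void)pgd_idx;
        (void)pte_idx;
}
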
@@ -138,7 +137,7 @@ static unsigned long get_pfn(unsigned long virtpfn, int write)
138 * entry can be a little tricky. The flags are (almost) the same, but the 137 * entry can be a little tricky. The flags are (almost) the same, but the
139 * Guest PTE contains a virtual page number: the CPU needs the real page 138 * Guest PTE contains a virtual page number: the CPU needs the real page
140 * number. */ 139 * number. */
141static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) 140static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
142{ 141{
143 unsigned long pfn, base, flags; 142 unsigned long pfn, base, flags;
144 143
@@ -149,7 +148,7 @@ static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
149 flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); 148 flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
150 149
151 /* The Guest's pages are offset inside the Launcher. */ 150 /* The Guest's pages are offset inside the Launcher. */
152 base = (unsigned long)lg->mem_base / PAGE_SIZE; 151 base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
153 152
154 /* We need a temporary "unsigned long" variable to hold the answer from 153 /* We need a temporary "unsigned long" variable to hold the answer from
155 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't 154 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
@@ -157,7 +156,7 @@ static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
157 * page, given the virtual number. */ 156 * page, given the virtual number. */
158 pfn = get_pfn(base + pte_pfn(gpte), write); 157 pfn = get_pfn(base + pte_pfn(gpte), write);
159 if (pfn == -1UL) { 158 if (pfn == -1UL) {
160 kill_guest(lg, "failed to get page %lu", pte_pfn(gpte)); 159 kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
161 /* When we destroy the Guest, we'll go through the shadow page 160 /* When we destroy the Guest, we'll go through the shadow page
162 * tables and release_pte() them. Make sure we don't think 161 * tables and release_pte() them. Make sure we don't think
163 * this one is valid! */ 162 * this one is valid! */
@@ -177,17 +176,18 @@ static void release_pte(pte_t pte)
177} 176}
178/*:*/ 177/*:*/
179 178
180static void check_gpte(struct lguest *lg, pte_t gpte) 179static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
181{ 180{
182 if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) 181 if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE))
183 || pte_pfn(gpte) >= lg->pfn_limit) 182 || pte_pfn(gpte) >= cpu->lg->pfn_limit)
184 kill_guest(lg, "bad page table entry"); 183 kill_guest(cpu, "bad page table entry");
185} 184}
186 185
187static void check_gpgd(struct lguest *lg, pgd_t gpgd) 186static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
188{ 187{
189 if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit) 188 if ((pgd_flags(gpgd) & ~_PAGE_TABLE) ||
190 kill_guest(lg, "bad page directory entry"); 189 (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
190 kill_guest(cpu, "bad page directory entry");
191} 191}
192 192
193/*H:330 193/*H:330
@@ -200,7 +200,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd)
200 * 200 *
201 * If we fixed up the fault (ie. we mapped the address), this routine returns 201 * If we fixed up the fault (ie. we mapped the address), this routine returns
202 * true. Otherwise, it was a real fault and we need to tell the Guest. */ 202 * true. Otherwise, it was a real fault and we need to tell the Guest. */
203int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) 203int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
204{ 204{
205 pgd_t gpgd; 205 pgd_t gpgd;
206 pgd_t *spgd; 206 pgd_t *spgd;
@@ -209,24 +209,24 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
209 pte_t *spte; 209 pte_t *spte;
210 210
211 /* First step: get the top-level Guest page table entry. */ 211 /* First step: get the top-level Guest page table entry. */
212 gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); 212 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
213 /* Toplevel not present? We can't map it in. */ 213 /* Toplevel not present? We can't map it in. */
214 if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) 214 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
215 return 0; 215 return 0;
216 216
217 /* Now look at the matching shadow entry. */ 217 /* Now look at the matching shadow entry. */
218 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 218 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
219 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 219 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
220 /* No shadow entry: allocate a new shadow PTE page. */ 220 /* No shadow entry: allocate a new shadow PTE page. */
221 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 221 unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
222 /* This is not really the Guest's fault, but killing it is 222 /* This is not really the Guest's fault, but killing it is
223 * simple for this corner case. */ 223 * simple for this corner case. */
224 if (!ptepage) { 224 if (!ptepage) {
225 kill_guest(lg, "out of memory allocating pte page"); 225 kill_guest(cpu, "out of memory allocating pte page");
226 return 0; 226 return 0;
227 } 227 }
228 /* We check that the Guest pgd is OK. */ 228 /* We check that the Guest pgd is OK. */
229 check_gpgd(lg, gpgd); 229 check_gpgd(cpu, gpgd);
230 /* And we copy the flags to the shadow PGD entry. The page 230 /* And we copy the flags to the shadow PGD entry. The page
231 * number in the shadow PGD is the page we just allocated. */ 231 * number in the shadow PGD is the page we just allocated. */
232 *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); 232 *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
@@ -234,8 +234,8 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
234 234
235 /* OK, now we look at the lower level in the Guest page table: keep its 235 /* OK, now we look at the lower level in the Guest page table: keep its
236 * address, because we might update it later. */ 236 * address, because we might update it later. */
237 gpte_ptr = gpte_addr(lg, gpgd, vaddr); 237 gpte_ptr = gpte_addr(gpgd, vaddr);
238 gpte = lgread(lg, gpte_ptr, pte_t); 238 gpte = lgread(cpu, gpte_ptr, pte_t);
239 239
240 /* If this page isn't in the Guest page tables, we can't page it in. */ 240 /* If this page isn't in the Guest page tables, we can't page it in. */
241 if (!(pte_flags(gpte) & _PAGE_PRESENT)) 241 if (!(pte_flags(gpte) & _PAGE_PRESENT))
@@ -252,7 +252,7 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
252 252
253 /* Check that the Guest PTE flags are OK, and the page number is below 253 /* Check that the Guest PTE flags are OK, and the page number is below
254 * the pfn_limit (ie. not mapping the Launcher binary). */ 254 * the pfn_limit (ie. not mapping the Launcher binary). */
255 check_gpte(lg, gpte); 255 check_gpte(cpu, gpte);
256 256
257 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 257 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
258 gpte = pte_mkyoung(gpte); 258 gpte = pte_mkyoung(gpte);
@@ -260,7 +260,7 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
260 gpte = pte_mkdirty(gpte); 260 gpte = pte_mkdirty(gpte);
261 261
262 /* Get the pointer to the shadow PTE entry we're going to set. */ 262 /* Get the pointer to the shadow PTE entry we're going to set. */
263 spte = spte_addr(lg, *spgd, vaddr); 263 spte = spte_addr(*spgd, vaddr);
264 /* If there was a valid shadow PTE entry here before, we release it. 264 /* If there was a valid shadow PTE entry here before, we release it.
265 * This can happen with a write to a previously read-only entry. */ 265 * This can happen with a write to a previously read-only entry. */
266 release_pte(*spte); 266 release_pte(*spte);
@@ -268,17 +268,17 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
268 /* If this is a write, we insist that the Guest page is writable (the 268 /* If this is a write, we insist that the Guest page is writable (the
269 * final arg to gpte_to_spte()). */ 269 * final arg to gpte_to_spte()). */
270 if (pte_dirty(gpte)) 270 if (pte_dirty(gpte))
271 *spte = gpte_to_spte(lg, gpte, 1); 271 *spte = gpte_to_spte(cpu, gpte, 1);
272 else 272 else
273 /* If this is a read, don't set the "writable" bit in the page 273 /* If this is a read, don't set the "writable" bit in the page
274 * table entry, even if the Guest says it's writable. That way 274 * table entry, even if the Guest says it's writable. That way
275 * we will come back here when a write does actually occur, so 275 * we will come back here when a write does actually occur, so
276 * we can update the Guest's _PAGE_DIRTY flag. */ 276 * we can update the Guest's _PAGE_DIRTY flag. */
277 *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); 277 *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0);
278 278
279 /* Finally, we write the Guest PTE entry back: we've set the 279 /* Finally, we write the Guest PTE entry back: we've set the
280 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 280 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
281 lgwrite(lg, gpte_ptr, pte_t, gpte); 281 lgwrite(cpu, gpte_ptr, pte_t, gpte);
282 282
283 /* The fault is fixed, the page table is populated, the mapping 283 /* The fault is fixed, the page table is populated, the mapping
284 * manipulated, the result returned and the code complete. A small 284 * manipulated, the result returned and the code complete. A small
@@ -297,19 +297,19 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
297 * 297 *
298 * This is a quick version which answers the question: is this virtual address 298 * This is a quick version which answers the question: is this virtual address
299 * mapped by the shadow page tables, and is it writable? */ 299 * mapped by the shadow page tables, and is it writable? */
300static int page_writable(struct lguest *lg, unsigned long vaddr) 300static int page_writable(struct lg_cpu *cpu, unsigned long vaddr)
301{ 301{
302 pgd_t *spgd; 302 pgd_t *spgd;
303 unsigned long flags; 303 unsigned long flags;
304 304
305 /* Look at the current top level entry: is it present? */ 305 /* Look at the current top level entry: is it present? */
306 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 306 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
307 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) 307 if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
308 return 0; 308 return 0;
309 309
310 /* Check the flags on the pte entry itself: it must be present and 310 /* Check the flags on the pte entry itself: it must be present and
311 * writable. */ 311 * writable. */
312 flags = pte_flags(*(spte_addr(lg, *spgd, vaddr))); 312 flags = pte_flags(*(spte_addr(*spgd, vaddr)));
313 313
314 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 314 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
315} 315}
@@ -317,10 +317,10 @@ static int page_writable(struct lguest *lg, unsigned long vaddr)
317/* So, when pin_stack_pages() asks us to pin a page, we check if it's already 317/* So, when pin_stack_pages() asks us to pin a page, we check if it's already
318 * in the page tables, and if not, we call demand_page() with error code 2 318 * in the page tables, and if not, we call demand_page() with error code 2
319 * (meaning "write"). */ 319 * (meaning "write"). */
320void pin_page(struct lguest *lg, unsigned long vaddr) 320void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
321{ 321{
322 if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2)) 322 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
323 kill_guest(lg, "bad stack page %#lx", vaddr); 323 kill_guest(cpu, "bad stack page %#lx", vaddr);
324} 324}
325 325
326/*H:450 If we chase down the release_pgd() code, it looks like this: */ 326/*H:450 If we chase down the release_pgd() code, it looks like this: */
@@ -358,28 +358,28 @@ static void flush_user_mappings(struct lguest *lg, int idx)
358 * 358 *
359 * The Guest has a hypercall to throw away the page tables: it's used when a 359 * The Guest has a hypercall to throw away the page tables: it's used when a
360 * large number of mappings have been changed. */ 360 * large number of mappings have been changed. */
361void guest_pagetable_flush_user(struct lguest *lg) 361void guest_pagetable_flush_user(struct lg_cpu *cpu)
362{ 362{
363 /* Drop the userspace part of the current page table. */ 363 /* Drop the userspace part of the current page table. */
364 flush_user_mappings(lg, lg->pgdidx); 364 flush_user_mappings(cpu->lg, cpu->cpu_pgd);
365} 365}
366/*:*/ 366/*:*/
367 367
368/* We walk down the guest page tables to get a guest-physical address */ 368/* We walk down the guest page tables to get a guest-physical address */
369unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) 369unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
370{ 370{
371 pgd_t gpgd; 371 pgd_t gpgd;
372 pte_t gpte; 372 pte_t gpte;
373 373
374 /* First step: get the top-level Guest page table entry. */ 374 /* First step: get the top-level Guest page table entry. */
375 gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); 375 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
376 /* Toplevel not present? We can't map it in. */ 376 /* Toplevel not present? We can't map it in. */
377 if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) 377 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
378 kill_guest(lg, "Bad address %#lx", vaddr); 378 kill_guest(cpu, "Bad address %#lx", vaddr);
379 379
380 gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t); 380 gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t);
381 if (!(pte_flags(gpte) & _PAGE_PRESENT)) 381 if (!(pte_flags(gpte) & _PAGE_PRESENT))
382 kill_guest(lg, "Bad address %#lx", vaddr); 382 kill_guest(cpu, "Bad address %#lx", vaddr);
383 383
384 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); 384 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
385} 385}
@@ -399,7 +399,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
399/*H:435 And this is us, creating the new page directory. If we really do 399/*H:435 And this is us, creating the new page directory. If we really do
400 * allocate a new one (and so the kernel parts are not there), we set 400 * allocate a new one (and so the kernel parts are not there), we set
401 * blank_pgdir. */ 401 * blank_pgdir. */
402static unsigned int new_pgdir(struct lguest *lg, 402static unsigned int new_pgdir(struct lg_cpu *cpu,
403 unsigned long gpgdir, 403 unsigned long gpgdir,
404 int *blank_pgdir) 404 int *blank_pgdir)
405{ 405{
@@ -407,22 +407,23 @@ static unsigned int new_pgdir(struct lguest *lg,
407 407
408 /* We pick one entry at random to throw out. Choosing the Least 408 /* We pick one entry at random to throw out. Choosing the Least
409 * Recently Used might be better, but this is easy. */ 409 * Recently Used might be better, but this is easy. */
410 next = random32() % ARRAY_SIZE(lg->pgdirs); 410 next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
411 /* If it's never been allocated at all before, try now. */ 411 /* If it's never been allocated at all before, try now. */
412 if (!lg->pgdirs[next].pgdir) { 412 if (!cpu->lg->pgdirs[next].pgdir) {
413 lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); 413 cpu->lg->pgdirs[next].pgdir =
414 (pgd_t *)get_zeroed_page(GFP_KERNEL);
414 /* If the allocation fails, just keep using the one we have */ 415 /* If the allocation fails, just keep using the one we have */
415 if (!lg->pgdirs[next].pgdir) 416 if (!cpu->lg->pgdirs[next].pgdir)
416 next = lg->pgdidx; 417 next = cpu->cpu_pgd;
417 else 418 else
418 /* This is a blank page, so there are no kernel 419 /* This is a blank page, so there are no kernel
419 * mappings: caller must map the stack! */ 420 * mappings: caller must map the stack! */
420 *blank_pgdir = 1; 421 *blank_pgdir = 1;
421 } 422 }
422 /* Record which Guest toplevel this shadows. */ 423 /* Record which Guest toplevel this shadows. */
423 lg->pgdirs[next].gpgdir = gpgdir; 424 cpu->lg->pgdirs[next].gpgdir = gpgdir;
424 /* Release all the non-kernel mappings. */ 425 /* Release all the non-kernel mappings. */
425 flush_user_mappings(lg, next); 426 flush_user_mappings(cpu->lg, next);
426 427
427 return next; 428 return next;
428} 429}
@@ -432,21 +433,21 @@ static unsigned int new_pgdir(struct lguest *lg,
432 * Now we've seen all the page table setting and manipulation, let's see what 433 * Now we've seen all the page table setting and manipulation, let's see what
433 * what happens when the Guest changes page tables (ie. changes the top-level 434 * what happens when the Guest changes page tables (ie. changes the top-level
434 * pgdir). This occurs on almost every context switch. */ 435 * pgdir). This occurs on almost every context switch. */
435void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) 436void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
436{ 437{
437 int newpgdir, repin = 0; 438 int newpgdir, repin = 0;
438 439
439 /* Look to see if we have this one already. */ 440 /* Look to see if we have this one already. */
440 newpgdir = find_pgdir(lg, pgtable); 441 newpgdir = find_pgdir(cpu->lg, pgtable);
441 /* If not, we allocate or mug an existing one: if it's a fresh one, 442 /* If not, we allocate or mug an existing one: if it's a fresh one,
442 * repin gets set to 1. */ 443 * repin gets set to 1. */
443 if (newpgdir == ARRAY_SIZE(lg->pgdirs)) 444 if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
444 newpgdir = new_pgdir(lg, pgtable, &repin); 445 newpgdir = new_pgdir(cpu, pgtable, &repin);
445 /* Change the current pgd index to the new one. */ 446 /* Change the current pgd index to the new one. */
446 lg->pgdidx = newpgdir; 447 cpu->cpu_pgd = newpgdir;
447 /* If it was completely blank, we map in the Guest kernel stack */ 448 /* If it was completely blank, we map in the Guest kernel stack */
448 if (repin) 449 if (repin)
449 pin_stack_pages(lg); 450 pin_stack_pages(cpu);
450} 451}
451 452
452/*H:470 Finally, a routine which throws away everything: all PGD entries in all 453/*H:470 Finally, a routine which throws away everything: all PGD entries in all
@@ -468,11 +469,11 @@ static void release_all_pagetables(struct lguest *lg)
468 * mapping. Since kernel mappings are in every page table, it's easiest to 469 * mapping. Since kernel mappings are in every page table, it's easiest to
469 * throw them all away. This traps the Guest in amber for a while as 470 * throw them all away. This traps the Guest in amber for a while as
470 * everything faults back in, but it's rare. */ 471 * everything faults back in, but it's rare. */
471void guest_pagetable_clear_all(struct lguest *lg) 472void guest_pagetable_clear_all(struct lg_cpu *cpu)
472{ 473{
473 release_all_pagetables(lg); 474 release_all_pagetables(cpu->lg);
474 /* We need the Guest kernel stack mapped again. */ 475 /* We need the Guest kernel stack mapped again. */
475 pin_stack_pages(lg); 476 pin_stack_pages(cpu);
476} 477}
477/*:*/ 478/*:*/
478/*M:009 Since we throw away all mappings when a kernel mapping changes, our 479/*M:009 Since we throw away all mappings when a kernel mapping changes, our
@@ -497,24 +498,24 @@ void guest_pagetable_clear_all(struct lguest *lg)
497 * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if 498 * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
498 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. 499 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
499 */ 500 */
500static void do_set_pte(struct lguest *lg, int idx, 501static void do_set_pte(struct lg_cpu *cpu, int idx,
501 unsigned long vaddr, pte_t gpte) 502 unsigned long vaddr, pte_t gpte)
502{ 503{
503 /* Look up the matching shadow page directory entry. */ 504 /* Look up the matching shadow page directory entry. */
504 pgd_t *spgd = spgd_addr(lg, idx, vaddr); 505 pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
505 506
506 /* If the top level isn't present, there's no entry to update. */ 507 /* If the top level isn't present, there's no entry to update. */
507 if (pgd_flags(*spgd) & _PAGE_PRESENT) { 508 if (pgd_flags(*spgd) & _PAGE_PRESENT) {
508 /* Otherwise, we start by releasing the existing entry. */ 509 /* Otherwise, we start by releasing the existing entry. */
509 pte_t *spte = spte_addr(lg, *spgd, vaddr); 510 pte_t *spte = spte_addr(*spgd, vaddr);
510 release_pte(*spte); 511 release_pte(*spte);
511 512
512 /* If they're setting this entry as dirty or accessed, we might 513 /* If they're setting this entry as dirty or accessed, we might
513 * as well put that entry they've given us in now. This shaves 514 * as well put that entry they've given us in now. This shaves
514 * 10% off a copy-on-write micro-benchmark. */ 515 * 10% off a copy-on-write micro-benchmark. */
515 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 516 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
516 check_gpte(lg, gpte); 517 check_gpte(cpu, gpte);
517 *spte = gpte_to_spte(lg, gpte, 518 *spte = gpte_to_spte(cpu, gpte,
518 pte_flags(gpte) & _PAGE_DIRTY); 519 pte_flags(gpte) & _PAGE_DIRTY);
519 } else 520 } else
520 /* Otherwise kill it and we can demand_page() it in 521 /* Otherwise kill it and we can demand_page() it in
@@ -533,22 +534,22 @@ static void do_set_pte(struct lguest *lg, int idx,
533 * 534 *
534 * The benefit is that when we have to track a new page table, we can keep 535 * The benefit is that when we have to track a new page table, we can keep
535 * all the kernel mappings. This speeds up context switch immensely. */ 536 * all the kernel mappings. This speeds up context switch immensely. */
536void guest_set_pte(struct lguest *lg, 537void guest_set_pte(struct lg_cpu *cpu,
537 unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 538 unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
538{ 539{
539 /* Kernel mappings must be changed on all top levels. Slow, but 540 /* Kernel mappings must be changed on all top levels. Slow, but
540 * doesn't happen often. */ 541 * doesn't happen often. */
541 if (vaddr >= lg->kernel_address) { 542 if (vaddr >= cpu->lg->kernel_address) {
542 unsigned int i; 543 unsigned int i;
543 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 544 for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
544 if (lg->pgdirs[i].pgdir) 545 if (cpu->lg->pgdirs[i].pgdir)
545 do_set_pte(lg, i, vaddr, gpte); 546 do_set_pte(cpu, i, vaddr, gpte);
546 } else { 547 } else {
547 /* Is this page table one we have a shadow for? */ 548 /* Is this page table one we have a shadow for? */
548 int pgdir = find_pgdir(lg, gpgdir); 549 int pgdir = find_pgdir(cpu->lg, gpgdir);
549 if (pgdir != ARRAY_SIZE(lg->pgdirs)) 550 if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
550 /* If so, do the update. */ 551 /* If so, do the update. */
551 do_set_pte(lg, pgdir, vaddr, gpte); 552 do_set_pte(cpu, pgdir, vaddr, gpte);
552 } 553 }
553} 554}
554 555
@@ -590,30 +591,32 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
590{ 591{
591 /* We start on the first shadow page table, and give it a blank PGD 592 /* We start on the first shadow page table, and give it a blank PGD
592 * page. */ 593 * page. */
593 lg->pgdidx = 0; 594 lg->pgdirs[0].gpgdir = pgtable;
594 lg->pgdirs[lg->pgdidx].gpgdir = pgtable; 595 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
595 lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL); 596 if (!lg->pgdirs[0].pgdir)
596 if (!lg->pgdirs[lg->pgdidx].pgdir)
597 return -ENOMEM; 597 return -ENOMEM;
598 lg->cpus[0].cpu_pgd = 0;
598 return 0; 599 return 0;
599} 600}
600 601
601/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 602/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
602void page_table_guest_data_init(struct lguest *lg) 603void page_table_guest_data_init(struct lg_cpu *cpu)
603{ 604{
604 /* We get the kernel address: above this is all kernel memory. */ 605 /* We get the kernel address: above this is all kernel memory. */
605 if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address) 606 if (get_user(cpu->lg->kernel_address,
607 &cpu->lg->lguest_data->kernel_address)
606 /* We tell the Guest that it can't use the top 4MB of virtual 608 /* We tell the Guest that it can't use the top 4MB of virtual
607 * addresses used by the Switcher. */ 609 * addresses used by the Switcher. */
608 || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) 610 || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem)
609 || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir)) 611 || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir))
610 kill_guest(lg, "bad guest page %p", lg->lguest_data); 612 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
611 613
612 /* In flush_user_mappings() we loop from 0 to 614 /* In flush_user_mappings() we loop from 0 to
613 * "pgd_index(lg->kernel_address)". This assumes it won't hit the 615 * "pgd_index(lg->kernel_address)". This assumes it won't hit the
614 * Switcher mappings, so check that now. */ 616 * Switcher mappings, so check that now. */
615 if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX) 617 if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
616 kill_guest(lg, "bad kernel address %#lx", lg->kernel_address); 618 kill_guest(cpu, "bad kernel address %#lx",
619 cpu->lg->kernel_address);
617} 620}
618 621
619/* When a Guest dies, our cleanup is fairly simple. */ 622/* When a Guest dies, our cleanup is fairly simple. */
@@ -634,17 +637,18 @@ void free_guest_pagetable(struct lguest *lg)
634 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages 637 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
635 * for each CPU already set up, we just need to hook them in now we know which 638 * for each CPU already set up, we just need to hook them in now we know which
636 * Guest is about to run on this CPU. */ 639 * Guest is about to run on this CPU. */
637void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) 640void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
638{ 641{
639 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 642 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
640 pgd_t switcher_pgd; 643 pgd_t switcher_pgd;
641 pte_t regs_pte; 644 pte_t regs_pte;
645 unsigned long pfn;
642 646
643 /* Make the last PGD entry for this Guest point to the Switcher's PTE 647 /* Make the last PGD entry for this Guest point to the Switcher's PTE
644 * page for this CPU (with appropriate flags). */ 648 * page for this CPU (with appropriate flags). */
645 switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL); 649 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL);
646 650
647 lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; 651 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
648 652
649 /* We also change the Switcher PTE page. When we're running the Guest, 653 /* We also change the Switcher PTE page. When we're running the Guest,
650 * we want the Guest's "regs" page to appear where the first Switcher 654 * we want the Guest's "regs" page to appear where the first Switcher
@@ -653,7 +657,8 @@ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
653 * CPU's "struct lguest_pages": if we make sure the Guest's register 657 * CPU's "struct lguest_pages": if we make sure the Guest's register
654 * page is already mapped there, we don't have to copy them out 658 * page is already mapped there, we don't have to copy them out
655 * again. */ 659 * again. */
656 regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL)); 660 pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
661 regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL));
657 switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; 662 switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
658} 663}
659/*:*/ 664/*:*/
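
The hunk above ends with the line that slots the Guest's register page into the Switcher's PTE page. A stand-alone user-space sketch of that index arithmetic, assuming the i386 two-level-paging values lguest targets (PAGE_SIZE 4096, PTRS_PER_PTE 1024) and a made-up address inside the Switcher's top-4MB window:

/* Stand-alone sketch, not kernel code: which Switcher PTE slot a given
 * per-CPU "struct lguest_pages" address occupies.  PAGE_SIZE and
 * PTRS_PER_PTE are the i386 two-level-paging values; the sample address
 * is hypothetical. */
#include <stdio.h>

#define PAGE_SIZE    4096UL
#define PTRS_PER_PTE 1024UL

static unsigned long switcher_pte_slot(unsigned long pages_vaddr)
{
        /* Same arithmetic as the patched line:
         * (unsigned long)pages / PAGE_SIZE % PTRS_PER_PTE */
        return pages_vaddr / PAGE_SIZE % PTRS_PER_PTE;
}

int main(void)
{
        /* A made-up address in the Switcher's top-4MB window. */
        unsigned long pages_vaddr = 0xFFC01000UL;

        printf("regs page lands in Switcher PTE slot %lu\n",
               switcher_pte_slot(pages_vaddr));
        return 0;
}

In the kernel the same expression indexes switcher_pte_page[], so the Guest's regs page appears exactly where this CPU's "struct lguest_pages" sits in the Switcher mapping.
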
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index 9e189cbec7dd..ec6aa3f1c36b 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -58,7 +58,7 @@ static int ignored_gdt(unsigned int num)
58 * Protection Fault in the Switcher when it restores a Guest segment register 58 * Protection Fault in the Switcher when it restores a Guest segment register
59 * which tries to use that entry. Then we kill the Guest for causing such a 59 * which tries to use that entry. Then we kill the Guest for causing such a
60 * mess: the message will be "unhandled trap 256". */ 60 * mess: the message will be "unhandled trap 256". */
61static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) 61static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
62{ 62{
63 unsigned int i; 63 unsigned int i;
64 64
@@ -71,14 +71,14 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
71 /* Segment descriptors contain a privilege level: the Guest is 71 /* Segment descriptors contain a privilege level: the Guest is
72 * sometimes careless and leaves this as 0, even though it's 72 * sometimes careless and leaves this as 0, even though it's
73 * running at privilege level 1. If so, we fix it here. */ 73 * running at privilege level 1. If so, we fix it here. */
74 if ((lg->arch.gdt[i].b & 0x00006000) == 0) 74 if ((cpu->arch.gdt[i].b & 0x00006000) == 0)
75 lg->arch.gdt[i].b |= (GUEST_PL << 13); 75 cpu->arch.gdt[i].b |= (GUEST_PL << 13);
76 76
77 /* Each descriptor has an "accessed" bit. If we don't set it 77 /* Each descriptor has an "accessed" bit. If we don't set it
78 * now, the CPU will try to set it when the Guest first loads 78 * now, the CPU will try to set it when the Guest first loads
79 * that entry into a segment register. But the GDT isn't 79 * that entry into a segment register. But the GDT isn't
80 * writable by the Guest, so bad things can happen. */ 80 * writable by the Guest, so bad things can happen. */
81 lg->arch.gdt[i].b |= 0x00000100; 81 cpu->arch.gdt[i].b |= 0x00000100;
82 } 82 }
83} 83}
84 84
@@ -109,31 +109,31 @@ void setup_default_gdt_entries(struct lguest_ro_state *state)
109 109
110/* This routine sets up the initial Guest GDT for booting. All entries start 110/* This routine sets up the initial Guest GDT for booting. All entries start
111 * as 0 (unusable). */ 111 * as 0 (unusable). */
112void setup_guest_gdt(struct lguest *lg) 112void setup_guest_gdt(struct lg_cpu *cpu)
113{ 113{
114 /* Start with full 0-4G segments... */ 114 /* Start with full 0-4G segments... */
115 lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; 115 cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
116 lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; 116 cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
117 /* ...except the Guest is allowed to use them, so set the privilege 117 /* ...except the Guest is allowed to use them, so set the privilege
118 * level appropriately in the flags. */ 118 * level appropriately in the flags. */
119 lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); 119 cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
120 lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 120 cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
121} 121}
122 122
123/*H:650 An optimization of copy_gdt(), for just the three "thread-local storage" 123/*H:650 An optimization of copy_gdt(), for just the three "thread-local storage"
124 * entries. */ 124 * entries. */
125void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) 125void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
126{ 126{
127 unsigned int i; 127 unsigned int i;
128 128
129 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) 129 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
130 gdt[i] = lg->arch.gdt[i]; 130 gdt[i] = cpu->arch.gdt[i];
131} 131}
132 132
133/*H:640 When the Guest is run on a different CPU, or the GDT entries have 133/*H:640 When the Guest is run on a different CPU, or the GDT entries have
134 * changed, copy_gdt() is called to copy the Guest's GDT entries across to this 134 * changed, copy_gdt() is called to copy the Guest's GDT entries across to this
135 * CPU's GDT. */ 135 * CPU's GDT. */
136void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) 136void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
137{ 137{
138 unsigned int i; 138 unsigned int i;
139 139
@@ -141,38 +141,38 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
141 * replaced. See ignored_gdt() above. */ 141 * replaced. See ignored_gdt() above. */
142 for (i = 0; i < GDT_ENTRIES; i++) 142 for (i = 0; i < GDT_ENTRIES; i++)
143 if (!ignored_gdt(i)) 143 if (!ignored_gdt(i))
144 gdt[i] = lg->arch.gdt[i]; 144 gdt[i] = cpu->arch.gdt[i];
145} 145}
146 146
147/*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). 147/*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT).
148 * We copy it from the Guest and tweak the entries. */ 148 * We copy it from the Guest and tweak the entries. */
149void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) 149void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num)
150{ 150{
151 /* We assume the Guest has the same number of GDT entries as the 151 /* We assume the Guest has the same number of GDT entries as the
152 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ 152 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */
153 if (num > ARRAY_SIZE(lg->arch.gdt)) 153 if (num > ARRAY_SIZE(cpu->arch.gdt))
154 kill_guest(lg, "too many gdt entries %i", num); 154 kill_guest(cpu, "too many gdt entries %i", num);
155 155
156 /* We read the whole thing in, then fix it up. */ 156 /* We read the whole thing in, then fix it up. */
157 __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0])); 157 __lgread(cpu, cpu->arch.gdt, table, num * sizeof(cpu->arch.gdt[0]));
158 fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt)); 158 fixup_gdt_table(cpu, 0, ARRAY_SIZE(cpu->arch.gdt));
159 /* Mark that the GDT changed so the core knows it has to copy it again, 159 /* Mark that the GDT changed so the core knows it has to copy it again,
160 * even if the Guest is run on the same CPU. */ 160 * even if the Guest is run on the same CPU. */
161 lg->changed |= CHANGED_GDT; 161 cpu->changed |= CHANGED_GDT;
162} 162}
163 163
164/* This is the fast-track version for just changing the three TLS entries. 164/* This is the fast-track version for just changing the three TLS entries.
165 * Remember that this happens on every context switch, so it's worth 165 * Remember that this happens on every context switch, so it's worth
166 * optimizing. But wouldn't it be neater to have a single hypercall to cover 166 * optimizing. But wouldn't it be neater to have a single hypercall to cover
167 * both cases? */ 167 * both cases? */
168void guest_load_tls(struct lguest *lg, unsigned long gtls) 168void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
169{ 169{
170 struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN]; 170 struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
171 171
172 __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); 172 __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
173 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); 173 fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
174 /* Note that just the TLS entries have changed. */ 174 /* Note that just the TLS entries have changed. */
175 lg->changed |= CHANGED_GDT_TLS; 175 cpu->changed |= CHANGED_GDT_TLS;
176} 176}
177/*:*/ 177/*:*/
178 178
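
A stand-alone sketch of the two descriptor tweaks fixup_gdt_table() applies in the hunks above, using a simplified two-word descriptor rather than the kernel's desc_struct; GUEST_PL and the bit positions follow the i386 GDT layout (DPL in bits 13-14 of the high word, "accessed" in bit 8):

/* Stand-alone sketch of the descriptor fixups, not the kernel's
 * desc_struct. */
#include <stdio.h>

#define GUEST_PL 1              /* the Guest kernel runs at ring 1 */

struct fake_desc { unsigned int a, b; };

static void fixup_desc(struct fake_desc *d)
{
        /* The Guest sometimes leaves DPL at 0 even though it runs at
         * ring 1, so bump it, exactly as fixup_gdt_table() does. */
        if ((d->b & 0x00006000) == 0)
                d->b |= (GUEST_PL << 13);

        /* Pre-set the "accessed" bit so the CPU never tries to write it
         * back into a GDT the Guest can't modify. */
        d->b |= 0x00000100;
}

int main(void)
{
        /* An example flat code segment with DPL 0 and "accessed" clear. */
        struct fake_desc d = { 0x0000ffff, 0x00cf9a00 };

        fixup_desc(&d);
        printf("fixed high word: %#010x\n", d.b);    /* 0x00cfbb00 */
        return 0;
}
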
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 44adb00e1490..61f2f8eb8cad 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -60,7 +60,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu)
60 (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); 60 (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
61} 61}
62 62
63static DEFINE_PER_CPU(struct lguest *, last_guest); 63static DEFINE_PER_CPU(struct lg_cpu *, last_cpu);
64 64
65/*S:010 65/*S:010
66 * We approach the Switcher. 66 * We approach the Switcher.
@@ -73,16 +73,16 @@ static DEFINE_PER_CPU(struct lguest *, last_guest);
73 * since it last ran. We saw this set in interrupts_and_traps.c and 73 * since it last ran. We saw this set in interrupts_and_traps.c and
74 * segments.c. 74 * segments.c.
75 */ 75 */
76static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) 76static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
77{ 77{
78 /* Copying all this data can be quite expensive. We usually run the 78 /* Copying all this data can be quite expensive. We usually run the
79 * same Guest we ran last time (and that Guest hasn't run anywhere else 79 * same Guest we ran last time (and that Guest hasn't run anywhere else
80 * meanwhile). If that's not the case, we pretend everything in the 80 * meanwhile). If that's not the case, we pretend everything in the
81 * Guest has changed. */ 81 * Guest has changed. */
82 if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { 82 if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) {
83 __get_cpu_var(last_guest) = lg; 83 __get_cpu_var(last_cpu) = cpu;
84 lg->last_pages = pages; 84 cpu->last_pages = pages;
85 lg->changed = CHANGED_ALL; 85 cpu->changed = CHANGED_ALL;
86 } 86 }
87 87
88 /* These copies are pretty cheap, so we do them unconditionally: */ 88 /* These copies are pretty cheap, so we do them unconditionally: */
@@ -90,42 +90,42 @@ static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
90 pages->state.host_cr3 = __pa(current->mm->pgd); 90 pages->state.host_cr3 = __pa(current->mm->pgd);
91 /* Set up the Guest's page tables to see this CPU's pages (and no 91 /* Set up the Guest's page tables to see this CPU's pages (and no
92 * other CPU's pages). */ 92 * other CPU's pages). */
93 map_switcher_in_guest(lg, pages); 93 map_switcher_in_guest(cpu, pages);
94 /* Set up the two "TSS" members which tell the CPU what stack to use 94 /* Set up the two "TSS" members which tell the CPU what stack to use
95 * for traps which go directly into the Guest (ie. traps at privilege 95 * for traps which go directly into the Guest (ie. traps at privilege
96 * level 1). */ 96 * level 1). */
97 pages->state.guest_tss.sp1 = lg->esp1; 97 pages->state.guest_tss.esp1 = cpu->esp1;
98 pages->state.guest_tss.ss1 = lg->ss1; 98 pages->state.guest_tss.ss1 = cpu->ss1;
99 99
100 /* Copy direct-to-Guest trap entries. */ 100 /* Copy direct-to-Guest trap entries. */
101 if (lg->changed & CHANGED_IDT) 101 if (cpu->changed & CHANGED_IDT)
102 copy_traps(lg, pages->state.guest_idt, default_idt_entries); 102 copy_traps(cpu, pages->state.guest_idt, default_idt_entries);
103 103
104 /* Copy all GDT entries which the Guest can change. */ 104 /* Copy all GDT entries which the Guest can change. */
105 if (lg->changed & CHANGED_GDT) 105 if (cpu->changed & CHANGED_GDT)
106 copy_gdt(lg, pages->state.guest_gdt); 106 copy_gdt(cpu, pages->state.guest_gdt);
107 /* If only the TLS entries have changed, copy them. */ 107 /* If only the TLS entries have changed, copy them. */
108 else if (lg->changed & CHANGED_GDT_TLS) 108 else if (cpu->changed & CHANGED_GDT_TLS)
109 copy_gdt_tls(lg, pages->state.guest_gdt); 109 copy_gdt_tls(cpu, pages->state.guest_gdt);
110 110
111 /* Mark the Guest as unchanged for next time. */ 111 /* Mark the Guest as unchanged for next time. */
112 lg->changed = 0; 112 cpu->changed = 0;
113} 113}
114 114
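
copy_in_guest_info() above is driven by the per-vCPU "changed" bits. A stand-alone sketch of that dirty-flag scheme, with illustrative flag values rather than the real lg.h definitions:

/* Stand-alone sketch of the dirty-flag scheme; flag values are
 * illustrative, the real ones live in lg.h. */
#include <stdio.h>

#define CHANGED_IDT      0x1
#define CHANGED_GDT      0x2
#define CHANGED_GDT_TLS  0x4
#define CHANGED_ALL      (CHANGED_IDT | CHANGED_GDT | CHANGED_GDT_TLS)

struct demo_cpu { unsigned int changed; };

static void sync_state(struct demo_cpu *cpu, int moved_to_new_cpu)
{
        /* Running somewhere new (or on new pages) invalidates everything. */
        if (moved_to_new_cpu)
                cpu->changed = CHANGED_ALL;

        if (cpu->changed & CHANGED_IDT)
                printf("copy traps\n");
        if (cpu->changed & CHANGED_GDT)
                printf("copy the whole GDT\n");
        else if (cpu->changed & CHANGED_GDT_TLS)
                printf("copy just the TLS entries\n");

        /* Clean until the Guest changes something again. */
        cpu->changed = 0;
}

int main(void)
{
        struct demo_cpu cpu = { .changed = CHANGED_GDT_TLS };

        sync_state(&cpu, 0);    /* same CPU: only the TLS entries move */
        sync_state(&cpu, 1);    /* migrated: everything is recopied */
        return 0;
}
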
115/* Finally: the code to actually call into the Switcher to run the Guest. */ 115/* Finally: the code to actually call into the Switcher to run the Guest. */
116static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) 116static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
117{ 117{
118 /* This is a dummy value we need for GCC's sake. */ 118 /* This is a dummy value we need for GCC's sake. */
119 unsigned int clobber; 119 unsigned int clobber;
120 120
121 /* Copy the guest-specific information into this CPU's "struct 121 /* Copy the guest-specific information into this CPU's "struct
122 * lguest_pages". */ 122 * lguest_pages". */
123 copy_in_guest_info(lg, pages); 123 copy_in_guest_info(cpu, pages);
124 124
125 /* Set the trap number to 256 (impossible value). If we fault while 125 /* Set the trap number to 256 (impossible value). If we fault while
126 * switching to the Guest (bad segment registers or bug), this will 126 * switching to the Guest (bad segment registers or bug), this will
127 * cause us to abort the Guest. */ 127 * cause us to abort the Guest. */
128 lg->regs->trapnum = 256; 128 cpu->regs->trapnum = 256;
129 129
130 /* Now: we push the "eflags" register on the stack, then do an "lcall". 130 /* Now: we push the "eflags" register on the stack, then do an "lcall".
131 * This is how we change from using the kernel code segment to using 131 * This is how we change from using the kernel code segment to using
@@ -143,7 +143,7 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
143 * 0-th argument above, ie "a"). %ebx contains the 143 * 0-th argument above, ie "a"). %ebx contains the
144 * physical address of the Guest's top-level page 144 * physical address of the Guest's top-level page
145 * directory. */ 145 * directory. */
146 : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) 146 : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir))
147 /* We tell gcc that all these registers could change, 147 /* We tell gcc that all these registers could change,
148 * which means we don't have to save and restore them in 148 * which means we don't have to save and restore them in
149 * the Switcher. */ 149 * the Switcher. */
@@ -161,12 +161,12 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
161 161
162/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts 162/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts
163 * are disabled: we own the CPU. */ 163 * are disabled: we own the CPU. */
164void lguest_arch_run_guest(struct lguest *lg) 164void lguest_arch_run_guest(struct lg_cpu *cpu)
165{ 165{
166 /* Remember the awfully-named TS bit? If the Guest has asked to set it 166 /* Remember the awfully-named TS bit? If the Guest has asked to set it
167 * we set it now, so we can trap and pass that trap to the Guest if it 167 * we set it now, so we can trap and pass that trap to the Guest if it
168 * uses the FPU. */ 168 * uses the FPU. */
169 if (lg->ts) 169 if (cpu->ts)
170 lguest_set_ts(); 170 lguest_set_ts();
171 171
172 /* SYSENTER is an optimized way of doing system calls. We can't allow 172 /* SYSENTER is an optimized way of doing system calls. We can't allow
@@ -180,7 +180,7 @@ void lguest_arch_run_guest(struct lguest *lg)
180 /* Now we actually run the Guest. It will return when something 180 /* Now we actually run the Guest. It will return when something
181 * interesting happens, and we can examine its registers to see what it 181 * interesting happens, and we can examine its registers to see what it
182 * was doing. */ 182 * was doing. */
183 run_guest_once(lg, lguest_pages(raw_smp_processor_id())); 183 run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
184 184
185 /* Note that the "regs" pointer contains two extra entries which are 185 /* Note that the "regs" pointer contains two extra entries which are
186 * not really registers: a trap number which says what interrupt or 186 * not really registers: a trap number which says what interrupt or
@@ -191,11 +191,11 @@ void lguest_arch_run_guest(struct lguest *lg)
191 * bad virtual address. We have to grab this now, because once we 191 * bad virtual address. We have to grab this now, because once we
192 * re-enable interrupts an interrupt could fault and thus overwrite 192 * re-enable interrupts an interrupt could fault and thus overwrite
193 * cr2, or we could even move off to a different CPU. */ 193 * cr2, or we could even move off to a different CPU. */
194 if (lg->regs->trapnum == 14) 194 if (cpu->regs->trapnum == 14)
195 lg->arch.last_pagefault = read_cr2(); 195 cpu->arch.last_pagefault = read_cr2();
196 /* Similarly, if we took a trap because the Guest used the FPU, 196 /* Similarly, if we took a trap because the Guest used the FPU,
197 * we have to restore the FPU it expects to see. */ 197 * we have to restore the FPU it expects to see. */
198 else if (lg->regs->trapnum == 7) 198 else if (cpu->regs->trapnum == 7)
199 math_state_restore(); 199 math_state_restore();
200 200
201 /* Restore SYSENTER if it's supposed to be on. */ 201 /* Restore SYSENTER if it's supposed to be on. */
@@ -214,22 +214,22 @@ void lguest_arch_run_guest(struct lguest *lg)
214 * When the Guest uses one of these instructions, we get a trap (General 214 * When the Guest uses one of these instructions, we get a trap (General
215 * Protection Fault) and come here. We see if it's one of those troublesome 215 * Protection Fault) and come here. We see if it's one of those troublesome
216 * instructions and skip over it. We return true if we did. */ 216 * instructions and skip over it. We return true if we did. */
217static int emulate_insn(struct lguest *lg) 217static int emulate_insn(struct lg_cpu *cpu)
218{ 218{
219 u8 insn; 219 u8 insn;
220 unsigned int insnlen = 0, in = 0, shift = 0; 220 unsigned int insnlen = 0, in = 0, shift = 0;
221 /* The eip contains the *virtual* address of the Guest's instruction: 221 /* The eip contains the *virtual* address of the Guest's instruction:
222 * guest_pa just subtracts the Guest's page_offset. */ 222 * guest_pa just subtracts the Guest's page_offset. */
223 unsigned long physaddr = guest_pa(lg, lg->regs->eip); 223 unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
224 224
225 /* This must be the Guest kernel trying to do something, not userspace! 225 /* This must be the Guest kernel trying to do something, not userspace!
226 * The bottom two bits of the CS segment register are the privilege 226 * The bottom two bits of the CS segment register are the privilege
227 * level. */ 227 * level. */
228 if ((lg->regs->cs & 3) != GUEST_PL) 228 if ((cpu->regs->cs & 3) != GUEST_PL)
229 return 0; 229 return 0;
230 230
231 /* Decoding x86 instructions is icky. */ 231 /* Decoding x86 instructions is icky. */
232 insn = lgread(lg, physaddr, u8); 232 insn = lgread(cpu, physaddr, u8);
233 233
234 /* 0x66 is an "operand prefix". It means it's using the upper 16 bits 234 /* 0x66 is an "operand prefix". It means it's using the upper 16 bits
235 of the eax register. */ 235 of the eax register. */
@@ -237,7 +237,7 @@ static int emulate_insn(struct lguest *lg)
237 shift = 16; 237 shift = 16;
238 /* The instruction is 1 byte so far, read the next byte. */ 238 /* The instruction is 1 byte so far, read the next byte. */
239 insnlen = 1; 239 insnlen = 1;
240 insn = lgread(lg, physaddr + insnlen, u8); 240 insn = lgread(cpu, physaddr + insnlen, u8);
241 } 241 }
242 242
243 /* We can ignore the lower bit for the moment and decode the 4 opcodes 243 /* We can ignore the lower bit for the moment and decode the 4 opcodes
@@ -268,26 +268,26 @@ static int emulate_insn(struct lguest *lg)
268 if (in) { 268 if (in) {
269 /* Lower bit tells us whether it's a 16 or 32 bit access */ 269 /* Lower bit tells us whether it's a 16 or 32 bit access */
270 if (insn & 0x1) 270 if (insn & 0x1)
271 lg->regs->eax = 0xFFFFFFFF; 271 cpu->regs->eax = 0xFFFFFFFF;
272 else 272 else
273 lg->regs->eax |= (0xFFFF << shift); 273 cpu->regs->eax |= (0xFFFF << shift);
274 } 274 }
275 /* Finally, we've "done" the instruction, so move past it. */ 275 /* Finally, we've "done" the instruction, so move past it. */
276 lg->regs->eip += insnlen; 276 cpu->regs->eip += insnlen;
277 /* Success! */ 277 /* Success! */
278 return 1; 278 return 1;
279} 279}
280 280
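
The IN branch of emulate_insn() above only has to fake a result, since the port isn't really there. A stand-alone sketch of just that register-filling step; the low opcode bit separating the byte form (0xE4/0xEC) from the word/dword form (0xE5/0xED) is standard x86 encoding, and shift is 16 only when a 0x66 prefix was decoded:

/* Stand-alone sketch: the Guest reads back all-ones from a port we don't
 * emulate, mirroring the two branches in the hunk above. */
#include <stdio.h>

static unsigned int fake_in_result(unsigned int eax, unsigned char insn,
                                   unsigned int shift)
{
        if (insn & 0x1)                   /* word/dword form */
                eax = 0xFFFFFFFF;
        else                              /* byte form */
                eax |= (0xFFFFu << shift);
        return eax;
}

int main(void)
{
        printf("in eax, dx  -> eax = %#x\n", fake_in_result(0, 0xED, 0));
        printf("in al, 0x64 -> eax = %#x\n", fake_in_result(0, 0xE4, 0));
        return 0;
}
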
281/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ 281/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
282void lguest_arch_handle_trap(struct lguest *lg) 282void lguest_arch_handle_trap(struct lg_cpu *cpu)
283{ 283{
284 switch (lg->regs->trapnum) { 284 switch (cpu->regs->trapnum) {
285 case 13: /* We've intercepted a General Protection Fault. */ 285 case 13: /* We've intercepted a General Protection Fault. */
286 /* Check if this was one of those annoying IN or OUT 286 /* Check if this was one of those annoying IN or OUT
287 * instructions which we need to emulate. If so, we just go 287 * instructions which we need to emulate. If so, we just go
288 * back into the Guest after we've done it. */ 288 * back into the Guest after we've done it. */
289 if (lg->regs->errcode == 0) { 289 if (cpu->regs->errcode == 0) {
290 if (emulate_insn(lg)) 290 if (emulate_insn(cpu))
291 return; 291 return;
292 } 292 }
293 break; 293 break;
@@ -301,7 +301,8 @@ void lguest_arch_handle_trap(struct lguest *lg)
301 * 301 *
302 * The errcode tells whether this was a read or a write, and 302 * The errcode tells whether this was a read or a write, and
303 * whether kernel or userspace code. */ 303 * whether kernel or userspace code. */
304 if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode)) 304 if (demand_page(cpu, cpu->arch.last_pagefault,
305 cpu->regs->errcode))
305 return; 306 return;
306 307
307 /* OK, it's really not there (or not OK): the Guest needs to 308 /* OK, it's really not there (or not OK): the Guest needs to
@@ -311,15 +312,16 @@ void lguest_arch_handle_trap(struct lguest *lg)
311 * Note that if the Guest were really messed up, this could 312 * Note that if the Guest were really messed up, this could
312 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so 313 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
313 * lg->lguest_data could be NULL */ 314 * lg->lguest_data could be NULL */
314 if (lg->lguest_data && 315 if (cpu->lg->lguest_data &&
315 put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2)) 316 put_user(cpu->arch.last_pagefault,
316 kill_guest(lg, "Writing cr2"); 317 &cpu->lg->lguest_data->cr2))
318 kill_guest(cpu, "Writing cr2");
317 break; 319 break;
318 case 7: /* We've intercepted a Device Not Available fault. */ 320 case 7: /* We've intercepted a Device Not Available fault. */
319 /* If the Guest doesn't want to know, we already restored the 321 /* If the Guest doesn't want to know, we already restored the
320 * Floating Point Unit, so we just continue without telling 322 * Floating Point Unit, so we just continue without telling
321 * it. */ 323 * it. */
322 if (!lg->ts) 324 if (!cpu->ts)
323 return; 325 return;
324 break; 326 break;
325 case 32 ... 255: 327 case 32 ... 255:
@@ -332,19 +334,19 @@ void lguest_arch_handle_trap(struct lguest *lg)
332 case LGUEST_TRAP_ENTRY: 334 case LGUEST_TRAP_ENTRY:
333 /* Our 'struct hcall_args' maps directly over our regs: we set 335 /* Our 'struct hcall_args' maps directly over our regs: we set
334 * up the pointer now to indicate a hypercall is pending. */ 336 * up the pointer now to indicate a hypercall is pending. */
335 lg->hcall = (struct hcall_args *)lg->regs; 337 cpu->hcall = (struct hcall_args *)cpu->regs;
336 return; 338 return;
337 } 339 }
338 340
339 /* We didn't handle the trap, so it needs to go to the Guest. */ 341 /* We didn't handle the trap, so it needs to go to the Guest. */
340 if (!deliver_trap(lg, lg->regs->trapnum)) 342 if (!deliver_trap(cpu, cpu->regs->trapnum))
341 /* If the Guest doesn't have a handler (either it hasn't 343 /* If the Guest doesn't have a handler (either it hasn't
342 * registered any yet, or it's one of the faults we don't let 344 * registered any yet, or it's one of the faults we don't let
343 * it handle), it dies with a cryptic error message. */ 345 * it handle), it dies with a cryptic error message. */
344 kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", 346 kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
345 lg->regs->trapnum, lg->regs->eip, 347 cpu->regs->trapnum, cpu->regs->eip,
346 lg->regs->trapnum == 14 ? lg->arch.last_pagefault 348 cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
347 : lg->regs->errcode); 349 : cpu->regs->errcode);
348} 350}
349 351
350/* Now we can look at each of the routines this calls, in increasing order of 352/* Now we can look at each of the routines this calls, in increasing order of
@@ -487,17 +489,17 @@ void __exit lguest_arch_host_fini(void)
487 489
488 490
489/*H:122 The i386-specific hypercalls simply farm out to the right functions. */ 491/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
490int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args) 492int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
491{ 493{
492 switch (args->arg0) { 494 switch (args->arg0) {
493 case LHCALL_LOAD_GDT: 495 case LHCALL_LOAD_GDT:
494 load_guest_gdt(lg, args->arg1, args->arg2); 496 load_guest_gdt(cpu, args->arg1, args->arg2);
495 break; 497 break;
496 case LHCALL_LOAD_IDT_ENTRY: 498 case LHCALL_LOAD_IDT_ENTRY:
497 load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3); 499 load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
498 break; 500 break;
499 case LHCALL_LOAD_TLS: 501 case LHCALL_LOAD_TLS:
500 guest_load_tls(lg, args->arg1); 502 guest_load_tls(cpu, args->arg1);
501 break; 503 break;
502 default: 504 default:
503 /* Bad Guest. Bad! */ 505 /* Bad Guest. Bad! */
@@ -507,13 +509,14 @@ int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args)
507} 509}
508 510
509/*H:126 i386-specific hypercall initialization: */ 511/*H:126 i386-specific hypercall initialization: */
510int lguest_arch_init_hypercalls(struct lguest *lg) 512int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
511{ 513{
512 u32 tsc_speed; 514 u32 tsc_speed;
513 515
514 /* The pointer to the Guest's "struct lguest_data" is the only 516 /* The pointer to the Guest's "struct lguest_data" is the only
515 * argument. We check that address now. */ 517 * argument. We check that address now. */
516 if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data))) 518 if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
519 sizeof(*cpu->lg->lguest_data)))
517 return -EFAULT; 520 return -EFAULT;
518 521
519 /* Having checked it, we simply set lg->lguest_data to point straight 522 /* Having checked it, we simply set lg->lguest_data to point straight
@@ -521,7 +524,7 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
521 * copy_to_user/from_user from now on, instead of lgread/write. I put 524 * copy_to_user/from_user from now on, instead of lgread/write. I put
522 * this in to show that I'm not immune to writing stupid 525 * this in to show that I'm not immune to writing stupid
523 * optimizations. */ 526 * optimizations. */
524 lg->lguest_data = lg->mem_base + lg->hcall->arg1; 527 cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
525 528
526 /* We insist that the Time Stamp Counter exist and doesn't change with 529 /* We insist that the Time Stamp Counter exist and doesn't change with
527 * cpu frequency. Some devious chip manufacturers decided that TSC 530 * cpu frequency. Some devious chip manufacturers decided that TSC
@@ -534,12 +537,12 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
534 tsc_speed = tsc_khz; 537 tsc_speed = tsc_khz;
535 else 538 else
536 tsc_speed = 0; 539 tsc_speed = 0;
537 if (put_user(tsc_speed, &lg->lguest_data->tsc_khz)) 540 if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz))
538 return -EFAULT; 541 return -EFAULT;
539 542
540 /* The interrupt code might not like the system call vector. */ 543 /* The interrupt code might not like the system call vector. */
541 if (!check_syscall_vector(lg)) 544 if (!check_syscall_vector(cpu->lg))
542 kill_guest(lg, "bad syscall vector"); 545 kill_guest(cpu, "bad syscall vector");
543 546
544 return 0; 547 return 0;
545} 548}
@@ -548,9 +551,9 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
548 * 551 *
549 * Most of the Guest's registers are left alone: we used get_zeroed_page() to 552 * Most of the Guest's registers are left alone: we used get_zeroed_page() to
550 * allocate the structure, so they will be 0. */ 553 * allocate the structure, so they will be 0. */
551void lguest_arch_setup_regs(struct lguest *lg, unsigned long start) 554void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
552{ 555{
553 struct lguest_regs *regs = lg->regs; 556 struct lguest_regs *regs = cpu->regs;
554 557
555 /* There are four "segment" registers which the Guest needs to boot: 558 /* There are four "segment" registers which the Guest needs to boot:
556 * The "code segment" register (cs) refers to the kernel code segment 559 * The "code segment" register (cs) refers to the kernel code segment
@@ -577,5 +580,5 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
577 580
578 /* There are a couple of GDT entries the Guest expects when first 581 /* There are a couple of GDT entries the Guest expects when first
579 * booting. */ 582 * booting. */
580 setup_guest_gdt(lg); 583 setup_guest_gdt(cpu);
581} 584}
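
Taken together, the x86/core.c hunks apply one substitution: per-vCPU fields (regs, arch, ts, changed, hcall) are read straight off "struct lg_cpu", while Guest-wide state is reached through cpu->lg. A stand-alone sketch of that split, with trimmed-down stand-in structs rather than the real lg.h definitions:

/* Stand-alone sketch of the lg/lg_cpu split; these structs are stand-ins,
 * not the kernel's. */
#include <stdio.h>

struct lguest {                          /* one per Guest: shared state */
        unsigned long kernel_address;
        void *lguest_data;
};

struct lg_cpu {                          /* one per virtual CPU */
        unsigned long ts;                /* Guest wants the TS flag set? */
        unsigned int changed;            /* CHANGED_GDT and friends */
        struct lguest *lg;               /* back-pointer to shared state */
};

static void show(const struct lg_cpu *cpu)
{
        /* Per-CPU fields come straight off cpu; Guest-wide state goes
         * through cpu->lg, the substitution made throughout the diff. */
        printf("ts=%lu changed=%#x kernel_address=%#lx\n",
               cpu->ts, cpu->changed, cpu->lg->kernel_address);
}

int main(void)
{
        struct lguest lg = { .kernel_address = 0xC0000000UL, .lguest_data = NULL };
        struct lg_cpu cpu = { .ts = 0, .changed = 0, .lg = &lg };

        show(&cpu);
        return 0;
}
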
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index e45f85f7c7ed..0dff05840ee2 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -4224,10 +4224,10 @@ zfcp_fsf_send_fcp_command_task_handler(struct zfcp_fsf_req *fsf_req)
4224 4224
4225 ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n", 4225 ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n",
4226 fcp_rsp_iu->fcp_sns_len); 4226 fcp_rsp_iu->fcp_sns_len);
4227 memcpy(&scpnt->sense_buffer, 4227 memcpy(scpnt->sense_buffer,
4228 zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len); 4228 zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len);
4229 ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE, 4229 ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE,
4230 (void *) &scpnt->sense_buffer, sns_len); 4230 (void *)scpnt->sense_buffer, sns_len);
4231 } 4231 }
4232 4232
4233 /* check for overrun */ 4233 /* check for overrun */
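
The zfcp hunk above drops the '&' in front of scpnt->sense_buffer. With an embedded array the '&' was harmless, since the array and its address coincide; once sense_buffer is a pointer (as the SCSI midlayer appears to be moving to in this series), &cmd->sense_buffer is the address of the pointer itself, not of the sense data. A stand-alone illustration, with made-up command structs:

/* Stand-alone illustration of array vs. pointer members; these structs
 * are invented for the demonstration. */
#include <stdio.h>

struct cmd_with_array   { unsigned char sense_buffer[96]; };
struct cmd_with_pointer { unsigned char *sense_buffer; };

int main(void)
{
        static unsigned char sense_data[96];
        struct cmd_with_array   a;
        struct cmd_with_pointer p = { .sense_buffer = sense_data };

        printf("array:   buf=%p &buf=%p  (same)\n",
               (void *)a.sense_buffer, (void *)&a.sense_buffer);
        printf("pointer: buf=%p &buf=%p  (different)\n",
               (void *)p.sense_buffer, (void *)&p.sense_buffer);
        return 0;
}
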
diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c
index 1c244832c6c8..b4912d1cee2a 100644
--- a/drivers/scsi/3w-9xxx.c
+++ b/drivers/scsi/3w-9xxx.c
@@ -1990,7 +1990,6 @@ static struct scsi_host_template driver_template = {
1990 .max_sectors = TW_MAX_SECTORS, 1990 .max_sectors = TW_MAX_SECTORS,
1991 .cmd_per_lun = TW_MAX_CMDS_PER_LUN, 1991 .cmd_per_lun = TW_MAX_CMDS_PER_LUN,
1992 .use_clustering = ENABLE_CLUSTERING, 1992 .use_clustering = ENABLE_CLUSTERING,
1993 .use_sg_chaining = ENABLE_SG_CHAINING,
1994 .shost_attrs = twa_host_attrs, 1993 .shost_attrs = twa_host_attrs,
1995 .emulated = 1 1994 .emulated = 1
1996}; 1995};
diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c
index 59716ebeb10c..d09532162217 100644
--- a/drivers/scsi/3w-xxxx.c
+++ b/drivers/scsi/3w-xxxx.c
@@ -2261,7 +2261,6 @@ static struct scsi_host_template driver_template = {
2261 .max_sectors = TW_MAX_SECTORS, 2261 .max_sectors = TW_MAX_SECTORS,
2262 .cmd_per_lun = TW_MAX_CMDS_PER_LUN, 2262 .cmd_per_lun = TW_MAX_CMDS_PER_LUN,
2263 .use_clustering = ENABLE_CLUSTERING, 2263 .use_clustering = ENABLE_CLUSTERING,
2264 .use_sg_chaining = ENABLE_SG_CHAINING,
2265 .shost_attrs = tw_host_attrs, 2264 .shost_attrs = tw_host_attrs,
2266 .emulated = 1 2265 .emulated = 1
2267}; 2266};
diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c
index ead47c143ce0..4d3ebb1af490 100644
--- a/drivers/scsi/BusLogic.c
+++ b/drivers/scsi/BusLogic.c
@@ -3575,7 +3575,6 @@ static struct scsi_host_template Bus_Logic_template = {
3575 .unchecked_isa_dma = 1, 3575 .unchecked_isa_dma = 1,
3576 .max_sectors = 128, 3576 .max_sectors = 128,
3577 .use_clustering = ENABLE_CLUSTERING, 3577 .use_clustering = ENABLE_CLUSTERING,
3578 .use_sg_chaining = ENABLE_SG_CHAINING,
3579}; 3578};
3580 3579
3581/* 3580/*
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 3e161cd66463..14fc7f39e83e 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -345,7 +345,7 @@ config ISCSI_TCP
345 345
346config SGIWD93_SCSI 346config SGIWD93_SCSI
347 tristate "SGI WD93C93 SCSI Driver" 347 tristate "SGI WD93C93 SCSI Driver"
348 depends on SGI_IP22 && SCSI 348 depends on SGI_HAS_WD93 && SCSI
349 help 349 help
350 If you have a Western Digital WD93 SCSI controller on 350 If you have a Western Digital WD93 SCSI controller on
351 an SGI MIPS system, say Y. Otherwise, say N. 351 an SGI MIPS system, say Y. Otherwise, say N.
diff --git a/drivers/scsi/NCR53c406a.c b/drivers/scsi/NCR53c406a.c
index 137d065db3da..6961f78742ae 100644
--- a/drivers/scsi/NCR53c406a.c
+++ b/drivers/scsi/NCR53c406a.c
@@ -1065,7 +1065,6 @@ static struct scsi_host_template driver_template =
1065 .cmd_per_lun = 1 /* commands per lun */, 1065 .cmd_per_lun = 1 /* commands per lun */,
1066 .unchecked_isa_dma = 1 /* unchecked_isa_dma */, 1066 .unchecked_isa_dma = 1 /* unchecked_isa_dma */,
1067 .use_clustering = ENABLE_CLUSTERING, 1067 .use_clustering = ENABLE_CLUSTERING,
1068 .use_sg_chaining = ENABLE_SG_CHAINING,
1069}; 1068};
1070 1069
1071#include "scsi_module.c" 1070#include "scsi_module.c"
diff --git a/drivers/scsi/a100u2w.c b/drivers/scsi/a100u2w.c
index d3a6d15fb77a..f608d4a1d6da 100644
--- a/drivers/scsi/a100u2w.c
+++ b/drivers/scsi/a100u2w.c
@@ -1071,7 +1071,6 @@ static struct scsi_host_template inia100_template = {
1071 .sg_tablesize = SG_ALL, 1071 .sg_tablesize = SG_ALL,
1072 .cmd_per_lun = 1, 1072 .cmd_per_lun = 1,
1073 .use_clustering = ENABLE_CLUSTERING, 1073 .use_clustering = ENABLE_CLUSTERING,
1074 .use_sg_chaining = ENABLE_SG_CHAINING,
1075}; 1074};
1076 1075
1077static int __devinit inia100_probe_one(struct pci_dev *pdev, 1076static int __devinit inia100_probe_one(struct pci_dev *pdev,
diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c
index 851a7e599c50..f8afa358b6b6 100644
--- a/drivers/scsi/aacraid/commctrl.c
+++ b/drivers/scsi/aacraid/commctrl.c
@@ -243,7 +243,6 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg)
243 * Search the list of AdapterFibContext addresses on the adapter 243 * Search the list of AdapterFibContext addresses on the adapter
244 * to be sure this is a valid address 244 * to be sure this is a valid address
245 */ 245 */
246 spin_lock_irqsave(&dev->fib_lock, flags);
247 entry = dev->fib_list.next; 246 entry = dev->fib_list.next;
248 fibctx = NULL; 247 fibctx = NULL;
249 248
@@ -252,25 +251,24 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg)
252 /* 251 /*
253 * Extract the AdapterFibContext from the Input parameters. 252 * Extract the AdapterFibContext from the Input parameters.
254 */ 253 */
255 if (fibctx->unique == f.fibctx) { /* We found a winner */ 254 if (fibctx->unique == f.fibctx) { /* We found a winner */
256 break; 255 break;
257 } 256 }
258 entry = entry->next; 257 entry = entry->next;
259 fibctx = NULL; 258 fibctx = NULL;
260 } 259 }
261 if (!fibctx) { 260 if (!fibctx) {
262 spin_unlock_irqrestore(&dev->fib_lock, flags);
263 dprintk ((KERN_INFO "Fib Context not found\n")); 261 dprintk ((KERN_INFO "Fib Context not found\n"));
264 return -EINVAL; 262 return -EINVAL;
265 } 263 }
266 264
267 if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) || 265 if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) ||
268 (fibctx->size != sizeof(struct aac_fib_context))) { 266 (fibctx->size != sizeof(struct aac_fib_context))) {
269 spin_unlock_irqrestore(&dev->fib_lock, flags);
270 dprintk ((KERN_INFO "Fib Context corrupt?\n")); 267 dprintk ((KERN_INFO "Fib Context corrupt?\n"));
271 return -EINVAL; 268 return -EINVAL;
272 } 269 }
273 status = 0; 270 status = 0;
271 spin_lock_irqsave(&dev->fib_lock, flags);
274 /* 272 /*
275 * If there are no fibs to send back, then either wait or return 273 * If there are no fibs to send back, then either wait or return
276 * -EAGAIN 274 * -EAGAIN
@@ -328,9 +326,7 @@ return_fib:
328int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx) 326int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx)
329{ 327{
330 struct fib *fib; 328 struct fib *fib;
331 unsigned long flags;
332 329
333 spin_lock_irqsave(&dev->fib_lock, flags);
334 /* 330 /*
335 * First free any FIBs that have not been consumed. 331 * First free any FIBs that have not been consumed.
336 */ 332 */
@@ -353,7 +349,6 @@ int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx)
353 * Remove the Context from the AdapterFibContext List 349 * Remove the Context from the AdapterFibContext List
354 */ 350 */
355 list_del(&fibctx->next); 351 list_del(&fibctx->next);
356 spin_unlock_irqrestore(&dev->fib_lock, flags);
357 /* 352 /*
358 * Invalidate context 353 * Invalidate context
359 */ 354 */
@@ -419,8 +414,8 @@ static int close_getadapter_fib(struct aac_dev * dev, void __user *arg)
419 * @arg: ioctl arguments 414 * @arg: ioctl arguments
420 * 415 *
421 * This routine returns the driver version. 416 * This routine returns the driver version.
422 * Under Linux, there have been no version incompatibilities, so this is 417 * Under Linux, there have been no version incompatibilities, so this is
423 * simple! 418 * simple!
424 */ 419 */
425 420
426static int check_revision(struct aac_dev *dev, void __user *arg) 421static int check_revision(struct aac_dev *dev, void __user *arg)
@@ -468,7 +463,7 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg)
468 u32 data_dir; 463 u32 data_dir;
469 void __user *sg_user[32]; 464 void __user *sg_user[32];
470 void *sg_list[32]; 465 void *sg_list[32];
471 u32 sg_indx = 0; 466 u32 sg_indx = 0;
472 u32 byte_count = 0; 467 u32 byte_count = 0;
473 u32 actual_fibsize64, actual_fibsize = 0; 468 u32 actual_fibsize64, actual_fibsize = 0;
474 int i; 469 int i;
@@ -522,11 +517,11 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg)
522 // Fix up srb for endian and force some values 517 // Fix up srb for endian and force some values
523 518
524 srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi); // Force this 519 srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi); // Force this
525 srbcmd->channel = cpu_to_le32(user_srbcmd->channel); 520 srbcmd->channel = cpu_to_le32(user_srbcmd->channel);
526 srbcmd->id = cpu_to_le32(user_srbcmd->id); 521 srbcmd->id = cpu_to_le32(user_srbcmd->id);
527 srbcmd->lun = cpu_to_le32(user_srbcmd->lun); 522 srbcmd->lun = cpu_to_le32(user_srbcmd->lun);
528 srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout); 523 srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout);
529 srbcmd->flags = cpu_to_le32(flags); 524 srbcmd->flags = cpu_to_le32(flags);
530 srbcmd->retry_limit = 0; // Obsolete parameter 525 srbcmd->retry_limit = 0; // Obsolete parameter
531 srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size); 526 srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size);
532 memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb)); 527 memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb));
@@ -791,9 +786,9 @@ static int aac_get_pci_info(struct aac_dev* dev, void __user *arg)
791 pci_info.bus = dev->pdev->bus->number; 786 pci_info.bus = dev->pdev->bus->number;
792 pci_info.slot = PCI_SLOT(dev->pdev->devfn); 787 pci_info.slot = PCI_SLOT(dev->pdev->devfn);
793 788
794 if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) { 789 if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) {
795 dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n")); 790 dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n"));
796 return -EFAULT; 791 return -EFAULT;
797 } 792 }
798 return 0; 793 return 0;
799} 794}
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 61be22774e99..0e8267c1e915 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -1032,7 +1032,6 @@ static struct scsi_host_template aac_driver_template = {
1032 .cmd_per_lun = AAC_NUM_IO_FIB, 1032 .cmd_per_lun = AAC_NUM_IO_FIB,
1033#endif 1033#endif
1034 .use_clustering = ENABLE_CLUSTERING, 1034 .use_clustering = ENABLE_CLUSTERING,
1035 .use_sg_chaining = ENABLE_SG_CHAINING,
1036 .emulated = 1, 1035 .emulated = 1,
1037}; 1036};
1038 1037
diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c
index be58a0b097c7..7c45d88a205b 100644
--- a/drivers/scsi/aha1740.c
+++ b/drivers/scsi/aha1740.c
@@ -563,7 +563,6 @@ static struct scsi_host_template aha1740_template = {
563 .sg_tablesize = AHA1740_SCATTER, 563 .sg_tablesize = AHA1740_SCATTER,
564 .cmd_per_lun = AHA1740_CMDLUN, 564 .cmd_per_lun = AHA1740_CMDLUN,
565 .use_clustering = ENABLE_CLUSTERING, 565 .use_clustering = ENABLE_CLUSTERING,
566 .use_sg_chaining = ENABLE_SG_CHAINING,
567 .eh_abort_handler = aha1740_eh_abort_handler, 566 .eh_abort_handler = aha1740_eh_abort_handler,
568}; 567};
569 568
diff --git a/drivers/scsi/aic7xxx/aic79xx.h b/drivers/scsi/aic7xxx/aic79xx.h
index ce638aa6005a..2f00467b6b8c 100644
--- a/drivers/scsi/aic7xxx/aic79xx.h
+++ b/drivers/scsi/aic7xxx/aic79xx.h
@@ -1340,8 +1340,10 @@ struct ahd_pci_identity *ahd_find_pci_device(ahd_dev_softc_t);
1340int ahd_pci_config(struct ahd_softc *, 1340int ahd_pci_config(struct ahd_softc *,
1341 struct ahd_pci_identity *); 1341 struct ahd_pci_identity *);
1342int ahd_pci_test_register_access(struct ahd_softc *); 1342int ahd_pci_test_register_access(struct ahd_softc *);
1343#ifdef CONFIG_PM
1343void ahd_pci_suspend(struct ahd_softc *); 1344void ahd_pci_suspend(struct ahd_softc *);
1344void ahd_pci_resume(struct ahd_softc *); 1345void ahd_pci_resume(struct ahd_softc *);
1346#endif
1345 1347
1346/************************** SCB and SCB queue management **********************/ 1348/************************** SCB and SCB queue management **********************/
1347void ahd_qinfifo_requeue_tail(struct ahd_softc *ahd, 1349void ahd_qinfifo_requeue_tail(struct ahd_softc *ahd,
@@ -1352,8 +1354,10 @@ struct ahd_softc *ahd_alloc(void *platform_arg, char *name);
1352int ahd_softc_init(struct ahd_softc *); 1354int ahd_softc_init(struct ahd_softc *);
1353void ahd_controller_info(struct ahd_softc *ahd, char *buf); 1355void ahd_controller_info(struct ahd_softc *ahd, char *buf);
1354int ahd_init(struct ahd_softc *ahd); 1356int ahd_init(struct ahd_softc *ahd);
1357#ifdef CONFIG_PM
1355int ahd_suspend(struct ahd_softc *ahd); 1358int ahd_suspend(struct ahd_softc *ahd);
1356void ahd_resume(struct ahd_softc *ahd); 1359void ahd_resume(struct ahd_softc *ahd);
1360#endif
1357int ahd_default_config(struct ahd_softc *ahd); 1361int ahd_default_config(struct ahd_softc *ahd);
1358int ahd_parse_vpddata(struct ahd_softc *ahd, 1362int ahd_parse_vpddata(struct ahd_softc *ahd,
1359 struct vpd_config *vpd); 1363 struct vpd_config *vpd);
@@ -1361,7 +1365,6 @@ int ahd_parse_cfgdata(struct ahd_softc *ahd,
1361 struct seeprom_config *sc); 1365 struct seeprom_config *sc);
1362void ahd_intr_enable(struct ahd_softc *ahd, int enable); 1366void ahd_intr_enable(struct ahd_softc *ahd, int enable);
1363void ahd_pause_and_flushwork(struct ahd_softc *ahd); 1367void ahd_pause_and_flushwork(struct ahd_softc *ahd);
1364int ahd_suspend(struct ahd_softc *ahd);
1365void ahd_set_unit(struct ahd_softc *, int); 1368void ahd_set_unit(struct ahd_softc *, int);
1366void ahd_set_name(struct ahd_softc *, char *); 1369void ahd_set_name(struct ahd_softc *, char *);
1367struct scb *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx); 1370struct scb *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx);
diff --git a/drivers/scsi/aic7xxx/aic79xx_core.c b/drivers/scsi/aic7xxx/aic79xx_core.c
index a7dd8cdda472..ade0fb8fbdb2 100644
--- a/drivers/scsi/aic7xxx/aic79xx_core.c
+++ b/drivers/scsi/aic7xxx/aic79xx_core.c
@@ -7175,6 +7175,7 @@ ahd_pause_and_flushwork(struct ahd_softc *ahd)
7175 ahd->flags &= ~AHD_ALL_INTERRUPTS; 7175 ahd->flags &= ~AHD_ALL_INTERRUPTS;
7176} 7176}
7177 7177
7178#ifdef CONFIG_PM
7178int 7179int
7179ahd_suspend(struct ahd_softc *ahd) 7180ahd_suspend(struct ahd_softc *ahd)
7180{ 7181{
@@ -7197,6 +7198,7 @@ ahd_resume(struct ahd_softc *ahd)
7197 ahd_intr_enable(ahd, TRUE); 7198 ahd_intr_enable(ahd, TRUE);
7198 ahd_restart(ahd); 7199 ahd_restart(ahd);
7199} 7200}
7201#endif
7200 7202
7201/************************** Busy Target Table *********************************/ 7203/************************** Busy Target Table *********************************/
7202/* 7204/*
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c
index 0e4708fd43c8..014654792901 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm.c
@@ -766,7 +766,6 @@ struct scsi_host_template aic79xx_driver_template = {
766 .max_sectors = 8192, 766 .max_sectors = 8192,
767 .cmd_per_lun = 2, 767 .cmd_per_lun = 2,
768 .use_clustering = ENABLE_CLUSTERING, 768 .use_clustering = ENABLE_CLUSTERING,
769 .use_sg_chaining = ENABLE_SG_CHAINING,
770 .slave_alloc = ahd_linux_slave_alloc, 769 .slave_alloc = ahd_linux_slave_alloc,
771 .slave_configure = ahd_linux_slave_configure, 770 .slave_configure = ahd_linux_slave_configure,
772 .target_alloc = ahd_linux_target_alloc, 771 .target_alloc = ahd_linux_target_alloc,
@@ -1922,7 +1921,7 @@ ahd_linux_queue_cmd_complete(struct ahd_softc *ahd, struct scsi_cmnd *cmd)
1922 struct scsi_sense_data *sense; 1921 struct scsi_sense_data *sense;
1923 1922
1924 sense = (struct scsi_sense_data *) 1923 sense = (struct scsi_sense_data *)
1925 &cmd->sense_buffer; 1924 cmd->sense_buffer;
1926 if (sense->extra_len >= 5 && 1925 if (sense->extra_len >= 5 &&
1927 (sense->add_sense_code == 0x47 1926 (sense->add_sense_code == 0x47
1928 || sense->add_sense_code == 0x48)) 1927 || sense->add_sense_code == 0x48))
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
index 66f0259edb69..4150c8a8fdc2 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
@@ -43,17 +43,6 @@
43#include "aic79xx_inline.h" 43#include "aic79xx_inline.h"
44#include "aic79xx_pci.h" 44#include "aic79xx_pci.h"
45 45
46static int ahd_linux_pci_dev_probe(struct pci_dev *pdev,
47 const struct pci_device_id *ent);
48static int ahd_linux_pci_reserve_io_regions(struct ahd_softc *ahd,
49 u_long *base, u_long *base2);
50static int ahd_linux_pci_reserve_mem_region(struct ahd_softc *ahd,
51 u_long *bus_addr,
52 uint8_t __iomem **maddr);
53static int ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg);
54static int ahd_linux_pci_dev_resume(struct pci_dev *pdev);
55static void ahd_linux_pci_dev_remove(struct pci_dev *pdev);
56
57/* Define the macro locally since it's different for different class of chips. 46/* Define the macro locally since it's different for different class of chips.
58 */ 47 */
59#define ID(x) \ 48#define ID(x) \
@@ -85,17 +74,7 @@ static struct pci_device_id ahd_linux_pci_id_table[] = {
85 74
86MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table); 75MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table);
87 76
88static struct pci_driver aic79xx_pci_driver = {
89 .name = "aic79xx",
90 .probe = ahd_linux_pci_dev_probe,
91#ifdef CONFIG_PM 77#ifdef CONFIG_PM
92 .suspend = ahd_linux_pci_dev_suspend,
93 .resume = ahd_linux_pci_dev_resume,
94#endif
95 .remove = ahd_linux_pci_dev_remove,
96 .id_table = ahd_linux_pci_id_table
97};
98
99static int 78static int
100ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) 79ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg)
101{ 80{
@@ -139,6 +118,7 @@ ahd_linux_pci_dev_resume(struct pci_dev *pdev)
139 118
140 return rc; 119 return rc;
141} 120}
121#endif
142 122
143static void 123static void
144ahd_linux_pci_dev_remove(struct pci_dev *pdev) 124ahd_linux_pci_dev_remove(struct pci_dev *pdev)
@@ -245,6 +225,17 @@ ahd_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
245 return (0); 225 return (0);
246} 226}
247 227
228static struct pci_driver aic79xx_pci_driver = {
229 .name = "aic79xx",
230 .probe = ahd_linux_pci_dev_probe,
231#ifdef CONFIG_PM
232 .suspend = ahd_linux_pci_dev_suspend,
233 .resume = ahd_linux_pci_dev_resume,
234#endif
235 .remove = ahd_linux_pci_dev_remove,
236 .id_table = ahd_linux_pci_id_table
237};
238
248int 239int
249ahd_linux_pci_init(void) 240ahd_linux_pci_init(void)
250{ 241{
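
The aic79xx_osm_pci.c hunks above delete the forward declarations, wrap the suspend/resume pair in CONFIG_PM, and move the pci_driver table below the functions it points at (the aic7xxx hunks further down make the same change). A stand-alone sketch of that arrangement, with simplified stand-ins for the PCI core types:

/* Stand-alone sketch: callbacks defined first, driver table built after
 * them, suspend/resume compiled only under CONFIG_PM.  The types are
 * simplified stand-ins, not the PCI core's. */
#include <stdio.h>

struct fake_pci_dev { int id; };

struct fake_pci_driver {
        const char *name;
        int  (*probe)(struct fake_pci_dev *);
        void (*remove)(struct fake_pci_dev *);
#ifdef CONFIG_PM
        int  (*suspend)(struct fake_pci_dev *);
        int  (*resume)(struct fake_pci_dev *);
#endif
};

static int  demo_probe(struct fake_pci_dev *d)  { printf("probe %d\n", d->id); return 0; }
static void demo_remove(struct fake_pci_dev *d) { printf("remove %d\n", d->id); }

#ifdef CONFIG_PM
static int demo_suspend(struct fake_pci_dev *d) { printf("suspend %d\n", d->id); return 0; }
static int demo_resume(struct fake_pci_dev *d)  { printf("resume %d\n", d->id); return 0; }
#endif

/* Placing the table after the definitions is what lets the hunks above
 * delete the block of forward declarations. */
static struct fake_pci_driver demo_driver = {
        .name   = "demo",
        .probe  = demo_probe,
        .remove = demo_remove,
#ifdef CONFIG_PM
        .suspend = demo_suspend,
        .resume  = demo_resume,
#endif
};

int main(void)
{
        struct fake_pci_dev dev = { .id = 0 };

        demo_driver.probe(&dev);
        demo_driver.remove(&dev);
        return 0;
}
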
diff --git a/drivers/scsi/aic7xxx/aic79xx_pci.c b/drivers/scsi/aic7xxx/aic79xx_pci.c
index 7a203a90601a..df853676e66a 100644
--- a/drivers/scsi/aic7xxx/aic79xx_pci.c
+++ b/drivers/scsi/aic7xxx/aic79xx_pci.c
@@ -389,6 +389,7 @@ ahd_pci_config(struct ahd_softc *ahd, struct ahd_pci_identity *entry)
389 return error; 389 return error;
390} 390}
391 391
392#ifdef CONFIG_PM
392void 393void
393ahd_pci_suspend(struct ahd_softc *ahd) 394ahd_pci_suspend(struct ahd_softc *ahd)
394{ 395{
@@ -415,6 +416,7 @@ ahd_pci_resume(struct ahd_softc *ahd)
415 ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME, 416 ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME,
416 ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1); 417 ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1);
417} 418}
419#endif
418 420
419/* 421/*
420 * Perform some simple tests that should catch situations where 422 * Perform some simple tests that should catch situations where
diff --git a/drivers/scsi/aic7xxx/aic7xxx.h b/drivers/scsi/aic7xxx/aic7xxx.h
index 3d4e42d90452..c0344e617651 100644
--- a/drivers/scsi/aic7xxx/aic7xxx.h
+++ b/drivers/scsi/aic7xxx/aic7xxx.h
@@ -1143,7 +1143,9 @@ struct ahc_pci_identity *ahc_find_pci_device(ahc_dev_softc_t);
1143int ahc_pci_config(struct ahc_softc *, 1143int ahc_pci_config(struct ahc_softc *,
1144 struct ahc_pci_identity *); 1144 struct ahc_pci_identity *);
1145int ahc_pci_test_register_access(struct ahc_softc *); 1145int ahc_pci_test_register_access(struct ahc_softc *);
1146#ifdef CONFIG_PM
1146void ahc_pci_resume(struct ahc_softc *ahc); 1147void ahc_pci_resume(struct ahc_softc *ahc);
1148#endif
1147 1149
1148/*************************** EISA/VL Front End ********************************/ 1150/*************************** EISA/VL Front End ********************************/
1149struct aic7770_identity *aic7770_find_device(uint32_t); 1151struct aic7770_identity *aic7770_find_device(uint32_t);
@@ -1170,8 +1172,10 @@ int ahc_chip_init(struct ahc_softc *ahc);
1170int ahc_init(struct ahc_softc *ahc); 1172int ahc_init(struct ahc_softc *ahc);
1171void ahc_intr_enable(struct ahc_softc *ahc, int enable); 1173void ahc_intr_enable(struct ahc_softc *ahc, int enable);
1172void ahc_pause_and_flushwork(struct ahc_softc *ahc); 1174void ahc_pause_and_flushwork(struct ahc_softc *ahc);
1175#ifdef CONFIG_PM
1173int ahc_suspend(struct ahc_softc *ahc); 1176int ahc_suspend(struct ahc_softc *ahc);
1174int ahc_resume(struct ahc_softc *ahc); 1177int ahc_resume(struct ahc_softc *ahc);
1178#endif
1175void ahc_set_unit(struct ahc_softc *, int); 1179void ahc_set_unit(struct ahc_softc *, int);
1176void ahc_set_name(struct ahc_softc *, char *); 1180void ahc_set_name(struct ahc_softc *, char *);
1177void ahc_alloc_scbs(struct ahc_softc *ahc); 1181void ahc_alloc_scbs(struct ahc_softc *ahc);
diff --git a/drivers/scsi/aic7xxx/aic7xxx_core.c b/drivers/scsi/aic7xxx/aic7xxx_core.c
index f350b5e89e76..6d2ae641273c 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_core.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_core.c
@@ -5078,6 +5078,7 @@ ahc_pause_and_flushwork(struct ahc_softc *ahc)
5078 ahc->flags &= ~AHC_ALL_INTERRUPTS; 5078 ahc->flags &= ~AHC_ALL_INTERRUPTS;
5079} 5079}
5080 5080
5081#ifdef CONFIG_PM
5081int 5082int
5082ahc_suspend(struct ahc_softc *ahc) 5083ahc_suspend(struct ahc_softc *ahc)
5083{ 5084{
@@ -5113,7 +5114,7 @@ ahc_resume(struct ahc_softc *ahc)
5113 ahc_restart(ahc); 5114 ahc_restart(ahc);
5114 return (0); 5115 return (0);
5115} 5116}
5116 5117#endif
5117/************************** Busy Target Table *********************************/ 5118/************************** Busy Target Table *********************************/
5118/* 5119/*
5119 * Return the untagged transaction id for a given target/channel lun. 5120 * Return the untagged transaction id for a given target/channel lun.
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c
index e310e414067f..99a3b33a3233 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c
@@ -747,7 +747,6 @@ struct scsi_host_template aic7xxx_driver_template = {
747 .max_sectors = 8192, 747 .max_sectors = 8192,
748 .cmd_per_lun = 2, 748 .cmd_per_lun = 2,
749 .use_clustering = ENABLE_CLUSTERING, 749 .use_clustering = ENABLE_CLUSTERING,
750 .use_sg_chaining = ENABLE_SG_CHAINING,
751 .slave_alloc = ahc_linux_slave_alloc, 750 .slave_alloc = ahc_linux_slave_alloc,
752 .slave_configure = ahc_linux_slave_configure, 751 .slave_configure = ahc_linux_slave_configure,
753 .target_alloc = ahc_linux_target_alloc, 752 .target_alloc = ahc_linux_target_alloc,
@@ -1658,9 +1657,12 @@ ahc_done(struct ahc_softc *ahc, struct scb *scb)
1658 untagged_q = &(ahc->untagged_queues[target_offset]); 1657 untagged_q = &(ahc->untagged_queues[target_offset]);
1659 TAILQ_REMOVE(untagged_q, scb, links.tqe); 1658 TAILQ_REMOVE(untagged_q, scb, links.tqe);
1660 BUG_ON(!TAILQ_EMPTY(untagged_q)); 1659 BUG_ON(!TAILQ_EMPTY(untagged_q));
1661 } 1660 } else if ((scb->flags & SCB_ACTIVE) == 0) {
1662 1661 /*
1663 if ((scb->flags & SCB_ACTIVE) == 0) { 1662 * Transactions aborted from the untagged queue may
1663 * not have been dispatched to the controller, so
1664 * only check the SCB_ACTIVE flag for tagged transactions.
1665 */
1664 printf("SCB %d done'd twice\n", scb->hscb->tag); 1666 printf("SCB %d done'd twice\n", scb->hscb->tag);
1665 ahc_dump_card_state(ahc); 1667 ahc_dump_card_state(ahc);
1666 panic("Stopping for safety"); 1668 panic("Stopping for safety");
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
index 4488946cff2e..dd6e21d6f1dd 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
@@ -42,17 +42,6 @@
42#include "aic7xxx_osm.h" 42#include "aic7xxx_osm.h"
43#include "aic7xxx_pci.h" 43#include "aic7xxx_pci.h"
44 44
45static int ahc_linux_pci_dev_probe(struct pci_dev *pdev,
46 const struct pci_device_id *ent);
47static int ahc_linux_pci_reserve_io_region(struct ahc_softc *ahc,
48 u_long *base);
49static int ahc_linux_pci_reserve_mem_region(struct ahc_softc *ahc,
50 u_long *bus_addr,
51 uint8_t __iomem **maddr);
52static int ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg);
53static int ahc_linux_pci_dev_resume(struct pci_dev *pdev);
54static void ahc_linux_pci_dev_remove(struct pci_dev *pdev);
55
56/* Define the macro locally since it's different for different class of chips. 45/* Define the macro locally since it's different for different class of chips.
57*/ 46*/
58#define ID(x) ID_C(x, PCI_CLASS_STORAGE_SCSI) 47#define ID(x) ID_C(x, PCI_CLASS_STORAGE_SCSI)
@@ -132,17 +121,7 @@ static struct pci_device_id ahc_linux_pci_id_table[] = {
132 121
133MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table); 122MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table);
134 123
135static struct pci_driver aic7xxx_pci_driver = {
136 .name = "aic7xxx",
137 .probe = ahc_linux_pci_dev_probe,
138#ifdef CONFIG_PM 124#ifdef CONFIG_PM
139 .suspend = ahc_linux_pci_dev_suspend,
140 .resume = ahc_linux_pci_dev_resume,
141#endif
142 .remove = ahc_linux_pci_dev_remove,
143 .id_table = ahc_linux_pci_id_table
144};
145
146static int 125static int
147ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) 126ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg)
148{ 127{
@@ -182,6 +161,7 @@ ahc_linux_pci_dev_resume(struct pci_dev *pdev)
182 161
183 return (ahc_resume(ahc)); 162 return (ahc_resume(ahc));
184} 163}
164#endif
185 165
186static void 166static void
187ahc_linux_pci_dev_remove(struct pci_dev *pdev) 167ahc_linux_pci_dev_remove(struct pci_dev *pdev)
@@ -289,6 +269,17 @@ ahc_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
289 return (0); 269 return (0);
290} 270}
291 271
272static struct pci_driver aic7xxx_pci_driver = {
273 .name = "aic7xxx",
274 .probe = ahc_linux_pci_dev_probe,
275#ifdef CONFIG_PM
276 .suspend = ahc_linux_pci_dev_suspend,
277 .resume = ahc_linux_pci_dev_resume,
278#endif
279 .remove = ahc_linux_pci_dev_remove,
280 .id_table = ahc_linux_pci_id_table
281};
282
292int 283int
293ahc_linux_pci_init(void) 284ahc_linux_pci_init(void)
294{ 285{
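The reordering above is what lets the block of static forward declarations go: once the struct pci_driver initializer sits below the callbacks it references, every function it names is already in scope. A stand-alone sketch of the same pattern, using hypothetical names rather than the driver's own:

#include <stdio.h>

struct ops {				/* stands in for struct pci_driver */
	const char *name;
	int (*probe)(int id);
	void (*remove)(int id);
};

static int demo_probe(int id)		/* callbacks defined first ... */
{
	printf("probe %d\n", id);
	return 0;
}

static void demo_remove(int id)
{
	printf("remove %d\n", id);
}

/* ... so the initializer below needs no forward declarations. */
static struct ops demo_ops = {
	.name   = "demo",
	.probe  = demo_probe,
	.remove = demo_remove,
};

int main(void)
{
	demo_ops.probe(1);
	demo_ops.remove(1);
	return 0;
}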
diff --git a/drivers/scsi/aic7xxx/aic7xxx_pci.c b/drivers/scsi/aic7xxx/aic7xxx_pci.c
index ae35937b8055..56848f41e4f9 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_pci.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_pci.c
@@ -2020,6 +2020,7 @@ ahc_pci_chip_init(struct ahc_softc *ahc)
2020 return (ahc_chip_init(ahc)); 2020 return (ahc_chip_init(ahc));
2021} 2021}
2022 2022
2023#ifdef CONFIG_PM
2023void 2024void
2024ahc_pci_resume(struct ahc_softc *ahc) 2025ahc_pci_resume(struct ahc_softc *ahc)
2025{ 2026{
@@ -2051,6 +2052,7 @@ ahc_pci_resume(struct ahc_softc *ahc)
2051 ahc_release_seeprom(&sd); 2052 ahc_release_seeprom(&sd);
2052 } 2053 }
2053} 2054}
2055#endif
2054 2056
2055static int 2057static int
2056ahc_aic785X_setup(struct ahc_softc *ahc) 2058ahc_aic785X_setup(struct ahc_softc *ahc)
diff --git a/drivers/scsi/aic7xxx_old.c b/drivers/scsi/aic7xxx_old.c
index bcb0b870320c..3bfd9296bbfa 100644
--- a/drivers/scsi/aic7xxx_old.c
+++ b/drivers/scsi/aic7xxx_old.c
@@ -11141,7 +11141,6 @@ static struct scsi_host_template driver_template = {
11141 .max_sectors = 2048, 11141 .max_sectors = 2048,
11142 .cmd_per_lun = 3, 11142 .cmd_per_lun = 3,
11143 .use_clustering = ENABLE_CLUSTERING, 11143 .use_clustering = ENABLE_CLUSTERING,
11144 .use_sg_chaining = ENABLE_SG_CHAINING,
11145}; 11144};
11146 11145
11147#include "scsi_module.c" 11146#include "scsi_module.c"
diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c
index d80dba913a75..f4a202e8df26 100644
--- a/drivers/scsi/arcmsr/arcmsr_hba.c
+++ b/drivers/scsi/arcmsr/arcmsr_hba.c
@@ -122,7 +122,6 @@ static struct scsi_host_template arcmsr_scsi_host_template = {
122 .max_sectors = ARCMSR_MAX_XFER_SECTORS, 122 .max_sectors = ARCMSR_MAX_XFER_SECTORS,
123 .cmd_per_lun = ARCMSR_MAX_CMD_PERLUN, 123 .cmd_per_lun = ARCMSR_MAX_CMD_PERLUN,
124 .use_clustering = ENABLE_CLUSTERING, 124 .use_clustering = ENABLE_CLUSTERING,
125 .use_sg_chaining = ENABLE_SG_CHAINING,
126 .shost_attrs = arcmsr_host_attrs, 125 .shost_attrs = arcmsr_host_attrs,
127}; 126};
128#ifdef CONFIG_SCSI_ARCMSR_AER 127#ifdef CONFIG_SCSI_ARCMSR_AER
diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c
index f93c73c0ba53..22ef3716e786 100644
--- a/drivers/scsi/dc395x.c
+++ b/drivers/scsi/dc395x.c
@@ -4763,7 +4763,6 @@ static struct scsi_host_template dc395x_driver_template = {
4763 .eh_bus_reset_handler = dc395x_eh_bus_reset, 4763 .eh_bus_reset_handler = dc395x_eh_bus_reset,
4764 .unchecked_isa_dma = 0, 4764 .unchecked_isa_dma = 0,
4765 .use_clustering = DISABLE_CLUSTERING, 4765 .use_clustering = DISABLE_CLUSTERING,
4766 .use_sg_chaining = ENABLE_SG_CHAINING,
4767}; 4766};
4768 4767
4769 4768
diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c
index 19cce125124c..c9dd8392aab2 100644
--- a/drivers/scsi/dpt_i2o.c
+++ b/drivers/scsi/dpt_i2o.c
@@ -3340,7 +3340,6 @@ static struct scsi_host_template driver_template = {
3340 .this_id = 7, 3340 .this_id = 7,
3341 .cmd_per_lun = 1, 3341 .cmd_per_lun = 1,
3342 .use_clustering = ENABLE_CLUSTERING, 3342 .use_clustering = ENABLE_CLUSTERING,
3343 .use_sg_chaining = ENABLE_SG_CHAINING,
3344}; 3343};
3345#include "scsi_module.c" 3344#include "scsi_module.c"
3346MODULE_LICENSE("GPL"); 3345MODULE_LICENSE("GPL");
diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c
index 05163cefec12..8be3d76656fa 100644
--- a/drivers/scsi/eata.c
+++ b/drivers/scsi/eata.c
@@ -524,7 +524,6 @@ static struct scsi_host_template driver_template = {
524 .this_id = 7, 524 .this_id = 7,
525 .unchecked_isa_dma = 1, 525 .unchecked_isa_dma = 1,
526 .use_clustering = ENABLE_CLUSTERING, 526 .use_clustering = ENABLE_CLUSTERING,
527 .use_sg_chaining = ENABLE_SG_CHAINING,
528}; 527};
529 528
530#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) 529#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 5ea1f986220c..880c78bff0e1 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -342,7 +342,6 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
342 shost->use_clustering = sht->use_clustering; 342 shost->use_clustering = sht->use_clustering;
343 shost->ordered_tag = sht->ordered_tag; 343 shost->ordered_tag = sht->ordered_tag;
344 shost->active_mode = sht->supported_mode; 344 shost->active_mode = sht->supported_mode;
345 shost->use_sg_chaining = sht->use_sg_chaining;
346 345
347 if (sht->supported_mode == MODE_UNKNOWN) 346 if (sht->supported_mode == MODE_UNKNOWN)
348 /* means we didn't set it ... default to INITIATOR */ 347 /* means we didn't set it ... default to INITIATOR */
diff --git a/drivers/scsi/hptiop.c b/drivers/scsi/hptiop.c
index e7b2f3575ce9..ff149ad6bc4e 100644
--- a/drivers/scsi/hptiop.c
+++ b/drivers/scsi/hptiop.c
@@ -573,7 +573,7 @@ static void hptiop_finish_scsi_req(struct hptiop_hba *hba, u32 tag,
573 scsi_set_resid(scp, 573 scsi_set_resid(scp,
574 scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length)); 574 scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length));
575 scp->result = SAM_STAT_CHECK_CONDITION; 575 scp->result = SAM_STAT_CHECK_CONDITION;
576 memcpy(&scp->sense_buffer, &req->sg_list, 576 memcpy(scp->sense_buffer, &req->sg_list,
577 min_t(size_t, SCSI_SENSE_BUFFERSIZE, 577 min_t(size_t, SCSI_SENSE_BUFFERSIZE,
578 le32_to_cpu(req->dataxfer_length))); 578 le32_to_cpu(req->dataxfer_length)));
579 break; 579 break;
@@ -906,7 +906,6 @@ static struct scsi_host_template driver_template = {
906 .unchecked_isa_dma = 0, 906 .unchecked_isa_dma = 0,
907 .emulated = 0, 907 .emulated = 0,
908 .use_clustering = ENABLE_CLUSTERING, 908 .use_clustering = ENABLE_CLUSTERING,
909 .use_sg_chaining = ENABLE_SG_CHAINING,
910 .proc_name = driver_name, 909 .proc_name = driver_name,
911 .shost_attrs = hptiop_attrs, 910 .shost_attrs = hptiop_attrs,
912 .this_id = -1, 911 .this_id = -1,
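The one-character change in hptiop_finish_scsi_req() (dropping the & in front of scp->sense_buffer) matters once sense_buffer stops being an embedded array: for a separately allocated buffer, &scp->sense_buffer names the pointer member itself rather than the sense data it points to. That conversion is an assumption about where this series is heading, but the underlying C behaviour is easy to check in a stand-alone program with made-up structs:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct cmd_with_array { char sense[8]; };	/* old-style embedded buffer */
struct cmd_with_ptr   { char *sense;  };	/* separately allocated buffer */

int main(void)
{
	struct cmd_with_array a;
	struct cmd_with_ptr   p;

	p.sense = malloc(8);
	if (!p.sense)
		return 1;

	/* For an embedded array, &a.sense and a.sense are the same address,
	 * so the old memcpy(&scp->sense_buffer, ...) happened to work. */
	printf("array:   %p %p\n", (void *)&a.sense, (void *)a.sense);

	/* For a pointer member they differ: &p.sense is the address of the
	 * pointer variable, not of the buffer it points to. */
	printf("pointer: %p %p\n", (void *)&p.sense, (void *)p.sense);

	memcpy(p.sense, "sense", 6);	/* correct: copy into the buffer */
	puts(p.sense);
	free(p.sense);
	return 0;
}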
diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c
index db004a450732..4d15a62914e9 100644
--- a/drivers/scsi/ibmmca.c
+++ b/drivers/scsi/ibmmca.c
@@ -1501,7 +1501,6 @@ static struct scsi_host_template ibmmca_driver_template = {
1501 .sg_tablesize = 16, 1501 .sg_tablesize = 16,
1502 .cmd_per_lun = 1, 1502 .cmd_per_lun = 1,
1503 .use_clustering = ENABLE_CLUSTERING, 1503 .use_clustering = ENABLE_CLUSTERING,
1504 .use_sg_chaining = ENABLE_SG_CHAINING,
1505}; 1504};
1506 1505
1507static int ibmmca_probe(struct device *dev) 1506static int ibmmca_probe(struct device *dev)
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index 30819012898f..78d46a900bb5 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -1600,7 +1600,6 @@ static struct scsi_host_template driver_template = {
1600 .this_id = -1, 1600 .this_id = -1,
1601 .sg_tablesize = SG_ALL, 1601 .sg_tablesize = SG_ALL,
1602 .use_clustering = ENABLE_CLUSTERING, 1602 .use_clustering = ENABLE_CLUSTERING,
1603 .use_sg_chaining = ENABLE_SG_CHAINING,
1604 .shost_attrs = ibmvscsi_attrs, 1603 .shost_attrs = ibmvscsi_attrs,
1605}; 1604};
1606 1605
diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c
index a10a5c74b48d..0cc8868ea35d 100644
--- a/drivers/scsi/initio.c
+++ b/drivers/scsi/initio.c
@@ -2833,7 +2833,6 @@ static struct scsi_host_template initio_template = {
2833 .sg_tablesize = SG_ALL, 2833 .sg_tablesize = SG_ALL,
2834 .cmd_per_lun = 1, 2834 .cmd_per_lun = 1,
2835 .use_clustering = ENABLE_CLUSTERING, 2835 .use_clustering = ENABLE_CLUSTERING,
2836 .use_sg_chaining = ENABLE_SG_CHAINING,
2837}; 2836};
2838 2837
2839static int initio_probe_one(struct pci_dev *pdev, 2838static int initio_probe_one(struct pci_dev *pdev,
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index e5be5fd4ef58..b6f99dfbb038 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -1933,7 +1933,6 @@ static struct scsi_host_template iscsi_sht = {
1933 .eh_device_reset_handler= iscsi_eh_device_reset, 1933 .eh_device_reset_handler= iscsi_eh_device_reset,
1934 .eh_host_reset_handler = iscsi_eh_host_reset, 1934 .eh_host_reset_handler = iscsi_eh_host_reset,
1935 .use_clustering = DISABLE_CLUSTERING, 1935 .use_clustering = DISABLE_CLUSTERING,
1936 .use_sg_chaining = ENABLE_SG_CHAINING,
1937 .slave_configure = iscsi_tcp_slave_configure, 1936 .slave_configure = iscsi_tcp_slave_configure,
1938 .proc_name = "iscsi_tcp", 1937 .proc_name = "iscsi_tcp",
1939 .this_id = -1, 1938 .this_id = -1,
diff --git a/drivers/scsi/libsrp.c b/drivers/scsi/libsrp.c
index 5cff0204227d..6d6a76e65a6c 100644
--- a/drivers/scsi/libsrp.c
+++ b/drivers/scsi/libsrp.c
@@ -426,8 +426,8 @@ int srp_cmd_queue(struct Scsi_Host *shost, struct srp_cmd *cmd, void *info,
426 426
427 sc->SCp.ptr = info; 427 sc->SCp.ptr = info;
428 memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE); 428 memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE);
429 sc->request_bufflen = len; 429 sc->sdb.length = len;
430 sc->request_buffer = (void *) (unsigned long) addr; 430 sc->sdb.table.sgl = (void *) (unsigned long) addr;
431 sc->tag = tag; 431 sc->tag = tag;
432 err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun, 432 err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun,
433 cmd->tag); 433 cmd->tag);
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 6483c62730b3..fc5c3a42b05a 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -1459,7 +1459,6 @@ struct scsi_host_template lpfc_template = {
1459 .scan_finished = lpfc_scan_finished, 1459 .scan_finished = lpfc_scan_finished,
1460 .this_id = -1, 1460 .this_id = -1,
1461 .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, 1461 .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT,
1462 .use_sg_chaining = ENABLE_SG_CHAINING,
1463 .cmd_per_lun = LPFC_CMD_PER_LUN, 1462 .cmd_per_lun = LPFC_CMD_PER_LUN,
1464 .use_clustering = ENABLE_CLUSTERING, 1463 .use_clustering = ENABLE_CLUSTERING,
1465 .shost_attrs = lpfc_hba_attrs, 1464 .shost_attrs = lpfc_hba_attrs,
@@ -1482,7 +1481,6 @@ struct scsi_host_template lpfc_vport_template = {
1482 .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, 1481 .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT,
1483 .cmd_per_lun = LPFC_CMD_PER_LUN, 1482 .cmd_per_lun = LPFC_CMD_PER_LUN,
1484 .use_clustering = ENABLE_CLUSTERING, 1483 .use_clustering = ENABLE_CLUSTERING,
1485 .use_sg_chaining = ENABLE_SG_CHAINING,
1486 .shost_attrs = lpfc_vport_attrs, 1484 .shost_attrs = lpfc_vport_attrs,
1487 .max_sectors = 0xFFFF, 1485 .max_sectors = 0xFFFF,
1488}; 1486};
diff --git a/drivers/scsi/mac53c94.c b/drivers/scsi/mac53c94.c
index a035001f4438..b12ad7c7c673 100644
--- a/drivers/scsi/mac53c94.c
+++ b/drivers/scsi/mac53c94.c
@@ -402,7 +402,6 @@ static struct scsi_host_template mac53c94_template = {
402 .sg_tablesize = SG_ALL, 402 .sg_tablesize = SG_ALL,
403 .cmd_per_lun = 1, 403 .cmd_per_lun = 1,
404 .use_clustering = DISABLE_CLUSTERING, 404 .use_clustering = DISABLE_CLUSTERING,
405 .use_sg_chaining = ENABLE_SG_CHAINING,
406}; 405};
407 406
408static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match) 407static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match)
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c
index 765c24d2bc38..4d59ae8491a4 100644
--- a/drivers/scsi/megaraid.c
+++ b/drivers/scsi/megaraid.c
@@ -4490,7 +4490,6 @@ static struct scsi_host_template megaraid_template = {
4490 .sg_tablesize = MAX_SGLIST, 4490 .sg_tablesize = MAX_SGLIST,
4491 .cmd_per_lun = DEF_CMD_PER_LUN, 4491 .cmd_per_lun = DEF_CMD_PER_LUN,
4492 .use_clustering = ENABLE_CLUSTERING, 4492 .use_clustering = ENABLE_CLUSTERING,
4493 .use_sg_chaining = ENABLE_SG_CHAINING,
4494 .eh_abort_handler = megaraid_abort, 4493 .eh_abort_handler = megaraid_abort,
4495 .eh_device_reset_handler = megaraid_reset, 4494 .eh_device_reset_handler = megaraid_reset,
4496 .eh_bus_reset_handler = megaraid_reset, 4495 .eh_bus_reset_handler = megaraid_reset,
diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c
index 24e32e446e76..6db77c00e3ee 100644
--- a/drivers/scsi/megaraid/megaraid_mbox.c
+++ b/drivers/scsi/megaraid/megaraid_mbox.c
@@ -361,7 +361,6 @@ static struct scsi_host_template megaraid_template_g = {
361 .eh_host_reset_handler = megaraid_reset_handler, 361 .eh_host_reset_handler = megaraid_reset_handler,
362 .change_queue_depth = megaraid_change_queue_depth, 362 .change_queue_depth = megaraid_change_queue_depth,
363 .use_clustering = ENABLE_CLUSTERING, 363 .use_clustering = ENABLE_CLUSTERING,
364 .use_sg_chaining = ENABLE_SG_CHAINING,
365 .sdev_attrs = megaraid_sdev_attrs, 364 .sdev_attrs = megaraid_sdev_attrs,
366 .shost_attrs = megaraid_shost_attrs, 365 .shost_attrs = megaraid_shost_attrs,
367}; 366};
diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c
index d7ec921865c4..672c759ac24d 100644
--- a/drivers/scsi/megaraid/megaraid_sas.c
+++ b/drivers/scsi/megaraid/megaraid_sas.c
@@ -1192,7 +1192,6 @@ static struct scsi_host_template megasas_template = {
1192 .eh_timed_out = megasas_reset_timer, 1192 .eh_timed_out = megasas_reset_timer,
1193 .bios_param = megasas_bios_param, 1193 .bios_param = megasas_bios_param,
1194 .use_clustering = ENABLE_CLUSTERING, 1194 .use_clustering = ENABLE_CLUSTERING,
1195 .use_sg_chaining = ENABLE_SG_CHAINING,
1196}; 1195};
1197 1196
1198/** 1197/**
diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c
index 7470ff39ab22..651d09b08f2a 100644
--- a/drivers/scsi/mesh.c
+++ b/drivers/scsi/mesh.c
@@ -1843,7 +1843,6 @@ static struct scsi_host_template mesh_template = {
1843 .sg_tablesize = SG_ALL, 1843 .sg_tablesize = SG_ALL,
1844 .cmd_per_lun = 2, 1844 .cmd_per_lun = 2,
1845 .use_clustering = DISABLE_CLUSTERING, 1845 .use_clustering = DISABLE_CLUSTERING,
1846 .use_sg_chaining = ENABLE_SG_CHAINING,
1847}; 1846};
1848 1847
1849static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match) 1848static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match)
diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c
index c02771aa6c9b..c5ebf018b378 100644
--- a/drivers/scsi/ncr53c8xx.c
+++ b/drivers/scsi/ncr53c8xx.c
@@ -4967,7 +4967,7 @@ void ncr_complete (struct ncb *np, struct ccb *cp)
4967 sizeof(cp->sense_buf))); 4967 sizeof(cp->sense_buf)));
4968 4968
4969 if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) { 4969 if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) {
4970 u_char * p = (u_char*) & cmd->sense_buffer; 4970 u_char *p = cmd->sense_buffer;
4971 int i; 4971 int i;
4972 PRINT_ADDR(cmd, "sense data:"); 4972 PRINT_ADDR(cmd, "sense data:");
4973 for (i=0; i<14; i++) printk (" %x", *p++); 4973 for (i=0; i<14; i++) printk (" %x", *p++);
diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c
index 28161dc95e0d..7fed35372150 100644
--- a/drivers/scsi/nsp32.c
+++ b/drivers/scsi/nsp32.c
@@ -281,7 +281,6 @@ static struct scsi_host_template nsp32_template = {
281 .cmd_per_lun = 1, 281 .cmd_per_lun = 1,
282 .this_id = NSP32_HOST_SCSIID, 282 .this_id = NSP32_HOST_SCSIID,
283 .use_clustering = DISABLE_CLUSTERING, 283 .use_clustering = DISABLE_CLUSTERING,
284 .use_sg_chaining = ENABLE_SG_CHAINING,
285 .eh_abort_handler = nsp32_eh_abort, 284 .eh_abort_handler = nsp32_eh_abort,
286 .eh_bus_reset_handler = nsp32_eh_bus_reset, 285 .eh_bus_reset_handler = nsp32_eh_bus_reset,
287 .eh_host_reset_handler = nsp32_eh_host_reset, 286 .eh_host_reset_handler = nsp32_eh_host_reset,
diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c
index 969b9387a0c3..3454a5714749 100644
--- a/drivers/scsi/pcmcia/sym53c500_cs.c
+++ b/drivers/scsi/pcmcia/sym53c500_cs.c
@@ -692,7 +692,6 @@ static struct scsi_host_template sym53c500_driver_template = {
692 .sg_tablesize = 32, 692 .sg_tablesize = 32,
693 .cmd_per_lun = 1, 693 .cmd_per_lun = 1,
694 .use_clustering = ENABLE_CLUSTERING, 694 .use_clustering = ENABLE_CLUSTERING,
695 .use_sg_chaining = ENABLE_SG_CHAINING,
696 .shost_attrs = SYM53C500_shost_attrs 695 .shost_attrs = SYM53C500_shost_attrs
697}; 696};
698 697
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index c94906abfee3..68c0d09ffe78 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -4204,7 +4204,6 @@ static struct scsi_host_template qla1280_driver_template = {
4204 .sg_tablesize = SG_ALL, 4204 .sg_tablesize = SG_ALL,
4205 .cmd_per_lun = 1, 4205 .cmd_per_lun = 1,
4206 .use_clustering = ENABLE_CLUSTERING, 4206 .use_clustering = ENABLE_CLUSTERING,
4207 .use_sg_chaining = ENABLE_SG_CHAINING,
4208}; 4207};
4209 4208
4210 4209
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index aba1e6d48066..3954ed2d7b51 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -131,7 +131,6 @@ static struct scsi_host_template qla2x00_driver_template = {
131 .this_id = -1, 131 .this_id = -1,
132 .cmd_per_lun = 3, 132 .cmd_per_lun = 3,
133 .use_clustering = ENABLE_CLUSTERING, 133 .use_clustering = ENABLE_CLUSTERING,
134 .use_sg_chaining = ENABLE_SG_CHAINING,
135 .sg_tablesize = SG_ALL, 134 .sg_tablesize = SG_ALL,
136 135
137 /* 136 /*
@@ -163,7 +162,6 @@ struct scsi_host_template qla24xx_driver_template = {
163 .this_id = -1, 162 .this_id = -1,
164 .cmd_per_lun = 3, 163 .cmd_per_lun = 3,
165 .use_clustering = ENABLE_CLUSTERING, 164 .use_clustering = ENABLE_CLUSTERING,
166 .use_sg_chaining = ENABLE_SG_CHAINING,
167 .sg_tablesize = SG_ALL, 165 .sg_tablesize = SG_ALL,
168 166
169 .max_sectors = 0xFFFF, 167 .max_sectors = 0xFFFF,
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index d3f86646cb08..2e2b9fedffcc 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -94,7 +94,6 @@ static struct scsi_host_template qla4xxx_driver_template = {
94 .this_id = -1, 94 .this_id = -1,
95 .cmd_per_lun = 3, 95 .cmd_per_lun = 3,
96 .use_clustering = ENABLE_CLUSTERING, 96 .use_clustering = ENABLE_CLUSTERING,
97 .use_sg_chaining = ENABLE_SG_CHAINING,
98 .sg_tablesize = SG_ALL, 97 .sg_tablesize = SG_ALL,
99 98
100 .max_sectors = 0xFFFF, 99 .max_sectors = 0xFFFF,
diff --git a/drivers/scsi/qlogicfas.c b/drivers/scsi/qlogicfas.c
index 1769f965eedf..1e874f1fb5c6 100644
--- a/drivers/scsi/qlogicfas.c
+++ b/drivers/scsi/qlogicfas.c
@@ -197,7 +197,6 @@ static struct scsi_host_template qlogicfas_driver_template = {
197 .sg_tablesize = SG_ALL, 197 .sg_tablesize = SG_ALL,
198 .cmd_per_lun = 1, 198 .cmd_per_lun = 1,
199 .use_clustering = DISABLE_CLUSTERING, 199 .use_clustering = DISABLE_CLUSTERING,
200 .use_sg_chaining = ENABLE_SG_CHAINING,
201}; 200};
202 201
203static __init int qlogicfas_init(void) 202static __init int qlogicfas_init(void)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 1a9fba6a9f92..b35d19472caa 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -757,7 +757,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd)
757 "Notifying upper driver of completion " 757 "Notifying upper driver of completion "
758 "(result %x)\n", cmd->result)); 758 "(result %x)\n", cmd->result));
759 759
760 good_bytes = cmd->request_bufflen; 760 good_bytes = scsi_bufflen(cmd);
761 if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) { 761 if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) {
762 drv = scsi_cmd_to_driver(cmd); 762 drv = scsi_cmd_to_driver(cmd);
763 if (drv->done) 763 if (drv->done)
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 82c06f0a9d02..1541c174937a 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -280,6 +280,8 @@ static int resp_write(struct scsi_cmnd * SCpnt, unsigned long long lba,
280 unsigned int num, struct sdebug_dev_info * devip); 280 unsigned int num, struct sdebug_dev_info * devip);
281static int resp_report_luns(struct scsi_cmnd * SCpnt, 281static int resp_report_luns(struct scsi_cmnd * SCpnt,
282 struct sdebug_dev_info * devip); 282 struct sdebug_dev_info * devip);
283static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba,
284 unsigned int num, struct sdebug_dev_info *devip);
283static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, 285static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
284 int arr_len); 286 int arr_len);
285static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, 287static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
@@ -311,12 +313,48 @@ static void sdebug_max_tgts_luns(void);
311static struct device pseudo_primary; 313static struct device pseudo_primary;
312static struct bus_type pseudo_lld_bus; 314static struct bus_type pseudo_lld_bus;
313 315
316static void get_data_transfer_info(unsigned char *cmd,
317 unsigned long long *lba, unsigned int *num)
318{
319 int i;
320
321 switch (*cmd) {
322 case WRITE_16:
323 case READ_16:
324 for (*lba = 0, i = 0; i < 8; ++i) {
325 if (i > 0)
326 *lba <<= 8;
327 *lba += cmd[2 + i];
328 }
329 *num = cmd[13] + (cmd[12] << 8) +
330 (cmd[11] << 16) + (cmd[10] << 24);
331 break;
332 case WRITE_12:
333 case READ_12:
334 *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24);
335 *num = cmd[9] + (cmd[8] << 8) + (cmd[7] << 16) + (cmd[6] << 24);
336 break;
337 case WRITE_10:
338 case READ_10:
339 case XDWRITEREAD_10:
340 *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24);
341 *num = cmd[8] + (cmd[7] << 8);
342 break;
343 case WRITE_6:
344 case READ_6:
345 *lba = cmd[3] + (cmd[2] << 8) + ((cmd[1] & 0x1f) << 16);
346 *num = (0 == cmd[4]) ? 256 : cmd[4];
347 break;
348 default:
349 break;
350 }
351}
314 352
315static 353static
316int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) 354int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
317{ 355{
318 unsigned char *cmd = (unsigned char *) SCpnt->cmnd; 356 unsigned char *cmd = (unsigned char *) SCpnt->cmnd;
319 int len, k, j; 357 int len, k;
320 unsigned int num; 358 unsigned int num;
321 unsigned long long lba; 359 unsigned long long lba;
322 int errsts = 0; 360 int errsts = 0;
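The new get_data_transfer_info() helper centralizes the CDB parsing that the following hunks delete from the individual READ and WRITE branches: the LBA and block count are big-endian fields whose width and position depend on the opcode. A stand-alone sketch of the same arithmetic for the 10-byte variant (example CDB values, not taken from the driver):

#include <stdio.h>

/* Decode LBA and block count from a READ(10)/WRITE(10) CDB:
 * bytes 2-5 hold a 32-bit big-endian LBA, bytes 7-8 a 16-bit length. */
static void decode_rw10(const unsigned char *cdb,
			unsigned long long *lba, unsigned int *num)
{
	*lba = ((unsigned long long)cdb[2] << 24) | (cdb[3] << 16) |
	       (cdb[4] << 8) | cdb[5];
	*num = (cdb[7] << 8) | cdb[8];
}

int main(void)
{
	/* READ(10) for 8 blocks starting at LBA 0x12345678 (made-up values) */
	unsigned char cdb[10] = { 0x28, 0, 0x12, 0x34, 0x56, 0x78, 0, 0x00, 0x08, 0 };
	unsigned long long lba;
	unsigned int num;

	decode_rw10(cdb, &lba, &num);
	printf("lba=0x%llx num=%u\n", lba, num);	/* lba=0x12345678 num=8 */
	return 0;
}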
@@ -452,28 +490,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
452 break; 490 break;
453 if (scsi_debug_fake_rw) 491 if (scsi_debug_fake_rw)
454 break; 492 break;
455 if ((*cmd) == READ_16) { 493 get_data_transfer_info(cmd, &lba, &num);
456 for (lba = 0, j = 0; j < 8; ++j) {
457 if (j > 0)
458 lba <<= 8;
459 lba += cmd[2 + j];
460 }
461 num = cmd[13] + (cmd[12] << 8) +
462 (cmd[11] << 16) + (cmd[10] << 24);
463 } else if ((*cmd) == READ_12) {
464 lba = cmd[5] + (cmd[4] << 8) +
465 (cmd[3] << 16) + (cmd[2] << 24);
466 num = cmd[9] + (cmd[8] << 8) +
467 (cmd[7] << 16) + (cmd[6] << 24);
468 } else if ((*cmd) == READ_10) {
469 lba = cmd[5] + (cmd[4] << 8) +
470 (cmd[3] << 16) + (cmd[2] << 24);
471 num = cmd[8] + (cmd[7] << 8);
472 } else { /* READ (6) */
473 lba = cmd[3] + (cmd[2] << 8) +
474 ((cmd[1] & 0x1f) << 16);
475 num = (0 == cmd[4]) ? 256 : cmd[4];
476 }
477 errsts = resp_read(SCpnt, lba, num, devip); 494 errsts = resp_read(SCpnt, lba, num, devip);
478 if (inj_recovered && (0 == errsts)) { 495 if (inj_recovered && (0 == errsts)) {
479 mk_sense_buffer(devip, RECOVERED_ERROR, 496 mk_sense_buffer(devip, RECOVERED_ERROR,
@@ -500,28 +517,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
500 break; 517 break;
501 if (scsi_debug_fake_rw) 518 if (scsi_debug_fake_rw)
502 break; 519 break;
503 if ((*cmd) == WRITE_16) { 520 get_data_transfer_info(cmd, &lba, &num);
504 for (lba = 0, j = 0; j < 8; ++j) {
505 if (j > 0)
506 lba <<= 8;
507 lba += cmd[2 + j];
508 }
509 num = cmd[13] + (cmd[12] << 8) +
510 (cmd[11] << 16) + (cmd[10] << 24);
511 } else if ((*cmd) == WRITE_12) {
512 lba = cmd[5] + (cmd[4] << 8) +
513 (cmd[3] << 16) + (cmd[2] << 24);
514 num = cmd[9] + (cmd[8] << 8) +
515 (cmd[7] << 16) + (cmd[6] << 24);
516 } else if ((*cmd) == WRITE_10) {
517 lba = cmd[5] + (cmd[4] << 8) +
518 (cmd[3] << 16) + (cmd[2] << 24);
519 num = cmd[8] + (cmd[7] << 8);
520 } else { /* WRITE (6) */
521 lba = cmd[3] + (cmd[2] << 8) +
522 ((cmd[1] & 0x1f) << 16);
523 num = (0 == cmd[4]) ? 256 : cmd[4];
524 }
525 errsts = resp_write(SCpnt, lba, num, devip); 521 errsts = resp_write(SCpnt, lba, num, devip);
526 if (inj_recovered && (0 == errsts)) { 522 if (inj_recovered && (0 == errsts)) {
527 mk_sense_buffer(devip, RECOVERED_ERROR, 523 mk_sense_buffer(devip, RECOVERED_ERROR,
@@ -549,6 +545,28 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
549 case WRITE_BUFFER: 545 case WRITE_BUFFER:
550 errsts = check_readiness(SCpnt, 1, devip); 546 errsts = check_readiness(SCpnt, 1, devip);
551 break; 547 break;
548 case XDWRITEREAD_10:
549 if (!scsi_bidi_cmnd(SCpnt)) {
550 mk_sense_buffer(devip, ILLEGAL_REQUEST,
551 INVALID_FIELD_IN_CDB, 0);
552 errsts = check_condition_result;
553 break;
554 }
555
556 errsts = check_readiness(SCpnt, 0, devip);
557 if (errsts)
558 break;
559 if (scsi_debug_fake_rw)
560 break;
561 get_data_transfer_info(cmd, &lba, &num);
562 errsts = resp_read(SCpnt, lba, num, devip);
563 if (errsts)
564 break;
565 errsts = resp_write(SCpnt, lba, num, devip);
566 if (errsts)
567 break;
568 errsts = resp_xdwriteread(SCpnt, lba, num, devip);
569 break;
552 default: 570 default:
553 if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) 571 if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
554 printk(KERN_INFO "scsi_debug: Opcode: 0x%x not " 572 printk(KERN_INFO "scsi_debug: Opcode: 0x%x not "
@@ -601,18 +619,18 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
601 int k, req_len, act_len, len, active; 619 int k, req_len, act_len, len, active;
602 void * kaddr; 620 void * kaddr;
603 void * kaddr_off; 621 void * kaddr_off;
604 struct scatterlist * sg; 622 struct scatterlist *sg;
623 struct scsi_data_buffer *sdb = scsi_in(scp);
605 624
606 if (0 == scsi_bufflen(scp)) 625 if (!sdb->length)
607 return 0; 626 return 0;
608 if (NULL == scsi_sglist(scp)) 627 if (!sdb->table.sgl)
609 return (DID_ERROR << 16); 628 return (DID_ERROR << 16);
610 if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) || 629 if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE))
611 (scp->sc_data_direction == DMA_FROM_DEVICE)))
612 return (DID_ERROR << 16); 630 return (DID_ERROR << 16);
613 active = 1; 631 active = 1;
614 req_len = act_len = 0; 632 req_len = act_len = 0;
615 scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { 633 for_each_sg(sdb->table.sgl, sg, sdb->table.nents, k) {
616 if (active) { 634 if (active) {
617 kaddr = (unsigned char *) 635 kaddr = (unsigned char *)
618 kmap_atomic(sg_page(sg), KM_USER0); 636 kmap_atomic(sg_page(sg), KM_USER0);
@@ -630,10 +648,10 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
630 } 648 }
631 req_len += sg->length; 649 req_len += sg->length;
632 } 650 }
633 if (scsi_get_resid(scp)) 651 if (sdb->resid)
634 scsi_set_resid(scp, scsi_get_resid(scp) - act_len); 652 sdb->resid -= act_len;
635 else 653 else
636 scsi_set_resid(scp, req_len - act_len); 654 sdb->resid = req_len - act_len;
637 return 0; 655 return 0;
638} 656}
639 657
@@ -650,8 +668,7 @@ static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
650 return 0; 668 return 0;
651 if (NULL == scsi_sglist(scp)) 669 if (NULL == scsi_sglist(scp))
652 return -1; 670 return -1;
653 if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) || 671 if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_TO_DEVICE))
654 (scp->sc_data_direction == DMA_TO_DEVICE)))
655 return -1; 672 return -1;
656 req_len = fin = 0; 673 req_len = fin = 0;
657 scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { 674 scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) {
@@ -1956,6 +1973,50 @@ static int resp_report_luns(struct scsi_cmnd * scp,
1956 min((int)alloc_len, SDEBUG_RLUN_ARR_SZ)); 1973 min((int)alloc_len, SDEBUG_RLUN_ARR_SZ));
1957} 1974}
1958 1975
1976static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba,
1977 unsigned int num, struct sdebug_dev_info *devip)
1978{
1979 int i, j, ret = -1;
1980 unsigned char *kaddr, *buf;
1981 unsigned int offset;
1982 struct scatterlist *sg;
1983 struct scsi_data_buffer *sdb = scsi_in(scp);
1984
1985	/* it would be better not to use a temporary buffer. */
1986 buf = kmalloc(scsi_bufflen(scp), GFP_ATOMIC);
1987 if (!buf)
1988 return ret;
1989
1990 offset = 0;
1991 scsi_for_each_sg(scp, sg, scsi_sg_count(scp), i) {
1992 kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0);
1993 if (!kaddr)
1994 goto out;
1995
1996 memcpy(buf + offset, kaddr + sg->offset, sg->length);
1997 offset += sg->length;
1998 kunmap_atomic(kaddr, KM_USER0);
1999 }
2000
2001 offset = 0;
2002 for_each_sg(sdb->table.sgl, sg, sdb->table.nents, i) {
2003 kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0);
2004 if (!kaddr)
2005 goto out;
2006
2007 for (j = 0; j < sg->length; j++)
2008 *(kaddr + sg->offset + j) ^= *(buf + offset + j);
2009
2010 offset += sg->length;
2011 kunmap_atomic(kaddr, KM_USER0);
2012 }
2013 ret = 0;
2014out:
2015 kfree(buf);
2016
2017 return ret;
2018}
2019
1959/* When timer goes off this function is called. */ 2020/* When timer goes off this function is called. */
1960static void timer_intr_handler(unsigned long indx) 2021static void timer_intr_handler(unsigned long indx)
1961{ 2022{
@@ -1989,6 +2050,7 @@ static int scsi_debug_slave_alloc(struct scsi_device * sdp)
1989 if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) 2050 if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
1990 printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n", 2051 printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n",
1991 sdp->host->host_no, sdp->channel, sdp->id, sdp->lun); 2052 sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
2053 set_bit(QUEUE_FLAG_BIDI, &sdp->request_queue->queue_flags);
1992 return 0; 2054 return 0;
1993} 2055}
1994 2056
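Stripped of the scatterlist walking and kmap_atomic() bookkeeping, the new resp_xdwriteread() boils down to one operation: buffer the data the initiator wrote, then XOR it element by element into the read-back buffer that resp_read() already filled. A stand-alone sketch of that core step, with made-up buffers:

#include <stdio.h>
#include <string.h>

/* XOR the "write" data into the "read-back" buffer, as the
 * XDWRITEREAD(10) handler does per scatterlist segment. */
static void xor_into(unsigned char *readback, const unsigned char *written,
		     size_t len)
{
	for (size_t i = 0; i < len; i++)
		readback[i] ^= written[i];
}

int main(void)
{
	unsigned char old_data[4] = { 0x00, 0xff, 0x0f, 0xf0 };	/* on "disk" */
	unsigned char new_data[4] = { 0xaa, 0xaa, 0xaa, 0xaa };	/* host write */
	unsigned char readback[4];

	memcpy(readback, old_data, sizeof(readback));
	xor_into(readback, new_data, sizeof(readback));

	for (size_t i = 0; i < sizeof(readback); i++)
		printf("%02x ", readback[i]);	/* aa 55 a5 5a */
	putchar('\n');
	return 0;
}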
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 547e85aa414f..045a0868fc7b 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -617,29 +617,27 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses,
617 ses->cmd_len = scmd->cmd_len; 617 ses->cmd_len = scmd->cmd_len;
618 memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd)); 618 memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd));
619 ses->data_direction = scmd->sc_data_direction; 619 ses->data_direction = scmd->sc_data_direction;
620 ses->bufflen = scmd->request_bufflen; 620 ses->sdb = scmd->sdb;
621 ses->buffer = scmd->request_buffer; 621 ses->next_rq = scmd->request->next_rq;
622 ses->use_sg = scmd->use_sg;
623 ses->resid = scmd->resid;
624 ses->result = scmd->result; 622 ses->result = scmd->result;
625 623
624 memset(&scmd->sdb, 0, sizeof(scmd->sdb));
625 scmd->request->next_rq = NULL;
626
626 if (sense_bytes) { 627 if (sense_bytes) {
627 scmd->request_bufflen = min_t(unsigned, 628 scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE,
628 SCSI_SENSE_BUFFERSIZE, sense_bytes); 629 sense_bytes);
629 sg_init_one(&ses->sense_sgl, scmd->sense_buffer, 630 sg_init_one(&ses->sense_sgl, scmd->sense_buffer,
630 scmd->request_bufflen); 631 scmd->sdb.length);
631 scmd->request_buffer = &ses->sense_sgl; 632 scmd->sdb.table.sgl = &ses->sense_sgl;
632 scmd->sc_data_direction = DMA_FROM_DEVICE; 633 scmd->sc_data_direction = DMA_FROM_DEVICE;
633 scmd->use_sg = 1; 634 scmd->sdb.table.nents = 1;
634 memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); 635 memset(scmd->cmnd, 0, sizeof(scmd->cmnd));
635 scmd->cmnd[0] = REQUEST_SENSE; 636 scmd->cmnd[0] = REQUEST_SENSE;
636 scmd->cmnd[4] = scmd->request_bufflen; 637 scmd->cmnd[4] = scmd->sdb.length;
637 scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); 638 scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]);
638 } else { 639 } else {
639 scmd->request_buffer = NULL;
640 scmd->request_bufflen = 0;
641 scmd->sc_data_direction = DMA_NONE; 640 scmd->sc_data_direction = DMA_NONE;
642 scmd->use_sg = 0;
643 if (cmnd) { 641 if (cmnd) {
644 memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); 642 memset(scmd->cmnd, 0, sizeof(scmd->cmnd));
645 memcpy(scmd->cmnd, cmnd, cmnd_size); 643 memcpy(scmd->cmnd, cmnd, cmnd_size);
@@ -676,10 +674,8 @@ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses)
676 scmd->cmd_len = ses->cmd_len; 674 scmd->cmd_len = ses->cmd_len;
677 memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd)); 675 memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd));
678 scmd->sc_data_direction = ses->data_direction; 676 scmd->sc_data_direction = ses->data_direction;
679 scmd->request_bufflen = ses->bufflen; 677 scmd->sdb = ses->sdb;
680 scmd->request_buffer = ses->buffer; 678 scmd->request->next_rq = ses->next_rq;
681 scmd->use_sg = ses->use_sg;
682 scmd->resid = ses->resid;
683 scmd->result = ses->result; 679 scmd->result = ses->result;
684} 680}
685EXPORT_SYMBOL(scsi_eh_restore_cmnd); 681EXPORT_SYMBOL(scsi_eh_restore_cmnd);
@@ -1700,8 +1696,7 @@ scsi_reset_provider(struct scsi_device *dev, int flag)
1700 memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd)); 1696 memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd));
1701 1697
1702 scmd->scsi_done = scsi_reset_provider_done_command; 1698 scmd->scsi_done = scsi_reset_provider_done_command;
1703 scmd->request_buffer = NULL; 1699 memset(&scmd->sdb, 0, sizeof(scmd->sdb));
1704 scmd->request_bufflen = 0;
1705 1700
1706 scmd->cmd_len = 0; 1701 scmd->cmd_len = 0;
1707 1702
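scsi_eh_prep_cmnd() and scsi_eh_restore_cmnd() now save and restore the command's data buffer with a single structure assignment instead of copying bufflen, buffer, use_sg and resid one field at a time, then temporarily point a one-entry scatterlist at the sense buffer for the REQUEST SENSE they issue. The save, override and restore idiom looks like this outside the kernel (hypothetical stand-in struct, not the real scsi_data_buffer):

#include <stdio.h>

struct data_buffer {		/* hypothetical stand-in for scsi_data_buffer */
	unsigned int length;
	unsigned int nents;
	void *sgl;
};

struct eh_save {
	struct data_buffer sdb;	/* whole struct saved by value */
};

int main(void)
{
	char io_sgl[64], sense[96];
	struct data_buffer cmd_sdb = { .length = 4096, .nents = 8, .sgl = io_sgl };
	struct eh_save ses;

	/* prep: one assignment saves every field, then repoint at the sense buffer */
	ses.sdb = cmd_sdb;
	cmd_sdb = (struct data_buffer){ .length = sizeof(sense), .nents = 1, .sgl = sense };
	printf("during eh: length=%u nents=%u\n", cmd_sdb.length, cmd_sdb.nents);

	/* restore: one assignment brings the original mapping back */
	cmd_sdb = ses.sdb;
	printf("restored:  length=%u nents=%u\n", cmd_sdb.length, cmd_sdb.nents);
	return 0;
}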
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7c4c889c5221..b12fb310e399 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/bio.h> 10#include <linux/bio.h>
11#include <linux/bitops.h>
11#include <linux/blkdev.h> 12#include <linux/blkdev.h>
12#include <linux/completion.h> 13#include <linux/completion.h>
13#include <linux/kernel.h> 14#include <linux/kernel.h>
@@ -34,13 +35,6 @@
34#define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools) 35#define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools)
35#define SG_MEMPOOL_SIZE 2 36#define SG_MEMPOOL_SIZE 2
36 37
37/*
38 * The maximum number of SG segments that we will put inside a scatterlist
39 * (unless chaining is used). Should ideally fit inside a single page, to
40 * avoid a higher order allocation.
41 */
42#define SCSI_MAX_SG_SEGMENTS 128
43
44struct scsi_host_sg_pool { 38struct scsi_host_sg_pool {
45 size_t size; 39 size_t size;
46 char *name; 40 char *name;
@@ -48,22 +42,31 @@ struct scsi_host_sg_pool {
48 mempool_t *pool; 42 mempool_t *pool;
49}; 43};
50 44
51#define SP(x) { x, "sgpool-" #x } 45#define SP(x) { x, "sgpool-" __stringify(x) }
46#if (SCSI_MAX_SG_SEGMENTS < 32)
47#error SCSI_MAX_SG_SEGMENTS is too small (must be 32 or greater)
48#endif
52static struct scsi_host_sg_pool scsi_sg_pools[] = { 49static struct scsi_host_sg_pool scsi_sg_pools[] = {
53 SP(8), 50 SP(8),
54 SP(16), 51 SP(16),
55#if (SCSI_MAX_SG_SEGMENTS > 16)
56 SP(32),
57#if (SCSI_MAX_SG_SEGMENTS > 32) 52#if (SCSI_MAX_SG_SEGMENTS > 32)
58 SP(64), 53 SP(32),
59#if (SCSI_MAX_SG_SEGMENTS > 64) 54#if (SCSI_MAX_SG_SEGMENTS > 64)
55 SP(64),
56#if (SCSI_MAX_SG_SEGMENTS > 128)
60 SP(128), 57 SP(128),
58#if (SCSI_MAX_SG_SEGMENTS > 256)
59#error SCSI_MAX_SG_SEGMENTS is too large (256 MAX)
60#endif
61#endif 61#endif
62#endif 62#endif
63#endif 63#endif
64 SP(SCSI_MAX_SG_SEGMENTS)
64}; 65};
65#undef SP 66#undef SP
66 67
68static struct kmem_cache *scsi_bidi_sdb_cache;
69
67static void scsi_run_queue(struct request_queue *q); 70static void scsi_run_queue(struct request_queue *q);
68 71
69/* 72/*
@@ -440,7 +443,7 @@ EXPORT_SYMBOL_GPL(scsi_execute_async);
440static void scsi_init_cmd_errh(struct scsi_cmnd *cmd) 443static void scsi_init_cmd_errh(struct scsi_cmnd *cmd)
441{ 444{
442 cmd->serial_number = 0; 445 cmd->serial_number = 0;
443 cmd->resid = 0; 446 scsi_set_resid(cmd, 0);
444 memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); 447 memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
445 if (cmd->cmd_len == 0) 448 if (cmd->cmd_len == 0)
446 cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]); 449 cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]);
@@ -690,42 +693,16 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
690 return NULL; 693 return NULL;
691} 694}
692 695
693/*
694 * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
695 * is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
696 */
697#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048
698
699static inline unsigned int scsi_sgtable_index(unsigned short nents) 696static inline unsigned int scsi_sgtable_index(unsigned short nents)
700{ 697{
701 unsigned int index; 698 unsigned int index;
702 699
703 switch (nents) { 700 BUG_ON(nents > SCSI_MAX_SG_SEGMENTS);
704 case 1 ... 8: 701
702 if (nents <= 8)
705 index = 0; 703 index = 0;
706 break; 704 else
707 case 9 ... 16: 705 index = get_count_order(nents) - 3;
708 index = 1;
709 break;
710#if (SCSI_MAX_SG_SEGMENTS > 16)
711 case 17 ... 32:
712 index = 2;
713 break;
714#if (SCSI_MAX_SG_SEGMENTS > 32)
715 case 33 ... 64:
716 index = 3;
717 break;
718#if (SCSI_MAX_SG_SEGMENTS > 64)
719 case 65 ... 128:
720 index = 4;
721 break;
722#endif
723#endif
724#endif
725 default:
726 printk(KERN_ERR "scsi: bad segment count=%d\n", nents);
727 BUG();
728 }
729 706
730 return index; 707 return index;
731} 708}
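The rewritten scsi_sgtable_index() replaces the cascading switch with arithmetic: above 8 entries, get_count_order() rounds the count up to the next power of two, and subtracting 3 makes 16 entries map to pool 1, 32 to pool 2, and so on, matching the sgpool-8/16/32/64/128 table declared earlier in this patch. A quick stand-alone check of the mapping, with get_count_order() reimplemented locally for the demo:

#include <stdio.h>

/* Local stand-in for the kernel's get_count_order():
 * smallest order such that (1 << order) >= count. */
static unsigned int count_order(unsigned int count)
{
	unsigned int order = 0;

	while ((1u << order) < count)
		order++;
	return order;
}

static unsigned int sgtable_index(unsigned int nents)
{
	return (nents <= 8) ? 0 : count_order(nents) - 3;
}

int main(void)
{
	unsigned int samples[] = { 1, 8, 9, 16, 17, 32, 64, 65, 128 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("nents=%3u -> pool index %u\n",
		       samples[i], sgtable_index(samples[i]));
	/* expected: 1,8 -> 0; 9,16 -> 1; 17,32 -> 2; 64 -> 3; 65,128 -> 4 */
	return 0;
}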
@@ -746,31 +723,27 @@ static struct scatterlist *scsi_sg_alloc(unsigned int nents, gfp_t gfp_mask)
746 return mempool_alloc(sgp->pool, gfp_mask); 723 return mempool_alloc(sgp->pool, gfp_mask);
747} 724}
748 725
749int scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask) 726static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb, int nents,
727 gfp_t gfp_mask)
750{ 728{
751 int ret; 729 int ret;
752 730
753 BUG_ON(!cmd->use_sg); 731 BUG_ON(!nents);
754 732
755 ret = __sg_alloc_table(&cmd->sg_table, cmd->use_sg, 733 ret = __sg_alloc_table(&sdb->table, nents, SCSI_MAX_SG_SEGMENTS,
756 SCSI_MAX_SG_SEGMENTS, gfp_mask, scsi_sg_alloc); 734 gfp_mask, scsi_sg_alloc);
757 if (unlikely(ret)) 735 if (unlikely(ret))
758 __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, 736 __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS,
759 scsi_sg_free); 737 scsi_sg_free);
760 738
761 cmd->request_buffer = cmd->sg_table.sgl;
762 return ret; 739 return ret;
763} 740}
764 741
765EXPORT_SYMBOL(scsi_alloc_sgtable); 742static void scsi_free_sgtable(struct scsi_data_buffer *sdb)
766
767void scsi_free_sgtable(struct scsi_cmnd *cmd)
768{ 743{
769 __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); 744 __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free);
770} 745}
771 746
772EXPORT_SYMBOL(scsi_free_sgtable);
773
774/* 747/*
775 * Function: scsi_release_buffers() 748 * Function: scsi_release_buffers()
776 * 749 *
@@ -788,17 +761,49 @@ EXPORT_SYMBOL(scsi_free_sgtable);
788 * the scatter-gather table, and potentially any bounce 761 * the scatter-gather table, and potentially any bounce
789 * buffers. 762 * buffers.
790 */ 763 */
791static void scsi_release_buffers(struct scsi_cmnd *cmd) 764void scsi_release_buffers(struct scsi_cmnd *cmd)
765{
766 if (cmd->sdb.table.nents)
767 scsi_free_sgtable(&cmd->sdb);
768
769 memset(&cmd->sdb, 0, sizeof(cmd->sdb));
770
771 if (scsi_bidi_cmnd(cmd)) {
772 struct scsi_data_buffer *bidi_sdb =
773 cmd->request->next_rq->special;
774 scsi_free_sgtable(bidi_sdb);
775 kmem_cache_free(scsi_bidi_sdb_cache, bidi_sdb);
776 cmd->request->next_rq->special = NULL;
777 }
778}
779EXPORT_SYMBOL(scsi_release_buffers);
780
781/*
782 * Bidi commands must be completed as a whole, both sides at once.
783 * If only part of the bytes were transferred and the LLD returned
784 * scsi_in()->resid and/or scsi_out()->resid, this information is left
785 * in req->data_len and req->next_rq->data_len. The upper-layer driver
786 * can decide what to do with this information.
787 */
788void scsi_end_bidi_request(struct scsi_cmnd *cmd)
792{ 789{
793 if (cmd->use_sg) 790 struct request *req = cmd->request;
794 scsi_free_sgtable(cmd); 791 unsigned int dlen = req->data_len;
792 unsigned int next_dlen = req->next_rq->data_len;
793
794 req->data_len = scsi_out(cmd)->resid;
795 req->next_rq->data_len = scsi_in(cmd)->resid;
796
797 /* The req and req->next_rq have not been completed */
798 BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen));
799
800 scsi_release_buffers(cmd);
795 801
796 /* 802 /*
797 * Zero these out. They now point to freed memory, and it is 803 * This will goose the queue request function at the end, so we don't
798 * dangerous to hang onto the pointers. 804 * need to worry about launching another command.
799 */ 805 */
800 cmd->request_buffer = NULL; 806 scsi_next_command(cmd);
801 cmd->request_bufflen = 0;
802} 807}
803 808
804/* 809/*
@@ -832,7 +837,7 @@ static void scsi_release_buffers(struct scsi_cmnd *cmd)
832void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) 837void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
833{ 838{
834 int result = cmd->result; 839 int result = cmd->result;
835 int this_count = cmd->request_bufflen; 840 int this_count = scsi_bufflen(cmd);
836 struct request_queue *q = cmd->device->request_queue; 841 struct request_queue *q = cmd->device->request_queue;
837 struct request *req = cmd->request; 842 struct request *req = cmd->request;
838 int clear_errors = 1; 843 int clear_errors = 1;
@@ -840,8 +845,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
840 int sense_valid = 0; 845 int sense_valid = 0;
841 int sense_deferred = 0; 846 int sense_deferred = 0;
842 847
843 scsi_release_buffers(cmd);
844
845 if (result) { 848 if (result) {
846 sense_valid = scsi_command_normalize_sense(cmd, &sshdr); 849 sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
847 if (sense_valid) 850 if (sense_valid)
@@ -864,9 +867,17 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
864 req->sense_len = len; 867 req->sense_len = len;
865 } 868 }
866 } 869 }
867 req->data_len = cmd->resid; 870 if (scsi_bidi_cmnd(cmd)) {
871 /* will also release_buffers */
872 scsi_end_bidi_request(cmd);
873 return;
874 }
875 req->data_len = scsi_get_resid(cmd);
868 } 876 }
869 877
878	BUG_ON(blk_bidi_rq(req)); /* bidi is not supported for !blk_pc_request yet */
879 scsi_release_buffers(cmd);
880
870 /* 881 /*
871 * Next deal with any sectors which we were able to correctly 882 * Next deal with any sectors which we were able to correctly
872 * handle. 883 * handle.
@@ -874,7 +885,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
874 SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, " 885 SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, "
875 "%d bytes done.\n", 886 "%d bytes done.\n",
876 req->nr_sectors, good_bytes)); 887 req->nr_sectors, good_bytes));
877 SCSI_LOG_HLCOMPLETE(1, printk("use_sg is %d\n", cmd->use_sg));
878 888
879 if (clear_errors) 889 if (clear_errors)
880 req->errors = 0; 890 req->errors = 0;
@@ -991,52 +1001,80 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
991 scsi_end_request(cmd, -EIO, this_count, !result); 1001 scsi_end_request(cmd, -EIO, this_count, !result);
992} 1002}
993 1003
994/* 1004static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
995 * Function: scsi_init_io() 1005 gfp_t gfp_mask)
996 *
997 * Purpose: SCSI I/O initialize function.
998 *
999 * Arguments: cmd - Command descriptor we wish to initialize
1000 *
1001 * Returns: 0 on success
1002 * BLKPREP_DEFER if the failure is retryable
1003 */
1004static int scsi_init_io(struct scsi_cmnd *cmd)
1005{ 1006{
1006 struct request *req = cmd->request; 1007 int count;
1007 int count;
1008
1009 /*
1010 * We used to not use scatter-gather for single segment request,
1011 * but now we do (it makes highmem I/O easier to support without
1012 * kmapping pages)
1013 */
1014 cmd->use_sg = req->nr_phys_segments;
1015 1008
1016 /* 1009 /*
1017 * If sg table allocation fails, requeue request later. 1010 * If sg table allocation fails, requeue request later.
1018 */ 1011 */
1019 if (unlikely(scsi_alloc_sgtable(cmd, GFP_ATOMIC))) { 1012 if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
1020 scsi_unprep_request(req); 1013 gfp_mask))) {
1021 return BLKPREP_DEFER; 1014 return BLKPREP_DEFER;
1022 } 1015 }
1023 1016
1024 req->buffer = NULL; 1017 req->buffer = NULL;
1025 if (blk_pc_request(req)) 1018 if (blk_pc_request(req))
1026 cmd->request_bufflen = req->data_len; 1019 sdb->length = req->data_len;
1027 else 1020 else
1028 cmd->request_bufflen = req->nr_sectors << 9; 1021 sdb->length = req->nr_sectors << 9;
1029 1022
1030 /* 1023 /*
1031 * Next, walk the list, and fill in the addresses and sizes of 1024 * Next, walk the list, and fill in the addresses and sizes of
1032 * each segment. 1025 * each segment.
1033 */ 1026 */
1034 count = blk_rq_map_sg(req->q, req, cmd->request_buffer); 1027 count = blk_rq_map_sg(req->q, req, sdb->table.sgl);
1035 BUG_ON(count > cmd->use_sg); 1028 BUG_ON(count > sdb->table.nents);
1036 cmd->use_sg = count; 1029 sdb->table.nents = count;
1037 return BLKPREP_OK; 1030 return BLKPREP_OK;
1038} 1031}
1039 1032
1033/*
1034 * Function: scsi_init_io()
1035 *
1036 * Purpose: SCSI I/O initialize function.
1037 *
1038 * Arguments: cmd - Command descriptor we wish to initialize
1039 *
1040 * Returns: 0 on success
1041 * BLKPREP_DEFER if the failure is retryable
1042 * BLKPREP_KILL if the failure is fatal
1043 */
1044int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
1045{
1046 int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask);
1047 if (error)
1048 goto err_exit;
1049
1050 if (blk_bidi_rq(cmd->request)) {
1051 struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc(
1052 scsi_bidi_sdb_cache, GFP_ATOMIC);
1053 if (!bidi_sdb) {
1054 error = BLKPREP_DEFER;
1055 goto err_exit;
1056 }
1057
1058 cmd->request->next_rq->special = bidi_sdb;
1059 error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb,
1060 GFP_ATOMIC);
1061 if (error)
1062 goto err_exit;
1063 }
1064
1065 return BLKPREP_OK ;
1066
1067err_exit:
1068 scsi_release_buffers(cmd);
1069 if (error == BLKPREP_KILL)
1070 scsi_put_command(cmd);
1071 else /* BLKPREP_DEFER */
1072 scsi_unprep_request(cmd->request);
1073
1074 return error;
1075}
1076EXPORT_SYMBOL(scsi_init_io);
1077
1040static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev, 1078static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev,
1041 struct request *req) 1079 struct request *req)
1042{ 1080{
@@ -1081,16 +1119,14 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
1081 1119
1082 BUG_ON(!req->nr_phys_segments); 1120 BUG_ON(!req->nr_phys_segments);
1083 1121
1084 ret = scsi_init_io(cmd); 1122 ret = scsi_init_io(cmd, GFP_ATOMIC);
1085 if (unlikely(ret)) 1123 if (unlikely(ret))
1086 return ret; 1124 return ret;
1087 } else { 1125 } else {
1088 BUG_ON(req->data_len); 1126 BUG_ON(req->data_len);
1089 BUG_ON(req->data); 1127 BUG_ON(req->data);
1090 1128
1091 cmd->request_bufflen = 0; 1129 memset(&cmd->sdb, 0, sizeof(cmd->sdb));
1092 cmd->request_buffer = NULL;
1093 cmd->use_sg = 0;
1094 req->buffer = NULL; 1130 req->buffer = NULL;
1095 } 1131 }
1096 1132
@@ -1132,7 +1168,7 @@ int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
1132 if (unlikely(!cmd)) 1168 if (unlikely(!cmd))
1133 return BLKPREP_DEFER; 1169 return BLKPREP_DEFER;
1134 1170
1135 return scsi_init_io(cmd); 1171 return scsi_init_io(cmd, GFP_ATOMIC);
1136} 1172}
1137EXPORT_SYMBOL(scsi_setup_fs_cmnd); 1173EXPORT_SYMBOL(scsi_setup_fs_cmnd);
1138 1174
@@ -1542,20 +1578,7 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
1542 * this limit is imposed by hardware restrictions 1578 * this limit is imposed by hardware restrictions
1543 */ 1579 */
1544 blk_queue_max_hw_segments(q, shost->sg_tablesize); 1580 blk_queue_max_hw_segments(q, shost->sg_tablesize);
1545 1581 blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
1546 /*
1547 * In the future, sg chaining support will be mandatory and this
1548 * ifdef can then go away. Right now we don't have all archs
1549 * converted, so better keep it safe.
1550 */
1551#ifdef ARCH_HAS_SG_CHAIN
1552 if (shost->use_sg_chaining)
1553 blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
1554 else
1555 blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
1556#else
1557 blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
1558#endif
1559 1582
1560 blk_queue_max_sectors(q, shost->max_sectors); 1583 blk_queue_max_sectors(q, shost->max_sectors);
1561 blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); 1584 blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
@@ -1654,6 +1677,14 @@ int __init scsi_init_queue(void)
1654 return -ENOMEM; 1677 return -ENOMEM;
1655 } 1678 }
1656 1679
1680 scsi_bidi_sdb_cache = kmem_cache_create("scsi_bidi_sdb",
1681 sizeof(struct scsi_data_buffer),
1682 0, 0, NULL);
1683 if (!scsi_bidi_sdb_cache) {
1684 printk(KERN_ERR "SCSI: can't init scsi bidi sdb cache\n");
1685 goto cleanup_io_context;
1686 }
1687
1657 for (i = 0; i < SG_MEMPOOL_NR; i++) { 1688 for (i = 0; i < SG_MEMPOOL_NR; i++) {
1658 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; 1689 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
1659 int size = sgp->size * sizeof(struct scatterlist); 1690 int size = sgp->size * sizeof(struct scatterlist);
@@ -1663,6 +1694,7 @@ int __init scsi_init_queue(void)
1663 if (!sgp->slab) { 1694 if (!sgp->slab) {
1664 printk(KERN_ERR "SCSI: can't init sg slab %s\n", 1695 printk(KERN_ERR "SCSI: can't init sg slab %s\n",
1665 sgp->name); 1696 sgp->name);
1697 goto cleanup_bidi_sdb;
1666 } 1698 }
1667 1699
1668 sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, 1700 sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
@@ -1670,10 +1702,25 @@ int __init scsi_init_queue(void)
1670 if (!sgp->pool) { 1702 if (!sgp->pool) {
1671 printk(KERN_ERR "SCSI: can't init sg mempool %s\n", 1703 printk(KERN_ERR "SCSI: can't init sg mempool %s\n",
1672 sgp->name); 1704 sgp->name);
1705 goto cleanup_bidi_sdb;
1673 } 1706 }
1674 } 1707 }
1675 1708
1676 return 0; 1709 return 0;
1710
1711cleanup_bidi_sdb:
1712 for (i = 0; i < SG_MEMPOOL_NR; i++) {
1713 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
1714 if (sgp->pool)
1715 mempool_destroy(sgp->pool);
1716 if (sgp->slab)
1717 kmem_cache_destroy(sgp->slab);
1718 }
1719 kmem_cache_destroy(scsi_bidi_sdb_cache);
1720cleanup_io_context:
1721 kmem_cache_destroy(scsi_io_context_cache);
1722
1723 return -ENOMEM;
1677} 1724}
1678 1725
1679void scsi_exit_queue(void) 1726void scsi_exit_queue(void)
@@ -1681,6 +1728,7 @@ void scsi_exit_queue(void)
1681 int i; 1728 int i;
1682 1729
1683 kmem_cache_destroy(scsi_io_context_cache); 1730 kmem_cache_destroy(scsi_io_context_cache);
1731 kmem_cache_destroy(scsi_bidi_sdb_cache);
1684 1732
1685 for (i = 0; i < SG_MEMPOOL_NR; i++) { 1733 for (i = 0; i < SG_MEMPOOL_NR; i++) {
1686 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; 1734 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c
index 01e03f3f6ffa..91630baea532 100644
--- a/drivers/scsi/scsi_tgt_lib.c
+++ b/drivers/scsi/scsi_tgt_lib.c
@@ -331,8 +331,7 @@ static void scsi_tgt_cmd_done(struct scsi_cmnd *cmd)
331 331
332 scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag); 332 scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag);
333 333
334 if (scsi_sglist(cmd)) 334 scsi_release_buffers(cmd);
335 scsi_free_sgtable(cmd);
336 335
337 queue_work(scsi_tgtd, &tcmd->work); 336 queue_work(scsi_tgtd, &tcmd->work);
338} 337}
@@ -353,25 +352,6 @@ static int scsi_tgt_transfer_response(struct scsi_cmnd *cmd)
353 return 0; 352 return 0;
354} 353}
355 354
356static int scsi_tgt_init_cmd(struct scsi_cmnd *cmd, gfp_t gfp_mask)
357{
358 struct request *rq = cmd->request;
359 int count;
360
361 cmd->use_sg = rq->nr_phys_segments;
362 if (scsi_alloc_sgtable(cmd, gfp_mask))
363 return -ENOMEM;
364
365 cmd->request_bufflen = rq->data_len;
366
367 dprintk("cmd %p cnt %d %lu\n", cmd, scsi_sg_count(cmd),
368 rq_data_dir(rq));
369 count = blk_rq_map_sg(rq->q, rq, scsi_sglist(cmd));
370 BUG_ON(count > cmd->use_sg);
371 cmd->use_sg = count;
372 return 0;
373}
374
375/* TODO: test this crap and replace bio_map_user with new interface maybe */ 355/* TODO: test this crap and replace bio_map_user with new interface maybe */
376static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd, 356static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
377 unsigned long uaddr, unsigned int len, int rw) 357 unsigned long uaddr, unsigned int len, int rw)
@@ -397,9 +377,11 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
397 } 377 }
398 378
399 tcmd->bio = rq->bio; 379 tcmd->bio = rq->bio;
400 err = scsi_tgt_init_cmd(cmd, GFP_KERNEL); 380 err = scsi_init_io(cmd, GFP_KERNEL);
401 if (err) 381 if (err) {
382 scsi_release_buffers(cmd);
402 goto unmap_rq; 383 goto unmap_rq;
384 }
403 385
404 return 0; 386 return 0;
405 387
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 24eba3118b5a..51a5557f42dd 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -519,7 +519,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
519 SCpnt->cmnd[4] = (unsigned char) this_count; 519 SCpnt->cmnd[4] = (unsigned char) this_count;
520 SCpnt->cmnd[5] = 0; 520 SCpnt->cmnd[5] = 0;
521 } 521 }
522 SCpnt->request_bufflen = this_count * sdp->sector_size; 522 SCpnt->sdb.length = this_count * sdp->sector_size;
523 523
524 /* 524 /*
525 * We shouldn't disconnect in the middle of a sector, so with a dumb 525 * We shouldn't disconnect in the middle of a sector, so with a dumb
@@ -926,7 +926,7 @@ static struct block_device_operations sd_fops = {
926static int sd_done(struct scsi_cmnd *SCpnt) 926static int sd_done(struct scsi_cmnd *SCpnt)
927{ 927{
928 int result = SCpnt->result; 928 int result = SCpnt->result;
929 unsigned int xfer_size = SCpnt->request_bufflen; 929 unsigned int xfer_size = scsi_bufflen(SCpnt);
930 unsigned int good_bytes = result ? 0 : xfer_size; 930 unsigned int good_bytes = result ? 0 : xfer_size;
931 u64 start_lba = SCpnt->request->sector; 931 u64 start_lba = SCpnt->request->sector;
932 u64 bad_lba; 932 u64 bad_lba;
diff --git a/drivers/scsi/sgiwd93.c b/drivers/scsi/sgiwd93.c
index d4ebe8c67ba9..26cfc56c7091 100644
--- a/drivers/scsi/sgiwd93.c
+++ b/drivers/scsi/sgiwd93.c
@@ -33,10 +33,9 @@
33 33
34struct ip22_hostdata { 34struct ip22_hostdata {
35 struct WD33C93_hostdata wh; 35 struct WD33C93_hostdata wh;
36 struct hpc_data { 36 dma_addr_t dma;
37 dma_addr_t dma; 37 void *cpu;
38 void *cpu; 38 struct device *dev;
39 } hd;
40}; 39};
41 40
42#define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata)) 41#define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata))
@@ -46,6 +45,11 @@ struct hpc_chunk {
46 u32 _padding; /* align to quadword boundary */ 45 u32 _padding; /* align to quadword boundary */
47}; 46};
48 47
48/* space for hpc dma descriptors */
49#define HPC_DMA_SIZE PAGE_SIZE
50
51#define DMA_DIR(d) ((d == DATA_OUT_DIR) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
52
49static irqreturn_t sgiwd93_intr(int irq, void *dev_id) 53static irqreturn_t sgiwd93_intr(int irq, void *dev_id)
50{ 54{
51 struct Scsi_Host * host = dev_id; 55 struct Scsi_Host * host = dev_id;
@@ -59,15 +63,17 @@ static irqreturn_t sgiwd93_intr(int irq, void *dev_id)
59} 63}
60 64
61static inline 65static inline
62void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp) 66void fill_hpc_entries(struct ip22_hostdata *hd, struct scsi_cmnd *cmd, int din)
63{ 67{
64 unsigned long len = cmd->SCp.this_residual; 68 unsigned long len = cmd->SCp.this_residual;
65 void *addr = cmd->SCp.ptr; 69 void *addr = cmd->SCp.ptr;
66 dma_addr_t physaddr; 70 dma_addr_t physaddr;
67 unsigned long count; 71 unsigned long count;
72 struct hpc_chunk *hcp;
68 73
69 physaddr = dma_map_single(NULL, addr, len, cmd->sc_data_direction); 74 physaddr = dma_map_single(hd->dev, addr, len, DMA_DIR(din));
70 cmd->SCp.dma_handle = physaddr; 75 cmd->SCp.dma_handle = physaddr;
76 hcp = hd->cpu;
71 77
72 while (len) { 78 while (len) {
73 /* 79 /*
@@ -89,6 +95,9 @@ void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp)
89 */ 95 */
90 hcp->desc.pbuf = 0; 96 hcp->desc.pbuf = 0;
91 hcp->desc.cntinfo = HPCDMA_EOX; 97 hcp->desc.cntinfo = HPCDMA_EOX;
98 dma_cache_sync(hd->dev, hd->cpu,
99 (unsigned long)(hcp + 1) - (unsigned long)hd->cpu,
100 DMA_TO_DEVICE);
92} 101}
93 102
94static int dma_setup(struct scsi_cmnd *cmd, int datainp) 103static int dma_setup(struct scsi_cmnd *cmd, int datainp)
@@ -96,9 +105,8 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp)
96 struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host); 105 struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host);
97 struct hpc3_scsiregs *hregs = 106 struct hpc3_scsiregs *hregs =
98 (struct hpc3_scsiregs *) cmd->device->host->base; 107 (struct hpc3_scsiregs *) cmd->device->host->base;
99 struct hpc_chunk *hcp = (struct hpc_chunk *) hdata->hd.cpu;
100 108
101 pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hcp); 109 pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hdata->cpu);
102 110
103 hdata->wh.dma_dir = datainp; 111 hdata->wh.dma_dir = datainp;
104 112
@@ -111,12 +119,12 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp)
111 if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0) 119 if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0)
112 return 1; 120 return 1;
113 121
114 fill_hpc_entries(hcp, cmd, datainp); 122 fill_hpc_entries(hdata, cmd, datainp);
115 123
116 pr_debug(" HPCGO\n"); 124 pr_debug(" HPCGO\n");
117 125
118 /* Start up the HPC. */ 126 /* Start up the HPC. */
119 hregs->ndptr = hdata->hd.dma; 127 hregs->ndptr = hdata->dma;
120 if (datainp) 128 if (datainp)
121 hregs->ctrl = HPC3_SCTRL_ACTIVE; 129 hregs->ctrl = HPC3_SCTRL_ACTIVE;
122 else 130 else
@@ -134,6 +142,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt,
134 if (!SCpnt) 142 if (!SCpnt)
135 return; 143 return;
136 144
145 if (SCpnt->SCp.ptr == NULL || SCpnt->SCp.this_residual == 0)
146 return;
147
137 hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base; 148 hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base;
138 149
139 pr_debug("dma_stop: status<%d> ", status); 150 pr_debug("dma_stop: status<%d> ", status);
@@ -145,8 +156,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt,
145 barrier(); 156 barrier();
146 } 157 }
147 hregs->ctrl = 0; 158 hregs->ctrl = 0;
148 dma_unmap_single(NULL, SCpnt->SCp.dma_handle, SCpnt->SCp.this_residual, 159 dma_unmap_single(hdata->dev, SCpnt->SCp.dma_handle,
149 SCpnt->sc_data_direction); 160 SCpnt->SCp.this_residual,
161 DMA_DIR(hdata->wh.dma_dir));
150 162
151 pr_debug("\n"); 163 pr_debug("\n");
152} 164}
@@ -161,22 +173,23 @@ void sgiwd93_reset(unsigned long base)
161} 173}
162EXPORT_SYMBOL_GPL(sgiwd93_reset); 174EXPORT_SYMBOL_GPL(sgiwd93_reset);
163 175
164static inline void init_hpc_chain(struct hpc_data *hd) 176static inline void init_hpc_chain(struct ip22_hostdata *hdata)
165{ 177{
166 struct hpc_chunk *hcp = (struct hpc_chunk *) hd->cpu; 178 struct hpc_chunk *hcp = (struct hpc_chunk *)hdata->cpu;
167 struct hpc_chunk *dma = (struct hpc_chunk *) hd->dma; 179 dma_addr_t dma = hdata->dma;
168 unsigned long start, end; 180 unsigned long start, end;
169 181
170 start = (unsigned long) hcp; 182 start = (unsigned long) hcp;
171 end = start + PAGE_SIZE; 183 end = start + HPC_DMA_SIZE;
172 while (start < end) { 184 while (start < end) {
173 hcp->desc.pnext = (u32) (dma + 1); 185 hcp->desc.pnext = (u32) (dma + sizeof(struct hpc_chunk));
174 hcp->desc.cntinfo = HPCDMA_EOX; 186 hcp->desc.cntinfo = HPCDMA_EOX;
175 hcp++; dma++; 187 hcp++;
188 dma += sizeof(struct hpc_chunk);
176 start += sizeof(struct hpc_chunk); 189 start += sizeof(struct hpc_chunk);
177 }; 190 };
178 hcp--; 191 hcp--;
179 hcp->desc.pnext = hd->dma; 192 hcp->desc.pnext = hdata->dma;
180} 193}
181 194
182static int sgiwd93_bus_reset(struct scsi_cmnd *cmd) 195static int sgiwd93_bus_reset(struct scsi_cmnd *cmd)
@@ -235,16 +248,17 @@ static int __init sgiwd93_probe(struct platform_device *pdev)
235 host->irq = irq; 248 host->irq = irq;
236 249
237 hdata = host_to_hostdata(host); 250 hdata = host_to_hostdata(host);
238 hdata->hd.cpu = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, 251 hdata->dev = &pdev->dev;
239 &hdata->hd.dma, GFP_KERNEL); 252 hdata->cpu = dma_alloc_noncoherent(&pdev->dev, HPC_DMA_SIZE,
240 if (!hdata->hd.cpu) { 253 &hdata->dma, GFP_KERNEL);
254 if (!hdata->cpu) {
241 printk(KERN_WARNING "sgiwd93: Could not allocate memory for " 255 printk(KERN_WARNING "sgiwd93: Could not allocate memory for "
242 "host %d buffer.\n", unit); 256 "host %d buffer.\n", unit);
243 err = -ENOMEM; 257 err = -ENOMEM;
244 goto out_put; 258 goto out_put;
245 } 259 }
246 260
247 init_hpc_chain(&hdata->hd); 261 init_hpc_chain(hdata);
248 262
249 regs.SASR = wdregs + 3; 263 regs.SASR = wdregs + 3;
250 regs.SCMD = wdregs + 7; 264 regs.SCMD = wdregs + 7;
@@ -274,7 +288,7 @@ static int __init sgiwd93_probe(struct platform_device *pdev)
274out_irq: 288out_irq:
275 free_irq(irq, host); 289 free_irq(irq, host);
276out_free: 290out_free:
277 dma_free_coherent(NULL, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); 291 dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma);
278out_put: 292out_put:
279 scsi_host_put(host); 293 scsi_host_put(host);
280out: 294out:
@@ -290,7 +304,7 @@ static void __exit sgiwd93_remove(struct platform_device *pdev)
290 304
291 scsi_remove_host(host); 305 scsi_remove_host(host);
292 free_irq(pd->irq, host); 306 free_irq(pd->irq, host);
293 dma_free_coherent(&pdev->dev, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); 307 dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma);
294 scsi_host_put(host); 308 scsi_host_put(host);
295} 309}
296 310
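
The sgiwd93 conversion keeps the platform device in the host data and hands it to every DMA call instead of passing NULL, and it syncs the descriptor page it now gets from dma_alloc_noncoherent() before the controller reads it. A minimal sketch of that pattern, assuming <linux/dma-mapping.h>; struct example_hostdata and example_map_buffer() are illustrative names, not code from this patch:

/* Sketch: map a data buffer against the device saved at probe time and
 * push the CPU-written descriptors out to memory for the controller. */
struct example_hostdata {
        struct device *dev;     /* &pdev->dev, saved in the probe routine */
        void *cpu;              /* page from dma_alloc_noncoherent() */
        dma_addr_t dma;
};

static dma_addr_t example_map_buffer(struct example_hostdata *hd, void *buf,
                                     size_t len, enum dma_data_direction dir)
{
        dma_addr_t handle = dma_map_single(hd->dev, buf, len, dir);

        /* ... fill the descriptors at hd->cpu with 'handle' ... */
        dma_cache_sync(hd->dev, hd->cpu, PAGE_SIZE, DMA_TO_DEVICE);
        return handle;
}
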
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 1fcee16fa36d..50ba49250203 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -231,7 +231,7 @@ out:
231static int sr_done(struct scsi_cmnd *SCpnt) 231static int sr_done(struct scsi_cmnd *SCpnt)
232{ 232{
233 int result = SCpnt->result; 233 int result = SCpnt->result;
234 int this_count = SCpnt->request_bufflen; 234 int this_count = scsi_bufflen(SCpnt);
235 int good_bytes = (result == 0 ? this_count : 0); 235 int good_bytes = (result == 0 ? this_count : 0);
236 int block_sectors = 0; 236 int block_sectors = 0;
237 long error_sector; 237 long error_sector;
@@ -379,17 +379,18 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
379 } 379 }
380 380
381 { 381 {
382 struct scatterlist *sg = SCpnt->request_buffer; 382 struct scatterlist *sg;
383 int i, size = 0; 383 int i, size = 0, sg_count = scsi_sg_count(SCpnt);
384 for (i = 0; i < SCpnt->use_sg; i++)
385 size += sg[i].length;
386 384
387 if (size != SCpnt->request_bufflen && SCpnt->use_sg) { 385 scsi_for_each_sg(SCpnt, sg, sg_count, i)
386 size += sg->length;
387
388 if (size != scsi_bufflen(SCpnt)) {
388 scmd_printk(KERN_ERR, SCpnt, 389 scmd_printk(KERN_ERR, SCpnt,
389 "mismatch count %d, bytes %d\n", 390 "mismatch count %d, bytes %d\n",
390 size, SCpnt->request_bufflen); 391 size, scsi_bufflen(SCpnt));
391 if (SCpnt->request_bufflen > size) 392 if (scsi_bufflen(SCpnt) > size)
392 SCpnt->request_bufflen = size; 393 SCpnt->sdb.length = size;
393 } 394 }
394 } 395 }
395 396
@@ -397,12 +398,12 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
397 * request doesn't start on hw block boundary, add scatter pads 398 * request doesn't start on hw block boundary, add scatter pads
398 */ 399 */
399 if (((unsigned int)rq->sector % (s_size >> 9)) || 400 if (((unsigned int)rq->sector % (s_size >> 9)) ||
400 (SCpnt->request_bufflen % s_size)) { 401 (scsi_bufflen(SCpnt) % s_size)) {
401 scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n"); 402 scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n");
402 goto out; 403 goto out;
403 } 404 }
404 405
405 this_count = (SCpnt->request_bufflen >> 9) / (s_size >> 9); 406 this_count = (scsi_bufflen(SCpnt) >> 9) / (s_size >> 9);
406 407
407 408
408 SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n", 409 SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n",
@@ -416,7 +417,7 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
416 417
417 if (this_count > 0xffff) { 418 if (this_count > 0xffff) {
418 this_count = 0xffff; 419 this_count = 0xffff;
419 SCpnt->request_bufflen = this_count * s_size; 420 SCpnt->sdb.length = this_count * s_size;
420 } 421 }
421 422
422 SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff; 423 SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff;
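
The sr_prep_fn() hunk replaces the open-coded walk over request_buffer/use_sg with the generic scatterlist iterator. A minimal sketch of that iteration, assuming <scsi/scsi_cmnd.h>; total_sg_bytes() is a hypothetical helper:

/* Sketch: sum the lengths of all scatterlist entries of a command. */
static unsigned int total_sg_bytes(struct scsi_cmnd *cmd)
{
        struct scatterlist *sg;
        unsigned int bytes = 0;
        int i;

        scsi_for_each_sg(cmd, sg, scsi_sg_count(cmd), i)
                bytes += sg->length;

        return bytes;
}
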
diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c
index e3fab3a6aed7..72f6d8015358 100644
--- a/drivers/scsi/stex.c
+++ b/drivers/scsi/stex.c
@@ -1123,7 +1123,6 @@ static struct scsi_host_template driver_template = {
1123 .this_id = -1, 1123 .this_id = -1,
1124 .sg_tablesize = ST_MAX_SG, 1124 .sg_tablesize = ST_MAX_SG,
1125 .cmd_per_lun = ST_CMD_PER_LUN, 1125 .cmd_per_lun = ST_CMD_PER_LUN,
1126 .use_sg_chaining = ENABLE_SG_CHAINING,
1127}; 1126};
1128 1127
1129static int stex_set_dma_mask(struct pci_dev * pdev) 1128static int stex_set_dma_mask(struct pci_dev * pdev)
diff --git a/drivers/scsi/sym53c416.c b/drivers/scsi/sym53c416.c
index 1f6fd1680335..6325901e5093 100644
--- a/drivers/scsi/sym53c416.c
+++ b/drivers/scsi/sym53c416.c
@@ -840,6 +840,5 @@ static struct scsi_host_template driver_template = {
840 .cmd_per_lun = 1, 840 .cmd_per_lun = 1,
841 .unchecked_isa_dma = 1, 841 .unchecked_isa_dma = 1,
842 .use_clustering = ENABLE_CLUSTERING, 842 .use_clustering = ENABLE_CLUSTERING,
843 .use_sg_chaining = ENABLE_SG_CHAINING,
844}; 843};
845#include "scsi_module.c" 844#include "scsi_module.c"
diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c
index 21e926dcdab0..d39107b7669b 100644
--- a/drivers/scsi/sym53c8xx_2/sym_glue.c
+++ b/drivers/scsi/sym53c8xx_2/sym_glue.c
@@ -207,7 +207,7 @@ void sym_set_cam_result_error(struct sym_hcb *np, struct sym_ccb *cp, int resid)
207 /* 207 /*
208 * Bounce back the sense data to user. 208 * Bounce back the sense data to user.
209 */ 209 */
210 memset(&cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); 210 memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
211 memcpy(cmd->sense_buffer, cp->sns_bbuf, 211 memcpy(cmd->sense_buffer, cp->sns_bbuf,
212 min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN)); 212 min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN));
213#if 0 213#if 0
@@ -1681,7 +1681,6 @@ static struct scsi_host_template sym2_template = {
1681 .eh_host_reset_handler = sym53c8xx_eh_host_reset_handler, 1681 .eh_host_reset_handler = sym53c8xx_eh_host_reset_handler,
1682 .this_id = 7, 1682 .this_id = 7,
1683 .use_clustering = ENABLE_CLUSTERING, 1683 .use_clustering = ENABLE_CLUSTERING,
1684 .use_sg_chaining = ENABLE_SG_CHAINING,
1685 .max_sectors = 0xFFFF, 1684 .max_sectors = 0xFFFF,
1686#ifdef SYM_LINUX_PROC_INFO_SUPPORT 1685#ifdef SYM_LINUX_PROC_INFO_SUPPORT
1687 .proc_info = sym53c8xx_proc_info, 1686 .proc_info = sym53c8xx_proc_info,
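
The sym_glue.c hunk drops the address-of operator so that cmd->sense_buffer is used directly as the address of the sense data. A minimal sketch of filling sense data through that pointer, assuming <linux/string.h> and <linux/types.h>; copy_sense_data() is a hypothetical helper, not code from this patch:

/* Sketch: clear the sense buffer, then copy at most its fixed size. */
static void copy_sense_data(struct scsi_cmnd *cmd, const u8 *src, int len)
{
        memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
        memcpy(cmd->sense_buffer, src, min(len, SCSI_SENSE_BUFFERSIZE));
}
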
diff --git a/drivers/scsi/u14-34f.c b/drivers/scsi/u14-34f.c
index 4bc5407f9695..662c00451be4 100644
--- a/drivers/scsi/u14-34f.c
+++ b/drivers/scsi/u14-34f.c
@@ -451,7 +451,6 @@ static struct scsi_host_template driver_template = {
451 .this_id = 7, 451 .this_id = 7,
452 .unchecked_isa_dma = 1, 452 .unchecked_isa_dma = 1,
453 .use_clustering = ENABLE_CLUSTERING, 453 .use_clustering = ENABLE_CLUSTERING,
454 .use_sg_chaining = ENABLE_SG_CHAINING,
455 }; 454 };
456 455
457#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) 456#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/drivers/scsi/ultrastor.c b/drivers/scsi/ultrastor.c
index 75eca6b22db5..f385dce8dfbe 100644
--- a/drivers/scsi/ultrastor.c
+++ b/drivers/scsi/ultrastor.c
@@ -1204,6 +1204,5 @@ static struct scsi_host_template driver_template = {
1204 .cmd_per_lun = ULTRASTOR_MAX_CMDS_PER_LUN, 1204 .cmd_per_lun = ULTRASTOR_MAX_CMDS_PER_LUN,
1205 .unchecked_isa_dma = 1, 1205 .unchecked_isa_dma = 1,
1206 .use_clustering = ENABLE_CLUSTERING, 1206 .use_clustering = ENABLE_CLUSTERING,
1207 .use_sg_chaining = ENABLE_SG_CHAINING,
1208}; 1207};
1209#include "scsi_module.c" 1208#include "scsi_module.c"
diff --git a/drivers/scsi/wd7000.c b/drivers/scsi/wd7000.c
index b4304ae78527..c975c01b3a02 100644
--- a/drivers/scsi/wd7000.c
+++ b/drivers/scsi/wd7000.c
@@ -1671,7 +1671,6 @@ static struct scsi_host_template driver_template = {
1671 .cmd_per_lun = 1, 1671 .cmd_per_lun = 1,
1672 .unchecked_isa_dma = 1, 1672 .unchecked_isa_dma = 1,
1673 .use_clustering = ENABLE_CLUSTERING, 1673 .use_clustering = ENABLE_CLUSTERING,
1674 .use_sg_chaining = ENABLE_SG_CHAINING,
1675}; 1674};
1676 1675
1677#include "scsi_module.c" 1676#include "scsi_module.c"
diff --git a/drivers/usb/storage/isd200.c b/drivers/usb/storage/isd200.c
index 178e8c2a8a2f..0db488624ab1 100644
--- a/drivers/usb/storage/isd200.c
+++ b/drivers/usb/storage/isd200.c
@@ -415,14 +415,14 @@ static void isd200_set_srb(struct isd200_info *info,
415 sg_init_one(&info->sg, buff, bufflen); 415 sg_init_one(&info->sg, buff, bufflen);
416 416
417 srb->sc_data_direction = dir; 417 srb->sc_data_direction = dir;
418 srb->request_buffer = buff ? &info->sg : NULL; 418 srb->sdb.table.sgl = buff ? &info->sg : NULL;
419 srb->request_bufflen = bufflen; 419 srb->sdb.length = bufflen;
420 srb->use_sg = buff ? 1 : 0; 420 srb->sdb.table.nents = buff ? 1 : 0;
421} 421}
422 422
423static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen) 423static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen)
424{ 424{
425 srb->request_bufflen = bufflen; 425 srb->sdb.length = bufflen;
426} 426}
427 427
428 428
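
The isd200 hunk fills the command's data buffer descriptor (srb->sdb) directly in place of the removed request_buffer/use_sg fields, with sg_init_one() building the single-entry table. A minimal sketch of that setup, assuming <scsi/scsi_cmnd.h> and <linux/scatterlist.h>; set_single_buffer() is a hypothetical helper:

/* Sketch: describe one linear buffer as a one-entry scatterlist table. */
static void set_single_buffer(struct scsi_cmnd *srb, struct scatterlist *sg,
                              void *buf, unsigned int buflen)
{
        if (buf)
                sg_init_one(sg, buf, buflen);
        srb->sdb.table.sgl = buf ? sg : NULL;
        srb->sdb.table.nents = buf ? 1 : 0;
        srb->sdb.length = buflen;
}
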
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 899fc13d0612..afcdc69e37d6 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -609,7 +609,7 @@ config SBC_EPX_C3_WATCHDOG
609 609
610config INDYDOG 610config INDYDOG
611 tristate "Indy/I2 Hardware Watchdog" 611 tristate "Indy/I2 Hardware Watchdog"
612 depends on SGI_IP22 612 depends on SGI_HAS_INDYDOG
613 help 613 help
614 Hardware driver for the Indy's/I2's watchdog. This is a 614 Hardware driver for the Indy's/I2's watchdog. This is a
615 watchdog timer that will reboot the machine after a 60 second 615 watchdog timer that will reboot the machine after a 60 second
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 46754553fdcc..ff97ba924333 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,7 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
49 spin_unlock(&ls->ls_recover_list_lock); 49 spin_unlock(&ls->ls_recover_list_lock);
50 50
51 if (!found) 51 if (!found)
52 de = allocate_direntry(ls, len); 52 de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL);
53 return de; 53 return de;
54} 54}
55 55
@@ -62,7 +62,7 @@ void dlm_clear_free_entries(struct dlm_ls *ls)
62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry, 62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
63 list); 63 list);
64 list_del(&de->list); 64 list_del(&de->list);
65 free_direntry(de); 65 kfree(de);
66 } 66 }
67 spin_unlock(&ls->ls_recover_list_lock); 67 spin_unlock(&ls->ls_recover_list_lock);
68} 68}
@@ -171,7 +171,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen
171 } 171 }
172 172
173 list_del(&de->list); 173 list_del(&de->list);
174 free_direntry(de); 174 kfree(de);
175 out: 175 out:
176 write_unlock(&ls->ls_dirtbl[bucket].lock); 176 write_unlock(&ls->ls_dirtbl[bucket].lock);
177} 177}
@@ -302,7 +302,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
302 302
303 write_unlock(&ls->ls_dirtbl[bucket].lock); 303 write_unlock(&ls->ls_dirtbl[bucket].lock);
304 304
305 de = allocate_direntry(ls, namelen); 305 de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL);
306 if (!de) 306 if (!de)
307 return -ENOMEM; 307 return -ENOMEM;
308 308
@@ -313,7 +313,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
313 write_lock(&ls->ls_dirtbl[bucket].lock); 313 write_lock(&ls->ls_dirtbl[bucket].lock);
314 tmp = search_bucket(ls, name, namelen, bucket); 314 tmp = search_bucket(ls, name, namelen, bucket);
315 if (tmp) { 315 if (tmp) {
316 free_direntry(de); 316 kfree(de);
317 de = tmp; 317 de = tmp;
318 } else { 318 } else {
319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); 319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
@@ -329,49 +329,47 @@ int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
329 return get_entry(ls, nodeid, name, namelen, r_nodeid); 329 return get_entry(ls, nodeid, name, namelen, r_nodeid);
330} 330}
331 331
332/* Copy the names of master rsb's into the buffer provided. 332static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len)
333 Only select names whose dir node is the given nodeid. */ 333{
334 struct dlm_rsb *r;
335
336 down_read(&ls->ls_root_sem);
337 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
338 if (len == r->res_length && !memcmp(name, r->res_name, len)) {
339 up_read(&ls->ls_root_sem);
340 return r;
341 }
342 }
343 up_read(&ls->ls_root_sem);
344 return NULL;
345}
346
347/* Find the rsb where we left off (or start again), then send rsb names
348 for rsb's we're master of and whose directory node matches the requesting
349 node. inbuf is the rsb name last sent, inlen is the name's length */
334 350
335void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, 351void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
336 char *outbuf, int outlen, int nodeid) 352 char *outbuf, int outlen, int nodeid)
337{ 353{
338 struct list_head *list; 354 struct list_head *list;
339 struct dlm_rsb *start_r = NULL, *r = NULL; 355 struct dlm_rsb *r;
340 int offset = 0, start_namelen, error, dir_nodeid; 356 int offset = 0, dir_nodeid;
341 char *start_name;
342 uint16_t be_namelen; 357 uint16_t be_namelen;
343 358
344 /*
345 * Find the rsb where we left off (or start again)
346 */
347
348 start_namelen = inlen;
349 start_name = inbuf;
350
351 if (start_namelen > 1) {
352 /*
353 * We could also use a find_rsb_root() function here that
354 * searched the ls_root_list.
355 */
356 error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
357 &start_r);
358 DLM_ASSERT(!error && start_r,
359 printk("error %d\n", error););
360 DLM_ASSERT(!list_empty(&start_r->res_root_list),
361 dlm_print_rsb(start_r););
362 dlm_put_rsb(start_r);
363 }
364
365 /*
366 * Send rsb names for rsb's we're master of and whose directory node
367 * matches the requesting node.
368 */
369
370 down_read(&ls->ls_root_sem); 359 down_read(&ls->ls_root_sem);
371 if (start_r) 360
372 list = start_r->res_root_list.next; 361 if (inlen > 1) {
373 else 362 r = find_rsb_root(ls, inbuf, inlen);
363 if (!r) {
364 inbuf[inlen - 1] = '\0';
365 log_error(ls, "copy_master_names from %d start %d %s",
366 nodeid, inlen, inbuf);
367 goto out;
368 }
369 list = r->res_root_list.next;
370 } else {
374 list = ls->ls_root_list.next; 371 list = ls->ls_root_list.next;
372 }
375 373
376 for (offset = 0; list != &ls->ls_root_list; list = list->next) { 374 for (offset = 0; list != &ls->ls_root_list; list = list->next) {
377 r = list_entry(list, struct dlm_rsb, res_root_list); 375 r = list_entry(list, struct dlm_rsb, res_root_list);
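
Two things change in dir.c above: directory entries are now obtained with a plain kzalloc() sized for the name (the allocate_direntry()/free_direntry() wrappers go away, see memory.c below), and the restart point for copying master names is found by scanning ls_root_list in find_rsb_root() rather than calling dlm_find_rsb(). A minimal sketch of the allocation half, assuming <linux/slab.h>; make_direntry() is a hypothetical helper:

/* Sketch: allocate a zeroed direntry with room for 'namelen' name bytes
 * appended after the structure. */
static struct dlm_direntry *make_direntry(int namelen)
{
        return kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL);
}
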
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d2fc2384c3be..ec61bbaf25df 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -570,5 +570,21 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
570 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0; 570 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
571} 571}
572 572
573int dlm_netlink_init(void);
574void dlm_netlink_exit(void);
575void dlm_timeout_warn(struct dlm_lkb *lkb);
576
577#ifdef CONFIG_DLM_DEBUG
578int dlm_register_debugfs(void);
579void dlm_unregister_debugfs(void);
580int dlm_create_debug_file(struct dlm_ls *ls);
581void dlm_delete_debug_file(struct dlm_ls *ls);
582#else
583static inline int dlm_register_debugfs(void) { return 0; }
584static inline void dlm_unregister_debugfs(void) { }
585static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
586static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
587#endif
588
573#endif /* __DLM_INTERNAL_DOT_H__ */ 589#endif /* __DLM_INTERNAL_DOT_H__ */
574 590
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 3915b8e14146..ff4a198fa677 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -88,7 +88,6 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
88static int receive_extralen(struct dlm_message *ms); 88static int receive_extralen(struct dlm_message *ms);
89static void do_purge(struct dlm_ls *ls, int nodeid, int pid); 89static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
90static void del_timeout(struct dlm_lkb *lkb); 90static void del_timeout(struct dlm_lkb *lkb);
91void dlm_timeout_warn(struct dlm_lkb *lkb);
92 91
93/* 92/*
 94 * Lock compatibility matrix - thanks Steve 93 * Lock compatibility matrix - thanks Steve
@@ -335,7 +334,7 @@ static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
335{ 334{
336 struct dlm_rsb *r; 335 struct dlm_rsb *r;
337 336
338 r = allocate_rsb(ls, len); 337 r = dlm_allocate_rsb(ls, len);
339 if (!r) 338 if (!r)
340 return NULL; 339 return NULL;
341 340
@@ -478,7 +477,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
478 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); 477 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
479 if (!error) { 478 if (!error) {
480 write_unlock(&ls->ls_rsbtbl[bucket].lock); 479 write_unlock(&ls->ls_rsbtbl[bucket].lock);
481 free_rsb(r); 480 dlm_free_rsb(r);
482 r = tmp; 481 r = tmp;
483 goto out; 482 goto out;
484 } 483 }
@@ -490,12 +489,6 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
490 return error; 489 return error;
491} 490}
492 491
493int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
494 unsigned int flags, struct dlm_rsb **r_ret)
495{
496 return find_rsb(ls, name, namelen, flags, r_ret);
497}
498
499/* This is only called to add a reference when the code already holds 492/* This is only called to add a reference when the code already holds
500 a valid reference to the rsb, so there's no need for locking. */ 493 a valid reference to the rsb, so there's no need for locking. */
501 494
@@ -519,7 +512,7 @@ static void toss_rsb(struct kref *kref)
519 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); 512 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
520 r->res_toss_time = jiffies; 513 r->res_toss_time = jiffies;
521 if (r->res_lvbptr) { 514 if (r->res_lvbptr) {
522 free_lvb(r->res_lvbptr); 515 dlm_free_lvb(r->res_lvbptr);
523 r->res_lvbptr = NULL; 516 r->res_lvbptr = NULL;
524 } 517 }
525} 518}
@@ -589,7 +582,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
589 uint32_t lkid = 0; 582 uint32_t lkid = 0;
590 uint16_t bucket; 583 uint16_t bucket;
591 584
592 lkb = allocate_lkb(ls); 585 lkb = dlm_allocate_lkb(ls);
593 if (!lkb) 586 if (!lkb)
594 return -ENOMEM; 587 return -ENOMEM;
595 588
@@ -683,8 +676,8 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
683 676
684 /* for local/process lkbs, lvbptr points to caller's lksb */ 677 /* for local/process lkbs, lvbptr points to caller's lksb */
685 if (lkb->lkb_lvbptr && is_master_copy(lkb)) 678 if (lkb->lkb_lvbptr && is_master_copy(lkb))
686 free_lvb(lkb->lkb_lvbptr); 679 dlm_free_lvb(lkb->lkb_lvbptr);
687 free_lkb(lkb); 680 dlm_free_lkb(lkb);
688 return 1; 681 return 1;
689 } else { 682 } else {
690 write_unlock(&ls->ls_lkbtbl[bucket].lock); 683 write_unlock(&ls->ls_lkbtbl[bucket].lock);
@@ -988,7 +981,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
988 981
989 if (is_master(r)) 982 if (is_master(r))
990 dir_remove(r); 983 dir_remove(r);
991 free_rsb(r); 984 dlm_free_rsb(r);
992 count++; 985 count++;
993 } else { 986 } else {
994 write_unlock(&ls->ls_rsbtbl[b].lock); 987 write_unlock(&ls->ls_rsbtbl[b].lock);
@@ -1171,7 +1164,7 @@ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1171 return; 1164 return;
1172 1165
1173 if (!r->res_lvbptr) 1166 if (!r->res_lvbptr)
1174 r->res_lvbptr = allocate_lvb(r->res_ls); 1167 r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
1175 1168
1176 if (!r->res_lvbptr) 1169 if (!r->res_lvbptr)
1177 return; 1170 return;
@@ -1203,7 +1196,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1203 return; 1196 return;
1204 1197
1205 if (!r->res_lvbptr) 1198 if (!r->res_lvbptr)
1206 r->res_lvbptr = allocate_lvb(r->res_ls); 1199 r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
1207 1200
1208 if (!r->res_lvbptr) 1201 if (!r->res_lvbptr)
1209 return; 1202 return;
@@ -1852,7 +1845,7 @@ static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1852static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) 1845static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1853{ 1846{
1854 struct dlm_ls *ls = r->res_ls; 1847 struct dlm_ls *ls = r->res_ls;
1855 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); 1848 int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1856 1849
1857 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { 1850 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1858 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); 1851 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
@@ -1886,7 +1879,7 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1886 return 1; 1879 return 1;
1887 } 1880 }
1888 1881
1889 for (;;) { 1882 for (i = 0; i < 2; i++) {
1890 /* It's possible for dlm_scand to remove an old rsb for 1883 /* It's possible for dlm_scand to remove an old rsb for
1891 this same resource from the toss list, us to create 1884 this same resource from the toss list, us to create
1892 a new one, look up the master locally, and find it 1885 a new one, look up the master locally, and find it
@@ -1900,6 +1893,8 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1900 log_debug(ls, "dir_lookup error %d %s", error, r->res_name); 1893 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1901 schedule(); 1894 schedule();
1902 } 1895 }
1896 if (error && error != -EEXIST)
1897 return error;
1903 1898
1904 if (ret_nodeid == our_nodeid) { 1899 if (ret_nodeid == our_nodeid) {
1905 r->res_first_lkid = 0; 1900 r->res_first_lkid = 0;
@@ -1941,8 +1936,11 @@ static void confirm_master(struct dlm_rsb *r, int error)
1941 break; 1936 break;
1942 1937
1943 case -EAGAIN: 1938 case -EAGAIN:
1944 /* the remote master didn't queue our NOQUEUE request; 1939 case -EBADR:
1945 make a waiting lkb the first_lkid */ 1940 case -ENOTBLK:
1941 /* the remote request failed and won't be retried (it was
1942 a NOQUEUE, or has been canceled/unlocked); make a waiting
1943 lkb the first_lkid */
1946 1944
1947 r->res_first_lkid = 0; 1945 r->res_first_lkid = 0;
1948 1946
@@ -2108,17 +2106,18 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
2108 /* an lkb may be waiting for an rsb lookup to complete where the 2106 /* an lkb may be waiting for an rsb lookup to complete where the
2109 lookup was initiated by another lock */ 2107 lookup was initiated by another lock */
2110 2108
2111 if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { 2109 if (!list_empty(&lkb->lkb_rsb_lookup)) {
2112 if (!list_empty(&lkb->lkb_rsb_lookup)) { 2110 if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
2113 log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id); 2111 log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
2114 list_del_init(&lkb->lkb_rsb_lookup); 2112 list_del_init(&lkb->lkb_rsb_lookup);
2115 queue_cast(lkb->lkb_resource, lkb, 2113 queue_cast(lkb->lkb_resource, lkb,
2116 args->flags & DLM_LKF_CANCEL ? 2114 args->flags & DLM_LKF_CANCEL ?
2117 -DLM_ECANCEL : -DLM_EUNLOCK); 2115 -DLM_ECANCEL : -DLM_EUNLOCK);
2118 unhold_lkb(lkb); /* undoes create_lkb() */ 2116 unhold_lkb(lkb); /* undoes create_lkb() */
2119 rv = -EBUSY;
2120 goto out;
2121 } 2117 }
2118 /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
2119 rv = -EBUSY;
2120 goto out;
2122 } 2121 }
2123 2122
2124 /* cancel not allowed with another cancel/unlock in progress */ 2123 /* cancel not allowed with another cancel/unlock in progress */
@@ -2986,7 +2985,7 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2986 2985
2987 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 2986 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2988 if (!lkb->lkb_lvbptr) 2987 if (!lkb->lkb_lvbptr)
2989 lkb->lkb_lvbptr = allocate_lvb(ls); 2988 lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
2990 if (!lkb->lkb_lvbptr) 2989 if (!lkb->lkb_lvbptr)
2991 return -ENOMEM; 2990 return -ENOMEM;
2992 len = receive_extralen(ms); 2991 len = receive_extralen(ms);
@@ -3006,11 +3005,9 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3006 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST); 3005 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
3007 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP); 3006 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
3008 3007
3009 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
3010
3011 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 3008 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3012 /* lkb was just created so there won't be an lvb yet */ 3009 /* lkb was just created so there won't be an lvb yet */
3013 lkb->lkb_lvbptr = allocate_lvb(ls); 3010 lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
3014 if (!lkb->lkb_lvbptr) 3011 if (!lkb->lkb_lvbptr)
3015 return -ENOMEM; 3012 return -ENOMEM;
3016 } 3013 }
@@ -3021,16 +3018,6 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3021static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, 3018static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3022 struct dlm_message *ms) 3019 struct dlm_message *ms)
3023{ 3020{
3024 if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
3025 log_error(ls, "convert_args nodeid %d %d lkid %x %x",
3026 lkb->lkb_nodeid, ms->m_header.h_nodeid,
3027 lkb->lkb_id, lkb->lkb_remid);
3028 return -EINVAL;
3029 }
3030
3031 if (!is_master_copy(lkb))
3032 return -EINVAL;
3033
3034 if (lkb->lkb_status != DLM_LKSTS_GRANTED) 3021 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
3035 return -EBUSY; 3022 return -EBUSY;
3036 3023
@@ -3046,8 +3033,6 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3046static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, 3033static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3047 struct dlm_message *ms) 3034 struct dlm_message *ms)
3048{ 3035{
3049 if (!is_master_copy(lkb))
3050 return -EINVAL;
3051 if (receive_lvb(ls, lkb, ms)) 3036 if (receive_lvb(ls, lkb, ms))
3052 return -ENOMEM; 3037 return -ENOMEM;
3053 return 0; 3038 return 0;
@@ -3063,6 +3048,50 @@ static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
3063 lkb->lkb_remid = ms->m_lkid; 3048 lkb->lkb_remid = ms->m_lkid;
3064} 3049}
3065 3050
3051/* This is called after the rsb is locked so that we can safely inspect
3052 fields in the lkb. */
3053
3054static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
3055{
3056 int from = ms->m_header.h_nodeid;
3057 int error = 0;
3058
3059 switch (ms->m_type) {
3060 case DLM_MSG_CONVERT:
3061 case DLM_MSG_UNLOCK:
3062 case DLM_MSG_CANCEL:
3063 if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
3064 error = -EINVAL;
3065 break;
3066
3067 case DLM_MSG_CONVERT_REPLY:
3068 case DLM_MSG_UNLOCK_REPLY:
3069 case DLM_MSG_CANCEL_REPLY:
3070 case DLM_MSG_GRANT:
3071 case DLM_MSG_BAST:
3072 if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
3073 error = -EINVAL;
3074 break;
3075
3076 case DLM_MSG_REQUEST_REPLY:
3077 if (!is_process_copy(lkb))
3078 error = -EINVAL;
3079 else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
3080 error = -EINVAL;
3081 break;
3082
3083 default:
3084 error = -EINVAL;
3085 }
3086
3087 if (error)
3088 log_error(lkb->lkb_resource->res_ls,
3089 "ignore invalid message %d from %d %x %x %x %d",
3090 ms->m_type, from, lkb->lkb_id, lkb->lkb_remid,
3091 lkb->lkb_flags, lkb->lkb_nodeid);
3092 return error;
3093}
3094
3066static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) 3095static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
3067{ 3096{
3068 struct dlm_lkb *lkb; 3097 struct dlm_lkb *lkb;
@@ -3124,17 +3153,21 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
3124 hold_rsb(r); 3153 hold_rsb(r);
3125 lock_rsb(r); 3154 lock_rsb(r);
3126 3155
3156 error = validate_message(lkb, ms);
3157 if (error)
3158 goto out;
3159
3127 receive_flags(lkb, ms); 3160 receive_flags(lkb, ms);
3128 error = receive_convert_args(ls, lkb, ms); 3161 error = receive_convert_args(ls, lkb, ms);
3129 if (error) 3162 if (error)
3130 goto out; 3163 goto out_reply;
3131 reply = !down_conversion(lkb); 3164 reply = !down_conversion(lkb);
3132 3165
3133 error = do_convert(r, lkb); 3166 error = do_convert(r, lkb);
3134 out: 3167 out_reply:
3135 if (reply) 3168 if (reply)
3136 send_convert_reply(r, lkb, error); 3169 send_convert_reply(r, lkb, error);
3137 3170 out:
3138 unlock_rsb(r); 3171 unlock_rsb(r);
3139 put_rsb(r); 3172 put_rsb(r);
3140 dlm_put_lkb(lkb); 3173 dlm_put_lkb(lkb);
@@ -3160,15 +3193,19 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
3160 hold_rsb(r); 3193 hold_rsb(r);
3161 lock_rsb(r); 3194 lock_rsb(r);
3162 3195
3196 error = validate_message(lkb, ms);
3197 if (error)
3198 goto out;
3199
3163 receive_flags(lkb, ms); 3200 receive_flags(lkb, ms);
3164 error = receive_unlock_args(ls, lkb, ms); 3201 error = receive_unlock_args(ls, lkb, ms);
3165 if (error) 3202 if (error)
3166 goto out; 3203 goto out_reply;
3167 3204
3168 error = do_unlock(r, lkb); 3205 error = do_unlock(r, lkb);
3169 out: 3206 out_reply:
3170 send_unlock_reply(r, lkb, error); 3207 send_unlock_reply(r, lkb, error);
3171 3208 out:
3172 unlock_rsb(r); 3209 unlock_rsb(r);
3173 put_rsb(r); 3210 put_rsb(r);
3174 dlm_put_lkb(lkb); 3211 dlm_put_lkb(lkb);
@@ -3196,9 +3233,13 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
3196 hold_rsb(r); 3233 hold_rsb(r);
3197 lock_rsb(r); 3234 lock_rsb(r);
3198 3235
3236 error = validate_message(lkb, ms);
3237 if (error)
3238 goto out;
3239
3199 error = do_cancel(r, lkb); 3240 error = do_cancel(r, lkb);
3200 send_cancel_reply(r, lkb, error); 3241 send_cancel_reply(r, lkb, error);
3201 3242 out:
3202 unlock_rsb(r); 3243 unlock_rsb(r);
3203 put_rsb(r); 3244 put_rsb(r);
3204 dlm_put_lkb(lkb); 3245 dlm_put_lkb(lkb);
@@ -3217,22 +3258,26 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
3217 3258
3218 error = find_lkb(ls, ms->m_remid, &lkb); 3259 error = find_lkb(ls, ms->m_remid, &lkb);
3219 if (error) { 3260 if (error) {
3220 log_error(ls, "receive_grant no lkb"); 3261 log_debug(ls, "receive_grant from %d no lkb %x",
3262 ms->m_header.h_nodeid, ms->m_remid);
3221 return; 3263 return;
3222 } 3264 }
3223 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3224 3265
3225 r = lkb->lkb_resource; 3266 r = lkb->lkb_resource;
3226 3267
3227 hold_rsb(r); 3268 hold_rsb(r);
3228 lock_rsb(r); 3269 lock_rsb(r);
3229 3270
3271 error = validate_message(lkb, ms);
3272 if (error)
3273 goto out;
3274
3230 receive_flags_reply(lkb, ms); 3275 receive_flags_reply(lkb, ms);
3231 if (is_altmode(lkb)) 3276 if (is_altmode(lkb))
3232 munge_altmode(lkb, ms); 3277 munge_altmode(lkb, ms);
3233 grant_lock_pc(r, lkb, ms); 3278 grant_lock_pc(r, lkb, ms);
3234 queue_cast(r, lkb, 0); 3279 queue_cast(r, lkb, 0);
3235 3280 out:
3236 unlock_rsb(r); 3281 unlock_rsb(r);
3237 put_rsb(r); 3282 put_rsb(r);
3238 dlm_put_lkb(lkb); 3283 dlm_put_lkb(lkb);
@@ -3246,18 +3291,22 @@ static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
3246 3291
3247 error = find_lkb(ls, ms->m_remid, &lkb); 3292 error = find_lkb(ls, ms->m_remid, &lkb);
3248 if (error) { 3293 if (error) {
3249 log_error(ls, "receive_bast no lkb"); 3294 log_debug(ls, "receive_bast from %d no lkb %x",
3295 ms->m_header.h_nodeid, ms->m_remid);
3250 return; 3296 return;
3251 } 3297 }
3252 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3253 3298
3254 r = lkb->lkb_resource; 3299 r = lkb->lkb_resource;
3255 3300
3256 hold_rsb(r); 3301 hold_rsb(r);
3257 lock_rsb(r); 3302 lock_rsb(r);
3258 3303
3259 queue_bast(r, lkb, ms->m_bastmode); 3304 error = validate_message(lkb, ms);
3305 if (error)
3306 goto out;
3260 3307
3308 queue_bast(r, lkb, ms->m_bastmode);
3309 out:
3261 unlock_rsb(r); 3310 unlock_rsb(r);
3262 put_rsb(r); 3311 put_rsb(r);
3263 dlm_put_lkb(lkb); 3312 dlm_put_lkb(lkb);
@@ -3323,15 +3372,19 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
3323 3372
3324 error = find_lkb(ls, ms->m_remid, &lkb); 3373 error = find_lkb(ls, ms->m_remid, &lkb);
3325 if (error) { 3374 if (error) {
3326 log_error(ls, "receive_request_reply no lkb"); 3375 log_debug(ls, "receive_request_reply from %d no lkb %x",
3376 ms->m_header.h_nodeid, ms->m_remid);
3327 return; 3377 return;
3328 } 3378 }
3329 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3330 3379
3331 r = lkb->lkb_resource; 3380 r = lkb->lkb_resource;
3332 hold_rsb(r); 3381 hold_rsb(r);
3333 lock_rsb(r); 3382 lock_rsb(r);
3334 3383
3384 error = validate_message(lkb, ms);
3385 if (error)
3386 goto out;
3387
3335 mstype = lkb->lkb_wait_type; 3388 mstype = lkb->lkb_wait_type;
3336 error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); 3389 error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
3337 if (error) 3390 if (error)
@@ -3383,6 +3436,7 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
3383 if (is_overlap(lkb)) { 3436 if (is_overlap(lkb)) {
3384 /* we'll ignore error in cancel/unlock reply */ 3437 /* we'll ignore error in cancel/unlock reply */
3385 queue_cast_overlap(r, lkb); 3438 queue_cast_overlap(r, lkb);
3439 confirm_master(r, result);
3386 unhold_lkb(lkb); /* undoes create_lkb() */ 3440 unhold_lkb(lkb); /* undoes create_lkb() */
3387 } else 3441 } else
3388 _request_lock(r, lkb); 3442 _request_lock(r, lkb);
@@ -3463,6 +3517,10 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3463 hold_rsb(r); 3517 hold_rsb(r);
3464 lock_rsb(r); 3518 lock_rsb(r);
3465 3519
3520 error = validate_message(lkb, ms);
3521 if (error)
3522 goto out;
3523
3466 /* stub reply can happen with waiters_mutex held */ 3524 /* stub reply can happen with waiters_mutex held */
3467 error = remove_from_waiters_ms(lkb, ms); 3525 error = remove_from_waiters_ms(lkb, ms);
3468 if (error) 3526 if (error)
@@ -3481,10 +3539,10 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
3481 3539
3482 error = find_lkb(ls, ms->m_remid, &lkb); 3540 error = find_lkb(ls, ms->m_remid, &lkb);
3483 if (error) { 3541 if (error) {
3484 log_error(ls, "receive_convert_reply no lkb"); 3542 log_debug(ls, "receive_convert_reply from %d no lkb %x",
3543 ms->m_header.h_nodeid, ms->m_remid);
3485 return; 3544 return;
3486 } 3545 }
3487 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3488 3546
3489 _receive_convert_reply(lkb, ms); 3547 _receive_convert_reply(lkb, ms);
3490 dlm_put_lkb(lkb); 3548 dlm_put_lkb(lkb);
@@ -3498,6 +3556,10 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3498 hold_rsb(r); 3556 hold_rsb(r);
3499 lock_rsb(r); 3557 lock_rsb(r);
3500 3558
3559 error = validate_message(lkb, ms);
3560 if (error)
3561 goto out;
3562
3501 /* stub reply can happen with waiters_mutex held */ 3563 /* stub reply can happen with waiters_mutex held */
3502 error = remove_from_waiters_ms(lkb, ms); 3564 error = remove_from_waiters_ms(lkb, ms);
3503 if (error) 3565 if (error)
@@ -3529,10 +3591,10 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
3529 3591
3530 error = find_lkb(ls, ms->m_remid, &lkb); 3592 error = find_lkb(ls, ms->m_remid, &lkb);
3531 if (error) { 3593 if (error) {
3532 log_error(ls, "receive_unlock_reply no lkb"); 3594 log_debug(ls, "receive_unlock_reply from %d no lkb %x",
3595 ms->m_header.h_nodeid, ms->m_remid);
3533 return; 3596 return;
3534 } 3597 }
3535 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3536 3598
3537 _receive_unlock_reply(lkb, ms); 3599 _receive_unlock_reply(lkb, ms);
3538 dlm_put_lkb(lkb); 3600 dlm_put_lkb(lkb);
@@ -3546,6 +3608,10 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3546 hold_rsb(r); 3608 hold_rsb(r);
3547 lock_rsb(r); 3609 lock_rsb(r);
3548 3610
3611 error = validate_message(lkb, ms);
3612 if (error)
3613 goto out;
3614
3549 /* stub reply can happen with waiters_mutex held */ 3615 /* stub reply can happen with waiters_mutex held */
3550 error = remove_from_waiters_ms(lkb, ms); 3616 error = remove_from_waiters_ms(lkb, ms);
3551 if (error) 3617 if (error)
@@ -3577,10 +3643,10 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
3577 3643
3578 error = find_lkb(ls, ms->m_remid, &lkb); 3644 error = find_lkb(ls, ms->m_remid, &lkb);
3579 if (error) { 3645 if (error) {
3580 log_error(ls, "receive_cancel_reply no lkb"); 3646 log_debug(ls, "receive_cancel_reply from %d no lkb %x",
3647 ms->m_header.h_nodeid, ms->m_remid);
3581 return; 3648 return;
3582 } 3649 }
3583 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3584 3650
3585 _receive_cancel_reply(lkb, ms); 3651 _receive_cancel_reply(lkb, ms);
3586 dlm_put_lkb(lkb); 3652 dlm_put_lkb(lkb);
@@ -3640,6 +3706,13 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
3640 3706
3641static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) 3707static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
3642{ 3708{
3709 if (!dlm_is_member(ls, ms->m_header.h_nodeid)) {
3710 log_debug(ls, "ignore non-member message %d from %d %x %x %d",
3711 ms->m_type, ms->m_header.h_nodeid, ms->m_lkid,
3712 ms->m_remid, ms->m_result);
3713 return;
3714 }
3715
3643 switch (ms->m_type) { 3716 switch (ms->m_type) {
3644 3717
3645 /* messages sent to a master node */ 3718 /* messages sent to a master node */
@@ -3778,8 +3851,9 @@ void dlm_receive_buffer(struct dlm_header *hd, int nodeid)
3778 3851
3779 ls = dlm_find_lockspace_global(hd->h_lockspace); 3852 ls = dlm_find_lockspace_global(hd->h_lockspace);
3780 if (!ls) { 3853 if (!ls) {
3781 log_print("invalid h_lockspace %x from %d cmd %d type %d", 3854 if (dlm_config.ci_log_debug)
3782 hd->h_lockspace, nodeid, hd->h_cmd, type); 3855 log_print("invalid lockspace %x from %d cmd %d type %d",
3856 hd->h_lockspace, nodeid, hd->h_cmd, type);
3783 3857
3784 if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) 3858 if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
3785 dlm_send_ls_not_ready(nodeid, rc); 3859 dlm_send_ls_not_ready(nodeid, rc);
@@ -3806,6 +3880,7 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3806 ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; 3880 ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
3807 ls->ls_stub_ms.m_result = -EINPROGRESS; 3881 ls->ls_stub_ms.m_result = -EINPROGRESS;
3808 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3882 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3883 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
3809 _receive_convert_reply(lkb, &ls->ls_stub_ms); 3884 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3810 3885
3811 /* Same special case as in receive_rcom_lock_args() */ 3886 /* Same special case as in receive_rcom_lock_args() */
@@ -3847,6 +3922,7 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3847void dlm_recover_waiters_pre(struct dlm_ls *ls) 3922void dlm_recover_waiters_pre(struct dlm_ls *ls)
3848{ 3923{
3849 struct dlm_lkb *lkb, *safe; 3924 struct dlm_lkb *lkb, *safe;
3925 int wait_type, stub_unlock_result, stub_cancel_result;
3850 3926
3851 mutex_lock(&ls->ls_waiters_mutex); 3927 mutex_lock(&ls->ls_waiters_mutex);
3852 3928
@@ -3865,7 +3941,33 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3865 if (!waiter_needs_recovery(ls, lkb)) 3941 if (!waiter_needs_recovery(ls, lkb))
3866 continue; 3942 continue;
3867 3943
3868 switch (lkb->lkb_wait_type) { 3944 wait_type = lkb->lkb_wait_type;
3945 stub_unlock_result = -DLM_EUNLOCK;
3946 stub_cancel_result = -DLM_ECANCEL;
3947
3948 /* Main reply may have been received leaving a zero wait_type,
3949 but a reply for the overlapping op may not have been
3950 received. In that case we need to fake the appropriate
3951 reply for the overlap op. */
3952
3953 if (!wait_type) {
3954 if (is_overlap_cancel(lkb)) {
3955 wait_type = DLM_MSG_CANCEL;
3956 if (lkb->lkb_grmode == DLM_LOCK_IV)
3957 stub_cancel_result = 0;
3958 }
3959 if (is_overlap_unlock(lkb)) {
3960 wait_type = DLM_MSG_UNLOCK;
3961 if (lkb->lkb_grmode == DLM_LOCK_IV)
3962 stub_unlock_result = -ENOENT;
3963 }
3964
3965 log_debug(ls, "rwpre overlap %x %x %d %d %d",
3966 lkb->lkb_id, lkb->lkb_flags, wait_type,
3967 stub_cancel_result, stub_unlock_result);
3968 }
3969
3970 switch (wait_type) {
3869 3971
3870 case DLM_MSG_REQUEST: 3972 case DLM_MSG_REQUEST:
3871 lkb->lkb_flags |= DLM_IFL_RESEND; 3973 lkb->lkb_flags |= DLM_IFL_RESEND;
@@ -3878,8 +3980,9 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3878 case DLM_MSG_UNLOCK: 3980 case DLM_MSG_UNLOCK:
3879 hold_lkb(lkb); 3981 hold_lkb(lkb);
3880 ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY; 3982 ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
3881 ls->ls_stub_ms.m_result = -DLM_EUNLOCK; 3983 ls->ls_stub_ms.m_result = stub_unlock_result;
3882 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3984 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3985 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
3883 _receive_unlock_reply(lkb, &ls->ls_stub_ms); 3986 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3884 dlm_put_lkb(lkb); 3987 dlm_put_lkb(lkb);
3885 break; 3988 break;
@@ -3887,15 +3990,16 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3887 case DLM_MSG_CANCEL: 3990 case DLM_MSG_CANCEL:
3888 hold_lkb(lkb); 3991 hold_lkb(lkb);
3889 ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY; 3992 ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
3890 ls->ls_stub_ms.m_result = -DLM_ECANCEL; 3993 ls->ls_stub_ms.m_result = stub_cancel_result;
3891 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3994 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3995 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
3892 _receive_cancel_reply(lkb, &ls->ls_stub_ms); 3996 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3893 dlm_put_lkb(lkb); 3997 dlm_put_lkb(lkb);
3894 break; 3998 break;
3895 3999
3896 default: 4000 default:
3897 log_error(ls, "invalid lkb wait_type %d", 4001 log_error(ls, "invalid lkb wait_type %d %d",
3898 lkb->lkb_wait_type); 4002 lkb->lkb_wait_type, wait_type);
3899 } 4003 }
3900 schedule(); 4004 schedule();
3901 } 4005 }
@@ -4184,7 +4288,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
4184 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP); 4288 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
4185 4289
4186 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 4290 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
4187 lkb->lkb_lvbptr = allocate_lvb(ls); 4291 lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
4188 if (!lkb->lkb_lvbptr) 4292 if (!lkb->lkb_lvbptr)
4189 return -ENOMEM; 4293 return -ENOMEM;
4190 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - 4294 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
@@ -4259,7 +4363,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
4259 put_rsb(r); 4363 put_rsb(r);
4260 out: 4364 out:
4261 if (error) 4365 if (error)
4262 log_print("recover_master_copy %d %x", error, rl->rl_lkid); 4366 log_debug(ls, "recover_master_copy %d %x", error, rl->rl_lkid);
4263 rl->rl_result = error; 4367 rl->rl_result = error;
4264 return error; 4368 return error;
4265} 4369}
@@ -4342,7 +4446,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4342 } 4446 }
4343 } 4447 }
4344 4448
4345 /* After ua is attached to lkb it will be freed by free_lkb(). 4449 /* After ua is attached to lkb it will be freed by dlm_free_lkb().
4346 When DLM_IFL_USER is set, the dlm knows that this is a userspace 4450 When DLM_IFL_USER is set, the dlm knows that this is a userspace
4347 lock and that lkb_astparam is the dlm_user_args structure. */ 4451 lock and that lkb_astparam is the dlm_user_args structure. */
4348 4452
@@ -4679,6 +4783,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4679 } 4783 }
4680 4784
4681 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { 4785 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4786 lkb->lkb_ast_type = 0;
4682 list_del(&lkb->lkb_astqueue); 4787 list_del(&lkb->lkb_astqueue);
4683 dlm_put_lkb(lkb); 4788 dlm_put_lkb(lkb);
4684 } 4789 }
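
Most of the receive paths in lock.c above gain the same guard: once the rsb is locked, validate_message() checks that the message type, the lkb's master/process-copy state and the sending nodeid are consistent before the message is acted on, and the stub replies built during recovery now carry a nodeid so they pass that check. A condensed sketch of the pattern, using helpers that appear in the hunks above; receive_example() itself is hypothetical:

/* Sketch: lock the rsb, reject messages from the wrong node or of the
 * wrong kind for this lkb, then do the real work. */
static void receive_example(struct dlm_rsb *r, struct dlm_lkb *lkb,
                            struct dlm_message *ms)
{
        int error;

        hold_rsb(r);
        lock_rsb(r);

        error = validate_message(lkb, ms);
        if (error)
                goto out;

        /* ... act on the message and send a reply if needed ... */
 out:
        unlock_rsb(r);
        put_rsb(r);
}
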
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index ada04680a1e5..27b6ed302911 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -19,8 +19,6 @@ void dlm_print_lkb(struct dlm_lkb *lkb);
19void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); 19void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
20void dlm_receive_buffer(struct dlm_header *hd, int nodeid); 20void dlm_receive_buffer(struct dlm_header *hd, int nodeid);
21int dlm_modes_compat(int mode1, int mode2); 21int dlm_modes_compat(int mode1, int mode2);
22int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
23 unsigned int flags, struct dlm_rsb **r_ret);
24void dlm_put_rsb(struct dlm_rsb *r); 22void dlm_put_rsb(struct dlm_rsb *r);
25void dlm_hold_rsb(struct dlm_rsb *r); 23void dlm_hold_rsb(struct dlm_rsb *r);
26int dlm_put_lkb(struct dlm_lkb *lkb); 24int dlm_put_lkb(struct dlm_lkb *lkb);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 5c108c49cb8c..b180fdc51085 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -24,14 +24,6 @@
24#include "recover.h" 24#include "recover.h"
25#include "requestqueue.h" 25#include "requestqueue.h"
26 26
27#ifdef CONFIG_DLM_DEBUG
28int dlm_create_debug_file(struct dlm_ls *ls);
29void dlm_delete_debug_file(struct dlm_ls *ls);
30#else
31static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
32static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
33#endif
34
35static int ls_count; 27static int ls_count;
36static struct mutex ls_lock; 28static struct mutex ls_lock;
37static struct list_head lslist; 29static struct list_head lslist;
@@ -684,9 +676,9 @@ static int release_lockspace(struct dlm_ls *ls, int force)
684 dlm_del_ast(lkb); 676 dlm_del_ast(lkb);
685 677
686 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) 678 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
687 free_lvb(lkb->lkb_lvbptr); 679 dlm_free_lvb(lkb->lkb_lvbptr);
688 680
689 free_lkb(lkb); 681 dlm_free_lkb(lkb);
690 } 682 }
691 } 683 }
692 dlm_astd_resume(); 684 dlm_astd_resume();
@@ -704,7 +696,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
704 res_hashchain); 696 res_hashchain);
705 697
706 list_del(&rsb->res_hashchain); 698 list_del(&rsb->res_hashchain);
707 free_rsb(rsb); 699 dlm_free_rsb(rsb);
708 } 700 }
709 701
710 head = &ls->ls_rsbtbl[i].toss; 702 head = &ls->ls_rsbtbl[i].toss;
@@ -712,7 +704,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
712 rsb = list_entry(head->next, struct dlm_rsb, 704 rsb = list_entry(head->next, struct dlm_rsb,
713 res_hashchain); 705 res_hashchain);
714 list_del(&rsb->res_hashchain); 706 list_del(&rsb->res_hashchain);
715 free_rsb(rsb); 707 dlm_free_rsb(rsb);
716 } 708 }
717 } 709 }
718 710
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index e9923ca9c2d9..7c1e5e5cccd8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -864,7 +864,7 @@ static void sctp_init_assoc(struct connection *con)
864static void tcp_connect_to_sock(struct connection *con) 864static void tcp_connect_to_sock(struct connection *con)
865{ 865{
866 int result = -EHOSTUNREACH; 866 int result = -EHOSTUNREACH;
867 struct sockaddr_storage saddr; 867 struct sockaddr_storage saddr, src_addr;
868 int addr_len; 868 int addr_len;
869 struct socket *sock; 869 struct socket *sock;
870 870
@@ -898,6 +898,17 @@ static void tcp_connect_to_sock(struct connection *con)
898 con->connect_action = tcp_connect_to_sock; 898 con->connect_action = tcp_connect_to_sock;
899 add_sock(sock, con); 899 add_sock(sock, con);
900 900
 901 /* Bind to our cluster-known address when connecting, to avoid
 902 routing problems */
903 memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
904 make_sockaddr(&src_addr, 0, &addr_len);
905 result = sock->ops->bind(sock, (struct sockaddr *) &src_addr,
906 addr_len);
907 if (result < 0) {
908 log_print("could not bind for connect: %d", result);
909 /* This *may* not indicate a critical error */
910 }
911
901 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); 912 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
902 913
903 log_print("connecting to %d", con->nodeid); 914 log_print("connecting to %d", con->nodeid);
@@ -1426,6 +1437,8 @@ void dlm_lowcomms_stop(void)
1426 con = __nodeid2con(i, 0); 1437 con = __nodeid2con(i, 0);
1427 if (con) { 1438 if (con) {
1428 close_connection(con, true); 1439 close_connection(con, true);
1440 if (con->othercon)
1441 kmem_cache_free(con_cache, con->othercon);
1429 kmem_cache_free(con_cache, con); 1442 kmem_cache_free(con_cache, con);
1430 } 1443 }
1431 } 1444 }
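
The lowcomms change above explicitly binds the outgoing TCP socket to the node's first cluster address before connecting, so replies route back over the cluster interface, and it also frees con->othercon on shutdown. A minimal sketch of the bind-then-connect sequence, assuming the kernel socket ops of that era (<linux/net.h>, <linux/fcntl.h>); example_connect() and its src/dst arguments are illustrative, not code from this patch:

/* Sketch: bind to a known local address, then start a nonblocking connect.
 * As in lowcomms, a bind failure is noted but not treated as fatal here. */
static int example_connect(struct socket *sock,
                           struct sockaddr_storage *src, int src_len,
                           struct sockaddr_storage *dst, int dst_len)
{
        int result;

        result = sock->ops->bind(sock, (struct sockaddr *)src, src_len);
        if (result < 0)
                printk(KERN_INFO "bind before connect failed: %d\n", result);

        return sock->ops->connect(sock, (struct sockaddr *)dst, dst_len,
                                  O_NONBLOCK);
}
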
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index eca2907f2386..58487fb95a4c 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -18,16 +18,6 @@
18#include "memory.h" 18#include "memory.h"
19#include "config.h" 19#include "config.h"
20 20
21#ifdef CONFIG_DLM_DEBUG
22int dlm_register_debugfs(void);
23void dlm_unregister_debugfs(void);
24#else
25static inline int dlm_register_debugfs(void) { return 0; }
26static inline void dlm_unregister_debugfs(void) { }
27#endif
28int dlm_netlink_init(void);
29void dlm_netlink_exit(void);
30
31static int __init init_dlm(void) 21static int __init init_dlm(void)
32{ 22{
33 int error; 23 int error;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index e9cdcab306e2..fa17f5a27883 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -70,7 +70,7 @@ static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
70 ls->ls_num_nodes--; 70 ls->ls_num_nodes--;
71} 71}
72 72
73static int dlm_is_member(struct dlm_ls *ls, int nodeid) 73int dlm_is_member(struct dlm_ls *ls, int nodeid)
74{ 74{
75 struct dlm_member *memb; 75 struct dlm_member *memb;
76 76
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
index 927c08c19214..7a26fca1e0b5 100644
--- a/fs/dlm/member.h
+++ b/fs/dlm/member.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,7 @@ void dlm_clear_members(struct dlm_ls *ls);
19void dlm_clear_members_gone(struct dlm_ls *ls); 19void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); 20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid); 21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22int dlm_is_member(struct dlm_ls *ls, int nodeid);
22 23
23#endif /* __MEMBER_DOT_H__ */ 24#endif /* __MEMBER_DOT_H__ */
24 25
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index ecf0e5cb2035..f7783867491a 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -35,7 +35,7 @@ void dlm_memory_exit(void)
35 kmem_cache_destroy(lkb_cache); 35 kmem_cache_destroy(lkb_cache);
36} 36}
37 37
38char *allocate_lvb(struct dlm_ls *ls) 38char *dlm_allocate_lvb(struct dlm_ls *ls)
39{ 39{
40 char *p; 40 char *p;
41 41
@@ -43,7 +43,7 @@ char *allocate_lvb(struct dlm_ls *ls)
43 return p; 43 return p;
44} 44}
45 45
46void free_lvb(char *p) 46void dlm_free_lvb(char *p)
47{ 47{
48 kfree(p); 48 kfree(p);
49} 49}
@@ -51,7 +51,7 @@ void free_lvb(char *p)
51/* FIXME: have some minimal space built-in to rsb for the name and 51/* FIXME: have some minimal space built-in to rsb for the name and
52 kmalloc a separate name if needed, like dentries are done */ 52 kmalloc a separate name if needed, like dentries are done */
53 53
54struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen) 54struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
55{ 55{
56 struct dlm_rsb *r; 56 struct dlm_rsb *r;
57 57
@@ -61,14 +61,14 @@ struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
61 return r; 61 return r;
62} 62}
63 63
64void free_rsb(struct dlm_rsb *r) 64void dlm_free_rsb(struct dlm_rsb *r)
65{ 65{
66 if (r->res_lvbptr) 66 if (r->res_lvbptr)
67 free_lvb(r->res_lvbptr); 67 dlm_free_lvb(r->res_lvbptr);
68 kfree(r); 68 kfree(r);
69} 69}
70 70
71struct dlm_lkb *allocate_lkb(struct dlm_ls *ls) 71struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
72{ 72{
73 struct dlm_lkb *lkb; 73 struct dlm_lkb *lkb;
74 74
@@ -76,7 +76,7 @@ struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
76 return lkb; 76 return lkb;
77} 77}
78 78
79void free_lkb(struct dlm_lkb *lkb) 79void dlm_free_lkb(struct dlm_lkb *lkb)
80{ 80{
81 if (lkb->lkb_flags & DLM_IFL_USER) { 81 if (lkb->lkb_flags & DLM_IFL_USER) {
82 struct dlm_user_args *ua; 82 struct dlm_user_args *ua;
@@ -90,19 +90,3 @@ void free_lkb(struct dlm_lkb *lkb)
90 kmem_cache_free(lkb_cache, lkb); 90 kmem_cache_free(lkb_cache, lkb);
91} 91}
92 92
93struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
94{
95 struct dlm_direntry *de;
96
97 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
98 printk("namelen = %d\n", namelen););
99
100 de = kzalloc(sizeof(*de) + namelen, GFP_KERNEL);
101 return de;
102}
103
104void free_direntry(struct dlm_direntry *de)
105{
106 kfree(de);
107}
108
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 6ead158ccc5c..485fb29143bd 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -16,14 +16,12 @@
16 16
17int dlm_memory_init(void); 17int dlm_memory_init(void);
18void dlm_memory_exit(void); 18void dlm_memory_exit(void);
19struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen); 19struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen);
20void free_rsb(struct dlm_rsb *r); 20void dlm_free_rsb(struct dlm_rsb *r);
21struct dlm_lkb *allocate_lkb(struct dlm_ls *ls); 21struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
22void free_lkb(struct dlm_lkb *l); 22void dlm_free_lkb(struct dlm_lkb *l);
23struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen); 23char *dlm_allocate_lvb(struct dlm_ls *ls);
24void free_direntry(struct dlm_direntry *de); 24void dlm_free_lvb(char *l);
25char *allocate_lvb(struct dlm_ls *ls);
26void free_lvb(char *l);
27 25
28#endif /* __MEMORY_DOT_H__ */ 26#endif /* __MEMORY_DOT_H__ */
29 27
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index f8c69dda16a0..e69926e984db 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -58,8 +58,12 @@ static void copy_from_cb(void *dst, const void *base, unsigned offset,
58int dlm_process_incoming_buffer(int nodeid, const void *base, 58int dlm_process_incoming_buffer(int nodeid, const void *base,
59 unsigned offset, unsigned len, unsigned limit) 59 unsigned offset, unsigned len, unsigned limit)
60{ 60{
61 unsigned char __tmp[DLM_INBUF_LEN]; 61 union {
62 struct dlm_header *msg = (struct dlm_header *) __tmp; 62 unsigned char __buf[DLM_INBUF_LEN];
63 /* this is to force proper alignment on some arches */
64 struct dlm_header dlm;
65 } __tmp;
66 struct dlm_header *msg = &__tmp.dlm;
63 int ret = 0; 67 int ret = 0;
64 int err = 0; 68 int err = 0;
65 uint16_t msglen; 69 uint16_t msglen;
@@ -100,8 +104,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
100 in the buffer on the stack (which should work for most 104 in the buffer on the stack (which should work for most
101 ordinary messages). */ 105 ordinary messages). */
102 106
103 if (msglen > sizeof(__tmp) && 107 if (msglen > DLM_INBUF_LEN && msg == &__tmp.dlm) {
104 msg == (struct dlm_header *) __tmp) {
105 msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); 108 msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
106 if (msg == NULL) 109 if (msg == NULL)
107 return ret; 110 return ret;
@@ -119,7 +122,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
119 dlm_receive_buffer(msg, nodeid); 122 dlm_receive_buffer(msg, nodeid);
120 } 123 }
121 124
122 if (msg != (struct dlm_header *) __tmp) 125 if (msg != &__tmp.dlm)
123 kfree(msg); 126 kfree(msg);
124 127
125 return err ? err : ret; 128 return err ? err : ret;
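The union introduced above exists purely to guarantee that the on-stack receive buffer is aligned at least as strictly as struct dlm_header; casting a raw char array, as the old code did, can fault or misread on architectures with stricter alignment rules. A compilable sketch of the same trick, where wire_header and INBUF_LEN are stand-ins for the DLM types, not the real ones:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct wire_header {			/* stand-in for struct dlm_header */
	uint32_t h_length;
	uint32_t h_type;
};

#define INBUF_LEN 64			/* stand-in for DLM_INBUF_LEN */

int main(void)
{
	union {
		unsigned char buf[INBUF_LEN];
		struct wire_header hdr;	/* forces header alignment on the buffer */
	} tmp;
	struct wire_header *msg = &tmp.hdr;

	memset(tmp.buf, 0, sizeof(tmp.buf));
	printf("buffer aligned to %zu bytes, type %u\n",
	       (size_t)_Alignof(struct wire_header), msg->h_type);
	return 0;
}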
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index ae2fd97fa4ad..026824cd3acb 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -197,11 +197,6 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
197 spin_unlock(&ls->ls_rcom_spin); 197 spin_unlock(&ls->ls_rcom_spin);
198} 198}
199 199
200static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
201{
202 receive_sync_reply(ls, rc_in);
203}
204
205int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) 200int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
206{ 201{
207 struct dlm_rcom *rc; 202 struct dlm_rcom *rc;
@@ -254,11 +249,6 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
254 send_rcom(ls, mh, rc); 249 send_rcom(ls, mh, rc);
255} 250}
256 251
257static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
258{
259 receive_sync_reply(ls, rc_in);
260}
261
262int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) 252int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
263{ 253{
264 struct dlm_rcom *rc; 254 struct dlm_rcom *rc;
@@ -381,11 +371,6 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
381 send_rcom(ls, mh, rc); 371 send_rcom(ls, mh, rc);
382} 372}
383 373
384static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
385{
386 dlm_recover_process_copy(ls, rc_in);
387}
388
389/* If the lockspace doesn't exist then still send a status message 374/* If the lockspace doesn't exist then still send a status message
390 back; it's possible that it just doesn't have its global_id yet. */ 375 back; it's possible that it just doesn't have its global_id yet. */
391 376
@@ -481,11 +466,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
481 break; 466 break;
482 467
483 case DLM_RCOM_STATUS_REPLY: 468 case DLM_RCOM_STATUS_REPLY:
484 receive_rcom_status_reply(ls, rc); 469 receive_sync_reply(ls, rc);
485 break; 470 break;
486 471
487 case DLM_RCOM_NAMES_REPLY: 472 case DLM_RCOM_NAMES_REPLY:
488 receive_rcom_names_reply(ls, rc); 473 receive_sync_reply(ls, rc);
489 break; 474 break;
490 475
491 case DLM_RCOM_LOOKUP_REPLY: 476 case DLM_RCOM_LOOKUP_REPLY:
@@ -493,11 +478,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
493 break; 478 break;
494 479
495 case DLM_RCOM_LOCK_REPLY: 480 case DLM_RCOM_LOCK_REPLY:
496 receive_rcom_lock_reply(ls, rc); 481 dlm_recover_process_copy(ls, rc);
497 break; 482 break;
498 483
499 default: 484 default:
500 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type);); 485 log_error(ls, "receive_rcom bad type %d", rc->rc_type);
501 } 486 }
502 out: 487 out:
503 return; 488 return;
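Two things happen in the rcom.c hunks: one-line wrapper functions are dropped in favour of calling receive_sync_reply()/dlm_recover_process_copy() directly, and an unrecognised rc_type arriving off the wire is now logged instead of tripping DLM_ASSERT. A stripped-down sketch of that dispatch shape (names and the printed messages are illustrative only):

#include <stdio.h>

enum rcom_type { RCOM_STATUS_REPLY, RCOM_NAMES_REPLY, RCOM_LOCK_REPLY };

/* Dispatch directly on the wire-supplied type; an unknown value from a
 * remote node is logged and ignored rather than asserted on, since
 * untrusted remote input should never be able to crash the local node. */
static void receive_rcom(int type)
{
	switch (type) {
	case RCOM_STATUS_REPLY:
	case RCOM_NAMES_REPLY:
		printf("sync reply\n");	/* both former wrappers ended here */
		break;
	case RCOM_LOCK_REPLY:
		printf("lock reply\n");
		break;
	default:
		fprintf(stderr, "receive_rcom bad type %d\n", type);
	}
}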
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index c2cc7694cd16..df075dc300fa 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -629,7 +629,7 @@ static void recover_lvb(struct dlm_rsb *r)
629 goto out; 629 goto out;
630 630
631 if (!r->res_lvbptr) { 631 if (!r->res_lvbptr) {
632 r->res_lvbptr = allocate_lvb(r->res_ls); 632 r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
633 if (!r->res_lvbptr) 633 if (!r->res_lvbptr)
634 goto out; 634 goto out;
635 } 635 }
@@ -731,6 +731,20 @@ int dlm_create_root_list(struct dlm_ls *ls)
731 list_add(&r->res_root_list, &ls->ls_root_list); 731 list_add(&r->res_root_list, &ls->ls_root_list);
732 dlm_hold_rsb(r); 732 dlm_hold_rsb(r);
733 } 733 }
734
735 /* If we're using a directory, add tossed rsbs to the root
736 list; they'll have entries created in the new directory,
737 but no other recovery steps should do anything with them. */
738
739 if (dlm_no_directory(ls)) {
740 read_unlock(&ls->ls_rsbtbl[i].lock);
741 continue;
742 }
743
744 list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) {
745 list_add(&r->res_root_list, &ls->ls_root_list);
746 dlm_hold_rsb(r);
747 }
734 read_unlock(&ls->ls_rsbtbl[i].lock); 748 read_unlock(&ls->ls_rsbtbl[i].lock);
735 } 749 }
736 out: 750 out:
@@ -750,6 +764,11 @@ void dlm_release_root_list(struct dlm_ls *ls)
750 up_write(&ls->ls_root_sem); 764 up_write(&ls->ls_root_sem);
751} 765}
752 766
767/* If not using a directory, clear the entire toss list, there's no benefit to
768 caching the master value since it's fixed. If we are using a dir, keep the
769 rsb's we're the master of. Recovery will add them to the root list and from
770 there they'll be entered in the rebuilt directory. */
771
753void dlm_clear_toss_list(struct dlm_ls *ls) 772void dlm_clear_toss_list(struct dlm_ls *ls)
754{ 773{
755 struct dlm_rsb *r, *safe; 774 struct dlm_rsb *r, *safe;
@@ -759,8 +778,10 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
759 write_lock(&ls->ls_rsbtbl[i].lock); 778 write_lock(&ls->ls_rsbtbl[i].lock);
760 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, 779 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
761 res_hashchain) { 780 res_hashchain) {
762 list_del(&r->res_hashchain); 781 if (dlm_no_directory(ls) || !is_master(r)) {
763 free_rsb(r); 782 list_del(&r->res_hashchain);
783 dlm_free_rsb(r);
784 }
764 } 785 }
765 write_unlock(&ls->ls_rsbtbl[i].lock); 786 write_unlock(&ls->ls_rsbtbl[i].lock);
766 } 787 }
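The recover.c changes work as a pair: dlm_clear_toss_list() now frees only the tossed rsbs that are safe to drop (everything when no directory is used, non-master entries otherwise), and dlm_create_root_list() then picks up the surviving tossed entries so they get re-entered in the rebuilt directory. A self-contained sketch of that conditional prune over a simple singly linked list, with made-up field names:

#include <stdlib.h>

struct rsb {
	struct rsb *next;
	int is_master;		/* this node masters the resource */
};

/* Prune a "toss" list: with no resource directory there is nothing worth
 * caching, so everything goes; with a directory, keep the entries we
 * master so recovery can add them back to the rebuilt directory. */
static struct rsb *clear_toss_list(struct rsb *head, int no_directory)
{
	struct rsb **pp = &head;

	while (*pp) {
		struct rsb *r = *pp;
		if (no_directory || !r->is_master) {
			*pp = r->next;
			free(r);
		} else {
			pp = &r->next;
		}
	}
	return head;
}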
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 4b89e20eebe7..997f9531d594 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -67,17 +67,18 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
67 dlm_astd_resume(); 67 dlm_astd_resume();
68 68
69 /* 69 /*
70 * This list of root rsb's will be the basis of most of the recovery 70 * Free non-master tossed rsb's. Master rsb's are kept on toss
71 * routines. 71 * list and put on root list to be included in resdir recovery.
72 */ 72 */
73 73
74 dlm_create_root_list(ls); 74 dlm_clear_toss_list(ls);
75 75
76 /* 76 /*
77 * Free all the tossed rsb's so we don't have to recover them. 77 * This list of root rsb's will be the basis of most of the recovery
78 * routines.
78 */ 79 */
79 80
80 dlm_clear_toss_list(ls); 81 dlm_create_root_list(ls);
81 82
82 /* 83 /*
83 * Add or remove nodes from the lockspace's ls_nodes list. 84 * Add or remove nodes from the lockspace's ls_nodes list.
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 4f741546f4bb..7cbc6826239b 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -24,8 +24,7 @@
24#include "lvb_table.h" 24#include "lvb_table.h"
25#include "user.h" 25#include "user.h"
26 26
27static const char *name_prefix="dlm"; 27static const char name_prefix[] = "dlm";
28static struct miscdevice ctl_device;
29static const struct file_operations device_fops; 28static const struct file_operations device_fops;
30 29
31#ifdef CONFIG_COMPAT 30#ifdef CONFIG_COMPAT
@@ -82,7 +81,8 @@ struct dlm_lock_result32 {
82}; 81};
83 82
84static void compat_input(struct dlm_write_request *kb, 83static void compat_input(struct dlm_write_request *kb,
85 struct dlm_write_request32 *kb32) 84 struct dlm_write_request32 *kb32,
85 int max_namelen)
86{ 86{
87 kb->version[0] = kb32->version[0]; 87 kb->version[0] = kb32->version[0];
88 kb->version[1] = kb32->version[1]; 88 kb->version[1] = kb32->version[1];
@@ -112,7 +112,11 @@ static void compat_input(struct dlm_write_request *kb,
112 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr; 112 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
113 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb; 113 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
114 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); 114 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
115 memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen); 115 if (kb->i.lock.namelen <= max_namelen)
116 memcpy(kb->i.lock.name, kb32->i.lock.name,
117 kb->i.lock.namelen);
118 else
119 kb->i.lock.namelen = max_namelen;
116 } 120 }
117} 121}
118 122
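The new max_namelen argument to compat_input() bounds the memcpy() of a user-supplied name: a 32-bit caller controls namelen, and the bound passed in by device_write() is the space the caller actually provided after the fixed-size request header. A minimal illustration of the clamp-or-skip pattern; the struct and buffer size are invented for the example, and max_namelen is assumed not to exceed sizeof(dst->name):

#include <stdint.h>
#include <string.h>

struct lock_req {
	uint32_t namelen;
	char name[64];
};

/* Copy a caller-controlled name only if its claimed length fits in the
 * space the caller actually supplied; otherwise clamp the recorded
 * length and skip the copy, mirroring the hunk above. */
static void copy_name(struct lock_req *dst, const char *src,
		      uint32_t claimed_len, size_t max_namelen)
{
	dst->namelen = claimed_len;
	if (dst->namelen <= max_namelen)
		memcpy(dst->name, src, dst->namelen);
	else
		dst->namelen = max_namelen;
}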
@@ -236,12 +240,12 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
236 spin_unlock(&proc->asts_spin); 240 spin_unlock(&proc->asts_spin);
237 241
238 if (eol) { 242 if (eol) {
239 spin_lock(&ua->proc->locks_spin); 243 spin_lock(&proc->locks_spin);
240 if (!list_empty(&lkb->lkb_ownqueue)) { 244 if (!list_empty(&lkb->lkb_ownqueue)) {
241 list_del_init(&lkb->lkb_ownqueue); 245 list_del_init(&lkb->lkb_ownqueue);
242 dlm_put_lkb(lkb); 246 dlm_put_lkb(lkb);
243 } 247 }
244 spin_unlock(&ua->proc->locks_spin); 248 spin_unlock(&proc->locks_spin);
245 } 249 }
246 out: 250 out:
247 mutex_unlock(&ls->ls_clear_proc_locks); 251 mutex_unlock(&ls->ls_clear_proc_locks);
@@ -529,7 +533,8 @@ static ssize_t device_write(struct file *file, const char __user *buf,
529 533
530 if (proc) 534 if (proc)
531 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); 535 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
532 compat_input(kbuf, k32buf); 536 compat_input(kbuf, k32buf,
537 count - sizeof(struct dlm_write_request32));
533 kfree(k32buf); 538 kfree(k32buf);
534 } 539 }
535#endif 540#endif
@@ -896,14 +901,16 @@ static const struct file_operations ctl_device_fops = {
896 .owner = THIS_MODULE, 901 .owner = THIS_MODULE,
897}; 902};
898 903
904static struct miscdevice ctl_device = {
905 .name = "dlm-control",
906 .fops = &ctl_device_fops,
907 .minor = MISC_DYNAMIC_MINOR,
908};
909
899int dlm_user_init(void) 910int dlm_user_init(void)
900{ 911{
901 int error; 912 int error;
902 913
903 ctl_device.name = "dlm-control";
904 ctl_device.fops = &ctl_device_fops;
905 ctl_device.minor = MISC_DYNAMIC_MINOR;
906
907 error = misc_register(&ctl_device); 914 error = misc_register(&ctl_device);
908 if (error) 915 if (error)
909 log_print("misc_register failed for control device"); 916 log_print("misc_register failed for control device");
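Replacing the runtime field assignments with a statically initialised ctl_device is the idiomatic form: the object is plain data that exists before dlm_user_init() runs, and designated initialisers leave every unnamed field zeroed. The same shape in plain C, with a hypothetical miscdev type standing in for struct miscdevice:

#include <stdio.h>

struct miscdev {
	const char *name;
	int minor;
	int (*open)(void);
};

static int ctl_open(void) { return 0; }

/* Fields are named explicitly; everything else is implicitly zero. */
static struct miscdev ctl_device = {
	.name  = "dlm-control",
	.minor = -1,		/* stand-in for MISC_DYNAMIC_MINOR */
	.open  = ctl_open,
};

int main(void)
{
	printf("%s minor %d\n", ctl_device.name, ctl_device.minor);
	return ctl_device.open();
}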
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 963889cf6740..4d9c1f4e1bd1 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,14 @@
14#include "rcom.h" 14#include "rcom.h"
15#include "util.h" 15#include "util.h"
16 16
17#define DLM_ERRNO_EDEADLK 35
18#define DLM_ERRNO_EBADR 53
19#define DLM_ERRNO_EBADSLT 57
20#define DLM_ERRNO_EPROTO 71
21#define DLM_ERRNO_EOPNOTSUPP 95
22#define DLM_ERRNO_ETIMEDOUT 110
23#define DLM_ERRNO_EINPROGRESS 115
24
17static void header_out(struct dlm_header *hd) 25static void header_out(struct dlm_header *hd)
18{ 26{
19 hd->h_version = cpu_to_le32(hd->h_version); 27 hd->h_version = cpu_to_le32(hd->h_version);
@@ -30,11 +38,54 @@ static void header_in(struct dlm_header *hd)
30 hd->h_length = le16_to_cpu(hd->h_length); 38 hd->h_length = le16_to_cpu(hd->h_length);
31} 39}
32 40
33void dlm_message_out(struct dlm_message *ms) 41/* higher errno values are inconsistent across architectures, so select
42 one set of values for on the wire */
43
44static int to_dlm_errno(int err)
45{
46 switch (err) {
47 case -EDEADLK:
48 return -DLM_ERRNO_EDEADLK;
49 case -EBADR:
50 return -DLM_ERRNO_EBADR;
51 case -EBADSLT:
52 return -DLM_ERRNO_EBADSLT;
53 case -EPROTO:
54 return -DLM_ERRNO_EPROTO;
55 case -EOPNOTSUPP:
56 return -DLM_ERRNO_EOPNOTSUPP;
57 case -ETIMEDOUT:
58 return -DLM_ERRNO_ETIMEDOUT;
59 case -EINPROGRESS:
60 return -DLM_ERRNO_EINPROGRESS;
61 }
62 return err;
63}
64
65static int from_dlm_errno(int err)
34{ 66{
35 struct dlm_header *hd = (struct dlm_header *) ms; 67 switch (err) {
68 case -DLM_ERRNO_EDEADLK:
69 return -EDEADLK;
70 case -DLM_ERRNO_EBADR:
71 return -EBADR;
72 case -DLM_ERRNO_EBADSLT:
73 return -EBADSLT;
74 case -DLM_ERRNO_EPROTO:
75 return -EPROTO;
76 case -DLM_ERRNO_EOPNOTSUPP:
77 return -EOPNOTSUPP;
78 case -DLM_ERRNO_ETIMEDOUT:
79 return -ETIMEDOUT;
80 case -DLM_ERRNO_EINPROGRESS:
81 return -EINPROGRESS;
82 }
83 return err;
84}
36 85
37 header_out(hd); 86void dlm_message_out(struct dlm_message *ms)
87{
88 header_out(&ms->m_header);
38 89
39 ms->m_type = cpu_to_le32(ms->m_type); 90 ms->m_type = cpu_to_le32(ms->m_type);
40 ms->m_nodeid = cpu_to_le32(ms->m_nodeid); 91 ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
@@ -53,14 +104,12 @@ void dlm_message_out(struct dlm_message *ms)
53 ms->m_rqmode = cpu_to_le32(ms->m_rqmode); 104 ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
54 ms->m_bastmode = cpu_to_le32(ms->m_bastmode); 105 ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
55 ms->m_asts = cpu_to_le32(ms->m_asts); 106 ms->m_asts = cpu_to_le32(ms->m_asts);
56 ms->m_result = cpu_to_le32(ms->m_result); 107 ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result));
57} 108}
58 109
59void dlm_message_in(struct dlm_message *ms) 110void dlm_message_in(struct dlm_message *ms)
60{ 111{
61 struct dlm_header *hd = (struct dlm_header *) ms; 112 header_in(&ms->m_header);
62
63 header_in(hd);
64 113
65 ms->m_type = le32_to_cpu(ms->m_type); 114 ms->m_type = le32_to_cpu(ms->m_type);
66 ms->m_nodeid = le32_to_cpu(ms->m_nodeid); 115 ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
@@ -79,7 +128,7 @@ void dlm_message_in(struct dlm_message *ms)
79 ms->m_rqmode = le32_to_cpu(ms->m_rqmode); 128 ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
80 ms->m_bastmode = le32_to_cpu(ms->m_bastmode); 129 ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
81 ms->m_asts = le32_to_cpu(ms->m_asts); 130 ms->m_asts = le32_to_cpu(ms->m_asts);
82 ms->m_result = le32_to_cpu(ms->m_result); 131 ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result));
83} 132}
84 133
85static void rcom_lock_out(struct rcom_lock *rl) 134static void rcom_lock_out(struct rcom_lock *rl)
@@ -126,10 +175,9 @@ static void rcom_config_in(struct rcom_config *rf)
126 175
127void dlm_rcom_out(struct dlm_rcom *rc) 176void dlm_rcom_out(struct dlm_rcom *rc)
128{ 177{
129 struct dlm_header *hd = (struct dlm_header *) rc;
130 int type = rc->rc_type; 178 int type = rc->rc_type;
131 179
132 header_out(hd); 180 header_out(&rc->rc_header);
133 181
134 rc->rc_type = cpu_to_le32(rc->rc_type); 182 rc->rc_type = cpu_to_le32(rc->rc_type);
135 rc->rc_result = cpu_to_le32(rc->rc_result); 183 rc->rc_result = cpu_to_le32(rc->rc_result);
@@ -137,7 +185,7 @@ void dlm_rcom_out(struct dlm_rcom *rc)
137 rc->rc_seq = cpu_to_le64(rc->rc_seq); 185 rc->rc_seq = cpu_to_le64(rc->rc_seq);
138 rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); 186 rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply);
139 187
140 if (type == DLM_RCOM_LOCK) 188 if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
141 rcom_lock_out((struct rcom_lock *) rc->rc_buf); 189 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
142 190
143 else if (type == DLM_RCOM_STATUS_REPLY) 191 else if (type == DLM_RCOM_STATUS_REPLY)
@@ -146,9 +194,9 @@ void dlm_rcom_out(struct dlm_rcom *rc)
146 194
147void dlm_rcom_in(struct dlm_rcom *rc) 195void dlm_rcom_in(struct dlm_rcom *rc)
148{ 196{
149 struct dlm_header *hd = (struct dlm_header *) rc; 197 int type;
150 198
151 header_in(hd); 199 header_in(&rc->rc_header);
152 200
153 rc->rc_type = le32_to_cpu(rc->rc_type); 201 rc->rc_type = le32_to_cpu(rc->rc_type);
154 rc->rc_result = le32_to_cpu(rc->rc_result); 202 rc->rc_result = le32_to_cpu(rc->rc_result);
@@ -156,10 +204,12 @@ void dlm_rcom_in(struct dlm_rcom *rc)
156 rc->rc_seq = le64_to_cpu(rc->rc_seq); 204 rc->rc_seq = le64_to_cpu(rc->rc_seq);
157 rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); 205 rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply);
158 206
159 if (rc->rc_type == DLM_RCOM_LOCK) 207 type = rc->rc_type;
208
209 if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
160 rcom_lock_in((struct rcom_lock *) rc->rc_buf); 210 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
161 211
162 else if (rc->rc_type == DLM_RCOM_STATUS_REPLY) 212 else if (type == DLM_RCOM_STATUS_REPLY)
163 rcom_config_in((struct rcom_config *) rc->rc_buf); 213 rcom_config_in((struct rcom_config *) rc->rc_buf);
164} 214}
165 215
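The new to_dlm_errno()/from_dlm_errno() pair pins the handful of errno values whose numeric codes differ between architectures to fixed on-wire numbers, so the m_result of a message generated on one arch decodes to the same error on another; low-numbered errnos are identical everywhere and pass through unchanged. A trimmed, standalone version of the idea covering just two of the values:

#include <errno.h>

#define WIRE_ETIMEDOUT		110
#define WIRE_EINPROGRESS	115

/* Convert a local errno to the fixed on-wire value before sending... */
static int to_wire_errno(int err)
{
	switch (err) {
	case -ETIMEDOUT:	return -WIRE_ETIMEDOUT;
	case -EINPROGRESS:	return -WIRE_EINPROGRESS;
	}
	return err;		/* low errnos are the same on every arch */
}

/* ...and back to the local value on receive. */
static int from_wire_errno(int err)
{
	switch (err) {
	case -WIRE_ETIMEDOUT:	return -ETIMEDOUT;
	case -WIRE_EINPROGRESS:	return -EINPROGRESS;
	}
	return err;
}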
diff --git a/include/asm-x86/Kbuild b/include/asm-x86/Kbuild
index e6189b229143..3c6f0f80e827 100644
--- a/include/asm-x86/Kbuild
+++ b/include/asm-x86/Kbuild
@@ -3,6 +3,7 @@ include include/asm-generic/Kbuild.asm
3header-y += boot.h 3header-y += boot.h
4header-y += bootparam.h 4header-y += bootparam.h
5header-y += debugreg.h 5header-y += debugreg.h
6header-y += kvm.h
6header-y += ldt.h 7header-y += ldt.h
7header-y += msr-index.h 8header-y += msr-index.h
8header-y += prctl.h 9header-y += prctl.h
diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h
new file mode 100644
index 000000000000..7a71120426a3
--- /dev/null
+++ b/include/asm-x86/kvm.h
@@ -0,0 +1,191 @@
1#ifndef __LINUX_KVM_X86_H
2#define __LINUX_KVM_X86_H
3
4/*
5 * KVM x86 specific structures and definitions
6 *
7 */
8
9#include <asm/types.h>
10#include <linux/ioctl.h>
11
12/* Architectural interrupt line count. */
13#define KVM_NR_INTERRUPTS 256
14
15struct kvm_memory_alias {
16 __u32 slot; /* this has a different namespace than memory slots */
17 __u32 flags;
18 __u64 guest_phys_addr;
19 __u64 memory_size;
20 __u64 target_phys_addr;
21};
22
23/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
24struct kvm_pic_state {
25 __u8 last_irr; /* edge detection */
26 __u8 irr; /* interrupt request register */
27 __u8 imr; /* interrupt mask register */
28 __u8 isr; /* interrupt service register */
29 __u8 priority_add; /* highest irq priority */
30 __u8 irq_base;
31 __u8 read_reg_select;
32 __u8 poll;
33 __u8 special_mask;
34 __u8 init_state;
35 __u8 auto_eoi;
36 __u8 rotate_on_auto_eoi;
37 __u8 special_fully_nested_mode;
38 __u8 init4; /* true if 4 byte init */
39 __u8 elcr; /* PIIX edge/trigger selection */
40 __u8 elcr_mask;
41};
42
43#define KVM_IOAPIC_NUM_PINS 24
44struct kvm_ioapic_state {
45 __u64 base_address;
46 __u32 ioregsel;
47 __u32 id;
48 __u32 irr;
49 __u32 pad;
50 union {
51 __u64 bits;
52 struct {
53 __u8 vector;
54 __u8 delivery_mode:3;
55 __u8 dest_mode:1;
56 __u8 delivery_status:1;
57 __u8 polarity:1;
58 __u8 remote_irr:1;
59 __u8 trig_mode:1;
60 __u8 mask:1;
61 __u8 reserve:7;
62 __u8 reserved[4];
63 __u8 dest_id;
64 } fields;
65 } redirtbl[KVM_IOAPIC_NUM_PINS];
66};
67
68#define KVM_IRQCHIP_PIC_MASTER 0
69#define KVM_IRQCHIP_PIC_SLAVE 1
70#define KVM_IRQCHIP_IOAPIC 2
71
72/* for KVM_GET_REGS and KVM_SET_REGS */
73struct kvm_regs {
74 /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
75 __u64 rax, rbx, rcx, rdx;
76 __u64 rsi, rdi, rsp, rbp;
77 __u64 r8, r9, r10, r11;
78 __u64 r12, r13, r14, r15;
79 __u64 rip, rflags;
80};
81
82/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
83#define KVM_APIC_REG_SIZE 0x400
84struct kvm_lapic_state {
85 char regs[KVM_APIC_REG_SIZE];
86};
87
88struct kvm_segment {
89 __u64 base;
90 __u32 limit;
91 __u16 selector;
92 __u8 type;
93 __u8 present, dpl, db, s, l, g, avl;
94 __u8 unusable;
95 __u8 padding;
96};
97
98struct kvm_dtable {
99 __u64 base;
100 __u16 limit;
101 __u16 padding[3];
102};
103
104
105/* for KVM_GET_SREGS and KVM_SET_SREGS */
106struct kvm_sregs {
107 /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
108 struct kvm_segment cs, ds, es, fs, gs, ss;
109 struct kvm_segment tr, ldt;
110 struct kvm_dtable gdt, idt;
111 __u64 cr0, cr2, cr3, cr4, cr8;
112 __u64 efer;
113 __u64 apic_base;
114 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
115};
116
117/* for KVM_GET_FPU and KVM_SET_FPU */
118struct kvm_fpu {
119 __u8 fpr[8][16];
120 __u16 fcw;
121 __u16 fsw;
122 __u8 ftwx; /* in fxsave format */
123 __u8 pad1;
124 __u16 last_opcode;
125 __u64 last_ip;
126 __u64 last_dp;
127 __u8 xmm[16][16];
128 __u32 mxcsr;
129 __u32 pad2;
130};
131
132struct kvm_msr_entry {
133 __u32 index;
134 __u32 reserved;
135 __u64 data;
136};
137
138/* for KVM_GET_MSRS and KVM_SET_MSRS */
139struct kvm_msrs {
140 __u32 nmsrs; /* number of msrs in entries */
141 __u32 pad;
142
143 struct kvm_msr_entry entries[0];
144};
145
146/* for KVM_GET_MSR_INDEX_LIST */
147struct kvm_msr_list {
148 __u32 nmsrs; /* number of msrs in entries */
149 __u32 indices[0];
150};
151
152
153struct kvm_cpuid_entry {
154 __u32 function;
155 __u32 eax;
156 __u32 ebx;
157 __u32 ecx;
158 __u32 edx;
159 __u32 padding;
160};
161
162/* for KVM_SET_CPUID */
163struct kvm_cpuid {
164 __u32 nent;
165 __u32 padding;
166 struct kvm_cpuid_entry entries[0];
167};
168
169struct kvm_cpuid_entry2 {
170 __u32 function;
171 __u32 index;
172 __u32 flags;
173 __u32 eax;
174 __u32 ebx;
175 __u32 ecx;
176 __u32 edx;
177 __u32 padding[3];
178};
179
180#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
181#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
182#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
183
184/* for KVM_SET_CPUID2 */
185struct kvm_cpuid2 {
186 __u32 nent;
187 __u32 padding;
188 struct kvm_cpuid_entry2 entries[0];
189};
190
191#endif
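These structures form the userspace ABI consumed through /dev/kvm ioctls; kvm_regs, for instance, is what a VMM reads and writes with KVM_GET_REGS/KVM_SET_REGS on a vcpu file descriptor. A sketch of how a userspace tool might dump the guest instruction pointer, assuming a vcpu fd has already been obtained via KVM_CREATE_VM/KVM_CREATE_VCPU and with error handling trimmed:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Print the guest rip/rflags for an already-created vcpu fd. */
static int dump_rip(int vcpu_fd)
{
	struct kvm_regs regs;

	if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0) {
		perror("KVM_GET_REGS");
		return -1;
	}
	printf("guest rip = 0x%llx rflags = 0x%llx\n",
	       (unsigned long long)regs.rip,
	       (unsigned long long)regs.rflags);
	return 0;
}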
diff --git a/drivers/kvm/kvm.h b/include/asm-x86/kvm_host.h
index 3b0bc4bda5f2..4702b04b979a 100644
--- a/drivers/kvm/kvm.h
+++ b/include/asm-x86/kvm_host.h
@@ -1,23 +1,24 @@
1#ifndef __KVM_H 1#/*
2#define __KVM_H 2 * Kernel-based Virtual Machine driver for Linux
3 3 *
4/* 4 * This header defines architecture specific interfaces, x86 version
5 *
5 * This work is licensed under the terms of the GNU GPL, version 2. See 6 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory. 7 * the COPYING file in the top-level directory.
8 *
7 */ 9 */
8 10
11#ifndef ASM_KVM_HOST_H
12#define ASM_KVM_HOST_H
13
9#include <linux/types.h> 14#include <linux/types.h>
10#include <linux/list.h>
11#include <linux/mutex.h>
12#include <linux/spinlock.h>
13#include <linux/signal.h>
14#include <linux/sched.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/preempt.h>
17#include <asm/signal.h>
18 16
19#include <linux/kvm.h> 17#include <linux/kvm.h>
20#include <linux/kvm_para.h> 18#include <linux/kvm_para.h>
19#include <linux/kvm_types.h>
20
21#include <asm/desc.h>
21 22
22#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) 23#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
23#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) 24#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
@@ -37,15 +38,8 @@
37#define INVALID_PAGE (~(hpa_t)0) 38#define INVALID_PAGE (~(hpa_t)0)
38#define UNMAPPED_GVA (~(gpa_t)0) 39#define UNMAPPED_GVA (~(gpa_t)0)
39 40
40#define KVM_MAX_VCPUS 4
41#define KVM_ALIAS_SLOTS 4
42#define KVM_MEMORY_SLOTS 8
43#define KVM_NUM_MMU_PAGES 1024
44#define KVM_MIN_FREE_MMU_PAGES 5
45#define KVM_REFILL_PAGES 25
46#define KVM_MAX_CPUID_ENTRIES 40
47
48#define DE_VECTOR 0 41#define DE_VECTOR 0
42#define UD_VECTOR 6
49#define NM_VECTOR 7 43#define NM_VECTOR 7
50#define DF_VECTOR 8 44#define DF_VECTOR 8
51#define TS_VECTOR 10 45#define TS_VECTOR 10
@@ -59,31 +53,66 @@
59 53
60#define IOPL_SHIFT 12 54#define IOPL_SHIFT 12
61 55
62#define KVM_PIO_PAGE_OFFSET 1 56#define KVM_ALIAS_SLOTS 4
63 57
64/* 58#define KVM_PERMILLE_MMU_PAGES 20
65 * vcpu->requests bit members 59#define KVM_MIN_ALLOC_MMU_PAGES 64
66 */ 60#define KVM_NUM_MMU_PAGES 1024
67#define KVM_TLB_FLUSH 0 61#define KVM_MIN_FREE_MMU_PAGES 5
62#define KVM_REFILL_PAGES 25
63#define KVM_MAX_CPUID_ENTRIES 40
68 64
69/* 65extern spinlock_t kvm_lock;
70 * Address types: 66extern struct list_head vm_list;
71 * 67
72 * gva - guest virtual address 68struct kvm_vcpu;
73 * gpa - guest physical address 69struct kvm;
74 * gfn - guest frame number 70
75 * hva - host virtual address 71enum {
76 * hpa - host physical address 72 VCPU_REGS_RAX = 0,
77 * hfn - host frame number 73 VCPU_REGS_RCX = 1,
78 */ 74 VCPU_REGS_RDX = 2,
75 VCPU_REGS_RBX = 3,
76 VCPU_REGS_RSP = 4,
77 VCPU_REGS_RBP = 5,
78 VCPU_REGS_RSI = 6,
79 VCPU_REGS_RDI = 7,
80#ifdef CONFIG_X86_64
81 VCPU_REGS_R8 = 8,
82 VCPU_REGS_R9 = 9,
83 VCPU_REGS_R10 = 10,
84 VCPU_REGS_R11 = 11,
85 VCPU_REGS_R12 = 12,
86 VCPU_REGS_R13 = 13,
87 VCPU_REGS_R14 = 14,
88 VCPU_REGS_R15 = 15,
89#endif
90 NR_VCPU_REGS
91};
92
93enum {
94 VCPU_SREG_CS,
95 VCPU_SREG_DS,
96 VCPU_SREG_ES,
97 VCPU_SREG_FS,
98 VCPU_SREG_GS,
99 VCPU_SREG_SS,
100 VCPU_SREG_TR,
101 VCPU_SREG_LDTR,
102};
79 103
80typedef unsigned long gva_t; 104#include <asm/kvm_x86_emulate.h>
81typedef u64 gpa_t;
82typedef unsigned long gfn_t;
83 105
84typedef unsigned long hva_t; 106#define KVM_NR_MEM_OBJS 40
85typedef u64 hpa_t; 107
86typedef unsigned long hfn_t; 108/*
109 * We don't want allocation failures within the mmu code, so we preallocate
110 * enough memory for a single page fault in a cache.
111 */
112struct kvm_mmu_memory_cache {
113 int nobjs;
114 void *objects[KVM_NR_MEM_OBJS];
115};
87 116
88#define NR_PTE_CHAIN_ENTRIES 5 117#define NR_PTE_CHAIN_ENTRIES 5
89 118
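kvm_mmu_memory_cache implements the comment above it: the MMU tops the cache up with ordinary allocations before entering paths where failing or sleeping is not acceptable, then pulls objects straight out of it while handling a page fault. A userspace-flavoured sketch of the same topup/consume pattern, with illustrative names and the caller responsible for keeping min within NR_MEM_OBJS:

#include <stdlib.h>

#define NR_MEM_OBJS 40

struct memory_cache {
	int nobjs;
	void *objects[NR_MEM_OBJS];
};

/* Fill the cache up front, where an allocation failure can still be handled. */
static int cache_topup(struct memory_cache *mc, size_t objsize, int min)
{
	while (mc->nobjs < min) {
		void *obj = malloc(objsize);
		if (!obj)
			return -1;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

/* Later, in the fault path, taking an object can no longer fail. */
static void *cache_alloc(struct memory_cache *mc)
{
	return mc->objects[--mc->nobjs];
}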
@@ -99,7 +128,7 @@ struct kvm_pte_chain {
99 * bits 4:7 - page table level for this shadow (1-4) 128 * bits 4:7 - page table level for this shadow (1-4)
100 * bits 8:9 - page table quadrant for 2-level guests 129 * bits 8:9 - page table quadrant for 2-level guests
101 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) 130 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
102 * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde 131 * bits 17:19 - common access permissions for all ptes in this shadow page
103 */ 132 */
104union kvm_mmu_page_role { 133union kvm_mmu_page_role {
105 unsigned word; 134 unsigned word;
@@ -109,7 +138,7 @@ union kvm_mmu_page_role {
109 unsigned quadrant : 2; 138 unsigned quadrant : 2;
110 unsigned pad_for_nice_hex_output : 6; 139 unsigned pad_for_nice_hex_output : 6;
111 unsigned metaphysical : 1; 140 unsigned metaphysical : 1;
112 unsigned hugepage_access : 3; 141 unsigned access : 3;
113 }; 142 };
114}; 143};
115 144
@@ -125,6 +154,8 @@ struct kvm_mmu_page {
125 union kvm_mmu_page_role role; 154 union kvm_mmu_page_role role;
126 155
127 u64 *spt; 156 u64 *spt;
157 /* hold the gfn of each spte inside spt */
158 gfn_t *gfns;
128 unsigned long slot_bitmap; /* One bit set per slot which has memory 159 unsigned long slot_bitmap; /* One bit set per slot which has memory
129 * in this shadow page. 160 * in this shadow page.
130 */ 161 */
@@ -136,9 +167,6 @@ struct kvm_mmu_page {
136 }; 167 };
137}; 168};
138 169
139struct kvm_vcpu;
140extern struct kmem_cache *kvm_vcpu_cache;
141
142/* 170/*
143 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level 171 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
144 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu 172 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
@@ -149,6 +177,8 @@ struct kvm_mmu {
149 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 177 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
150 void (*free)(struct kvm_vcpu *vcpu); 178 void (*free)(struct kvm_vcpu *vcpu);
151 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); 179 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
180 void (*prefetch_page)(struct kvm_vcpu *vcpu,
181 struct kvm_mmu_page *page);
152 hpa_t root_hpa; 182 hpa_t root_hpa;
153 int root_level; 183 int root_level;
154 int shadow_root_level; 184 int shadow_root_level;
@@ -156,159 +186,9 @@ struct kvm_mmu {
156 u64 *pae_root; 186 u64 *pae_root;
157}; 187};
158 188
159#define KVM_NR_MEM_OBJS 20 189struct kvm_vcpu_arch {
160
161struct kvm_mmu_memory_cache {
162 int nobjs;
163 void *objects[KVM_NR_MEM_OBJS];
164};
165
166/*
167 * We don't want allocation failures within the mmu code, so we preallocate
168 * enough memory for a single page fault in a cache.
169 */
170struct kvm_guest_debug {
171 int enabled;
172 unsigned long bp[4];
173 int singlestep;
174};
175
176enum {
177 VCPU_REGS_RAX = 0,
178 VCPU_REGS_RCX = 1,
179 VCPU_REGS_RDX = 2,
180 VCPU_REGS_RBX = 3,
181 VCPU_REGS_RSP = 4,
182 VCPU_REGS_RBP = 5,
183 VCPU_REGS_RSI = 6,
184 VCPU_REGS_RDI = 7,
185#ifdef CONFIG_X86_64
186 VCPU_REGS_R8 = 8,
187 VCPU_REGS_R9 = 9,
188 VCPU_REGS_R10 = 10,
189 VCPU_REGS_R11 = 11,
190 VCPU_REGS_R12 = 12,
191 VCPU_REGS_R13 = 13,
192 VCPU_REGS_R14 = 14,
193 VCPU_REGS_R15 = 15,
194#endif
195 NR_VCPU_REGS
196};
197
198enum {
199 VCPU_SREG_CS,
200 VCPU_SREG_DS,
201 VCPU_SREG_ES,
202 VCPU_SREG_FS,
203 VCPU_SREG_GS,
204 VCPU_SREG_SS,
205 VCPU_SREG_TR,
206 VCPU_SREG_LDTR,
207};
208
209struct kvm_pio_request {
210 unsigned long count;
211 int cur_count;
212 struct page *guest_pages[2];
213 unsigned guest_page_offset;
214 int in;
215 int port;
216 int size;
217 int string;
218 int down;
219 int rep;
220};
221
222struct kvm_stat {
223 u32 pf_fixed;
224 u32 pf_guest;
225 u32 tlb_flush;
226 u32 invlpg;
227
228 u32 exits;
229 u32 io_exits;
230 u32 mmio_exits;
231 u32 signal_exits;
232 u32 irq_window_exits;
233 u32 halt_exits;
234 u32 halt_wakeup;
235 u32 request_irq_exits;
236 u32 irq_exits;
237 u32 light_exits;
238 u32 efer_reload;
239};
240
241struct kvm_io_device {
242 void (*read)(struct kvm_io_device *this,
243 gpa_t addr,
244 int len,
245 void *val);
246 void (*write)(struct kvm_io_device *this,
247 gpa_t addr,
248 int len,
249 const void *val);
250 int (*in_range)(struct kvm_io_device *this, gpa_t addr);
251 void (*destructor)(struct kvm_io_device *this);
252
253 void *private;
254};
255
256static inline void kvm_iodevice_read(struct kvm_io_device *dev,
257 gpa_t addr,
258 int len,
259 void *val)
260{
261 dev->read(dev, addr, len, val);
262}
263
264static inline void kvm_iodevice_write(struct kvm_io_device *dev,
265 gpa_t addr,
266 int len,
267 const void *val)
268{
269 dev->write(dev, addr, len, val);
270}
271
272static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
273{
274 return dev->in_range(dev, addr);
275}
276
277static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
278{
279 if (dev->destructor)
280 dev->destructor(dev);
281}
282
283/*
284 * It would be nice to use something smarter than a linear search, TBD...
285 * Thankfully we dont expect many devices to register (famous last words :),
286 * so until then it will suffice. At least its abstracted so we can change
287 * in one place.
288 */
289struct kvm_io_bus {
290 int dev_count;
291#define NR_IOBUS_DEVS 6
292 struct kvm_io_device *devs[NR_IOBUS_DEVS];
293};
294
295void kvm_io_bus_init(struct kvm_io_bus *bus);
296void kvm_io_bus_destroy(struct kvm_io_bus *bus);
297struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
298void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
299 struct kvm_io_device *dev);
300
301struct kvm_vcpu {
302 struct kvm *kvm;
303 struct preempt_notifier preempt_notifier;
304 int vcpu_id;
305 struct mutex mutex;
306 int cpu;
307 u64 host_tsc; 190 u64 host_tsc;
308 struct kvm_run *run;
309 int interrupt_window_open; 191 int interrupt_window_open;
310 int guest_mode;
311 unsigned long requests;
312 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ 192 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
313 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); 193 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
314 unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ 194 unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
@@ -317,9 +197,6 @@ struct kvm_vcpu {
317 unsigned long cr0; 197 unsigned long cr0;
318 unsigned long cr2; 198 unsigned long cr2;
319 unsigned long cr3; 199 unsigned long cr3;
320 gpa_t para_state_gpa;
321 struct page *para_state_page;
322 gpa_t hypercall_gpa;
323 unsigned long cr4; 200 unsigned long cr4;
324 unsigned long cr8; 201 unsigned long cr8;
325 u64 pdptrs[4]; /* pae */ 202 u64 pdptrs[4]; /* pae */
@@ -334,6 +211,7 @@ struct kvm_vcpu {
334 int mp_state; 211 int mp_state;
335 int sipi_vector; 212 int sipi_vector;
336 u64 ia32_misc_enable_msr; 213 u64 ia32_misc_enable_msr;
214 bool tpr_access_reporting;
337 215
338 struct kvm_mmu mmu; 216 struct kvm_mmu mmu;
339 217
@@ -344,29 +222,26 @@ struct kvm_vcpu {
344 222
345 gfn_t last_pt_write_gfn; 223 gfn_t last_pt_write_gfn;
346 int last_pt_write_count; 224 int last_pt_write_count;
225 u64 *last_pte_updated;
347 226
348 struct kvm_guest_debug guest_debug; 227 struct {
228 gfn_t gfn; /* presumed gfn during guest pte update */
229 struct page *page; /* page corresponding to that gfn */
230 } update_pte;
349 231
350 struct i387_fxsave_struct host_fx_image; 232 struct i387_fxsave_struct host_fx_image;
351 struct i387_fxsave_struct guest_fx_image; 233 struct i387_fxsave_struct guest_fx_image;
352 int fpu_active; 234
353 int guest_fpu_loaded;
354
355 int mmio_needed;
356 int mmio_read_completed;
357 int mmio_is_write;
358 int mmio_size;
359 unsigned char mmio_data[8];
360 gpa_t mmio_phys_addr;
361 gva_t mmio_fault_cr2; 235 gva_t mmio_fault_cr2;
362 struct kvm_pio_request pio; 236 struct kvm_pio_request pio;
363 void *pio_data; 237 void *pio_data;
364 wait_queue_head_t wq;
365 238
366 int sigset_active; 239 struct kvm_queued_exception {
367 sigset_t sigset; 240 bool pending;
368 241 bool has_error_code;
369 struct kvm_stat stat; 242 u8 nr;
243 u32 error_code;
244 } exception;
370 245
371 struct { 246 struct {
372 int active; 247 int active;
@@ -381,7 +256,10 @@ struct kvm_vcpu {
381 int halt_request; /* real mode on Intel only */ 256 int halt_request; /* real mode on Intel only */
382 257
383 int cpuid_nent; 258 int cpuid_nent;
384 struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; 259 struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
260 /* emulate context */
261
262 struct x86_emulate_ctxt emulate_ctxt;
385}; 263};
386 264
387struct kvm_mem_alias { 265struct kvm_mem_alias {
@@ -390,51 +268,58 @@ struct kvm_mem_alias {
390 gfn_t target_gfn; 268 gfn_t target_gfn;
391}; 269};
392 270
393struct kvm_memory_slot { 271struct kvm_arch{
394 gfn_t base_gfn;
395 unsigned long npages;
396 unsigned long flags;
397 struct page **phys_mem;
398 unsigned long *dirty_bitmap;
399};
400
401struct kvm {
402 struct mutex lock; /* protects everything except vcpus */
403 int naliases; 272 int naliases;
404 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; 273 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
405 int nmemslots; 274
406 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; 275 unsigned int n_free_mmu_pages;
276 unsigned int n_requested_mmu_pages;
277 unsigned int n_alloc_mmu_pages;
278 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
407 /* 279 /*
408 * Hash table of struct kvm_mmu_page. 280 * Hash table of struct kvm_mmu_page.
409 */ 281 */
410 struct list_head active_mmu_pages; 282 struct list_head active_mmu_pages;
411 int n_free_mmu_pages;
412 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
413 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
414 unsigned long rmap_overflow;
415 struct list_head vm_list;
416 struct file *filp;
417 struct kvm_io_bus mmio_bus;
418 struct kvm_io_bus pio_bus;
419 struct kvm_pic *vpic; 283 struct kvm_pic *vpic;
420 struct kvm_ioapic *vioapic; 284 struct kvm_ioapic *vioapic;
285
421 int round_robin_prev_vcpu; 286 int round_robin_prev_vcpu;
287 unsigned int tss_addr;
288 struct page *apic_access_page;
422}; 289};
423 290
424static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) 291struct kvm_vm_stat {
425{ 292 u32 mmu_shadow_zapped;
426 return kvm->vpic; 293 u32 mmu_pte_write;
427} 294 u32 mmu_pte_updated;
295 u32 mmu_pde_zapped;
296 u32 mmu_flooded;
297 u32 mmu_recycled;
298 u32 mmu_cache_miss;
299 u32 remote_tlb_flush;
300};
428 301
429static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) 302struct kvm_vcpu_stat {
430{ 303 u32 pf_fixed;
431 return kvm->vioapic; 304 u32 pf_guest;
432} 305 u32 tlb_flush;
306 u32 invlpg;
433 307
434static inline int irqchip_in_kernel(struct kvm *kvm) 308 u32 exits;
435{ 309 u32 io_exits;
436 return pic_irqchip(kvm) != 0; 310 u32 mmio_exits;
437} 311 u32 signal_exits;
312 u32 irq_window_exits;
313 u32 halt_exits;
314 u32 halt_wakeup;
315 u32 request_irq_exits;
316 u32 irq_exits;
317 u32 host_state_reload;
318 u32 efer_reload;
319 u32 fpu_reload;
320 u32 insn_emulation;
321 u32 insn_emulation_fail;
322};
438 323
439struct descriptor_table { 324struct descriptor_table {
440 u16 limit; 325 u16 limit;
@@ -449,11 +334,12 @@ struct kvm_x86_ops {
449 void (*check_processor_compatibility)(void *rtn); 334 void (*check_processor_compatibility)(void *rtn);
450 int (*hardware_setup)(void); /* __init */ 335 int (*hardware_setup)(void); /* __init */
451 void (*hardware_unsetup)(void); /* __exit */ 336 void (*hardware_unsetup)(void); /* __exit */
337 bool (*cpu_has_accelerated_tpr)(void);
452 338
453 /* Create, but do not attach this VCPU */ 339 /* Create, but do not attach this VCPU */
454 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); 340 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
455 void (*vcpu_free)(struct kvm_vcpu *vcpu); 341 void (*vcpu_free)(struct kvm_vcpu *vcpu);
456 void (*vcpu_reset)(struct kvm_vcpu *vcpu); 342 int (*vcpu_reset)(struct kvm_vcpu *vcpu);
457 343
458 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); 344 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
459 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 345 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
@@ -489,10 +375,6 @@ struct kvm_x86_ops {
489 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 375 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
490 376
491 void (*tlb_flush)(struct kvm_vcpu *vcpu); 377 void (*tlb_flush)(struct kvm_vcpu *vcpu);
492 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
493 unsigned long addr, u32 err_code);
494
495 void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
496 378
497 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 379 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
498 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); 380 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
@@ -501,54 +383,31 @@ struct kvm_x86_ops {
501 unsigned char *hypercall_addr); 383 unsigned char *hypercall_addr);
502 int (*get_irq)(struct kvm_vcpu *vcpu); 384 int (*get_irq)(struct kvm_vcpu *vcpu);
503 void (*set_irq)(struct kvm_vcpu *vcpu, int vec); 385 void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
386 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
387 bool has_error_code, u32 error_code);
388 bool (*exception_injected)(struct kvm_vcpu *vcpu);
504 void (*inject_pending_irq)(struct kvm_vcpu *vcpu); 389 void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
505 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, 390 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
506 struct kvm_run *run); 391 struct kvm_run *run);
392
393 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
507}; 394};
508 395
509extern struct kvm_x86_ops *kvm_x86_ops; 396extern struct kvm_x86_ops *kvm_x86_ops;
510 397
511/* The guest did something we don't support. */
512#define pr_unimpl(vcpu, fmt, ...) \
513 do { \
514 if (printk_ratelimit()) \
515 printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
516 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
517 } while(0)
518
519#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
520#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
521
522int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
523void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
524
525int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
526 struct module *module);
527void kvm_exit_x86(void);
528
529int kvm_mmu_module_init(void); 398int kvm_mmu_module_init(void);
530void kvm_mmu_module_exit(void); 399void kvm_mmu_module_exit(void);
531 400
532void kvm_mmu_destroy(struct kvm_vcpu *vcpu); 401void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
533int kvm_mmu_create(struct kvm_vcpu *vcpu); 402int kvm_mmu_create(struct kvm_vcpu *vcpu);
534int kvm_mmu_setup(struct kvm_vcpu *vcpu); 403int kvm_mmu_setup(struct kvm_vcpu *vcpu);
404void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
535 405
536int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 406int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
537void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 407void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
538void kvm_mmu_zap_all(struct kvm *kvm); 408void kvm_mmu_zap_all(struct kvm *kvm);
539 409unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
540hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); 410void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
541#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
542#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
543static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
544hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
545struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
546
547extern hpa_t bad_page_address;
548
549struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
550struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
551void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
552 411
553enum emulation_result { 412enum emulation_result {
554 EMULATE_DONE, /* no further processing */ 413 EMULATE_DONE, /* no further processing */
@@ -556,8 +415,10 @@ enum emulation_result {
556 EMULATE_FAIL, /* can't emulate this instruction */ 415 EMULATE_FAIL, /* can't emulate this instruction */
557}; 416};
558 417
418#define EMULTYPE_NO_DECODE (1 << 0)
419#define EMULTYPE_TRAP_UD (1 << 1)
559int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 420int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
560 unsigned long cr2, u16 error_code); 421 unsigned long cr2, u16 error_code, int emulation_type);
561void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 422void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
562void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 423void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
563void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 424void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -572,7 +433,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
572 433
573struct x86_emulate_ctxt; 434struct x86_emulate_ctxt;
574 435
575int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 436int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
576 int size, unsigned port); 437 int size, unsigned port);
577int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 438int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
578 int size, unsigned long count, int down, 439 int size, unsigned long count, int down,
@@ -581,7 +442,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
581int kvm_emulate_halt(struct kvm_vcpu *vcpu); 442int kvm_emulate_halt(struct kvm_vcpu *vcpu);
582int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 443int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
583int emulate_clts(struct kvm_vcpu *vcpu); 444int emulate_clts(struct kvm_vcpu *vcpu);
584int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, 445int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
585 unsigned long *dest); 446 unsigned long *dest);
586int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, 447int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
587 unsigned long value); 448 unsigned long value);
@@ -597,15 +458,15 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
597int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 458int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
598int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 459int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
599 460
600void fx_init(struct kvm_vcpu *vcpu); 461void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
462void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
463void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
464 u32 error_code);
601 465
602void kvm_resched(struct kvm_vcpu *vcpu); 466void fx_init(struct kvm_vcpu *vcpu);
603void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
604void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
605void kvm_flush_remote_tlbs(struct kvm *kvm);
606 467
607int emulator_read_std(unsigned long addr, 468int emulator_read_std(unsigned long addr,
608 void *val, 469 void *val,
609 unsigned int bytes, 470 unsigned int bytes,
610 struct kvm_vcpu *vcpu); 471 struct kvm_vcpu *vcpu);
611int emulator_write_emulated(unsigned long addr, 472int emulator_write_emulated(unsigned long addr,
@@ -615,6 +476,7 @@ int emulator_write_emulated(unsigned long addr,
615 476
616unsigned long segment_base(u16 selector); 477unsigned long segment_base(u16 selector);
617 478
479void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
618void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 480void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
619 const u8 *new, int bytes); 481 const u8 *new, int bytes);
620int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); 482int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
@@ -622,66 +484,14 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
622int kvm_mmu_load(struct kvm_vcpu *vcpu); 484int kvm_mmu_load(struct kvm_vcpu *vcpu);
623void kvm_mmu_unload(struct kvm_vcpu *vcpu); 485void kvm_mmu_unload(struct kvm_vcpu *vcpu);
624 486
625int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); 487int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
626 488
627static inline void kvm_guest_enter(void) 489int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
628{
629 current->flags |= PF_VCPU;
630}
631 490
632static inline void kvm_guest_exit(void) 491int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
633{
634 current->flags &= ~PF_VCPU;
635}
636 492
637static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 493int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
638 u32 error_code) 494int complete_pio(struct kvm_vcpu *vcpu);
639{
640 return vcpu->mmu.page_fault(vcpu, gva, error_code);
641}
642
643static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
644{
645 if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
646 __kvm_mmu_free_some_pages(vcpu);
647}
648
649static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
650{
651 if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
652 return 0;
653
654 return kvm_mmu_load(vcpu);
655}
656
657static inline int is_long_mode(struct kvm_vcpu *vcpu)
658{
659#ifdef CONFIG_X86_64
660 return vcpu->shadow_efer & EFER_LME;
661#else
662 return 0;
663#endif
664}
665
666static inline int is_pae(struct kvm_vcpu *vcpu)
667{
668 return vcpu->cr4 & X86_CR4_PAE;
669}
670
671static inline int is_pse(struct kvm_vcpu *vcpu)
672{
673 return vcpu->cr4 & X86_CR4_PSE;
674}
675
676static inline int is_paging(struct kvm_vcpu *vcpu)
677{
678 return vcpu->cr0 & X86_CR0_PG;
679}
680
681static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
682{
683 return slot - kvm->memslots;
684}
685 495
686static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 496static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
687{ 497{
@@ -693,55 +503,55 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
693static inline u16 read_fs(void) 503static inline u16 read_fs(void)
694{ 504{
695 u16 seg; 505 u16 seg;
696 asm ("mov %%fs, %0" : "=g"(seg)); 506 asm("mov %%fs, %0" : "=g"(seg));
697 return seg; 507 return seg;
698} 508}
699 509
700static inline u16 read_gs(void) 510static inline u16 read_gs(void)
701{ 511{
702 u16 seg; 512 u16 seg;
703 asm ("mov %%gs, %0" : "=g"(seg)); 513 asm("mov %%gs, %0" : "=g"(seg));
704 return seg; 514 return seg;
705} 515}
706 516
707static inline u16 read_ldt(void) 517static inline u16 read_ldt(void)
708{ 518{
709 u16 ldt; 519 u16 ldt;
710 asm ("sldt %0" : "=g"(ldt)); 520 asm("sldt %0" : "=g"(ldt));
711 return ldt; 521 return ldt;
712} 522}
713 523
714static inline void load_fs(u16 sel) 524static inline void load_fs(u16 sel)
715{ 525{
716 asm ("mov %0, %%fs" : : "rm"(sel)); 526 asm("mov %0, %%fs" : : "rm"(sel));
717} 527}
718 528
719static inline void load_gs(u16 sel) 529static inline void load_gs(u16 sel)
720{ 530{
721 asm ("mov %0, %%gs" : : "rm"(sel)); 531 asm("mov %0, %%gs" : : "rm"(sel));
722} 532}
723 533
724#ifndef load_ldt 534#ifndef load_ldt
725static inline void load_ldt(u16 sel) 535static inline void load_ldt(u16 sel)
726{ 536{
727 asm ("lldt %0" : : "rm"(sel)); 537 asm("lldt %0" : : "rm"(sel));
728} 538}
729#endif 539#endif
730 540
731static inline void get_idt(struct descriptor_table *table) 541static inline void get_idt(struct descriptor_table *table)
732{ 542{
733 asm ("sidt %0" : "=m"(*table)); 543 asm("sidt %0" : "=m"(*table));
734} 544}
735 545
736static inline void get_gdt(struct descriptor_table *table) 546static inline void get_gdt(struct descriptor_table *table)
737{ 547{
738 asm ("sgdt %0" : "=m"(*table)); 548 asm("sgdt %0" : "=m"(*table));
739} 549}
740 550
741static inline unsigned long read_tr_base(void) 551static inline unsigned long read_tr_base(void)
742{ 552{
743 u16 tr; 553 u16 tr;
744 asm ("str %0" : "=g"(tr)); 554 asm("str %0" : "=g"(tr));
745 return segment_base(tr); 555 return segment_base(tr);
746} 556}
747 557
@@ -757,17 +567,17 @@ static inline unsigned long read_msr(unsigned long msr)
757 567
758static inline void fx_save(struct i387_fxsave_struct *image) 568static inline void fx_save(struct i387_fxsave_struct *image)
759{ 569{
760 asm ("fxsave (%0)":: "r" (image)); 570 asm("fxsave (%0)":: "r" (image));
761} 571}
762 572
763static inline void fx_restore(struct i387_fxsave_struct *image) 573static inline void fx_restore(struct i387_fxsave_struct *image)
764{ 574{
765 asm ("fxrstor (%0)":: "r" (image)); 575 asm("fxrstor (%0)":: "r" (image));
766} 576}
767 577
768static inline void fpu_init(void) 578static inline void fpu_init(void)
769{ 579{
770 asm ("finit"); 580 asm("finit");
771} 581}
772 582
773static inline u32 get_rdx_init_val(void) 583static inline u32 get_rdx_init_val(void)
@@ -775,6 +585,11 @@ static inline u32 get_rdx_init_val(void)
775 return 0x600; /* P6 family */ 585 return 0x600; /* P6 family */
776} 586}
777 587
588static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
589{
590 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
591}
592
778#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" 593#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
779#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" 594#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
780#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" 595#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
new file mode 100644
index 000000000000..c6f3fd8d8c53
--- /dev/null
+++ b/include/asm-x86/kvm_para.h
@@ -0,0 +1,105 @@
1#ifndef __X86_KVM_PARA_H
2#define __X86_KVM_PARA_H
3
4/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
5 * should be used to determine that a VM is running under KVM.
6 */
7#define KVM_CPUID_SIGNATURE 0x40000000
8
9/* This CPUID returns a feature bitmap in eax. Before enabling a particular
10 * paravirtualization, the appropriate feature bit should be checked.
11 */
12#define KVM_CPUID_FEATURES 0x40000001
13
14#ifdef __KERNEL__
15#include <asm/processor.h>
16
17/* This instruction is vmcall. On non-VT architectures, it will generate a
18 * trap that we will then rewrite to the appropriate instruction.
19 */
20#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
21
22/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
23 * instruction. The hypervisor may replace it with something else but only the
24 * instructions are guaranteed to be supported.
25 *
26 * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
27 * The hypercall number should be placed in rax and the return value will be
28 * placed in rax. No other registers will be clobbered unless explicitly
29 * noted by the particular hypercall.
30 */
31
32static inline long kvm_hypercall0(unsigned int nr)
33{
34 long ret;
35 asm volatile(KVM_HYPERCALL
36 : "=a"(ret)
37 : "a"(nr));
38 return ret;
39}
40
41static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
42{
43 long ret;
44 asm volatile(KVM_HYPERCALL
45 : "=a"(ret)
46 : "a"(nr), "b"(p1));
47 return ret;
48}
49
50static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
51 unsigned long p2)
52{
53 long ret;
54 asm volatile(KVM_HYPERCALL
55 : "=a"(ret)
56 : "a"(nr), "b"(p1), "c"(p2));
57 return ret;
58}
59
60static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
61 unsigned long p2, unsigned long p3)
62{
63 long ret;
64 asm volatile(KVM_HYPERCALL
65 : "=a"(ret)
66 : "a"(nr), "b"(p1), "c"(p2), "d"(p3));
67 return ret;
68}
69
70static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
71 unsigned long p2, unsigned long p3,
72 unsigned long p4)
73{
74 long ret;
75 asm volatile(KVM_HYPERCALL
76 : "=a"(ret)
77 : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4));
78 return ret;
79}
80
81static inline int kvm_para_available(void)
82{
83 unsigned int eax, ebx, ecx, edx;
84 char signature[13];
85
86 cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
87 memcpy(signature + 0, &ebx, 4);
88 memcpy(signature + 4, &ecx, 4);
89 memcpy(signature + 8, &edx, 4);
90 signature[12] = 0;
91
92 if (strcmp(signature, "KVMKVMKVM") == 0)
93 return 1;
94
95 return 0;
96}
97
98static inline unsigned int kvm_arch_para_features(void)
99{
100 return cpuid_eax(KVM_CPUID_FEATURES);
101}
102
103#endif
104
105#endif
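
The helpers in this new header are enough for guest code to detect KVM before touching any paravirtual feature. A minimal sketch of that probe, using only what the header declares; the function name is illustrative, not part of the patch:

#include <linux/kernel.h>
#include <asm/kvm_para.h>

/* Report whether we are running under KVM and dump the feature bitmap. */
static void __init example_report_kvm(void)
{
        /* kvm_para_available() checks CPUID 0x40000000 for "KVMKVMKVM". */
        if (!kvm_para_available()) {
                printk(KERN_INFO "example: not running under KVM\n");
                return;
        }

        /* CPUID 0x40000001 returns the paravirtual feature bitmap in eax. */
        printk(KERN_INFO "example: KVM detected, features 0x%x\n",
               kvm_arch_para_features());
}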
diff --git a/drivers/kvm/x86_emulate.h b/include/asm-x86/kvm_x86_emulate.h
index 92c73aa7f9ac..7db91b9bdcd4 100644
--- a/drivers/kvm/x86_emulate.h
+++ b/include/asm-x86/kvm_x86_emulate.h
@@ -63,17 +63,6 @@ struct x86_emulate_ops {
63 unsigned int bytes, struct kvm_vcpu *vcpu); 63 unsigned int bytes, struct kvm_vcpu *vcpu);
64 64
65 /* 65 /*
66 * write_std: Write bytes of standard (non-emulated/special) memory.
67 * Used for stack operations, and others.
68 * @addr: [IN ] Linear address to which to write.
69 * @val: [IN ] Value to write to memory (low-order bytes used as
70 * required).
71 * @bytes: [IN ] Number of bytes to write to memory.
72 */
73 int (*write_std)(unsigned long addr, const void *val,
74 unsigned int bytes, struct kvm_vcpu *vcpu);
75
76 /*
77 * read_emulated: Read bytes from emulated/special memory area. 66 * read_emulated: Read bytes from emulated/special memory area.
78 * @addr: [IN ] Linear address from which to read. 67 * @addr: [IN ] Linear address from which to read.
79 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 68 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
@@ -112,13 +101,50 @@ struct x86_emulate_ops {
112 101
113}; 102};
114 103
104/* Type, address-of, and value of an instruction's operand. */
105struct operand {
106 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
107 unsigned int bytes;
108 unsigned long val, orig_val, *ptr;
109};
110
111struct fetch_cache {
112 u8 data[15];
113 unsigned long start;
114 unsigned long end;
115};
116
117struct decode_cache {
118 u8 twobyte;
119 u8 b;
120 u8 lock_prefix;
121 u8 rep_prefix;
122 u8 op_bytes;
123 u8 ad_bytes;
124 u8 rex_prefix;
125 struct operand src;
126 struct operand dst;
127 unsigned long *override_base;
128 unsigned int d;
129 unsigned long regs[NR_VCPU_REGS];
130 unsigned long eip;
131 /* modrm */
132 u8 modrm;
133 u8 modrm_mod;
134 u8 modrm_reg;
135 u8 modrm_rm;
136 u8 use_modrm_ea;
137 unsigned long modrm_ea;
138 unsigned long modrm_val;
139 struct fetch_cache fetch;
140};
141
115struct x86_emulate_ctxt { 142struct x86_emulate_ctxt {
116 /* Register state before/after emulation. */ 143 /* Register state before/after emulation. */
117 struct kvm_vcpu *vcpu; 144 struct kvm_vcpu *vcpu;
118 145
119 /* Linear faulting address (if emulating a page-faulting instruction). */ 146 /* Linear faulting address (if emulating a page-faulting instruction). */
120 unsigned long eflags; 147 unsigned long eflags;
121 unsigned long cr2;
122 148
123 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 149 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
124 int mode; 150 int mode;
@@ -129,8 +155,16 @@ struct x86_emulate_ctxt {
129 unsigned long ss_base; 155 unsigned long ss_base;
130 unsigned long gs_base; 156 unsigned long gs_base;
131 unsigned long fs_base; 157 unsigned long fs_base;
158
159 /* decode cache */
160
161 struct decode_cache decode;
132}; 162};
133 163
164/* Repeat String Operation Prefix */
165#define REPE_PREFIX 1
166#define REPNE_PREFIX 2
167
134/* Execution mode, passed to the emulator. */ 168/* Execution mode, passed to the emulator. */
135#define X86EMUL_MODE_REAL 0 /* Real mode. */ 169#define X86EMUL_MODE_REAL 0 /* Real mode. */
136#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ 170#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
@@ -144,12 +178,9 @@ struct x86_emulate_ctxt {
144#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 178#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
145#endif 179#endif
146 180
147/* 181int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
148 * x86_emulate_memop: Emulate an instruction that faulted attempting to 182 struct x86_emulate_ops *ops);
149 * read/write a 'special' memory area. 183int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
150 * Returns -1 on failure, 0 on success. 184 struct x86_emulate_ops *ops);
151 */
152int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
153 struct x86_emulate_ops *ops);
154 185
155#endif /* __X86_EMULATE_H__ */ 186#endif /* __X86_EMULATE_H__ */
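
The split of x86_emulate_memop() into x86_decode_insn() and x86_emulate_insn() gives callers a two-phase interface: decode once into ctxt->decode, then execute. A hedged sketch of the call order, assuming the caller has already populated the context and an ops table as arch/x86/kvm/x86.c does (illustration only):

#include <asm/kvm_x86_emulate.h>

/* Decode the current instruction, then run the emulator over it. */
static int example_emulate(struct x86_emulate_ctxt *ctxt,
                           struct x86_emulate_ops *ops)
{
        int rc;

        rc = x86_decode_insn(ctxt, ops);        /* fills in ctxt->decode */
        if (rc)
                return rc;                      /* could not decode */

        return x86_emulate_insn(ctxt, ops);
}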
diff --git a/include/asm-x86/lguest.h b/include/asm-x86/lguest.h
index 1c8367a692f6..4d9367b72976 100644
--- a/include/asm-x86/lguest.h
+++ b/include/asm-x86/lguest.h
@@ -56,7 +56,7 @@ struct lguest_ro_state
56 struct desc_struct guest_gdt[GDT_ENTRIES]; 56 struct desc_struct guest_gdt[GDT_ENTRIES];
57}; 57};
58 58
59struct lguest_arch 59struct lg_cpu_arch
60{ 60{
61 /* The GDT entries copied into lguest_ro_state when running. */ 61 /* The GDT entries copied into lguest_ro_state when running. */
62 struct desc_struct gdt[GDT_ENTRIES]; 62 struct desc_struct gdt[GDT_ENTRIES];
diff --git a/include/asm-x86/lguest_hcall.h b/include/asm-x86/lguest_hcall.h
index 2091779e91fb..758b9a5d4539 100644
--- a/include/asm-x86/lguest_hcall.h
+++ b/include/asm-x86/lguest_hcall.h
@@ -4,7 +4,7 @@
4 4
5#define LHCALL_FLUSH_ASYNC 0 5#define LHCALL_FLUSH_ASYNC 0
6#define LHCALL_LGUEST_INIT 1 6#define LHCALL_LGUEST_INIT 1
7#define LHCALL_CRASH 2 7#define LHCALL_SHUTDOWN 2
8#define LHCALL_LOAD_GDT 3 8#define LHCALL_LOAD_GDT 3
9#define LHCALL_NEW_PGTABLE 4 9#define LHCALL_NEW_PGTABLE 4
10#define LHCALL_FLUSH_TLB 5 10#define LHCALL_FLUSH_TLB 5
@@ -20,6 +20,10 @@
20 20
21#define LGUEST_TRAP_ENTRY 0x1F 21#define LGUEST_TRAP_ENTRY 0x1F
22 22
23/* Argument number 3 to LHCALL_SHUTDOWN */
24#define LGUEST_SHUTDOWN_POWEROFF 1
25#define LGUEST_SHUTDOWN_RESTART 2
26
23#ifndef __ASSEMBLY__ 27#ifndef __ASSEMBLY__
24#include <asm/hw_irq.h> 28#include <asm/hw_irq.h>
25 29
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 27b9350052b4..85b2482cc736 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -100,7 +100,6 @@ header-y += iso_fs.h
100header-y += ixjuser.h 100header-y += ixjuser.h
101header-y += jffs2.h 101header-y += jffs2.h
102header-y += keyctl.h 102header-y += keyctl.h
103header-y += kvm.h
104header-y += limits.h 103header-y += limits.h
105header-y += lock_dlm_plock.h 104header-y += lock_dlm_plock.h
106header-y += magic.h 105header-y += magic.h
@@ -256,6 +255,7 @@ unifdef-y += kd.h
256unifdef-y += kernelcapi.h 255unifdef-y += kernelcapi.h
257unifdef-y += kernel.h 256unifdef-y += kernel.h
258unifdef-y += keyboard.h 257unifdef-y += keyboard.h
258unifdef-$(CONFIG_HAVE_KVM) += kvm.h
259unifdef-y += llc.h 259unifdef-y += llc.h
260unifdef-y += loop.h 260unifdef-y += loop.h
261unifdef-y += lp.h 261unifdef-y += lp.h
diff --git a/include/linux/audit.h b/include/linux/audit.h
index c68781692838..bdd6f5de5fc4 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -115,6 +115,8 @@
115#define AUDIT_MAC_IPSEC_ADDSPD 1413 /* Not used */ 115#define AUDIT_MAC_IPSEC_ADDSPD 1413 /* Not used */
116#define AUDIT_MAC_IPSEC_DELSPD 1414 /* Not used */ 116#define AUDIT_MAC_IPSEC_DELSPD 1414 /* Not used */
117#define AUDIT_MAC_IPSEC_EVENT 1415 /* Audit an IPSec event */ 117#define AUDIT_MAC_IPSEC_EVENT 1415 /* Audit an IPSec event */
118#define AUDIT_MAC_UNLBL_STCADD 1416 /* NetLabel: add a static label */
119#define AUDIT_MAC_UNLBL_STCDEL 1417 /* NetLabel: del a static label */
118 120
119#define AUDIT_FIRST_KERN_ANOM_MSG 1700 121#define AUDIT_FIRST_KERN_ANOM_MSG 1700
120#define AUDIT_LAST_KERN_ANOM_MSG 1799 122#define AUDIT_LAST_KERN_ANOM_MSG 1799
diff --git a/include/linux/device.h b/include/linux/device.h
index 1880208964d6..db375be333c7 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -84,6 +84,9 @@ int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data,
84struct device *bus_find_device(struct bus_type *bus, struct device *start, 84struct device *bus_find_device(struct bus_type *bus, struct device *start,
85 void *data, 85 void *data,
86 int (*match)(struct device *dev, void *data)); 86 int (*match)(struct device *dev, void *data));
87struct device *bus_find_device_by_name(struct bus_type *bus,
88 struct device *start,
89 const char *name);
87 90
88int __must_check bus_for_each_drv(struct bus_type *bus, 91int __must_check bus_for_each_drv(struct bus_type *bus,
89 struct device_driver *start, void *data, 92 struct device_driver *start, void *data,
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 057a7f34ee36..4de4fd2d8607 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -9,12 +9,10 @@
9 9
10#include <asm/types.h> 10#include <asm/types.h>
11#include <linux/ioctl.h> 11#include <linux/ioctl.h>
12#include <asm/kvm.h>
12 13
13#define KVM_API_VERSION 12 14#define KVM_API_VERSION 12
14 15
15/* Architectural interrupt line count. */
16#define KVM_NR_INTERRUPTS 256
17
18/* for KVM_CREATE_MEMORY_REGION */ 16/* for KVM_CREATE_MEMORY_REGION */
19struct kvm_memory_region { 17struct kvm_memory_region {
20 __u32 slot; 18 __u32 slot;
@@ -23,17 +21,19 @@ struct kvm_memory_region {
23 __u64 memory_size; /* bytes */ 21 __u64 memory_size; /* bytes */
24}; 22};
25 23
26/* for kvm_memory_region::flags */ 24/* for KVM_SET_USER_MEMORY_REGION */
27#define KVM_MEM_LOG_DIRTY_PAGES 1UL 25struct kvm_userspace_memory_region {
28 26 __u32 slot;
29struct kvm_memory_alias {
30 __u32 slot; /* this has a different namespace than memory slots */
31 __u32 flags; 27 __u32 flags;
32 __u64 guest_phys_addr; 28 __u64 guest_phys_addr;
33 __u64 memory_size; 29 __u64 memory_size; /* bytes */
34 __u64 target_phys_addr; 30 __u64 userspace_addr; /* start of the userspace allocated memory */
35}; 31};
36 32
33/* for kvm_memory_region::flags */
34#define KVM_MEM_LOG_DIRTY_PAGES 1UL
35
36
37/* for KVM_IRQ_LINE */ 37/* for KVM_IRQ_LINE */
38struct kvm_irq_level { 38struct kvm_irq_level {
39 /* 39 /*
@@ -45,62 +45,18 @@ struct kvm_irq_level {
45 __u32 level; 45 __u32 level;
46}; 46};
47 47
48/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
49struct kvm_pic_state {
50 __u8 last_irr; /* edge detection */
51 __u8 irr; /* interrupt request register */
52 __u8 imr; /* interrupt mask register */
53 __u8 isr; /* interrupt service register */
54 __u8 priority_add; /* highest irq priority */
55 __u8 irq_base;
56 __u8 read_reg_select;
57 __u8 poll;
58 __u8 special_mask;
59 __u8 init_state;
60 __u8 auto_eoi;
61 __u8 rotate_on_auto_eoi;
62 __u8 special_fully_nested_mode;
63 __u8 init4; /* true if 4 byte init */
64 __u8 elcr; /* PIIX edge/trigger selection */
65 __u8 elcr_mask;
66};
67
68#define KVM_IOAPIC_NUM_PINS 24
69struct kvm_ioapic_state {
70 __u64 base_address;
71 __u32 ioregsel;
72 __u32 id;
73 __u32 irr;
74 __u32 pad;
75 union {
76 __u64 bits;
77 struct {
78 __u8 vector;
79 __u8 delivery_mode:3;
80 __u8 dest_mode:1;
81 __u8 delivery_status:1;
82 __u8 polarity:1;
83 __u8 remote_irr:1;
84 __u8 trig_mode:1;
85 __u8 mask:1;
86 __u8 reserve:7;
87 __u8 reserved[4];
88 __u8 dest_id;
89 } fields;
90 } redirtbl[KVM_IOAPIC_NUM_PINS];
91};
92
93#define KVM_IRQCHIP_PIC_MASTER 0
94#define KVM_IRQCHIP_PIC_SLAVE 1
95#define KVM_IRQCHIP_IOAPIC 2
96 48
97struct kvm_irqchip { 49struct kvm_irqchip {
98 __u32 chip_id; 50 __u32 chip_id;
99 __u32 pad; 51 __u32 pad;
100 union { 52 union {
101 char dummy[512]; /* reserving space */ 53 char dummy[512]; /* reserving space */
54#ifdef CONFIG_X86
102 struct kvm_pic_state pic; 55 struct kvm_pic_state pic;
56#endif
57#if defined(CONFIG_X86) || defined(CONFIG_IA64)
103 struct kvm_ioapic_state ioapic; 58 struct kvm_ioapic_state ioapic;
59#endif
104 } chip; 60 } chip;
105}; 61};
106 62
@@ -116,6 +72,7 @@ struct kvm_irqchip {
116#define KVM_EXIT_FAIL_ENTRY 9 72#define KVM_EXIT_FAIL_ENTRY 9
117#define KVM_EXIT_INTR 10 73#define KVM_EXIT_INTR 10
118#define KVM_EXIT_SET_TPR 11 74#define KVM_EXIT_SET_TPR 11
75#define KVM_EXIT_TPR_ACCESS 12
119 76
120/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ 77/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
121struct kvm_run { 78struct kvm_run {
@@ -174,90 +131,17 @@ struct kvm_run {
174 __u32 longmode; 131 __u32 longmode;
175 __u32 pad; 132 __u32 pad;
176 } hypercall; 133 } hypercall;
134 /* KVM_EXIT_TPR_ACCESS */
135 struct {
136 __u64 rip;
137 __u32 is_write;
138 __u32 pad;
139 } tpr_access;
177 /* Fix the size of the union. */ 140 /* Fix the size of the union. */
178 char padding[256]; 141 char padding[256];
179 }; 142 };
180}; 143};
181 144
182/* for KVM_GET_REGS and KVM_SET_REGS */
183struct kvm_regs {
184 /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
185 __u64 rax, rbx, rcx, rdx;
186 __u64 rsi, rdi, rsp, rbp;
187 __u64 r8, r9, r10, r11;
188 __u64 r12, r13, r14, r15;
189 __u64 rip, rflags;
190};
191
192/* for KVM_GET_FPU and KVM_SET_FPU */
193struct kvm_fpu {
194 __u8 fpr[8][16];
195 __u16 fcw;
196 __u16 fsw;
197 __u8 ftwx; /* in fxsave format */
198 __u8 pad1;
199 __u16 last_opcode;
200 __u64 last_ip;
201 __u64 last_dp;
202 __u8 xmm[16][16];
203 __u32 mxcsr;
204 __u32 pad2;
205};
206
207/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
208#define KVM_APIC_REG_SIZE 0x400
209struct kvm_lapic_state {
210 char regs[KVM_APIC_REG_SIZE];
211};
212
213struct kvm_segment {
214 __u64 base;
215 __u32 limit;
216 __u16 selector;
217 __u8 type;
218 __u8 present, dpl, db, s, l, g, avl;
219 __u8 unusable;
220 __u8 padding;
221};
222
223struct kvm_dtable {
224 __u64 base;
225 __u16 limit;
226 __u16 padding[3];
227};
228
229/* for KVM_GET_SREGS and KVM_SET_SREGS */
230struct kvm_sregs {
231 /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
232 struct kvm_segment cs, ds, es, fs, gs, ss;
233 struct kvm_segment tr, ldt;
234 struct kvm_dtable gdt, idt;
235 __u64 cr0, cr2, cr3, cr4, cr8;
236 __u64 efer;
237 __u64 apic_base;
238 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
239};
240
241struct kvm_msr_entry {
242 __u32 index;
243 __u32 reserved;
244 __u64 data;
245};
246
247/* for KVM_GET_MSRS and KVM_SET_MSRS */
248struct kvm_msrs {
249 __u32 nmsrs; /* number of msrs in entries */
250 __u32 pad;
251
252 struct kvm_msr_entry entries[0];
253};
254
255/* for KVM_GET_MSR_INDEX_LIST */
256struct kvm_msr_list {
257 __u32 nmsrs; /* number of msrs in entries */
258 __u32 indices[0];
259};
260
261/* for KVM_TRANSLATE */ 145/* for KVM_TRANSLATE */
262struct kvm_translation { 146struct kvm_translation {
263 /* in */ 147 /* in */
@@ -302,28 +186,24 @@ struct kvm_dirty_log {
302 }; 186 };
303}; 187};
304 188
305struct kvm_cpuid_entry {
306 __u32 function;
307 __u32 eax;
308 __u32 ebx;
309 __u32 ecx;
310 __u32 edx;
311 __u32 padding;
312};
313
314/* for KVM_SET_CPUID */
315struct kvm_cpuid {
316 __u32 nent;
317 __u32 padding;
318 struct kvm_cpuid_entry entries[0];
319};
320
321/* for KVM_SET_SIGNAL_MASK */ 189/* for KVM_SET_SIGNAL_MASK */
322struct kvm_signal_mask { 190struct kvm_signal_mask {
323 __u32 len; 191 __u32 len;
324 __u8 sigset[0]; 192 __u8 sigset[0];
325}; 193};
326 194
195/* for KVM_TPR_ACCESS_REPORTING */
196struct kvm_tpr_access_ctl {
197 __u32 enabled;
198 __u32 flags;
199 __u32 reserved[8];
200};
201
202/* for KVM_SET_VAPIC_ADDR */
203struct kvm_vapic_addr {
204 __u64 vapic_addr;
205};
206
327#define KVMIO 0xAE 207#define KVMIO 0xAE
328 208
329/* 209/*
@@ -347,11 +227,21 @@ struct kvm_signal_mask {
347 */ 227 */
348#define KVM_CAP_IRQCHIP 0 228#define KVM_CAP_IRQCHIP 0
349#define KVM_CAP_HLT 1 229#define KVM_CAP_HLT 1
230#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
231#define KVM_CAP_USER_MEMORY 3
232#define KVM_CAP_SET_TSS_ADDR 4
233#define KVM_CAP_EXT_CPUID 5
234#define KVM_CAP_VAPIC 6
350 235
351/* 236/*
352 * ioctls for VM fds 237 * ioctls for VM fds
353 */ 238 */
354#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) 239#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region)
240#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
241#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
242#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
243 struct kvm_userspace_memory_region)
244#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
355/* 245/*
356 * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns 246 * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
357 * a vcpu fd. 247 * a vcpu fd.
@@ -359,6 +249,7 @@ struct kvm_signal_mask {
359#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) 249#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
360#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) 250#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
361#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) 251#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
252#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x48, struct kvm_cpuid2)
362/* Device model IOC */ 253/* Device model IOC */
363#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) 254#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60)
364#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) 255#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level)
@@ -384,5 +275,11 @@ struct kvm_signal_mask {
384#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) 275#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu)
385#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) 276#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state)
386#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) 277#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state)
278#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2)
279#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
280/* Available with KVM_CAP_VAPIC */
281#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl)
282/* Available with KVM_CAP_VAPIC */
283#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr)
387 284
388#endif 285#endif
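
The new KVM_SET_USER_MEMORY_REGION ioctl lets userspace back guest physical memory with its own mappings instead of kernel-allocated slots. A hedged userspace sketch, assuming vm_fd was already obtained through /dev/kvm and with error handling trimmed (illustration only):

#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/* Register `size` bytes of anonymous memory as guest RAM at GPA 0. */
static int example_setup_guest_ram(int vm_fd, unsigned long long size)
{
        struct kvm_userspace_memory_region mem;
        void *ram;

        ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (ram == MAP_FAILED)
                return -1;

        memset(&mem, 0, sizeof(mem));
        mem.slot = 0;
        mem.flags = 0;                  /* or KVM_MEM_LOG_DIRTY_PAGES */
        mem.guest_phys_addr = 0;
        mem.memory_size = size;
        mem.userspace_addr = (unsigned long long)(unsigned long)ram;

        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
}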
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
new file mode 100644
index 000000000000..ea4764b0a2f4
--- /dev/null
+++ b/include/linux/kvm_host.h
@@ -0,0 +1,299 @@
1#ifndef __KVM_HOST_H
2#define __KVM_HOST_H
3
4/*
5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory.
7 */
8
9#include <linux/types.h>
10#include <linux/hardirq.h>
11#include <linux/list.h>
12#include <linux/mutex.h>
13#include <linux/spinlock.h>
14#include <linux/signal.h>
15#include <linux/sched.h>
16#include <linux/mm.h>
17#include <linux/preempt.h>
18#include <asm/signal.h>
19
20#include <linux/kvm.h>
21#include <linux/kvm_para.h>
22
23#include <linux/kvm_types.h>
24
25#include <asm/kvm_host.h>
26
27#define KVM_MAX_VCPUS 4
28#define KVM_MEMORY_SLOTS 8
29/* memory slots that are not exposed to userspace */
30#define KVM_PRIVATE_MEM_SLOTS 4
31
32#define KVM_PIO_PAGE_OFFSET 1
33
34/*
35 * vcpu->requests bit members
36 */
37#define KVM_REQ_TLB_FLUSH 0
38#define KVM_REQ_MIGRATE_TIMER 1
39#define KVM_REQ_REPORT_TPR_ACCESS 2
40
41struct kvm_vcpu;
42extern struct kmem_cache *kvm_vcpu_cache;
43
44struct kvm_guest_debug {
45 int enabled;
46 unsigned long bp[4];
47 int singlestep;
48};
49
50/*
51 * It would be nice to use something smarter than a linear search, TBD...
52 * Thankfully we don't expect many devices to register (famous last words :),
53 * so until then it will suffice. At least it's abstracted so we can change
54 * in one place.
55 */
56struct kvm_io_bus {
57 int dev_count;
58#define NR_IOBUS_DEVS 6
59 struct kvm_io_device *devs[NR_IOBUS_DEVS];
60};
61
62void kvm_io_bus_init(struct kvm_io_bus *bus);
63void kvm_io_bus_destroy(struct kvm_io_bus *bus);
64struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
65void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
66 struct kvm_io_device *dev);
67
68struct kvm_vcpu {
69 struct kvm *kvm;
70 struct preempt_notifier preempt_notifier;
71 int vcpu_id;
72 struct mutex mutex;
73 int cpu;
74 struct kvm_run *run;
75 int guest_mode;
76 unsigned long requests;
77 struct kvm_guest_debug guest_debug;
78 int fpu_active;
79 int guest_fpu_loaded;
80 wait_queue_head_t wq;
81 int sigset_active;
82 sigset_t sigset;
83 struct kvm_vcpu_stat stat;
84
85#ifdef CONFIG_HAS_IOMEM
86 int mmio_needed;
87 int mmio_read_completed;
88 int mmio_is_write;
89 int mmio_size;
90 unsigned char mmio_data[8];
91 gpa_t mmio_phys_addr;
92#endif
93
94 struct kvm_vcpu_arch arch;
95};
96
97struct kvm_memory_slot {
98 gfn_t base_gfn;
99 unsigned long npages;
100 unsigned long flags;
101 unsigned long *rmap;
102 unsigned long *dirty_bitmap;
103 unsigned long userspace_addr;
104 int user_alloc;
105};
106
107struct kvm {
108 struct mutex lock; /* protects the vcpus array and APIC accesses */
109 spinlock_t mmu_lock;
110 struct mm_struct *mm; /* userspace tied to this vm */
111 int nmemslots;
112 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
113 KVM_PRIVATE_MEM_SLOTS];
114 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
115 struct list_head vm_list;
116 struct file *filp;
117 struct kvm_io_bus mmio_bus;
118 struct kvm_io_bus pio_bus;
119 struct kvm_vm_stat stat;
120 struct kvm_arch arch;
121};
122
123/* The guest did something we don't support. */
124#define pr_unimpl(vcpu, fmt, ...) \
125 do { \
126 if (printk_ratelimit()) \
127 printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
128 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
129 } while (0)
130
131#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
132#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
133
134int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
135void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
136
137void vcpu_load(struct kvm_vcpu *vcpu);
138void vcpu_put(struct kvm_vcpu *vcpu);
139
140void decache_vcpus_on_cpu(int cpu);
141
142
143int kvm_init(void *opaque, unsigned int vcpu_size,
144 struct module *module);
145void kvm_exit(void);
146
147#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
148#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
149static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
150struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
151
152extern struct page *bad_page;
153
154int is_error_page(struct page *page);
155int kvm_is_error_hva(unsigned long addr);
156int kvm_set_memory_region(struct kvm *kvm,
157 struct kvm_userspace_memory_region *mem,
158 int user_alloc);
159int __kvm_set_memory_region(struct kvm *kvm,
160 struct kvm_userspace_memory_region *mem,
161 int user_alloc);
162int kvm_arch_set_memory_region(struct kvm *kvm,
163 struct kvm_userspace_memory_region *mem,
164 struct kvm_memory_slot old,
165 int user_alloc);
166gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
167struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
168void kvm_release_page_clean(struct page *page);
169void kvm_release_page_dirty(struct page *page);
170int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
171 int len);
172int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
173 unsigned long len);
174int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
175int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
176 int offset, int len);
177int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
178 unsigned long len);
179int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
180int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
181struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
182int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
183void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
184
185void kvm_vcpu_block(struct kvm_vcpu *vcpu);
186void kvm_resched(struct kvm_vcpu *vcpu);
187void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
188void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
189void kvm_flush_remote_tlbs(struct kvm *kvm);
190
191long kvm_arch_dev_ioctl(struct file *filp,
192 unsigned int ioctl, unsigned long arg);
193long kvm_arch_vcpu_ioctl(struct file *filp,
194 unsigned int ioctl, unsigned long arg);
195void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
196void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
197
198int kvm_dev_ioctl_check_extension(long ext);
199
200int kvm_get_dirty_log(struct kvm *kvm,
201 struct kvm_dirty_log *log, int *is_dirty);
202int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
203 struct kvm_dirty_log *log);
204
205int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
206 struct
207 kvm_userspace_memory_region *mem,
208 int user_alloc);
209long kvm_arch_vm_ioctl(struct file *filp,
210 unsigned int ioctl, unsigned long arg);
211void kvm_arch_destroy_vm(struct kvm *kvm);
212
213int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
214int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
215
216int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
217 struct kvm_translation *tr);
218
219int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
220int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
221int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
222 struct kvm_sregs *sregs);
223int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
224 struct kvm_sregs *sregs);
225int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
226 struct kvm_debug_guest *dbg);
227int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
228
229int kvm_arch_init(void *opaque);
230void kvm_arch_exit(void);
231
232int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
233void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
234
235void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
236void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
237void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
238struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
239int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
240void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
241
242int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
243void kvm_arch_hardware_enable(void *garbage);
244void kvm_arch_hardware_disable(void *garbage);
245int kvm_arch_hardware_setup(void);
246void kvm_arch_hardware_unsetup(void);
247void kvm_arch_check_processor_compat(void *rtn);
248int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
249
250void kvm_free_physmem(struct kvm *kvm);
251
252struct kvm *kvm_arch_create_vm(void);
253void kvm_arch_destroy_vm(struct kvm *kvm);
254
255int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
256int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
257void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
258
259static inline void kvm_guest_enter(void)
260{
261 account_system_vtime(current);
262 current->flags |= PF_VCPU;
263}
264
265static inline void kvm_guest_exit(void)
266{
267 account_system_vtime(current);
268 current->flags &= ~PF_VCPU;
269}
270
271static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
272{
273 return slot - kvm->memslots;
274}
275
276static inline gpa_t gfn_to_gpa(gfn_t gfn)
277{
278 return (gpa_t)gfn << PAGE_SHIFT;
279}
280
281static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
282{
283 set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
284}
285
286enum kvm_stat_kind {
287 KVM_STAT_VM,
288 KVM_STAT_VCPU,
289};
290
291struct kvm_stats_debugfs_item {
292 const char *name;
293 int offset;
294 enum kvm_stat_kind kind;
295 struct dentry *dentry;
296};
297extern struct kvm_stats_debugfs_item debugfs_entries[];
298
299#endif
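
Arch code sits on top of the guest-memory accessors declared here. A small hedged sketch of reading a guest-provided structure by guest physical address and writing it back; the descriptor layout and the gpa are hypothetical (illustration only):

#include <linux/kvm_host.h>

/* A hypothetical guest/host shared descriptor, for illustration only. */
struct example_shared {
        u32 version;
        u32 flags;
};

/* Copy the descriptor out of guest memory, set a flag, and copy it back. */
static int example_touch_descriptor(struct kvm *kvm, gpa_t gpa)
{
        struct example_shared desc;
        int ret;

        ret = kvm_read_guest(kvm, gpa, &desc, sizeof(desc));
        if (ret < 0)
                return ret;

        desc.flags |= 1;

        return kvm_write_guest(kvm, gpa, &desc, sizeof(desc));
}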
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 3b292565a693..5497aac0d2f8 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -2,72 +2,30 @@
2#define __LINUX_KVM_PARA_H 2#define __LINUX_KVM_PARA_H
3 3
4/* 4/*
5 * Guest OS interface for KVM paravirtualization 5 * This header file provides a method for making a hypercall to the host
6 * 6 * Architectures should define:
7 * Note: this interface is totally experimental, and is certain to change 7 * - kvm_hypercall0, kvm_hypercall1...
8 * as we make progress. 8 * - kvm_arch_para_features
9 * - kvm_para_available
9 */ 10 */
10 11
11/* 12/* Return values for hypercalls */
12 * Per-VCPU descriptor area shared between guest and host. Writable to 13#define KVM_ENOSYS 1000
13 * both guest and host. Registered with the host by the guest when
14 * a guest acknowledges paravirtual mode.
15 *
16 * NOTE: all addresses are guest-physical addresses (gpa), to make it
17 * easier for the hypervisor to map between the various addresses.
18 */
19struct kvm_vcpu_para_state {
20 /*
21 * API version information for compatibility. If there's any support
22 * mismatch (too old host trying to execute too new guest) then
23 * the host will deny entry into paravirtual mode. Any other
24 * combination (new host + old guest and new host + new guest)
25 * is supposed to work - new host versions will support all old
26 * guest API versions.
27 */
28 u32 guest_version;
29 u32 host_version;
30 u32 size;
31 u32 ret;
32
33 /*
34 * The address of the vm exit instruction (VMCALL or VMMCALL),
35 * which the host will patch according to the CPU model the
36 * VM runs on:
37 */
38 u64 hypercall_gpa;
39
40} __attribute__ ((aligned(PAGE_SIZE)));
41
42#define KVM_PARA_API_VERSION 1
43
44/*
45 * This is used for an RDMSR's ECX parameter to probe for a KVM host.
46 * Hopefully no CPU vendor will use up this number. This is placed well
47 * out of way of the typical space occupied by CPU vendors' MSR indices,
48 * and we think (or at least hope) it wont be occupied in the future
49 * either.
50 */
51#define MSR_KVM_API_MAGIC 0x87655678
52 14
53#define KVM_EINVAL 1 15#define KVM_HC_VAPIC_POLL_IRQ 1
54 16
55/* 17/*
56 * Hypercall calling convention: 18 * hypercalls use architecture specific
57 *
58 * Each hypercall may have 0-6 parameters.
59 *
60 * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1
61 *
62 * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention
63 * order: RDI, RSI, RDX, RCX, R8, R9.
64 *
65 * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP.
66 * (the first 3 are according to the gcc regparm calling convention)
67 *
68 * No registers are clobbered by the hypercall, except that the
69 * return value is in RAX.
70 */ 19 */
71#define __NR_hypercalls 0 20#include <asm/kvm_para.h>
21
22#ifdef __KERNEL__
23static inline int kvm_para_has_feature(unsigned int feature)
24{
25 if (kvm_arch_para_features() & (1UL << feature))
26 return 1;
27 return 0;
28}
29#endif /* __KERNEL__ */
30#endif /* __LINUX_KVM_PARA_H */
72 31
73#endif
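
With the architecture hooks in place, guest code is expected to gate each paravirtualization on kvm_para_has_feature() before issuing the matching hypercall. A hedged sketch using the VAPIC polling call defined above; the feature bit number is hypothetical, since no KVM_FEATURE_* constants exist yet (illustration only):

#include <linux/kernel.h>
#include <linux/kvm_para.h>

/* Hypothetical feature bit index, for illustration only. */
#define KVM_FEATURE_EXAMPLE 0

static void example_poll_vapic(void)
{
        long ret;

        if (!kvm_para_available() || !kvm_para_has_feature(KVM_FEATURE_EXAMPLE))
                return;

        /* KVM_HC_VAPIC_POLL_IRQ takes no arguments beyond the call number. */
        ret = kvm_hypercall0(KVM_HC_VAPIC_POLL_IRQ);
        if (ret == -KVM_ENOSYS)
                printk(KERN_INFO "example: host rejected the hypercall\n");
}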
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
new file mode 100644
index 000000000000..1c4e46decb22
--- /dev/null
+++ b/include/linux/kvm_types.h
@@ -0,0 +1,54 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 */
16
17#ifndef __KVM_TYPES_H__
18#define __KVM_TYPES_H__
19
20#include <asm/types.h>
21
22/*
23 * Address types:
24 *
25 * gva - guest virtual address
26 * gpa - guest physical address
27 * gfn - guest frame number
28 * hva - host virtual address
29 * hpa - host physical address
30 * hfn - host frame number
31 */
32
33typedef unsigned long gva_t;
34typedef u64 gpa_t;
35typedef unsigned long gfn_t;
36
37typedef unsigned long hva_t;
38typedef u64 hpa_t;
39typedef unsigned long hfn_t;
40
41struct kvm_pio_request {
42 unsigned long count;
43 int cur_count;
44 struct page *guest_pages[2];
45 unsigned guest_page_offset;
46 int in;
47 int port;
48 int size;
49 int string;
50 int down;
51 int rep;
52};
53
54#endif /* __KVM_TYPES_H__ */
diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 6080f73fc85f..8c2cc4c02526 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -120,16 +120,35 @@ void selinux_get_task_sid(struct task_struct *tsk, u32 *sid);
120int selinux_string_to_sid(char *str, u32 *sid); 120int selinux_string_to_sid(char *str, u32 *sid);
121 121
122/** 122/**
123 * selinux_relabel_packet_permission - check permission to relabel a packet 123 * selinux_secmark_relabel_packet_permission - secmark permission check
124 * @sid: ID value to be applied to network packet (via SECMARK, most likely) 124 * @sid: SECMARK ID value to be applied to network packet
125 * 125 *
126 * Returns 0 if the current task is allowed to label packets with the 126 * Returns 0 if the current task is allowed to set the SECMARK label of
127 * supplied security ID. Note that it is implicit that the packet is always 127 * packets with the supplied security ID. Note that it is implicit that
128 * being relabeled from the default unlabled value, and that the access 128 * the packet is always being relabeled from the default unlabeled value,
129 * control decision is made in the AVC. 129 * and that the access control decision is made in the AVC.
130 */ 130 */
131int selinux_relabel_packet_permission(u32 sid); 131int selinux_secmark_relabel_packet_permission(u32 sid);
132 132
133/**
134 * selinux_secmark_refcount_inc - increments the secmark use counter
135 *
136 * SELinux keeps track of the current SECMARK targets in use so it knows
137 * when to apply SECMARK label access checks to network packets. This
138 * function increments this reference count to indicate that a new SECMARK
139 * target has been configured.
140 */
141void selinux_secmark_refcount_inc(void);
142
143/**
144 * selinux_secmark_refcount_dec - decrements the secmark use counter
145 *
146 * SELinux keeps track of the current SECMARK targets in use so it knows
147 * when to apply SECMARK label access checks to network packets. This
148 * function decrements this reference count to indicate that one of the
149 * existing SECMARK targets has been removed/flushed.
150 */
151void selinux_secmark_refcount_dec(void);
133#else 152#else
134 153
135static inline int selinux_audit_rule_init(u32 field, u32 op, 154static inline int selinux_audit_rule_init(u32 field, u32 op,
@@ -184,11 +203,21 @@ static inline int selinux_string_to_sid(const char *str, u32 *sid)
184 return 0; 203 return 0;
185} 204}
186 205
187static inline int selinux_relabel_packet_permission(u32 sid) 206static inline int selinux_secmark_relabel_packet_permission(u32 sid)
188{ 207{
189 return 0; 208 return 0;
190} 209}
191 210
211static inline void selinux_secmark_refcount_inc(void)
212{
213 return;
214}
215
216static inline void selinux_secmark_refcount_dec(void)
217{
218 return;
219}
220
192#endif /* CONFIG_SECURITY_SELINUX */ 221#endif /* CONFIG_SECURITY_SELINUX */
193 222
194#endif /* _LINUX_SELINUX_H */ 223#endif /* _LINUX_SELINUX_H */
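
The new reference-count helpers are meant to be paired by the SECMARK target as rules come and go. A hedged sketch of that pairing; the function names are placeholders, not the actual xt_SECMARK callbacks (illustration only):

#include <linux/selinux.h>

/* Rule added: check the label is usable, then bump the active-target count. */
static int example_secmark_add_rule(u32 secid)
{
        int err;

        err = selinux_secmark_relabel_packet_permission(secid);
        if (err)
                return err;

        selinux_secmark_refcount_inc();
        return 0;
}

/* Rule removed or flushed: drop the active-target count again. */
static void example_secmark_del_rule(void)
{
        selinux_secmark_refcount_dec();
}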
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 2e5b2f6f9fa0..b3213c7c5309 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -67,7 +67,11 @@
67 * NetLabel NETLINK protocol 67 * NetLabel NETLINK protocol
68 */ 68 */
69 69
70#define NETLBL_PROTO_VERSION 1 70/* NetLabel NETLINK protocol version
71 * 1: initial version
72 * 2: added static labels for unlabeled connections
73 */
74#define NETLBL_PROTO_VERSION 2
71 75
72/* NetLabel NETLINK types/families */ 76/* NetLabel NETLINK types/families */
73#define NETLBL_NLTYPE_NONE 0 77#define NETLBL_NLTYPE_NONE 0
@@ -105,17 +109,49 @@ struct netlbl_dom_map;
105/* Domain mapping operations */ 109/* Domain mapping operations */
106int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); 110int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
107 111
108/* LSM security attributes */ 112/*
113 * LSM security attributes
114 */
115
116/**
117 * struct netlbl_lsm_cache - NetLabel LSM security attribute cache
118 * @refcount: atomic reference counter
119 * @free: LSM supplied function to free the cache data
120 * @data: LSM supplied cache data
121 *
122 * Description:
123 * This structure is provided for LSMs which wish to make use of the NetLabel
124 * caching mechanism to store LSM specific data/attributes in the NetLabel
125 * cache. If the LSM has to perform a lot of translation from the NetLabel
126 * security attributes into its own internal representation then the cache
127 * mechanism can provide a way to eliminate some or all of that translation
128 * overhead on a cache hit.
129 *
130 */
109struct netlbl_lsm_cache { 131struct netlbl_lsm_cache {
110 atomic_t refcount; 132 atomic_t refcount;
111 void (*free) (const void *data); 133 void (*free) (const void *data);
112 void *data; 134 void *data;
113}; 135};
114/* The catmap bitmap field MUST be a power of two in length and large 136
137/**
138 * struct netlbl_lsm_secattr_catmap - NetLabel LSM secattr category bitmap
139 * @startbit: the value of the lowest order bit in the bitmap
140 * @bitmap: the category bitmap
141 * @next: pointer to the next bitmap "node" or NULL
142 *
143 * Description:
144 * This structure is used to represent category bitmaps. Due to the large
145 * number of categories supported by most labeling protocols it is not
146 * practical to transfer a full bitmap internally so NetLabel adopts a sparse
147 * bitmap structure modeled after SELinux's ebitmap structure.
148 * The catmap bitmap field MUST be a power of two in length and large
115 * enough to hold at least 240 bits. Special care (i.e. check the code!) 149 * enough to hold at least 240 bits. Special care (i.e. check the code!)
116 * should be used when changing these values as the LSM implementation 150 * should be used when changing these values as the LSM implementation
117 * probably has functions which rely on the sizes of these types to speed 151 * probably has functions which rely on the sizes of these types to speed
118 * processing. */ 152 * processing.
153 *
154 */
119#define NETLBL_CATMAP_MAPTYPE u64 155#define NETLBL_CATMAP_MAPTYPE u64
120#define NETLBL_CATMAP_MAPCNT 4 156#define NETLBL_CATMAP_MAPCNT 4
121#define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8) 157#define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8)
@@ -127,22 +163,48 @@ struct netlbl_lsm_secattr_catmap {
127 NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT]; 163 NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT];
128 struct netlbl_lsm_secattr_catmap *next; 164 struct netlbl_lsm_secattr_catmap *next;
129}; 165};
166
167/**
168 * struct netlbl_lsm_secattr - NetLabel LSM security attributes
169 * @flags: indicate which attributes are contained in this structure
170 * @type: indicate the NLTYPE of the attributes
171 * @domain: the NetLabel LSM domain
172 * @cache: NetLabel LSM specific cache
173 * @attr.mls: MLS sensitivity label
174 * @attr.mls.cat: MLS category bitmap
175 * @attr.mls.lvl: MLS sensitivity level
176 * @attr.secid: LSM specific secid token
177 *
178 * Description:
179 * This structure is used to pass security attributes between NetLabel and the
180 * LSM modules. The flags field is used to specify which fields within the
181 * struct are valid and valid values can be created by bitwise OR'ing the
182 * NETLBL_SECATTR_* defines. The domain field is typically set by the LSM to
183 * specify domain specific configuration settings and is not usually used by
184 * NetLabel itself when returning security attributes to the LSM.
185 *
186 */
130#define NETLBL_SECATTR_NONE 0x00000000 187#define NETLBL_SECATTR_NONE 0x00000000
131#define NETLBL_SECATTR_DOMAIN 0x00000001 188#define NETLBL_SECATTR_DOMAIN 0x00000001
132#define NETLBL_SECATTR_CACHE 0x00000002 189#define NETLBL_SECATTR_CACHE 0x00000002
133#define NETLBL_SECATTR_MLS_LVL 0x00000004 190#define NETLBL_SECATTR_MLS_LVL 0x00000004
134#define NETLBL_SECATTR_MLS_CAT 0x00000008 191#define NETLBL_SECATTR_MLS_CAT 0x00000008
192#define NETLBL_SECATTR_SECID 0x00000010
135#define NETLBL_SECATTR_CACHEABLE (NETLBL_SECATTR_MLS_LVL | \ 193#define NETLBL_SECATTR_CACHEABLE (NETLBL_SECATTR_MLS_LVL | \
136 NETLBL_SECATTR_MLS_CAT) 194 NETLBL_SECATTR_MLS_CAT | \
195 NETLBL_SECATTR_SECID)
137struct netlbl_lsm_secattr { 196struct netlbl_lsm_secattr {
138 u32 flags; 197 u32 flags;
139 198 u32 type;
140 char *domain; 199 char *domain;
141
142 u32 mls_lvl;
143 struct netlbl_lsm_secattr_catmap *mls_cat;
144
145 struct netlbl_lsm_cache *cache; 200 struct netlbl_lsm_cache *cache;
201 union {
202 struct {
203 struct netlbl_lsm_secattr_catmap *cat;
204 u32 lvl;
205 } mls;
206 u32 secid;
207 } attr;
146}; 208};
147 209
148/* 210/*
@@ -231,10 +293,7 @@ static inline void netlbl_secattr_catmap_free(
231 */ 293 */
232static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr) 294static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr)
233{ 295{
234 secattr->flags = 0; 296 memset(secattr, 0, sizeof(*secattr));
235 secattr->domain = NULL;
236 secattr->mls_cat = NULL;
237 secattr->cache = NULL;
238} 297}
239 298
240/** 299/**
@@ -248,11 +307,11 @@ static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr)
248 */ 307 */
249static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr) 308static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr)
250{ 309{
251 if (secattr->cache)
252 netlbl_secattr_cache_free(secattr->cache);
253 kfree(secattr->domain); 310 kfree(secattr->domain);
254 if (secattr->mls_cat) 311 if (secattr->flags & NETLBL_SECATTR_CACHE)
255 netlbl_secattr_catmap_free(secattr->mls_cat); 312 netlbl_secattr_cache_free(secattr->cache);
313 if (secattr->flags & NETLBL_SECATTR_MLS_CAT)
314 netlbl_secattr_catmap_free(secattr->attr.mls.cat);
256} 315}
257 316
258/** 317/**
@@ -300,7 +359,7 @@ int netlbl_secattr_catmap_setrng(struct netlbl_lsm_secattr_catmap *catmap,
300 gfp_t flags); 359 gfp_t flags);
301 360
302/* 361/*
303 * LSM protocol operations 362 * LSM protocol operations (NetLabel LSM/kernel API)
304 */ 363 */
305int netlbl_enabled(void); 364int netlbl_enabled(void);
306int netlbl_sock_setattr(struct sock *sk, 365int netlbl_sock_setattr(struct sock *sk,
@@ -308,6 +367,7 @@ int netlbl_sock_setattr(struct sock *sk,
308int netlbl_sock_getattr(struct sock *sk, 367int netlbl_sock_getattr(struct sock *sk,
309 struct netlbl_lsm_secattr *secattr); 368 struct netlbl_lsm_secattr *secattr);
310int netlbl_skbuff_getattr(const struct sk_buff *skb, 369int netlbl_skbuff_getattr(const struct sk_buff *skb,
370 u16 family,
311 struct netlbl_lsm_secattr *secattr); 371 struct netlbl_lsm_secattr *secattr);
312void netlbl_skbuff_err(struct sk_buff *skb, int error); 372void netlbl_skbuff_err(struct sk_buff *skb, int error);
313 373
@@ -360,6 +420,7 @@ static inline int netlbl_sock_getattr(struct sock *sk,
360 return -ENOSYS; 420 return -ENOSYS;
361} 421}
362static inline int netlbl_skbuff_getattr(const struct sk_buff *skb, 422static inline int netlbl_skbuff_getattr(const struct sk_buff *skb,
423 u16 family,
363 struct netlbl_lsm_secattr *secattr) 424 struct netlbl_lsm_secattr *secattr)
364{ 425{
365 return -ENOSYS; 426 return -ENOSYS;
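
The move to the attr union and flag-driven cleanup changes how an LSM builds one of these structures. A hedged sketch of filling in an MLS level and one category bit under the new layout; the values are arbitrary, and netlbl_secattr_catmap_alloc() is the existing allocator from this header (illustration only):

#include <net/netlabel.h>
#include <linux/errno.h>

/* Build a throwaway MLS secattr, then let the flag-aware destroy free it. */
static int example_build_secattr(void)
{
        struct netlbl_lsm_secattr secattr;
        int ret = -ENOMEM;

        netlbl_secattr_init(&secattr);          /* now just a memset() */

        secattr.attr.mls.lvl = 3;
        secattr.flags |= NETLBL_SECATTR_MLS_LVL;

        secattr.attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
        if (secattr.attr.mls.cat == NULL)
                goto out;
        secattr.flags |= NETLBL_SECATTR_MLS_CAT;

        ret = netlbl_secattr_catmap_setbit(secattr.attr.mls.cat, 5, GFP_ATOMIC);
out:
        /* netlbl_secattr_destroy() frees only what the flags say is present. */
        netlbl_secattr_destroy(&secattr);
        return ret;
}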
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 702fcfeb37f1..82251575a9b4 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -11,6 +11,25 @@
11#include <linux/types.h> 11#include <linux/types.h>
12 12
13/* 13/*
14 * The maximum number of SG segments that we will put inside a
15 * scatterlist (unless chaining is used). Should ideally fit inside a
16 * single page, to avoid a higher order allocation. We could define this
17 * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The
18 * minimum value is 32
19 */
20#define SCSI_MAX_SG_SEGMENTS 128
21
22/*
23 * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
23 * is totally arbitrary; a setting of 2048 will get you at least 8MB I/Os.
25 */
26#ifdef ARCH_HAS_SG_CHAIN
27#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048
28#else
29#define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS
30#endif
31
32/*
14 * SCSI command lengths 33 * SCSI command lengths
15 */ 34 */
16 35
@@ -83,6 +102,7 @@ extern const unsigned char scsi_command_size[8];
83#define READ_TOC 0x43 102#define READ_TOC 0x43
84#define LOG_SELECT 0x4c 103#define LOG_SELECT 0x4c
85#define LOG_SENSE 0x4d 104#define LOG_SENSE 0x4d
105#define XDWRITEREAD_10 0x53
86#define MODE_SELECT_10 0x55 106#define MODE_SELECT_10 0x55
87#define RESERVE_10 0x56 107#define RESERVE_10 0x56
88#define RELEASE_10 0x57 108#define RELEASE_10 0x57
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index a457fca66f61..de28aab820b0 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -2,15 +2,20 @@
2#define _SCSI_SCSI_CMND_H 2#define _SCSI_SCSI_CMND_H
3 3
4#include <linux/dma-mapping.h> 4#include <linux/dma-mapping.h>
5#include <linux/blkdev.h>
5#include <linux/list.h> 6#include <linux/list.h>
6#include <linux/types.h> 7#include <linux/types.h>
7#include <linux/timer.h> 8#include <linux/timer.h>
8#include <linux/scatterlist.h> 9#include <linux/scatterlist.h>
9 10
10struct request;
11struct Scsi_Host; 11struct Scsi_Host;
12struct scsi_device; 12struct scsi_device;
13 13
14struct scsi_data_buffer {
15 struct sg_table table;
16 unsigned length;
17 int resid;
18};
14 19
15/* embedded in scsi_cmnd */ 20/* embedded in scsi_cmnd */
16struct scsi_pointer { 21struct scsi_pointer {
@@ -61,15 +66,11 @@ struct scsi_cmnd {
61 /* These elements define the operation we are about to perform */ 66 /* These elements define the operation we are about to perform */
62#define MAX_COMMAND_SIZE 16 67#define MAX_COMMAND_SIZE 16
63 unsigned char cmnd[MAX_COMMAND_SIZE]; 68 unsigned char cmnd[MAX_COMMAND_SIZE];
64 unsigned request_bufflen; /* Actual request size */
65 69
66 struct timer_list eh_timeout; /* Used to time out the command. */ 70 struct timer_list eh_timeout; /* Used to time out the command. */
67 void *request_buffer; /* Actual requested buffer */
68 71
69 /* These elements define the operation we ultimately want to perform */ 72 /* These elements define the operation we ultimately want to perform */
70 struct sg_table sg_table; 73 struct scsi_data_buffer sdb;
71 unsigned short use_sg; /* Number of pieces of scatter-gather */
72
73 unsigned underflow; /* Return error if less than 74 unsigned underflow; /* Return error if less than
74 this amount is transferred */ 75 this amount is transferred */
75 76
@@ -79,10 +80,6 @@ struct scsi_cmnd {
79 reconnects. Probably == sector 80 reconnects. Probably == sector
80 size */ 81 size */
81 82
82 int resid; /* Number of bytes requested to be
83 transferred less actual number
84 transferred (0 if not supported) */
85
86 struct request *request; /* The command we are 83 struct request *request; /* The command we are
87 working on */ 84 working on */
88 85
@@ -127,27 +124,55 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count,
127 size_t *offset, size_t *len); 124 size_t *offset, size_t *len);
128extern void scsi_kunmap_atomic_sg(void *virt); 125extern void scsi_kunmap_atomic_sg(void *virt);
129 126
130extern int scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t); 127extern int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask);
131extern void scsi_free_sgtable(struct scsi_cmnd *); 128extern void scsi_release_buffers(struct scsi_cmnd *cmd);
132 129
133extern int scsi_dma_map(struct scsi_cmnd *cmd); 130extern int scsi_dma_map(struct scsi_cmnd *cmd);
134extern void scsi_dma_unmap(struct scsi_cmnd *cmd); 131extern void scsi_dma_unmap(struct scsi_cmnd *cmd);
135 132
136#define scsi_sg_count(cmd) ((cmd)->use_sg) 133static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd)
137#define scsi_sglist(cmd) ((cmd)->sg_table.sgl) 134{
138#define scsi_bufflen(cmd) ((cmd)->request_bufflen) 135 return cmd->sdb.table.nents;
136}
137
138static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd)
139{
140 return cmd->sdb.table.sgl;
141}
142
143static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd)
144{
145 return cmd->sdb.length;
146}
139 147
140static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid) 148static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid)
141{ 149{
142 cmd->resid = resid; 150 cmd->sdb.resid = resid;
143} 151}
144 152
145static inline int scsi_get_resid(struct scsi_cmnd *cmd) 153static inline int scsi_get_resid(struct scsi_cmnd *cmd)
146{ 154{
147 return cmd->resid; 155 return cmd->sdb.resid;
148} 156}
149 157
150#define scsi_for_each_sg(cmd, sg, nseg, __i) \ 158#define scsi_for_each_sg(cmd, sg, nseg, __i) \
151 for_each_sg(scsi_sglist(cmd), sg, nseg, __i) 159 for_each_sg(scsi_sglist(cmd), sg, nseg, __i)
152 160
161static inline int scsi_bidi_cmnd(struct scsi_cmnd *cmd)
162{
163 return blk_bidi_rq(cmd->request) &&
164 (cmd->request->next_rq->special != NULL);
165}
166
167static inline struct scsi_data_buffer *scsi_in(struct scsi_cmnd *cmd)
168{
169 return scsi_bidi_cmnd(cmd) ?
170 cmd->request->next_rq->special : &cmd->sdb;
171}
172
173static inline struct scsi_data_buffer *scsi_out(struct scsi_cmnd *cmd)
174{
175 return &cmd->sdb;
176}
177
153#endif /* _SCSI_SCSI_CMND_H */ 178#endif /* _SCSI_SCSI_CMND_H */
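
With request_buffer, use_sg and resid gone from struct scsi_cmnd, drivers reach the data exclusively through the accessors above. A hedged sketch of a low-level driver walking the scatterlist after DMA mapping; program_descriptor() is a stand-in for real controller setup, not a kernel API (illustration only):

#include <scsi/scsi_cmnd.h>
#include <linux/scatterlist.h>

/* Placeholder for programming one hardware SG descriptor (illustration). */
static void program_descriptor(int idx, dma_addr_t addr, unsigned int len)
{
}

static int example_queue_data(struct scsi_cmnd *cmd)
{
        struct scatterlist *sg;
        int i, nseg;

        nseg = scsi_dma_map(cmd);       /* maps scsi_sglist(cmd) for DMA */
        if (nseg < 0)
                return nseg;

        scsi_for_each_sg(cmd, sg, nseg, i)
                program_descriptor(i, sg_dma_address(sg), sg_dma_len(sg));

        /* All of scsi_bufflen(cmd) will be transferred, so no residue. */
        scsi_set_resid(cmd, 0);
        return 0;
}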
diff --git a/include/scsi/scsi_eh.h b/include/scsi/scsi_eh.h
index d21b8913ceb3..25071d5d9bf8 100644
--- a/include/scsi/scsi_eh.h
+++ b/include/scsi/scsi_eh.h
@@ -68,16 +68,15 @@ extern int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len,
68extern int scsi_reset_provider(struct scsi_device *, int); 68extern int scsi_reset_provider(struct scsi_device *, int);
69 69
70struct scsi_eh_save { 70struct scsi_eh_save {
71 /* saved state */
71 int result; 72 int result;
72 enum dma_data_direction data_direction; 73 enum dma_data_direction data_direction;
73 unsigned char cmd_len; 74 unsigned char cmd_len;
74 unsigned char cmnd[MAX_COMMAND_SIZE]; 75 unsigned char cmnd[MAX_COMMAND_SIZE];
76 struct scsi_data_buffer sdb;
77 struct request *next_rq;
75 78
76 void *buffer; 79 /* new command support */
77 unsigned bufflen;
78 unsigned short use_sg;
79 int resid;
80
81 struct scatterlist sense_sgl; 80 struct scatterlist sense_sgl;
82}; 81};
83 82
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 0fd4746ee39d..5c58d594126a 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -39,9 +39,6 @@ struct blk_queue_tags;
39#define DISABLE_CLUSTERING 0 39#define DISABLE_CLUSTERING 0
40#define ENABLE_CLUSTERING 1 40#define ENABLE_CLUSTERING 1
41 41
42#define DISABLE_SG_CHAINING 0
43#define ENABLE_SG_CHAINING 1
44
45enum scsi_eh_timer_return { 42enum scsi_eh_timer_return {
46 EH_NOT_HANDLED, 43 EH_NOT_HANDLED,
47 EH_HANDLED, 44 EH_HANDLED,
@@ -136,9 +133,9 @@ struct scsi_host_template {
136 * the done callback is invoked. 133 * the done callback is invoked.
137 * 134 *
138 * This is called to inform the LLD to transfer 135 * This is called to inform the LLD to transfer
139 * cmd->request_bufflen bytes. The cmd->use_sg specifies the 136 * scsi_bufflen(cmd) bytes. scsi_sg_count(cmd) specifies the
140 * number of scatterlist entries in the command and 137 * number of scatterlist entries in the command and
141 * cmd->request_buffer contains the scatterlist. 138 * scsi_sglist(cmd) returns the scatterlist.
142 * 139 *
143 * return values: see queuecommand 140 * return values: see queuecommand
144 * 141 *
@@ -446,15 +443,6 @@ struct scsi_host_template {
446 unsigned ordered_tag:1; 443 unsigned ordered_tag:1;
447 444
448 /* 445 /*
449 * true if the low-level driver can support sg chaining. this
450 * will be removed eventually when all the drivers are
451 * converted to support sg chaining.
452 *
453 * Status: OBSOLETE
454 */
455 unsigned use_sg_chaining:1;
456
457 /*
458 * Countdown for host blocking with no commands outstanding 446 * Countdown for host blocking with no commands outstanding
459 */ 447 */
460 unsigned int max_host_blocked; 448 unsigned int max_host_blocked;
@@ -598,7 +586,6 @@ struct Scsi_Host {
598 unsigned unchecked_isa_dma:1; 586 unsigned unchecked_isa_dma:1;
599 unsigned use_clustering:1; 587 unsigned use_clustering:1;
600 unsigned use_blk_tcq:1; 588 unsigned use_blk_tcq:1;
601 unsigned use_sg_chaining:1;
602 589
603 /* 590 /*
604 * Host has requested that no further requests come through for the 591 * Host has requested that no further requests come through for the
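A sketch, assuming a hypothetical driver, of the accessor-based form the updated queuecommand comment now describes; example_program_segment() stands in for hardware-specific setup.

static int example_queuecommand(struct scsi_cmnd *cmd,
				void (*done)(struct scsi_cmnd *))
{
	struct scatterlist *sg;
	int i;

	/* scsi_bufflen()/scsi_sg_count()/scsi_sglist() replace the removed
	 * request_bufflen/use_sg/request_buffer fields */
	pr_debug("xfer %u bytes in %d segments\n",
		 scsi_bufflen(cmd), scsi_sg_count(cmd));

	scsi_for_each_sg(cmd, sg, scsi_sg_count(cmd), i)
		example_program_segment(i, sg);	/* hypothetical */

	cmd->scsi_done = done;
	return 0;
}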
diff --git a/kernel/fork.c b/kernel/fork.c
index 314f5101d2b0..05e0b6f4365b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -393,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm)
393 destroy_context(mm); 393 destroy_context(mm);
394 free_mm(mm); 394 free_mm(mm);
395} 395}
396EXPORT_SYMBOL_GPL(__mmdrop);
396 397
397/* 398/*
398 * Decrement the use count and release all resources for an mm. 399 * Decrement the use count and release all resources for an mm.
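The export above is needed because mmdrop() is a static inline in <linux/sched.h> that falls through to __mmdrop() when the last mm_count reference is dropped, so modules that pin an mm must be able to reach the symbol. A hedged sketch of the pairing; example_inspect_mm() is hypothetical.

#include <linux/sched.h>

static void example_peek_at_mm(struct mm_struct *mm)
{
	atomic_inc(&mm->mm_count);	/* pin the mm_struct itself */
	example_inspect_mm(mm);		/* hypothetical */
	mmdrop(mm);			/* final reference ends up in __mmdrop() */
}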
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index d4dc4eb48d95..a2241060113b 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -348,6 +348,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
348 atomic_inc(&entry->lsm_data->refcount); 348 atomic_inc(&entry->lsm_data->refcount);
349 secattr->cache = entry->lsm_data; 349 secattr->cache = entry->lsm_data;
350 secattr->flags |= NETLBL_SECATTR_CACHE; 350 secattr->flags |= NETLBL_SECATTR_CACHE;
351 secattr->type = NETLBL_NLTYPE_CIPSOV4;
351 if (prev_entry == NULL) { 352 if (prev_entry == NULL) {
352 spin_unlock_bh(&cipso_v4_cache[bkt].lock); 353 spin_unlock_bh(&cipso_v4_cache[bkt].lock);
353 return 0; 354 return 0;
@@ -865,7 +866,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
865 } 866 }
866 867
867 for (;;) { 868 for (;;) {
868 host_spot = netlbl_secattr_catmap_walk(secattr->mls_cat, 869 host_spot = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
869 host_spot + 1); 870 host_spot + 1);
870 if (host_spot < 0) 871 if (host_spot < 0)
871 break; 872 break;
@@ -948,7 +949,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
948 return -EPERM; 949 return -EPERM;
949 break; 950 break;
950 } 951 }
951 ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat, 952 ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
952 host_spot, 953 host_spot,
953 GFP_ATOMIC); 954 GFP_ATOMIC);
954 if (ret_val != 0) 955 if (ret_val != 0)
@@ -1014,7 +1015,8 @@ static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def,
1014 u32 cat_iter = 0; 1015 u32 cat_iter = 0;
1015 1016
1016 for (;;) { 1017 for (;;) {
1017 cat = netlbl_secattr_catmap_walk(secattr->mls_cat, cat + 1); 1018 cat = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
1019 cat + 1);
1018 if (cat < 0) 1020 if (cat < 0)
1019 break; 1021 break;
1020 if ((cat_iter + 2) > net_cat_len) 1022 if ((cat_iter + 2) > net_cat_len)
@@ -1049,7 +1051,7 @@ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def,
1049 u32 iter; 1051 u32 iter;
1050 1052
1051 for (iter = 0; iter < net_cat_len; iter += 2) { 1053 for (iter = 0; iter < net_cat_len; iter += 2) {
1052 ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat, 1054 ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
1053 ntohs(get_unaligned((__be16 *)&net_cat[iter])), 1055 ntohs(get_unaligned((__be16 *)&net_cat[iter])),
1054 GFP_ATOMIC); 1056 GFP_ATOMIC);
1055 if (ret_val != 0) 1057 if (ret_val != 0)
@@ -1130,7 +1132,8 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
1130 return -ENOSPC; 1132 return -ENOSPC;
1131 1133
1132 for (;;) { 1134 for (;;) {
1133 iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1); 1135 iter = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
1136 iter + 1);
1134 if (iter < 0) 1137 if (iter < 0)
1135 break; 1138 break;
1136 cat_size += (iter == 0 ? 0 : sizeof(u16)); 1139 cat_size += (iter == 0 ? 0 : sizeof(u16));
@@ -1138,7 +1141,8 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
1138 return -ENOSPC; 1141 return -ENOSPC;
1139 array[array_cnt++] = iter; 1142 array[array_cnt++] = iter;
1140 1143
1141 iter = netlbl_secattr_catmap_walk_rng(secattr->mls_cat, iter); 1144 iter = netlbl_secattr_catmap_walk_rng(secattr->attr.mls.cat,
1145 iter);
1142 if (iter < 0) 1146 if (iter < 0)
1143 return -EFAULT; 1147 return -EFAULT;
1144 cat_size += sizeof(u16); 1148 cat_size += sizeof(u16);
@@ -1191,7 +1195,7 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def,
1191 else 1195 else
1192 cat_low = 0; 1196 cat_low = 0;
1193 1197
1194 ret_val = netlbl_secattr_catmap_setrng(secattr->mls_cat, 1198 ret_val = netlbl_secattr_catmap_setrng(secattr->attr.mls.cat,
1195 cat_low, 1199 cat_low,
1196 cat_high, 1200 cat_high,
1197 GFP_ATOMIC); 1201 GFP_ATOMIC);
@@ -1251,7 +1255,9 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
1251 if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) 1255 if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
1252 return -EPERM; 1256 return -EPERM;
1253 1257
1254 ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); 1258 ret_val = cipso_v4_map_lvl_hton(doi_def,
1259 secattr->attr.mls.lvl,
1260 &level);
1255 if (ret_val != 0) 1261 if (ret_val != 0)
1256 return ret_val; 1262 return ret_val;
1257 1263
@@ -1303,12 +1309,13 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
1303 ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); 1309 ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
1304 if (ret_val != 0) 1310 if (ret_val != 0)
1305 return ret_val; 1311 return ret_val;
1306 secattr->mls_lvl = level; 1312 secattr->attr.mls.lvl = level;
1307 secattr->flags |= NETLBL_SECATTR_MLS_LVL; 1313 secattr->flags |= NETLBL_SECATTR_MLS_LVL;
1308 1314
1309 if (tag_len > 4) { 1315 if (tag_len > 4) {
1310 secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); 1316 secattr->attr.mls.cat =
1311 if (secattr->mls_cat == NULL) 1317 netlbl_secattr_catmap_alloc(GFP_ATOMIC);
1318 if (secattr->attr.mls.cat == NULL)
1312 return -ENOMEM; 1319 return -ENOMEM;
1313 1320
1314 ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def, 1321 ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def,
@@ -1316,7 +1323,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
1316 tag_len - 4, 1323 tag_len - 4,
1317 secattr); 1324 secattr);
1318 if (ret_val != 0) { 1325 if (ret_val != 0) {
1319 netlbl_secattr_catmap_free(secattr->mls_cat); 1326 netlbl_secattr_catmap_free(secattr->attr.mls.cat);
1320 return ret_val; 1327 return ret_val;
1321 } 1328 }
1322 1329
@@ -1350,7 +1357,9 @@ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def,
1350 if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) 1357 if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
1351 return -EPERM; 1358 return -EPERM;
1352 1359
1353 ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); 1360 ret_val = cipso_v4_map_lvl_hton(doi_def,
1361 secattr->attr.mls.lvl,
1362 &level);
1354 if (ret_val != 0) 1363 if (ret_val != 0)
1355 return ret_val; 1364 return ret_val;
1356 1365
@@ -1396,12 +1405,13 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
1396 ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); 1405 ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
1397 if (ret_val != 0) 1406 if (ret_val != 0)
1398 return ret_val; 1407 return ret_val;
1399 secattr->mls_lvl = level; 1408 secattr->attr.mls.lvl = level;
1400 secattr->flags |= NETLBL_SECATTR_MLS_LVL; 1409 secattr->flags |= NETLBL_SECATTR_MLS_LVL;
1401 1410
1402 if (tag_len > 4) { 1411 if (tag_len > 4) {
1403 secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); 1412 secattr->attr.mls.cat =
1404 if (secattr->mls_cat == NULL) 1413 netlbl_secattr_catmap_alloc(GFP_ATOMIC);
1414 if (secattr->attr.mls.cat == NULL)
1405 return -ENOMEM; 1415 return -ENOMEM;
1406 1416
1407 ret_val = cipso_v4_map_cat_enum_ntoh(doi_def, 1417 ret_val = cipso_v4_map_cat_enum_ntoh(doi_def,
@@ -1409,7 +1419,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
1409 tag_len - 4, 1419 tag_len - 4,
1410 secattr); 1420 secattr);
1411 if (ret_val != 0) { 1421 if (ret_val != 0) {
1412 netlbl_secattr_catmap_free(secattr->mls_cat); 1422 netlbl_secattr_catmap_free(secattr->attr.mls.cat);
1413 return ret_val; 1423 return ret_val;
1414 } 1424 }
1415 1425
@@ -1443,7 +1453,9 @@ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def,
1443 if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) 1453 if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
1444 return -EPERM; 1454 return -EPERM;
1445 1455
1446 ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); 1456 ret_val = cipso_v4_map_lvl_hton(doi_def,
1457 secattr->attr.mls.lvl,
1458 &level);
1447 if (ret_val != 0) 1459 if (ret_val != 0)
1448 return ret_val; 1460 return ret_val;
1449 1461
@@ -1488,12 +1500,13 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
1488 ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); 1500 ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
1489 if (ret_val != 0) 1501 if (ret_val != 0)
1490 return ret_val; 1502 return ret_val;
1491 secattr->mls_lvl = level; 1503 secattr->attr.mls.lvl = level;
1492 secattr->flags |= NETLBL_SECATTR_MLS_LVL; 1504 secattr->flags |= NETLBL_SECATTR_MLS_LVL;
1493 1505
1494 if (tag_len > 4) { 1506 if (tag_len > 4) {
1495 secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); 1507 secattr->attr.mls.cat =
1496 if (secattr->mls_cat == NULL) 1508 netlbl_secattr_catmap_alloc(GFP_ATOMIC);
1509 if (secattr->attr.mls.cat == NULL)
1497 return -ENOMEM; 1510 return -ENOMEM;
1498 1511
1499 ret_val = cipso_v4_map_cat_rng_ntoh(doi_def, 1512 ret_val = cipso_v4_map_cat_rng_ntoh(doi_def,
@@ -1501,7 +1514,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
1501 tag_len - 4, 1514 tag_len - 4,
1502 secattr); 1515 secattr);
1503 if (ret_val != 0) { 1516 if (ret_val != 0) {
1504 netlbl_secattr_catmap_free(secattr->mls_cat); 1517 netlbl_secattr_catmap_free(secattr->attr.mls.cat);
1505 return ret_val; 1518 return ret_val;
1506 } 1519 }
1507 1520
@@ -1850,6 +1863,8 @@ static int cipso_v4_getattr(const unsigned char *cipso,
1850 ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); 1863 ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr);
1851 break; 1864 break;
1852 } 1865 }
1866 if (ret_val == 0)
1867 secattr->type = NETLBL_NLTYPE_CIPSOV4;
1853 1868
1854getattr_return: 1869getattr_return:
1855 rcu_read_unlock(); 1870 rcu_read_unlock();
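A reconstruction (not shown in this hunk) of the struct netlbl_lsm_secattr shape these conversions imply: the flat mls_lvl/mls_cat fields move into an attr union selected by the new secattr->type value. Member order and any fields beyond those visible here are assumptions; include/net/netlabel.h in the same series is authoritative.

struct netlbl_lsm_secattr {
	u32 flags;			/* NETLBL_SECATTR_* validity bits */
	u32 type;			/* NETLBL_NLTYPE_*, e.g. CIPSOV4 */
	char *domain;
	struct netlbl_lsm_cache *cache;
	union {
		struct {
			struct netlbl_lsm_secattr_catmap *cat;	/* was mls_cat */
			u32 lvl;				/* was mls_lvl */
		} mls;
		u32 secid;
	} attr;
};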
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index b11b3ecbb39d..7708e2084ce2 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -72,12 +72,13 @@ static bool checkentry_selinux(struct xt_secmark_target_info *info)
72 return false; 72 return false;
73 } 73 }
74 74
75 err = selinux_relabel_packet_permission(sel->selsid); 75 err = selinux_secmark_relabel_packet_permission(sel->selsid);
76 if (err) { 76 if (err) {
77 printk(KERN_INFO PFX "unable to obtain relabeling permission\n"); 77 printk(KERN_INFO PFX "unable to obtain relabeling permission\n");
78 return false; 78 return false;
79 } 79 }
80 80
81 selinux_secmark_refcount_inc();
81 return true; 82 return true;
82} 83}
83 84
@@ -110,11 +111,20 @@ secmark_tg_check(const char *tablename, const void *entry,
110 return true; 111 return true;
111} 112}
112 113
114void secmark_tg_destroy(const struct xt_target *target, void *targinfo)
115{
116 switch (mode) {
117 case SECMARK_MODE_SEL:
118 selinux_secmark_refcount_dec();
119 }
120}
121
113static struct xt_target secmark_tg_reg[] __read_mostly = { 122static struct xt_target secmark_tg_reg[] __read_mostly = {
114 { 123 {
115 .name = "SECMARK", 124 .name = "SECMARK",
116 .family = AF_INET, 125 .family = AF_INET,
117 .checkentry = secmark_tg_check, 126 .checkentry = secmark_tg_check,
127 .destroy = secmark_tg_destroy,
118 .target = secmark_tg, 128 .target = secmark_tg,
119 .targetsize = sizeof(struct xt_secmark_target_info), 129 .targetsize = sizeof(struct xt_secmark_target_info),
120 .table = "mangle", 130 .table = "mangle",
@@ -124,6 +134,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = {
124 .name = "SECMARK", 134 .name = "SECMARK",
125 .family = AF_INET6, 135 .family = AF_INET6,
126 .checkentry = secmark_tg_check, 136 .checkentry = secmark_tg_check,
137 .destroy = secmark_tg_destroy,
127 .target = secmark_tg, 138 .target = secmark_tg,
128 .targetsize = sizeof(struct xt_secmark_target_info), 139 .targetsize = sizeof(struct xt_secmark_target_info),
129 .table = "mangle", 140 .table = "mangle",
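The refcount helpers called above are defined on the SELinux side, not in this file; the assumption, sketched below, is that they are thin atomic wrappers so SECMARK relabeling can be reported as enabled while at least one rule holds a reference.

static atomic_t selinux_secmark_refcount = ATOMIC_INIT(0);

void selinux_secmark_refcount_inc(void)
{
	atomic_inc(&selinux_secmark_refcount);
}

void selinux_secmark_refcount_dec(void)
{
	atomic_dec(&selinux_secmark_refcount);
}

bool selinux_secmark_enabled(void)	/* assumed query used by the SELinux hooks */
{
	return atomic_read(&selinux_secmark_refcount) != 0;
}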
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index ba0ca8d3f77d..becf91a952ae 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -38,6 +38,7 @@
38#include <net/genetlink.h> 38#include <net/genetlink.h>
39#include <net/netlabel.h> 39#include <net/netlabel.h>
40#include <net/cipso_ipv4.h> 40#include <net/cipso_ipv4.h>
41#include <asm/atomic.h>
41 42
42#include "netlabel_user.h" 43#include "netlabel_user.h"
43#include "netlabel_cipso_v4.h" 44#include "netlabel_cipso_v4.h"
@@ -421,7 +422,7 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info)
421 break; 422 break;
422 } 423 }
423 if (ret_val == 0) 424 if (ret_val == 0)
424 netlbl_mgmt_protocount_inc(); 425 atomic_inc(&netlabel_mgmt_protocount);
425 426
426 audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD, 427 audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD,
427 &audit_info); 428 &audit_info);
@@ -698,7 +699,7 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info)
698 &audit_info, 699 &audit_info,
699 netlbl_cipsov4_doi_free); 700 netlbl_cipsov4_doi_free);
700 if (ret_val == 0) 701 if (ret_val == 0)
701 netlbl_mgmt_protocount_dec(); 702 atomic_dec(&netlabel_mgmt_protocount);
702 703
703 audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_DEL, 704 audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_DEL,
704 &audit_info); 705 &audit_info);
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index b3675bd7db33..9a8ea0195c4f 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -54,9 +54,6 @@ struct netlbl_domhsh_tbl {
54 * hash table should be okay */ 54 * hash table should be okay */
55static DEFINE_SPINLOCK(netlbl_domhsh_lock); 55static DEFINE_SPINLOCK(netlbl_domhsh_lock);
56static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; 56static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL;
57
58/* Default domain mapping */
59static DEFINE_SPINLOCK(netlbl_domhsh_def_lock);
60static struct netlbl_dom_map *netlbl_domhsh_def = NULL; 57static struct netlbl_dom_map *netlbl_domhsh_def = NULL;
61 58
62/* 59/*
@@ -109,17 +106,14 @@ static u32 netlbl_domhsh_hash(const char *key)
109/** 106/**
110 * netlbl_domhsh_search - Search for a domain entry 107 * netlbl_domhsh_search - Search for a domain entry
111 * @domain: the domain 108 * @domain: the domain
112 * @def: return default if no match is found
113 * 109 *
114 * Description: 110 * Description:
115 * Searches the domain hash table and returns a pointer to the hash table 111 * Searches the domain hash table and returns a pointer to the hash table
116 * entry if found, otherwise NULL is returned. If @def is non-zero and a 112 * entry if found, otherwise NULL is returned. The caller is responsible for
117 * match is not found in the domain hash table the default mapping is returned 113 * the rcu hash table locks (i.e. the caller must call rcu_read_[un]lock()).
118 * if it exists. The caller is responsibile for the rcu hash table locks
119 * (i.e. the caller much call rcu_read_[un]lock()).
120 * 114 *
121 */ 115 */
122static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u32 def) 116static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
123{ 117{
124 u32 bkt; 118 u32 bkt;
125 struct netlbl_dom_map *iter; 119 struct netlbl_dom_map *iter;
@@ -133,10 +127,31 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u32 def)
133 return iter; 127 return iter;
134 } 128 }
135 129
136 if (def != 0) { 130 return NULL;
137 iter = rcu_dereference(netlbl_domhsh_def); 131}
138 if (iter != NULL && iter->valid) 132
139 return iter; 133/**
134 * netlbl_domhsh_search_def - Search for a domain entry
135 * @domain: the domain
136 * @def: return default if no match is found
137 *
138 * Description:
139 * Searches the domain hash table and returns a pointer to the hash table
140 * entry if an exact match is found, if an exact match is not present in the
141 * hash table then the default entry is returned if valid otherwise NULL is
142 * returned. The caller is responsible for the rcu hash table locks
143 * (i.e. the caller must call rcu_read_[un]lock()).
144 *
145 */
146static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain)
147{
148 struct netlbl_dom_map *entry;
149
150 entry = netlbl_domhsh_search(domain);
151 if (entry == NULL) {
152 entry = rcu_dereference(netlbl_domhsh_def);
153 if (entry != NULL && entry->valid)
154 return entry;
140 } 155 }
141 156
142 return NULL; 157 return NULL;
@@ -221,24 +236,22 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
221 INIT_RCU_HEAD(&entry->rcu); 236 INIT_RCU_HEAD(&entry->rcu);
222 237
223 rcu_read_lock(); 238 rcu_read_lock();
239 spin_lock(&netlbl_domhsh_lock);
224 if (entry->domain != NULL) { 240 if (entry->domain != NULL) {
225 bkt = netlbl_domhsh_hash(entry->domain); 241 bkt = netlbl_domhsh_hash(entry->domain);
226 spin_lock(&netlbl_domhsh_lock); 242 if (netlbl_domhsh_search(entry->domain) == NULL)
227 if (netlbl_domhsh_search(entry->domain, 0) == NULL)
228 list_add_tail_rcu(&entry->list, 243 list_add_tail_rcu(&entry->list,
229 &rcu_dereference(netlbl_domhsh)->tbl[bkt]); 244 &rcu_dereference(netlbl_domhsh)->tbl[bkt]);
230 else 245 else
231 ret_val = -EEXIST; 246 ret_val = -EEXIST;
232 spin_unlock(&netlbl_domhsh_lock);
233 } else { 247 } else {
234 INIT_LIST_HEAD(&entry->list); 248 INIT_LIST_HEAD(&entry->list);
235 spin_lock(&netlbl_domhsh_def_lock);
236 if (rcu_dereference(netlbl_domhsh_def) == NULL) 249 if (rcu_dereference(netlbl_domhsh_def) == NULL)
237 rcu_assign_pointer(netlbl_domhsh_def, entry); 250 rcu_assign_pointer(netlbl_domhsh_def, entry);
238 else 251 else
239 ret_val = -EEXIST; 252 ret_val = -EEXIST;
240 spin_unlock(&netlbl_domhsh_def_lock);
241 } 253 }
254 spin_unlock(&netlbl_domhsh_lock);
242 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); 255 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
243 if (audit_buf != NULL) { 256 if (audit_buf != NULL) {
244 audit_log_format(audit_buf, 257 audit_log_format(audit_buf,
@@ -307,7 +320,10 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
307 struct audit_buffer *audit_buf; 320 struct audit_buffer *audit_buf;
308 321
309 rcu_read_lock(); 322 rcu_read_lock();
310 entry = netlbl_domhsh_search(domain, (domain != NULL ? 0 : 1)); 323 if (domain)
324 entry = netlbl_domhsh_search(domain);
325 else
326 entry = netlbl_domhsh_search_def(domain);
311 if (entry == NULL) 327 if (entry == NULL)
312 goto remove_return; 328 goto remove_return;
313 switch (entry->type) { 329 switch (entry->type) {
@@ -316,23 +332,16 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
316 entry->domain); 332 entry->domain);
317 break; 333 break;
318 } 334 }
319 if (entry != rcu_dereference(netlbl_domhsh_def)) { 335 spin_lock(&netlbl_domhsh_lock);
320 spin_lock(&netlbl_domhsh_lock); 336 if (entry->valid) {
321 if (entry->valid) { 337 entry->valid = 0;
322 entry->valid = 0; 338 if (entry != rcu_dereference(netlbl_domhsh_def))
323 list_del_rcu(&entry->list); 339 list_del_rcu(&entry->list);
324 ret_val = 0; 340 else
325 }
326 spin_unlock(&netlbl_domhsh_lock);
327 } else {
328 spin_lock(&netlbl_domhsh_def_lock);
329 if (entry->valid) {
330 entry->valid = 0;
331 rcu_assign_pointer(netlbl_domhsh_def, NULL); 341 rcu_assign_pointer(netlbl_domhsh_def, NULL);
332 ret_val = 0; 342 ret_val = 0;
333 }
334 spin_unlock(&netlbl_domhsh_def_lock);
335 } 343 }
344 spin_unlock(&netlbl_domhsh_lock);
336 345
337 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); 346 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
338 if (audit_buf != NULL) { 347 if (audit_buf != NULL) {
@@ -377,7 +386,7 @@ int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info)
377 */ 386 */
378struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) 387struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain)
379{ 388{
380 return netlbl_domhsh_search(domain, 1); 389 return netlbl_domhsh_search_def(domain);
381} 390}
382 391
383/** 392/**
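A short usage sketch, with a hypothetical caller, of how the split helpers are meant to be used: exact-match lookups go through netlbl_domhsh_search(), while callers that want to fall back to the default mapping use netlbl_domhsh_getentry() (now a wrapper around netlbl_domhsh_search_def()); either way the result is only valid inside the caller's RCU read-side section.

static int example_domain_type(const char *domain)
{
	struct netlbl_dom_map *entry;
	int type = -ENOENT;

	rcu_read_lock();
	entry = netlbl_domhsh_getentry(domain);	/* exact match, else default */
	if (entry != NULL)
		type = entry->type;
	rcu_read_unlock();

	return type;
}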
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 4f50949722a9..c69e3e1f05c3 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -34,6 +34,7 @@
34#include <net/netlabel.h> 34#include <net/netlabel.h>
35#include <net/cipso_ipv4.h> 35#include <net/cipso_ipv4.h>
36#include <asm/bug.h> 36#include <asm/bug.h>
37#include <asm/atomic.h>
37 38
38#include "netlabel_domainhash.h" 39#include "netlabel_domainhash.h"
39#include "netlabel_unlabeled.h" 40#include "netlabel_unlabeled.h"
@@ -262,7 +263,7 @@ int netlbl_enabled(void)
262 /* At some point we probably want to expose this mechanism to the user 263 /* At some point we probably want to expose this mechanism to the user
263 * as well so that admins can toggle NetLabel regardless of the 264 * as well so that admins can toggle NetLabel regardless of the
264 * configuration */ 265 * configuration */
265 return (netlbl_mgmt_protocount_value() > 0 ? 1 : 0); 266 return (atomic_read(&netlabel_mgmt_protocount) > 0);
266} 267}
267 268
268/** 269/**
@@ -311,7 +312,7 @@ socket_setattr_return:
311 * @secattr: the security attributes 312 * @secattr: the security attributes
312 * 313 *
313 * Description: 314 * Description:
314 * Examines the given sock to see any NetLabel style labeling has been 315 * Examines the given sock to see if any NetLabel style labeling has been
315 * applied to the sock, if so it parses the socket label and returns the 316 * applied to the sock, if so it parses the socket label and returns the
316 * security attributes in @secattr. Returns zero on success, negative values 317 * security attributes in @secattr. Returns zero on success, negative values
317 * on failure. 318 * on failure.
@@ -319,18 +320,13 @@ socket_setattr_return:
319 */ 320 */
320int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) 321int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
321{ 322{
322 int ret_val; 323 return cipso_v4_sock_getattr(sk, secattr);
323
324 ret_val = cipso_v4_sock_getattr(sk, secattr);
325 if (ret_val == 0)
326 return 0;
327
328 return netlbl_unlabel_getattr(secattr);
329} 324}
330 325
331/** 326/**
332 * netlbl_skbuff_getattr - Determine the security attributes of a packet 327 * netlbl_skbuff_getattr - Determine the security attributes of a packet
333 * @skb: the packet 328 * @skb: the packet
329 * @family: protocol family
334 * @secattr: the security attributes 330 * @secattr: the security attributes
335 * 331 *
336 * Description: 332 * Description:
@@ -341,13 +337,14 @@ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
341 * 337 *
342 */ 338 */
343int netlbl_skbuff_getattr(const struct sk_buff *skb, 339int netlbl_skbuff_getattr(const struct sk_buff *skb,
340 u16 family,
344 struct netlbl_lsm_secattr *secattr) 341 struct netlbl_lsm_secattr *secattr)
345{ 342{
346 if (CIPSO_V4_OPTEXIST(skb) && 343 if (CIPSO_V4_OPTEXIST(skb) &&
347 cipso_v4_skbuff_getattr(skb, secattr) == 0) 344 cipso_v4_skbuff_getattr(skb, secattr) == 0)
348 return 0; 345 return 0;
349 346
350 return netlbl_unlabel_getattr(secattr); 347 return netlbl_unlabel_getattr(skb, family, secattr);
351} 348}
352 349
353/** 350/**
@@ -431,6 +428,10 @@ static int __init netlbl_init(void)
431 if (ret_val != 0) 428 if (ret_val != 0)
432 goto init_failure; 429 goto init_failure;
433 430
431 ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE);
432 if (ret_val != 0)
433 goto init_failure;
434
434 ret_val = netlbl_netlink_init(); 435 ret_val = netlbl_netlink_init();
435 if (ret_val != 0) 436 if (ret_val != 0)
436 goto init_failure; 437 goto init_failure;
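The new @family parameter as seen from a caller; the hook below is illustrative only (the real LSM call sites are converted elsewhere in this series) and example_secattr_to_sid() is hypothetical.

static int example_skb_peersid(struct sk_buff *skb, u16 family, u32 *sid)
{
	struct netlbl_lsm_secattr secattr;
	int rc;

	netlbl_secattr_init(&secattr);
	rc = netlbl_skbuff_getattr(skb, family, &secattr);
	if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
		rc = example_secattr_to_sid(&secattr, sid);	/* hypothetical */
	netlbl_secattr_destroy(&secattr);

	return rc;
}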
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index 9c41464d58d1..e2258dc3c845 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -37,14 +37,14 @@
37#include <net/genetlink.h> 37#include <net/genetlink.h>
38#include <net/netlabel.h> 38#include <net/netlabel.h>
39#include <net/cipso_ipv4.h> 39#include <net/cipso_ipv4.h>
40#include <asm/atomic.h>
40 41
41#include "netlabel_domainhash.h" 42#include "netlabel_domainhash.h"
42#include "netlabel_user.h" 43#include "netlabel_user.h"
43#include "netlabel_mgmt.h" 44#include "netlabel_mgmt.h"
44 45
45/* NetLabel configured protocol count */ 46/* NetLabel configured protocol counter */
46static DEFINE_SPINLOCK(netlabel_mgmt_protocount_lock); 47atomic_t netlabel_mgmt_protocount = ATOMIC_INIT(0);
47static u32 netlabel_mgmt_protocount = 0;
48 48
49/* Argument struct for netlbl_domhsh_walk() */ 49/* Argument struct for netlbl_domhsh_walk() */
50struct netlbl_domhsh_walk_arg { 50struct netlbl_domhsh_walk_arg {
@@ -71,63 +71,6 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = {
71}; 71};
72 72
73/* 73/*
74 * NetLabel Misc Management Functions
75 */
76
77/**
78 * netlbl_mgmt_protocount_inc - Increment the configured labeled protocol count
79 *
80 * Description:
81 * Increment the number of labeled protocol configurations in the current
82 * NetLabel configuration. Keep track of this for use in determining if
83 * NetLabel label enforcement should be active/enabled or not in the LSM.
84 *
85 */
86void netlbl_mgmt_protocount_inc(void)
87{
88 spin_lock(&netlabel_mgmt_protocount_lock);
89 netlabel_mgmt_protocount++;
90 spin_unlock(&netlabel_mgmt_protocount_lock);
91}
92
93/**
94 * netlbl_mgmt_protocount_dec - Decrement the configured labeled protocol count
95 *
96 * Description:
97 * Decrement the number of labeled protocol configurations in the current
98 * NetLabel configuration. Keep track of this for use in determining if
99 * NetLabel label enforcement should be active/enabled or not in the LSM.
100 *
101 */
102void netlbl_mgmt_protocount_dec(void)
103{
104 spin_lock(&netlabel_mgmt_protocount_lock);
105 if (netlabel_mgmt_protocount > 0)
106 netlabel_mgmt_protocount--;
107 spin_unlock(&netlabel_mgmt_protocount_lock);
108}
109
110/**
111 * netlbl_mgmt_protocount_value - Return the number of configured protocols
112 *
113 * Description:
114 * Return the number of labeled protocols in the current NetLabel
115 * configuration. This value is useful in determining if NetLabel label
116 * enforcement should be active/enabled or not in the LSM.
117 *
118 */
119u32 netlbl_mgmt_protocount_value(void)
120{
121 u32 val;
122
123 rcu_read_lock();
124 val = netlabel_mgmt_protocount;
125 rcu_read_unlock();
126
127 return val;
128}
129
130/*
131 * NetLabel Command Handlers 74 * NetLabel Command Handlers
132 */ 75 */
133 76
diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h
index ccb2b3923591..a43bff169d6b 100644
--- a/net/netlabel/netlabel_mgmt.h
+++ b/net/netlabel/netlabel_mgmt.h
@@ -32,6 +32,7 @@
32#define _NETLABEL_MGMT_H 32#define _NETLABEL_MGMT_H
33 33
34#include <net/netlabel.h> 34#include <net/netlabel.h>
35#include <asm/atomic.h>
35 36
36/* 37/*
37 * The following NetLabel payloads are supported by the management interface. 38 * The following NetLabel payloads are supported by the management interface.
@@ -168,9 +169,7 @@ enum {
168/* NetLabel protocol functions */ 169/* NetLabel protocol functions */
169int netlbl_mgmt_genl_init(void); 170int netlbl_mgmt_genl_init(void);
170 171
171/* NetLabel misc management functions */ 172/* NetLabel configured protocol reference counter */
172void netlbl_mgmt_protocount_inc(void); 173extern atomic_t netlabel_mgmt_protocount;
173void netlbl_mgmt_protocount_dec(void);
174u32 netlbl_mgmt_protocount_value(void);
175 174
176#endif 175#endif
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 348292450deb..42e81fd8cc49 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -10,7 +10,7 @@
10 */ 10 */
11 11
12/* 12/*
13 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 13 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2007
14 * 14 *
15 * This program is free software; you can redistribute it and/or modify 15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by 16 * it under the terms of the GNU General Public License as published by
@@ -36,22 +36,92 @@
36#include <linux/string.h> 36#include <linux/string.h>
37#include <linux/skbuff.h> 37#include <linux/skbuff.h>
38#include <linux/audit.h> 38#include <linux/audit.h>
39#include <linux/in.h>
40#include <linux/in6.h>
41#include <linux/ip.h>
42#include <linux/ipv6.h>
43#include <linux/notifier.h>
44#include <linux/netdevice.h>
45#include <linux/security.h>
39#include <net/sock.h> 46#include <net/sock.h>
40#include <net/netlink.h> 47#include <net/netlink.h>
41#include <net/genetlink.h> 48#include <net/genetlink.h>
42 49#include <net/ip.h>
50#include <net/ipv6.h>
51#include <net/net_namespace.h>
43#include <net/netlabel.h> 52#include <net/netlabel.h>
44#include <asm/bug.h> 53#include <asm/bug.h>
54#include <asm/atomic.h>
45 55
46#include "netlabel_user.h" 56#include "netlabel_user.h"
47#include "netlabel_domainhash.h" 57#include "netlabel_domainhash.h"
48#include "netlabel_unlabeled.h" 58#include "netlabel_unlabeled.h"
59#include "netlabel_mgmt.h"
60
61/* NOTE: at present we always use init's network namespace since we don't
62 * presently support different namespaces even though the majority of
63 * the functions in this file are "namespace safe" */
64
65/* The unlabeled connection hash table which we use to map network interfaces
66 * and addresses of unlabeled packets to a user specified secid value for the
67 * LSM. The hash table is used to lookup the network interface entry
68 * (struct netlbl_unlhsh_iface) and then the interface entry is used to
69 * lookup an IP address match from an ordered list. If a network interface
70 * match can not be found in the hash table then the default entry
71 * (netlbl_unlhsh_def) is used. The IP address entry list
72 * (struct netlbl_unlhsh_addr) is ordered such that the entries with a
73 * larger netmask come first.
74 */
75struct netlbl_unlhsh_tbl {
76 struct list_head *tbl;
77 u32 size;
78};
79struct netlbl_unlhsh_addr4 {
80 __be32 addr;
81 __be32 mask;
82 u32 secid;
83
84 u32 valid;
85 struct list_head list;
86 struct rcu_head rcu;
87};
88struct netlbl_unlhsh_addr6 {
89 struct in6_addr addr;
90 struct in6_addr mask;
91 u32 secid;
92
93 u32 valid;
94 struct list_head list;
95 struct rcu_head rcu;
96};
97struct netlbl_unlhsh_iface {
98 int ifindex;
99 struct list_head addr4_list;
100 struct list_head addr6_list;
101
102 u32 valid;
103 struct list_head list;
104 struct rcu_head rcu;
105};
106
107/* Argument struct for netlbl_unlhsh_walk() */
108struct netlbl_unlhsh_walk_arg {
109 struct netlink_callback *nl_cb;
110 struct sk_buff *skb;
111 u32 seq;
112};
113
114/* Unlabeled connection hash table */
115/* updates should be so rare that having one spinlock for the entire
116 * hash table should be okay */
117static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
118static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL;
119static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL;
49 120
50/* Accept unlabeled packets flag */ 121/* Accept unlabeled packets flag */
51static DEFINE_SPINLOCK(netlabel_unlabel_acceptflg_lock);
52static u8 netlabel_unlabel_acceptflg = 0; 122static u8 netlabel_unlabel_acceptflg = 0;
53 123
54/* NetLabel Generic NETLINK CIPSOv4 family */ 124/* NetLabel Generic NETLINK unlabeled family */
55static struct genl_family netlbl_unlabel_gnl_family = { 125static struct genl_family netlbl_unlabel_gnl_family = {
56 .id = GENL_ID_GENERATE, 126 .id = GENL_ID_GENERATE,
57 .hdrsize = 0, 127 .hdrsize = 0,
@@ -63,11 +133,841 @@ static struct genl_family netlbl_unlabel_gnl_family = {
63/* NetLabel Netlink attribute policy */ 133/* NetLabel Netlink attribute policy */
64static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { 134static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
65 [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, 135 [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 },
136 [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY,
137 .len = sizeof(struct in6_addr) },
138 [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY,
139 .len = sizeof(struct in6_addr) },
140 [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY,
141 .len = sizeof(struct in_addr) },
142 [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY,
143 .len = sizeof(struct in_addr) },
144 [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING,
145 .len = IFNAMSIZ - 1 },
146 [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY }
66}; 147};
67 148
68/* 149/*
69 * Helper Functions 150 * Audit Helper Functions
151 */
152
153/**
154 * netlbl_unlabel_audit_addr4 - Audit an IPv4 address
155 * @audit_buf: audit buffer
156 * @dev: network interface
157 * @addr: IP address
158 * @mask: IP address mask
159 *
160 * Description:
161 * Write the IPv4 address and address mask, if necessary, to @audit_buf.
162 *
163 */
164static void netlbl_unlabel_audit_addr4(struct audit_buffer *audit_buf,
165 const char *dev,
166 __be32 addr, __be32 mask)
167{
168 u32 mask_val = ntohl(mask);
169
170 if (dev != NULL)
171 audit_log_format(audit_buf, " netif=%s", dev);
172 audit_log_format(audit_buf, " src=" NIPQUAD_FMT, NIPQUAD(addr));
173 if (mask_val != 0xffffffff) {
174 u32 mask_len = 0;
175 while (mask_val > 0) {
176 mask_val <<= 1;
177 mask_len++;
178 }
179 audit_log_format(audit_buf, " src_prefixlen=%d", mask_len);
180 }
181}
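/*
 * Not part of the patch: a worked example of the prefix-length loop in
 * netlbl_unlabel_audit_addr4() above, pulled out as a standalone helper.
 */
static inline u32 example_mask_to_prefix(__be32 mask)
{
	u32 mask_val = ntohl(mask);	/* 255.255.255.0 -> 0xffffff00 */
	u32 mask_len = 0;

	while (mask_val > 0) {		/* shifts 24 times for 0xffffff00 */
		mask_val <<= 1;
		mask_len++;
	}
	return mask_len;		/* 24, logged as " src_prefixlen=24" */
}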
182
183/**
184 * netlbl_unlabel_audit_addr6 - Audit an IPv6 address
185 * @audit_buf: audit buffer
186 * @dev: network interface
187 * @addr: IP address
188 * @mask: IP address mask
189 *
190 * Description:
191 * Write the IPv6 address and address mask, if necessary, to @audit_buf.
192 *
193 */
194static void netlbl_unlabel_audit_addr6(struct audit_buffer *audit_buf,
195 const char *dev,
196 const struct in6_addr *addr,
197 const struct in6_addr *mask)
198{
199 if (dev != NULL)
200 audit_log_format(audit_buf, " netif=%s", dev);
201 audit_log_format(audit_buf, " src=" NIP6_FMT, NIP6(*addr));
202 if (ntohl(mask->s6_addr32[3]) != 0xffffffff) {
203 u32 mask_len = 0;
204 u32 mask_val;
205 int iter = -1;
206 while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff)
207 mask_len += 32;
208 mask_val = ntohl(mask->s6_addr32[iter]);
209 while (mask_val > 0) {
210 mask_val <<= 1;
211 mask_len++;
212 }
213 audit_log_format(audit_buf, " src_prefixlen=%d", mask_len);
214 }
215}
216
217/*
218 * Unlabeled Connection Hash Table Functions
219 */
220
221/**
222 * netlbl_unlhsh_free_addr4 - Frees an IPv4 address entry from the hash table
223 * @entry: the entry's RCU field
224 *
225 * Description:
226 * This function is designed to be used as a callback to the call_rcu()
227 * function so that memory allocated to a hash table address entry can be
228 * released safely.
229 *
230 */
231static void netlbl_unlhsh_free_addr4(struct rcu_head *entry)
232{
233 struct netlbl_unlhsh_addr4 *ptr;
234
235 ptr = container_of(entry, struct netlbl_unlhsh_addr4, rcu);
236 kfree(ptr);
237}
238
239#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
240/**
241 * netlbl_unlhsh_free_addr6 - Frees an IPv6 address entry from the hash table
242 * @entry: the entry's RCU field
243 *
244 * Description:
245 * This function is designed to be used as a callback to the call_rcu()
246 * function so that memory allocated to a hash table address entry can be
247 * released safely.
248 *
249 */
250static void netlbl_unlhsh_free_addr6(struct rcu_head *entry)
251{
252 struct netlbl_unlhsh_addr6 *ptr;
253
254 ptr = container_of(entry, struct netlbl_unlhsh_addr6, rcu);
255 kfree(ptr);
256}
257#endif /* IPv6 */
258
259/**
260 * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table
261 * @entry: the entry's RCU field
262 *
263 * Description:
264 * This function is designed to be used as a callback to the call_rcu()
265 * function so that memory allocated to a hash table interface entry can be
266 * released safely. It is important to note that this function does not free
267 * the IPv4 and IPv6 address lists contained as part of an interface entry. It
268 * is up to the rest of the code to make sure an interface entry is only freed
269 * once its address lists are empty.
270 *
271 */
272static void netlbl_unlhsh_free_iface(struct rcu_head *entry)
273{
274 struct netlbl_unlhsh_iface *iface;
275 struct netlbl_unlhsh_addr4 *iter4;
276 struct netlbl_unlhsh_addr4 *tmp4;
277 struct netlbl_unlhsh_addr6 *iter6;
278 struct netlbl_unlhsh_addr6 *tmp6;
279
280 iface = container_of(entry, struct netlbl_unlhsh_iface, rcu);
281
282 /* no need for locks here since we are the only one with access to this
283 * structure */
284
285 list_for_each_entry_safe(iter4, tmp4, &iface->addr4_list, list)
286 if (iter4->valid) {
287 list_del_rcu(&iter4->list);
288 kfree(iter4);
289 }
290 list_for_each_entry_safe(iter6, tmp6, &iface->addr6_list, list)
291 if (iter6->valid) {
292 list_del_rcu(&iter6->list);
293 kfree(iter6);
294 }
295 kfree(iface);
296}
297
298/**
299 * netlbl_unlhsh_hash - Hashing function for the hash table
300 * @ifindex: the network interface/device to hash
301 *
302 * Description:
303 * This is the hashing function for the unlabeled hash table, it returns the
304 * bucket number for the given device/interface. The caller is responsible for
305 * calling the rcu_read_[un]lock() functions.
306 *
70 */ 307 */
308static u32 netlbl_unlhsh_hash(int ifindex)
309{
310 /* this is taken _almost_ directly from
311 * security/selinux/netif.c:sel_netif_hasfn() as they do pretty much
312 * the same thing */
313 return ifindex & (rcu_dereference(netlbl_unlhsh)->size - 1);
314}
315
316/**
317 * netlbl_unlhsh_search_addr4 - Search for a matching IPv4 address entry
318 * @addr: IPv4 address
319 * @iface: the network interface entry
320 *
321 * Description:
322 * Searches the IPv4 address list of the network interface specified by @iface.
323 * If a matching address entry is found it is returned, otherwise NULL is
324 * returned. The caller is responsible for calling the rcu_read_[un]lock()
325 * functions.
326 *
327 */
328static struct netlbl_unlhsh_addr4 *netlbl_unlhsh_search_addr4(
329 __be32 addr,
330 const struct netlbl_unlhsh_iface *iface)
331{
332 struct netlbl_unlhsh_addr4 *iter;
333
334 list_for_each_entry_rcu(iter, &iface->addr4_list, list)
335 if (iter->valid && (addr & iter->mask) == iter->addr)
336 return iter;
337
338 return NULL;
339}
340
341#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
342/**
343 * netlbl_unlhsh_search_addr6 - Search for a matching IPv6 address entry
344 * @addr: IPv6 address
345 * @iface: the network interface entry
346 *
347 * Description:
348 * Searches the IPv6 address list of the network interface specified by @iface.
349 * If a matching address entry is found it is returned, otherwise NULL is
350 * returned. The caller is responsible for calling the rcu_read_[un]lock()
351 * functions.
352 *
353 */
354static struct netlbl_unlhsh_addr6 *netlbl_unlhsh_search_addr6(
355 const struct in6_addr *addr,
356 const struct netlbl_unlhsh_iface *iface)
357{
358 struct netlbl_unlhsh_addr6 *iter;
359
360 list_for_each_entry_rcu(iter, &iface->addr6_list, list)
361 if (iter->valid &&
362 ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0)
363 return iter;
364
365 return NULL;
366}
367#endif /* IPv6 */
368
369/**
370 * netlbl_unlhsh_search_iface - Search for a matching interface entry
371 * @ifindex: the network interface
372 *
373 * Description:
374 * Searches the unlabeled connection hash table and returns a pointer to the
375 * interface entry which matches @ifindex, otherwise NULL is returned. The
376 * caller is responsible for calling the rcu_read_[un]lock() functions.
377 *
378 */
379static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex)
380{
381 u32 bkt;
382 struct netlbl_unlhsh_iface *iter;
383
384 bkt = netlbl_unlhsh_hash(ifindex);
385 list_for_each_entry_rcu(iter,
386 &rcu_dereference(netlbl_unlhsh)->tbl[bkt],
387 list)
388 if (iter->valid && iter->ifindex == ifindex)
389 return iter;
390
391 return NULL;
392}
393
394/**
395 * netlbl_unlhsh_search_iface_def - Search for a matching interface entry
396 * @ifindex: the network interface
397 *
398 * Description:
399 * Searches the unlabeled connection hash table and returns a pointer to the
400 * interface entry which matches @ifindex. If an exact match can not be found
401 * and there is a valid default entry, the default entry is returned, otherwise
402 * NULL is returned. The caller is responsible for calling the
403 * rcu_read_[un]lock() functions.
404 *
405 */
406static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface_def(int ifindex)
407{
408 struct netlbl_unlhsh_iface *entry;
409
410 entry = netlbl_unlhsh_search_iface(ifindex);
411 if (entry != NULL)
412 return entry;
413
414 entry = rcu_dereference(netlbl_unlhsh_def);
415 if (entry != NULL && entry->valid)
416 return entry;
417
418 return NULL;
419}
420
421/**
422 * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table
423 * @iface: the associated interface entry
424 * @addr: IPv4 address in network byte order
425 * @mask: IPv4 address mask in network byte order
426 * @secid: LSM secid value for entry
427 *
428 * Description:
429 * Add a new address entry into the unlabeled connection hash table using the
430 * interface entry specified by @iface. On success zero is returned, otherwise
431 * a negative value is returned. The caller is responsible for calling the
432 * rcu_read_[un]lock() functions.
433 *
434 */
435static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface,
436 const struct in_addr *addr,
437 const struct in_addr *mask,
438 u32 secid)
439{
440 struct netlbl_unlhsh_addr4 *entry;
441 struct netlbl_unlhsh_addr4 *iter;
442
443 entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
444 if (entry == NULL)
445 return -ENOMEM;
446
447 entry->addr = addr->s_addr & mask->s_addr;
448 entry->mask = mask->s_addr;
449 entry->secid = secid;
450 entry->valid = 1;
451 INIT_RCU_HEAD(&entry->rcu);
452
453 spin_lock(&netlbl_unlhsh_lock);
454 iter = netlbl_unlhsh_search_addr4(entry->addr, iface);
455 if (iter != NULL &&
456 iter->addr == addr->s_addr && iter->mask == mask->s_addr) {
457 spin_unlock(&netlbl_unlhsh_lock);
458 kfree(entry);
459 return -EEXIST;
460 }
461 /* in order to speed up address searches through the list (the common
462 * case) we need to keep the list in order based on the size of the
463 * address mask such that the entry with the most specific mask (largest
464 * numerical value) appears first in the list */
465 list_for_each_entry_rcu(iter, &iface->addr4_list, list)
466 if (iter->valid &&
467 ntohl(entry->mask) > ntohl(iter->mask)) {
468 __list_add_rcu(&entry->list,
469 iter->list.prev,
470 &iter->list);
471 spin_unlock(&netlbl_unlhsh_lock);
472 return 0;
473 }
474 list_add_tail_rcu(&entry->list, &iface->addr4_list);
475 spin_unlock(&netlbl_unlhsh_lock);
476 return 0;
477}
478
479#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
480/**
481 * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table
482 * @iface: the associated interface entry
483 * @addr: IPv6 address in network byte order
484 * @mask: IPv6 address mask in network byte order
485 * @secid: LSM secid value for entry
486 *
487 * Description:
488 * Add a new address entry into the unlabeled connection hash table using the
489 * interface entry specified by @iface. On success zero is returned, otherwise
490 * a negative value is returned. The caller is responsible for calling the
491 * rcu_read_[un]lock() functions.
492 *
493 */
494static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface,
495 const struct in6_addr *addr,
496 const struct in6_addr *mask,
497 u32 secid)
498{
499 struct netlbl_unlhsh_addr6 *entry;
500 struct netlbl_unlhsh_addr6 *iter;
501
502 entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
503 if (entry == NULL)
504 return -ENOMEM;
505
506 ipv6_addr_copy(&entry->addr, addr);
507 entry->addr.s6_addr32[0] &= mask->s6_addr32[0];
508 entry->addr.s6_addr32[1] &= mask->s6_addr32[1];
509 entry->addr.s6_addr32[2] &= mask->s6_addr32[2];
510 entry->addr.s6_addr32[3] &= mask->s6_addr32[3];
511 ipv6_addr_copy(&entry->mask, mask);
512 entry->secid = secid;
513 entry->valid = 1;
514 INIT_RCU_HEAD(&entry->rcu);
515
516 spin_lock(&netlbl_unlhsh_lock);
517 iter = netlbl_unlhsh_search_addr6(&entry->addr, iface);
518 if (iter != NULL &&
519 (ipv6_addr_equal(&iter->addr, addr) &&
520 ipv6_addr_equal(&iter->mask, mask))) {
521 spin_unlock(&netlbl_unlhsh_lock);
522 kfree(entry);
523 return -EEXIST;
524 }
525 /* in order to speed up address searches through the list (the common
526 * case) we need to keep the list in order based on the size of the
527 * address mask such that the entry with the most specific mask (largest
528 * numerical value) appears first in the list */
529 list_for_each_entry_rcu(iter, &iface->addr6_list, list)
530 if (iter->valid &&
531 ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) {
532 __list_add_rcu(&entry->list,
533 iter->list.prev,
534 &iter->list);
535 spin_unlock(&netlbl_unlhsh_lock);
536 return 0;
537 }
538 list_add_tail_rcu(&entry->list, &iface->addr6_list);
539 spin_unlock(&netlbl_unlhsh_lock);
540 return 0;
541}
542#endif /* IPv6 */
543
544/**
545 * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table
546 * @ifindex: network interface
547 *
548 * Description:
549 * Add a new, empty, interface entry into the unlabeled connection hash table.
550 * On success a pointer to the new interface entry is returned, on failure NULL
551 * is returned. The caller is responsible for calling the rcu_read_[un]lock()
552 * functions.
553 *
554 */
555static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex)
556{
557 u32 bkt;
558 struct netlbl_unlhsh_iface *iface;
559
560 iface = kzalloc(sizeof(*iface), GFP_ATOMIC);
561 if (iface == NULL)
562 return NULL;
563
564 iface->ifindex = ifindex;
565 INIT_LIST_HEAD(&iface->addr4_list);
566 INIT_LIST_HEAD(&iface->addr6_list);
567 iface->valid = 1;
568 INIT_RCU_HEAD(&iface->rcu);
569
570 spin_lock(&netlbl_unlhsh_lock);
571 if (ifindex > 0) {
572 bkt = netlbl_unlhsh_hash(ifindex);
573 if (netlbl_unlhsh_search_iface(ifindex) != NULL)
574 goto add_iface_failure;
575 list_add_tail_rcu(&iface->list,
576 &rcu_dereference(netlbl_unlhsh)->tbl[bkt]);
577 } else {
578 INIT_LIST_HEAD(&iface->list);
579 if (rcu_dereference(netlbl_unlhsh_def) != NULL)
580 goto add_iface_failure;
581 rcu_assign_pointer(netlbl_unlhsh_def, iface);
582 }
583 spin_unlock(&netlbl_unlhsh_lock);
584
585 return iface;
586
587add_iface_failure:
588 spin_unlock(&netlbl_unlhsh_lock);
589 kfree(iface);
590 return NULL;
591}
592
593/**
594 * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table
595 * @net: network namespace
596 * @dev_name: interface name
597 * @addr: IP address in network byte order
598 * @mask: address mask in network byte order
599 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
600 * @secid: LSM secid value for the entry
601 * @audit_info: NetLabel audit information
602 *
603 * Description:
604 * Adds a new entry to the unlabeled connection hash table. Returns zero on
605 * success, negative values on failure.
606 *
607 */
608static int netlbl_unlhsh_add(struct net *net,
609 const char *dev_name,
610 const void *addr,
611 const void *mask,
612 u32 addr_len,
613 u32 secid,
614 struct netlbl_audit *audit_info)
615{
616 int ret_val;
617 int ifindex;
618 struct net_device *dev;
619 struct netlbl_unlhsh_iface *iface;
620 struct in_addr *addr4, *mask4;
621 struct in6_addr *addr6, *mask6;
622 struct audit_buffer *audit_buf = NULL;
623 char *secctx = NULL;
624 u32 secctx_len;
625
626 if (addr_len != sizeof(struct in_addr) &&
627 addr_len != sizeof(struct in6_addr))
628 return -EINVAL;
629
630 rcu_read_lock();
631 if (dev_name != NULL) {
632 dev = dev_get_by_name(net, dev_name);
633 if (dev == NULL) {
634 ret_val = -ENODEV;
635 goto unlhsh_add_return;
636 }
637 ifindex = dev->ifindex;
638 dev_put(dev);
639 iface = netlbl_unlhsh_search_iface(ifindex);
640 } else {
641 ifindex = 0;
642 iface = rcu_dereference(netlbl_unlhsh_def);
643 }
644 if (iface == NULL) {
645 iface = netlbl_unlhsh_add_iface(ifindex);
646 if (iface == NULL) {
647 ret_val = -ENOMEM;
648 goto unlhsh_add_return;
649 }
650 }
651 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD,
652 audit_info);
653 switch (addr_len) {
654 case sizeof(struct in_addr):
655 addr4 = (struct in_addr *)addr;
656 mask4 = (struct in_addr *)mask;
657 ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid);
658 if (audit_buf != NULL)
659 netlbl_unlabel_audit_addr4(audit_buf,
660 dev_name,
661 addr4->s_addr,
662 mask4->s_addr);
663 break;
664#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
665 case sizeof(struct in6_addr):
666 addr6 = (struct in6_addr *)addr;
667 mask6 = (struct in6_addr *)mask;
668 ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid);
669 if (audit_buf != NULL)
670 netlbl_unlabel_audit_addr6(audit_buf,
671 dev_name,
672 addr6, mask6);
673 break;
674#endif /* IPv6 */
675 default:
676 ret_val = -EINVAL;
677 }
678 if (ret_val == 0)
679 atomic_inc(&netlabel_mgmt_protocount);
680
681unlhsh_add_return:
682 rcu_read_unlock();
683 if (audit_buf != NULL) {
684 if (security_secid_to_secctx(secid,
685 &secctx,
686 &secctx_len) == 0) {
687 audit_log_format(audit_buf, " sec_obj=%s", secctx);
688 security_release_secctx(secctx, secctx_len);
689 }
690 audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
691 audit_log_end(audit_buf);
692 }
693 return ret_val;
694}
695
696/**
697 * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry
698 * @net: network namespace
699 * @iface: interface entry
700 * @addr: IP address
701 * @mask: IP address mask
702 * @audit_info: NetLabel audit information
703 *
704 * Description:
705 * Remove an IP address entry from the unlabeled connection hash table.
706 * Returns zero on success, negative values on failure. The caller is
707 * responsible for calling the rcu_read_[un]lock() functions.
708 *
709 */
710static int netlbl_unlhsh_remove_addr4(struct net *net,
711 struct netlbl_unlhsh_iface *iface,
712 const struct in_addr *addr,
713 const struct in_addr *mask,
714 struct netlbl_audit *audit_info)
715{
716 int ret_val = -ENOENT;
717 struct netlbl_unlhsh_addr4 *entry;
718 struct audit_buffer *audit_buf = NULL;
719 struct net_device *dev;
720 char *secctx = NULL;
721 u32 secctx_len;
722
723 spin_lock(&netlbl_unlhsh_lock);
724 entry = netlbl_unlhsh_search_addr4(addr->s_addr, iface);
725 if (entry != NULL &&
726 entry->addr == addr->s_addr && entry->mask == mask->s_addr) {
727 entry->valid = 0;
728 list_del_rcu(&entry->list);
729 ret_val = 0;
730 }
731 spin_unlock(&netlbl_unlhsh_lock);
732
733 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
734 audit_info);
735 if (audit_buf != NULL) {
736 dev = dev_get_by_index(net, iface->ifindex);
737 netlbl_unlabel_audit_addr4(audit_buf,
738 (dev != NULL ? dev->name : NULL),
739 entry->addr, entry->mask);
740 if (dev != NULL)
741 dev_put(dev);
742 if (security_secid_to_secctx(entry->secid,
743 &secctx,
744 &secctx_len) == 0) {
745 audit_log_format(audit_buf, " sec_obj=%s", secctx);
746 security_release_secctx(secctx, secctx_len);
747 }
748 audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
749 audit_log_end(audit_buf);
750 }
751
752 if (ret_val == 0)
753 call_rcu(&entry->rcu, netlbl_unlhsh_free_addr4);
754 return ret_val;
755}
756
757#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
758/**
759 * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry
760 * @net: network namespace
761 * @iface: interface entry
762 * @addr: IP address
763 * @mask: IP address mask
764 * @audit_info: NetLabel audit information
765 *
766 * Description:
767 * Remove an IP address entry from the unlabeled connection hash table.
768 * Returns zero on success, negative values on failure. The caller is
769 * responsible for calling the rcu_read_[un]lock() functions.
770 *
771 */
772static int netlbl_unlhsh_remove_addr6(struct net *net,
773 struct netlbl_unlhsh_iface *iface,
774 const struct in6_addr *addr,
775 const struct in6_addr *mask,
776 struct netlbl_audit *audit_info)
777{
778 int ret_val = -ENOENT;
779 struct netlbl_unlhsh_addr6 *entry;
780 struct audit_buffer *audit_buf = NULL;
781 struct net_device *dev;
782 char *secctx = NULL;
783 u32 secctx_len;
784
785 spin_lock(&netlbl_unlhsh_lock);
786 entry = netlbl_unlhsh_search_addr6(addr, iface);
787 if (entry != NULL &&
788 (ipv6_addr_equal(&entry->addr, addr) &&
789 ipv6_addr_equal(&entry->mask, mask))) {
790 entry->valid = 0;
791 list_del_rcu(&entry->list);
792 ret_val = 0;
793 }
794 spin_unlock(&netlbl_unlhsh_lock);
795
796 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
797 audit_info);
798 if (audit_buf != NULL) {
799 dev = dev_get_by_index(net, iface->ifindex);
800 netlbl_unlabel_audit_addr6(audit_buf,
801 (dev != NULL ? dev->name : NULL),
802 addr, mask);
803 if (dev != NULL)
804 dev_put(dev);
805 if (security_secid_to_secctx(entry->secid,
806 &secctx,
807 &secctx_len) == 0) {
808 audit_log_format(audit_buf, " sec_obj=%s", secctx);
809 security_release_secctx(secctx, secctx_len);
810 }
811 audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
812 audit_log_end(audit_buf);
813 }
814
815 if (ret_val == 0)
816 call_rcu(&entry->rcu, netlbl_unlhsh_free_addr6);
817 return ret_val;
818}
819#endif /* IPv6 */
820
821/**
822 * netlbl_unlhsh_condremove_iface - Remove an interface entry
823 * @iface: the interface entry
824 *
825 * Description:
826 * Remove an interface entry from the unlabeled connection hash table if it is
827 * empty. An interface entry is considered to be empty if there are no
828 * address entries assigned to it.
829 *
830 */
831static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface)
832{
833 struct netlbl_unlhsh_addr4 *iter4;
834 struct netlbl_unlhsh_addr6 *iter6;
835
836 spin_lock(&netlbl_unlhsh_lock);
837 list_for_each_entry_rcu(iter4, &iface->addr4_list, list)
838 if (iter4->valid)
839 goto unlhsh_condremove_failure;
840 list_for_each_entry_rcu(iter6, &iface->addr6_list, list)
841 if (iter6->valid)
842 goto unlhsh_condremove_failure;
843 iface->valid = 0;
844 if (iface->ifindex > 0)
845 list_del_rcu(&iface->list);
846 else
847 rcu_assign_pointer(netlbl_unlhsh_def, NULL);
848 spin_unlock(&netlbl_unlhsh_lock);
849
850 call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
851 return;
852
853unlhsh_condremove_failure:
854 spin_unlock(&netlbl_unlhsh_lock);
855 return;
856}
857
858/**
859 * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table
860 * @net: network namespace
861 * @dev_name: interface name
862 * @addr: IP address in network byte order
863 * @mask: address mask in network byte order
864 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
865 * @audit_info: NetLabel audit information
866 *
867 * Description:
868 * Removes an existing entry from the unlabeled connection hash table.
869 * Returns zero on success, negative values on failure.
870 *
871 */
872static int netlbl_unlhsh_remove(struct net *net,
873 const char *dev_name,
874 const void *addr,
875 const void *mask,
876 u32 addr_len,
877 struct netlbl_audit *audit_info)
878{
879 int ret_val;
880 struct net_device *dev;
881 struct netlbl_unlhsh_iface *iface;
882
883 if (addr_len != sizeof(struct in_addr) &&
884 addr_len != sizeof(struct in6_addr))
885 return -EINVAL;
886
887 rcu_read_lock();
888 if (dev_name != NULL) {
889 dev = dev_get_by_name(net, dev_name);
890 if (dev == NULL) {
891 ret_val = -ENODEV;
892 goto unlhsh_remove_return;
893 }
894 iface = netlbl_unlhsh_search_iface(dev->ifindex);
895 dev_put(dev);
896 } else
897 iface = rcu_dereference(netlbl_unlhsh_def);
898 if (iface == NULL) {
899 ret_val = -ENOENT;
900 goto unlhsh_remove_return;
901 }
902 switch (addr_len) {
903 case sizeof(struct in_addr):
904 ret_val = netlbl_unlhsh_remove_addr4(net,
905 iface, addr, mask,
906 audit_info);
907 break;
908#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
909 case sizeof(struct in6_addr):
910 ret_val = netlbl_unlhsh_remove_addr6(net,
911 iface, addr, mask,
912 audit_info);
913 break;
914#endif /* IPv6 */
915 default:
916 ret_val = -EINVAL;
917 }
918 if (ret_val == 0) {
919 netlbl_unlhsh_condremove_iface(iface);
920 atomic_dec(&netlabel_mgmt_protocount);
921 }
922
923unlhsh_remove_return:
924 rcu_read_unlock();
925 return ret_val;
926}
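
The remove path above keys the IPv4/IPv6 split purely off the address length. A small standalone sketch of the same dispatch (family_from_len() is a hypothetical helper, not part of NetLabel):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <errno.h>
    #include <stdio.h>

    /* 4 bytes selects IPv4, 16 bytes selects IPv6, anything else is rejected. */
    int family_from_len(unsigned int addr_len)
    {
        switch (addr_len) {
        case sizeof(struct in_addr):   /* 4 */
            return AF_INET;
        case sizeof(struct in6_addr):  /* 16 */
            return AF_INET6;
        default:
            return -EINVAL;
        }
    }

    int main(void)
    {
        printf("%d %d %d\n", family_from_len(4), family_from_len(16),
               family_from_len(8));
        return 0;
    }
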
927
928/*
929 * General Helper Functions
930 */
931
932/**
933 * netlbl_unlhsh_netdev_handler - Network device notification handler
934 * @this: notifier block
935 * @event: the event
936 * @ptr: the network device (cast to void)
937 *
938 * Description:
939 * Handle network device events, although at present all we care about is a
940 * network device going away. In the case of a device going away we clear any
941 * related entries from the unlabeled connection hash table.
942 *
943 */
944static int netlbl_unlhsh_netdev_handler(struct notifier_block *this,
945 unsigned long event,
946 void *ptr)
947{
948 struct net_device *dev = ptr;
949 struct netlbl_unlhsh_iface *iface = NULL;
950
951 if (dev->nd_net != &init_net)
952 return NOTIFY_DONE;
953
954 /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */
955 if (event == NETDEV_DOWN) {
956 spin_lock(&netlbl_unlhsh_lock);
957 iface = netlbl_unlhsh_search_iface(dev->ifindex);
958 if (iface != NULL && iface->valid) {
959 iface->valid = 0;
960 list_del_rcu(&iface->list);
961 } else
962 iface = NULL;
963 spin_unlock(&netlbl_unlhsh_lock);
964 }
965
966 if (iface != NULL)
967 call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
968
969 return NOTIFY_DONE;
970}
71 971
72/** 972/**
73 * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag 973 * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag
@@ -84,11 +984,8 @@ static void netlbl_unlabel_acceptflg_set(u8 value,
84 struct audit_buffer *audit_buf; 984 struct audit_buffer *audit_buf;
85 u8 old_val; 985 u8 old_val;
86 986
87 spin_lock(&netlabel_unlabel_acceptflg_lock);
88 old_val = netlabel_unlabel_acceptflg; 987 old_val = netlabel_unlabel_acceptflg;
89 netlabel_unlabel_acceptflg = value; 988 netlabel_unlabel_acceptflg = value;
90 spin_unlock(&netlabel_unlabel_acceptflg_lock);
91
92 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, 989 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW,
93 audit_info); 990 audit_info);
94 if (audit_buf != NULL) { 991 if (audit_buf != NULL) {
@@ -98,6 +995,48 @@ static void netlbl_unlabel_acceptflg_set(u8 value,
98 } 995 }
99} 996}
100 997
998/**
999 * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information
1000 * @info: the Generic NETLINK info block
1001 * @addr: the IP address
1002 * @mask: the IP address mask
1003 * @len: the address length
1004 *
1005 * Description:
1006 * Examine the Generic NETLINK message and extract the IP address information.
1007 * Returns zero on success, negative values on failure.
1008 *
1009 */
1010static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
1011 void **addr,
1012 void **mask,
1013 u32 *len)
1014{
1015 u32 addr_len;
1016
1017 if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) {
1018 addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
1019 if (addr_len != sizeof(struct in_addr) &&
1020 addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK]))
1021 return -EINVAL;
1022 *len = addr_len;
1023 *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
1024 *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]);
1025 return 0;
1026 } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) {
1027 addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
1028 if (addr_len != sizeof(struct in6_addr) &&
1029 addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK]))
1030 return -EINVAL;
1031 *len = addr_len;
1032 *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
1033 *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]);
1034 return 0;
1035 }
1036
1037 return -EINVAL;
1038}
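
Note that the length test above only rejects an address attribute whose length matches neither the expected struct size nor the companion mask attribute's length. A standalone restatement of that predicate (hypothetical helper name):

    #include <stdbool.h>
    #include <stddef.h>

    /* Mirrors the check in netlbl_unlabel_addrinfo_get(): reject only when
     * the address length matches neither the expected size nor the mask
     * attribute's length. */
    bool addr_attr_acceptable(size_t addr_len, size_t mask_len, size_t expected)
    {
        return !(addr_len != expected && addr_len != mask_len);
    }
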
1039
101/* 1040/*
102 * NetLabel Command Handlers 1041 * NetLabel Command Handlers
103 */ 1042 */
@@ -155,11 +1094,9 @@ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info)
155 goto list_failure; 1094 goto list_failure;
156 } 1095 }
157 1096
158 rcu_read_lock();
159 ret_val = nla_put_u8(ans_skb, 1097 ret_val = nla_put_u8(ans_skb,
160 NLBL_UNLABEL_A_ACPTFLG, 1098 NLBL_UNLABEL_A_ACPTFLG,
161 netlabel_unlabel_acceptflg); 1099 netlabel_unlabel_acceptflg);
162 rcu_read_unlock();
163 if (ret_val != 0) 1100 if (ret_val != 0)
164 goto list_failure; 1101 goto list_failure;
165 1102
@@ -175,11 +1112,489 @@ list_failure:
175 return ret_val; 1112 return ret_val;
176} 1113}
177 1114
1115/**
1116 * netlbl_unlabel_staticadd - Handle a STATICADD message
1117 * @skb: the NETLINK buffer
1118 * @info: the Generic NETLINK info block
1119 *
1120 * Description:
1121 * Process a user generated STATICADD message and add a new unlabeled
1122 * connection entry to the hash table. Returns zero on success, negative
1123 * values on failure.
1124 *
1125 */
1126static int netlbl_unlabel_staticadd(struct sk_buff *skb,
1127 struct genl_info *info)
1128{
1129 int ret_val;
1130 char *dev_name;
1131 void *addr;
1132 void *mask;
1133 u32 addr_len;
1134 u32 secid;
1135 struct netlbl_audit audit_info;
1136
1137 /* Don't allow users to add both IPv4 and IPv6 addresses for a
1138 * single entry. However, allow users to create two entries, one each
1139 * for IPv4 and IPv6, with the same LSM security context, which should
1140 * achieve the same result. */
1141 if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
1142 !info->attrs[NLBL_UNLABEL_A_IFACE] ||
1143 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
1144 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
1145 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
1146 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
1147 return -EINVAL;
1148
1149 netlbl_netlink_auditinfo(skb, &audit_info);
1150
1151 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
1152 if (ret_val != 0)
1153 return ret_val;
1154 dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
1155 ret_val = security_secctx_to_secid(
1156 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
1157 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
1158 &secid);
1159 if (ret_val != 0)
1160 return ret_val;
1161
1162 return netlbl_unlhsh_add(&init_net,
1163 dev_name, addr, mask, addr_len, secid,
1164 &audit_info);
1165}
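
The attribute guard at the top of netlbl_unlabel_staticadd() reduces to a small boolean predicate. A hedged restatement in plain C (staticadd_attrs_ok() is a hypothetical helper, with one flag per netlink attribute):

    #include <stdbool.h>

    /* The security context and interface attributes must be present, and
     * exactly one of the two address families may be missing its complete
     * address/mask pair, so the other family supplies the pair that is
     * actually used. */
    bool staticadd_attrs_ok(bool secctx, bool iface,
                            bool v4addr, bool v4mask,
                            bool v6addr, bool v6mask)
    {
        bool v4_missing = !v4addr || !v4mask;
        bool v6_missing = !v6addr || !v6mask;

        return secctx && iface && (v4_missing ^ v6_missing);
    }
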
1166
1167/**
1168 * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message
1169 * @skb: the NETLINK buffer
1170 * @info: the Generic NETLINK info block
1171 *
1172 * Description:
1173 * Process a user generated STATICADDDEF message and add a new default
1174 * unlabeled connection entry. Returns zero on success, negative values on
1175 * failure.
1176 *
1177 */
1178static int netlbl_unlabel_staticadddef(struct sk_buff *skb,
1179 struct genl_info *info)
1180{
1181 int ret_val;
1182 void *addr;
1183 void *mask;
1184 u32 addr_len;
1185 u32 secid;
1186 struct netlbl_audit audit_info;
1187
1188 /* Don't allow users to add both IPv4 and IPv6 addresses for a
1189 * single entry. However, allow users to create two entries, one each
1190 * for IPv4 and IPv6, with the same LSM security context which should
1191 * achieve the same result. */
1192 if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
1193 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
1194 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
1195 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
1196 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
1197 return -EINVAL;
1198
1199 netlbl_netlink_auditinfo(skb, &audit_info);
1200
1201 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
1202 if (ret_val != 0)
1203 return ret_val;
1204 ret_val = security_secctx_to_secid(
1205 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
1206 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
1207 &secid);
1208 if (ret_val != 0)
1209 return ret_val;
1210
1211 return netlbl_unlhsh_add(&init_net,
1212 NULL, addr, mask, addr_len, secid,
1213 &audit_info);
1214}
1215
1216/**
1217 * netlbl_unlabel_staticremove - Handle a STATICREMOVE message
1218 * @skb: the NETLINK buffer
1219 * @info: the Generic NETLINK info block
1220 *
1221 * Description:
1222 * Process a user generated STATICREMOVE message and remove the specified
1223 * unlabeled connection entry. Returns zero on success, negative values on
1224 * failure.
1225 *
1226 */
1227static int netlbl_unlabel_staticremove(struct sk_buff *skb,
1228 struct genl_info *info)
1229{
1230 int ret_val;
1231 char *dev_name;
1232 void *addr;
1233 void *mask;
1234 u32 addr_len;
1235 struct netlbl_audit audit_info;
1236
1237 /* See the note in netlbl_unlabel_staticadd() about not allowing both
1238 * IPv4 and IPv6 in the same entry. */
1239 if (!info->attrs[NLBL_UNLABEL_A_IFACE] ||
1240 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
1241 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
1242 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
1243 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
1244 return -EINVAL;
1245
1246 netlbl_netlink_auditinfo(skb, &audit_info);
1247
1248 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
1249 if (ret_val != 0)
1250 return ret_val;
1251 dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
1252
1253 return netlbl_unlhsh_remove(&init_net,
1254 dev_name, addr, mask, addr_len,
1255 &audit_info);
1256}
1257
1258/**
1259 * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message
1260 * @skb: the NETLINK buffer
1261 * @info: the Generic NETLINK info block
1262 *
1263 * Description:
1264 * Process a user generated STATICREMOVEDEF message and remove the default
1265 * unlabeled connection entry. Returns zero on success, negative values on
1266 * failure.
1267 *
1268 */
1269static int netlbl_unlabel_staticremovedef(struct sk_buff *skb,
1270 struct genl_info *info)
1271{
1272 int ret_val;
1273 void *addr;
1274 void *mask;
1275 u32 addr_len;
1276 struct netlbl_audit audit_info;
1277
1278 /* See the note in netlbl_unlabel_staticadd() about not allowing both
1279 * IPv4 and IPv6 in the same entry. */
1280 if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
1281 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
1282 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
1283 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
1284 return -EINVAL;
1285
1286 netlbl_netlink_auditinfo(skb, &audit_info);
1287
1288 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
1289 if (ret_val != 0)
1290 return ret_val;
1291
1292 return netlbl_unlhsh_remove(&init_net,
1293 NULL, addr, mask, addr_len,
1294 &audit_info);
1295}
1296
1297
1298/**
1299 * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF]
1300 * @cmd: command/message
1301 * @iface: the interface entry
1302 * @addr4: the IPv4 address entry
1303 * @addr6: the IPv6 address entry
1304 * @arg: the netlbl_unlhsh_walk_arg structure
1305 *
1306 * Description:
1307 * This function is designed to be used to generate a response for a
1308 * STATICLIST or STATICLISTDEF message. When called, either @addr4 or @addr6
1309 * may be specified, but not both; the unspecified entry must be set to
1310 * NULL by the caller. Returns the size of the message on success, negative
1311 * values on failure.
1312 *
1313 */
1314static int netlbl_unlabel_staticlist_gen(u32 cmd,
1315 const struct netlbl_unlhsh_iface *iface,
1316 const struct netlbl_unlhsh_addr4 *addr4,
1317 const struct netlbl_unlhsh_addr6 *addr6,
1318 void *arg)
1319{
1320 int ret_val = -ENOMEM;
1321 struct netlbl_unlhsh_walk_arg *cb_arg = arg;
1322 struct net_device *dev;
1323 void *data;
1324 u32 secid;
1325 char *secctx;
1326 u32 secctx_len;
1327
1328 data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid,
1329 cb_arg->seq, &netlbl_unlabel_gnl_family,
1330 NLM_F_MULTI, cmd);
1331 if (data == NULL)
1332 goto list_cb_failure;
1333
1334 if (iface->ifindex > 0) {
1335 dev = dev_get_by_index(&init_net, iface->ifindex);
1336 ret_val = nla_put_string(cb_arg->skb,
1337 NLBL_UNLABEL_A_IFACE, dev->name);
1338 dev_put(dev);
1339 if (ret_val != 0)
1340 goto list_cb_failure;
1341 }
1342
1343 if (addr4) {
1344 struct in_addr addr_struct;
1345
1346 addr_struct.s_addr = addr4->addr;
1347 ret_val = nla_put(cb_arg->skb,
1348 NLBL_UNLABEL_A_IPV4ADDR,
1349 sizeof(struct in_addr),
1350 &addr_struct);
1351 if (ret_val != 0)
1352 goto list_cb_failure;
1353
1354 addr_struct.s_addr = addr4->mask;
1355 ret_val = nla_put(cb_arg->skb,
1356 NLBL_UNLABEL_A_IPV4MASK,
1357 sizeof(struct in_addr),
1358 &addr_struct);
1359 if (ret_val != 0)
1360 goto list_cb_failure;
1361
1362 secid = addr4->secid;
1363 } else {
1364 ret_val = nla_put(cb_arg->skb,
1365 NLBL_UNLABEL_A_IPV6ADDR,
1366 sizeof(struct in6_addr),
1367 &addr6->addr);
1368 if (ret_val != 0)
1369 goto list_cb_failure;
1370
1371 ret_val = nla_put(cb_arg->skb,
1372 NLBL_UNLABEL_A_IPV6MASK,
1373 sizeof(struct in6_addr),
1374 &addr6->mask);
1375 if (ret_val != 0)
1376 goto list_cb_failure;
1377
1378 secid = addr6->secid;
1379 }
1380
1381 ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len);
1382 if (ret_val != 0)
1383 goto list_cb_failure;
1384 ret_val = nla_put(cb_arg->skb,
1385 NLBL_UNLABEL_A_SECCTX,
1386 secctx_len,
1387 secctx);
1388 security_release_secctx(secctx, secctx_len);
1389 if (ret_val != 0)
1390 goto list_cb_failure;
1391
1392 cb_arg->seq++;
1393 return genlmsg_end(cb_arg->skb, data);
1394
1395list_cb_failure:
1396 genlmsg_cancel(cb_arg->skb, data);
1397 return ret_val;
1398}
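
Each dump entry is built with the usual begin/append/commit-or-cancel netlink pattern: genlmsg_put() opens the message, the nla_put*() calls append attributes, and genlmsg_cancel() rolls everything back on failure. A toy illustration of that pattern over a plain buffer (not the real netlink API):

    #include <stdint.h>
    #include <string.h>
    #include <stddef.h>

    struct toymsg {
        uint8_t buf[64];
        size_t len;
    };

    static int toymsg_append(struct toymsg *m, const void *data, size_t len)
    {
        if (m->len + len > sizeof(m->buf))
            return -1;
        memcpy(m->buf + m->len, data, len);
        m->len += len;
        return 0;
    }

    int toymsg_emit_entry(struct toymsg *m, uint32_t addr, uint32_t mask)
    {
        size_t start = m->len;              /* analogous to genlmsg_put() */

        if (toymsg_append(m, &addr, sizeof(addr)) != 0 ||
            toymsg_append(m, &mask, sizeof(mask)) != 0) {
            m->len = start;                 /* analogous to genlmsg_cancel() */
            return -1;
        }
        return 0;                           /* analogous to genlmsg_end() */
    }
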
1399
1400/**
1401 * netlbl_unlabel_staticlist - Handle a STATICLIST message
1402 * @skb: the NETLINK buffer
1403 * @cb: the NETLINK callback
1404 *
1405 * Description:
1406 * Process a user generated STATICLIST message and dump the unlabeled
1407 * connection hash table in a form suitable for use in a kernel generated
1408 * STATICLIST message. Returns the length of @skb.
1409 *
1410 */
1411static int netlbl_unlabel_staticlist(struct sk_buff *skb,
1412 struct netlink_callback *cb)
1413{
1414 struct netlbl_unlhsh_walk_arg cb_arg;
1415 u32 skip_bkt = cb->args[0];
1416 u32 skip_chain = cb->args[1];
1417 u32 skip_addr4 = cb->args[2];
1418 u32 skip_addr6 = cb->args[3];
1419 u32 iter_bkt;
1420 u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
1421 struct netlbl_unlhsh_iface *iface;
1422 struct netlbl_unlhsh_addr4 *addr4;
1423 struct netlbl_unlhsh_addr6 *addr6;
1424
1425 cb_arg.nl_cb = cb;
1426 cb_arg.skb = skb;
1427 cb_arg.seq = cb->nlh->nlmsg_seq;
1428
1429 rcu_read_lock();
1430 for (iter_bkt = skip_bkt;
1431 iter_bkt < rcu_dereference(netlbl_unlhsh)->size;
1432 iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) {
1433 list_for_each_entry_rcu(iface,
1434 &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt],
1435 list) {
1436 if (!iface->valid ||
1437 iter_chain++ < skip_chain)
1438 continue;
1439 list_for_each_entry_rcu(addr4,
1440 &iface->addr4_list,
1441 list) {
1442 if (!addr4->valid || iter_addr4++ < skip_addr4)
1443 continue;
1444 if (netlbl_unlabel_staticlist_gen(
1445 NLBL_UNLABEL_C_STATICLIST,
1446 iface,
1447 addr4,
1448 NULL,
1449 &cb_arg) < 0) {
1450 iter_addr4--;
1451 iter_chain--;
1452 goto unlabel_staticlist_return;
1453 }
1454 }
1455 list_for_each_entry_rcu(addr6,
1456 &iface->addr6_list,
1457 list) {
1458 if (!addr6->valid || iter_addr6++ < skip_addr6)
1459 continue;
1460 if (netlbl_unlabel_staticlist_gen(
1461 NLBL_UNLABEL_C_STATICLIST,
1462 iface,
1463 NULL,
1464 addr6,
1465 &cb_arg) < 0) {
1466 iter_addr6--;
1467 iter_chain--;
1468 goto unlabel_staticlist_return;
1469 }
1470 }
1471 }
1472 }
1473
1474unlabel_staticlist_return:
1475 rcu_read_unlock();
1476	cb->args[0] = iter_bkt;
1477	cb->args[1] = iter_chain;
1478	cb->args[2] = iter_addr4;
1479	cb->args[3] = iter_addr6;
1480 return skb->len;
1481}
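
The dump walks the hash table with skip/iter counters so that a partially filled reply can be resumed from the saved cursor on the next callback invocation. A toy, single-list illustration of that resume pattern (hypothetical types, no netlink involved):

    #include <stdbool.h>
    #include <stddef.h>

    struct toyentry {
        bool valid;
    };

    /* Skip the first 'skip' valid entries, stop when the output budget is
     * exhausted, and report where the next invocation should resume. */
    size_t dump_entries(const struct toyentry *e, size_t n,
                        size_t skip, size_t budget, size_t *resume)
    {
        size_t iter = 0;
        size_t emitted = 0;

        for (size_t i = 0; i < n; i++) {
            if (!e[i].valid || iter++ < skip)
                continue;
            if (emitted == budget) {
                iter--;     /* retry this entry on the next call */
                break;
            }
            emitted++;
        }
        *resume = iter;
        return emitted;
    }
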
1482
1483/**
1484 * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message
1485 * @skb: the NETLINK buffer
1486 * @cb: the NETLINK callback
1487 *
1488 * Description:
1489 * Process a user generated STATICLISTDEF message and dump the default
1490 * unlabeled connection entry in a form suitable for use in a kernel generated
1491 * STATICLISTDEF message. Returns the length of @skb.
1492 *
1493 */
1494static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
1495 struct netlink_callback *cb)
1496{
1497 struct netlbl_unlhsh_walk_arg cb_arg;
1498 struct netlbl_unlhsh_iface *iface;
1499 u32 skip_addr4 = cb->args[0];
1500 u32 skip_addr6 = cb->args[1];
1501 u32 iter_addr4 = 0, iter_addr6 = 0;
1502 struct netlbl_unlhsh_addr4 *addr4;
1503 struct netlbl_unlhsh_addr6 *addr6;
1504
1505 cb_arg.nl_cb = cb;
1506 cb_arg.skb = skb;
1507 cb_arg.seq = cb->nlh->nlmsg_seq;
1508
1509 rcu_read_lock();
1510 iface = rcu_dereference(netlbl_unlhsh_def);
1511 if (iface == NULL || !iface->valid)
1512 goto unlabel_staticlistdef_return;
1513
1514 list_for_each_entry_rcu(addr4, &iface->addr4_list, list) {
1515 if (!addr4->valid || iter_addr4++ < skip_addr4)
1516 continue;
1517 if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
1518 iface,
1519 addr4,
1520 NULL,
1521 &cb_arg) < 0) {
1522 iter_addr4--;
1523 goto unlabel_staticlistdef_return;
1524 }
1525 }
1526 list_for_each_entry_rcu(addr6, &iface->addr6_list, list) {
1527		if (!addr6->valid || iter_addr6++ < skip_addr6)
1528 continue;
1529 if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
1530 iface,
1531 NULL,
1532 addr6,
1533 &cb_arg) < 0) {
1534 iter_addr6--;
1535 goto unlabel_staticlistdef_return;
1536 }
1537 }
1538
1539unlabel_staticlistdef_return:
1540 rcu_read_unlock();
1541	cb->args[0] = iter_addr4;
1542	cb->args[1] = iter_addr6;
1543 return skb->len;
1544}
178 1545
179/* 1546/*
180 * NetLabel Generic NETLINK Command Definitions 1547 * NetLabel Generic NETLINK Command Definitions
181 */ 1548 */
182 1549
1550static struct genl_ops netlbl_unlabel_genl_c_staticadd = {
1551 .cmd = NLBL_UNLABEL_C_STATICADD,
1552 .flags = GENL_ADMIN_PERM,
1553 .policy = netlbl_unlabel_genl_policy,
1554 .doit = netlbl_unlabel_staticadd,
1555 .dumpit = NULL,
1556};
1557
1558static struct genl_ops netlbl_unlabel_genl_c_staticremove = {
1559 .cmd = NLBL_UNLABEL_C_STATICREMOVE,
1560 .flags = GENL_ADMIN_PERM,
1561 .policy = netlbl_unlabel_genl_policy,
1562 .doit = netlbl_unlabel_staticremove,
1563 .dumpit = NULL,
1564};
1565
1566static struct genl_ops netlbl_unlabel_genl_c_staticlist = {
1567 .cmd = NLBL_UNLABEL_C_STATICLIST,
1568 .flags = 0,
1569 .policy = netlbl_unlabel_genl_policy,
1570 .doit = NULL,
1571 .dumpit = netlbl_unlabel_staticlist,
1572};
1573
1574static struct genl_ops netlbl_unlabel_genl_c_staticadddef = {
1575 .cmd = NLBL_UNLABEL_C_STATICADDDEF,
1576 .flags = GENL_ADMIN_PERM,
1577 .policy = netlbl_unlabel_genl_policy,
1578 .doit = netlbl_unlabel_staticadddef,
1579 .dumpit = NULL,
1580};
1581
1582static struct genl_ops netlbl_unlabel_genl_c_staticremovedef = {
1583 .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF,
1584 .flags = GENL_ADMIN_PERM,
1585 .policy = netlbl_unlabel_genl_policy,
1586 .doit = netlbl_unlabel_staticremovedef,
1587 .dumpit = NULL,
1588};
1589
1590static struct genl_ops netlbl_unlabel_genl_c_staticlistdef = {
1591 .cmd = NLBL_UNLABEL_C_STATICLISTDEF,
1592 .flags = 0,
1593 .policy = netlbl_unlabel_genl_policy,
1594 .doit = NULL,
1595 .dumpit = netlbl_unlabel_staticlistdef,
1596};
1597
183static struct genl_ops netlbl_unlabel_genl_c_accept = { 1598static struct genl_ops netlbl_unlabel_genl_c_accept = {
184 .cmd = NLBL_UNLABEL_C_ACCEPT, 1599 .cmd = NLBL_UNLABEL_C_ACCEPT,
185 .flags = GENL_ADMIN_PERM, 1600 .flags = GENL_ADMIN_PERM,
@@ -196,7 +1611,6 @@ static struct genl_ops netlbl_unlabel_genl_c_list = {
196 .dumpit = NULL, 1611 .dumpit = NULL,
197}; 1612};
198 1613
199
200/* 1614/*
201 * NetLabel Generic NETLINK Protocol Functions 1615 * NetLabel Generic NETLINK Protocol Functions
202 */ 1616 */
@@ -218,6 +1632,36 @@ int netlbl_unlabel_genl_init(void)
218 return ret_val; 1632 return ret_val;
219 1633
220 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, 1634 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
1635 &netlbl_unlabel_genl_c_staticadd);
1636 if (ret_val != 0)
1637 return ret_val;
1638
1639 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
1640 &netlbl_unlabel_genl_c_staticremove);
1641 if (ret_val != 0)
1642 return ret_val;
1643
1644 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
1645 &netlbl_unlabel_genl_c_staticlist);
1646 if (ret_val != 0)
1647 return ret_val;
1648
1649 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
1650 &netlbl_unlabel_genl_c_staticadddef);
1651 if (ret_val != 0)
1652 return ret_val;
1653
1654 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
1655 &netlbl_unlabel_genl_c_staticremovedef);
1656 if (ret_val != 0)
1657 return ret_val;
1658
1659 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
1660 &netlbl_unlabel_genl_c_staticlistdef);
1661 if (ret_val != 0)
1662 return ret_val;
1663
1664 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
221 &netlbl_unlabel_genl_c_accept); 1665 &netlbl_unlabel_genl_c_accept);
222 if (ret_val != 0) 1666 if (ret_val != 0)
223 return ret_val; 1667 return ret_val;
@@ -234,8 +1678,58 @@ int netlbl_unlabel_genl_init(void)
234 * NetLabel KAPI Hooks 1678 * NetLabel KAPI Hooks
235 */ 1679 */
236 1680
1681static struct notifier_block netlbl_unlhsh_netdev_notifier = {
1682 .notifier_call = netlbl_unlhsh_netdev_handler,
1683};
1684
1685/**
1686 * netlbl_unlabel_init - Initialize the unlabeled connection hash table
1687 * @size: the number of bits to use for the hash buckets
1688 *
1689 * Description:
1690 * Initializes the unlabeled connection hash table and registers a network
1691 * device notification handler. This function should only be called by the
1692 * NetLabel subsystem itself during initialization. Returns zero on success,
1693 * non-zero values on error.
1694 *
1695 */
1696int netlbl_unlabel_init(u32 size)
1697{
1698 u32 iter;
1699 struct netlbl_unlhsh_tbl *hsh_tbl;
1700
1701 if (size == 0)
1702 return -EINVAL;
1703
1704 hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
1705 if (hsh_tbl == NULL)
1706 return -ENOMEM;
1707 hsh_tbl->size = 1 << size;
1708 hsh_tbl->tbl = kcalloc(hsh_tbl->size,
1709 sizeof(struct list_head),
1710 GFP_KERNEL);
1711 if (hsh_tbl->tbl == NULL) {
1712 kfree(hsh_tbl);
1713 return -ENOMEM;
1714 }
1715 for (iter = 0; iter < hsh_tbl->size; iter++)
1716 INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);
1717
1718 rcu_read_lock();
1719 spin_lock(&netlbl_unlhsh_lock);
1720 rcu_assign_pointer(netlbl_unlhsh, hsh_tbl);
1721 spin_unlock(&netlbl_unlhsh_lock);
1722 rcu_read_unlock();
1723
1724 register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier);
1725
1726 return 0;
1727}
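
The size argument is a bit count rather than a bucket count, so the table ends up with 1 << size buckets (128 for NETLBL_UNLHSH_BITSIZE == 7). A minimal userspace sketch of the same sizing arithmetic:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        unsigned int bits = 7;                    /* NETLBL_UNLHSH_BITSIZE */
        size_t nbuckets = (size_t)1 << bits;      /* 128 buckets */
        void **tbl = calloc(nbuckets, sizeof(*tbl));

        if (tbl == NULL)
            return 1;
        printf("%u bits -> %zu buckets\n", bits, nbuckets);
        free(tbl);
        return 0;
    }
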
1728
237/** 1729/**
238 * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet 1730
1731 * @skb: the packet
1732 * @family: protocol family
239 * @secattr: the security attributes 1733 * @secattr: the security attributes
240 * 1734 *
241 * Description: 1735 * Description:
@@ -243,19 +1737,52 @@ int netlbl_unlabel_genl_init(void)
243 * them in @secattr. Returns zero on success and negative values on failure. 1737 * them in @secattr. Returns zero on success and negative values on failure.
244 * 1738 *
245 */ 1739 */
246int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr) 1740int netlbl_unlabel_getattr(const struct sk_buff *skb,
1741 u16 family,
1742 struct netlbl_lsm_secattr *secattr)
247{ 1743{
248 int ret_val; 1744 struct iphdr *hdr4;
1745 struct ipv6hdr *hdr6;
1746 struct netlbl_unlhsh_addr4 *addr4;
1747 struct netlbl_unlhsh_addr6 *addr6;
1748 struct netlbl_unlhsh_iface *iface;
249 1749
250 rcu_read_lock(); 1750 rcu_read_lock();
251 if (netlabel_unlabel_acceptflg == 1) { 1751 iface = netlbl_unlhsh_search_iface_def(skb->iif);
252 netlbl_secattr_init(secattr); 1752 if (iface == NULL)
253 ret_val = 0; 1753 goto unlabel_getattr_nolabel;
254 } else 1754 switch (family) {
255 ret_val = -ENOMSG; 1755 case PF_INET:
1756 hdr4 = ip_hdr(skb);
1757 addr4 = netlbl_unlhsh_search_addr4(hdr4->saddr, iface);
1758 if (addr4 == NULL)
1759 goto unlabel_getattr_nolabel;
1760 secattr->attr.secid = addr4->secid;
1761 break;
1762#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1763 case PF_INET6:
1764 hdr6 = ipv6_hdr(skb);
1765 addr6 = netlbl_unlhsh_search_addr6(&hdr6->saddr, iface);
1766 if (addr6 == NULL)
1767 goto unlabel_getattr_nolabel;
1768 secattr->attr.secid = addr6->secid;
1769 break;
1770#endif /* IPv6 */
1771 default:
1772 goto unlabel_getattr_nolabel;
1773 }
256 rcu_read_unlock(); 1774 rcu_read_unlock();
257 1775
258 return ret_val; 1776 secattr->flags |= NETLBL_SECATTR_SECID;
1777 secattr->type = NETLBL_NLTYPE_UNLABELED;
1778 return 0;
1779
1780unlabel_getattr_nolabel:
1781 rcu_read_unlock();
1782 if (netlabel_unlabel_acceptflg == 0)
1783 return -ENOMSG;
1784 secattr->type = NETLBL_NLTYPE_UNLABELED;
1785 return 0;
259} 1786}
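
The lookup order above is interface, then source address, then the entry's secid; if nothing matches, the packet is only allowed through unlabeled when the accept flag is set. A hypothetical restatement of that final decision in standalone C:

    #include <errno.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct static_match {
        bool found;
        uint32_t secid;
    };

    /* A matching static entry supplies its secid; otherwise the packet is
     * only acceptable (but unlabeled) when the accept flag is set. */
    int resolve_unlabeled_secid(struct static_match m, bool acceptflg,
                                uint32_t *secid)
    {
        if (m.found) {
            *secid = m.secid;
            return 0;
        }
        if (!acceptflg)
            return -ENOMSG;
        *secid = 0;   /* no label; the kernel just marks the type unlabeled */
        return 0;
    }
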
260 1787
261/** 1788/**
diff --git a/net/netlabel/netlabel_unlabeled.h b/net/netlabel/netlabel_unlabeled.h
index c2917fbb42cf..06b1301ac072 100644
--- a/net/netlabel/netlabel_unlabeled.h
+++ b/net/netlabel/netlabel_unlabeled.h
@@ -36,6 +36,116 @@
36/* 36/*
37 * The following NetLabel payloads are supported by the Unlabeled subsystem. 37 * The following NetLabel payloads are supported by the Unlabeled subsystem.
38 * 38 *
39 * o STATICADD
40 * This message is sent from an application to add a new static label for
41 * incoming unlabeled connections.
42 *
43 * Required attributes:
44 *
45 * NLBL_UNLABEL_A_IFACE
46 * NLBL_UNLABEL_A_SECCTX
47 *
48 * If IPv4 is specified the following attributes are required:
49 *
50 * NLBL_UNLABEL_A_IPV4ADDR
51 * NLBL_UNLABEL_A_IPV4MASK
52 *
53 * If IPv6 is specified the following attributes are required:
54 *
55 * NLBL_UNLABEL_A_IPV6ADDR
56 * NLBL_UNLABEL_A_IPV6MASK
57 *
58 * o STATICREMOVE
59 * This message is sent from an application to remove an existing static
60 * label for incoming unlabeled connections.
61 *
62 * Required attributes:
63 *
64 * NLBL_UNLABEL_A_IFACE
65 *
66 * If IPv4 is specified the following attributes are required:
67 *
68 * NLBL_UNLABEL_A_IPV4ADDR
69 * NLBL_UNLABEL_A_IPV4MASK
70 *
71 * If IPv6 is specified the following attributes are required:
72 *
73 * NLBL_UNLABEL_A_IPV6ADDR
74 * NLBL_UNLABEL_A_IPV6MASK
75 *
76 * o STATICLIST
77 * This message can be sent either from an application or by the kernel in
78 * response to an application generated STATICLIST message. When sent by an
79 * application there is no payload and the NLM_F_DUMP flag should be set.
80 * The kernel should respond with a series of the following messages.
81 *
82 * Required attributes:
83 *
84 * NLBL_UNLABEL_A_IFACE
85 * NLBL_UNLABEL_A_SECCTX
86 *
87 * If IPv4 is specified the following attributes are required:
88 *
89 * NLBL_UNLABEL_A_IPV4ADDR
90 * NLBL_UNLABEL_A_IPV4MASK
91 *
92 * If IPv6 is specified the following attributes are required:
93 *
94 * NLBL_UNLABEL_A_IPV6ADDR
95 * NLBL_UNLABEL_A_IPV6MASK
96 *
97 * o STATICADDDEF
98 * This message is sent from an application to set the default static
99 * label for incoming unlabeled connections.
100 *
101 * Required attribute:
102 *
103 * NLBL_UNLABEL_A_SECCTX
104 *
105 * If IPv4 is specified the following attributes are required:
106 *
107 * NLBL_UNLABEL_A_IPV4ADDR
108 * NLBL_UNLABEL_A_IPV4MASK
109 *
110 * If IPv6 is specified the following attributes are required:
111 *
112 * NLBL_UNLABEL_A_IPV6ADDR
113 * NLBL_UNLABEL_A_IPV6MASK
114 *
115 * o STATICREMOVEDEF
116 * This message is sent from an application to remove the existing default
117 * static label for incoming unlabeled connections.
118 *
119 * If IPv4 is specified the following attributes are required:
120 *
121 * NLBL_UNLABEL_A_IPV4ADDR
122 * NLBL_UNLABEL_A_IPV4MASK
123 *
124 * If IPv6 is specified the following attributes are required:
125 *
126 * NLBL_UNLABEL_A_IPV6ADDR
127 * NLBL_UNLABEL_A_IPV6MASK
128 *
129 * o STATICLISTDEF
130 * This message can be sent either from an application or by the kernel in
131 * response to an application generated STATICLISTDEF message. When sent by
132 * an application there is no payload and the NLM_F_DUMP flag should be set.
133 * The kernel should respond with the following message.
134 *
135 * Required attribute:
136 *
137 * NLBL_UNLABEL_A_SECCTX
138 *
139 * If IPv4 is specified the following attributes are required:
140 *
141 * NLBL_UNLABEL_A_IPV4ADDR
142 * NLBL_UNLABEL_A_IPV4MASK
143 *
144 * If IPv6 is specified the following attributes are required:
145 *
146 * NLBL_UNLABEL_A_IPV6ADDR
147 * NLBL_UNLABEL_A_IPV6MASK
148 *
39 * o ACCEPT 149 * o ACCEPT
40 * This message is sent from an application to specify if the kernel should 150 * This message is sent from an application to specify if the kernel should
41 * allow unlabeled packets to pass if they do not match any of the static 151
@@ -62,6 +172,12 @@ enum {
62 NLBL_UNLABEL_C_UNSPEC, 172 NLBL_UNLABEL_C_UNSPEC,
63 NLBL_UNLABEL_C_ACCEPT, 173 NLBL_UNLABEL_C_ACCEPT,
64 NLBL_UNLABEL_C_LIST, 174 NLBL_UNLABEL_C_LIST,
175 NLBL_UNLABEL_C_STATICADD,
176 NLBL_UNLABEL_C_STATICREMOVE,
177 NLBL_UNLABEL_C_STATICLIST,
178 NLBL_UNLABEL_C_STATICADDDEF,
179 NLBL_UNLABEL_C_STATICREMOVEDEF,
180 NLBL_UNLABEL_C_STATICLISTDEF,
65 __NLBL_UNLABEL_C_MAX, 181 __NLBL_UNLABEL_C_MAX,
66}; 182};
67#define NLBL_UNLABEL_C_MAX (__NLBL_UNLABEL_C_MAX - 1) 183#define NLBL_UNLABEL_C_MAX (__NLBL_UNLABEL_C_MAX - 1)
@@ -73,6 +189,24 @@ enum {
73 /* (NLA_U8) 189 /* (NLA_U8)
74 * if true then unlabeled packets are allowed to pass, else unlabeled 190 * if true then unlabeled packets are allowed to pass, else unlabeled
75 * packets are rejected */ 191 * packets are rejected */
192 NLBL_UNLABEL_A_IPV6ADDR,
193 /* (NLA_BINARY, struct in6_addr)
194 * an IPv6 address */
195 NLBL_UNLABEL_A_IPV6MASK,
196 /* (NLA_BINARY, struct in6_addr)
197 * an IPv6 address mask */
198 NLBL_UNLABEL_A_IPV4ADDR,
199 /* (NLA_BINARY, struct in_addr)
200 * an IPv4 address */
201 NLBL_UNLABEL_A_IPV4MASK,
202 /* (NLA_BINARY, struct in_addr)
203 * an IPv4 address mask */
204 NLBL_UNLABEL_A_IFACE,
205 /* (NLA_NULL_STRING)
206 * network interface */
207 NLBL_UNLABEL_A_SECCTX,
208 /* (NLA_BINARY)
209 * a LSM specific security context */
76 __NLBL_UNLABEL_A_MAX, 210 __NLBL_UNLABEL_A_MAX,
77}; 211};
78#define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1) 212#define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1)
@@ -80,8 +214,17 @@ enum {
80/* NetLabel protocol functions */ 214/* NetLabel protocol functions */
81int netlbl_unlabel_genl_init(void); 215int netlbl_unlabel_genl_init(void);
82 216
217/* Unlabeled connection hash table size */
218/* XXX - currently this number is an uneducated guess */
219#define NETLBL_UNLHSH_BITSIZE 7
220
221/* General Unlabeled init function */
222int netlbl_unlabel_init(u32 size);
223
83/* Process Unlabeled incoming network packets */ 224/* Process Unlabeled incoming network packets */
84int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr); 225int netlbl_unlabel_getattr(const struct sk_buff *skb,
226 u16 family,
227 struct netlbl_lsm_secattr *secattr);
85 228
86/* Set the default configuration to allow Unlabeled packets */ 229/* Set the default configuration to allow Unlabeled packets */
87int netlbl_unlabel_defconf(void); 230int netlbl_unlabel_defconf(void);
diff --git a/security/Kconfig b/security/Kconfig
index 8086e61058e3..389e151e3b68 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -76,6 +76,7 @@ config SECURITY_NETWORK_XFRM
76config SECURITY_CAPABILITIES 76config SECURITY_CAPABILITIES
77 bool "Default Linux Capabilities" 77 bool "Default Linux Capabilities"
78 depends on SECURITY 78 depends on SECURITY
79 default y
79 help 80 help
80 This enables the "default" Linux capabilities functionality. 81 This enables the "default" Linux capabilities functionality.
81 If you are unsure how to answer this question, answer Y. 82 If you are unsure how to answer this question, answer Y.
diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig
index b32a459c0683..2b517d618672 100644
--- a/security/selinux/Kconfig
+++ b/security/selinux/Kconfig
@@ -145,7 +145,7 @@ config SECURITY_SELINUX_POLICYDB_VERSION_MAX
145config SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE 145config SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
146 int "NSA SELinux maximum supported policy format version value" 146 int "NSA SELinux maximum supported policy format version value"
147 depends on SECURITY_SELINUX_POLICYDB_VERSION_MAX 147 depends on SECURITY_SELINUX_POLICYDB_VERSION_MAX
148 range 15 21 148 range 15 22
149 default 19 149 default 19
150 help 150 help
151 This option sets the value for the maximum policy format version 151 This option sets the value for the maximum policy format version
diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index dc3502e30b19..00afd85f1edb 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -4,7 +4,14 @@
4 4
5obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/ 5obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/
6 6
7selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o 7selinux-y := avc.o \
8 hooks.o \
9 selinuxfs.o \
10 netlink.o \
11 nlmsgtab.o \
12 netif.o \
13 netnode.o \
14 exports.o
8 15
9selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o 16selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
10 17
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 81b3dff3cbf0..e8529e2f51e5 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -661,9 +661,18 @@ void avc_audit(u32 ssid, u32 tsid,
661 "daddr", "dest"); 661 "daddr", "dest");
662 break; 662 break;
663 } 663 }
664 if (a->u.net.netif) 664 if (a->u.net.netif > 0) {
665 audit_log_format(ab, " netif=%s", 665 struct net_device *dev;
666 a->u.net.netif); 666
667 /* NOTE: we always use init's namespace */
668 dev = dev_get_by_index(&init_net,
669 a->u.net.netif);
670 if (dev) {
671 audit_log_format(ab, " netif=%s",
672 dev->name);
673 dev_put(dev);
674 }
675 }
667 break; 676 break;
668 } 677 }
669 } 678 }
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index b6f96943be1f..87d2bb3ea355 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -17,10 +17,14 @@
17#include <linux/selinux.h> 17#include <linux/selinux.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/ipc.h> 19#include <linux/ipc.h>
20#include <asm/atomic.h>
20 21
21#include "security.h" 22#include "security.h"
22#include "objsec.h" 23#include "objsec.h"
23 24
25/* SECMARK reference count */
26extern atomic_t selinux_secmark_refcount;
27
24int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen) 28int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen)
25{ 29{
26 if (selinux_enabled) 30 if (selinux_enabled)
@@ -74,7 +78,7 @@ int selinux_string_to_sid(char *str, u32 *sid)
74} 78}
75EXPORT_SYMBOL_GPL(selinux_string_to_sid); 79EXPORT_SYMBOL_GPL(selinux_string_to_sid);
76 80
77int selinux_relabel_packet_permission(u32 sid) 81int selinux_secmark_relabel_packet_permission(u32 sid)
78{ 82{
79 if (selinux_enabled) { 83 if (selinux_enabled) {
80 struct task_security_struct *tsec = current->security; 84 struct task_security_struct *tsec = current->security;
@@ -84,4 +88,16 @@ int selinux_relabel_packet_permission(u32 sid)
84 } 88 }
85 return 0; 89 return 0;
86} 90}
87EXPORT_SYMBOL_GPL(selinux_relabel_packet_permission); 91EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission);
92
93void selinux_secmark_refcount_inc(void)
94{
95 atomic_inc(&selinux_secmark_refcount);
96}
97EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc);
98
99void selinux_secmark_refcount_dec(void)
100{
101 atomic_dec(&selinux_secmark_refcount);
102}
103EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 64d414efb404..be6de0b8734f 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -12,8 +12,8 @@
12 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> 12 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
13 * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. 13 * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
14 * <dgoeddel@trustedcs.com> 14 * <dgoeddel@trustedcs.com>
15 * Copyright (C) 2006 Hewlett-Packard Development Company, L.P. 15 * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P.
16 * Paul Moore, <paul.moore@hp.com> 16 * Paul Moore <paul.moore@hp.com>
17 * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd. 17 * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd.
18 * Yuichi Nakamura <ynakam@hitachisoft.jp> 18 * Yuichi Nakamura <ynakam@hitachisoft.jp>
19 * 19 *
@@ -50,8 +50,11 @@
50#include <net/icmp.h> 50#include <net/icmp.h>
51#include <net/ip.h> /* for local_port_range[] */ 51#include <net/ip.h> /* for local_port_range[] */
52#include <net/tcp.h> /* struct or_callable used in sock_rcv_skb */ 52#include <net/tcp.h> /* struct or_callable used in sock_rcv_skb */
53#include <net/net_namespace.h>
54#include <net/netlabel.h>
53#include <asm/uaccess.h> 55#include <asm/uaccess.h>
54#include <asm/ioctls.h> 56#include <asm/ioctls.h>
57#include <asm/atomic.h>
55#include <linux/bitops.h> 58#include <linux/bitops.h>
56#include <linux/interrupt.h> 59#include <linux/interrupt.h>
57#include <linux/netdevice.h> /* for network interface checks */ 60#include <linux/netdevice.h> /* for network interface checks */
@@ -76,6 +79,7 @@
76#include "avc.h" 79#include "avc.h"
77#include "objsec.h" 80#include "objsec.h"
78#include "netif.h" 81#include "netif.h"
82#include "netnode.h"
79#include "xfrm.h" 83#include "xfrm.h"
80#include "netlabel.h" 84#include "netlabel.h"
81 85
@@ -89,6 +93,9 @@ extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm);
89extern int selinux_compat_net; 93extern int selinux_compat_net;
90extern struct security_operations *security_ops; 94extern struct security_operations *security_ops;
91 95
96/* SECMARK reference count */
97atomic_t selinux_secmark_refcount = ATOMIC_INIT(0);
98
92#ifdef CONFIG_SECURITY_SELINUX_DEVELOP 99#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
93int selinux_enforcing = 0; 100int selinux_enforcing = 0;
94 101
@@ -155,6 +162,21 @@ getsecurity_exit:
155 return len; 162 return len;
156} 163}
157 164
165/**
166 * selinux_secmark_enabled - Check to see if SECMARK is currently enabled
167 *
168 * Description:
169 * This function checks the SECMARK reference counter to see if any SECMARK
170 * targets are currently configured; if the reference counter is greater than
171 * zero, SECMARK is considered to be enabled. Returns true (1) if SECMARK is
172 * enabled, false (0) if SECMARK is disabled.
173 *
174 */
175static int selinux_secmark_enabled(void)
176{
177 return (atomic_read(&selinux_secmark_refcount) > 0);
178}
179
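
A userspace sketch of the same reference-count idea (hypothetical helpers, C11 atomics standing in for the kernel's atomic_t):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int secmark_refcount = 0;

    /* Rules increment the count when added and decrement it when removed. */
    void secmark_rule_added(void)
    {
        atomic_fetch_add(&secmark_refcount, 1);
    }

    void secmark_rule_removed(void)
    {
        atomic_fetch_sub(&secmark_refcount, 1);
    }

    /* "Enabled" simply means at least one rule currently references SECMARK. */
    bool secmark_enabled(void)
    {
        return atomic_load(&secmark_refcount) > 0;
    }
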
158/* Allocate and free functions for each kind of security blob. */ 180/* Allocate and free functions for each kind of security blob. */
159 181
160static int task_alloc_security(struct task_struct *task) 182static int task_alloc_security(struct task_struct *task)
@@ -561,8 +583,8 @@ static int bad_option(struct superblock_security_struct *sbsec, char flag,
561 * Allow filesystems with binary mount data to explicitly set mount point 583 * Allow filesystems with binary mount data to explicitly set mount point
562 * labeling information. 584 * labeling information.
563 */ 585 */
564int selinux_set_mnt_opts(struct super_block *sb, char **mount_options, 586static int selinux_set_mnt_opts(struct super_block *sb, char **mount_options,
565 int *flags, int num_opts) 587 int *flags, int num_opts)
566{ 588{
567 int rc = 0, i; 589 int rc = 0, i;
568 struct task_security_struct *tsec = current->security; 590 struct task_security_struct *tsec = current->security;
@@ -3395,7 +3417,7 @@ out:
3395#endif /* IPV6 */ 3417#endif /* IPV6 */
3396 3418
3397static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, 3419static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
3398 char **addrp, int *len, int src, u8 *proto) 3420 char **addrp, int src, u8 *proto)
3399{ 3421{
3400 int ret = 0; 3422 int ret = 0;
3401 3423
@@ -3404,7 +3426,6 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
3404 ret = selinux_parse_skb_ipv4(skb, ad, proto); 3426 ret = selinux_parse_skb_ipv4(skb, ad, proto);
3405 if (ret || !addrp) 3427 if (ret || !addrp)
3406 break; 3428 break;
3407 *len = 4;
3408 *addrp = (char *)(src ? &ad->u.net.v4info.saddr : 3429 *addrp = (char *)(src ? &ad->u.net.v4info.saddr :
3409 &ad->u.net.v4info.daddr); 3430 &ad->u.net.v4info.daddr);
3410 break; 3431 break;
@@ -3414,7 +3435,6 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
3414 ret = selinux_parse_skb_ipv6(skb, ad, proto); 3435 ret = selinux_parse_skb_ipv6(skb, ad, proto);
3415 if (ret || !addrp) 3436 if (ret || !addrp)
3416 break; 3437 break;
3417 *len = 16;
3418 *addrp = (char *)(src ? &ad->u.net.v6info.saddr : 3438 *addrp = (char *)(src ? &ad->u.net.v6info.saddr :
3419 &ad->u.net.v6info.daddr); 3439 &ad->u.net.v6info.daddr);
3420 break; 3440 break;
@@ -3423,36 +3443,48 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
3423 break; 3443 break;
3424 } 3444 }
3425 3445
3446 if (unlikely(ret))
3447 printk(KERN_WARNING
3448 "SELinux: failure in selinux_parse_skb(),"
3449 " unable to parse packet\n");
3450
3426 return ret; 3451 return ret;
3427} 3452}
3428 3453
3429/** 3454/**
3430 * selinux_skb_extlbl_sid - Determine the external label of a packet 3455 * selinux_skb_peerlbl_sid - Determine the peer label of a packet
3431 * @skb: the packet 3456 * @skb: the packet
3432 * @sid: the packet's SID 3457 * @family: protocol family
3458 * @sid: the packet's peer label SID
3433 * 3459 *
3434 * Description: 3460 * Description:
3435 * Check the various different forms of external packet labeling and determine 3461 * Check the various different forms of network peer labeling and determine
3436 * the external SID for the packet. If only one form of external labeling is 3462 * the peer label/SID for the packet; most of the magic actually occurs in
3437 * present then it is used, if both labeled IPsec and NetLabel labels are 3463 * the security server function security_net_peersid_cmp(). The function
3438 * present then the SELinux type information is taken from the labeled IPsec 3464 * returns zero if the value in @sid is valid (although it may be SECSID_NULL)
3439 * SA and the MLS sensitivity label information is taken from the NetLabel 3465 * or -EACCES if @sid is invalid due to inconsistencies with the different
3440 * security attributes. This bit of "magic" is done in the call to 3466 * peer labels.
3441 * selinux_netlbl_skbuff_getsid().
3442 * 3467 *
3443 */ 3468 */
3444static void selinux_skb_extlbl_sid(struct sk_buff *skb, u32 *sid) 3469static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid)
3445{ 3470{
3471 int err;
3446 u32 xfrm_sid; 3472 u32 xfrm_sid;
3447 u32 nlbl_sid; 3473 u32 nlbl_sid;
3474 u32 nlbl_type;
3448 3475
3449 selinux_skb_xfrm_sid(skb, &xfrm_sid); 3476 selinux_skb_xfrm_sid(skb, &xfrm_sid);
3450 if (selinux_netlbl_skbuff_getsid(skb, 3477 selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid);
3451 (xfrm_sid == SECSID_NULL ? 3478
3452 SECINITSID_NETMSG : xfrm_sid), 3479 err = security_net_peersid_resolve(nlbl_sid, nlbl_type, xfrm_sid, sid);
3453 &nlbl_sid) != 0) 3480 if (unlikely(err)) {
3454 nlbl_sid = SECSID_NULL; 3481 printk(KERN_WARNING
3455 *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid); 3482 "SELinux: failure in selinux_skb_peerlbl_sid(),"
3483 " unable to determine packet's peer label\n");
3484 return -EACCES;
3485 }
3486
3487 return 0;
3456} 3488}
3457 3489
3458/* socket security operations */ 3490/* socket security operations */
@@ -3518,6 +3550,7 @@ static int selinux_socket_post_create(struct socket *sock, int family,
3518 if (sock->sk) { 3550 if (sock->sk) {
3519 sksec = sock->sk->sk_security; 3551 sksec = sock->sk->sk_security;
3520 sksec->sid = isec->sid; 3552 sksec->sid = isec->sid;
3553 sksec->sclass = isec->sclass;
3521 err = selinux_netlbl_socket_post_create(sock); 3554 err = selinux_netlbl_socket_post_create(sock);
3522 } 3555 }
3523 3556
@@ -3610,7 +3643,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
3610 break; 3643 break;
3611 } 3644 }
3612 3645
3613 err = security_node_sid(family, addrp, addrlen, &sid); 3646 err = sel_netnode_sid(addrp, family, &sid);
3614 if (err) 3647 if (err)
3615 goto out; 3648 goto out;
3616 3649
@@ -3821,131 +3854,182 @@ static int selinux_socket_unix_may_send(struct socket *sock,
3821 return 0; 3854 return 0;
3822} 3855}
3823 3856
3824static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, 3857static int selinux_inet_sys_rcv_skb(int ifindex, char *addrp, u16 family,
3825 struct avc_audit_data *ad, u16 family, char *addrp, int len) 3858 u32 peer_sid,
3859 struct avc_audit_data *ad)
3826{ 3860{
3827 int err = 0; 3861 int err;
3828 u32 netif_perm, node_perm, node_sid, if_sid, recv_perm = 0; 3862 u32 if_sid;
3829 struct socket *sock; 3863 u32 node_sid;
3830 u16 sock_class = 0;
3831 u32 sock_sid = 0;
3832
3833 read_lock_bh(&sk->sk_callback_lock);
3834 sock = sk->sk_socket;
3835 if (sock) {
3836 struct inode *inode;
3837 inode = SOCK_INODE(sock);
3838 if (inode) {
3839 struct inode_security_struct *isec;
3840 isec = inode->i_security;
3841 sock_sid = isec->sid;
3842 sock_class = isec->sclass;
3843 }
3844 }
3845 read_unlock_bh(&sk->sk_callback_lock);
3846 if (!sock_sid)
3847 goto out;
3848 3864
3849 if (!skb->dev) 3865 err = sel_netif_sid(ifindex, &if_sid);
3850 goto out; 3866 if (err)
3867 return err;
3868 err = avc_has_perm(peer_sid, if_sid,
3869 SECCLASS_NETIF, NETIF__INGRESS, ad);
3870 if (err)
3871 return err;
3851 3872
3852 err = sel_netif_sids(skb->dev, &if_sid, NULL); 3873 err = sel_netnode_sid(addrp, family, &node_sid);
3853 if (err) 3874 if (err)
3854 goto out; 3875 return err;
3876 return avc_has_perm(peer_sid, node_sid,
3877 SECCLASS_NODE, NODE__RECVFROM, ad);
3878}
3879
3880static int selinux_sock_rcv_skb_iptables_compat(struct sock *sk,
3881 struct sk_buff *skb,
3882 struct avc_audit_data *ad,
3883 u16 family,
3884 char *addrp)
3885{
3886 int err;
3887 struct sk_security_struct *sksec = sk->sk_security;
3888 u16 sk_class;
3889 u32 netif_perm, node_perm, recv_perm;
3890 u32 port_sid, node_sid, if_sid, sk_sid;
3855 3891
3856 switch (sock_class) { 3892 sk_sid = sksec->sid;
3893 sk_class = sksec->sclass;
3894
3895 switch (sk_class) {
3857 case SECCLASS_UDP_SOCKET: 3896 case SECCLASS_UDP_SOCKET:
3858 netif_perm = NETIF__UDP_RECV; 3897 netif_perm = NETIF__UDP_RECV;
3859 node_perm = NODE__UDP_RECV; 3898 node_perm = NODE__UDP_RECV;
3860 recv_perm = UDP_SOCKET__RECV_MSG; 3899 recv_perm = UDP_SOCKET__RECV_MSG;
3861 break; 3900 break;
3862
3863 case SECCLASS_TCP_SOCKET: 3901 case SECCLASS_TCP_SOCKET:
3864 netif_perm = NETIF__TCP_RECV; 3902 netif_perm = NETIF__TCP_RECV;
3865 node_perm = NODE__TCP_RECV; 3903 node_perm = NODE__TCP_RECV;
3866 recv_perm = TCP_SOCKET__RECV_MSG; 3904 recv_perm = TCP_SOCKET__RECV_MSG;
3867 break; 3905 break;
3868
3869 case SECCLASS_DCCP_SOCKET: 3906 case SECCLASS_DCCP_SOCKET:
3870 netif_perm = NETIF__DCCP_RECV; 3907 netif_perm = NETIF__DCCP_RECV;
3871 node_perm = NODE__DCCP_RECV; 3908 node_perm = NODE__DCCP_RECV;
3872 recv_perm = DCCP_SOCKET__RECV_MSG; 3909 recv_perm = DCCP_SOCKET__RECV_MSG;
3873 break; 3910 break;
3874
3875 default: 3911 default:
3876 netif_perm = NETIF__RAWIP_RECV; 3912 netif_perm = NETIF__RAWIP_RECV;
3877 node_perm = NODE__RAWIP_RECV; 3913 node_perm = NODE__RAWIP_RECV;
3914 recv_perm = 0;
3878 break; 3915 break;
3879 } 3916 }
3880 3917
3881 err = avc_has_perm(sock_sid, if_sid, SECCLASS_NETIF, netif_perm, ad); 3918 err = sel_netif_sid(skb->iif, &if_sid);
3882 if (err) 3919 if (err)
3883 goto out; 3920 return err;
3884 3921 err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
3885 err = security_node_sid(family, addrp, len, &node_sid);
3886 if (err) 3922 if (err)
3887 goto out; 3923 return err;
3888 3924
3889 err = avc_has_perm(sock_sid, node_sid, SECCLASS_NODE, node_perm, ad); 3925 err = sel_netnode_sid(addrp, family, &node_sid);
3890 if (err) 3926 if (err)
3891 goto out; 3927 return err;
3928 err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad);
3929 if (err)
3930 return err;
3892 3931
3893 if (recv_perm) { 3932 if (!recv_perm)
3894 u32 port_sid; 3933 return 0;
3934 err = security_port_sid(sk->sk_family, sk->sk_type,
3935 sk->sk_protocol, ntohs(ad->u.net.sport),
3936 &port_sid);
3937 if (unlikely(err)) {
3938 printk(KERN_WARNING
3939 "SELinux: failure in"
3940 " selinux_sock_rcv_skb_iptables_compat(),"
3941 " network port label not found\n");
3942 return err;
3943 }
3944 return avc_has_perm(sk_sid, port_sid, sk_class, recv_perm, ad);
3945}
3895 3946
3896 err = security_port_sid(sk->sk_family, sk->sk_type, 3947static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
3897 sk->sk_protocol, ntohs(ad->u.net.sport), 3948 struct avc_audit_data *ad,
3898 &port_sid); 3949 u16 family, char *addrp)
3899 if (err) 3950{
3900 goto out; 3951 int err;
3952 struct sk_security_struct *sksec = sk->sk_security;
3953 u32 peer_sid;
3954 u32 sk_sid = sksec->sid;
3901 3955
3902 err = avc_has_perm(sock_sid, port_sid, 3956 if (selinux_compat_net)
3903 sock_class, recv_perm, ad); 3957 err = selinux_sock_rcv_skb_iptables_compat(sk, skb, ad,
3958 family, addrp);
3959 else
3960 err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
3961 PACKET__RECV, ad);
3962 if (err)
3963 return err;
3964
3965 if (selinux_policycap_netpeer) {
3966 err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
3967 if (err)
3968 return err;
3969 err = avc_has_perm(sk_sid, peer_sid,
3970 SECCLASS_PEER, PEER__RECV, ad);
3971 } else {
3972 err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, ad);
3973 if (err)
3974 return err;
3975 err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, ad);
3904 } 3976 }
3905 3977
3906out:
3907 return err; 3978 return err;
3908} 3979}
3909 3980
3910static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) 3981static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
3911{ 3982{
3912 u16 family; 3983 int err;
3913 char *addrp;
3914 int len, err = 0;
3915 struct avc_audit_data ad;
3916 struct sk_security_struct *sksec = sk->sk_security; 3984 struct sk_security_struct *sksec = sk->sk_security;
3985 u16 family = sk->sk_family;
3986 u32 sk_sid = sksec->sid;
3987 struct avc_audit_data ad;
3988 char *addrp;
3917 3989
3918 family = sk->sk_family;
3919 if (family != PF_INET && family != PF_INET6) 3990 if (family != PF_INET && family != PF_INET6)
3920 goto out; 3991 return 0;
3921 3992
3922 /* Handle mapped IPv4 packets arriving via IPv6 sockets */ 3993 /* Handle mapped IPv4 packets arriving via IPv6 sockets */
3923 if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) 3994 if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
3924 family = PF_INET; 3995 family = PF_INET;
3925 3996
3926 AVC_AUDIT_DATA_INIT(&ad, NET); 3997 AVC_AUDIT_DATA_INIT(&ad, NET);
3927 ad.u.net.netif = skb->dev ? skb->dev->name : "[unknown]"; 3998 ad.u.net.netif = skb->iif;
3928 ad.u.net.family = family; 3999 ad.u.net.family = family;
3929 4000 err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
3930 err = selinux_parse_skb(skb, &ad, &addrp, &len, 1, NULL);
3931 if (err) 4001 if (err)
3932 goto out; 4002 return err;
3933 4003
3934 if (selinux_compat_net) 4004 /* If any sort of compatibility mode is enabled then handoff processing
3935 err = selinux_sock_rcv_skb_compat(sk, skb, &ad, family, 4005 * to the selinux_sock_rcv_skb_compat() function to deal with the
3936 addrp, len); 4006 * special handling. We do this in an attempt to keep this function
3937 else 4007 * as fast and as clean as possible. */
3938 err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET, 4008 if (selinux_compat_net || !selinux_policycap_netpeer)
3939 PACKET__RECV, &ad); 4009 return selinux_sock_rcv_skb_compat(sk, skb, &ad,
3940 if (err) 4010 family, addrp);
3941 goto out;
3942 4011
3943 err = selinux_netlbl_sock_rcv_skb(sksec, skb, &ad); 4012 if (netlbl_enabled() || selinux_xfrm_enabled()) {
3944 if (err) 4013 u32 peer_sid;
3945 goto out; 4014
4015 err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
4016 if (err)
4017 return err;
4018 err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family,
4019 peer_sid, &ad);
4020 if (err)
4021 return err;
4022 err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER,
4023 PEER__RECV, &ad);
4024 }
4025
4026 if (selinux_secmark_enabled()) {
4027 err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
4028 PACKET__RECV, &ad);
4029 if (err)
4030 return err;
4031 }
3946 4032
3947 err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad);
3948out:
3949 return err; 4033 return err;
3950} 4034}
3951 4035
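
The reworked receive hook first decides which processing path applies: any compatibility mode (or a policy without the network peer capability) is handed to the compat handler, otherwise the peer-label checks run, optionally followed by a SECMARK check. A hypothetical restatement of just that dispatch:

    #include <stdbool.h>

    enum rcv_path {
        RCV_PATH_COMPAT,
        RCV_PATH_PEER,
    };

    /* Mirrors the "handoff to compat" test at the top of the receive hook. */
    enum rcv_path pick_rcv_path(bool compat_net, bool policycap_netpeer)
    {
        if (compat_net || !policycap_netpeer)
            return RCV_PATH_COMPAT;
        return RCV_PATH_PEER;
    }
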
@@ -3996,18 +4080,25 @@ out:
3996static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) 4080static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid)
3997{ 4081{
3998 u32 peer_secid = SECSID_NULL; 4082 u32 peer_secid = SECSID_NULL;
3999 int err = 0; 4083 u16 family;
4084
4085 if (sock)
4086 family = sock->sk->sk_family;
4087 else if (skb && skb->sk)
4088 family = skb->sk->sk_family;
4089 else
4090 goto out;
4000 4091
4001 if (sock && sock->sk->sk_family == PF_UNIX) 4092 if (sock && family == PF_UNIX)
4002 selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid); 4093 selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid);
4003 else if (skb) 4094 else if (skb)
4004 selinux_skb_extlbl_sid(skb, &peer_secid); 4095 selinux_skb_peerlbl_sid(skb, family, &peer_secid);
4005 4096
4006 if (peer_secid == SECSID_NULL) 4097out:
4007 err = -EINVAL;
4008 *secid = peer_secid; 4098 *secid = peer_secid;
4009 4099 if (peer_secid == SECSID_NULL)
4010 return err; 4100 return -EINVAL;
4101 return 0;
4011} 4102}
4012 4103
4013static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority) 4104static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority)
@@ -4027,6 +4118,7 @@ static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk)
4027 4118
4028 newssec->sid = ssec->sid; 4119 newssec->sid = ssec->sid;
4029 newssec->peer_sid = ssec->peer_sid; 4120 newssec->peer_sid = ssec->peer_sid;
4121 newssec->sclass = ssec->sclass;
4030 4122
4031 selinux_netlbl_sk_security_clone(ssec, newssec); 4123 selinux_netlbl_sk_security_clone(ssec, newssec);
4032} 4124}
@@ -4050,6 +4142,7 @@ static void selinux_sock_graft(struct sock* sk, struct socket *parent)
4050 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 || 4142 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 ||
4051 sk->sk_family == PF_UNIX) 4143 sk->sk_family == PF_UNIX)
4052 isec->sid = sksec->sid; 4144 isec->sid = sksec->sid;
4145 sksec->sclass = isec->sclass;
4053 4146
4054 selinux_netlbl_sock_graft(sk, parent); 4147 selinux_netlbl_sock_graft(sk, parent);
4055} 4148}
@@ -4062,7 +4155,9 @@ static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb,
4062 u32 newsid; 4155 u32 newsid;
4063 u32 peersid; 4156 u32 peersid;
4064 4157
4065 selinux_skb_extlbl_sid(skb, &peersid); 4158 err = selinux_skb_peerlbl_sid(skb, sk->sk_family, &peersid);
4159 if (err)
4160 return err;
4066 if (peersid == SECSID_NULL) { 4161 if (peersid == SECSID_NULL) {
4067 req->secid = sksec->sid; 4162 req->secid = sksec->sid;
4068 req->peer_secid = SECSID_NULL; 4163 req->peer_secid = SECSID_NULL;
@@ -4100,7 +4195,7 @@ static void selinux_inet_conn_established(struct sock *sk,
4100{ 4195{
4101 struct sk_security_struct *sksec = sk->sk_security; 4196 struct sk_security_struct *sksec = sk->sk_security;
4102 4197
4103 selinux_skb_extlbl_sid(skb, &sksec->peer_sid); 4198 selinux_skb_peerlbl_sid(skb, sk->sk_family, &sksec->peer_sid);
4104} 4199}
4105 4200
4106static void selinux_req_classify_flow(const struct request_sock *req, 4201static void selinux_req_classify_flow(const struct request_sock *req,
@@ -4147,149 +4242,260 @@ out:
4147 4242
4148#ifdef CONFIG_NETFILTER 4243#ifdef CONFIG_NETFILTER
4149 4244
4150static int selinux_ip_postroute_last_compat(struct sock *sk, struct net_device *dev, 4245static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex,
4151 struct avc_audit_data *ad, 4246 u16 family)
4152 u16 family, char *addrp, int len)
4153{ 4247{
4154 int err = 0; 4248 char *addrp;
4155 u32 netif_perm, node_perm, node_sid, if_sid, send_perm = 0; 4249 u32 peer_sid;
4156 struct socket *sock; 4250 struct avc_audit_data ad;
4157 struct inode *inode; 4251 u8 secmark_active;
4158 struct inode_security_struct *isec; 4252 u8 peerlbl_active;
4159 4253
4160 sock = sk->sk_socket; 4254 if (!selinux_policycap_netpeer)
4161 if (!sock) 4255 return NF_ACCEPT;
4162 goto out;
4163 4256
4164 inode = SOCK_INODE(sock); 4257 secmark_active = selinux_secmark_enabled();
4165 if (!inode) 4258 peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
4166 goto out; 4259 if (!secmark_active && !peerlbl_active)
4260 return NF_ACCEPT;
4167 4261
4168 isec = inode->i_security; 4262 AVC_AUDIT_DATA_INIT(&ad, NET);
4169 4263 ad.u.net.netif = ifindex;
4170 err = sel_netif_sids(dev, &if_sid, NULL); 4264 ad.u.net.family = family;
4171 if (err) 4265 if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0)
4172 goto out; 4266 return NF_DROP;
4267
4268 if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0)
4269 return NF_DROP;
4270
4271 if (peerlbl_active)
4272 if (selinux_inet_sys_rcv_skb(ifindex, addrp, family,
4273 peer_sid, &ad) != 0)
4274 return NF_DROP;
4275
4276 if (secmark_active)
4277 if (avc_has_perm(peer_sid, skb->secmark,
4278 SECCLASS_PACKET, PACKET__FORWARD_IN, &ad))
4279 return NF_DROP;
4280
4281 return NF_ACCEPT;
4282}
4283
4284static unsigned int selinux_ipv4_forward(unsigned int hooknum,
4285 struct sk_buff *skb,
4286 const struct net_device *in,
4287 const struct net_device *out,
4288 int (*okfn)(struct sk_buff *))
4289{
4290 return selinux_ip_forward(skb, in->ifindex, PF_INET);
4291}
4173 4292
4174 switch (isec->sclass) { 4293#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
4294static unsigned int selinux_ipv6_forward(unsigned int hooknum,
4295 struct sk_buff *skb,
4296 const struct net_device *in,
4297 const struct net_device *out,
4298 int (*okfn)(struct sk_buff *))
4299{
4300 return selinux_ip_forward(skb, in->ifindex, PF_INET6);
4301}
4302#endif /* IPV6 */
4303
4304static int selinux_ip_postroute_iptables_compat(struct sock *sk,
4305 int ifindex,
4306 struct avc_audit_data *ad,
4307 u16 family, char *addrp)
4308{
4309 int err;
4310 struct sk_security_struct *sksec = sk->sk_security;
4311 u16 sk_class;
4312 u32 netif_perm, node_perm, send_perm;
4313 u32 port_sid, node_sid, if_sid, sk_sid;
4314
4315 sk_sid = sksec->sid;
4316 sk_class = sksec->sclass;
4317
4318 switch (sk_class) {
4175 case SECCLASS_UDP_SOCKET: 4319 case SECCLASS_UDP_SOCKET:
4176 netif_perm = NETIF__UDP_SEND; 4320 netif_perm = NETIF__UDP_SEND;
4177 node_perm = NODE__UDP_SEND; 4321 node_perm = NODE__UDP_SEND;
4178 send_perm = UDP_SOCKET__SEND_MSG; 4322 send_perm = UDP_SOCKET__SEND_MSG;
4179 break; 4323 break;
4180
4181 case SECCLASS_TCP_SOCKET: 4324 case SECCLASS_TCP_SOCKET:
4182 netif_perm = NETIF__TCP_SEND; 4325 netif_perm = NETIF__TCP_SEND;
4183 node_perm = NODE__TCP_SEND; 4326 node_perm = NODE__TCP_SEND;
4184 send_perm = TCP_SOCKET__SEND_MSG; 4327 send_perm = TCP_SOCKET__SEND_MSG;
4185 break; 4328 break;
4186
4187 case SECCLASS_DCCP_SOCKET: 4329 case SECCLASS_DCCP_SOCKET:
4188 netif_perm = NETIF__DCCP_SEND; 4330 netif_perm = NETIF__DCCP_SEND;
4189 node_perm = NODE__DCCP_SEND; 4331 node_perm = NODE__DCCP_SEND;
4190 send_perm = DCCP_SOCKET__SEND_MSG; 4332 send_perm = DCCP_SOCKET__SEND_MSG;
4191 break; 4333 break;
4192
4193 default: 4334 default:
4194 netif_perm = NETIF__RAWIP_SEND; 4335 netif_perm = NETIF__RAWIP_SEND;
4195 node_perm = NODE__RAWIP_SEND; 4336 node_perm = NODE__RAWIP_SEND;
4337 send_perm = 0;
4196 break; 4338 break;
4197 } 4339 }
4198 4340
4199 err = avc_has_perm(isec->sid, if_sid, SECCLASS_NETIF, netif_perm, ad); 4341 err = sel_netif_sid(ifindex, &if_sid);
4200 if (err) 4342 if (err)
4201 goto out; 4343 return err;
 4344 err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
 4345 if (err) return err;
4202 4346
4203 err = security_node_sid(family, addrp, len, &node_sid); 4347 err = sel_netnode_sid(addrp, family, &node_sid);
4204 if (err) 4348 if (err)
4205 goto out; 4349 return err;
4206 4350 err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad);
4207 err = avc_has_perm(isec->sid, node_sid, SECCLASS_NODE, node_perm, ad);
4208 if (err) 4351 if (err)
4209 goto out; 4352 return err;
4210 4353
4211 if (send_perm) { 4354 if (send_perm == 0)
4212 u32 port_sid; 4355 return 0;
4213
4214 err = security_port_sid(sk->sk_family,
4215 sk->sk_type,
4216 sk->sk_protocol,
4217 ntohs(ad->u.net.dport),
4218 &port_sid);
4219 if (err)
4220 goto out;
4221 4356
4222 err = avc_has_perm(isec->sid, port_sid, isec->sclass, 4357 err = security_port_sid(sk->sk_family, sk->sk_type,
4223 send_perm, ad); 4358 sk->sk_protocol, ntohs(ad->u.net.dport),
4359 &port_sid);
4360 if (unlikely(err)) {
4361 printk(KERN_WARNING
4362 "SELinux: failure in"
4363 " selinux_ip_postroute_iptables_compat(),"
4364 " network port label not found\n");
4365 return err;
4224 } 4366 }
4225out: 4367 return avc_has_perm(sk_sid, port_sid, sk_class, send_perm, ad);
4226 return err;
4227} 4368}
4228 4369
4229static unsigned int selinux_ip_postroute_last(unsigned int hooknum, 4370static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb,
4230 struct sk_buff *skb, 4371 int ifindex,
4231 const struct net_device *in, 4372 struct avc_audit_data *ad,
4232 const struct net_device *out, 4373 u16 family,
4233 int (*okfn)(struct sk_buff *), 4374 char *addrp,
4234 u16 family) 4375 u8 proto)
4235{ 4376{
4236 char *addrp; 4377 struct sock *sk = skb->sk;
4237 int len, err = 0;
4238 struct sock *sk;
4239 struct avc_audit_data ad;
4240 struct net_device *dev = (struct net_device *)out;
4241 struct sk_security_struct *sksec; 4378 struct sk_security_struct *sksec;
4242 u8 proto;
4243
4244 sk = skb->sk;
4245 if (!sk)
4246 goto out;
4247 4379
4380 if (sk == NULL)
4381 return NF_ACCEPT;
4248 sksec = sk->sk_security; 4382 sksec = sk->sk_security;
4249 4383
4250 AVC_AUDIT_DATA_INIT(&ad, NET); 4384 if (selinux_compat_net) {
4251 ad.u.net.netif = dev->name; 4385 if (selinux_ip_postroute_iptables_compat(skb->sk, ifindex,
4252 ad.u.net.family = family; 4386 ad, family, addrp))
4387 return NF_DROP;
4388 } else {
4389 if (avc_has_perm(sksec->sid, skb->secmark,
4390 SECCLASS_PACKET, PACKET__SEND, ad))
4391 return NF_DROP;
4392 }
4253 4393
4254 err = selinux_parse_skb(skb, &ad, &addrp, &len, 0, &proto); 4394 if (selinux_policycap_netpeer)
4255 if (err) 4395 if (selinux_xfrm_postroute_last(sksec->sid, skb, ad, proto))
4256 goto out; 4396 return NF_DROP;
4257 4397
4258 if (selinux_compat_net) 4398 return NF_ACCEPT;
4259 err = selinux_ip_postroute_last_compat(sk, dev, &ad, 4399}
4260 family, addrp, len);
4261 else
4262 err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET,
4263 PACKET__SEND, &ad);
4264 4400
4265 if (err) 4401static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex,
4266 goto out; 4402 u16 family)
4403{
4404 u32 secmark_perm;
4405 u32 peer_sid;
4406 struct sock *sk;
4407 struct avc_audit_data ad;
4408 char *addrp;
4409 u8 proto;
4410 u8 secmark_active;
4411 u8 peerlbl_active;
4267 4412
4268 err = selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto); 4413 AVC_AUDIT_DATA_INIT(&ad, NET);
4269out: 4414 ad.u.net.netif = ifindex;
4270 return err ? NF_DROP : NF_ACCEPT; 4415 ad.u.net.family = family;
4416 if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto))
4417 return NF_DROP;
4418
 4419 /* If any sort of compatibility mode is enabled then hand off processing
4420 * to the selinux_ip_postroute_compat() function to deal with the
4421 * special handling. We do this in an attempt to keep this function
4422 * as fast and as clean as possible. */
4423 if (selinux_compat_net || !selinux_policycap_netpeer)
4424 return selinux_ip_postroute_compat(skb, ifindex, &ad,
4425 family, addrp, proto);
4426
4427 /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec
4428 * packet transformation so allow the packet to pass without any checks
4429 * since we'll have another chance to perform access control checks
 4430 * when the packet is on its final way out.
4431 * NOTE: there appear to be some IPv6 multicast cases where skb->dst
4432 * is NULL, in this case go ahead and apply access control. */
4433 if (skb->dst != NULL && skb->dst->xfrm != NULL)
4434 return NF_ACCEPT;
4435
4436 secmark_active = selinux_secmark_enabled();
4437 peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
4438 if (!secmark_active && !peerlbl_active)
4439 return NF_ACCEPT;
4440
4441 /* if the packet is locally generated (skb->sk != NULL) then use the
 4442 * socket's label as the peer label; otherwise the packet is being
4443 * forwarded through this system and we need to fetch the peer label
4444 * directly from the packet */
4445 sk = skb->sk;
4446 if (sk) {
4447 struct sk_security_struct *sksec = sk->sk_security;
4448 peer_sid = sksec->sid;
4449 secmark_perm = PACKET__SEND;
4450 } else {
4451 if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
4452 return NF_DROP;
4453 secmark_perm = PACKET__FORWARD_OUT;
4454 }
4455
4456 if (secmark_active)
4457 if (avc_has_perm(peer_sid, skb->secmark,
4458 SECCLASS_PACKET, secmark_perm, &ad))
4459 return NF_DROP;
4460
4461 if (peerlbl_active) {
4462 u32 if_sid;
4463 u32 node_sid;
4464
4465 if (sel_netif_sid(ifindex, &if_sid))
4466 return NF_DROP;
4467 if (avc_has_perm(peer_sid, if_sid,
4468 SECCLASS_NETIF, NETIF__EGRESS, &ad))
4469 return NF_DROP;
4470
4471 if (sel_netnode_sid(addrp, family, &node_sid))
4472 return NF_DROP;
4473 if (avc_has_perm(peer_sid, node_sid,
4474 SECCLASS_NODE, NODE__SENDTO, &ad))
4475 return NF_DROP;
4476 }
4477
4478 return NF_ACCEPT;
4271} 4479}
4272 4480
4273static unsigned int selinux_ipv4_postroute_last(unsigned int hooknum, 4481static unsigned int selinux_ipv4_postroute(unsigned int hooknum,
4274 struct sk_buff *skb, 4482 struct sk_buff *skb,
4275 const struct net_device *in, 4483 const struct net_device *in,
4276 const struct net_device *out, 4484 const struct net_device *out,
4277 int (*okfn)(struct sk_buff *)) 4485 int (*okfn)(struct sk_buff *))
4278{ 4486{
4279 return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET); 4487 return selinux_ip_postroute(skb, out->ifindex, PF_INET);
4280} 4488}
4281 4489
4282#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 4490#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
4283 4491static unsigned int selinux_ipv6_postroute(unsigned int hooknum,
4284static unsigned int selinux_ipv6_postroute_last(unsigned int hooknum, 4492 struct sk_buff *skb,
4285 struct sk_buff *skb, 4493 const struct net_device *in,
4286 const struct net_device *in, 4494 const struct net_device *out,
4287 const struct net_device *out, 4495 int (*okfn)(struct sk_buff *))
4288 int (*okfn)(struct sk_buff *))
4289{ 4496{
4290 return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET6); 4497 return selinux_ip_postroute(skb, out->ifindex, PF_INET6);
4291} 4498}
4292
4293#endif /* IPV6 */ 4499#endif /* IPV6 */
4294 4500
4295#endif /* CONFIG_NETFILTER */ 4501#endif /* CONFIG_NETFILTER */
@@ -5277,22 +5483,40 @@ security_initcall(selinux_init);
5277 5483
5278#if defined(CONFIG_NETFILTER) 5484#if defined(CONFIG_NETFILTER)
5279 5485
5280static struct nf_hook_ops selinux_ipv4_op = { 5486static struct nf_hook_ops selinux_ipv4_ops[] = {
5281 .hook = selinux_ipv4_postroute_last, 5487 {
5282 .owner = THIS_MODULE, 5488 .hook = selinux_ipv4_postroute,
5283 .pf = PF_INET, 5489 .owner = THIS_MODULE,
5284 .hooknum = NF_INET_POST_ROUTING, 5490 .pf = PF_INET,
5285 .priority = NF_IP_PRI_SELINUX_LAST, 5491 .hooknum = NF_INET_POST_ROUTING,
5492 .priority = NF_IP_PRI_SELINUX_LAST,
5493 },
5494 {
5495 .hook = selinux_ipv4_forward,
5496 .owner = THIS_MODULE,
5497 .pf = PF_INET,
5498 .hooknum = NF_INET_FORWARD,
5499 .priority = NF_IP_PRI_SELINUX_FIRST,
5500 }
5286}; 5501};
5287 5502
5288#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 5503#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
5289 5504
5290static struct nf_hook_ops selinux_ipv6_op = { 5505static struct nf_hook_ops selinux_ipv6_ops[] = {
5291 .hook = selinux_ipv6_postroute_last, 5506 {
5292 .owner = THIS_MODULE, 5507 .hook = selinux_ipv6_postroute,
5293 .pf = PF_INET6, 5508 .owner = THIS_MODULE,
5294 .hooknum = NF_INET_POST_ROUTING, 5509 .pf = PF_INET6,
5295 .priority = NF_IP6_PRI_SELINUX_LAST, 5510 .hooknum = NF_INET_POST_ROUTING,
5511 .priority = NF_IP6_PRI_SELINUX_LAST,
5512 },
5513 {
5514 .hook = selinux_ipv6_forward,
5515 .owner = THIS_MODULE,
5516 .pf = PF_INET6,
5517 .hooknum = NF_INET_FORWARD,
5518 .priority = NF_IP6_PRI_SELINUX_FIRST,
5519 }
5296}; 5520};
5297 5521
5298#endif /* IPV6 */ 5522#endif /* IPV6 */
@@ -5300,22 +5524,27 @@ static struct nf_hook_ops selinux_ipv6_op = {
5300static int __init selinux_nf_ip_init(void) 5524static int __init selinux_nf_ip_init(void)
5301{ 5525{
5302 int err = 0; 5526 int err = 0;
5527 u32 iter;
5303 5528
5304 if (!selinux_enabled) 5529 if (!selinux_enabled)
5305 goto out; 5530 goto out;
5306 5531
5307 printk(KERN_DEBUG "SELinux: Registering netfilter hooks\n"); 5532 printk(KERN_DEBUG "SELinux: Registering netfilter hooks\n");
5308 5533
5309 err = nf_register_hook(&selinux_ipv4_op); 5534 for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++) {
5310 if (err) 5535 err = nf_register_hook(&selinux_ipv4_ops[iter]);
5311 panic("SELinux: nf_register_hook for IPv4: error %d\n", err); 5536 if (err)
5537 panic("SELinux: nf_register_hook for IPv4: error %d\n",
5538 err);
5539 }
5312 5540
5313#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 5541#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
5314 5542 for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++) {
5315 err = nf_register_hook(&selinux_ipv6_op); 5543 err = nf_register_hook(&selinux_ipv6_ops[iter]);
5316 if (err) 5544 if (err)
5317 panic("SELinux: nf_register_hook for IPv6: error %d\n", err); 5545 panic("SELinux: nf_register_hook for IPv6: error %d\n",
5318 5546 err);
5547 }
5319#endif /* IPV6 */ 5548#endif /* IPV6 */
5320 5549
5321out: 5550out:
@@ -5327,11 +5556,15 @@ __initcall(selinux_nf_ip_init);
5327#ifdef CONFIG_SECURITY_SELINUX_DISABLE 5556#ifdef CONFIG_SECURITY_SELINUX_DISABLE
5328static void selinux_nf_ip_exit(void) 5557static void selinux_nf_ip_exit(void)
5329{ 5558{
5559 u32 iter;
5560
5330 printk(KERN_DEBUG "SELinux: Unregistering netfilter hooks\n"); 5561 printk(KERN_DEBUG "SELinux: Unregistering netfilter hooks\n");
5331 5562
5332 nf_unregister_hook(&selinux_ipv4_op); 5563 for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++)
5564 nf_unregister_hook(&selinux_ipv4_ops[iter]);
5333#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 5565#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
5334 nf_unregister_hook(&selinux_ipv6_op); 5566 for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++)
5567 nf_unregister_hook(&selinux_ipv6_ops[iter]);
5335#endif /* IPV6 */ 5568#endif /* IPV6 */
5336} 5569}
5337#endif 5570#endif
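The hook arrays above replace the single postroute hooks with postroute+forward pairs, registered and unregistered in a loop. As a point of comparison, the same pattern in a stand-alone form; this is only a sketch of a hypothetical module (names and the debug message are invented) written against the 2.6.25-era hook prototype used in this patch:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/skbuff.h>

static unsigned int demo_fwd_hook(unsigned int hooknum,
				  struct sk_buff *skb,
				  const struct net_device *in,
				  const struct net_device *out,
				  int (*okfn)(struct sk_buff *))
{
	/* observe forwarded IPv4 packets, never block anything */
	pr_debug("demo: forward %s -> %s\n", in->name, out->name);
	return NF_ACCEPT;
}

static struct nf_hook_ops demo_ops[] = {
	{
		.hook		= demo_fwd_hook,
		.owner		= THIS_MODULE,
		.pf		= PF_INET,
		.hooknum	= NF_INET_FORWARD,
		.priority	= NF_IP_PRI_FIRST,
	},
};

static int __init demo_init(void)
{
	int i, err;

	for (i = 0; i < ARRAY_SIZE(demo_ops); i++) {
		err = nf_register_hook(&demo_ops[i]);
		if (err) {
			/* unwind whatever was registered so far */
			while (--i >= 0)
				nf_unregister_hook(&demo_ops[i]);
			return err;
		}
	}
	return 0;
}

static void __exit demo_exit(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(demo_ops); i++)
		nf_unregister_hook(&demo_ops[i]);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The nf_register_hooks()/nf_unregister_hooks() helpers could take the array and its size directly; the SELinux code keeps the explicit loop so each registration failure can panic with its own message.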
diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h
index 049bf69429b6..399f868c5c8f 100644
--- a/security/selinux/include/av_perm_to_string.h
+++ b/security/selinux/include/av_perm_to_string.h
@@ -37,6 +37,8 @@
37 S_(SECCLASS_NODE, NODE__ENFORCE_DEST, "enforce_dest") 37 S_(SECCLASS_NODE, NODE__ENFORCE_DEST, "enforce_dest")
38 S_(SECCLASS_NODE, NODE__DCCP_RECV, "dccp_recv") 38 S_(SECCLASS_NODE, NODE__DCCP_RECV, "dccp_recv")
39 S_(SECCLASS_NODE, NODE__DCCP_SEND, "dccp_send") 39 S_(SECCLASS_NODE, NODE__DCCP_SEND, "dccp_send")
40 S_(SECCLASS_NODE, NODE__RECVFROM, "recvfrom")
41 S_(SECCLASS_NODE, NODE__SENDTO, "sendto")
40 S_(SECCLASS_NETIF, NETIF__TCP_RECV, "tcp_recv") 42 S_(SECCLASS_NETIF, NETIF__TCP_RECV, "tcp_recv")
41 S_(SECCLASS_NETIF, NETIF__TCP_SEND, "tcp_send") 43 S_(SECCLASS_NETIF, NETIF__TCP_SEND, "tcp_send")
42 S_(SECCLASS_NETIF, NETIF__UDP_RECV, "udp_recv") 44 S_(SECCLASS_NETIF, NETIF__UDP_RECV, "udp_recv")
@@ -45,6 +47,8 @@
45 S_(SECCLASS_NETIF, NETIF__RAWIP_SEND, "rawip_send") 47 S_(SECCLASS_NETIF, NETIF__RAWIP_SEND, "rawip_send")
46 S_(SECCLASS_NETIF, NETIF__DCCP_RECV, "dccp_recv") 48 S_(SECCLASS_NETIF, NETIF__DCCP_RECV, "dccp_recv")
47 S_(SECCLASS_NETIF, NETIF__DCCP_SEND, "dccp_send") 49 S_(SECCLASS_NETIF, NETIF__DCCP_SEND, "dccp_send")
50 S_(SECCLASS_NETIF, NETIF__INGRESS, "ingress")
51 S_(SECCLASS_NETIF, NETIF__EGRESS, "egress")
48 S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__CONNECTTO, "connectto") 52 S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__CONNECTTO, "connectto")
49 S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__NEWCONN, "newconn") 53 S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__NEWCONN, "newconn")
50 S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__ACCEPTFROM, "acceptfrom") 54 S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__ACCEPTFROM, "acceptfrom")
@@ -149,6 +153,10 @@
149 S_(SECCLASS_PACKET, PACKET__SEND, "send") 153 S_(SECCLASS_PACKET, PACKET__SEND, "send")
150 S_(SECCLASS_PACKET, PACKET__RECV, "recv") 154 S_(SECCLASS_PACKET, PACKET__RECV, "recv")
151 S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto") 155 S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto")
156 S_(SECCLASS_PACKET, PACKET__FLOW_IN, "flow_in")
157 S_(SECCLASS_PACKET, PACKET__FLOW_OUT, "flow_out")
158 S_(SECCLASS_PACKET, PACKET__FORWARD_IN, "forward_in")
159 S_(SECCLASS_PACKET, PACKET__FORWARD_OUT, "forward_out")
152 S_(SECCLASS_KEY, KEY__VIEW, "view") 160 S_(SECCLASS_KEY, KEY__VIEW, "view")
153 S_(SECCLASS_KEY, KEY__READ, "read") 161 S_(SECCLASS_KEY, KEY__READ, "read")
154 S_(SECCLASS_KEY, KEY__WRITE, "write") 162 S_(SECCLASS_KEY, KEY__WRITE, "write")
@@ -159,3 +167,4 @@
159 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind") 167 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind")
160 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect") 168 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect")
161 S_(SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, "mmap_zero") 169 S_(SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, "mmap_zero")
170 S_(SECCLASS_PEER, PEER__RECV, "recv")
diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h
index eda89a2ec635..84c9abc80978 100644
--- a/security/selinux/include/av_permissions.h
+++ b/security/selinux/include/av_permissions.h
@@ -292,6 +292,8 @@
292#define NODE__ENFORCE_DEST 0x00000040UL 292#define NODE__ENFORCE_DEST 0x00000040UL
293#define NODE__DCCP_RECV 0x00000080UL 293#define NODE__DCCP_RECV 0x00000080UL
294#define NODE__DCCP_SEND 0x00000100UL 294#define NODE__DCCP_SEND 0x00000100UL
295#define NODE__RECVFROM 0x00000200UL
296#define NODE__SENDTO 0x00000400UL
295#define NETIF__TCP_RECV 0x00000001UL 297#define NETIF__TCP_RECV 0x00000001UL
296#define NETIF__TCP_SEND 0x00000002UL 298#define NETIF__TCP_SEND 0x00000002UL
297#define NETIF__UDP_RECV 0x00000004UL 299#define NETIF__UDP_RECV 0x00000004UL
@@ -300,6 +302,8 @@
300#define NETIF__RAWIP_SEND 0x00000020UL 302#define NETIF__RAWIP_SEND 0x00000020UL
301#define NETIF__DCCP_RECV 0x00000040UL 303#define NETIF__DCCP_RECV 0x00000040UL
302#define NETIF__DCCP_SEND 0x00000080UL 304#define NETIF__DCCP_SEND 0x00000080UL
305#define NETIF__INGRESS 0x00000100UL
306#define NETIF__EGRESS 0x00000200UL
303#define NETLINK_SOCKET__IOCTL 0x00000001UL 307#define NETLINK_SOCKET__IOCTL 0x00000001UL
304#define NETLINK_SOCKET__READ 0x00000002UL 308#define NETLINK_SOCKET__READ 0x00000002UL
305#define NETLINK_SOCKET__WRITE 0x00000004UL 309#define NETLINK_SOCKET__WRITE 0x00000004UL
@@ -792,6 +796,10 @@
792#define PACKET__SEND 0x00000001UL 796#define PACKET__SEND 0x00000001UL
793#define PACKET__RECV 0x00000002UL 797#define PACKET__RECV 0x00000002UL
794#define PACKET__RELABELTO 0x00000004UL 798#define PACKET__RELABELTO 0x00000004UL
799#define PACKET__FLOW_IN 0x00000008UL
800#define PACKET__FLOW_OUT 0x00000010UL
801#define PACKET__FORWARD_IN 0x00000020UL
802#define PACKET__FORWARD_OUT 0x00000040UL
795#define KEY__VIEW 0x00000001UL 803#define KEY__VIEW 0x00000001UL
796#define KEY__READ 0x00000002UL 804#define KEY__READ 0x00000002UL
797#define KEY__WRITE 0x00000004UL 805#define KEY__WRITE 0x00000004UL
@@ -824,3 +832,4 @@
824#define DCCP_SOCKET__NODE_BIND 0x00400000UL 832#define DCCP_SOCKET__NODE_BIND 0x00400000UL
825#define DCCP_SOCKET__NAME_CONNECT 0x00800000UL 833#define DCCP_SOCKET__NAME_CONNECT 0x00800000UL
826#define MEMPROTECT__MMAP_ZERO 0x00000001UL 834#define MEMPROTECT__MMAP_ZERO 0x00000001UL
835#define PEER__RECV 0x00000001UL
diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h
index 553607a19e92..80c28fa6621c 100644
--- a/security/selinux/include/avc.h
+++ b/security/selinux/include/avc.h
@@ -51,7 +51,7 @@ struct avc_audit_data {
51 struct inode *inode; 51 struct inode *inode;
52 } fs; 52 } fs;
53 struct { 53 struct {
54 char *netif; 54 int netif;
55 struct sock *sk; 55 struct sock *sk;
56 u16 family; 56 u16 family;
57 __be16 dport; 57 __be16 dport;
diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h
index e77de0e62ea0..b1b0d1d8f950 100644
--- a/security/selinux/include/class_to_string.h
+++ b/security/selinux/include/class_to_string.h
@@ -64,3 +64,10 @@
64 S_(NULL) 64 S_(NULL)
65 S_("dccp_socket") 65 S_("dccp_socket")
66 S_("memprotect") 66 S_("memprotect")
67 S_(NULL)
68 S_(NULL)
69 S_(NULL)
70 S_(NULL)
71 S_(NULL)
72 S_(NULL)
73 S_("peer")
diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h
index a9c2b20f14b5..09e9dd23ee1a 100644
--- a/security/selinux/include/flask.h
+++ b/security/selinux/include/flask.h
@@ -50,6 +50,7 @@
50#define SECCLASS_KEY 58 50#define SECCLASS_KEY 58
51#define SECCLASS_DCCP_SOCKET 60 51#define SECCLASS_DCCP_SOCKET 60
52#define SECCLASS_MEMPROTECT 61 52#define SECCLASS_MEMPROTECT 61
53#define SECCLASS_PEER 68
53 54
54/* 55/*
55 * Security identifier indices for initial entities 56 * Security identifier indices for initial entities
diff --git a/security/selinux/include/netif.h b/security/selinux/include/netif.h
index 8bd6f9992d2b..ce23edd128b3 100644
--- a/security/selinux/include/netif.h
+++ b/security/selinux/include/netif.h
@@ -7,6 +7,8 @@
7 * Author: James Morris <jmorris@redhat.com> 7 * Author: James Morris <jmorris@redhat.com>
8 * 8 *
9 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> 9 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
10 * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
11 * Paul Moore, <paul.moore@hp.com>
10 * 12 *
11 * This program is free software; you can redistribute it and/or modify 13 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2, 14 * it under the terms of the GNU General Public License version 2,
@@ -15,7 +17,7 @@
15#ifndef _SELINUX_NETIF_H_ 17#ifndef _SELINUX_NETIF_H_
16#define _SELINUX_NETIF_H_ 18#define _SELINUX_NETIF_H_
17 19
18int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid); 20int sel_netif_sid(int ifindex, u32 *sid);
19 21
20#endif /* _SELINUX_NETIF_H_ */ 22#endif /* _SELINUX_NETIF_H_ */
21 23
diff --git a/security/selinux/include/netlabel.h b/security/selinux/include/netlabel.h
index 218e3f77c350..00a2809c8506 100644
--- a/security/selinux/include/netlabel.h
+++ b/security/selinux/include/netlabel.h
@@ -46,13 +46,17 @@ void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec,
46void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, 46void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
47 struct sk_security_struct *newssec); 47 struct sk_security_struct *newssec);
48 48
49int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid); 49int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
50 u16 family,
51 u32 *type,
52 u32 *sid);
50 53
51void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock); 54void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock);
52int selinux_netlbl_socket_post_create(struct socket *sock); 55int selinux_netlbl_socket_post_create(struct socket *sock);
53int selinux_netlbl_inode_permission(struct inode *inode, int mask); 56int selinux_netlbl_inode_permission(struct inode *inode, int mask);
54int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, 57int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
55 struct sk_buff *skb, 58 struct sk_buff *skb,
59 u16 family,
56 struct avc_audit_data *ad); 60 struct avc_audit_data *ad);
57int selinux_netlbl_socket_setsockopt(struct socket *sock, 61int selinux_netlbl_socket_setsockopt(struct socket *sock,
58 int level, 62 int level,
@@ -83,9 +87,11 @@ static inline void selinux_netlbl_sk_security_clone(
83} 87}
84 88
85static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, 89static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
86 u32 base_sid, 90 u16 family,
91 u32 *type,
87 u32 *sid) 92 u32 *sid)
88{ 93{
94 *type = NETLBL_NLTYPE_NONE;
89 *sid = SECSID_NULL; 95 *sid = SECSID_NULL;
90 return 0; 96 return 0;
91} 97}
@@ -106,6 +112,7 @@ static inline int selinux_netlbl_inode_permission(struct inode *inode,
106} 112}
107static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, 113static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
108 struct sk_buff *skb, 114 struct sk_buff *skb,
115 u16 family,
109 struct avc_audit_data *ad) 116 struct avc_audit_data *ad)
110{ 117{
111 return 0; 118 return 0;
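The !CONFIG_NETLABEL half of this header keeps every prototype change mirrored by a static inline stub, which is what lets hooks.c call the NetLabel helpers unconditionally. A stripped-down stand-alone model of that idiom (the config macro and function are invented):

#include <stdio.h>

#define CONFIG_DEMO_LABELS 1	/* set to 0 to build the stubbed variant */

#if CONFIG_DEMO_LABELS
static int demo_get_label(int *label)
{
	*label = 42;		/* stand-in for a real lookup */
	return 0;
}
#else
static inline int demo_get_label(int *label)
{
	*label = 0;		/* harmless default, like the stubs above */
	return 0;
}
#endif

int main(void)
{
	int label;

	/* the calling code is identical whether the feature is built or not */
	if (demo_get_label(&label) == 0)
		printf("label = %d\n", label);
	return 0;
}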
diff --git a/security/selinux/include/netnode.h b/security/selinux/include/netnode.h
new file mode 100644
index 000000000000..1b94450d11d2
--- /dev/null
+++ b/security/selinux/include/netnode.h
@@ -0,0 +1,32 @@
1/*
2 * Network node table
3 *
4 * SELinux must keep a mapping of network nodes to labels/SIDs. This
5 * mapping is maintained as part of the normal policy but a fast cache is
6 * needed to reduce the lookup overhead since most of these queries happen on
7 * a per-packet basis.
8 *
9 * Author: Paul Moore <paul.moore@hp.com>
10 *
11 */
12
13/*
14 * (c) Copyright Hewlett-Packard Development Company, L.P., 2007
15 *
16 * This program is free software: you can redistribute it and/or modify
17 * it under the terms of version 2 of the GNU General Public License as
18 * published by the Free Software Foundation.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 */
26
27#ifndef _SELINUX_NETNODE_H
28#define _SELINUX_NETNODE_H
29
30int sel_netnode_sid(void *addr, u16 family, u32 *sid);
31
32#endif
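Callers are expected to hand sel_netnode_sid() a pointer to the raw address in network byte order, with @family selecting how it is interpreted. A short kernel-context sketch of that contract (the wrapper is invented and assumes it is built alongside the other SELinux objects so the local header resolves):

#include <linux/in.h>
#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/types.h>

#include "netnode.h"

/* look up the SID of an IPv4 packet's source node */
static int demo_skb_src_node_sid(struct sk_buff *skb, u32 *sid)
{
	__be32 saddr = ip_hdr(skb)->saddr;

	/* the address is passed as void *; PF_INET tells the cache how to
	 * hash and compare it (PF_INET6 would take a struct in6_addr *) */
	return sel_netnode_sid(&saddr, PF_INET, sid);
}

In the patch itself the address pointer comes from selinux_parse_skb(), which leaves it pointing at the source or destination address field of the IP header.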
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 4138a80f8e27..c6c2bb4ebacc 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -96,17 +96,25 @@ struct bprm_security_struct {
96}; 96};
97 97
98struct netif_security_struct { 98struct netif_security_struct {
99 struct net_device *dev; /* back pointer */ 99 int ifindex; /* device index */
100 u32 if_sid; /* SID for this interface */ 100 u32 sid; /* SID for this interface */
101 u32 msg_sid; /* default SID for messages received on this interface */ 101};
102
103struct netnode_security_struct {
104 union {
105 __be32 ipv4; /* IPv4 node address */
106 struct in6_addr ipv6; /* IPv6 node address */
107 } addr;
108 u32 sid; /* SID for this node */
109 u16 family; /* address family */
102}; 110};
103 111
104struct sk_security_struct { 112struct sk_security_struct {
105 struct sock *sk; /* back pointer to sk object */ 113 struct sock *sk; /* back pointer to sk object */
106 u32 sid; /* SID of this object */ 114 u32 sid; /* SID of this object */
107 u32 peer_sid; /* SID of peer */ 115 u32 peer_sid; /* SID of peer */
108#ifdef CONFIG_NETLABEL
109 u16 sclass; /* sock security class */ 116 u16 sclass; /* sock security class */
117#ifdef CONFIG_NETLABEL
110 enum { /* NetLabel state */ 118 enum { /* NetLabel state */
111 NLBL_UNSET = 0, 119 NLBL_UNSET = 0,
112 NLBL_REQUIRE, 120 NLBL_REQUIRE,
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 39337afffec2..23137c17f917 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -25,13 +25,14 @@
25#define POLICYDB_VERSION_MLS 19 25#define POLICYDB_VERSION_MLS 19
26#define POLICYDB_VERSION_AVTAB 20 26#define POLICYDB_VERSION_AVTAB 20
27#define POLICYDB_VERSION_RANGETRANS 21 27#define POLICYDB_VERSION_RANGETRANS 21
28#define POLICYDB_VERSION_POLCAP 22
28 29
29/* Range of policy versions we understand*/ 30/* Range of policy versions we understand*/
30#define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE 31#define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE
31#ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX 32#ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX
32#define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE 33#define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
33#else 34#else
34#define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS 35#define POLICYDB_VERSION_MAX POLICYDB_VERSION_POLCAP
35#endif 36#endif
36 37
37struct netlbl_lsm_secattr; 38struct netlbl_lsm_secattr;
@@ -39,8 +40,19 @@ struct netlbl_lsm_secattr;
39extern int selinux_enabled; 40extern int selinux_enabled;
40extern int selinux_mls_enabled; 41extern int selinux_mls_enabled;
41 42
43/* Policy capabilities */
44enum {
45 POLICYDB_CAPABILITY_NETPEER,
46 __POLICYDB_CAPABILITY_MAX
47};
48#define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1)
49
50extern int selinux_policycap_netpeer;
51
42int security_load_policy(void * data, size_t len); 52int security_load_policy(void * data, size_t len);
43 53
54int security_policycap_supported(unsigned int req_cap);
55
44#define SEL_VEC_MAX 32 56#define SEL_VEC_MAX 32
45struct av_decision { 57struct av_decision {
46 u32 allowed; 58 u32 allowed;
@@ -77,8 +89,7 @@ int security_get_user_sids(u32 callsid, char *username,
77int security_port_sid(u16 domain, u16 type, u8 protocol, u16 port, 89int security_port_sid(u16 domain, u16 type, u8 protocol, u16 port,
78 u32 *out_sid); 90 u32 *out_sid);
79 91
80int security_netif_sid(char *name, u32 *if_sid, 92int security_netif_sid(char *name, u32 *if_sid);
81 u32 *msg_sid);
82 93
83int security_node_sid(u16 domain, void *addr, u32 addrlen, 94int security_node_sid(u16 domain, void *addr, u32 addrlen,
84 u32 *out_sid); 95 u32 *out_sid);
@@ -88,10 +99,15 @@ int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid,
88 99
89int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid); 100int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid);
90 101
102int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type,
103 u32 xfrm_sid,
104 u32 *peer_sid);
105
91int security_get_classes(char ***classes, int *nclasses); 106int security_get_classes(char ***classes, int *nclasses);
92int security_get_permissions(char *class, char ***perms, int *nperms); 107int security_get_permissions(char *class, char ***perms, int *nperms);
93int security_get_reject_unknown(void); 108int security_get_reject_unknown(void);
94int security_get_allow_unknown(void); 109int security_get_allow_unknown(void);
110int security_get_policycaps(int *len, int **values);
95 111
96#define SECURITY_FS_USE_XATTR 1 /* use xattr */ 112#define SECURITY_FS_USE_XATTR 1 /* use xattr */
97#define SECURITY_FS_USE_TRANS 2 /* use transition SIDs, e.g. devpts/tmpfs */ 113#define SECURITY_FS_USE_TRANS 2 /* use transition SIDs, e.g. devpts/tmpfs */
@@ -108,7 +124,6 @@ int security_genfs_sid(const char *fstype, char *name, u16 sclass,
108 124
109#ifdef CONFIG_NETLABEL 125#ifdef CONFIG_NETLABEL
110int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, 126int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
111 u32 base_sid,
112 u32 *sid); 127 u32 *sid);
113 128
114int security_netlbl_sid_to_secattr(u32 sid, 129int security_netlbl_sid_to_secattr(u32 sid,
@@ -116,7 +131,6 @@ int security_netlbl_sid_to_secattr(u32 sid,
116#else 131#else
117static inline int security_netlbl_secattr_to_sid( 132static inline int security_netlbl_secattr_to_sid(
118 struct netlbl_lsm_secattr *secattr, 133 struct netlbl_lsm_secattr *secattr,
119 u32 base_sid,
120 u32 *sid) 134 u32 *sid)
121{ 135{
122 return -EIDRM; 136 return -EIDRM;
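The capability bits exist so that newer checks can be switched on per-policy rather than per-kernel: code consults the cached flag, or queries the loaded policy, before applying the new semantics. A condensed sketch of that gating pattern, with invented wrapper names:

#include <linux/types.h>

#include "security.h"

/* choose between legacy and netpeer-aware behaviour at run time */
static int demo_use_netpeer_checks(void)
{
	/* cached flag, expected to be set while a policy declaring the
	 * netpeer capability is loaded */
	return selinux_policycap_netpeer != 0;
}

/* ask the security server directly instead of using the cached flag */
static int demo_policy_has_netpeer(void)
{
	return security_policycap_supported(POLICYDB_CAPABILITY_NETPEER);
}

This mirrors how, in the hooks.c changes above, selinux_ip_forward() returns NF_ACCEPT and selinux_ip_postroute() falls back to the compat path when selinux_policycap_netpeer is clear.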
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
index 31929e39f5ca..36b0510efa7b 100644
--- a/security/selinux/include/xfrm.h
+++ b/security/selinux/include/xfrm.h
@@ -32,6 +32,13 @@ static inline struct inode_security_struct *get_sock_isec(struct sock *sk)
32} 32}
33 33
34#ifdef CONFIG_SECURITY_NETWORK_XFRM 34#ifdef CONFIG_SECURITY_NETWORK_XFRM
35extern atomic_t selinux_xfrm_refcount;
36
37static inline int selinux_xfrm_enabled(void)
38{
39 return (atomic_read(&selinux_xfrm_refcount) > 0);
40}
41
35int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb, 42int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb,
36 struct avc_audit_data *ad); 43 struct avc_audit_data *ad);
37int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, 44int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb,
@@ -43,6 +50,11 @@ static inline void selinux_xfrm_notify_policyload(void)
43 atomic_inc(&flow_cache_genid); 50 atomic_inc(&flow_cache_genid);
44} 51}
45#else 52#else
53static inline int selinux_xfrm_enabled(void)
54{
55 return 0;
56}
57
46static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, 58static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb,
47 struct avc_audit_data *ad) 59 struct avc_audit_data *ad)
48{ 60{
diff --git a/security/selinux/netif.c b/security/selinux/netif.c
index e87ab948104c..013d3117a86b 100644
--- a/security/selinux/netif.c
+++ b/security/selinux/netif.c
@@ -7,6 +7,8 @@
7 * Author: James Morris <jmorris@redhat.com> 7 * Author: James Morris <jmorris@redhat.com>
8 * 8 *
9 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> 9 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
10 * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
11 * Paul Moore <paul.moore@hp.com>
10 * 12 *
11 * This program is free software; you can redistribute it and/or modify 13 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2, 14 * it under the terms of the GNU General Public License version 2,
@@ -29,14 +31,6 @@
29#define SEL_NETIF_HASH_SIZE 64 31#define SEL_NETIF_HASH_SIZE 64
30#define SEL_NETIF_HASH_MAX 1024 32#define SEL_NETIF_HASH_MAX 1024
31 33
32#undef DEBUG
33
34#ifdef DEBUG
35#define DEBUGP printk
36#else
37#define DEBUGP(format, args...)
38#endif
39
40struct sel_netif 34struct sel_netif
41{ 35{
42 struct list_head list; 36 struct list_head list;
@@ -49,174 +43,226 @@ static LIST_HEAD(sel_netif_list);
49static DEFINE_SPINLOCK(sel_netif_lock); 43static DEFINE_SPINLOCK(sel_netif_lock);
50static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE]; 44static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE];
51 45
52static inline u32 sel_netif_hasfn(struct net_device *dev) 46/**
47 * sel_netif_hashfn - Hashing function for the interface table
48 * @ifindex: the network interface
49 *
50 * Description:
 51 * This is the hashing function for the network interface table; it returns the
52 * bucket number for the given interface.
53 *
54 */
55static inline u32 sel_netif_hashfn(int ifindex)
53{ 56{
54 return (dev->ifindex & (SEL_NETIF_HASH_SIZE - 1)); 57 return (ifindex & (SEL_NETIF_HASH_SIZE - 1));
55} 58}
56 59
57/* 60/**
58 * All of the devices should normally fit in the hash, so we optimize 61 * sel_netif_find - Search for an interface record
59 * for that case. 62 * @ifindex: the network interface
63 *
64 * Description:
65 * Search the network interface table and return the record matching @ifindex.
66 * If an entry can not be found in the table return NULL.
67 *
60 */ 68 */
61static inline struct sel_netif *sel_netif_find(struct net_device *dev) 69static inline struct sel_netif *sel_netif_find(int ifindex)
62{ 70{
63 struct list_head *pos; 71 int idx = sel_netif_hashfn(ifindex);
64 int idx = sel_netif_hasfn(dev); 72 struct sel_netif *netif;
65 73
66 __list_for_each_rcu(pos, &sel_netif_hash[idx]) { 74 list_for_each_entry_rcu(netif, &sel_netif_hash[idx], list)
67 struct sel_netif *netif = list_entry(pos, 75 /* all of the devices should normally fit in the hash, so we
68 struct sel_netif, list); 76 * optimize for that case */
69 if (likely(netif->nsec.dev == dev)) 77 if (likely(netif->nsec.ifindex == ifindex))
70 return netif; 78 return netif;
71 } 79
72 return NULL; 80 return NULL;
73} 81}
74 82
83/**
84 * sel_netif_insert - Insert a new interface into the table
85 * @netif: the new interface record
86 *
87 * Description:
88 * Add a new interface record to the network interface hash table. Returns
89 * zero on success, negative values on failure.
90 *
91 */
75static int sel_netif_insert(struct sel_netif *netif) 92static int sel_netif_insert(struct sel_netif *netif)
76{ 93{
77 int idx, ret = 0; 94 int idx;
78 95
79 if (sel_netif_total >= SEL_NETIF_HASH_MAX) { 96 if (sel_netif_total >= SEL_NETIF_HASH_MAX)
80 ret = -ENOSPC; 97 return -ENOSPC;
81 goto out;
82 }
83 98
84 idx = sel_netif_hasfn(netif->nsec.dev); 99 idx = sel_netif_hashfn(netif->nsec.ifindex);
85 list_add_rcu(&netif->list, &sel_netif_hash[idx]); 100 list_add_rcu(&netif->list, &sel_netif_hash[idx]);
86 sel_netif_total++; 101 sel_netif_total++;
87out: 102
88 return ret; 103 return 0;
89} 104}
90 105
106/**
107 * sel_netif_free - Frees an interface entry
108 * @p: the entry's RCU field
109 *
110 * Description:
111 * This function is designed to be used as a callback to the call_rcu()
112 * function so that memory allocated to a hash table interface entry can be
113 * released safely.
114 *
115 */
91static void sel_netif_free(struct rcu_head *p) 116static void sel_netif_free(struct rcu_head *p)
92{ 117{
93 struct sel_netif *netif = container_of(p, struct sel_netif, rcu_head); 118 struct sel_netif *netif = container_of(p, struct sel_netif, rcu_head);
94
95 DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name);
96 kfree(netif); 119 kfree(netif);
97} 120}
98 121
122/**
123 * sel_netif_destroy - Remove an interface record from the table
124 * @netif: the existing interface record
125 *
126 * Description:
127 * Remove an existing interface record from the network interface table.
128 *
129 */
99static void sel_netif_destroy(struct sel_netif *netif) 130static void sel_netif_destroy(struct sel_netif *netif)
100{ 131{
101 DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name);
102
103 list_del_rcu(&netif->list); 132 list_del_rcu(&netif->list);
104 sel_netif_total--; 133 sel_netif_total--;
105 call_rcu(&netif->rcu_head, sel_netif_free); 134 call_rcu(&netif->rcu_head, sel_netif_free);
106} 135}
107 136
108static struct sel_netif *sel_netif_lookup(struct net_device *dev) 137/**
138 * sel_netif_sid_slow - Lookup the SID of a network interface using the policy
139 * @ifindex: the network interface
140 * @sid: interface SID
141 *
142 * Description:
 143 * This function determines the SID of a network interface by querying the
 144 * security policy. The result is added to the network interface table to
 145 * speed up future queries. Returns zero on success, negative values on
146 * failure.
147 *
148 */
149static int sel_netif_sid_slow(int ifindex, u32 *sid)
109{ 150{
110 int ret; 151 int ret;
111 struct sel_netif *netif, *new; 152 struct sel_netif *netif;
112 struct netif_security_struct *nsec; 153 struct sel_netif *new = NULL;
113 154 struct net_device *dev;
114 netif = sel_netif_find(dev); 155
115 if (likely(netif != NULL)) 156 /* NOTE: we always use init's network namespace since we don't
116 goto out; 157 * currently support containers */
117 158
118 new = kzalloc(sizeof(*new), GFP_ATOMIC); 159 dev = dev_get_by_index(&init_net, ifindex);
119 if (!new) { 160 if (unlikely(dev == NULL)) {
120 netif = ERR_PTR(-ENOMEM); 161 printk(KERN_WARNING
121 goto out; 162 "SELinux: failure in sel_netif_sid_slow(),"
163 " invalid network interface (%d)\n", ifindex);
164 return -ENOENT;
122 } 165 }
123
124 nsec = &new->nsec;
125 166
126 ret = security_netif_sid(dev->name, &nsec->if_sid, &nsec->msg_sid); 167 spin_lock_bh(&sel_netif_lock);
127 if (ret < 0) { 168 netif = sel_netif_find(ifindex);
128 kfree(new); 169 if (netif != NULL) {
129 netif = ERR_PTR(ret); 170 *sid = netif->nsec.sid;
171 ret = 0;
130 goto out; 172 goto out;
131 } 173 }
132 174 new = kzalloc(sizeof(*new), GFP_ATOMIC);
133 nsec->dev = dev; 175 if (new == NULL) {
134 176 ret = -ENOMEM;
135 spin_lock_bh(&sel_netif_lock);
136
137 netif = sel_netif_find(dev);
138 if (netif) {
139 spin_unlock_bh(&sel_netif_lock);
140 kfree(new);
141 goto out; 177 goto out;
142 } 178 }
143 179 ret = security_netif_sid(dev->name, &new->nsec.sid);
180 if (ret != 0)
181 goto out;
182 new->nsec.ifindex = ifindex;
144 ret = sel_netif_insert(new); 183 ret = sel_netif_insert(new);
145 spin_unlock_bh(&sel_netif_lock); 184 if (ret != 0)
146
147 if (ret) {
148 kfree(new);
149 netif = ERR_PTR(ret);
150 goto out; 185 goto out;
151 } 186 *sid = new->nsec.sid;
152 187
153 netif = new;
154
155 DEBUGP("new: ifindex=%u name=%s if_sid=%u msg_sid=%u\n", dev->ifindex, dev->name,
156 nsec->if_sid, nsec->msg_sid);
157out: 188out:
158 return netif; 189 spin_unlock_bh(&sel_netif_lock);
159} 190 dev_put(dev);
160 191 if (unlikely(ret)) {
161static void sel_netif_assign_sids(u32 if_sid_in, u32 msg_sid_in, u32 *if_sid_out, u32 *msg_sid_out) 192 printk(KERN_WARNING
162{ 193 "SELinux: failure in sel_netif_sid_slow(),"
163 if (if_sid_out) 194 " unable to determine network interface label (%d)\n",
164 *if_sid_out = if_sid_in; 195 ifindex);
165 if (msg_sid_out) 196 kfree(new);
166 *msg_sid_out = msg_sid_in; 197 }
167}
168
169static int sel_netif_sids_slow(struct net_device *dev, u32 *if_sid, u32 *msg_sid)
170{
171 int ret = 0;
172 u32 tmp_if_sid, tmp_msg_sid;
173
174 ret = security_netif_sid(dev->name, &tmp_if_sid, &tmp_msg_sid);
175 if (!ret)
176 sel_netif_assign_sids(tmp_if_sid, tmp_msg_sid, if_sid, msg_sid);
177 return ret; 198 return ret;
178} 199}
179 200
180int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid) 201/**
202 * sel_netif_sid - Lookup the SID of a network interface
203 * @ifindex: the network interface
204 * @sid: interface SID
205 *
206 * Description:
207 * This function determines the SID of a network interface using the fastest
208 * method possible. First the interface table is queried, but if an entry
209 * can't be found then the policy is queried and the result is added to the
 210 * table to speed up future queries. Returns zero on success, negative values
211 * on failure.
212 *
213 */
214int sel_netif_sid(int ifindex, u32 *sid)
181{ 215{
182 int ret = 0;
183 struct sel_netif *netif; 216 struct sel_netif *netif;
184 217
185 rcu_read_lock(); 218 rcu_read_lock();
186 netif = sel_netif_lookup(dev); 219 netif = sel_netif_find(ifindex);
187 if (IS_ERR(netif)) { 220 if (likely(netif != NULL)) {
221 *sid = netif->nsec.sid;
188 rcu_read_unlock(); 222 rcu_read_unlock();
189 ret = sel_netif_sids_slow(dev, if_sid, msg_sid); 223 return 0;
190 goto out;
191 } 224 }
192 sel_netif_assign_sids(netif->nsec.if_sid, netif->nsec.msg_sid, if_sid, msg_sid);
193 rcu_read_unlock(); 225 rcu_read_unlock();
194out: 226
195 return ret; 227 return sel_netif_sid_slow(ifindex, sid);
196} 228}
197 229
198static void sel_netif_kill(struct net_device *dev) 230/**
231 * sel_netif_kill - Remove an entry from the network interface table
232 * @ifindex: the network interface
233 *
234 * Description:
235 * This function removes the entry matching @ifindex from the network interface
236 * table if it exists.
237 *
238 */
239static void sel_netif_kill(int ifindex)
199{ 240{
200 struct sel_netif *netif; 241 struct sel_netif *netif;
201 242
202 spin_lock_bh(&sel_netif_lock); 243 spin_lock_bh(&sel_netif_lock);
203 netif = sel_netif_find(dev); 244 netif = sel_netif_find(ifindex);
204 if (netif) 245 if (netif)
205 sel_netif_destroy(netif); 246 sel_netif_destroy(netif);
206 spin_unlock_bh(&sel_netif_lock); 247 spin_unlock_bh(&sel_netif_lock);
207} 248}
208 249
250/**
251 * sel_netif_flush - Flush the entire network interface table
252 *
253 * Description:
254 * Remove all entries from the network interface table.
255 *
256 */
209static void sel_netif_flush(void) 257static void sel_netif_flush(void)
210{ 258{
211 int idx; 259 int idx;
260 struct sel_netif *netif;
212 261
213 spin_lock_bh(&sel_netif_lock); 262 spin_lock_bh(&sel_netif_lock);
214 for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) { 263 for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++)
215 struct sel_netif *netif;
216
217 list_for_each_entry(netif, &sel_netif_hash[idx], list) 264 list_for_each_entry(netif, &sel_netif_hash[idx], list)
218 sel_netif_destroy(netif); 265 sel_netif_destroy(netif);
219 }
220 spin_unlock_bh(&sel_netif_lock); 266 spin_unlock_bh(&sel_netif_lock);
221} 267}
222 268
@@ -239,7 +285,7 @@ static int sel_netif_netdev_notifier_handler(struct notifier_block *this,
239 return NOTIFY_DONE; 285 return NOTIFY_DONE;
240 286
241 if (event == NETDEV_DOWN) 287 if (event == NETDEV_DOWN)
242 sel_netif_kill(dev); 288 sel_netif_kill(dev->ifindex);
243 289
244 return NOTIFY_DONE; 290 return NOTIFY_DONE;
245} 291}
@@ -250,10 +296,10 @@ static struct notifier_block sel_netif_netdev_notifier = {
250 296
251static __init int sel_netif_init(void) 297static __init int sel_netif_init(void)
252{ 298{
253 int i, err = 0; 299 int i, err;
254 300
255 if (!selinux_enabled) 301 if (!selinux_enabled)
256 goto out; 302 return 0;
257 303
258 for (i = 0; i < SEL_NETIF_HASH_SIZE; i++) 304 for (i = 0; i < SEL_NETIF_HASH_SIZE; i++)
259 INIT_LIST_HEAD(&sel_netif_hash[i]); 305 INIT_LIST_HEAD(&sel_netif_hash[i]);
@@ -265,7 +311,6 @@ static __init int sel_netif_init(void)
265 if (err) 311 if (err)
266 panic("avc_add_callback() failed, error %d\n", err); 312 panic("avc_add_callback() failed, error %d\n", err);
267 313
268out:
269 return err; 314 return err;
270} 315}
271 316
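Since the rewritten table is keyed purely by ifindex, sel_netif_hashfn() reduces to masking against the power-of-two table size. A quick stand-alone check of how indexes spread across the 64 buckets (the constant is copied from SEL_NETIF_HASH_SIZE above):

#include <stdio.h>

#define SEL_NETIF_HASH_SIZE 64

static unsigned int sel_netif_hashfn(int ifindex)
{
	/* same masking trick as the kernel helper above */
	return ifindex & (SEL_NETIF_HASH_SIZE - 1);
}

int main(void)
{
	int ifindex;

	/* consecutive ifindex values land in consecutive buckets and only
	 * start colliding once more than 64 interfaces exist */
	for (ifindex = 1; ifindex <= 8; ifindex++)
		printf("ifindex %3d -> bucket %2u\n",
		       ifindex, sel_netif_hashfn(ifindex));
	printf("ifindex %3d -> bucket %2u\n", 65, sel_netif_hashfn(65));
	return 0;
}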
diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c
index 66e013d6f6f6..0fa2be4149e8 100644
--- a/security/selinux/netlabel.c
+++ b/security/selinux/netlabel.c
@@ -36,6 +36,33 @@
36#include "security.h" 36#include "security.h"
37 37
38/** 38/**
39 * selinux_netlbl_sidlookup_cached - Cache a SID lookup
40 * @skb: the packet
41 * @secattr: the NetLabel security attributes
42 * @sid: the SID
43 *
44 * Description:
 45 * Query the SELinux security server to look up the correct SID for the given
46 * security attributes. If the query is successful, cache the result to speed
47 * up future lookups. Returns zero on success, negative values on failure.
48 *
49 */
50static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb,
51 struct netlbl_lsm_secattr *secattr,
52 u32 *sid)
53{
54 int rc;
55
56 rc = security_netlbl_secattr_to_sid(secattr, sid);
57 if (rc == 0 &&
58 (secattr->flags & NETLBL_SECATTR_CACHEABLE) &&
59 (secattr->flags & NETLBL_SECATTR_CACHE))
60 netlbl_cache_add(skb, secattr);
61
62 return rc;
63}
64
65/**
39 * selinux_netlbl_sock_setsid - Label a socket using the NetLabel mechanism 66 * selinux_netlbl_sock_setsid - Label a socket using the NetLabel mechanism
40 * @sk: the socket to label 67 * @sk: the socket to label
41 * @sid: the SID to use 68 * @sid: the SID to use
@@ -137,14 +164,14 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
137 * lock as other threads could have access to ssec */ 164 * lock as other threads could have access to ssec */
138 rcu_read_lock(); 165 rcu_read_lock();
139 selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family); 166 selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family);
140 newssec->sclass = ssec->sclass;
141 rcu_read_unlock(); 167 rcu_read_unlock();
142} 168}
143 169
144/** 170/**
145 * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel 171 * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel
146 * @skb: the packet 172 * @skb: the packet
147 * @base_sid: the SELinux SID to use as a context for MLS only attributes 173 * @family: protocol family
174 * @type: NetLabel labeling protocol type
148 * @sid: the SID 175 * @sid: the SID
149 * 176 *
150 * Description: 177 * Description:
@@ -153,7 +180,10 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
153 * assign to the packet. Returns zero on success, negative values on failure. 180 * assign to the packet. Returns zero on success, negative values on failure.
154 * 181 *
155 */ 182 */
156int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) 183int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
184 u16 family,
185 u32 *type,
186 u32 *sid)
157{ 187{
158 int rc; 188 int rc;
159 struct netlbl_lsm_secattr secattr; 189 struct netlbl_lsm_secattr secattr;
@@ -164,15 +194,12 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
164 } 194 }
165 195
166 netlbl_secattr_init(&secattr); 196 netlbl_secattr_init(&secattr);
167 rc = netlbl_skbuff_getattr(skb, &secattr); 197 rc = netlbl_skbuff_getattr(skb, family, &secattr);
168 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) { 198 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
169 rc = security_netlbl_secattr_to_sid(&secattr, base_sid, sid); 199 rc = selinux_netlbl_sidlookup_cached(skb, &secattr, sid);
170 if (rc == 0 && 200 else
171 (secattr.flags & NETLBL_SECATTR_CACHEABLE) &&
172 (secattr.flags & NETLBL_SECATTR_CACHE))
173 netlbl_cache_add(skb, &secattr);
174 } else
175 *sid = SECSID_NULL; 201 *sid = SECSID_NULL;
202 *type = secattr.type;
176 netlbl_secattr_destroy(&secattr); 203 netlbl_secattr_destroy(&secattr);
177 204
178 return rc; 205 return rc;
@@ -190,13 +217,10 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
190 */ 217 */
191void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) 218void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock)
192{ 219{
193 struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
194 struct sk_security_struct *sksec = sk->sk_security; 220 struct sk_security_struct *sksec = sk->sk_security;
195 struct netlbl_lsm_secattr secattr; 221 struct netlbl_lsm_secattr secattr;
196 u32 nlbl_peer_sid; 222 u32 nlbl_peer_sid;
197 223
198 sksec->sclass = isec->sclass;
199
200 rcu_read_lock(); 224 rcu_read_lock();
201 225
202 if (sksec->nlbl_state != NLBL_REQUIRE) { 226 if (sksec->nlbl_state != NLBL_REQUIRE) {
@@ -207,9 +231,7 @@ void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock)
207 netlbl_secattr_init(&secattr); 231 netlbl_secattr_init(&secattr);
208 if (netlbl_sock_getattr(sk, &secattr) == 0 && 232 if (netlbl_sock_getattr(sk, &secattr) == 0 &&
209 secattr.flags != NETLBL_SECATTR_NONE && 233 secattr.flags != NETLBL_SECATTR_NONE &&
210 security_netlbl_secattr_to_sid(&secattr, 234 security_netlbl_secattr_to_sid(&secattr, &nlbl_peer_sid) == 0)
211 SECINITSID_NETMSG,
212 &nlbl_peer_sid) == 0)
213 sksec->peer_sid = nlbl_peer_sid; 235 sksec->peer_sid = nlbl_peer_sid;
214 netlbl_secattr_destroy(&secattr); 236 netlbl_secattr_destroy(&secattr);
215 237
@@ -234,11 +256,8 @@ int selinux_netlbl_socket_post_create(struct socket *sock)
234{ 256{
235 int rc = 0; 257 int rc = 0;
236 struct sock *sk = sock->sk; 258 struct sock *sk = sock->sk;
237 struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
238 struct sk_security_struct *sksec = sk->sk_security; 259 struct sk_security_struct *sksec = sk->sk_security;
239 260
240 sksec->sclass = isec->sclass;
241
242 rcu_read_lock(); 261 rcu_read_lock();
243 if (sksec->nlbl_state == NLBL_REQUIRE) 262 if (sksec->nlbl_state == NLBL_REQUIRE)
244 rc = selinux_netlbl_sock_setsid(sk, sksec->sid); 263 rc = selinux_netlbl_sock_setsid(sk, sksec->sid);
@@ -292,6 +311,7 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask)
292 * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel 311 * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel
293 * @sksec: the sock's sk_security_struct 312 * @sksec: the sock's sk_security_struct
294 * @skb: the packet 313 * @skb: the packet
314 * @family: protocol family
295 * @ad: the audit data 315 * @ad: the audit data
296 * 316 *
297 * Description: 317 * Description:
@@ -302,6 +322,7 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask)
302 */ 322 */
303int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, 323int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
304 struct sk_buff *skb, 324 struct sk_buff *skb,
325 u16 family,
305 struct avc_audit_data *ad) 326 struct avc_audit_data *ad)
306{ 327{
307 int rc; 328 int rc;
@@ -313,16 +334,10 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
313 return 0; 334 return 0;
314 335
315 netlbl_secattr_init(&secattr); 336 netlbl_secattr_init(&secattr);
316 rc = netlbl_skbuff_getattr(skb, &secattr); 337 rc = netlbl_skbuff_getattr(skb, family, &secattr);
317 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) { 338 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
318 rc = security_netlbl_secattr_to_sid(&secattr, 339 rc = selinux_netlbl_sidlookup_cached(skb, &secattr, &nlbl_sid);
319 SECINITSID_NETMSG, 340 else
320 &nlbl_sid);
321 if (rc == 0 &&
322 (secattr.flags & NETLBL_SECATTR_CACHEABLE) &&
323 (secattr.flags & NETLBL_SECATTR_CACHE))
324 netlbl_cache_add(skb, &secattr);
325 } else
326 nlbl_sid = SECINITSID_UNLABELED; 341 nlbl_sid = SECINITSID_UNLABELED;
327 netlbl_secattr_destroy(&secattr); 342 netlbl_secattr_destroy(&secattr);
328 if (rc != 0) 343 if (rc != 0)
diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c
new file mode 100644
index 000000000000..f3c526f2cacb
--- /dev/null
+++ b/security/selinux/netnode.c
@@ -0,0 +1,354 @@
1/*
2 * Network node table
3 *
4 * SELinux must keep a mapping of network nodes to labels/SIDs. This
5 * mapping is maintained as part of the normal policy but a fast cache is
6 * needed to reduce the lookup overhead since most of these queries happen on
7 * a per-packet basis.
8 *
9 * Author: Paul Moore <paul.moore@hp.com>
10 *
11 * This code is heavily based on the "netif" concept originally developed by
12 * James Morris <jmorris@redhat.com>
13 * (see security/selinux/netif.c for more information)
14 *
15 */
16
17/*
18 * (c) Copyright Hewlett-Packard Development Company, L.P., 2007
19 *
20 * This program is free software: you can redistribute it and/or modify
21 * it under the terms of version 2 of the GNU General Public License as
22 * published by the Free Software Foundation.
23 *
24 * This program is distributed in the hope that it will be useful,
25 * but WITHOUT ANY WARRANTY; without even the implied warranty of
26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 * GNU General Public License for more details.
28 *
29 */
30
31#include <linux/types.h>
32#include <linux/rcupdate.h>
33#include <linux/list.h>
34#include <linux/spinlock.h>
35#include <linux/in.h>
36#include <linux/in6.h>
37#include <linux/ip.h>
38#include <linux/ipv6.h>
39#include <net/ip.h>
40#include <net/ipv6.h>
41#include <asm/bug.h>
42
43#include "objsec.h"
44
45#define SEL_NETNODE_HASH_SIZE 256
46#define SEL_NETNODE_HASH_BKT_LIMIT 16
47
48struct sel_netnode {
49 struct netnode_security_struct nsec;
50
51 struct list_head list;
52 struct rcu_head rcu;
53};
54
 55/* NOTE: we are using a combined hash table for both IPv4 and IPv6; the reason
 56 * for this is that I suspect most users will not make heavy use of both
 57 * address families at the same time, so one table will usually end up wasted.
 58 * If this becomes a problem we can always add a hash table for each address
 59 * family later */
60
61static LIST_HEAD(sel_netnode_list);
62static DEFINE_SPINLOCK(sel_netnode_lock);
63static struct list_head sel_netnode_hash[SEL_NETNODE_HASH_SIZE];
64
65/**
66 * sel_netnode_free - Frees a node entry
67 * @p: the entry's RCU field
68 *
69 * Description:
70 * This function is designed to be used as a callback to the call_rcu()
71 * function so that memory allocated to a hash table node entry can be
72 * released safely.
73 *
74 */
75static void sel_netnode_free(struct rcu_head *p)
76{
77 struct sel_netnode *node = container_of(p, struct sel_netnode, rcu);
78 kfree(node);
79}
80
81/**
82 * sel_netnode_hashfn_ipv4 - IPv4 hashing function for the node table
83 * @addr: IPv4 address
84 *
85 * Description:
 86 * This is the IPv4 hashing function for the node table; it returns
87 * the bucket number for the given IP address.
88 *
89 */
90static u32 sel_netnode_hashfn_ipv4(__be32 addr)
91{
92 /* at some point we should determine if the mismatch in byte order
93 * affects the hash function dramatically */
94 return (addr & (SEL_NETNODE_HASH_SIZE - 1));
95}
96
97/**
98 * sel_netnode_hashfn_ipv6 - IPv6 hashing function for the node table
99 * @addr: IPv6 address
100 *
101 * Description:
102 * This is the IPv6 hashing function for the node interface table, it returns
103 * the bucket number for the given IP address.
104 *
105 */
106static u32 sel_netnode_hashfn_ipv6(const struct in6_addr *addr)
107{
108 /* just hash the least significant 32 bits to keep things fast (they
109 * are the most likely to be different anyway), we can revisit this
110 * later if needed */
111 return (addr->s6_addr32[3] & (SEL_NETNODE_HASH_SIZE - 1));
112}
113
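The two hash functions above both reduce an address to a bucket index by masking against SEL_NETNODE_HASH_SIZE - 1, which only works because the table size is a power of two; for IPv6 only the last 32 bits of the address take part. A minimal userspace sketch of that bucket selection, with stand-in types for __be32 and struct in6_addr (names here are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

#define HASH_SIZE 256	/* must stay a power of two for the mask trick */

typedef uint32_t be32;			/* stand-in for the kernel's __be32 */
struct ip6 { uint32_t s6_addr32[4]; };	/* stand-in for struct in6_addr */

static uint32_t hash_ipv4(be32 addr)
{
	/* same idea as sel_netnode_hashfn_ipv4(): keep the low bits */
	return addr & (HASH_SIZE - 1);
}

static uint32_t hash_ipv6(const struct ip6 *addr)
{
	/* only the least significant 32 bits are hashed */
	return addr->s6_addr32[3] & (HASH_SIZE - 1);
}

int main(void)
{
	be32 v4 = 0x0a00000b;				/* arbitrary example */
	struct ip6 v6 = { { 0, 0, 0, 0x12345678 } };

	printf("ipv4 bucket: %u\n", (unsigned)hash_ipv4(v4));
	printf("ipv6 bucket: %u\n", (unsigned)hash_ipv6(&v6));
	return 0;
}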
114/**
115 * sel_netnode_find - Search for a node record
116 * @addr: IP address
117 * @family: address family
118 *
119 * Description:
120 * Search the network node table and return the record matching @addr. If an
121 * entry can not be found in the table return NULL.
122 *
123 */
124static struct sel_netnode *sel_netnode_find(const void *addr, u16 family)
125{
126 u32 idx;
127 struct sel_netnode *node;
128
129 switch (family) {
130 case PF_INET:
131 idx = sel_netnode_hashfn_ipv4(*(__be32 *)addr);
132 break;
133 case PF_INET6:
134 idx = sel_netnode_hashfn_ipv6(addr);
135 break;
136 default:
137 BUG();
138 }
139
140 list_for_each_entry_rcu(node, &sel_netnode_hash[idx], list)
141 if (node->nsec.family == family)
142 switch (family) {
143 case PF_INET:
144 if (node->nsec.addr.ipv4 == *(__be32 *)addr)
145 return node;
146 break;
147 case PF_INET6:
148 if (ipv6_addr_equal(&node->nsec.addr.ipv6,
149 addr))
150 return node;
151 break;
152 }
153
154 return NULL;
155}
156
157/**
158 * sel_netnode_insert - Insert a new node into the table
159 * @node: the new node record
160 *
161 * Description:
162 * Add a new node record to the network address hash table. Returns zero on
163 * success, negative values on failure.
164 *
165 */
166static int sel_netnode_insert(struct sel_netnode *node)
167{
168 u32 idx;
169 u32 count = 0;
170 struct sel_netnode *iter;
171
172 switch (node->nsec.family) {
173 case PF_INET:
174 idx = sel_netnode_hashfn_ipv4(node->nsec.addr.ipv4);
175 break;
176 case PF_INET6:
177 idx = sel_netnode_hashfn_ipv6(&node->nsec.addr.ipv6);
178 break;
179 default:
180 BUG();
181 }
182 list_add_rcu(&node->list, &sel_netnode_hash[idx]);
183
184 /* we need to impose a limit on the growth of the hash table so check
185 * this bucket to make sure it is within the specified bounds */
186 list_for_each_entry(iter, &sel_netnode_hash[idx], list)
187 if (++count > SEL_NETNODE_HASH_BKT_LIMIT) {
188 list_del_rcu(&iter->list);
189 call_rcu(&iter->rcu, sel_netnode_free);
190 break;
191 }
192
193 return 0;
194}
195
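sel_netnode_insert() adds the new entry at the head of its bucket and then walks the bucket, dropping whichever entry pushes the chain past SEL_NETNODE_HASH_BKT_LIMIT; because new entries always go to the front, the victim is the oldest one. A rough userspace sketch of that insert-at-head, trim-the-tail policy on a plain singly linked list (the list handling is simplified and is not the kernel list API):

#include <stdio.h>
#include <stdlib.h>

#define BKT_LIMIT 16

struct node {
	int key;
	struct node *next;
};

/* insert at the head, then evict the entry that exceeds the limit */
static void bucket_insert(struct node **head, int key)
{
	struct node *n = malloc(sizeof(*n));
	struct node **pp;
	unsigned int count = 0;

	if (!n)
		return;
	n->key = key;
	n->next = *head;
	*head = n;

	for (pp = head; *pp != NULL; pp = &(*pp)->next)
		if (++count > BKT_LIMIT) {
			struct node *victim = *pp;

			*pp = victim->next;
			free(victim);
			break;
		}
}

int main(void)
{
	struct node *bucket = NULL, *iter;
	int i, len = 0;

	for (i = 0; i < 100; i++)
		bucket_insert(&bucket, i);
	for (iter = bucket; iter != NULL; iter = iter->next)
		len++;
	printf("bucket length capped at %d\n", len);	/* prints 16 */
	return 0;
}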
196/**
197 * sel_netnode_destroy - Remove a node record from the table
198 * @node: the existing node record
199 *
200 * Description:
201 * Remove an existing node record from the network address table.
202 *
203 */
204static void sel_netnode_destroy(struct sel_netnode *node)
205{
206 list_del_rcu(&node->list);
207 call_rcu(&node->rcu, sel_netnode_free);
208}
209
210/**
211 * sel_netnode_sid_slow - Lookup the SID of a network address using the policy
212 * @addr: the IP address
213 * @family: the address family
214 * @sid: node SID
215 *
216 * Description:
217 * This function determines the SID of a network address by querying the
218 * security policy. The result is added to the network address table to
219 * speed up future queries. Returns zero on success, negative values on
220 * failure.
221 *
222 */
223static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid)
224{
225 int ret;
226 struct sel_netnode *node;
227 struct sel_netnode *new = NULL;
228
229 spin_lock_bh(&sel_netnode_lock);
230 node = sel_netnode_find(addr, family);
231 if (node != NULL) {
232 *sid = node->nsec.sid;
233 ret = 0;
234 goto out;
235 }
236 new = kzalloc(sizeof(*new), GFP_ATOMIC);
237 if (new == NULL) {
238 ret = -ENOMEM;
239 goto out;
240 }
241 switch (family) {
242 case PF_INET:
243 ret = security_node_sid(PF_INET,
244 addr, sizeof(struct in_addr),
245 &new->nsec.sid);
246 new->nsec.addr.ipv4 = *(__be32 *)addr;
247 break;
248 case PF_INET6:
249 ret = security_node_sid(PF_INET6,
250 addr, sizeof(struct in6_addr),
251 &new->nsec.sid);
252 ipv6_addr_copy(&new->nsec.addr.ipv6, addr);
253 break;
254 default:
255 BUG();
256 }
257 if (ret != 0)
258 goto out;
259 new->nsec.family = family;
260 ret = sel_netnode_insert(new);
261 if (ret != 0)
262 goto out;
263 *sid = new->nsec.sid;
264
265out:
266 spin_unlock_bh(&sel_netnode_lock);
267 if (unlikely(ret)) {
268 printk(KERN_WARNING
269 "SELinux: failure in sel_netnode_sid_slow(),"
270 " unable to determine network node label\n");
271 kfree(new);
272 }
273 return ret;
274}
275
276/**
277 * sel_netnode_sid - Lookup the SID of a network address
278 * @addr: the IP address
279 * @family: the address family
280 * @sid: node SID
281 *
282 * Description:
283 * This function determines the SID of a network address using the fastest
284 * method possible. First the address table is queried, but if an entry
285 * can't be found then the policy is queried and the result is added to the
286 * table to speed up future queries. Returns zero on success, negative values
287 * on failure.
288 *
289 */
290int sel_netnode_sid(void *addr, u16 family, u32 *sid)
291{
292 struct sel_netnode *node;
293
294 rcu_read_lock();
295 node = sel_netnode_find(addr, family);
296 if (node != NULL) {
297 *sid = node->nsec.sid;
298 rcu_read_unlock();
299 return 0;
300 }
301 rcu_read_unlock();
302
303 return sel_netnode_sid_slow(addr, family, sid);
304}
305
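sel_netnode_sid() is the entry point the rest of SELinux calls: a lockless RCU walk handles the hot path, and only a miss falls through to sel_netnode_sid_slow(), which takes the spinlock, re-checks the table (another CPU may have raced the same address in), and only then queries the policy and inserts the result. The control flow is the familiar check-cache, else lock, re-check, compute, insert shape; a deliberately single-threaded userspace sketch of just that flow, with a pthread mutex standing in for the spinlock and a trivial computation standing in for the policy query (it does not reproduce the RCU memory-ordering guarantees):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int cached_key = -1;	/* -1 means "nothing cached yet" */
static int cached_val;

/* stand-in for the expensive security_node_sid() policy lookup */
static int slow_compute(int key)
{
	return key * 7;
}

static int lookup(int key, int *val)
{
	/* fast path: in the kernel this is the RCU-protected hash walk */
	if (cached_key == key) {
		*val = cached_val;
		return 0;
	}

	/* slow path: serialize writers and re-check before computing */
	pthread_mutex_lock(&lock);
	if (cached_key != key) {
		cached_val = slow_compute(key);
		cached_key = key;
	}
	*val = cached_val;
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void)
{
	int v;

	lookup(6, &v);		/* miss: takes the slow path */
	printf("first lookup:  %d\n", v);
	lookup(6, &v);		/* hit: fast path only */
	printf("second lookup: %d\n", v);
	return 0;
}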
306/**
307 * sel_netnode_flush - Flush the entire network address table
308 *
309 * Description:
310 * Remove all entries from the network address table.
311 *
312 */
313static void sel_netnode_flush(void)
314{
315 u32 idx;
316 struct sel_netnode *node;
317
318 spin_lock_bh(&sel_netnode_lock);
319 for (idx = 0; idx < SEL_NETNODE_HASH_SIZE; idx++)
320 list_for_each_entry(node, &sel_netnode_hash[idx], list)
321 sel_netnode_destroy(node);
322 spin_unlock_bh(&sel_netnode_lock);
323}
324
325static int sel_netnode_avc_callback(u32 event, u32 ssid, u32 tsid,
326 u16 class, u32 perms, u32 *retained)
327{
328 if (event == AVC_CALLBACK_RESET) {
329 sel_netnode_flush();
330 synchronize_net();
331 }
332 return 0;
333}
334
335static __init int sel_netnode_init(void)
336{
337 int iter;
338 int ret;
339
340 if (!selinux_enabled)
341 return 0;
342
343 for (iter = 0; iter < SEL_NETNODE_HASH_SIZE; iter++)
344 INIT_LIST_HEAD(&sel_netnode_hash[iter]);
345
346 ret = avc_add_callback(sel_netnode_avc_callback, AVC_CALLBACK_RESET,
347 SECSID_NULL, SECSID_NULL, SECCLASS_NULL, 0);
348 if (ret != 0)
349 panic("avc_add_callback() failed, error %d\n", ret);
350
351 return ret;
352}
353
354__initcall(sel_netnode_init);
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 397fd4955fe1..a85740530afc 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -2,6 +2,11 @@
2 * 2 *
3 * Added conditional policy language extensions 3 * Added conditional policy language extensions
4 * 4 *
5 * Updated: Hewlett-Packard <paul.moore@hp.com>
6 *
7 * Added support for the policy capability bitmap
8 *
9 * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
5 * Copyright (C) 2003 - 2004 Tresys Technology, LLC 10 * Copyright (C) 2003 - 2004 Tresys Technology, LLC
6 * Copyright (C) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> 11 * Copyright (C) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
7 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
@@ -35,6 +40,11 @@
35#include "objsec.h" 40#include "objsec.h"
36#include "conditional.h" 41#include "conditional.h"
37 42
43/* Policy capability filenames */
44static char *policycap_names[] = {
45 "network_peer_controls"
46};
47
38unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; 48unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE;
39 49
40#ifdef CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT 50#ifdef CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT
@@ -72,6 +82,9 @@ static int *bool_pending_values = NULL;
72static struct dentry *class_dir = NULL; 82static struct dentry *class_dir = NULL;
73static unsigned long last_class_ino; 83static unsigned long last_class_ino;
74 84
85/* global data for policy capabilities */
86static struct dentry *policycap_dir = NULL;
87
75extern void selnl_notify_setenforce(int val); 88extern void selnl_notify_setenforce(int val);
76 89
77/* Check whether a task is allowed to use a security operation. */ 90/* Check whether a task is allowed to use a security operation. */
@@ -111,10 +124,11 @@ enum sel_inos {
111 124
112static unsigned long sel_last_ino = SEL_INO_NEXT - 1; 125static unsigned long sel_last_ino = SEL_INO_NEXT - 1;
113 126
114#define SEL_INITCON_INO_OFFSET 0x01000000 127#define SEL_INITCON_INO_OFFSET 0x01000000
115#define SEL_BOOL_INO_OFFSET 0x02000000 128#define SEL_BOOL_INO_OFFSET 0x02000000
116#define SEL_CLASS_INO_OFFSET 0x04000000 129#define SEL_CLASS_INO_OFFSET 0x04000000
117#define SEL_INO_MASK 0x00ffffff 130#define SEL_POLICYCAP_INO_OFFSET 0x08000000
131#define SEL_INO_MASK 0x00ffffff
118 132
119#define TMPBUFLEN 12 133#define TMPBUFLEN 12
120static ssize_t sel_read_enforce(struct file *filp, char __user *buf, 134static ssize_t sel_read_enforce(struct file *filp, char __user *buf,
@@ -263,6 +277,7 @@ static const struct file_operations sel_policyvers_ops = {
263/* declaration for sel_write_load */ 277/* declaration for sel_write_load */
264static int sel_make_bools(void); 278static int sel_make_bools(void);
265static int sel_make_classes(void); 279static int sel_make_classes(void);
280static int sel_make_policycap(void);
266 281
267/* declaration for sel_make_class_dirs */ 282/* declaration for sel_make_class_dirs */
268static int sel_make_dir(struct inode *dir, struct dentry *dentry, 283static int sel_make_dir(struct inode *dir, struct dentry *dentry,
@@ -323,6 +338,12 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf,
323 } 338 }
324 339
325 ret = sel_make_classes(); 340 ret = sel_make_classes();
341 if (ret) {
342 length = ret;
343 goto out1;
344 }
345
346 ret = sel_make_policycap();
326 if (ret) 347 if (ret)
327 length = ret; 348 length = ret;
328 else 349 else
@@ -1399,6 +1420,24 @@ static const struct file_operations sel_perm_ops = {
1399 .read = sel_read_perm, 1420 .read = sel_read_perm,
1400}; 1421};
1401 1422
1423static ssize_t sel_read_policycap(struct file *file, char __user *buf,
1424 size_t count, loff_t *ppos)
1425{
1426 int value;
1427 char tmpbuf[TMPBUFLEN];
1428 ssize_t length;
1429 unsigned long i_ino = file->f_path.dentry->d_inode->i_ino;
1430
1431 value = security_policycap_supported(i_ino & SEL_INO_MASK);
1432 length = scnprintf(tmpbuf, TMPBUFLEN, "%d", value);
1433
1434 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1435}
1436
1437static const struct file_operations sel_policycap_ops = {
1438 .read = sel_read_policycap,
1439};
1440
1402static int sel_make_perm_files(char *objclass, int classvalue, 1441static int sel_make_perm_files(char *objclass, int classvalue,
1403 struct dentry *dir) 1442 struct dentry *dir)
1404{ 1443{
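sel_read_policycap() recovers the capability index from the file's inode number with SEL_INO_MASK, asks security_policycap_supported() whether the loaded policy declares it, and prints a bare 0 or 1. From userspace each capability therefore shows up as a one-character file under the policy_capabilities directory; a minimal reader, assuming the usual selinuxfs mount point of this era (/selinux):

#include <stdio.h>

int main(void)
{
	/* path assumes selinuxfs is mounted on /selinux */
	const char *path = "/selinux/policy_capabilities/network_peer_controls";
	FILE *f = fopen(path, "r");
	int c;

	if (!f) {
		perror("fopen");
		return 1;
	}
	c = fgetc(f);
	printf("network_peer_controls: %s\n",
	       c == '1' ? "enabled" : "disabled");
	fclose(f);
	return 0;
}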
@@ -1545,6 +1584,36 @@ out:
1545 return rc; 1584 return rc;
1546} 1585}
1547 1586
1587static int sel_make_policycap(void)
1588{
1589 unsigned int iter;
1590 struct dentry *dentry = NULL;
1591 struct inode *inode = NULL;
1592
1593 sel_remove_entries(policycap_dir);
1594
1595 for (iter = 0; iter <= POLICYDB_CAPABILITY_MAX; iter++) {
1596 if (iter < ARRAY_SIZE(policycap_names))
1597 dentry = d_alloc_name(policycap_dir,
1598 policycap_names[iter]);
1599 else
1600 dentry = d_alloc_name(policycap_dir, "unknown");
1601
1602 if (dentry == NULL)
1603 return -ENOMEM;
1604
1605 inode = sel_make_inode(policycap_dir->d_sb, S_IFREG | S_IRUGO);
1606 if (inode == NULL)
1607 return -ENOMEM;
1608
1609 inode->i_fop = &sel_policycap_ops;
1610 inode->i_ino = iter | SEL_POLICYCAP_INO_OFFSET;
1611 d_add(dentry, inode);
1612 }
1613
1614 return 0;
1615}
1616
1548static int sel_make_dir(struct inode *dir, struct dentry *dentry, 1617static int sel_make_dir(struct inode *dir, struct dentry *dentry,
1549 unsigned long *ino) 1618 unsigned long *ino)
1550{ 1619{
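sel_make_policycap() is the other half of the inode trick used by sel_read_policycap(): each capability file gets i_ino = iter | SEL_POLICYCAP_INO_OFFSET, and the reader masks the offset back off with SEL_INO_MASK. A tiny round-trip sketch with the same constants:

#include <stdio.h>

#define SEL_POLICYCAP_INO_OFFSET 0x08000000
#define SEL_INO_MASK             0x00ffffff

int main(void)
{
	unsigned long iter = 0;	/* e.g. the "network_peer_controls" slot */
	unsigned long ino = iter | SEL_POLICYCAP_INO_OFFSET;

	printf("ino       = 0x%08lx\n", ino);
	printf("cap index = %lu\n", ino & SEL_INO_MASK);
	return 0;
}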
@@ -1673,6 +1742,18 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent)
1673 1742
1674 class_dir = dentry; 1743 class_dir = dentry;
1675 1744
1745 dentry = d_alloc_name(sb->s_root, "policy_capabilities");
1746 if (!dentry) {
1747 ret = -ENOMEM;
1748 goto err;
1749 }
1750
1751 ret = sel_make_dir(root_inode, dentry, &sel_last_ino);
1752 if (ret)
1753 goto err;
1754
1755 policycap_dir = dentry;
1756
1676out: 1757out:
1677 return ret; 1758 return ret;
1678err: 1759err:
diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c
index 3bbcb5369af9..feaf0a5b828f 100644
--- a/security/selinux/ss/mls.c
+++ b/security/selinux/ss/mls.c
@@ -562,7 +562,7 @@ void mls_export_netlbl_lvl(struct context *context,
562 if (!selinux_mls_enabled) 562 if (!selinux_mls_enabled)
563 return; 563 return;
564 564
565 secattr->mls_lvl = context->range.level[0].sens - 1; 565 secattr->attr.mls.lvl = context->range.level[0].sens - 1;
566 secattr->flags |= NETLBL_SECATTR_MLS_LVL; 566 secattr->flags |= NETLBL_SECATTR_MLS_LVL;
567} 567}
568 568
@@ -582,7 +582,7 @@ void mls_import_netlbl_lvl(struct context *context,
582 if (!selinux_mls_enabled) 582 if (!selinux_mls_enabled)
583 return; 583 return;
584 584
585 context->range.level[0].sens = secattr->mls_lvl + 1; 585 context->range.level[0].sens = secattr->attr.mls.lvl + 1;
586 context->range.level[1].sens = context->range.level[0].sens; 586 context->range.level[1].sens = context->range.level[0].sens;
587} 587}
588 588
@@ -605,8 +605,8 @@ int mls_export_netlbl_cat(struct context *context,
605 return 0; 605 return 0;
606 606
607 rc = ebitmap_netlbl_export(&context->range.level[0].cat, 607 rc = ebitmap_netlbl_export(&context->range.level[0].cat,
608 &secattr->mls_cat); 608 &secattr->attr.mls.cat);
609 if (rc == 0 && secattr->mls_cat != NULL) 609 if (rc == 0 && secattr->attr.mls.cat != NULL)
610 secattr->flags |= NETLBL_SECATTR_MLS_CAT; 610 secattr->flags |= NETLBL_SECATTR_MLS_CAT;
611 611
612 return rc; 612 return rc;
@@ -633,7 +633,7 @@ int mls_import_netlbl_cat(struct context *context,
633 return 0; 633 return 0;
634 634
635 rc = ebitmap_netlbl_import(&context->range.level[0].cat, 635 rc = ebitmap_netlbl_import(&context->range.level[0].cat,
636 secattr->mls_cat); 636 secattr->attr.mls.cat);
637 if (rc != 0) 637 if (rc != 0)
638 goto import_netlbl_cat_failure; 638 goto import_netlbl_cat_failure;
639 639
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index b582aae3c62c..bd7d6a00342d 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -13,6 +13,11 @@
13 * 13 *
14 * Added conditional policy language extensions 14 * Added conditional policy language extensions
15 * 15 *
16 * Updated: Hewlett-Packard <paul.moore@hp.com>
17 *
18 * Added support for the policy capability bitmap
19 *
20 * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
16 * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. 21 * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
17 * Copyright (C) 2003 - 2004 Tresys Technology, LLC 22 * Copyright (C) 2003 - 2004 Tresys Technology, LLC
18 * This program is free software; you can redistribute it and/or modify 23 * This program is free software; you can redistribute it and/or modify
@@ -102,6 +107,11 @@ static struct policydb_compat_info policydb_compat[] = {
102 .sym_num = SYM_NUM, 107 .sym_num = SYM_NUM,
103 .ocon_num = OCON_NUM, 108 .ocon_num = OCON_NUM,
104 }, 109 },
110 {
111 .version = POLICYDB_VERSION_POLCAP,
112 .sym_num = SYM_NUM,
113 .ocon_num = OCON_NUM,
114 }
105}; 115};
106 116
107static struct policydb_compat_info *policydb_lookup_compat(int version) 117static struct policydb_compat_info *policydb_lookup_compat(int version)
@@ -183,6 +193,8 @@ static int policydb_init(struct policydb *p)
183 if (rc) 193 if (rc)
184 goto out_free_symtab; 194 goto out_free_symtab;
185 195
196 ebitmap_init(&p->policycaps);
197
186out: 198out:
187 return rc; 199 return rc;
188 200
@@ -673,8 +685,8 @@ void policydb_destroy(struct policydb *p)
673 ebitmap_destroy(&p->type_attr_map[i]); 685 ebitmap_destroy(&p->type_attr_map[i]);
674 } 686 }
675 kfree(p->type_attr_map); 687 kfree(p->type_attr_map);
676
677 kfree(p->undefined_perms); 688 kfree(p->undefined_perms);
689 ebitmap_destroy(&p->policycaps);
678 690
679 return; 691 return;
680} 692}
@@ -1554,6 +1566,10 @@ int policydb_read(struct policydb *p, void *fp)
1554 p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN); 1566 p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN);
1555 p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN); 1567 p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN);
1556 1568
1569 if (p->policyvers >= POLICYDB_VERSION_POLCAP &&
1570 ebitmap_read(&p->policycaps, fp) != 0)
1571 goto bad;
1572
1557 info = policydb_lookup_compat(p->policyvers); 1573 info = policydb_lookup_compat(p->policyvers);
1558 if (!info) { 1574 if (!info) {
1559 printk(KERN_ERR "security: unable to find policy compat info " 1575 printk(KERN_ERR "security: unable to find policy compat info "
diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h
index ed6fc687c66f..c4ce996e202c 100644
--- a/security/selinux/ss/policydb.h
+++ b/security/selinux/ss/policydb.h
@@ -241,6 +241,8 @@ struct policydb {
241 /* type -> attribute reverse mapping */ 241 /* type -> attribute reverse mapping */
242 struct ebitmap *type_attr_map; 242 struct ebitmap *type_attr_map;
243 243
244 struct ebitmap policycaps;
245
244 unsigned int policyvers; 246 unsigned int policyvers;
245 247
246 unsigned int reject_unknown : 1; 248 unsigned int reject_unknown : 1;
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 4bf715d4cf29..f96dec1f9258 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -16,12 +16,13 @@
16 * Updated: Hewlett-Packard <paul.moore@hp.com> 16 * Updated: Hewlett-Packard <paul.moore@hp.com>
17 * 17 *
18 * Added support for NetLabel 18 * Added support for NetLabel
19 * Added support for the policy capability bitmap
19 * 20 *
20 * Updated: Chad Sellers <csellers@tresys.com> 21 * Updated: Chad Sellers <csellers@tresys.com>
21 * 22 *
22 * Added validation of kernel classes and permissions 23 * Added validation of kernel classes and permissions
23 * 24 *
24 * Copyright (C) 2006 Hewlett-Packard Development Company, L.P. 25 * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P.
25 * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. 26 * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
26 * Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC 27 * Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC
27 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> 28 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
@@ -59,6 +60,8 @@
59extern void selnl_notify_policyload(u32 seqno); 60extern void selnl_notify_policyload(u32 seqno);
60unsigned int policydb_loaded_version; 61unsigned int policydb_loaded_version;
61 62
63int selinux_policycap_netpeer;
64
62/* 65/*
63 * This is declared in avc.c 66 * This is declared in avc.c
64 */ 67 */
@@ -1299,6 +1302,12 @@ bad:
1299 goto out; 1302 goto out;
1300} 1303}
1301 1304
1305static void security_load_policycaps(void)
1306{
1307 selinux_policycap_netpeer = ebitmap_get_bit(&policydb.policycaps,
1308 POLICYDB_CAPABILITY_NETPEER);
1309}
1310
1302extern void selinux_complete_init(void); 1311extern void selinux_complete_init(void);
1303static int security_preserve_bools(struct policydb *p); 1312static int security_preserve_bools(struct policydb *p);
1304 1313
@@ -1346,6 +1355,7 @@ int security_load_policy(void *data, size_t len)
1346 avtab_cache_destroy(); 1355 avtab_cache_destroy();
1347 return -EINVAL; 1356 return -EINVAL;
1348 } 1357 }
1358 security_load_policycaps();
1349 policydb_loaded_version = policydb.policyvers; 1359 policydb_loaded_version = policydb.policyvers;
1350 ss_initialized = 1; 1360 ss_initialized = 1;
1351 seqno = ++latest_granting; 1361 seqno = ++latest_granting;
@@ -1404,6 +1414,7 @@ int security_load_policy(void *data, size_t len)
1404 POLICY_WRLOCK; 1414 POLICY_WRLOCK;
1405 memcpy(&policydb, &newpolicydb, sizeof policydb); 1415 memcpy(&policydb, &newpolicydb, sizeof policydb);
1406 sidtab_set(&sidtab, &newsidtab); 1416 sidtab_set(&sidtab, &newsidtab);
1417 security_load_policycaps();
1407 seqno = ++latest_granting; 1418 seqno = ++latest_granting;
1408 policydb_loaded_version = policydb.policyvers; 1419 policydb_loaded_version = policydb.policyvers;
1409 POLICY_WRUNLOCK; 1420 POLICY_WRUNLOCK;
@@ -1478,11 +1489,8 @@ out:
1478 * security_netif_sid - Obtain the SID for a network interface. 1489 * security_netif_sid - Obtain the SID for a network interface.
1479 * @name: interface name 1490 * @name: interface name
1480 * @if_sid: interface SID 1491 * @if_sid: interface SID
1481 * @msg_sid: default SID for received packets
1482 */ 1492 */
1483int security_netif_sid(char *name, 1493int security_netif_sid(char *name, u32 *if_sid)
1484 u32 *if_sid,
1485 u32 *msg_sid)
1486{ 1494{
1487 int rc = 0; 1495 int rc = 0;
1488 struct ocontext *c; 1496 struct ocontext *c;
@@ -1510,11 +1518,8 @@ int security_netif_sid(char *name,
1510 goto out; 1518 goto out;
1511 } 1519 }
1512 *if_sid = c->sid[0]; 1520 *if_sid = c->sid[0];
1513 *msg_sid = c->sid[1]; 1521 } else
1514 } else {
1515 *if_sid = SECINITSID_NETIF; 1522 *if_sid = SECINITSID_NETIF;
1516 *msg_sid = SECINITSID_NETMSG;
1517 }
1518 1523
1519out: 1524out:
1520 POLICY_RDUNLOCK; 1525 POLICY_RDUNLOCK;
@@ -2049,6 +2054,91 @@ out:
2049 return rc; 2054 return rc;
2050} 2055}
2051 2056
2057/**
2058 * security_net_peersid_resolve - Compare and resolve two network peer SIDs
2059 * @nlbl_sid: NetLabel SID
2060 * @nlbl_type: NetLabel labeling protocol type
2061 * @xfrm_sid: XFRM SID
2062 *
2063 * Description:
2064 * Compare the @nlbl_sid and @xfrm_sid values and if the two SIDs can be
2065 * resolved into a single SID it is returned via @peer_sid and the function
2066 * returns zero. Otherwise @peer_sid is set to SECSID_NULL and the function
2067 * returns a negative value. A table summarizing the behavior is below:
2068 *
2069 * | function return | @sid
2070 * ------------------------------+-----------------+-----------------
2071 * no peer labels | 0 | SECSID_NULL
2072 * single peer label | 0 | <peer_label>
2073 * multiple, consistent labels | 0 | <peer_label>
2074 * multiple, inconsistent labels | -<errno> | SECSID_NULL
2075 *
2076 */
2077int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type,
2078 u32 xfrm_sid,
2079 u32 *peer_sid)
2080{
2081 int rc;
2082 struct context *nlbl_ctx;
2083 struct context *xfrm_ctx;
2084
2085 /* handle the common (which also happens to be the set of easy) cases
2086 * right away, these two if statements catch everything involving a
2087 * single or absent peer SID/label */
2088 if (xfrm_sid == SECSID_NULL) {
2089 *peer_sid = nlbl_sid;
2090 return 0;
2091 }
2092 /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label
2093 * and is treated as if nlbl_sid == SECSID_NULL when a XFRM SID/label
2094 * is present */
2095 if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) {
2096 *peer_sid = xfrm_sid;
2097 return 0;
2098 }
2099
2100 /* we don't need to check ss_initialized here since the only way both
2101 * nlbl_sid and xfrm_sid are not equal to SECSID_NULL would be if the
2102 * security server was initialized and ss_initialized was true */
2103 if (!selinux_mls_enabled) {
2104 *peer_sid = SECSID_NULL;
2105 return 0;
2106 }
2107
2108 POLICY_RDLOCK;
2109
2110 nlbl_ctx = sidtab_search(&sidtab, nlbl_sid);
2111 if (!nlbl_ctx) {
2112 printk(KERN_ERR
2113 "security_sid_mls_cmp: unrecognized SID %d\n",
2114 nlbl_sid);
2115 rc = -EINVAL;
2116 goto out_slowpath;
2117 }
2118 xfrm_ctx = sidtab_search(&sidtab, xfrm_sid);
2119 if (!xfrm_ctx) {
2120 printk(KERN_ERR
2121 "security_sid_mls_cmp: unrecognized SID %d\n",
2122 xfrm_sid);
2123 rc = -EINVAL;
2124 goto out_slowpath;
2125 }
2126 rc = (mls_context_cmp(nlbl_ctx, xfrm_ctx) ? 0 : -EACCES);
2127
2128out_slowpath:
2129 POLICY_RDUNLOCK;
2130 if (rc == 0)
2131 /* at present NetLabel SIDs/labels really only carry MLS
2132 * information so if the MLS portion of the NetLabel SID
2133 * matches the MLS portion of the labeled XFRM SID/label
2134 * then pass along the XFRM SID as it is the most
2135 * expressive */
2136 *peer_sid = xfrm_sid;
2137 else
2138 *peer_sid = SECSID_NULL;
2139 return rc;
2140}
2141
2052static int get_classes_callback(void *k, void *d, void *args) 2142static int get_classes_callback(void *k, void *d, void *args)
2053{ 2143{
2054 struct class_datum *datum = d; 2144 struct class_datum *datum = d;
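security_net_peersid_resolve() implements the table in its comment: no XFRM label means the NetLabel SID is the answer (possibly SECSID_NULL), no NetLabel label (or only the unlabeled fallback) means the XFRM SID is the answer, and when both are present their MLS portions must agree, in which case the more expressive XFRM SID is returned. A standalone sketch of just that decision logic, with a boolean flag standing in for the sidtab lookups and mls_context_cmp(), and with the MLS-disabled and lookup-error branches omitted:

#include <stdio.h>
#include <errno.h>

#define SECSID_NULL 0u

/* returns 0 and sets *peer when the two labels can be resolved */
static int peersid_resolve(unsigned int nlbl_sid, int nlbl_is_fallback,
			   unsigned int xfrm_sid, int mls_match,
			   unsigned int *peer)
{
	if (xfrm_sid == SECSID_NULL) {
		*peer = nlbl_sid;		/* may itself be SECSID_NULL */
		return 0;
	}
	if (nlbl_sid == SECSID_NULL || nlbl_is_fallback) {
		*peer = xfrm_sid;
		return 0;
	}
	if (!mls_match) {
		*peer = SECSID_NULL;		/* inconsistent labels */
		return -EACCES;
	}
	*peer = xfrm_sid;			/* consistent: prefer XFRM */
	return 0;
}

int main(void)
{
	unsigned int peer;
	int rc;

	rc = peersid_resolve(SECSID_NULL, 0, 42, 1, &peer);
	printf("xfrm only: rc=%d peer=%u\n", rc, peer);

	rc = peersid_resolve(7, 0, 42, 0, &peer);
	printf("conflict : rc=%d peer=%u\n", rc, peer);
	return 0;
}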
@@ -2154,6 +2244,60 @@ int security_get_allow_unknown(void)
2154 return policydb.allow_unknown; 2244 return policydb.allow_unknown;
2155} 2245}
2156 2246
2247/**
2248 * security_get_policycaps - Query the loaded policy for its capabilities
2249 * @len: the number of capability bits
2250 * @values: the capability bit array
2251 *
2252 * Description:
2253 * Get an array of the policy capabilities in @values where each entry in
2254 * @values is either true (1) or false (0) depending on the policy's support of
2255 * that feature. The policy capabilities are defined by the
2256 * POLICYDB_CAPABILITY_* enums. The size of the array is stored in @len and it
2257 * is up to the caller to free the array in @values. Returns zero on success,
2258 * negative values on failure.
2259 *
2260 */
2261int security_get_policycaps(int *len, int **values)
2262{
2263 int rc = -ENOMEM;
2264 unsigned int iter;
2265
2266 POLICY_RDLOCK;
2267
2268 *values = kcalloc(POLICYDB_CAPABILITY_MAX, sizeof(int), GFP_ATOMIC);
2269 if (*values == NULL)
2270 goto out;
2271 for (iter = 0; iter < POLICYDB_CAPABILITY_MAX; iter++)
2272 (*values)[iter] = ebitmap_get_bit(&policydb.policycaps, iter);
2273 *len = POLICYDB_CAPABILITY_MAX;
2274
2275out:
2276 POLICY_RDUNLOCK;
2277 return rc;
2278}
2279
2280/**
2281 * security_policycap_supported - Check for a specific policy capability
2282 * @req_cap: capability
2283 *
2284 * Description:
2285 * This function queries the currently loaded policy to see if it supports the
2286 * capability specified by @req_cap. Returns true (1) if the capability is
2287 * supported, false (0) if it isn't supported.
2288 *
2289 */
2290int security_policycap_supported(unsigned int req_cap)
2291{
2292 int rc;
2293
2294 POLICY_RDLOCK;
2295 rc = ebitmap_get_bit(&policydb.policycaps, req_cap);
2296 POLICY_RDUNLOCK;
2297
2298 return rc;
2299}
2300
2157struct selinux_audit_rule { 2301struct selinux_audit_rule {
2158 u32 au_seqno; 2302 u32 au_seqno;
2159 struct context au_ctxt; 2303 struct context au_ctxt;
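Both helpers above boil down to ebitmap_get_bit() queries against policydb.policycaps under the policy read lock: one dumps every bit into a freshly allocated array, the other tests a single bit. The concept is nothing more than a bit-array lookup; a flat-bitmap illustration (the kernel ebitmap is a sparse structure, and CAP_COUNT here is only an illustrative stand-in for the capability count):

#include <stdio.h>
#include <stdint.h>

#define CAP_COUNT 1	/* one capability ("network_peer_controls") so far */

static uint64_t policycaps;	/* flat bitmap instead of a sparse ebitmap */

static int cap_get_bit(unsigned int bit)
{
	if (bit >= 64)
		return 0;
	return (int)((policycaps >> bit) & 1);
}

int main(void)
{
	unsigned int i;

	policycaps |= 1u << 0;	/* pretend the NETPEER bit was in the policy */

	for (i = 0; i < CAP_COUNT; i++)
		printf("cap %u supported: %d\n", i, cap_get_bit(i));
	return 0;
}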
@@ -2403,50 +2547,10 @@ void selinux_audit_set_callback(int (*callback)(void))
2403} 2547}
2404 2548
2405#ifdef CONFIG_NETLABEL 2549#ifdef CONFIG_NETLABEL
2406/*
2407 * NetLabel cache structure
2408 */
2409#define NETLBL_CACHE(x) ((struct selinux_netlbl_cache *)(x))
2410#define NETLBL_CACHE_T_NONE 0
2411#define NETLBL_CACHE_T_SID 1
2412#define NETLBL_CACHE_T_MLS 2
2413struct selinux_netlbl_cache {
2414 u32 type;
2415 union {
2416 u32 sid;
2417 struct mls_range mls_label;
2418 } data;
2419};
2420
2421/**
2422 * security_netlbl_cache_free - Free the NetLabel cached data
2423 * @data: the data to free
2424 *
2425 * Description:
2426 * This function is intended to be used as the free() callback inside the
2427 * netlbl_lsm_cache structure.
2428 *
2429 */
2430static void security_netlbl_cache_free(const void *data)
2431{
2432 struct selinux_netlbl_cache *cache;
2433
2434 if (data == NULL)
2435 return;
2436
2437 cache = NETLBL_CACHE(data);
2438 switch (cache->type) {
2439 case NETLBL_CACHE_T_MLS:
2440 ebitmap_destroy(&cache->data.mls_label.level[0].cat);
2441 break;
2442 }
2443 kfree(data);
2444}
2445
2446/** 2550/**
2447 * security_netlbl_cache_add - Add an entry to the NetLabel cache 2551 * security_netlbl_cache_add - Add an entry to the NetLabel cache
2448 * @secattr: the NetLabel packet security attributes 2552 * @secattr: the NetLabel packet security attributes
2449 * @ctx: the SELinux context 2553 * @sid: the SELinux SID
2450 * 2554 *
2451 * Description: 2555 * Description:
2452 * Attempt to cache the context in @ctx, which was derived from the packet in 2556 * Attempt to cache the context in @ctx, which was derived from the packet in
@@ -2455,60 +2559,46 @@ static void security_netlbl_cache_free(const void *data)
2455 * 2559 *
2456 */ 2560 */
2457static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr, 2561static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr,
2458 struct context *ctx) 2562 u32 sid)
2459{ 2563{
2460 struct selinux_netlbl_cache *cache = NULL; 2564 u32 *sid_cache;
2461 2565
2462 secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); 2566 sid_cache = kmalloc(sizeof(*sid_cache), GFP_ATOMIC);
2463 if (secattr->cache == NULL) 2567 if (sid_cache == NULL)
2464 return;
2465
2466 cache = kzalloc(sizeof(*cache), GFP_ATOMIC);
2467 if (cache == NULL)
2468 return; 2568 return;
2469 2569 secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC);
2470 cache->type = NETLBL_CACHE_T_MLS; 2570 if (secattr->cache == NULL) {
2471 if (ebitmap_cpy(&cache->data.mls_label.level[0].cat, 2571 kfree(sid_cache);
2472 &ctx->range.level[0].cat) != 0) {
2473 kfree(cache);
2474 return; 2572 return;
2475 } 2573 }
2476 cache->data.mls_label.level[1].cat.highbit =
2477 cache->data.mls_label.level[0].cat.highbit;
2478 cache->data.mls_label.level[1].cat.node =
2479 cache->data.mls_label.level[0].cat.node;
2480 cache->data.mls_label.level[0].sens = ctx->range.level[0].sens;
2481 cache->data.mls_label.level[1].sens = ctx->range.level[0].sens;
2482 2574
2483 secattr->cache->free = security_netlbl_cache_free; 2575 *sid_cache = sid;
2484 secattr->cache->data = (void *)cache; 2576 secattr->cache->free = kfree;
2577 secattr->cache->data = sid_cache;
2485 secattr->flags |= NETLBL_SECATTR_CACHE; 2578 secattr->flags |= NETLBL_SECATTR_CACHE;
2486} 2579}
2487 2580
2488/** 2581/**
2489 * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID 2582 * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID
2490 * @secattr: the NetLabel packet security attributes 2583 * @secattr: the NetLabel packet security attributes
2491 * @base_sid: the SELinux SID to use as a context for MLS only attributes
2492 * @sid: the SELinux SID 2584 * @sid: the SELinux SID
2493 * 2585 *
2494 * Description: 2586 * Description:
2495 * Convert the given NetLabel security attributes in @secattr into a 2587 * Convert the given NetLabel security attributes in @secattr into a
2496 * SELinux SID. If the @secattr field does not contain a full SELinux 2588 * SELinux SID. If the @secattr field does not contain a full SELinux
2497 * SID/context then use the context in @base_sid as the foundation. If 2589 * SID/context then use SECINITSID_NETMSG as the foundation. If possible the
2498 * possible the 'cache' field of @secattr is set and the CACHE flag is set; 2590 * 'cache' field of @secattr is set and the CACHE flag is set; this is to
2499 * this is to allow the @secattr to be used by NetLabel to cache the secattr to 2591 * allow the @secattr to be used by NetLabel to cache the secattr to SID
2500 * SID conversion for future lookups. Returns zero on success, negative 2592 * conversion for future lookups. Returns zero on success, negative values on
2501 * values on failure. 2593 * failure.
2502 * 2594 *
2503 */ 2595 */
2504int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, 2596int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
2505 u32 base_sid,
2506 u32 *sid) 2597 u32 *sid)
2507{ 2598{
2508 int rc = -EIDRM; 2599 int rc = -EIDRM;
2509 struct context *ctx; 2600 struct context *ctx;
2510 struct context ctx_new; 2601 struct context ctx_new;
2511 struct selinux_netlbl_cache *cache;
2512 2602
2513 if (!ss_initialized) { 2603 if (!ss_initialized) {
2514 *sid = SECSID_NULL; 2604 *sid = SECSID_NULL;
@@ -2518,40 +2608,13 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
2518 POLICY_RDLOCK; 2608 POLICY_RDLOCK;
2519 2609
2520 if (secattr->flags & NETLBL_SECATTR_CACHE) { 2610 if (secattr->flags & NETLBL_SECATTR_CACHE) {
2521 cache = NETLBL_CACHE(secattr->cache->data); 2611 *sid = *(u32 *)secattr->cache->data;
2522 switch (cache->type) { 2612 rc = 0;
2523 case NETLBL_CACHE_T_SID: 2613 } else if (secattr->flags & NETLBL_SECATTR_SECID) {
2524 *sid = cache->data.sid; 2614 *sid = secattr->attr.secid;
2525 rc = 0; 2615 rc = 0;
2526 break;
2527 case NETLBL_CACHE_T_MLS:
2528 ctx = sidtab_search(&sidtab, base_sid);
2529 if (ctx == NULL)
2530 goto netlbl_secattr_to_sid_return;
2531
2532 ctx_new.user = ctx->user;
2533 ctx_new.role = ctx->role;
2534 ctx_new.type = ctx->type;
2535 ctx_new.range.level[0].sens =
2536 cache->data.mls_label.level[0].sens;
2537 ctx_new.range.level[0].cat.highbit =
2538 cache->data.mls_label.level[0].cat.highbit;
2539 ctx_new.range.level[0].cat.node =
2540 cache->data.mls_label.level[0].cat.node;
2541 ctx_new.range.level[1].sens =
2542 cache->data.mls_label.level[1].sens;
2543 ctx_new.range.level[1].cat.highbit =
2544 cache->data.mls_label.level[1].cat.highbit;
2545 ctx_new.range.level[1].cat.node =
2546 cache->data.mls_label.level[1].cat.node;
2547
2548 rc = sidtab_context_to_sid(&sidtab, &ctx_new, sid);
2549 break;
2550 default:
2551 goto netlbl_secattr_to_sid_return;
2552 }
2553 } else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) { 2616 } else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) {
2554 ctx = sidtab_search(&sidtab, base_sid); 2617 ctx = sidtab_search(&sidtab, SECINITSID_NETMSG);
2555 if (ctx == NULL) 2618 if (ctx == NULL)
2556 goto netlbl_secattr_to_sid_return; 2619 goto netlbl_secattr_to_sid_return;
2557 2620
@@ -2561,7 +2624,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
2561 mls_import_netlbl_lvl(&ctx_new, secattr); 2624 mls_import_netlbl_lvl(&ctx_new, secattr);
2562 if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { 2625 if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
2563 if (ebitmap_netlbl_import(&ctx_new.range.level[0].cat, 2626 if (ebitmap_netlbl_import(&ctx_new.range.level[0].cat,
2564 secattr->mls_cat) != 0) 2627 secattr->attr.mls.cat) != 0)
2565 goto netlbl_secattr_to_sid_return; 2628 goto netlbl_secattr_to_sid_return;
2566 ctx_new.range.level[1].cat.highbit = 2629 ctx_new.range.level[1].cat.highbit =
2567 ctx_new.range.level[0].cat.highbit; 2630 ctx_new.range.level[0].cat.highbit;
@@ -2578,7 +2641,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
2578 if (rc != 0) 2641 if (rc != 0)
2579 goto netlbl_secattr_to_sid_return_cleanup; 2642 goto netlbl_secattr_to_sid_return_cleanup;
2580 2643
2581 security_netlbl_cache_add(secattr, &ctx_new); 2644 security_netlbl_cache_add(secattr, *sid);
2582 2645
2583 ebitmap_destroy(&ctx_new.range.level[0].cat); 2646 ebitmap_destroy(&ctx_new.range.level[0].cat);
2584 } else { 2647 } else {
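The rewritten NetLabel cache path no longer stores a copy of the MLS range; it stores the already computed SID as a heap-allocated u32 and lets the generic cache free it with plain kfree(). A userspace analogue of that small-blob-plus-destructor arrangement (struct lsm_cache and cache_add_sid are made-up names, not the NetLabel API):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* loosely modeled on netlbl_lsm_cache: opaque data plus a free callback */
struct lsm_cache {
	void (*free)(void *data);
	void *data;
};

static void cache_add_sid(struct lsm_cache *cache, uint32_t sid)
{
	uint32_t *sid_copy = malloc(sizeof(*sid_copy));

	if (sid_copy == NULL)
		return;
	*sid_copy = sid;
	cache->free = free;	/* plain free(), like kfree in the patch */
	cache->data = sid_copy;
}

int main(void)
{
	struct lsm_cache cache = { NULL, NULL };

	cache_add_sid(&cache, 1234);
	if (cache.data != NULL)
		printf("cached sid: %u\n", *(uint32_t *)cache.data);
	if (cache.free != NULL)
		cache.free(cache.data);
	return 0;
}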
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index e07603969033..7e158205d081 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -46,11 +46,14 @@
46#include <net/checksum.h> 46#include <net/checksum.h>
47#include <net/udp.h> 47#include <net/udp.h>
48#include <asm/semaphore.h> 48#include <asm/semaphore.h>
49#include <asm/atomic.h>
49 50
50#include "avc.h" 51#include "avc.h"
51#include "objsec.h" 52#include "objsec.h"
52#include "xfrm.h" 53#include "xfrm.h"
53 54
55/* Labeled XFRM instance counter */
56atomic_t selinux_xfrm_refcount = ATOMIC_INIT(0);
54 57
55/* 58/*
56 * Returns true if an LSM/SELinux context 59 * Returns true if an LSM/SELinux context
@@ -293,6 +296,9 @@ int selinux_xfrm_policy_alloc(struct xfrm_policy *xp,
293 BUG_ON(!uctx); 296 BUG_ON(!uctx);
294 297
295 err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, 0); 298 err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, 0);
299 if (err == 0)
300 atomic_inc(&selinux_xfrm_refcount);
301
296 return err; 302 return err;
297} 303}
298 304
@@ -340,10 +346,13 @@ int selinux_xfrm_policy_delete(struct xfrm_policy *xp)
340 struct xfrm_sec_ctx *ctx = xp->security; 346 struct xfrm_sec_ctx *ctx = xp->security;
341 int rc = 0; 347 int rc = 0;
342 348
343 if (ctx) 349 if (ctx) {
344 rc = avc_has_perm(tsec->sid, ctx->ctx_sid, 350 rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
345 SECCLASS_ASSOCIATION, 351 SECCLASS_ASSOCIATION,
346 ASSOCIATION__SETCONTEXT, NULL); 352 ASSOCIATION__SETCONTEXT, NULL);
353 if (rc == 0)
354 atomic_dec(&selinux_xfrm_refcount);
355 }
347 356
348 return rc; 357 return rc;
349} 358}
@@ -360,6 +369,8 @@ int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uct
360 BUG_ON(!x); 369 BUG_ON(!x);
361 370
362 err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, secid); 371 err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, secid);
372 if (err == 0)
373 atomic_inc(&selinux_xfrm_refcount);
363 return err; 374 return err;
364} 375}
365 376
@@ -382,10 +393,13 @@ int selinux_xfrm_state_delete(struct xfrm_state *x)
382 struct xfrm_sec_ctx *ctx = x->security; 393 struct xfrm_sec_ctx *ctx = x->security;
383 int rc = 0; 394 int rc = 0;
384 395
385 if (ctx) 396 if (ctx) {
386 rc = avc_has_perm(tsec->sid, ctx->ctx_sid, 397 rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
387 SECCLASS_ASSOCIATION, 398 SECCLASS_ASSOCIATION,
388 ASSOCIATION__SETCONTEXT, NULL); 399 ASSOCIATION__SETCONTEXT, NULL);
400 if (rc == 0)
401 atomic_dec(&selinux_xfrm_refcount);
402 }
389 403
390 return rc; 404 return rc;
391} 405}
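The xfrm.c hunks add a global counter of labeled IPsec policies and states: it is bumped only when the context allocation succeeds and dropped only when the permission-checked delete succeeds, presumably so other code can cheaply test whether any labeled IPsec state exists at all. The pattern is a success-conditional atomic counter; a compact C11 sketch (ctx_alloc/ctx_delete are stand-ins, not the kernel functions):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int labeled_count;	/* analogue of selinux_xfrm_refcount */

static int ctx_alloc(int should_fail)
{
	int err = should_fail ? -1 : 0;

	if (err == 0)
		atomic_fetch_add(&labeled_count, 1);	/* count successes only */
	return err;
}

static int ctx_delete(void)
{
	int rc = 0;	/* stand-in for the avc_has_perm() result */

	if (rc == 0)
		atomic_fetch_sub(&labeled_count, 1);
	return rc;
}

int main(void)
{
	ctx_alloc(0);
	ctx_alloc(1);	/* failed allocation is not counted */
	printf("labeled contexts: %d\n", atomic_load(&labeled_count));
	ctx_delete();
	printf("labeled contexts: %d\n", atomic_load(&labeled_count));
	return 0;
}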
diff --git a/drivers/kvm/ioapic.c b/virt/kvm/ioapic.c
index c7992e667fdb..317f8e211cd2 100644
--- a/drivers/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -26,7 +26,7 @@
26 * Based on Xen 3.1 code. 26 * Based on Xen 3.1 code.
27 */ 27 */
28 28
29#include "kvm.h" 29#include <linux/kvm_host.h>
30#include <linux/kvm.h> 30#include <linux/kvm.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
@@ -34,14 +34,17 @@
34#include <linux/hrtimer.h> 34#include <linux/hrtimer.h>
35#include <linux/io.h> 35#include <linux/io.h>
36#include <asm/processor.h> 36#include <asm/processor.h>
37#include <asm/msr.h>
38#include <asm/page.h> 37#include <asm/page.h>
39#include <asm/current.h> 38#include <asm/current.h>
40#include <asm/apicdef.h> 39
41#include <asm/io_apic.h> 40#include "ioapic.h"
42#include "irq.h" 41#include "lapic.h"
43/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ 42
43#if 0
44#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
45#else
44#define ioapic_debug(fmt, arg...) 46#define ioapic_debug(fmt, arg...)
47#endif
45static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); 48static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
46 49
47static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, 50static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
@@ -113,7 +116,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
113 default: 116 default:
114 index = (ioapic->ioregsel - 0x10) >> 1; 117 index = (ioapic->ioregsel - 0x10) >> 1;
115 118
116 ioapic_debug("change redir index %x val %x", index, val); 119 ioapic_debug("change redir index %x val %x\n", index, val);
117 if (index >= IOAPIC_NUM_PINS) 120 if (index >= IOAPIC_NUM_PINS)
118 return; 121 return;
119 if (ioapic->ioregsel & 1) { 122 if (ioapic->ioregsel & 1) {
@@ -131,16 +134,16 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
131} 134}
132 135
133static void ioapic_inj_irq(struct kvm_ioapic *ioapic, 136static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
134 struct kvm_lapic *target, 137 struct kvm_vcpu *vcpu,
135 u8 vector, u8 trig_mode, u8 delivery_mode) 138 u8 vector, u8 trig_mode, u8 delivery_mode)
136{ 139{
137 ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode, 140 ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
138 delivery_mode); 141 delivery_mode);
139 142
140 ASSERT((delivery_mode == dest_Fixed) || 143 ASSERT((delivery_mode == IOAPIC_FIXED) ||
141 (delivery_mode == dest_LowestPrio)); 144 (delivery_mode == IOAPIC_LOWEST_PRIORITY));
142 145
143 kvm_apic_set_irq(target, vector, trig_mode); 146 kvm_apic_set_irq(vcpu, vector, trig_mode);
144} 147}
145 148
146static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, 149static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
@@ -151,12 +154,12 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
151 struct kvm *kvm = ioapic->kvm; 154 struct kvm *kvm = ioapic->kvm;
152 struct kvm_vcpu *vcpu; 155 struct kvm_vcpu *vcpu;
153 156
154 ioapic_debug("dest %d dest_mode %d", dest, dest_mode); 157 ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
155 158
156 if (dest_mode == 0) { /* Physical mode. */ 159 if (dest_mode == 0) { /* Physical mode. */
157 if (dest == 0xFF) { /* Broadcast. */ 160 if (dest == 0xFF) { /* Broadcast. */
158 for (i = 0; i < KVM_MAX_VCPUS; ++i) 161 for (i = 0; i < KVM_MAX_VCPUS; ++i)
159 if (kvm->vcpus[i] && kvm->vcpus[i]->apic) 162 if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
160 mask |= 1 << i; 163 mask |= 1 << i;
161 return mask; 164 return mask;
162 } 165 }
@@ -164,8 +167,8 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
164 vcpu = kvm->vcpus[i]; 167 vcpu = kvm->vcpus[i];
165 if (!vcpu) 168 if (!vcpu)
166 continue; 169 continue;
167 if (kvm_apic_match_physical_addr(vcpu->apic, dest)) { 170 if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
168 if (vcpu->apic) 171 if (vcpu->arch.apic)
169 mask = 1 << i; 172 mask = 1 << i;
170 break; 173 break;
171 } 174 }
@@ -175,11 +178,11 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
175 vcpu = kvm->vcpus[i]; 178 vcpu = kvm->vcpus[i];
176 if (!vcpu) 179 if (!vcpu)
177 continue; 180 continue;
178 if (vcpu->apic && 181 if (vcpu->arch.apic &&
179 kvm_apic_match_logical_addr(vcpu->apic, dest)) 182 kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
180 mask |= 1 << vcpu->vcpu_id; 183 mask |= 1 << vcpu->vcpu_id;
181 } 184 }
182 ioapic_debug("mask %x", mask); 185 ioapic_debug("mask %x\n", mask);
183 return mask; 186 return mask;
184} 187}
185 188
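ioapic_get_delivery_bitmask() turns the destination byte of a redirection entry into a bitmap of target vcpus: in physical mode it matches one APIC ID (or every present vcpu for the 0xFF broadcast), in logical mode it ORs in each vcpu whose logical APIC address matches. A simplified userspace rendering of that selection over a fixed array, where the matching helpers are reduced to trivial comparisons rather than the real kvm_apic_match_physical_addr()/kvm_apic_match_logical_addr():

#include <stdio.h>
#include <stdint.h>

#define MAX_VCPUS 8

struct fake_vcpu {
	int present;
	uint8_t phys_id;	/* physical APIC ID */
	uint8_t log_dest;	/* logical destination bitmap (flat model) */
};

static uint32_t delivery_bitmask(const struct fake_vcpu *vcpus,
				 uint8_t dest, int dest_mode)
{
	uint32_t mask = 0;
	int i;

	if (dest_mode == 0) {			/* physical mode */
		if (dest == 0xFF) {		/* broadcast */
			for (i = 0; i < MAX_VCPUS; i++)
				if (vcpus[i].present)
					mask |= 1u << i;
			return mask;
		}
		for (i = 0; i < MAX_VCPUS; i++)
			if (vcpus[i].present && vcpus[i].phys_id == dest)
				return 1u << i;
		return 0;
	}
	/* logical mode: every vcpu whose logical destination overlaps */
	for (i = 0; i < MAX_VCPUS; i++)
		if (vcpus[i].present && (vcpus[i].log_dest & dest))
			mask |= 1u << i;
	return mask;
}

int main(void)
{
	struct fake_vcpu vcpus[MAX_VCPUS] = {
		{ 1, 0, 0x01 }, { 1, 1, 0x02 }, { 1, 2, 0x04 },
	};

	printf("phys dest 1 : %#x\n", (unsigned)delivery_bitmask(vcpus, 1, 0));
	printf("logical 0x03: %#x\n", (unsigned)delivery_bitmask(vcpus, 0x03, 1));
	return 0;
}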
@@ -191,41 +194,39 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
191 u8 vector = ioapic->redirtbl[irq].fields.vector; 194 u8 vector = ioapic->redirtbl[irq].fields.vector;
192 u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; 195 u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
193 u32 deliver_bitmask; 196 u32 deliver_bitmask;
194 struct kvm_lapic *target;
195 struct kvm_vcpu *vcpu; 197 struct kvm_vcpu *vcpu;
196 int vcpu_id; 198 int vcpu_id;
197 199
198 ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " 200 ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
199 "vector=%x trig_mode=%x", 201 "vector=%x trig_mode=%x\n",
200 dest, dest_mode, delivery_mode, vector, trig_mode); 202 dest, dest_mode, delivery_mode, vector, trig_mode);
201 203
202 deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); 204 deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
203 if (!deliver_bitmask) { 205 if (!deliver_bitmask) {
204 ioapic_debug("no target on destination"); 206 ioapic_debug("no target on destination\n");
205 return; 207 return;
206 } 208 }
207 209
208 switch (delivery_mode) { 210 switch (delivery_mode) {
209 case dest_LowestPrio: 211 case IOAPIC_LOWEST_PRIORITY:
210 target = 212 vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
211 kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask); 213 deliver_bitmask);
212 if (target != NULL) 214 if (vcpu != NULL)
213 ioapic_inj_irq(ioapic, target, vector, 215 ioapic_inj_irq(ioapic, vcpu, vector,
214 trig_mode, delivery_mode); 216 trig_mode, delivery_mode);
215 else 217 else
216 ioapic_debug("null round robin: " 218 ioapic_debug("null lowest prio vcpu: "
217 "mask=%x vector=%x delivery_mode=%x", 219 "mask=%x vector=%x delivery_mode=%x\n",
218 deliver_bitmask, vector, dest_LowestPrio); 220 deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
219 break; 221 break;
220 case dest_Fixed: 222 case IOAPIC_FIXED:
221 for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { 223 for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
222 if (!(deliver_bitmask & (1 << vcpu_id))) 224 if (!(deliver_bitmask & (1 << vcpu_id)))
223 continue; 225 continue;
224 deliver_bitmask &= ~(1 << vcpu_id); 226 deliver_bitmask &= ~(1 << vcpu_id);
225 vcpu = ioapic->kvm->vcpus[vcpu_id]; 227 vcpu = ioapic->kvm->vcpus[vcpu_id];
226 if (vcpu) { 228 if (vcpu) {
227 target = vcpu->apic; 229 ioapic_inj_irq(ioapic, vcpu, vector,
228 ioapic_inj_irq(ioapic, target, vector,
229 trig_mode, delivery_mode); 230 trig_mode, delivery_mode);
230 } 231 }
231 } 232 }
@@ -271,7 +272,7 @@ static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
271 272
272void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) 273void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
273{ 274{
274 struct kvm_ioapic *ioapic = kvm->vioapic; 275 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
275 union ioapic_redir_entry *ent; 276 union ioapic_redir_entry *ent;
276 int gsi; 277 int gsi;
277 278
@@ -304,7 +305,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
304 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; 305 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
305 u32 result; 306 u32 result;
306 307
307 ioapic_debug("addr %lx", (unsigned long)addr); 308 ioapic_debug("addr %lx\n", (unsigned long)addr);
308 ASSERT(!(addr & 0xf)); /* check alignment */ 309 ASSERT(!(addr & 0xf)); /* check alignment */
309 310
310 addr &= 0xff; 311 addr &= 0xff;
@@ -341,8 +342,8 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
341 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; 342 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
342 u32 data; 343 u32 data;
343 344
344 ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n", 345 ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
345 addr, len, val); 346 (void*)addr, len, val);
346 ASSERT(!(addr & 0xf)); /* check alignment */ 347 ASSERT(!(addr & 0xf)); /* check alignment */
347 if (len == 4 || len == 8) 348 if (len == 4 || len == 8)
348 data = *(u32 *) val; 349 data = *(u32 *) val;
@@ -360,24 +361,38 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
360 case IOAPIC_REG_WINDOW: 361 case IOAPIC_REG_WINDOW:
361 ioapic_write_indirect(ioapic, data); 362 ioapic_write_indirect(ioapic, data);
362 break; 363 break;
364#ifdef CONFIG_IA64
365 case IOAPIC_REG_EOI:
366 kvm_ioapic_update_eoi(ioapic->kvm, data);
367 break;
368#endif
363 369
364 default: 370 default:
365 break; 371 break;
366 } 372 }
367} 373}
368 374
375void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
376{
377 int i;
378
379 for (i = 0; i < IOAPIC_NUM_PINS; i++)
380 ioapic->redirtbl[i].fields.mask = 1;
381 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
382 ioapic->ioregsel = 0;
383 ioapic->irr = 0;
384 ioapic->id = 0;
385}
386
369int kvm_ioapic_init(struct kvm *kvm) 387int kvm_ioapic_init(struct kvm *kvm)
370{ 388{
371 struct kvm_ioapic *ioapic; 389 struct kvm_ioapic *ioapic;
372 int i;
373 390
374 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); 391 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
375 if (!ioapic) 392 if (!ioapic)
376 return -ENOMEM; 393 return -ENOMEM;
377 kvm->vioapic = ioapic; 394 kvm->arch.vioapic = ioapic;
378 for (i = 0; i < IOAPIC_NUM_PINS; i++) 395 kvm_ioapic_reset(ioapic);
379 ioapic->redirtbl[i].fields.mask = 1;
380 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
381 ioapic->dev.read = ioapic_mmio_read; 396 ioapic->dev.read = ioapic_mmio_read;
382 ioapic->dev.write = ioapic_mmio_write; 397 ioapic->dev.write = ioapic_mmio_write;
383 ioapic->dev.in_range = ioapic_in_range; 398 ioapic->dev.in_range = ioapic_in_range;
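The new kvm_ioapic_reset() factors the power-on state out of kvm_ioapic_init(): every redirection entry comes up masked and the window registers are cleared, which is what a guest expects before it programs the chip, and it gives the architecture code a hook to reset the device later without recreating it. A tiny sketch of that default state (struct fake_ioapic is a pared-down stand-in, not struct kvm_ioapic):

#include <stdio.h>

#define NUM_PINS 24

struct redir_entry {
	unsigned char vector;
	unsigned char mask;	/* 1 = interrupt delivery disabled */
};

struct fake_ioapic {
	unsigned long base_address;
	unsigned int ioregsel, irr, id;
	struct redir_entry redirtbl[NUM_PINS];
};

/* mirrors kvm_ioapic_reset(): everything masked, registers cleared */
static void ioapic_reset(struct fake_ioapic *ioapic)
{
	int i;

	for (i = 0; i < NUM_PINS; i++)
		ioapic->redirtbl[i].mask = 1;
	ioapic->base_address = 0xfec00000;	/* IOAPIC_DEFAULT_BASE_ADDRESS */
	ioapic->ioregsel = 0;
	ioapic->irr = 0;
	ioapic->id = 0;
}

int main(void)
{
	struct fake_ioapic ioapic;

	ioapic_reset(&ioapic);
	printf("pin 0 masked: %d\n", ioapic.redirtbl[0].mask);
	return 0;
}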
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
new file mode 100644
index 000000000000..7f16675fe783
--- /dev/null
+++ b/virt/kvm/ioapic.h
@@ -0,0 +1,95 @@
1#ifndef __KVM_IO_APIC_H
2#define __KVM_IO_APIC_H
3
4#include <linux/kvm_host.h>
5
6#include "iodev.h"
7
8struct kvm;
9struct kvm_vcpu;
10
11#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
12#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
13#define IOAPIC_EDGE_TRIG 0
14#define IOAPIC_LEVEL_TRIG 1
15
16#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
17#define IOAPIC_MEM_LENGTH 0x100
18
19/* Direct registers. */
20#define IOAPIC_REG_SELECT 0x00
21#define IOAPIC_REG_WINDOW 0x10
22#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
23
24/* Indirect registers. */
25#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
26#define IOAPIC_REG_VERSION 0x01
27#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
28
29/*ioapic delivery mode*/
30#define IOAPIC_FIXED 0x0
31#define IOAPIC_LOWEST_PRIORITY 0x1
32#define IOAPIC_PMI 0x2
33#define IOAPIC_NMI 0x4
34#define IOAPIC_INIT 0x5
35#define IOAPIC_EXTINT 0x7
36
37struct kvm_ioapic {
38 u64 base_address;
39 u32 ioregsel;
40 u32 id;
41 u32 irr;
42 u32 pad;
43 union ioapic_redir_entry {
44 u64 bits;
45 struct {
46 u8 vector;
47 u8 delivery_mode:3;
48 u8 dest_mode:1;
49 u8 delivery_status:1;
50 u8 polarity:1;
51 u8 remote_irr:1;
52 u8 trig_mode:1;
53 u8 mask:1;
54 u8 reserve:7;
55 u8 reserved[4];
56 u8 dest_id;
57 } fields;
58 } redirtbl[IOAPIC_NUM_PINS];
59 struct kvm_io_device dev;
60 struct kvm *kvm;
61};
62
63#ifdef DEBUG
64#define ASSERT(x) \
65do { \
66 if (!(x)) { \
67 printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
68 __FILE__, __LINE__, #x); \
69 BUG(); \
70 } \
71} while (0)
72#else
73#define ASSERT(x) do { } while (0)
74#endif
75
76static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
77{
78 return kvm->arch.vioapic;
79}
80
81#ifdef CONFIG_IA64
82static inline int irqchip_in_kernel(struct kvm *kvm)
83{
84 return 1;
85}
86#endif
87
88struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
89 unsigned long bitmap);
90void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
91int kvm_ioapic_init(struct kvm *kvm);
92void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
93void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
94
95#endif
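The ioapic_redir_entry union gives two views of the same 64-bit register: the raw bits the guest writes through the indirect window (32 bits at a time) and the decoded fields used by the delivery code. A standalone copy of the union, filled from a raw value, shows how the fields line up under the usual x86 little-endian bitfield layout (the exact packing is compiler and ABI dependent, which is fine in the kernel because both views are compiled together):

#include <stdio.h>
#include <stdint.h>

/* same field layout as the redirtbl entries in struct kvm_ioapic */
union ioapic_redir_entry {
	uint64_t bits;
	struct {
		uint8_t vector;
		uint8_t delivery_mode:3;
		uint8_t dest_mode:1;
		uint8_t delivery_status:1;
		uint8_t polarity:1;
		uint8_t remote_irr:1;
		uint8_t trig_mode:1;
		uint8_t mask:1;
		uint8_t reserve:7;
		uint8_t reserved[4];
		uint8_t dest_id;
	} fields;
};

int main(void)
{
	union ioapic_redir_entry e;

	/* vector 0x31, masked, destination APIC ID 3 */
	e.bits = 0x31ull | (1ull << 16) | (3ull << 56);

	printf("vector  0x%x\n", e.fields.vector);
	printf("mask    %u\n", e.fields.mask);
	printf("dest_id %u\n", e.fields.dest_id);
	return 0;
}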
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
new file mode 100644
index 000000000000..c14e642027b2
--- /dev/null
+++ b/virt/kvm/iodev.h
@@ -0,0 +1,63 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 */
15
16#ifndef __KVM_IODEV_H__
17#define __KVM_IODEV_H__
18
19#include <linux/kvm_types.h>
20
21struct kvm_io_device {
22 void (*read)(struct kvm_io_device *this,
23 gpa_t addr,
24 int len,
25 void *val);
26 void (*write)(struct kvm_io_device *this,
27 gpa_t addr,
28 int len,
29 const void *val);
30 int (*in_range)(struct kvm_io_device *this, gpa_t addr);
31 void (*destructor)(struct kvm_io_device *this);
32
33 void *private;
34};
35
36static inline void kvm_iodevice_read(struct kvm_io_device *dev,
37 gpa_t addr,
38 int len,
39 void *val)
40{
41 dev->read(dev, addr, len, val);
42}
43
44static inline void kvm_iodevice_write(struct kvm_io_device *dev,
45 gpa_t addr,
46 int len,
47 const void *val)
48{
49 dev->write(dev, addr, len, val);
50}
51
52static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
53{
54 return dev->in_range(dev, addr);
55}
56
57static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
58{
59 if (dev->destructor)
60 dev->destructor(dev);
61}
62
63#endif /* __KVM_IODEV_H__ */
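iodev.h defines the in-kernel MMIO/PIO device interface as a small vtable: read, write, an address-range predicate and an optional destructor, dispatched through the kvm_iodevice_*() inlines. A minimal userspace sketch of filling in and dispatching to one such device (the bus side is reduced to a single device rather than the real kvm_io_bus, and struct io_device is a local stand-in):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

typedef uint64_t gpa_t;

struct io_device {
	void (*read)(struct io_device *dev, gpa_t addr, int len, void *val);
	void (*write)(struct io_device *dev, gpa_t addr, int len,
		      const void *val);
	int (*in_range)(struct io_device *dev, gpa_t addr);
	void *private;
};

/* a toy device: a 256-byte RAM window at the IOAPIC base address */
static uint8_t backing[256];

static void dev_read(struct io_device *dev, gpa_t addr, int len, void *val)
{
	memcpy(val, backing + (addr & 0xff), (size_t)len);
}

static void dev_write(struct io_device *dev, gpa_t addr, int len,
		      const void *val)
{
	memcpy(backing + (addr & 0xff), val, (size_t)len);
}

static int dev_in_range(struct io_device *dev, gpa_t addr)
{
	return addr >= 0xfec00000ull && addr < 0xfec00000ull + sizeof(backing);
}

int main(void)
{
	struct io_device dev = { dev_read, dev_write, dev_in_range, NULL };
	uint32_t in = 0xdeadbeef, out = 0;
	gpa_t addr = 0xfec00010ull;

	if (dev.in_range(&dev, addr)) {
		dev.write(&dev, addr, (int)sizeof(in), &in);
		dev.read(&dev, addr, (int)sizeof(out), &out);
	}
	printf("read back 0x%x\n", out);
	return 0;
}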
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
new file mode 100644
index 000000000000..3c4fe26096fc
--- /dev/null
+++ b/virt/kvm/kvm_main.c
@@ -0,0 +1,1400 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "iodev.h"
19
20#include <linux/kvm_host.h>
21#include <linux/kvm.h>
22#include <linux/module.h>
23#include <linux/errno.h>
24#include <linux/percpu.h>
25#include <linux/gfp.h>
26#include <linux/mm.h>
27#include <linux/miscdevice.h>
28#include <linux/vmalloc.h>
29#include <linux/reboot.h>
30#include <linux/debugfs.h>
31#include <linux/highmem.h>
32#include <linux/file.h>
33#include <linux/sysdev.h>
34#include <linux/cpu.h>
35#include <linux/sched.h>
36#include <linux/cpumask.h>
37#include <linux/smp.h>
38#include <linux/anon_inodes.h>
39#include <linux/profile.h>
40#include <linux/kvm_para.h>
41#include <linux/pagemap.h>
42#include <linux/mman.h>
43
44#include <asm/processor.h>
45#include <asm/io.h>
46#include <asm/uaccess.h>
47#include <asm/pgtable.h>
48
49MODULE_AUTHOR("Qumranet");
50MODULE_LICENSE("GPL");
51
52DEFINE_SPINLOCK(kvm_lock);
53LIST_HEAD(vm_list);
54
55static cpumask_t cpus_hardware_enabled;
56
57struct kmem_cache *kvm_vcpu_cache;
58EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
59
60static __read_mostly struct preempt_ops kvm_preempt_ops;
61
62static struct dentry *debugfs_dir;
63
64static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
65 unsigned long arg);
66
67static inline int valid_vcpu(int n)
68{
69 return likely(n >= 0 && n < KVM_MAX_VCPUS);
70}
71
72/*
73 * Switches to specified vcpu, until a matching vcpu_put()
74 */
75void vcpu_load(struct kvm_vcpu *vcpu)
76{
77 int cpu;
78
79 mutex_lock(&vcpu->mutex);
80 cpu = get_cpu();
81 preempt_notifier_register(&vcpu->preempt_notifier);
82 kvm_arch_vcpu_load(vcpu, cpu);
83 put_cpu();
84}
85
86void vcpu_put(struct kvm_vcpu *vcpu)
87{
88 preempt_disable();
89 kvm_arch_vcpu_put(vcpu);
90 preempt_notifier_unregister(&vcpu->preempt_notifier);
91 preempt_enable();
92 mutex_unlock(&vcpu->mutex);
93}
94
95static void ack_flush(void *_completed)
96{
97}
98
99void kvm_flush_remote_tlbs(struct kvm *kvm)
100{
101 int i, cpu;
102 cpumask_t cpus;
103 struct kvm_vcpu *vcpu;
104
105 cpus_clear(cpus);
106 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
107 vcpu = kvm->vcpus[i];
108 if (!vcpu)
109 continue;
110 if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
111 continue;
112 cpu = vcpu->cpu;
113 if (cpu != -1 && cpu != raw_smp_processor_id())
114 cpu_set(cpu, cpus);
115 }
116 if (cpus_empty(cpus))
117 return;
118 ++kvm->stat.remote_tlb_flush;
119 smp_call_function_mask(cpus, ack_flush, NULL, 1);
120}
121
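kvm_flush_remote_tlbs() marks KVM_REQ_TLB_FLUSH on every vcpu and then sends an IPI only to the physical CPUs that are actually running one and did not already have the request pending (it also skips the CPU doing the flush, which is left out of this sketch). The essential shape is mark a request per target, then kick only the targets that need waking; a rough single-threaded illustration with plain flags in place of the request bitmap and the cross-CPU call:

#include <stdio.h>

#define MAX_VCPUS 4

struct fake_vcpu {
	int present;
	int running_on_cpu;	/* -1 if the vcpu is not currently running */
	int tlb_flush_request;	/* stand-in for the KVM_REQ_TLB_FLUSH bit */
};

/* returns a bitmap of host CPUs that would need an IPI */
static unsigned int flush_remote_tlbs(struct fake_vcpu *vcpus)
{
	unsigned int cpus = 0;
	int i;

	for (i = 0; i < MAX_VCPUS; i++) {
		if (!vcpus[i].present)
			continue;
		if (vcpus[i].tlb_flush_request)
			continue;		/* request already pending */
		vcpus[i].tlb_flush_request = 1;
		if (vcpus[i].running_on_cpu >= 0)
			cpus |= 1u << vcpus[i].running_on_cpu;
	}
	return cpus;				/* the kernel IPIs this set */
}

int main(void)
{
	struct fake_vcpu vcpus[MAX_VCPUS] = {
		{ 1, 2, 0 },	/* running on host CPU 2 */
		{ 1, -1, 0 },	/* not running: the flag is enough */
	};

	printf("cpus to kick: %#x\n", flush_remote_tlbs(vcpus));
	return 0;
}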
122int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
123{
124 struct page *page;
125 int r;
126
127 mutex_init(&vcpu->mutex);
128 vcpu->cpu = -1;
129 vcpu->kvm = kvm;
130 vcpu->vcpu_id = id;
131 init_waitqueue_head(&vcpu->wq);
132
133 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
134 if (!page) {
135 r = -ENOMEM;
136 goto fail;
137 }
138 vcpu->run = page_address(page);
139
140 r = kvm_arch_vcpu_init(vcpu);
141 if (r < 0)
142 goto fail_free_run;
143 return 0;
144
145fail_free_run:
146 free_page((unsigned long)vcpu->run);
147fail:
148 return r;
149}
150EXPORT_SYMBOL_GPL(kvm_vcpu_init);
151
152void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
153{
154 kvm_arch_vcpu_uninit(vcpu);
155 free_page((unsigned long)vcpu->run);
156}
157EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
158
159static struct kvm *kvm_create_vm(void)
160{
161 struct kvm *kvm = kvm_arch_create_vm();
162
163 if (IS_ERR(kvm))
164 goto out;
165
166 kvm->mm = current->mm;
167 atomic_inc(&kvm->mm->mm_count);
168 spin_lock_init(&kvm->mmu_lock);
169 kvm_io_bus_init(&kvm->pio_bus);
170 mutex_init(&kvm->lock);
171 kvm_io_bus_init(&kvm->mmio_bus);
172 spin_lock(&kvm_lock);
173 list_add(&kvm->vm_list, &vm_list);
174 spin_unlock(&kvm_lock);
175out:
176 return kvm;
177}
178
179/*
180 * Free any memory in @free but not in @dont.
181 */
182static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
183 struct kvm_memory_slot *dont)
184{
185 if (!dont || free->rmap != dont->rmap)
186 vfree(free->rmap);
187
188 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
189 vfree(free->dirty_bitmap);
190
191 free->npages = 0;
192 free->dirty_bitmap = NULL;
193 free->rmap = NULL;
194}
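
/*
 * Illustrative sketch (not part of the original file): the @dont
 * argument lets a caller free only what is not shared with another
 * slot.  __kvm_set_memory_region() below uses both forms:
 *
 *	kvm_free_physmem_slot(&old, &new);
 *	kvm_free_physmem_slot(&new, &old);
 *
 * The first form (used on success) drops whatever the updated slot no
 * longer references; the second (used on the error path) undoes only
 * the fresh allocations.  Passing NULL for @dont, as kvm_free_physmem()
 * does, frees everything attached to the slot.
 */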
195
196void kvm_free_physmem(struct kvm *kvm)
197{
198 int i;
199
200 for (i = 0; i < kvm->nmemslots; ++i)
201 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
202}
203
204static void kvm_destroy_vm(struct kvm *kvm)
205{
206 struct mm_struct *mm = kvm->mm;
207
208 spin_lock(&kvm_lock);
209 list_del(&kvm->vm_list);
210 spin_unlock(&kvm_lock);
211 kvm_io_bus_destroy(&kvm->pio_bus);
212 kvm_io_bus_destroy(&kvm->mmio_bus);
213 kvm_arch_destroy_vm(kvm);
214 mmdrop(mm);
215}
216
217static int kvm_vm_release(struct inode *inode, struct file *filp)
218{
219 struct kvm *kvm = filp->private_data;
220
221 kvm_destroy_vm(kvm);
222 return 0;
223}
224
225/*
226 * Allocate some memory and give it an address in the guest physical address
227 * space.
228 *
229 * Discontiguous memory is allowed, mostly for framebuffers.
230 *
231 * Must be called holding mmap_sem for write.
232 */
233int __kvm_set_memory_region(struct kvm *kvm,
234 struct kvm_userspace_memory_region *mem,
235 int user_alloc)
236{
237 int r;
238 gfn_t base_gfn;
239 unsigned long npages;
240 unsigned long i;
241 struct kvm_memory_slot *memslot;
242 struct kvm_memory_slot old, new;
243
244 r = -EINVAL;
245 /* General sanity checks */
246 if (mem->memory_size & (PAGE_SIZE - 1))
247 goto out;
248 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
249 goto out;
250 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
251 goto out;
252 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
253 goto out;
254
255 memslot = &kvm->memslots[mem->slot];
256 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
257 npages = mem->memory_size >> PAGE_SHIFT;
258
259 if (!npages)
260 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
261
262 new = old = *memslot;
263
264 new.base_gfn = base_gfn;
265 new.npages = npages;
266 new.flags = mem->flags;
267
268 /* Disallow changing a memory slot's size. */
269 r = -EINVAL;
270 if (npages && old.npages && npages != old.npages)
271 goto out_free;
272
273 /* Check for overlaps */
274 r = -EEXIST;
275 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
276 struct kvm_memory_slot *s = &kvm->memslots[i];
277
278 if (s == memslot)
279 continue;
280 if (!((base_gfn + npages <= s->base_gfn) ||
281 (base_gfn >= s->base_gfn + s->npages)))
282 goto out_free;
283 }
284
285 /* Free page dirty bitmap if unneeded */
286 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
287 new.dirty_bitmap = NULL;
288
289 r = -ENOMEM;
290
291 /* Allocate if a slot is being created */
292 if (npages && !new.rmap) {
293 new.rmap = vmalloc(npages * sizeof(struct page *));
294
295 if (!new.rmap)
296 goto out_free;
297
298 memset(new.rmap, 0, npages * sizeof(*new.rmap));
299
300 new.user_alloc = user_alloc;
301 new.userspace_addr = mem->userspace_addr;
302 }
303
304 /* Allocate page dirty bitmap if needed */
305 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
306 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
307
308 new.dirty_bitmap = vmalloc(dirty_bytes);
309 if (!new.dirty_bitmap)
310 goto out_free;
311 memset(new.dirty_bitmap, 0, dirty_bytes);
312 }
313
314 if (mem->slot >= kvm->nmemslots)
315 kvm->nmemslots = mem->slot + 1;
316
317 *memslot = new;
318
319 r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
320 if (r) {
321 *memslot = old;
322 goto out_free;
323 }
324
325 kvm_free_physmem_slot(&old, &new);
326 return 0;
327
328out_free:
329 kvm_free_physmem_slot(&new, &old);
330out:
331 return r;
332
333}
334EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
335
336int kvm_set_memory_region(struct kvm *kvm,
337 struct kvm_userspace_memory_region *mem,
338 int user_alloc)
339{
340 int r;
341
342 down_write(&current->mm->mmap_sem);
343 r = __kvm_set_memory_region(kvm, mem, user_alloc);
344 up_write(&current->mm->mmap_sem);
345 return r;
346}
347EXPORT_SYMBOL_GPL(kvm_set_memory_region);
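
/*
 * Illustrative sketch (not part of the original file): a caller
 * describes a slot with a struct kvm_userspace_memory_region and hands
 * it to kvm_set_memory_region(), which takes mmap_sem itself; the
 * KVM_SET_USER_MEMORY_REGION ioctl below does exactly this with
 * user_alloc set to 1.  The values here are arbitrary example numbers:
 *
 *	struct kvm_userspace_memory_region mem = {
 *		.slot            = 3,
 *		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
 *		.guest_phys_addr = 0x100000,
 *		.memory_size     = 16 * PAGE_SIZE,
 *		.userspace_addr  = hva,
 *	};
 *	int r = kvm_set_memory_region(kvm, &mem, 1);
 *
 * where "hva" is a page-aligned user virtual address supplied by the
 * caller.  Setting memory_size to 0 deletes the slot; guest_phys_addr
 * and memory_size must both be page aligned.
 */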
348
349int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
 350 struct kvm_userspace_memory_region *mem,
352 int user_alloc)
353{
354 if (mem->slot >= KVM_MEMORY_SLOTS)
355 return -EINVAL;
356 return kvm_set_memory_region(kvm, mem, user_alloc);
357}
358
359int kvm_get_dirty_log(struct kvm *kvm,
360 struct kvm_dirty_log *log, int *is_dirty)
361{
362 struct kvm_memory_slot *memslot;
363 int r, i;
364 int n;
365 unsigned long any = 0;
366
367 r = -EINVAL;
368 if (log->slot >= KVM_MEMORY_SLOTS)
369 goto out;
370
371 memslot = &kvm->memslots[log->slot];
372 r = -ENOENT;
373 if (!memslot->dirty_bitmap)
374 goto out;
375
376 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
377
378 for (i = 0; !any && i < n/sizeof(long); ++i)
379 any = memslot->dirty_bitmap[i];
380
381 r = -EFAULT;
382 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
383 goto out;
384
385 if (any)
386 *is_dirty = 1;
387
388 r = 0;
389out:
390 return r;
391}
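
/*
 * Illustrative sketch (not part of the original file): an arch ioctl
 * handler typically wraps kvm_get_dirty_log() like this:
 *
 *	int is_dirty = 0;
 *	r = kvm_get_dirty_log(kvm, log, &is_dirty);
 *	if (r)
 *		goto out;
 *	if (is_dirty)
 *		flush_and_clear_dirty_state(kvm, log);
 *
 * flush_and_clear_dirty_state() is a placeholder for whatever the
 * architecture must do once dirty pages were found (e.g. rewrite
 * shadow mappings and clear the bitmap).  log->dirty_bitmap must point
 * to a user buffer large enough for one bit per page in the slot,
 * rounded up to a multiple of longs.
 */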
392
393int is_error_page(struct page *page)
394{
395 return page == bad_page;
396}
397EXPORT_SYMBOL_GPL(is_error_page);
398
399static inline unsigned long bad_hva(void)
400{
401 return PAGE_OFFSET;
402}
403
404int kvm_is_error_hva(unsigned long addr)
405{
406 return addr == bad_hva();
407}
408EXPORT_SYMBOL_GPL(kvm_is_error_hva);
409
410static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
411{
412 int i;
413
414 for (i = 0; i < kvm->nmemslots; ++i) {
415 struct kvm_memory_slot *memslot = &kvm->memslots[i];
416
417 if (gfn >= memslot->base_gfn
418 && gfn < memslot->base_gfn + memslot->npages)
419 return memslot;
420 }
421 return NULL;
422}
423
424struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
425{
426 gfn = unalias_gfn(kvm, gfn);
427 return __gfn_to_memslot(kvm, gfn);
428}
429
430int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
431{
432 int i;
433
434 gfn = unalias_gfn(kvm, gfn);
435 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
436 struct kvm_memory_slot *memslot = &kvm->memslots[i];
437
438 if (gfn >= memslot->base_gfn
439 && gfn < memslot->base_gfn + memslot->npages)
440 return 1;
441 }
442 return 0;
443}
444EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
445
446static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
447{
448 struct kvm_memory_slot *slot;
449
450 gfn = unalias_gfn(kvm, gfn);
451 slot = __gfn_to_memslot(kvm, gfn);
452 if (!slot)
453 return bad_hva();
454 return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
455}
456
457/*
458 * Requires current->mm->mmap_sem to be held
459 */
460struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
461{
462 struct page *page[1];
463 unsigned long addr;
464 int npages;
465
466 might_sleep();
467
468 addr = gfn_to_hva(kvm, gfn);
469 if (kvm_is_error_hva(addr)) {
470 get_page(bad_page);
471 return bad_page;
472 }
473
474 npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
475 NULL);
476
477 if (npages != 1) {
478 get_page(bad_page);
479 return bad_page;
480 }
481
482 return page[0];
483}
 485EXPORT_SYMBOL_GPL(gfn_to_page);
486
487void kvm_release_page_clean(struct page *page)
488{
489 put_page(page);
490}
491EXPORT_SYMBOL_GPL(kvm_release_page_clean);
492
493void kvm_release_page_dirty(struct page *page)
494{
495 if (!PageReserved(page))
496 SetPageDirty(page);
497 put_page(page);
498}
499EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
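
/*
 * Illustrative sketch (not part of the original file): every
 * gfn_to_page() must be paired with a release, and the error page is
 * released like any other page.  A typical access looks like:
 *
 *	struct page *page;
 *	void *va;
 *
 *	page = gfn_to_page(kvm, gfn);
 *	if (is_error_page(page)) {
 *		kvm_release_page_clean(page);
 *		return -EFAULT;
 *	}
 *	va = kmap_atomic(page, KM_USER0);
 *	memset(va + offset, 0, len);
 *	kunmap_atomic(va, KM_USER0);
 *	kvm_release_page_dirty(page);
 *
 * Use kvm_release_page_clean() instead when the page was only read.
 */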
500
501static int next_segment(unsigned long len, int offset)
502{
503 if (len > PAGE_SIZE - offset)
504 return PAGE_SIZE - offset;
505 else
506 return len;
507}
508
509int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
510 int len)
511{
512 int r;
513 unsigned long addr;
514
515 addr = gfn_to_hva(kvm, gfn);
516 if (kvm_is_error_hva(addr))
517 return -EFAULT;
518 r = copy_from_user(data, (void __user *)addr + offset, len);
519 if (r)
520 return -EFAULT;
521 return 0;
522}
523EXPORT_SYMBOL_GPL(kvm_read_guest_page);
524
525int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
526{
527 gfn_t gfn = gpa >> PAGE_SHIFT;
528 int seg;
529 int offset = offset_in_page(gpa);
530 int ret;
531
532 while ((seg = next_segment(len, offset)) != 0) {
533 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
534 if (ret < 0)
535 return ret;
536 offset = 0;
537 len -= seg;
538 data += seg;
539 ++gfn;
540 }
541 return 0;
542}
543EXPORT_SYMBOL_GPL(kvm_read_guest);
544
545int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
546 unsigned long len)
547{
548 int r;
549 unsigned long addr;
550 gfn_t gfn = gpa >> PAGE_SHIFT;
551 int offset = offset_in_page(gpa);
552
553 addr = gfn_to_hva(kvm, gfn);
554 if (kvm_is_error_hva(addr))
555 return -EFAULT;
556 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
557 if (r)
558 return -EFAULT;
559 return 0;
560}
 561EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
562
563int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
564 int offset, int len)
565{
566 int r;
567 unsigned long addr;
568
569 addr = gfn_to_hva(kvm, gfn);
570 if (kvm_is_error_hva(addr))
571 return -EFAULT;
572 r = copy_to_user((void __user *)addr + offset, data, len);
573 if (r)
574 return -EFAULT;
575 mark_page_dirty(kvm, gfn);
576 return 0;
577}
578EXPORT_SYMBOL_GPL(kvm_write_guest_page);
579
580int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
581 unsigned long len)
582{
583 gfn_t gfn = gpa >> PAGE_SHIFT;
584 int seg;
585 int offset = offset_in_page(gpa);
586 int ret;
587
588 while ((seg = next_segment(len, offset)) != 0) {
589 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
590 if (ret < 0)
591 return ret;
592 offset = 0;
593 len -= seg;
594 data += seg;
595 ++gfn;
596 }
597 return 0;
598}
599
600int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
601{
602 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
603}
604EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
605
606int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
607{
608 gfn_t gfn = gpa >> PAGE_SHIFT;
609 int seg;
610 int offset = offset_in_page(gpa);
611 int ret;
612
613 while ((seg = next_segment(len, offset)) != 0) {
614 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
615 if (ret < 0)
616 return ret;
617 offset = 0;
618 len -= seg;
619 ++gfn;
620 }
621 return 0;
622}
623EXPORT_SYMBOL_GPL(kvm_clear_guest);
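
/*
 * Illustrative sketch (not part of the original file): kvm_read_guest()
 * and kvm_write_guest() operate on guest physical addresses and
 * transparently cross page boundaries, so a caller can move an
 * arbitrary structure in and out of guest memory.  struct guest_desc
 * is a made-up example type:
 *
 *	struct guest_desc desc;
 *
 *	if (kvm_read_guest(kvm, gpa, &desc, sizeof(desc)))
 *		return -EFAULT;
 *	desc.flags |= 1;
 *	if (kvm_write_guest(kvm, gpa, &desc, sizeof(desc)))
 *		return -EFAULT;
 *
 * kvm_write_guest_page() marks each touched page dirty, so dirty
 * logging keeps working without extra effort from the caller.
 */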
624
625void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
626{
627 struct kvm_memory_slot *memslot;
628
629 gfn = unalias_gfn(kvm, gfn);
630 memslot = __gfn_to_memslot(kvm, gfn);
631 if (memslot && memslot->dirty_bitmap) {
632 unsigned long rel_gfn = gfn - memslot->base_gfn;
633
634 /* avoid RMW */
635 if (!test_bit(rel_gfn, memslot->dirty_bitmap))
636 set_bit(rel_gfn, memslot->dirty_bitmap);
637 }
638}
639
640/*
641 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
642 */
643void kvm_vcpu_block(struct kvm_vcpu *vcpu)
644{
645 DECLARE_WAITQUEUE(wait, current);
646
647 add_wait_queue(&vcpu->wq, &wait);
648
649 /*
 650 * We block until an interrupt arrives, a signal is pending, or the vcpu becomes runnable.
651 */
652 while (!kvm_cpu_has_interrupt(vcpu)
653 && !signal_pending(current)
654 && !kvm_arch_vcpu_runnable(vcpu)) {
655 set_current_state(TASK_INTERRUPTIBLE);
656 vcpu_put(vcpu);
657 schedule();
658 vcpu_load(vcpu);
659 }
660
661 __set_current_state(TASK_RUNNING);
662 remove_wait_queue(&vcpu->wq, &wait);
663}
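
/*
 * Illustrative sketch (not part of the original file): the other half
 * of kvm_vcpu_block() lives in the interrupt-injection paths, which
 * are assumed to wake the vcpu roughly like this once an interrupt
 * becomes deliverable:
 *
 *	if (waitqueue_active(&vcpu->wq))
 *		wake_up_interruptible(&vcpu->wq);
 *
 * The loop above re-checks kvm_cpu_has_interrupt(), signal_pending()
 * and kvm_arch_vcpu_runnable() after every wakeup, so spurious wakeups
 * are harmless.
 */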
664
665void kvm_resched(struct kvm_vcpu *vcpu)
666{
667 if (!need_resched())
668 return;
669 cond_resched();
670}
671EXPORT_SYMBOL_GPL(kvm_resched);
672
673static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
674{
675 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
676 struct page *page;
677
678 if (vmf->pgoff == 0)
679 page = virt_to_page(vcpu->run);
680 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
681 page = virt_to_page(vcpu->arch.pio_data);
682 else
683 return VM_FAULT_SIGBUS;
684 get_page(page);
685 vmf->page = page;
686 return 0;
687}
688
689static struct vm_operations_struct kvm_vcpu_vm_ops = {
690 .fault = kvm_vcpu_fault,
691};
692
693static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
694{
695 vma->vm_ops = &kvm_vcpu_vm_ops;
696 return 0;
697}
698
699static int kvm_vcpu_release(struct inode *inode, struct file *filp)
700{
701 struct kvm_vcpu *vcpu = filp->private_data;
702
703 fput(vcpu->kvm->filp);
704 return 0;
705}
706
707static struct file_operations kvm_vcpu_fops = {
708 .release = kvm_vcpu_release,
709 .unlocked_ioctl = kvm_vcpu_ioctl,
710 .compat_ioctl = kvm_vcpu_ioctl,
711 .mmap = kvm_vcpu_mmap,
712};
713
714/*
 715 * Allocates a file descriptor, backed by an anonymous inode, for the vcpu.
716 */
717static int create_vcpu_fd(struct kvm_vcpu *vcpu)
718{
719 int fd, r;
720 struct inode *inode;
721 struct file *file;
722
723 r = anon_inode_getfd(&fd, &inode, &file,
724 "kvm-vcpu", &kvm_vcpu_fops, vcpu);
725 if (r)
726 return r;
727 atomic_inc(&vcpu->kvm->filp->f_count);
728 return fd;
729}
730
731/*
732 * Creates some virtual cpus. Good luck creating more than one.
733 */
734static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
735{
736 int r;
737 struct kvm_vcpu *vcpu;
738
739 if (!valid_vcpu(n))
740 return -EINVAL;
741
742 vcpu = kvm_arch_vcpu_create(kvm, n);
743 if (IS_ERR(vcpu))
744 return PTR_ERR(vcpu);
745
746 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
747
748 r = kvm_arch_vcpu_setup(vcpu);
749 if (r)
750 goto vcpu_destroy;
751
752 mutex_lock(&kvm->lock);
753 if (kvm->vcpus[n]) {
754 r = -EEXIST;
755 mutex_unlock(&kvm->lock);
756 goto vcpu_destroy;
757 }
758 kvm->vcpus[n] = vcpu;
759 mutex_unlock(&kvm->lock);
760
761 /* Now it's all set up, let userspace reach it */
762 r = create_vcpu_fd(vcpu);
763 if (r < 0)
764 goto unlink;
765 return r;
766
767unlink:
768 mutex_lock(&kvm->lock);
769 kvm->vcpus[n] = NULL;
770 mutex_unlock(&kvm->lock);
771vcpu_destroy:
772 kvm_arch_vcpu_destroy(vcpu);
773 return r;
774}
775
776static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
777{
778 if (sigset) {
779 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
780 vcpu->sigset_active = 1;
781 vcpu->sigset = *sigset;
782 } else
783 vcpu->sigset_active = 0;
784 return 0;
785}
786
787static long kvm_vcpu_ioctl(struct file *filp,
788 unsigned int ioctl, unsigned long arg)
789{
790 struct kvm_vcpu *vcpu = filp->private_data;
791 void __user *argp = (void __user *)arg;
792 int r;
793
794 if (vcpu->kvm->mm != current->mm)
795 return -EIO;
796 switch (ioctl) {
797 case KVM_RUN:
798 r = -EINVAL;
799 if (arg)
800 goto out;
801 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
802 break;
803 case KVM_GET_REGS: {
804 struct kvm_regs kvm_regs;
805
806 memset(&kvm_regs, 0, sizeof kvm_regs);
807 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
808 if (r)
809 goto out;
810 r = -EFAULT;
811 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
812 goto out;
813 r = 0;
814 break;
815 }
816 case KVM_SET_REGS: {
817 struct kvm_regs kvm_regs;
818
819 r = -EFAULT;
820 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
821 goto out;
822 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
823 if (r)
824 goto out;
825 r = 0;
826 break;
827 }
828 case KVM_GET_SREGS: {
829 struct kvm_sregs kvm_sregs;
830
831 memset(&kvm_sregs, 0, sizeof kvm_sregs);
832 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
833 if (r)
834 goto out;
835 r = -EFAULT;
836 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
837 goto out;
838 r = 0;
839 break;
840 }
841 case KVM_SET_SREGS: {
842 struct kvm_sregs kvm_sregs;
843
844 r = -EFAULT;
845 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
846 goto out;
847 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
848 if (r)
849 goto out;
850 r = 0;
851 break;
852 }
853 case KVM_TRANSLATE: {
854 struct kvm_translation tr;
855
856 r = -EFAULT;
857 if (copy_from_user(&tr, argp, sizeof tr))
858 goto out;
859 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
860 if (r)
861 goto out;
862 r = -EFAULT;
863 if (copy_to_user(argp, &tr, sizeof tr))
864 goto out;
865 r = 0;
866 break;
867 }
868 case KVM_DEBUG_GUEST: {
869 struct kvm_debug_guest dbg;
870
871 r = -EFAULT;
872 if (copy_from_user(&dbg, argp, sizeof dbg))
873 goto out;
874 r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
875 if (r)
876 goto out;
877 r = 0;
878 break;
879 }
880 case KVM_SET_SIGNAL_MASK: {
881 struct kvm_signal_mask __user *sigmask_arg = argp;
882 struct kvm_signal_mask kvm_sigmask;
883 sigset_t sigset, *p;
884
885 p = NULL;
886 if (argp) {
887 r = -EFAULT;
888 if (copy_from_user(&kvm_sigmask, argp,
889 sizeof kvm_sigmask))
890 goto out;
891 r = -EINVAL;
892 if (kvm_sigmask.len != sizeof sigset)
893 goto out;
894 r = -EFAULT;
895 if (copy_from_user(&sigset, sigmask_arg->sigset,
896 sizeof sigset))
897 goto out;
898 p = &sigset;
899 }
 900 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
901 break;
902 }
903 case KVM_GET_FPU: {
904 struct kvm_fpu fpu;
905
906 memset(&fpu, 0, sizeof fpu);
907 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
908 if (r)
909 goto out;
910 r = -EFAULT;
911 if (copy_to_user(argp, &fpu, sizeof fpu))
912 goto out;
913 r = 0;
914 break;
915 }
916 case KVM_SET_FPU: {
917 struct kvm_fpu fpu;
918
919 r = -EFAULT;
920 if (copy_from_user(&fpu, argp, sizeof fpu))
921 goto out;
922 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
923 if (r)
924 goto out;
925 r = 0;
926 break;
927 }
928 default:
929 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
930 }
931out:
932 return r;
933}
934
935static long kvm_vm_ioctl(struct file *filp,
936 unsigned int ioctl, unsigned long arg)
937{
938 struct kvm *kvm = filp->private_data;
939 void __user *argp = (void __user *)arg;
940 int r;
941
942 if (kvm->mm != current->mm)
943 return -EIO;
944 switch (ioctl) {
945 case KVM_CREATE_VCPU:
946 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
947 if (r < 0)
948 goto out;
949 break;
950 case KVM_SET_USER_MEMORY_REGION: {
951 struct kvm_userspace_memory_region kvm_userspace_mem;
952
953 r = -EFAULT;
954 if (copy_from_user(&kvm_userspace_mem, argp,
955 sizeof kvm_userspace_mem))
956 goto out;
957
958 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
959 if (r)
960 goto out;
961 break;
962 }
963 case KVM_GET_DIRTY_LOG: {
964 struct kvm_dirty_log log;
965
966 r = -EFAULT;
967 if (copy_from_user(&log, argp, sizeof log))
968 goto out;
969 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
970 if (r)
971 goto out;
972 break;
973 }
974 default:
975 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
976 }
977out:
978 return r;
979}
980
981static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
982{
983 struct kvm *kvm = vma->vm_file->private_data;
984 struct page *page;
985
986 if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
987 return VM_FAULT_SIGBUS;
988 page = gfn_to_page(kvm, vmf->pgoff);
989 if (is_error_page(page)) {
990 kvm_release_page_clean(page);
991 return VM_FAULT_SIGBUS;
992 }
993 vmf->page = page;
994 return 0;
995}
996
997static struct vm_operations_struct kvm_vm_vm_ops = {
998 .fault = kvm_vm_fault,
999};
1000
1001static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
1002{
1003 vma->vm_ops = &kvm_vm_vm_ops;
1004 return 0;
1005}
1006
1007static struct file_operations kvm_vm_fops = {
1008 .release = kvm_vm_release,
1009 .unlocked_ioctl = kvm_vm_ioctl,
1010 .compat_ioctl = kvm_vm_ioctl,
1011 .mmap = kvm_vm_mmap,
1012};
1013
1014static int kvm_dev_ioctl_create_vm(void)
1015{
1016 int fd, r;
1017 struct inode *inode;
1018 struct file *file;
1019 struct kvm *kvm;
1020
1021 kvm = kvm_create_vm();
1022 if (IS_ERR(kvm))
1023 return PTR_ERR(kvm);
1024 r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
1025 if (r) {
1026 kvm_destroy_vm(kvm);
1027 return r;
1028 }
1029
1030 kvm->filp = file;
1031
1032 return fd;
1033}
1034
1035static long kvm_dev_ioctl(struct file *filp,
1036 unsigned int ioctl, unsigned long arg)
1037{
1038 void __user *argp = (void __user *)arg;
1039 long r = -EINVAL;
1040
1041 switch (ioctl) {
1042 case KVM_GET_API_VERSION:
1043 r = -EINVAL;
1044 if (arg)
1045 goto out;
1046 r = KVM_API_VERSION;
1047 break;
1048 case KVM_CREATE_VM:
1049 r = -EINVAL;
1050 if (arg)
1051 goto out;
1052 r = kvm_dev_ioctl_create_vm();
1053 break;
1054 case KVM_CHECK_EXTENSION:
1055 r = kvm_dev_ioctl_check_extension((long)argp);
1056 break;
1057 case KVM_GET_VCPU_MMAP_SIZE:
1058 r = -EINVAL;
1059 if (arg)
1060 goto out;
1061 r = 2 * PAGE_SIZE;
1062 break;
1063 default:
1064 return kvm_arch_dev_ioctl(filp, ioctl, arg);
1065 }
1066out:
1067 return r;
1068}
1069
1070static struct file_operations kvm_chardev_ops = {
1071 .unlocked_ioctl = kvm_dev_ioctl,
1072 .compat_ioctl = kvm_dev_ioctl,
1073};
1074
1075static struct miscdevice kvm_dev = {
1076 KVM_MINOR,
1077 "kvm",
1078 &kvm_chardev_ops,
1079};
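
/*
 * Illustrative sketch (not part of the original file): from userspace
 * the three file descriptor levels above (/dev/kvm, the VM fd, the
 * vcpu fd) are used roughly as follows; error handling and guest
 * memory/register setup are omitted:
 *
 *	int sys = open("/dev/kvm", O_RDWR);
 *	if (ioctl(sys, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *		exit(1);
 *	int vm = ioctl(sys, KVM_CREATE_VM, 0);
 *	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
 *	long sz = ioctl(sys, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu, 0);
 *	for (;;) {
 *		ioctl(vcpu, KVM_RUN, 0);
 *		handle_exit(run);
 *	}
 *
 * handle_exit() is a placeholder for dispatching on run->exit_reason.
 * KVM_GET_VCPU_MMAP_SIZE reports two pages here: the kvm_run structure
 * itself and, at KVM_PIO_PAGE_OFFSET, the PIO data page served by
 * kvm_vcpu_fault() above.
 */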
1080
1081static void hardware_enable(void *junk)
1082{
1083 int cpu = raw_smp_processor_id();
1084
1085 if (cpu_isset(cpu, cpus_hardware_enabled))
1086 return;
1087 cpu_set(cpu, cpus_hardware_enabled);
1088 kvm_arch_hardware_enable(NULL);
1089}
1090
1091static void hardware_disable(void *junk)
1092{
1093 int cpu = raw_smp_processor_id();
1094
1095 if (!cpu_isset(cpu, cpus_hardware_enabled))
1096 return;
1097 cpu_clear(cpu, cpus_hardware_enabled);
1098 decache_vcpus_on_cpu(cpu);
1099 kvm_arch_hardware_disable(NULL);
1100}
1101
1102static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
1103 void *v)
1104{
1105 int cpu = (long)v;
1106
1107 val &= ~CPU_TASKS_FROZEN;
1108 switch (val) {
1109 case CPU_DYING:
1110 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
1111 cpu);
1112 hardware_disable(NULL);
1113 break;
1114 case CPU_UP_CANCELED:
1115 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
1116 cpu);
1117 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
1118 break;
1119 case CPU_ONLINE:
1120 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
1121 cpu);
1122 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
1123 break;
1124 }
1125 return NOTIFY_OK;
1126}
1127
1128static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
1129 void *v)
1130{
1131 if (val == SYS_RESTART) {
1132 /*
1133 * Some (well, at least mine) BIOSes hang on reboot if
1134 * in vmx root mode.
1135 */
1136 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
1137 on_each_cpu(hardware_disable, NULL, 0, 1);
1138 }
1139 return NOTIFY_OK;
1140}
1141
1142static struct notifier_block kvm_reboot_notifier = {
1143 .notifier_call = kvm_reboot,
1144 .priority = 0,
1145};
1146
1147void kvm_io_bus_init(struct kvm_io_bus *bus)
1148{
1149 memset(bus, 0, sizeof(*bus));
1150}
1151
1152void kvm_io_bus_destroy(struct kvm_io_bus *bus)
1153{
1154 int i;
1155
1156 for (i = 0; i < bus->dev_count; i++) {
1157 struct kvm_io_device *pos = bus->devs[i];
1158
1159 kvm_iodevice_destructor(pos);
1160 }
1161}
1162
1163struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
1164{
1165 int i;
1166
1167 for (i = 0; i < bus->dev_count; i++) {
1168 struct kvm_io_device *pos = bus->devs[i];
1169
1170 if (pos->in_range(pos, addr))
1171 return pos;
1172 }
1173
1174 return NULL;
1175}
1176
1177void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
1178{
1179 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
1180
1181 bus->devs[bus->dev_count++] = dev;
1182}
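
/*
 * Illustrative sketch (not part of the original file): an emulated
 * device hooks itself into a bus by filling in a struct kvm_io_device
 * (declared in "iodev.h") and registering it.  Only ->in_range() is
 * visible in this file; the remaining callbacks come from iodev.h and
 * are not shown, and MY_DEV_BASE/MY_DEV_LEN are example constants:
 *
 *	static int my_dev_in_range(struct kvm_io_device *dev, gpa_t addr)
 *	{
 *		return addr >= MY_DEV_BASE && addr < MY_DEV_BASE + MY_DEV_LEN;
 *	}
 *
 *	dev->in_range = my_dev_in_range;
 *	kvm_io_bus_register_dev(&kvm->mmio_bus, dev);
 *
 * kvm_io_bus_find_dev() then routes MMIO or PIO accesses to the first
 * registered device that claims the address.
 */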
1183
1184static struct notifier_block kvm_cpu_notifier = {
1185 .notifier_call = kvm_cpu_hotplug,
1186 .priority = 20, /* must be > scheduler priority */
1187};
1188
1189static u64 vm_stat_get(void *_offset)
1190{
1191 unsigned offset = (long)_offset;
1192 u64 total = 0;
1193 struct kvm *kvm;
1194
1195 spin_lock(&kvm_lock);
1196 list_for_each_entry(kvm, &vm_list, vm_list)
1197 total += *(u32 *)((void *)kvm + offset);
1198 spin_unlock(&kvm_lock);
1199 return total;
1200}
1201
1202DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
1203
1204static u64 vcpu_stat_get(void *_offset)
1205{
1206 unsigned offset = (long)_offset;
1207 u64 total = 0;
1208 struct kvm *kvm;
1209 struct kvm_vcpu *vcpu;
1210 int i;
1211
1212 spin_lock(&kvm_lock);
1213 list_for_each_entry(kvm, &vm_list, vm_list)
1214 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
1215 vcpu = kvm->vcpus[i];
1216 if (vcpu)
1217 total += *(u32 *)((void *)vcpu + offset);
1218 }
1219 spin_unlock(&kvm_lock);
1220 return total;
1221}
1222
1223DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
1224
1225static struct file_operations *stat_fops[] = {
1226 [KVM_STAT_VCPU] = &vcpu_stat_fops,
1227 [KVM_STAT_VM] = &vm_stat_fops,
1228};
1229
1230static void kvm_init_debug(void)
1231{
1232 struct kvm_stats_debugfs_item *p;
1233
1234 debugfs_dir = debugfs_create_dir("kvm", NULL);
1235 for (p = debugfs_entries; p->name; ++p)
1236 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
1237 (void *)(long)p->offset,
1238 stat_fops[p->kind]);
1239}
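
/*
 * Illustrative sketch (not part of the original file): the
 * debugfs_entries[] table consumed here is defined by the architecture
 * code.  The layout of struct kvm_stats_debugfs_item is not visible in
 * this file, so the initializer below is only an assumption about its
 * shape, keyed off the fields used above (name, offset, kind, dentry):
 *
 *	struct kvm_stats_debugfs_item debugfs_entries[] = {
 *		{ "remote_tlb_flush",
 *		  offsetof(struct kvm, stat.remote_tlb_flush),
 *		  KVM_STAT_VM },
 *		{ NULL }
 *	};
 *
 * Each entry becomes a read-only file under the "kvm" debugfs
 * directory (typically /sys/kernel/debug/kvm/), and vm_stat_get() or
 * vcpu_stat_get() sums the counter at that offset over all VMs or all
 * vcpus respectively.
 */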
1240
1241static void kvm_exit_debug(void)
1242{
1243 struct kvm_stats_debugfs_item *p;
1244
1245 for (p = debugfs_entries; p->name; ++p)
1246 debugfs_remove(p->dentry);
1247 debugfs_remove(debugfs_dir);
1248}
1249
1250static int kvm_suspend(struct sys_device *dev, pm_message_t state)
1251{
1252 hardware_disable(NULL);
1253 return 0;
1254}
1255
1256static int kvm_resume(struct sys_device *dev)
1257{
1258 hardware_enable(NULL);
1259 return 0;
1260}
1261
1262static struct sysdev_class kvm_sysdev_class = {
1263 .name = "kvm",
1264 .suspend = kvm_suspend,
1265 .resume = kvm_resume,
1266};
1267
1268static struct sys_device kvm_sysdev = {
1269 .id = 0,
1270 .cls = &kvm_sysdev_class,
1271};
1272
1273struct page *bad_page;
1274
1275static inline
1276struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
1277{
1278 return container_of(pn, struct kvm_vcpu, preempt_notifier);
1279}
1280
1281static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
1282{
1283 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
1284
1285 kvm_arch_vcpu_load(vcpu, cpu);
1286}
1287
1288static void kvm_sched_out(struct preempt_notifier *pn,
1289 struct task_struct *next)
1290{
1291 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
1292
1293 kvm_arch_vcpu_put(vcpu);
1294}
1295
1296int kvm_init(void *opaque, unsigned int vcpu_size,
1297 struct module *module)
1298{
1299 int r;
1300 int cpu;
1301
1302 kvm_init_debug();
1303
1304 r = kvm_arch_init(opaque);
1305 if (r)
1306 goto out_fail;
1307
1308 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1309
1310 if (bad_page == NULL) {
1311 r = -ENOMEM;
1312 goto out;
1313 }
1314
1315 r = kvm_arch_hardware_setup();
1316 if (r < 0)
1317 goto out_free_0;
1318
1319 for_each_online_cpu(cpu) {
1320 smp_call_function_single(cpu,
1321 kvm_arch_check_processor_compat,
1322 &r, 0, 1);
1323 if (r < 0)
1324 goto out_free_1;
1325 }
1326
1327 on_each_cpu(hardware_enable, NULL, 0, 1);
1328 r = register_cpu_notifier(&kvm_cpu_notifier);
1329 if (r)
1330 goto out_free_2;
1331 register_reboot_notifier(&kvm_reboot_notifier);
1332
1333 r = sysdev_class_register(&kvm_sysdev_class);
1334 if (r)
1335 goto out_free_3;
1336
1337 r = sysdev_register(&kvm_sysdev);
1338 if (r)
1339 goto out_free_4;
1340
1341 /* A kmem cache lets us meet the alignment requirements of fx_save. */
1342 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
1343 __alignof__(struct kvm_vcpu),
1344 0, NULL);
1345 if (!kvm_vcpu_cache) {
1346 r = -ENOMEM;
1347 goto out_free_5;
1348 }
1349
1350 kvm_chardev_ops.owner = module;
1351
1352 r = misc_register(&kvm_dev);
1353 if (r) {
1354 printk(KERN_ERR "kvm: misc device register failed\n");
1355 goto out_free;
1356 }
1357
1358 kvm_preempt_ops.sched_in = kvm_sched_in;
1359 kvm_preempt_ops.sched_out = kvm_sched_out;
1360
1361 return 0;
1362
1363out_free:
1364 kmem_cache_destroy(kvm_vcpu_cache);
1365out_free_5:
1366 sysdev_unregister(&kvm_sysdev);
1367out_free_4:
1368 sysdev_class_unregister(&kvm_sysdev_class);
1369out_free_3:
1370 unregister_reboot_notifier(&kvm_reboot_notifier);
1371 unregister_cpu_notifier(&kvm_cpu_notifier);
1372out_free_2:
1373 on_each_cpu(hardware_disable, NULL, 0, 1);
1374out_free_1:
1375 kvm_arch_hardware_unsetup();
1376out_free_0:
1377 __free_page(bad_page);
1378out:
1379 kvm_arch_exit();
 1380out_fail:
 1381 kvm_exit_debug();
1382 return r;
1383}
1384EXPORT_SYMBOL_GPL(kvm_init);
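
/*
 * Illustrative sketch (not part of the original file): an architecture
 * backend is expected to be a module whose init/exit simply forward to
 * kvm_init()/kvm_exit(), passing whatever kvm_arch_init() of that
 * architecture expects (typically its ops table) as the opaque pointer,
 * plus the size of its vcpu container so the kvm_vcpu cache is sized
 * correctly.  my_arch_ops and struct my_arch_vcpu are placeholders:
 *
 *	static int __init my_arch_kvm_init(void)
 *	{
 *		return kvm_init(&my_arch_ops, sizeof(struct my_arch_vcpu),
 *				THIS_MODULE);
 *	}
 *
 *	static void __exit my_arch_kvm_exit(void)
 *	{
 *		kvm_exit();
 *	}
 *
 *	module_init(my_arch_kvm_init);
 *	module_exit(my_arch_kvm_exit);
 */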
1385
1386void kvm_exit(void)
1387{
1388 misc_deregister(&kvm_dev);
1389 kmem_cache_destroy(kvm_vcpu_cache);
1390 sysdev_unregister(&kvm_sysdev);
1391 sysdev_class_unregister(&kvm_sysdev_class);
1392 unregister_reboot_notifier(&kvm_reboot_notifier);
1393 unregister_cpu_notifier(&kvm_cpu_notifier);
1394 on_each_cpu(hardware_disable, NULL, 0, 1);
1395 kvm_arch_hardware_unsetup();
1396 kvm_arch_exit();
1397 kvm_exit_debug();
1398 __free_page(bad_page);
1399}
1400EXPORT_SYMBOL_GPL(kvm_exit);