182 files changed, 14842 insertions, 9360 deletions
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index aa38cc5692a0..77436d735013 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -419,7 +419,13 @@ X!Edrivers/pnp/system.c
 
  <chapter id="blkdev">
      <title>Block Devices</title>
-!Eblock/ll_rw_blk.c
+!Eblock/blk-core.c
+!Eblock/blk-map.c
+!Iblock/blk-sysfs.c
+!Eblock/blk-settings.c
+!Eblock/blk-exec.c
+!Eblock/blk-barrier.c
+!Eblock/blk-tag.c
  </chapter>
 
  <chapter id="chrdev">
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 9b0e322118b5..6c8a2386cd50 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -79,6 +79,9 @@ static void *guest_base;
 /* The maximum guest physical address allowed, and maximum possible. */
 static unsigned long guest_limit, guest_max;
 
+/* a per-cpu variable indicating whose vcpu is currently running */
+static unsigned int __thread cpu_id;
+
 /* This is our list of devices. */
 struct device_list
 {
@@ -153,6 +156,9 @@ struct virtqueue
 	void (*handle_output)(int fd, struct virtqueue *me);
 };
 
+/* Remember the arguments to the program so we can "reboot" */
+static char **main_args;
+
 /* Since guest is UP and we don't run at the same time, we don't need barriers.
  * But I include them in the code in case others copy it. */
 #define wmb()
@@ -554,7 +560,7 @@ static void wake_parent(int pipefd, int lguest_fd)
 			else
 				FD_CLR(-fd - 1, &devices.infds);
 		} else /* Send LHREQ_BREAK command. */
-			write(lguest_fd, args, sizeof(args));
+			pwrite(lguest_fd, args, sizeof(args), cpu_id);
 	}
 }
 
@@ -1489,7 +1495,9 @@ static void setup_block_file(const char *filename)
 
 	/* Create stack for thread and run it */
 	stack = malloc(32768);
-	if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1)
+	/* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
+	 * becoming a zombie. */
+	if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
 		err(1, "Creating clone");
 
 	/* We don't need to keep the I/O thread's end of the pipes open. */
@@ -1499,7 +1507,21 @@ static void setup_block_file(const char *filename)
 	verbose("device %u: virtblock %llu sectors\n",
 		devices.device_num, cap);
 }
-/* That's the end of device setup. */
+/* That's the end of device setup. :*/
+
+/* Reboot */
+static void __attribute__((noreturn)) restart_guest(void)
+{
+	unsigned int i;
+
+	/* Closing pipes causes the waker thread and io_threads to die, and
+	 * closing /dev/lguest cleans up the Guest. Since we don't track all
+	 * open fds, we simply close everything beyond stderr. */
+	for (i = 3; i < FD_SETSIZE; i++)
+		close(i);
+	execv(main_args[0], main_args);
+	err(1, "Could not exec %s", main_args[0]);
+}
 
 /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
  * its input and output, and finally, lays it to rest. */
@@ -1511,7 +1533,8 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
 		int readval;
 
 		/* We read from the /dev/lguest device to run the Guest. */
-		readval = read(lguest_fd, &notify_addr, sizeof(notify_addr));
+		readval = pread(lguest_fd, &notify_addr,
+				sizeof(notify_addr), cpu_id);
 
 		/* One unsigned long means the Guest did HCALL_NOTIFY */
 		if (readval == sizeof(notify_addr)) {
@@ -1521,16 +1544,23 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
 		/* ENOENT means the Guest died. Reading tells us why. */
 		} else if (errno == ENOENT) {
 			char reason[1024] = { 0 };
-			read(lguest_fd, reason, sizeof(reason)-1);
+			pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
 			errx(1, "%s", reason);
+		/* ERESTART means that we need to reboot the guest */
+		} else if (errno == ERESTART) {
+			restart_guest();
 		/* EAGAIN means the Waker wanted us to look at some input.
 		 * Anything else means a bug or incompatible change. */
 		} else if (errno != EAGAIN)
 			err(1, "Running guest failed");
 
+		/* Only service input on thread for CPU 0. */
+		if (cpu_id != 0)
+			continue;
+
 		/* Service input, then unset the BREAK to release the Waker. */
 		handle_input(lguest_fd);
-		if (write(lguest_fd, args, sizeof(args)) < 0)
+		if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
 			err(1, "Resetting break");
 	}
 }
@@ -1571,6 +1601,12 @@ int main(int argc, char *argv[])
 	/* If they specify an initrd file to load. */
 	const char *initrd_name = NULL;
 
+	/* Save the args: we "reboot" by execing ourselves again. */
+	main_args = argv;
+	/* We don't "wait" for the children, so prevent them from becoming
+	 * zombies. */
+	signal(SIGCHLD, SIG_IGN);
+
 	/* First we initialize the device list. Since console and network
 	 * device receive input from a file descriptor, we keep an fdset
 	 * (infds) and the maximum fd number (max_infd) with the head of the
@@ -1582,6 +1618,7 @@ int main(int argc, char *argv[])
 	devices.lastdev = &devices.dev;
 	devices.next_irq = 1;
 
+	cpu_id = 0;
 	/* We need to know how much memory so we can set up the device
 	 * descriptor and memory pages for the devices as we parse the command
 	 * line. So we quickly look through the arguments to find the amount
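A minimal sketch (ours, not part of the patch) of the convention the lguest hunks above introduce: the launcher reuses the file-offset argument of pread()/pwrite() on the /dev/lguest fd to name the virtual CPU being driven, which is why every read/write grew a cpu_id. Error handling is abbreviated.

/* Sketch only: assumes an already-open /dev/lguest fd and the
 * offset-as-vcpu convention from the diff above. */
#include <errno.h>
#include <unistd.h>

static int run_one_vcpu(int lguest_fd, unsigned int cpu_id)
{
	unsigned long notify_addr;
	ssize_t r;

	/* The "offset" is not a file position here; the lguest device
	 * interprets it as the id of the virtual cpu to run. */
	r = pread(lguest_fd, &notify_addr, sizeof(notify_addr), cpu_id);
	if (r == sizeof(notify_addr))
		return 0;	/* Guest issued HCALL_NOTIFY at notify_addr */
	return errno;		/* e.g. ENOENT (died), ERESTART, EAGAIN */
}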
diff --git a/arch/ia64/hp/sim/simscsi.c b/arch/ia64/hp/sim/simscsi.c
index 6ef9b5219930..7661bb065fa5 100644
--- a/arch/ia64/hp/sim/simscsi.c
+++ b/arch/ia64/hp/sim/simscsi.c
@@ -360,7 +360,6 @@ static struct scsi_host_template driver_template = {
 	.max_sectors		= 1024,
 	.cmd_per_lun		= SIMSCSI_REQ_QUEUE_LEN,
 	.use_clustering		= DISABLE_CLUSTERING,
-	.use_sg_chaining	= ENABLE_SG_CHAINING,
 };
 
 static int __init
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 19a5656001c0..f0bad7070fb5 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -37,8 +37,6 @@
 #include <asm/iseries/hv_call_xm.h>
 #include <asm/iseries/iommu.h>
 
-extern struct kset devices_subsys; /* needed for vio_find_name() */
-
 static struct bus_type vio_bus_type;
 
 static struct vio_dev vio_bus_device = { /* fake "parent" device */
@@ -361,19 +359,16 @@ EXPORT_SYMBOL(vio_get_attribute);
 #ifdef CONFIG_PPC_PSERIES
 /* vio_find_name() - internal because only vio.c knows how we formatted the
  * kobject name
- * XXX once vio_bus_type.devices is actually used as a kset in
- * drivers/base/bus.c, this function should be removed in favor of
- * "device_find(kobj_name, &vio_bus_type)"
 */
-static struct vio_dev *vio_find_name(const char *kobj_name)
+static struct vio_dev *vio_find_name(const char *name)
 {
-	struct kobject *found;
+	struct device *found;
 
-	found = kset_find_obj(&devices_subsys, kobj_name);
+	found = bus_find_device_by_name(&vio_bus_type, NULL, name);
 	if (!found)
 		return NULL;
 
-	return to_vio_dev(container_of(found, struct device, kobj));
+	return to_vio_dev(found);
 }
 
 /**
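The vio.c hunk above is part of a driver-core cleanup: rather than reaching into the global devices_subsys kset, a bus driver can look a device up by name with the generic helper. A hedged sketch of the resulting pattern (the wrapper function is illustrative, not from the patch):

#include <linux/device.h>
#include <linux/errno.h>

static int use_named_device(struct bus_type *bus, const char *name)
{
	/* bus_find_device_by_name() matches on the device name and takes
	 * a reference on the device it returns. */
	struct device *dev = bus_find_device_by_name(bus, NULL, name);

	if (!dev)
		return -ENODEV;
	/* ... use dev ... */
	put_device(dev);	/* drop the reference the lookup took */
	return 0;
}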
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fb3eea3e38ee..65b449134cf7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -107,6 +107,7 @@ config ARCH_SUPPORTS_OPROFILE
 	bool
 	default y
 
+select HAVE_KVM
 
 config ZONE_DMA32
 	bool
@@ -1598,4 +1599,6 @@ source "security/Kconfig"
 
 source "crypto/Kconfig"
 
+source "arch/x86/kvm/Kconfig"
+
 source "lib/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b08f18261df6..da8f4129780b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,6 +7,8 @@ else
         KBUILD_DEFCONFIG := $(ARCH)_defconfig
 endif
 
+core-$(CONFIG_KVM) += arch/x86/kvm/
+
 # BITS is used as extension for files which are available in a 32 bit
 # and a 64 bit version to simplify shared Makefiles.
 # e.g.: obj-y += foo_$(BITS).o
diff --git a/drivers/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 656920636cb2..c83e1c9b5129 100644
--- a/drivers/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -1,9 +1,12 @@
 #
 # KVM configuration
 #
+config HAVE_KVM
+	bool
+
 menuconfig VIRTUALIZATION
 	bool "Virtualization"
-	depends on X86
+	depends on HAVE_KVM || X86
 	default y
 	---help---
 	  Say Y here to get to see options for using your Linux host to run other
@@ -16,7 +19,7 @@ if VIRTUALIZATION
 
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
-	depends on X86 && EXPERIMENTAL
+	depends on HAVE_KVM && EXPERIMENTAL
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	---help---
diff --git a/drivers/kvm/Makefile b/arch/x86/kvm/Makefile
index e5a8f4d3e973..ffdd0b310784 100644
--- a/drivers/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,7 +2,11 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
+
+EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+
+kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a679157bc599..ab29cf2def47 100644
--- a/drivers/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -28,6 +28,8 @@
 #include <linux/mm.h>
 #include "irq.h"
 
+#include <linux/kvm_host.h>
+
 /*
  * set irq level. If an edge is detected, then the IRR is set to 1
  */
@@ -181,10 +183,8 @@ int kvm_pic_read_irq(struct kvm_pic *s)
 	return intno;
 }
 
-static void pic_reset(void *opaque)
+void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-	struct kvm_kpic_state *s = opaque;
-
 	s->last_irr = 0;
 	s->irr = 0;
 	s->imr = 0;
@@ -209,7 +209,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 	addr &= 1;
 	if (addr == 0) {
 		if (val & 0x10) {
-			pic_reset(s);	/* init */
+			kvm_pic_reset(s);	/* init */
 			/*
 			 * deassert a pending interrupt
 			 */
diff --git a/drivers/kvm/irq.c b/arch/x86/kvm/irq.c
index 7628c7ff628f..e5714759e97f 100644
--- a/drivers/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -20,8 +20,8 @@
  */
 
 #include <linux/module.h>
+#include <linux/kvm_host.h>
 
-#include "kvm.h"
 #include "irq.h"
 
 /*
@@ -63,26 +63,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
-	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
-	int ipi_pcpu = vcpu->cpu;
-
-	if (waitqueue_active(&vcpu->wq)) {
-		wake_up_interruptible(&vcpu->wq);
-		++vcpu->stat.halt_wakeup;
-	}
-	if (vcpu->guest_mode)
-		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
-}
-
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
 	kvm_inject_apic_timer_irqs(vcpu);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644
index 000000000000..fa5ed5d59b5d
--- /dev/null
+++ b/arch/x86/kvm/irq.h
@@ -0,0 +1,88 @@
+/*
+ * irq.h: in kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/mm_types.h>
+#include <linux/hrtimer.h>
+#include <linux/kvm_host.h>
+
+#include "iodev.h"
+#include "ioapic.h"
+#include "lapic.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+typedef void irq_request_func(void *opaque, int level);
+
+struct kvm_kpic_state {
+	u8 last_irr;	/* edge detection */
+	u8 irr;		/* interrupt request register */
+	u8 imr;		/* interrupt mask register */
+	u8 isr;		/* interrupt service register */
+	u8 priority_add;	/* highest irq priority */
+	u8 irq_base;
+	u8 read_reg_select;
+	u8 poll;
+	u8 special_mask;
+	u8 init_state;
+	u8 auto_eoi;
+	u8 rotate_on_auto_eoi;
+	u8 special_fully_nested_mode;
+	u8 init4;		/* true if 4 byte init */
+	u8 elcr;		/* PIIX edge/trigger selection */
+	u8 elcr_mask;
+	struct kvm_pic *pics_state;
+};
+
+struct kvm_pic {
+	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+	irq_request_func *irq_request;
+	void *irq_request_opaque;
+	int output;		/* intr from master PIC */
+	struct kvm_io_device dev;
+};
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_read_irq(struct kvm_pic *s);
+void kvm_pic_update_irq(struct kvm_pic *s);
+
+static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
+{
+	return kvm->arch.vpic;
+}
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+	return pic_irqchip(kvm) != NULL;
+}
+
+void kvm_pic_reset(struct kvm_kpic_state *s);
+
+void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+
+#endif
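A short usage sketch for the helpers this new header declares (the surrounding function is illustrative, not from the patch): interrupt lines should only be routed to the in-kernel PIC when that irqchip actually exists, and kvm_pic_set_irq() takes the PIC itself as its opaque first argument.

/* Illustrative only; call shapes follow the declarations above. */
static void example_raise_irq_line(struct kvm *kvm, int irq, int level)
{
	if (irqchip_in_kernel(kvm))
		kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
}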
diff --git a/drivers/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index a0e415daef5b..ecdfe97e4635 100644
--- a/drivers/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -4,10 +4,10 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/kvm_host.h>
 #include <asm/msr.h>
 
 #include "svm.h"
-#include "kvm.h"
 
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
diff --git a/drivers/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 238fcad3cece..2cbee9479ce4 100644
--- a/drivers/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -17,7 +17,7 @@
  * the COPYING file in the top-level directory.
  */
 
-#include "kvm.h"
+#include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
@@ -56,6 +56,7 @@
 
 #define VEC_POS(v) ((v) & (32 - 1))
 #define REG_POS(v) (((v) >> 5) << 4)
+
 static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
 {
 	return *((u32 *) (apic->regs + reg_off));
@@ -88,7 +89,7 @@ static inline void apic_clear_vector(int vec, void *bitmap)
 
 static inline int apic_hw_enabled(struct kvm_lapic *apic)
 {
-	return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE;
+	return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
 }
 
 static inline int apic_sw_enabled(struct kvm_lapic *apic)
@@ -172,7 +173,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 	int highest_irr;
 
 	if (!apic)
@@ -183,8 +184,10 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
 
-int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
 {
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
 	if (!apic_test_and_set_irr(vec, apic)) {
 		/* a new pending irq is set in IRR */
 		if (trig)
@@ -268,7 +271,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 			   int short_hand, int dest, int dest_mode)
 {
 	int result = 0;
-	struct kvm_lapic *target = vcpu->apic;
+	struct kvm_lapic *target = vcpu->arch.apic;
 
 	apic_debug("target %p, source %p, dest 0x%x, "
 		   "dest_mode 0x%x, short_hand 0x%x",
@@ -335,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		} else
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
 
-		if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+		if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
 			kvm_vcpu_kick(vcpu);
-		else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
-			vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+		else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
+			vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
 			if (waitqueue_active(&vcpu->wq))
 				wake_up_interruptible(&vcpu->wq);
 		}
@@ -359,11 +362,11 @@
 
 	case APIC_DM_INIT:
 		if (level) {
-			if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+			if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
 				printk(KERN_DEBUG
 				       "INIT on a runnable vcpu %d\n",
 				       vcpu->vcpu_id);
-			vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
+			vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
 			kvm_vcpu_kick(vcpu);
 		} else {
 			printk(KERN_DEBUG
@@ -376,9 +379,9 @@
 	case APIC_DM_STARTUP:
 		printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
 		       vcpu->vcpu_id, vector);
-		if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
-			vcpu->sipi_vector = vector;
-			vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
+		if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
+			vcpu->arch.sipi_vector = vector;
+			vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
 			if (waitqueue_active(&vcpu->wq))
 				wake_up_interruptible(&vcpu->wq);
 		}
@@ -392,15 +395,14 @@
 	return result;
 }
 
-struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
+static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
 				       unsigned long bitmap)
 {
-	int vcpu_id;
 	int last;
 	int next;
-	struct kvm_lapic *apic;
+	struct kvm_lapic *apic = NULL;
 
-	last = kvm->round_robin_prev_vcpu;
+	last = kvm->arch.round_robin_prev_vcpu;
 	next = last;
 
 	do {
@@ -408,25 +410,30 @@ struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
 			next = 0;
 		if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
 			continue;
-		apic = kvm->vcpus[next]->apic;
+		apic = kvm->vcpus[next]->arch.apic;
 		if (apic && apic_enabled(apic))
 			break;
 		apic = NULL;
 	} while (next != last);
-	kvm->round_robin_prev_vcpu = next;
+	kvm->arch.round_robin_prev_vcpu = next;
 
-	if (!apic) {
-		vcpu_id = ffs(bitmap) - 1;
-		if (vcpu_id < 0) {
-			vcpu_id = 0;
-			printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
-		}
-		apic = kvm->vcpus[vcpu_id]->apic;
-	}
+	if (!apic)
+		printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
 
 	return apic;
 }
 
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+				       unsigned long bitmap)
+{
+	struct kvm_lapic *apic;
+
+	apic = kvm_apic_round_robin(kvm, vector, bitmap);
+	if (apic)
+		return apic->vcpu;
+	return NULL;
+}
+
 static void apic_set_eoi(struct kvm_lapic *apic)
 {
 	int vector = apic_find_highest_isr(apic);
@@ -458,7 +465,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 	unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
 	unsigned int vector = icr_low & APIC_VECTOR_MASK;
 
-	struct kvm_lapic *target;
+	struct kvm_vcpu *target;
 	struct kvm_vcpu *vcpu;
 	unsigned long lpr_map = 0;
 	int i;
@@ -474,20 +481,20 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 		if (!vcpu)
 			continue;
 
-		if (vcpu->apic &&
+		if (vcpu->arch.apic &&
 		    apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
 			if (delivery_mode == APIC_DM_LOWEST)
 				set_bit(vcpu->vcpu_id, &lpr_map);
 			else
-				__apic_accept_irq(vcpu->apic, delivery_mode,
+				__apic_accept_irq(vcpu->arch.apic, delivery_mode,
 						  vector, level, trig_mode);
 		}
 	}
 
 	if (delivery_mode == APIC_DM_LOWEST) {
-		target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map);
+		target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
 		if (target != NULL)
-			__apic_accept_irq(target, delivery_mode,
+			__apic_accept_irq(target->arch.apic, delivery_mode,
 					  vector, level, trig_mode);
 	}
 }
@@ -544,6 +551,23 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
 	return tmcct;
 }
 
+static void __report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+	struct kvm_vcpu *vcpu = apic->vcpu;
+	struct kvm_run *run = vcpu->run;
+
+	set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+	kvm_x86_ops->cache_regs(vcpu);
+	run->tpr_access.rip = vcpu->arch.rip;
+	run->tpr_access.is_write = write;
+}
+
+static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+	if (apic->vcpu->arch.tpr_access_reporting)
+		__report_tpr_access(apic, write);
+}
+
 static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 {
 	u32 val = 0;
@@ -561,6 +585,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 		val = apic_get_tmcct(apic);
 		break;
 
+	case APIC_TASKPRI:
+		report_tpr_access(apic, false);
+		/* fall thru */
 	default:
 		apic_update_ppr(apic);
 		val = apic_get_reg(apic, offset);
@@ -670,6 +697,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
 		break;
 
 	case APIC_TASKPRI:
+		report_tpr_access(apic, true);
 		apic_set_tpr(apic, val & 0xff);
 		break;
 
@@ -762,19 +790,17 @@ static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
 	return ret;
 }
 
-void kvm_free_apic(struct kvm_lapic *apic)
+void kvm_free_lapic(struct kvm_vcpu *vcpu)
 {
-	if (!apic)
+	if (!vcpu->arch.apic)
 		return;
 
-	hrtimer_cancel(&apic->timer.dev);
+	hrtimer_cancel(&vcpu->arch.apic->timer.dev);
 
-	if (apic->regs_page) {
-		__free_page(apic->regs_page);
-		apic->regs_page = 0;
-	}
+	if (vcpu->arch.apic->regs_page)
+		__free_page(vcpu->arch.apic->regs_page);
 
-	kfree(apic);
+	kfree(vcpu->arch.apic);
 }
 
 /*
@@ -785,16 +811,17 @@ void kvm_free_apic(struct kvm_lapic *apic)
 
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (!apic)
 		return;
-	apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
+	apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
+		     | (apic_get_reg(apic, APIC_TASKPRI) & 4));
 }
 
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 	u64 tpr;
 
 	if (!apic)
@@ -807,29 +834,29 @@ EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
 
 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 {
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (!apic) {
 		value |= MSR_IA32_APICBASE_BSP;
-		vcpu->apic_base = value;
+		vcpu->arch.apic_base = value;
 		return;
 	}
 	if (apic->vcpu->vcpu_id)
 		value &= ~MSR_IA32_APICBASE_BSP;
 
-	vcpu->apic_base = value;
-	apic->base_address = apic->vcpu->apic_base &
+	vcpu->arch.apic_base = value;
+	apic->base_address = apic->vcpu->arch.apic_base &
 			     MSR_IA32_APICBASE_BASE;
 
 	/* with FSB delivery interrupt, we can restart APIC functionality */
 	apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
-		   "0x%lx.\n", apic->apic_base, apic->base_address);
+		   "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
 
 }
 
 u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
 {
-	return vcpu->apic_base;
+	return vcpu->arch.apic_base;
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
 
@@ -841,7 +868,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 	apic_debug("%s\n", __FUNCTION__);
 
 	ASSERT(vcpu);
-	apic = vcpu->apic;
+	apic = vcpu->arch.apic;
 	ASSERT(apic != NULL);
 
 	/* Stop the timer in case it's a reset to an active apic */
@@ -872,19 +899,19 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 	update_divide_count(apic);
 	atomic_set(&apic->timer.pending, 0);
 	if (vcpu->vcpu_id == 0)
-		vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
+		vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
 	apic_update_ppr(apic);
 
 	apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
 		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
 		   vcpu, kvm_apic_id(apic),
-		   vcpu->apic_base, apic->base_address);
+		   vcpu->arch.apic_base, apic->base_address);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_reset);
 
 int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 	int ret = 0;
 
 	if (!apic)
@@ -908,9 +935,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
 	wait_queue_head_t *q = &apic->vcpu->wq;
 
 	atomic_inc(&apic->timer.pending);
-	if (waitqueue_active(q))
-	{
-		apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+	if (waitqueue_active(q)) {
+		apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
 		wake_up_interruptible(q);
 	}
 	if (apic_lvtt_period(apic)) {
@@ -956,13 +982,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	if (!apic)
 		goto nomem;
 
-	vcpu->apic = apic;
+	vcpu->arch.apic = apic;
 
 	apic->regs_page = alloc_page(GFP_KERNEL);
 	if (apic->regs_page == NULL) {
 		printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
 		       vcpu->vcpu_id);
-		goto nomem;
+		goto nomem_free_apic;
 	}
 	apic->regs = page_address(apic->regs_page);
 	memset(apic->regs, 0, PAGE_SIZE);
@@ -971,7 +997,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	apic->timer.dev.function = apic_timer_fn;
 	apic->base_address = APIC_DEFAULT_PHYS_BASE;
-	vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
+	vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
 
 	kvm_lapic_reset(vcpu);
 	apic->dev.read = apic_mmio_read;
@@ -980,15 +1006,16 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	apic->dev.private = apic;
 
 	return 0;
+nomem_free_apic:
+	kfree(apic);
 nomem:
-	kvm_free_apic(apic);
 	return -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(kvm_create_lapic);
 
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 	int highest_irr;
 
 	if (!apic || !apic_enabled(apic))
@@ -1004,11 +1031,11 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 {
-	u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0);
+	u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
 	int r = 0;
 
 	if (vcpu->vcpu_id == 0) {
-		if (!apic_hw_enabled(vcpu->apic))
+		if (!apic_hw_enabled(vcpu->arch.apic))
 			r = 1;
 		if ((lvt0 & APIC_LVT_MASKED) == 0 &&
 		    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1019,7 +1046,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
 	    atomic_read(&apic->timer.pending) > 0) {
@@ -1030,7 +1057,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 
 void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 {
-	struct kvm_lapic *apic = vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
 		apic->timer.last_update = ktime_add_ns(
@@ -1041,7 +1068,7 @@ void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 {
 	int vector = kvm_apic_has_interrupt(vcpu);
-	struct kvm_lapic *apic = vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (vector == -1)
 		return -1;
@@ -1054,9 +1081,9 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 
 void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	apic->base_address = vcpu->apic_base &
+	apic->base_address = vcpu->arch.apic_base &
 			     MSR_IA32_APICBASE_BASE;
 	apic_set_reg(apic, APIC_LVR, APIC_VERSION);
 	apic_update_ppr(apic);
@@ -1065,9 +1092,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
 	start_apic_timer(apic);
 }
 
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 	struct hrtimer *timer;
 
 	if (!apic)
@@ -1077,4 +1104,51 @@ void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 	if (hrtimer_cancel(timer))
 		hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
 }
-EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
+
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
+{
+	u32 data;
+	void *vapic;
+
+	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+		return;
+
+	vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+	data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
+	kunmap_atomic(vapic, KM_USER0);
+
+	apic_set_tpr(vcpu->arch.apic, data & 0xff);
+}
+
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
+{
+	u32 data, tpr;
+	int max_irr, max_isr;
+	struct kvm_lapic *apic;
+	void *vapic;
+
+	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+		return;
+
+	apic = vcpu->arch.apic;
+	tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
+	max_irr = apic_find_highest_irr(apic);
+	if (max_irr < 0)
+		max_irr = 0;
+	max_isr = apic_find_highest_isr(apic);
+	if (max_isr < 0)
+		max_isr = 0;
+	data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
+
+	vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+	*(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
+	kunmap_atomic(vapic, KM_USER0);
+}
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+{
+	if (!irqchip_in_kernel(vcpu->kvm))
+		return;
+
+	vcpu->arch.apic->vapic_addr = vapic_addr;
+}
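The data word built in kvm_lapic_sync_to_vapic() above packs three APIC fields into a single 32-bit snapshot for the guest-visible virtual-APIC page. A standalone restatement of that packing (field layout copied from the diff; the helper name is ours):

#include <stdint.h>

/* byte 0: task priority (TPR); byte 1: high nibble of the highest
 * in-service vector; byte 3: highest pending (IRR) vector. */
static uint32_t pack_vapic_word(uint32_t tpr, int max_irr, int max_isr)
{
	if (max_irr < 0)	/* "no vector" is reported as -1 */
		max_irr = 0;
	if (max_isr < 0)
		max_isr = 0;
	return (tpr & 0xff) | (((uint32_t)max_isr & 0xf0) << 8)
			    | ((uint32_t)max_irr << 24);
}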
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
new file mode 100644
index 000000000000..676c396c9cee
--- /dev/null
+++ b/arch/x86/kvm/lapic.h
@@ -0,0 +1,50 @@
+#ifndef __KVM_X86_LAPIC_H
+#define __KVM_X86_LAPIC_H
+
+#include "iodev.h"
+
+#include <linux/kvm_host.h>
+
+struct kvm_lapic {
+	unsigned long base_address;
+	struct kvm_io_device dev;
+	struct {
+		atomic_t pending;
+		s64 period;	/* unit: ns */
+		u32 divide_count;
+		ktime_t last_update;
+		struct hrtimer dev;
+	} timer;
+	struct kvm_vcpu *vcpu;
+	struct page *regs_page;
+	void *regs;
+	gpa_t vapic_addr;
+	struct page *vapic_page;
+};
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
+void kvm_free_lapic(struct kvm_vcpu *vcpu);
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
+
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c new file mode 100644 index 000000000000..8efdcdbebb03 --- /dev/null +++ b/arch/x86/kvm/mmu.c | |||
| @@ -0,0 +1,1885 @@ | |||
| 1 | /* | ||
| 2 | * Kernel-based Virtual Machine driver for Linux | ||
| 3 | * | ||
| 4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
| 5 | * machines without emulation or binary translation. | ||
| 6 | * | ||
| 7 | * MMU support | ||
| 8 | * | ||
| 9 | * Copyright (C) 2006 Qumranet, Inc. | ||
| 10 | * | ||
| 11 | * Authors: | ||
| 12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
| 13 | * Avi Kivity <avi@qumranet.com> | ||
| 14 | * | ||
| 15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
| 16 | * the COPYING file in the top-level directory. | ||
| 17 | * | ||
| 18 | */ | ||
| 19 | |||
| 20 | #include "vmx.h" | ||
| 21 | #include "mmu.h" | ||
| 22 | |||
| 23 | #include <linux/kvm_host.h> | ||
| 24 | #include <linux/types.h> | ||
| 25 | #include <linux/string.h> | ||
| 26 | #include <linux/mm.h> | ||
| 27 | #include <linux/highmem.h> | ||
| 28 | #include <linux/module.h> | ||
| 29 | #include <linux/swap.h> | ||
| 30 | |||
| 31 | #include <asm/page.h> | ||
| 32 | #include <asm/cmpxchg.h> | ||
| 33 | #include <asm/io.h> | ||
| 34 | |||
| 35 | #undef MMU_DEBUG | ||
| 36 | |||
| 37 | #undef AUDIT | ||
| 38 | |||
| 39 | #ifdef AUDIT | ||
| 40 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); | ||
| 41 | #else | ||
| 42 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} | ||
| 43 | #endif | ||
| 44 | |||
| 45 | #ifdef MMU_DEBUG | ||
| 46 | |||
| 47 | #define pgprintk(x...) do { if (dbg) printk(x); } while (0) | ||
| 48 | #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) | ||
| 49 | |||
| 50 | #else | ||
| 51 | |||
| 52 | #define pgprintk(x...) do { } while (0) | ||
| 53 | #define rmap_printk(x...) do { } while (0) | ||
| 54 | |||
| 55 | #endif | ||
| 56 | |||
| 57 | #if defined(MMU_DEBUG) || defined(AUDIT) | ||
| 58 | static int dbg = 1; | ||
| 59 | #endif | ||
| 60 | |||
| 61 | #ifndef MMU_DEBUG | ||
| 62 | #define ASSERT(x) do { } while (0) | ||
| 63 | #else | ||
| 64 | #define ASSERT(x) \ | ||
| 65 | if (!(x)) { \ | ||
| 66 | printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ | ||
| 67 | __FILE__, __LINE__, #x); \ | ||
| 68 | } | ||
| 69 | #endif | ||
| 70 | |||
| 71 | #define PT64_PT_BITS 9 | ||
| 72 | #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) | ||
| 73 | #define PT32_PT_BITS 10 | ||
| 74 | #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) | ||
| 75 | |||
| 76 | #define PT_WRITABLE_SHIFT 1 | ||
| 77 | |||
| 78 | #define PT_PRESENT_MASK (1ULL << 0) | ||
| 79 | #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) | ||
| 80 | #define PT_USER_MASK (1ULL << 2) | ||
| 81 | #define PT_PWT_MASK (1ULL << 3) | ||
| 82 | #define PT_PCD_MASK (1ULL << 4) | ||
| 83 | #define PT_ACCESSED_MASK (1ULL << 5) | ||
| 84 | #define PT_DIRTY_MASK (1ULL << 6) | ||
| 85 | #define PT_PAGE_SIZE_MASK (1ULL << 7) | ||
| 86 | #define PT_PAT_MASK (1ULL << 7) | ||
| 87 | #define PT_GLOBAL_MASK (1ULL << 8) | ||
| 88 | #define PT64_NX_SHIFT 63 | ||
| 89 | #define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) | ||
| 90 | |||
| 91 | #define PT_PAT_SHIFT 7 | ||
| 92 | #define PT_DIR_PAT_SHIFT 12 | ||
| 93 | #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) | ||
| 94 | |||
| 95 | #define PT32_DIR_PSE36_SIZE 4 | ||
| 96 | #define PT32_DIR_PSE36_SHIFT 13 | ||
| 97 | #define PT32_DIR_PSE36_MASK \ | ||
| 98 | (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) | ||
| 99 | |||
| 100 | |||
| 101 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | ||
| 102 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | ||
| 103 | |||
| 104 | #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) | ||
| 105 | |||
| 106 | #define VALID_PAGE(x) ((x) != INVALID_PAGE) | ||
| 107 | |||
| 108 | #define PT64_LEVEL_BITS 9 | ||
| 109 | |||
| 110 | #define PT64_LEVEL_SHIFT(level) \ | ||
| 111 | (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) | ||
| 112 | |||
| 113 | #define PT64_LEVEL_MASK(level) \ | ||
| 114 | (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) | ||
| 115 | |||
| 116 | #define PT64_INDEX(address, level)\ | ||
| 117 | (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) | ||
| 118 | |||
| 119 | |||
| 120 | #define PT32_LEVEL_BITS 10 | ||
| 121 | |||
| 122 | #define PT32_LEVEL_SHIFT(level) \ | ||
| 123 | (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) | ||
| 124 | |||
| 125 | #define PT32_LEVEL_MASK(level) \ | ||
| 126 | (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) | ||
| 127 | |||
| 128 | #define PT32_INDEX(address, level)\ | ||
| 129 | (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) | ||
| 130 | |||
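The two macro families above decompose a virtual address into one table index per paging level: 9 index bits per level for 64-bit/PAE ptes, 10 for legacy 32-bit ptes. A standalone userspace sketch of the 64-bit decomposition, assuming 4KB pages (PAGE_SHIFT = 12); illustration only, not kernel code:

```c
/* Standalone illustration of the PT64_INDEX() decomposition above,
 * assuming 4KB pages (PAGE_SHIFT = 12); userspace only. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PT64_LEVEL_BITS	9
#define PT64_LEVEL_SHIFT(level) (PAGE_SHIFT + ((level) - 1) * PT64_LEVEL_BITS)
#define PT64_INDEX(address, level) \
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))

int main(void)
{
	uint64_t va = 0x00007f1234567000ULL;
	int level;

	/* Levels 4..1 are the PML4, PDPT, PD and PT indexes respectively. */
	for (level = 4; level >= 1; --level)
		printf("level %d index %llu\n", level,
		       (unsigned long long)PT64_INDEX(va, level));
	return 0;
}
```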
| 131 | |||
| 132 | #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) | ||
| 133 | #define PT64_DIR_BASE_ADDR_MASK \ | ||
| 134 | (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) | ||
| 135 | |||
| 136 | #define PT32_BASE_ADDR_MASK PAGE_MASK | ||
| 137 | #define PT32_DIR_BASE_ADDR_MASK \ | ||
| 138 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) | ||
| 139 | |||
| 140 | #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | ||
| 141 | | PT64_NX_MASK) | ||
| 142 | |||
| 143 | #define PFERR_PRESENT_MASK (1U << 0) | ||
| 144 | #define PFERR_WRITE_MASK (1U << 1) | ||
| 145 | #define PFERR_USER_MASK (1U << 2) | ||
| 146 | #define PFERR_FETCH_MASK (1U << 4) | ||
| 147 | |||
| 148 | #define PT64_ROOT_LEVEL 4 | ||
| 149 | #define PT32_ROOT_LEVEL 2 | ||
| 150 | #define PT32E_ROOT_LEVEL 3 | ||
| 151 | |||
| 152 | #define PT_DIRECTORY_LEVEL 2 | ||
| 153 | #define PT_PAGE_TABLE_LEVEL 1 | ||
| 154 | |||
| 155 | #define RMAP_EXT 4 | ||
| 156 | |||
| 157 | #define ACC_EXEC_MASK 1 | ||
| 158 | #define ACC_WRITE_MASK PT_WRITABLE_MASK | ||
| 159 | #define ACC_USER_MASK PT_USER_MASK | ||
| 160 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) | ||
| 161 | |||
| 162 | struct kvm_rmap_desc { | ||
| 163 | u64 *shadow_ptes[RMAP_EXT]; | ||
| 164 | struct kvm_rmap_desc *more; | ||
| 165 | }; | ||
| 166 | |||
| 167 | static struct kmem_cache *pte_chain_cache; | ||
| 168 | static struct kmem_cache *rmap_desc_cache; | ||
| 169 | static struct kmem_cache *mmu_page_header_cache; | ||
| 170 | |||
| 171 | static u64 __read_mostly shadow_trap_nonpresent_pte; | ||
| 172 | static u64 __read_mostly shadow_notrap_nonpresent_pte; | ||
| 173 | |||
| 174 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) | ||
| 175 | { | ||
| 176 | shadow_trap_nonpresent_pte = trap_pte; | ||
| 177 | shadow_notrap_nonpresent_pte = notrap_pte; | ||
| 178 | } | ||
| 179 | EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); | ||
| 180 | |||
| 181 | static int is_write_protection(struct kvm_vcpu *vcpu) | ||
| 182 | { | ||
| 183 | return vcpu->arch.cr0 & X86_CR0_WP; | ||
| 184 | } | ||
| 185 | |||
| 186 | static int is_cpuid_PSE36(void) | ||
| 187 | { | ||
| 188 | return 1; | ||
| 189 | } | ||
| 190 | |||
| 191 | static int is_nx(struct kvm_vcpu *vcpu) | ||
| 192 | { | ||
| 193 | return vcpu->arch.shadow_efer & EFER_NX; | ||
| 194 | } | ||
| 195 | |||
| 196 | static int is_present_pte(unsigned long pte) | ||
| 197 | { | ||
| 198 | return pte & PT_PRESENT_MASK; | ||
| 199 | } | ||
| 200 | |||
| 201 | static int is_shadow_present_pte(u64 pte) | ||
| 202 | { | ||
| 203 | pte &= ~PT_SHADOW_IO_MARK; | ||
| 204 | return pte != shadow_trap_nonpresent_pte | ||
| 205 | && pte != shadow_notrap_nonpresent_pte; | ||
| 206 | } | ||
| 207 | |||
| 208 | static int is_writeble_pte(unsigned long pte) | ||
| 209 | { | ||
| 210 | return pte & PT_WRITABLE_MASK; | ||
| 211 | } | ||
| 212 | |||
| 213 | static int is_dirty_pte(unsigned long pte) | ||
| 214 | { | ||
| 215 | return pte & PT_DIRTY_MASK; | ||
| 216 | } | ||
| 217 | |||
| 218 | static int is_io_pte(unsigned long pte) | ||
| 219 | { | ||
| 220 | return pte & PT_SHADOW_IO_MARK; | ||
| 221 | } | ||
| 222 | |||
| 223 | static int is_rmap_pte(u64 pte) | ||
| 224 | { | ||
| 225 | return pte != shadow_trap_nonpresent_pte | ||
| 226 | && pte != shadow_notrap_nonpresent_pte; | ||
| 227 | } | ||
| 228 | |||
| 229 | static gfn_t pse36_gfn_delta(u32 gpte) | ||
| 230 | { | ||
| 231 | int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; | ||
| 232 | |||
| 233 | return (gpte & PT32_DIR_PSE36_MASK) << shift; | ||
| 234 | } | ||
| 235 | |||
| 236 | static void set_shadow_pte(u64 *sptep, u64 spte) | ||
| 237 | { | ||
| 238 | #ifdef CONFIG_X86_64 | ||
| 239 | set_64bit((unsigned long *)sptep, spte); | ||
| 240 | #else | ||
| 241 | set_64bit((unsigned long long *)sptep, spte); | ||
| 242 | #endif | ||
| 243 | } | ||
| 244 | |||
| 245 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | ||
| 246 | struct kmem_cache *base_cache, int min) | ||
| 247 | { | ||
| 248 | void *obj; | ||
| 249 | |||
| 250 | if (cache->nobjs >= min) | ||
| 251 | return 0; | ||
| 252 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | ||
| 253 | obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); | ||
| 254 | if (!obj) | ||
| 255 | return -ENOMEM; | ||
| 256 | cache->objects[cache->nobjs++] = obj; | ||
| 257 | } | ||
| 258 | return 0; | ||
| 259 | } | ||
| 260 | |||
| 261 | static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) | ||
| 262 | { | ||
| 263 | while (mc->nobjs) | ||
| 264 | kfree(mc->objects[--mc->nobjs]); | ||
| 265 | } | ||
| 266 | |||
| 267 | static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, | ||
| 268 | int min) | ||
| 269 | { | ||
| 270 | struct page *page; | ||
| 271 | |||
| 272 | if (cache->nobjs >= min) | ||
| 273 | return 0; | ||
| 274 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | ||
| 275 | page = alloc_page(GFP_KERNEL); | ||
| 276 | if (!page) | ||
| 277 | return -ENOMEM; | ||
| 278 | set_page_private(page, 0); | ||
| 279 | cache->objects[cache->nobjs++] = page_address(page); | ||
| 280 | } | ||
| 281 | return 0; | ||
| 282 | } | ||
| 283 | |||
| 284 | static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) | ||
| 285 | { | ||
| 286 | while (mc->nobjs) | ||
| 287 | free_page((unsigned long)mc->objects[--mc->nobjs]); | ||
| 288 | } | ||
| 289 | |||
| 290 | static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) | ||
| 291 | { | ||
| 292 | int r; | ||
| 293 | |||
| 294 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, | ||
| 295 | pte_chain_cache, 4); | ||
| 296 | if (r) | ||
| 297 | goto out; | ||
| 298 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, | ||
| 299 | rmap_desc_cache, 1); | ||
| 300 | if (r) | ||
| 301 | goto out; | ||
| 302 | r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); | ||
| 303 | if (r) | ||
| 304 | goto out; | ||
| 305 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, | ||
| 306 | mmu_page_header_cache, 4); | ||
| 307 | out: | ||
| 308 | return r; | ||
| 309 | } | ||
| 310 | |||
| 311 | static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) | ||
| 312 | { | ||
| 313 | mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); | ||
| 314 | mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); | ||
| 315 | mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); | ||
| 316 | mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); | ||
| 317 | } | ||
| 318 | |||
| 319 | static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, | ||
| 320 | size_t size) | ||
| 321 | { | ||
| 322 | void *p; | ||
| 323 | |||
| 324 | BUG_ON(!mc->nobjs); | ||
| 325 | p = mc->objects[--mc->nobjs]; | ||
| 326 | memset(p, 0, size); | ||
| 327 | return p; | ||
| 328 | } | ||
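mmu_topup_memory_cache() and mmu_memory_cache_alloc() split allocation from use: the caches are filled with GFP_KERNEL (which may sleep) before mmu_lock is taken, and the fault paths only pop pre-filled objects, so nothing allocates under the spinlock. A minimal userspace model of the pattern, with invented names and sizes; the kernel variants above are the real thing:

```c
/* Userspace model of topup-then-consume: fill before taking the lock
 * (allocation may sleep), pop under the lock (cannot fail). */
#include <assert.h>
#include <stdlib.h>

#define CACHE_SIZE 8			/* stand-in for the kernel array size */

struct memory_cache {
	int nobjs;
	void *objects[CACHE_SIZE];
};

static int cache_topup(struct memory_cache *mc, int min)	/* may "sleep" */
{
	if (mc->nobjs >= min)
		return 0;
	while (mc->nobjs < CACHE_SIZE) {
		void *obj = calloc(1, 64);
		if (!obj)
			return -1;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

static void *cache_pop(struct memory_cache *mc)	/* "lock held": no malloc */
{
	assert(mc->nobjs > 0);
	return mc->objects[--mc->nobjs];
}

int main(void)
{
	struct memory_cache mc = { 0 };

	if (cache_topup(&mc, 4) == 0)	/* before spin_lock() */
		free(cache_pop(&mc));	/* under the lock, cannot fail */
	while (mc.nobjs)
		free(mc.objects[--mc.nobjs]);
	return 0;
}
```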
| 329 | |||
| 330 | static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) | ||
| 331 | { | ||
| 332 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, | ||
| 333 | sizeof(struct kvm_pte_chain)); | ||
| 334 | } | ||
| 335 | |||
| 336 | static void mmu_free_pte_chain(struct kvm_pte_chain *pc) | ||
| 337 | { | ||
| 338 | kfree(pc); | ||
| 339 | } | ||
| 340 | |||
| 341 | static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) | ||
| 342 | { | ||
| 343 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, | ||
| 344 | sizeof(struct kvm_rmap_desc)); | ||
| 345 | } | ||
| 346 | |||
| 347 | static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) | ||
| 348 | { | ||
| 349 | kfree(rd); | ||
| 350 | } | ||
| 351 | |||
| 352 | /* | ||
| 353 | * Take gfn and return the reverse mapping to it. | ||
| 354 | * Note: gfn must be unaliased before this function gets called | ||
| 355 | */ | ||
| 356 | |||
| 357 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) | ||
| 358 | { | ||
| 359 | struct kvm_memory_slot *slot; | ||
| 360 | |||
| 361 | slot = gfn_to_memslot(kvm, gfn); | ||
| 362 | return &slot->rmap[gfn - slot->base_gfn]; | ||
| 363 | } | ||
| 364 | |||
| 365 | /* | ||
| 366 | * Reverse mapping data structures: | ||
| 367 | * | ||
| 368 | * If rmapp bit zero is zero, then rmapp points to the shadow page table entry | ||
| 369 | * that points to page_address(page). | ||
| 370 | * | ||
| 371 | * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc | ||
| 372 | * containing more mappings. | ||
| 373 | */ | ||
| 374 | static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | ||
| 375 | { | ||
| 376 | struct kvm_mmu_page *sp; | ||
| 377 | struct kvm_rmap_desc *desc; | ||
| 378 | unsigned long *rmapp; | ||
| 379 | int i; | ||
| 380 | |||
| 381 | if (!is_rmap_pte(*spte)) | ||
| 382 | return; | ||
| 383 | gfn = unalias_gfn(vcpu->kvm, gfn); | ||
| 384 | sp = page_header(__pa(spte)); | ||
| 385 | sp->gfns[spte - sp->spt] = gfn; | ||
| 386 | rmapp = gfn_to_rmap(vcpu->kvm, gfn); | ||
| 387 | if (!*rmapp) { | ||
| 388 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); | ||
| 389 | *rmapp = (unsigned long)spte; | ||
| 390 | } else if (!(*rmapp & 1)) { | ||
| 391 | rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); | ||
| 392 | desc = mmu_alloc_rmap_desc(vcpu); | ||
| 393 | desc->shadow_ptes[0] = (u64 *)*rmapp; | ||
| 394 | desc->shadow_ptes[1] = spte; | ||
| 395 | *rmapp = (unsigned long)desc | 1; | ||
| 396 | } else { | ||
| 397 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | ||
| 398 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
| 399 | while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) | ||
| 400 | desc = desc->more; | ||
| 401 | if (desc->shadow_ptes[RMAP_EXT-1]) { | ||
| 402 | desc->more = mmu_alloc_rmap_desc(vcpu); | ||
| 403 | desc = desc->more; | ||
| 404 | } | ||
| 405 | for (i = 0; desc->shadow_ptes[i]; ++i) | ||
| 406 | ; | ||
| 407 | desc->shadow_ptes[i] = spte; | ||
| 408 | } | ||
| 409 | } | ||
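The tagged-pointer encoding described in the comment above rmap_add() can be exercised in isolation: bit zero of the rmap word says whether it holds a lone spte pointer or a (tagged) pointer to a descriptor chain. A userspace sketch whose types mirror the kernel's but which is not kernel code:

```c
/* Userspace sketch of the rmap word encoding: bit 0 clear = the word is
 * a single spte pointer; bit 0 set = (word & ~1) is a descriptor chain. */
#include <stdio.h>
#include <stdint.h>

#define RMAP_EXT 4

struct rmap_desc {
	uint64_t *shadow_ptes[RMAP_EXT];
	struct rmap_desc *more;
};

static void rmap_walk(unsigned long rmapp)
{
	struct rmap_desc *d;
	int i;

	if (!rmapp)
		return;				/* no mappings */
	if (!(rmapp & 1)) {			/* exactly one mapping */
		printf("spte %p\n", (void *)rmapp);
		return;
	}
	/* many mappings: strip the tag bit to recover the descriptor */
	for (d = (struct rmap_desc *)(rmapp & ~1ul); d; d = d->more)
		for (i = 0; i < RMAP_EXT && d->shadow_ptes[i]; ++i)
			printf("spte %p\n", (void *)d->shadow_ptes[i]);
}

int main(void)
{
	uint64_t spte = 0;
	struct rmap_desc d = { .shadow_ptes = { &spte, &spte } };

	rmap_walk((unsigned long)&spte);	/* single-entry form */
	rmap_walk((unsigned long)&d | 1);	/* descriptor form */
	return 0;
}
```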
| 410 | |||
| 411 | static void rmap_desc_remove_entry(unsigned long *rmapp, | ||
| 412 | struct kvm_rmap_desc *desc, | ||
| 413 | int i, | ||
| 414 | struct kvm_rmap_desc *prev_desc) | ||
| 415 | { | ||
| 416 | int j; | ||
| 417 | |||
| 418 | for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) | ||
| 419 | ; | ||
| 420 | desc->shadow_ptes[i] = desc->shadow_ptes[j]; | ||
| 421 | desc->shadow_ptes[j] = NULL; | ||
| 422 | if (j != 0) | ||
| 423 | return; | ||
| 424 | if (!prev_desc && !desc->more) | ||
| 425 | *rmapp = (unsigned long)desc->shadow_ptes[0]; | ||
| 426 | else | ||
| 427 | if (prev_desc) | ||
| 428 | prev_desc->more = desc->more; | ||
| 429 | else | ||
| 430 | *rmapp = (unsigned long)desc->more | 1; | ||
| 431 | mmu_free_rmap_desc(desc); | ||
| 432 | } | ||
| 433 | |||
| 434 | static void rmap_remove(struct kvm *kvm, u64 *spte) | ||
| 435 | { | ||
| 436 | struct kvm_rmap_desc *desc; | ||
| 437 | struct kvm_rmap_desc *prev_desc; | ||
| 438 | struct kvm_mmu_page *sp; | ||
| 439 | struct page *page; | ||
| 440 | unsigned long *rmapp; | ||
| 441 | int i; | ||
| 442 | |||
| 443 | if (!is_rmap_pte(*spte)) | ||
| 444 | return; | ||
| 445 | sp = page_header(__pa(spte)); | ||
| 446 | page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); | ||
| 447 | mark_page_accessed(page); | ||
| 448 | if (is_writeble_pte(*spte)) | ||
| 449 | kvm_release_page_dirty(page); | ||
| 450 | else | ||
| 451 | kvm_release_page_clean(page); | ||
| 452 | rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]); | ||
| 453 | if (!*rmapp) { | ||
| 454 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); | ||
| 455 | BUG(); | ||
| 456 | } else if (!(*rmapp & 1)) { | ||
| 457 | rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); | ||
| 458 | if ((u64 *)*rmapp != spte) { | ||
| 459 | printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", | ||
| 460 | spte, *spte); | ||
| 461 | BUG(); | ||
| 462 | } | ||
| 463 | *rmapp = 0; | ||
| 464 | } else { | ||
| 465 | rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); | ||
| 466 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
| 467 | prev_desc = NULL; | ||
| 468 | while (desc) { | ||
| 469 | for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) | ||
| 470 | if (desc->shadow_ptes[i] == spte) { | ||
| 471 | rmap_desc_remove_entry(rmapp, | ||
| 472 | desc, i, | ||
| 473 | prev_desc); | ||
| 474 | return; | ||
| 475 | } | ||
| 476 | prev_desc = desc; | ||
| 477 | desc = desc->more; | ||
| 478 | } | ||
| 479 | BUG(); | ||
| 480 | } | ||
| 481 | } | ||
| 482 | |||
| 483 | static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) | ||
| 484 | { | ||
| 485 | struct kvm_rmap_desc *desc; | ||
| 486 | struct kvm_rmap_desc *prev_desc; | ||
| 487 | u64 *prev_spte; | ||
| 488 | int i; | ||
| 489 | |||
| 490 | if (!*rmapp) | ||
| 491 | return NULL; | ||
| 492 | else if (!(*rmapp & 1)) { | ||
| 493 | if (!spte) | ||
| 494 | return (u64 *)*rmapp; | ||
| 495 | return NULL; | ||
| 496 | } | ||
| 497 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
| 498 | prev_desc = NULL; | ||
| 499 | prev_spte = NULL; | ||
| 500 | while (desc) { | ||
| 501 | for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { | ||
| 502 | if (prev_spte == spte) | ||
| 503 | return desc->shadow_ptes[i]; | ||
| 504 | prev_spte = desc->shadow_ptes[i]; | ||
| 505 | } | ||
| 506 | desc = desc->more; | ||
| 507 | } | ||
| 508 | return NULL; | ||
| 509 | } | ||
| 510 | |||
| 511 | static void rmap_write_protect(struct kvm *kvm, u64 gfn) | ||
| 512 | { | ||
| 513 | unsigned long *rmapp; | ||
| 514 | u64 *spte; | ||
| 515 | int write_protected = 0; | ||
| 516 | |||
| 517 | gfn = unalias_gfn(kvm, gfn); | ||
| 518 | rmapp = gfn_to_rmap(kvm, gfn); | ||
| 519 | |||
| 520 | spte = rmap_next(kvm, rmapp, NULL); | ||
| 521 | while (spte) { | ||
| 522 | BUG_ON(!spte); | ||
| 523 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | ||
| 524 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); | ||
| 525 | if (is_writeble_pte(*spte)) { | ||
| 526 | set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); | ||
| 527 | write_protected = 1; | ||
| 528 | } | ||
| 529 | spte = rmap_next(kvm, rmapp, spte); | ||
| 530 | } | ||
| 531 | if (write_protected) | ||
| 532 | kvm_flush_remote_tlbs(kvm); | ||
| 533 | } | ||
| 534 | |||
| 535 | #ifdef MMU_DEBUG | ||
| 536 | static int is_empty_shadow_page(u64 *spt) | ||
| 537 | { | ||
| 538 | u64 *pos; | ||
| 539 | u64 *end; | ||
| 540 | |||
| 541 | for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) | ||
| 542 | if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { | ||
| 543 | printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, | ||
| 544 | pos, *pos); | ||
| 545 | return 0; | ||
| 546 | } | ||
| 547 | return 1; | ||
| 548 | } | ||
| 549 | #endif | ||
| 550 | |||
| 551 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
| 552 | { | ||
| 553 | ASSERT(is_empty_shadow_page(sp->spt)); | ||
| 554 | list_del(&sp->link); | ||
| 555 | __free_page(virt_to_page(sp->spt)); | ||
| 556 | __free_page(virt_to_page(sp->gfns)); | ||
| 557 | kfree(sp); | ||
| 558 | ++kvm->arch.n_free_mmu_pages; | ||
| 559 | } | ||
| 560 | |||
| 561 | static unsigned kvm_page_table_hashfn(gfn_t gfn) | ||
| 562 | { | ||
| 563 | return gfn; | ||
| 564 | } | ||
| 565 | |||
| 566 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | ||
| 567 | u64 *parent_pte) | ||
| 568 | { | ||
| 569 | struct kvm_mmu_page *sp; | ||
| 570 | |||
| 571 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); | ||
| 572 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
| 573 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
| 574 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | ||
| 575 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | ||
| 576 | ASSERT(is_empty_shadow_page(sp->spt)); | ||
| 577 | sp->slot_bitmap = 0; | ||
| 578 | sp->multimapped = 0; | ||
| 579 | sp->parent_pte = parent_pte; | ||
| 580 | --vcpu->kvm->arch.n_free_mmu_pages; | ||
| 581 | return sp; | ||
| 582 | } | ||
| 583 | |||
| 584 | static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, | ||
| 585 | struct kvm_mmu_page *sp, u64 *parent_pte) | ||
| 586 | { | ||
| 587 | struct kvm_pte_chain *pte_chain; | ||
| 588 | struct hlist_node *node; | ||
| 589 | int i; | ||
| 590 | |||
| 591 | if (!parent_pte) | ||
| 592 | return; | ||
| 593 | if (!sp->multimapped) { | ||
| 594 | u64 *old = sp->parent_pte; | ||
| 595 | |||
| 596 | if (!old) { | ||
| 597 | sp->parent_pte = parent_pte; | ||
| 598 | return; | ||
| 599 | } | ||
| 600 | sp->multimapped = 1; | ||
| 601 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
| 602 | INIT_HLIST_HEAD(&sp->parent_ptes); | ||
| 603 | hlist_add_head(&pte_chain->link, &sp->parent_ptes); | ||
| 604 | pte_chain->parent_ptes[0] = old; | ||
| 605 | } | ||
| 606 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { | ||
| 607 | if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) | ||
| 608 | continue; | ||
| 609 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) | ||
| 610 | if (!pte_chain->parent_ptes[i]) { | ||
| 611 | pte_chain->parent_ptes[i] = parent_pte; | ||
| 612 | return; | ||
| 613 | } | ||
| 614 | } | ||
| 615 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
| 616 | BUG_ON(!pte_chain); | ||
| 617 | hlist_add_head(&pte_chain->link, &sp->parent_ptes); | ||
| 618 | pte_chain->parent_ptes[0] = parent_pte; | ||
| 619 | } | ||
| 620 | |||
| 621 | static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, | ||
| 622 | u64 *parent_pte) | ||
| 623 | { | ||
| 624 | struct kvm_pte_chain *pte_chain; | ||
| 625 | struct hlist_node *node; | ||
| 626 | int i; | ||
| 627 | |||
| 628 | if (!sp->multimapped) { | ||
| 629 | BUG_ON(sp->parent_pte != parent_pte); | ||
| 630 | sp->parent_pte = NULL; | ||
| 631 | return; | ||
| 632 | } | ||
| 633 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
| 634 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
| 635 | if (!pte_chain->parent_ptes[i]) | ||
| 636 | break; | ||
| 637 | if (pte_chain->parent_ptes[i] != parent_pte) | ||
| 638 | continue; | ||
| 639 | while (i + 1 < NR_PTE_CHAIN_ENTRIES | ||
| 640 | && pte_chain->parent_ptes[i + 1]) { | ||
| 641 | pte_chain->parent_ptes[i] | ||
| 642 | = pte_chain->parent_ptes[i + 1]; | ||
| 643 | ++i; | ||
| 644 | } | ||
| 645 | pte_chain->parent_ptes[i] = NULL; | ||
| 646 | if (i == 0) { | ||
| 647 | hlist_del(&pte_chain->link); | ||
| 648 | mmu_free_pte_chain(pte_chain); | ||
| 649 | if (hlist_empty(&sp->parent_ptes)) { | ||
| 650 | sp->multimapped = 0; | ||
| 651 | sp->parent_pte = NULL; | ||
| 652 | } | ||
| 653 | } | ||
| 654 | return; | ||
| 655 | } | ||
| 656 | BUG(); | ||
| 657 | } | ||
| 658 | |||
| 659 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | ||
| 660 | { | ||
| 661 | unsigned index; | ||
| 662 | struct hlist_head *bucket; | ||
| 663 | struct kvm_mmu_page *sp; | ||
| 664 | struct hlist_node *node; | ||
| 665 | |||
| 666 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
| 667 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
| 668 | bucket = &kvm->arch.mmu_page_hash[index]; | ||
| 669 | hlist_for_each_entry(sp, node, bucket, hash_link) | ||
| 670 | if (sp->gfn == gfn && !sp->role.metaphysical) { | ||
| 671 | pgprintk("%s: found role %x\n", | ||
| 672 | __FUNCTION__, sp->role.word); | ||
| 673 | return sp; | ||
| 674 | } | ||
| 675 | return NULL; | ||
| 676 | } | ||
| 677 | |||
| 678 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | ||
| 679 | gfn_t gfn, | ||
| 680 | gva_t gaddr, | ||
| 681 | unsigned level, | ||
| 682 | int metaphysical, | ||
| 683 | unsigned access, | ||
| 684 | u64 *parent_pte, | ||
| 685 | bool *new_page) | ||
| 686 | { | ||
| 687 | union kvm_mmu_page_role role; | ||
| 688 | unsigned index; | ||
| 689 | unsigned quadrant; | ||
| 690 | struct hlist_head *bucket; | ||
| 691 | struct kvm_mmu_page *sp; | ||
| 692 | struct hlist_node *node; | ||
| 693 | |||
| 694 | role.word = 0; | ||
| 695 | role.glevels = vcpu->arch.mmu.root_level; | ||
| 696 | role.level = level; | ||
| 697 | role.metaphysical = metaphysical; | ||
| 698 | role.access = access; | ||
| 699 | if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { | ||
| 700 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); | ||
| 701 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | ||
| 702 | role.quadrant = quadrant; | ||
| 703 | } | ||
| 704 | pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, | ||
| 705 | gfn, role.word); | ||
| 706 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
| 707 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | ||
| 708 | hlist_for_each_entry(sp, node, bucket, hash_link) | ||
| 709 | if (sp->gfn == gfn && sp->role.word == role.word) { | ||
| 710 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | ||
| 711 | pgprintk("%s: found\n", __FUNCTION__); | ||
| 712 | return sp; | ||
| 713 | } | ||
| 714 | ++vcpu->kvm->stat.mmu_cache_miss; | ||
| 715 | sp = kvm_mmu_alloc_page(vcpu, parent_pte); | ||
| 716 | if (!sp) | ||
| 717 | return sp; | ||
| 718 | pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); | ||
| 719 | sp->gfn = gfn; | ||
| 720 | sp->role = role; | ||
| 721 | hlist_add_head(&sp->hash_link, bucket); | ||
| 722 | vcpu->arch.mmu.prefetch_page(vcpu, sp); | ||
| 723 | if (!metaphysical) | ||
| 724 | rmap_write_protect(vcpu->kvm, gfn); | ||
| 725 | if (new_page) | ||
| 726 | *new_page = 1; | ||
| 727 | return sp; | ||
| 728 | } | ||
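The quadrant computation in kvm_mmu_get_page() accounts for the size mismatch when a 32-bit guest runs on 64-bit shadow pages: a 1024-entry guest table spans more address space than a 512-entry shadow table, so one guest table is backed by several shadow pages and role.quadrant records which slice. A standalone sketch of just that arithmetic, assuming 4KB pages:

```c
/* The quadrant arithmetic from kvm_mmu_get_page(), stand-alone. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT   12
#define PT64_PT_BITS 9
#define PT32_PT_BITS 10

static unsigned quadrant(uint32_t gaddr, int level)
{
	unsigned q = gaddr >> (PAGE_SHIFT + PT64_PT_BITS * level);
	return q & ((1u << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1);
}

int main(void)
{
	/* Level 1: a guest page table reaches 4MB but a shadow table only
	 * 2MB, so bit 21 picks one of 2 shadow pages per guest table.
	 * Level 2: the guest directory reaches 4GB vs 1GB shadowed, so
	 * bits 31:30 pick one of 4. */
	printf("level 1: %u %u\n",
	       quadrant(0x00000000, 1), quadrant(0x00200000, 1));
	printf("level 2: %u %u\n",
	       quadrant(0x00000000, 2), quadrant(0xc0000000, 2));
	return 0;
}
```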
| 729 | |||
| 730 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | ||
| 731 | struct kvm_mmu_page *sp) | ||
| 732 | { | ||
| 733 | unsigned i; | ||
| 734 | u64 *pt; | ||
| 735 | u64 ent; | ||
| 736 | |||
| 737 | pt = sp->spt; | ||
| 738 | |||
| 739 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) { | ||
| 740 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
| 741 | if (is_shadow_present_pte(pt[i])) | ||
| 742 | rmap_remove(kvm, &pt[i]); | ||
| 743 | pt[i] = shadow_trap_nonpresent_pte; | ||
| 744 | } | ||
| 745 | kvm_flush_remote_tlbs(kvm); | ||
| 746 | return; | ||
| 747 | } | ||
| 748 | |||
| 749 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
| 750 | ent = pt[i]; | ||
| 751 | |||
| 752 | pt[i] = shadow_trap_nonpresent_pte; | ||
| 753 | if (!is_shadow_present_pte(ent)) | ||
| 754 | continue; | ||
| 755 | ent &= PT64_BASE_ADDR_MASK; | ||
| 756 | mmu_page_remove_parent_pte(page_header(ent), &pt[i]); | ||
| 757 | } | ||
| 758 | kvm_flush_remote_tlbs(kvm); | ||
| 759 | } | ||
| 760 | |||
| 761 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) | ||
| 762 | { | ||
| 763 | mmu_page_remove_parent_pte(sp, parent_pte); | ||
| 764 | } | ||
| 765 | |||
| 766 | static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) | ||
| 767 | { | ||
| 768 | int i; | ||
| 769 | |||
| 770 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | ||
| 771 | if (kvm->vcpus[i]) | ||
| 772 | kvm->vcpus[i]->arch.last_pte_updated = NULL; | ||
| 773 | } | ||
| 774 | |||
| 775 | static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
| 776 | { | ||
| 777 | u64 *parent_pte; | ||
| 778 | |||
| 779 | ++kvm->stat.mmu_shadow_zapped; | ||
| 780 | while (sp->multimapped || sp->parent_pte) { | ||
| 781 | if (!sp->multimapped) | ||
| 782 | parent_pte = sp->parent_pte; | ||
| 783 | else { | ||
| 784 | struct kvm_pte_chain *chain; | ||
| 785 | |||
| 786 | chain = container_of(sp->parent_ptes.first, | ||
| 787 | struct kvm_pte_chain, link); | ||
| 788 | parent_pte = chain->parent_ptes[0]; | ||
| 789 | } | ||
| 790 | BUG_ON(!parent_pte); | ||
| 791 | kvm_mmu_put_page(sp, parent_pte); | ||
| 792 | set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); | ||
| 793 | } | ||
| 794 | kvm_mmu_page_unlink_children(kvm, sp); | ||
| 795 | if (!sp->root_count) { | ||
| 796 | hlist_del(&sp->hash_link); | ||
| 797 | kvm_mmu_free_page(kvm, sp); | ||
| 798 | } else | ||
| 799 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | ||
| 800 | kvm_mmu_reset_last_pte_updated(kvm); | ||
| 801 | } | ||
| 802 | |||
| 803 | /* | ||
| 804 | * Changing the number of mmu pages allocated to the vm | ||
| 805 | * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock | ||
| 806 | */ | ||
| 807 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) | ||
| 808 | { | ||
| 809 | /* | ||
| 810 | * If we set the number of mmu pages to be smaller than the | ||
| 811 | * number of active pages, we must free some mmu pages before we | ||
| 812 | * can change the value | ||
| 813 | */ | ||
| 814 | |||
| 815 | if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > | ||
| 816 | kvm_nr_mmu_pages) { | ||
| 817 | int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages | ||
| 818 | - kvm->arch.n_free_mmu_pages; | ||
| 819 | |||
| 820 | while (n_used_mmu_pages > kvm_nr_mmu_pages) { | ||
| 821 | struct kvm_mmu_page *page; | ||
| 822 | |||
| 823 | page = container_of(kvm->arch.active_mmu_pages.prev, | ||
| 824 | struct kvm_mmu_page, link); | ||
| 825 | kvm_mmu_zap_page(kvm, page); | ||
| 826 | n_used_mmu_pages--; | ||
| 827 | } | ||
| 828 | kvm->arch.n_free_mmu_pages = 0; | ||
| 829 | } | ||
| 830 | else | ||
| 831 | kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages | ||
| 832 | - kvm->arch.n_alloc_mmu_pages; | ||
| 833 | |||
| 834 | kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; | ||
| 835 | } | ||
| 836 | |||
| 837 | static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | ||
| 838 | { | ||
| 839 | unsigned index; | ||
| 840 | struct hlist_head *bucket; | ||
| 841 | struct kvm_mmu_page *sp; | ||
| 842 | struct hlist_node *node, *n; | ||
| 843 | int r; | ||
| 844 | |||
| 845 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
| 846 | r = 0; | ||
| 847 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
| 848 | bucket = &kvm->arch.mmu_page_hash[index]; | ||
| 849 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) | ||
| 850 | if (sp->gfn == gfn && !sp->role.metaphysical) { | ||
| 851 | pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, | ||
| 852 | sp->role.word); | ||
| 853 | kvm_mmu_zap_page(kvm, sp); | ||
| 854 | r = 1; | ||
| 855 | } | ||
| 856 | return r; | ||
| 857 | } | ||
| 858 | |||
| 859 | static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) | ||
| 860 | { | ||
| 861 | struct kvm_mmu_page *sp; | ||
| 862 | |||
| 863 | while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { | ||
| 864 | pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word); | ||
| 865 | kvm_mmu_zap_page(kvm, sp); | ||
| 866 | } | ||
| 867 | } | ||
| 868 | |||
| 869 | static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | ||
| 870 | { | ||
| 871 | int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); | ||
| 872 | struct kvm_mmu_page *sp = page_header(__pa(pte)); | ||
| 873 | |||
| 874 | __set_bit(slot, &sp->slot_bitmap); | ||
| 875 | } | ||
| 876 | |||
| 877 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | ||
| 878 | { | ||
| 879 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); | ||
| 880 | |||
| 881 | if (gpa == UNMAPPED_GVA) | ||
| 882 | return NULL; | ||
| 883 | return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
| 884 | } | ||
| 885 | |||
| 886 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | ||
| 887 | unsigned pt_access, unsigned pte_access, | ||
| 888 | int user_fault, int write_fault, int dirty, | ||
| 889 | int *ptwrite, gfn_t gfn, struct page *page) | ||
| 890 | { | ||
| 891 | u64 spte; | ||
| 892 | int was_rmapped = is_rmap_pte(*shadow_pte); | ||
| 893 | int was_writeble = is_writeble_pte(*shadow_pte); | ||
| 894 | |||
| 895 | pgprintk("%s: spte %llx access %x write_fault %d" | ||
| 896 | " user_fault %d gfn %lx\n", | ||
| 897 | __FUNCTION__, *shadow_pte, pt_access, | ||
| 898 | write_fault, user_fault, gfn); | ||
| 899 | |||
| 900 | /* | ||
| 901 | * We don't set the accessed bit, since we sometimes want to see | ||
| 902 | * whether the guest actually used the pte (in order to detect | ||
| 903 | * demand paging). | ||
| 904 | */ | ||
| 905 | spte = PT_PRESENT_MASK | PT_DIRTY_MASK; | ||
| 906 | if (!dirty) | ||
| 907 | pte_access &= ~ACC_WRITE_MASK; | ||
| 908 | if (!(pte_access & ACC_EXEC_MASK)) | ||
| 909 | spte |= PT64_NX_MASK; | ||
| 910 | |||
| 911 | spte |= PT_PRESENT_MASK; | ||
| 912 | if (pte_access & ACC_USER_MASK) | ||
| 913 | spte |= PT_USER_MASK; | ||
| 914 | |||
| 915 | if (is_error_page(page)) { | ||
| 916 | set_shadow_pte(shadow_pte, | ||
| 917 | shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); | ||
| 918 | kvm_release_page_clean(page); | ||
| 919 | return; | ||
| 920 | } | ||
| 921 | |||
| 922 | spte |= page_to_phys(page); | ||
| 923 | |||
| 924 | if ((pte_access & ACC_WRITE_MASK) | ||
| 925 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { | ||
| 926 | struct kvm_mmu_page *shadow; | ||
| 927 | |||
| 928 | spte |= PT_WRITABLE_MASK; | ||
| 929 | if (user_fault) { | ||
| 930 | mmu_unshadow(vcpu->kvm, gfn); | ||
| 931 | goto unshadowed; | ||
| 932 | } | ||
| 933 | |||
| 934 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); | ||
| 935 | if (shadow) { | ||
| 936 | pgprintk("%s: found shadow page for %lx, marking ro\n", | ||
| 937 | __FUNCTION__, gfn); | ||
| 938 | pte_access &= ~ACC_WRITE_MASK; | ||
| 939 | if (is_writeble_pte(spte)) { | ||
| 940 | spte &= ~PT_WRITABLE_MASK; | ||
| 941 | kvm_x86_ops->tlb_flush(vcpu); | ||
| 942 | } | ||
| 943 | if (write_fault) | ||
| 944 | *ptwrite = 1; | ||
| 945 | } | ||
| 946 | } | ||
| 947 | |||
| 948 | unshadowed: | ||
| 949 | |||
| 950 | if (pte_access & ACC_WRITE_MASK) | ||
| 951 | mark_page_dirty(vcpu->kvm, gfn); | ||
| 952 | |||
| 953 | pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte); | ||
| 954 | set_shadow_pte(shadow_pte, spte); | ||
| 955 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); | ||
| 956 | if (!was_rmapped) { | ||
| 957 | rmap_add(vcpu, shadow_pte, gfn); | ||
| 958 | if (!is_rmap_pte(*shadow_pte)) | ||
| 959 | kvm_release_page_clean(page); | ||
| 960 | } else { | ||
| 961 | if (was_writeble) | ||
| 962 | kvm_release_page_dirty(page); | ||
| 963 | else | ||
| 964 | kvm_release_page_clean(page); | ||
| 965 | } | ||
| 966 | if (!ptwrite || !*ptwrite) | ||
| 967 | vcpu->arch.last_pte_updated = shadow_pte; | ||
| 968 | } | ||
| 969 | |||
| 970 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | ||
| 971 | { | ||
| 972 | } | ||
| 973 | |||
| 974 | static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, | ||
| 975 | gfn_t gfn, struct page *page) | ||
| 976 | { | ||
| 977 | int level = PT32E_ROOT_LEVEL; | ||
| 978 | hpa_t table_addr = vcpu->arch.mmu.root_hpa; | ||
| 979 | int pt_write = 0; | ||
| 980 | |||
| 981 | for (; ; level--) { | ||
| 982 | u32 index = PT64_INDEX(v, level); | ||
| 983 | u64 *table; | ||
| 984 | |||
| 985 | ASSERT(VALID_PAGE(table_addr)); | ||
| 986 | table = __va(table_addr); | ||
| 987 | |||
| 988 | if (level == 1) { | ||
| 989 | mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, | ||
| 990 | 0, write, 1, &pt_write, gfn, page); | ||
| 991 | return pt_write || is_io_pte(table[index]); | ||
| 992 | } | ||
| 993 | |||
| 994 | if (table[index] == shadow_trap_nonpresent_pte) { | ||
| 995 | struct kvm_mmu_page *new_table; | ||
| 996 | gfn_t pseudo_gfn; | ||
| 997 | |||
| 998 | pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) | ||
| 999 | >> PAGE_SHIFT; | ||
| 1000 | new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, | ||
| 1001 | v, level - 1, | ||
| 1002 | 1, ACC_ALL, &table[index], | ||
| 1003 | NULL); | ||
| 1004 | if (!new_table) { | ||
| 1005 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
| 1006 | kvm_release_page_clean(page); | ||
| 1007 | return -ENOMEM; | ||
| 1008 | } | ||
| 1009 | |||
| 1010 | table[index] = __pa(new_table->spt) | PT_PRESENT_MASK | ||
| 1011 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
| 1012 | } | ||
| 1013 | table_addr = table[index] & PT64_BASE_ADDR_MASK; | ||
| 1014 | } | ||
| 1015 | } | ||
| 1016 | |||
| 1017 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | ||
| 1018 | { | ||
| 1019 | int r; | ||
| 1020 | |||
| 1021 | struct page *page; | ||
| 1022 | |||
| 1023 | down_read(¤t->mm->mmap_sem); | ||
| 1024 | page = gfn_to_page(vcpu->kvm, gfn); | ||
| 1025 | |||
| 1026 | spin_lock(&vcpu->kvm->mmu_lock); | ||
| 1027 | kvm_mmu_free_some_pages(vcpu); | ||
| 1028 | r = __nonpaging_map(vcpu, v, write, gfn, page); | ||
| 1029 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
| 1030 | |||
| 1031 | up_read(¤t->mm->mmap_sem); | ||
| 1032 | |||
| 1033 | return r; | ||
| 1034 | } | ||
| 1035 | |||
| 1036 | |||
| 1037 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | ||
| 1038 | struct kvm_mmu_page *sp) | ||
| 1039 | { | ||
| 1040 | int i; | ||
| 1041 | |||
| 1042 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
| 1043 | sp->spt[i] = shadow_trap_nonpresent_pte; | ||
| 1044 | } | ||
| 1045 | |||
| 1046 | static void mmu_free_roots(struct kvm_vcpu *vcpu) | ||
| 1047 | { | ||
| 1048 | int i; | ||
| 1049 | struct kvm_mmu_page *sp; | ||
| 1050 | |||
| 1051 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
| 1052 | return; | ||
| 1053 | spin_lock(&vcpu->kvm->mmu_lock); | ||
| 1054 | #ifdef CONFIG_X86_64 | ||
| 1055 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
| 1056 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
| 1057 | |||
| 1058 | sp = page_header(root); | ||
| 1059 | --sp->root_count; | ||
| 1060 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
| 1061 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
| 1062 | return; | ||
| 1063 | } | ||
| 1064 | #endif | ||
| 1065 | for (i = 0; i < 4; ++i) { | ||
| 1066 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
| 1067 | |||
| 1068 | if (root) { | ||
| 1069 | root &= PT64_BASE_ADDR_MASK; | ||
| 1070 | sp = page_header(root); | ||
| 1071 | --sp->root_count; | ||
| 1072 | } | ||
| 1073 | vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; | ||
| 1074 | } | ||
| 1075 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
| 1076 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
| 1077 | } | ||
| 1078 | |||
| 1079 | static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | ||
| 1080 | { | ||
| 1081 | int i; | ||
| 1082 | gfn_t root_gfn; | ||
| 1083 | struct kvm_mmu_page *sp; | ||
| 1084 | |||
| 1085 | root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; | ||
| 1086 | |||
| 1087 | #ifdef CONFIG_X86_64 | ||
| 1088 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
| 1089 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
| 1090 | |||
| 1091 | ASSERT(!VALID_PAGE(root)); | ||
| 1092 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, | ||
| 1093 | PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL); | ||
| 1094 | root = __pa(sp->spt); | ||
| 1095 | ++sp->root_count; | ||
| 1096 | vcpu->arch.mmu.root_hpa = root; | ||
| 1097 | return; | ||
| 1098 | } | ||
| 1099 | #endif | ||
| 1100 | for (i = 0; i < 4; ++i) { | ||
| 1101 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
| 1102 | |||
| 1103 | ASSERT(!VALID_PAGE(root)); | ||
| 1104 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { | ||
| 1105 | if (!is_present_pte(vcpu->arch.pdptrs[i])) { | ||
| 1106 | vcpu->arch.mmu.pae_root[i] = 0; | ||
| 1107 | continue; | ||
| 1108 | } | ||
| 1109 | root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; | ||
| 1110 | } else if (vcpu->arch.mmu.root_level == 0) | ||
| 1111 | root_gfn = 0; | ||
| 1112 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | ||
| 1113 | PT32_ROOT_LEVEL, !is_paging(vcpu), | ||
| 1114 | ACC_ALL, NULL, NULL); | ||
| 1115 | root = __pa(sp->spt); | ||
| 1116 | ++sp->root_count; | ||
| 1117 | vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; | ||
| 1118 | } | ||
| 1119 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | ||
| 1120 | } | ||
| 1121 | |||
| 1122 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
| 1123 | { | ||
| 1124 | return vaddr; | ||
| 1125 | } | ||
| 1126 | |||
| 1127 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | ||
| 1128 | u32 error_code) | ||
| 1129 | { | ||
| 1130 | gfn_t gfn; | ||
| 1131 | int r; | ||
| 1132 | |||
| 1133 | pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code); | ||
| 1134 | r = mmu_topup_memory_caches(vcpu); | ||
| 1135 | if (r) | ||
| 1136 | return r; | ||
| 1137 | |||
| 1138 | ASSERT(vcpu); | ||
| 1139 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
| 1140 | |||
| 1141 | gfn = gva >> PAGE_SHIFT; | ||
| 1142 | |||
| 1143 | return nonpaging_map(vcpu, gva & PAGE_MASK, | ||
| 1144 | error_code & PFERR_WRITE_MASK, gfn); | ||
| 1145 | } | ||
| 1146 | |||
| 1147 | static void nonpaging_free(struct kvm_vcpu *vcpu) | ||
| 1148 | { | ||
| 1149 | mmu_free_roots(vcpu); | ||
| 1150 | } | ||
| 1151 | |||
| 1152 | static int nonpaging_init_context(struct kvm_vcpu *vcpu) | ||
| 1153 | { | ||
| 1154 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
| 1155 | |||
| 1156 | context->new_cr3 = nonpaging_new_cr3; | ||
| 1157 | context->page_fault = nonpaging_page_fault; | ||
| 1158 | context->gva_to_gpa = nonpaging_gva_to_gpa; | ||
| 1159 | context->free = nonpaging_free; | ||
| 1160 | context->prefetch_page = nonpaging_prefetch_page; | ||
| 1161 | context->root_level = 0; | ||
| 1162 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
| 1163 | context->root_hpa = INVALID_PAGE; | ||
| 1164 | return 0; | ||
| 1165 | } | ||
| 1166 | |||
| 1167 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | ||
| 1168 | { | ||
| 1169 | ++vcpu->stat.tlb_flush; | ||
| 1170 | kvm_x86_ops->tlb_flush(vcpu); | ||
| 1171 | } | ||
| 1172 | |||
| 1173 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | ||
| 1174 | { | ||
| 1175 | pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3); | ||
| 1176 | mmu_free_roots(vcpu); | ||
| 1177 | } | ||
| 1178 | |||
| 1179 | static void inject_page_fault(struct kvm_vcpu *vcpu, | ||
| 1180 | u64 addr, | ||
| 1181 | u32 err_code) | ||
| 1182 | { | ||
| 1183 | kvm_inject_page_fault(vcpu, addr, err_code); | ||
| 1184 | } | ||
| 1185 | |||
| 1186 | static void paging_free(struct kvm_vcpu *vcpu) | ||
| 1187 | { | ||
| 1188 | nonpaging_free(vcpu); | ||
| 1189 | } | ||
| 1190 | |||
| 1191 | #define PTTYPE 64 | ||
| 1192 | #include "paging_tmpl.h" | ||
| 1193 | #undef PTTYPE | ||
| 1194 | |||
| 1195 | #define PTTYPE 32 | ||
| 1196 | #include "paging_tmpl.h" | ||
| 1197 | #undef PTTYPE | ||
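The pair of includes above instantiates paging_tmpl.h twice, once per guest pte width, with the preprocessor pasting the type into every function name (yielding paging64_page_fault, paging32_gva_to_gpa, and so on; in the real header the name-pasting FNAME macro is derived from PTTYPE inside paging_tmpl.h itself). A self-contained miniature of the same trick, with a trivial stand-in for the template body:

```c
/* Miniature of the PTTYPE preprocessor-template pattern: one body,
 * two instantiations, distinct function names. */
#include <stdio.h>

#define PTTYPE 64
#define FNAME(name) paging64_##name
static int FNAME(pte_size)(void) { return PTTYPE / 8; }
#undef FNAME
#undef PTTYPE

#define PTTYPE 32
#define FNAME(name) paging32_##name
static int FNAME(pte_size)(void) { return PTTYPE / 8; }
#undef FNAME
#undef PTTYPE

int main(void)
{
	printf("%d %d\n", paging64_pte_size(), paging32_pte_size());	/* 8 4 */
	return 0;
}
```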
| 1198 | |||
| 1199 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | ||
| 1200 | { | ||
| 1201 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
| 1202 | |||
| 1203 | ASSERT(is_pae(vcpu)); | ||
| 1204 | context->new_cr3 = paging_new_cr3; | ||
| 1205 | context->page_fault = paging64_page_fault; | ||
| 1206 | context->gva_to_gpa = paging64_gva_to_gpa; | ||
| 1207 | context->prefetch_page = paging64_prefetch_page; | ||
| 1208 | context->free = paging_free; | ||
| 1209 | context->root_level = level; | ||
| 1210 | context->shadow_root_level = level; | ||
| 1211 | context->root_hpa = INVALID_PAGE; | ||
| 1212 | return 0; | ||
| 1213 | } | ||
| 1214 | |||
| 1215 | static int paging64_init_context(struct kvm_vcpu *vcpu) | ||
| 1216 | { | ||
| 1217 | return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); | ||
| 1218 | } | ||
| 1219 | |||
| 1220 | static int paging32_init_context(struct kvm_vcpu *vcpu) | ||
| 1221 | { | ||
| 1222 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
| 1223 | |||
| 1224 | context->new_cr3 = paging_new_cr3; | ||
| 1225 | context->page_fault = paging32_page_fault; | ||
| 1226 | context->gva_to_gpa = paging32_gva_to_gpa; | ||
| 1227 | context->free = paging_free; | ||
| 1228 | context->prefetch_page = paging32_prefetch_page; | ||
| 1229 | context->root_level = PT32_ROOT_LEVEL; | ||
| 1230 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
| 1231 | context->root_hpa = INVALID_PAGE; | ||
| 1232 | return 0; | ||
| 1233 | } | ||
| 1234 | |||
| 1235 | static int paging32E_init_context(struct kvm_vcpu *vcpu) | ||
| 1236 | { | ||
| 1237 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); | ||
| 1238 | } | ||
| 1239 | |||
| 1240 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | ||
| 1241 | { | ||
| 1242 | ASSERT(vcpu); | ||
| 1243 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
| 1244 | |||
| 1245 | if (!is_paging(vcpu)) | ||
| 1246 | return nonpaging_init_context(vcpu); | ||
| 1247 | else if (is_long_mode(vcpu)) | ||
| 1248 | return paging64_init_context(vcpu); | ||
| 1249 | else if (is_pae(vcpu)) | ||
| 1250 | return paging32E_init_context(vcpu); | ||
| 1251 | else | ||
| 1252 | return paging32_init_context(vcpu); | ||
| 1253 | } | ||
| 1254 | |||
| 1255 | static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) | ||
| 1256 | { | ||
| 1257 | ASSERT(vcpu); | ||
| 1258 | if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { | ||
| 1259 | vcpu->arch.mmu.free(vcpu); | ||
| 1260 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
| 1261 | } | ||
| 1262 | } | ||
| 1263 | |||
| 1264 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) | ||
| 1265 | { | ||
| 1266 | destroy_kvm_mmu(vcpu); | ||
| 1267 | return init_kvm_mmu(vcpu); | ||
| 1268 | } | ||
| 1269 | EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); | ||
| 1270 | |||
| 1271 | int kvm_mmu_load(struct kvm_vcpu *vcpu) | ||
| 1272 | { | ||
| 1273 | int r; | ||
| 1274 | |||
| 1275 | r = mmu_topup_memory_caches(vcpu); | ||
| 1276 | if (r) | ||
| 1277 | goto out; | ||
| 1278 | spin_lock(&vcpu->kvm->mmu_lock); | ||
| 1279 | kvm_mmu_free_some_pages(vcpu); | ||
| 1280 | mmu_alloc_roots(vcpu); | ||
| 1281 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
| 1282 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | ||
| 1283 | kvm_mmu_flush_tlb(vcpu); | ||
| 1284 | out: | ||
| 1285 | return r; | ||
| 1286 | } | ||
| 1287 | EXPORT_SYMBOL_GPL(kvm_mmu_load); | ||
| 1288 | |||
| 1289 | void kvm_mmu_unload(struct kvm_vcpu *vcpu) | ||
| 1290 | { | ||
| 1291 | mmu_free_roots(vcpu); | ||
| 1292 | } | ||
| 1293 | |||
| 1294 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | ||
| 1295 | struct kvm_mmu_page *sp, | ||
| 1296 | u64 *spte) | ||
| 1297 | { | ||
| 1298 | u64 pte; | ||
| 1299 | struct kvm_mmu_page *child; | ||
| 1300 | |||
| 1301 | pte = *spte; | ||
| 1302 | if (is_shadow_present_pte(pte)) { | ||
| 1303 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) | ||
| 1304 | rmap_remove(vcpu->kvm, spte); | ||
| 1305 | else { | ||
| 1306 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
| 1307 | mmu_page_remove_parent_pte(child, spte); | ||
| 1308 | } | ||
| 1309 | } | ||
| 1310 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | ||
| 1311 | } | ||
| 1312 | |||
| 1313 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | ||
| 1314 | struct kvm_mmu_page *sp, | ||
| 1315 | u64 *spte, | ||
| 1316 | const void *new, int bytes, | ||
| 1317 | int offset_in_pte) | ||
| 1318 | { | ||
| 1319 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { | ||
| 1320 | ++vcpu->kvm->stat.mmu_pde_zapped; | ||
| 1321 | return; | ||
| 1322 | } | ||
| 1323 | |||
| 1324 | ++vcpu->kvm->stat.mmu_pte_updated; | ||
| 1325 | if (sp->role.glevels == PT32_ROOT_LEVEL) | ||
| 1326 | paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); | ||
| 1327 | else | ||
| 1328 | paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | static bool need_remote_flush(u64 old, u64 new) | ||
| 1332 | { | ||
| 1333 | if (!is_shadow_present_pte(old)) | ||
| 1334 | return false; | ||
| 1335 | if (!is_shadow_present_pte(new)) | ||
| 1336 | return true; | ||
| 1337 | if ((old ^ new) & PT64_BASE_ADDR_MASK) | ||
| 1338 | return true; | ||
| 1339 | old ^= PT64_NX_MASK; | ||
| 1340 | new ^= PT64_NX_MASK; | ||
| 1341 | return (old & ~new & PT64_PERM_MASK) != 0; | ||
| 1342 | } | ||
| 1343 | |||
| 1344 | static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) | ||
| 1345 | { | ||
| 1346 | if (need_remote_flush(old, new)) | ||
| 1347 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
| 1348 | else | ||
| 1349 | kvm_mmu_flush_tlb(vcpu); | ||
| 1350 | } | ||
| 1351 | |||
| 1352 | static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) | ||
| 1353 | { | ||
| 1354 | u64 *spte = vcpu->arch.last_pte_updated; | ||
| 1355 | |||
| 1356 | return !!(spte && (*spte & PT_ACCESSED_MASK)); | ||
| 1357 | } | ||
| 1358 | |||
| 1359 | static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
| 1360 | const u8 *new, int bytes) | ||
| 1361 | { | ||
| 1362 | gfn_t gfn; | ||
| 1363 | int r; | ||
| 1364 | u64 gpte = 0; | ||
| 1365 | |||
| 1366 | if (bytes != 4 && bytes != 8) | ||
| 1367 | return; | ||
| 1368 | |||
| 1369 | /* | ||
| 1370 | * Assume that the pte write is on a page table of the same type | ||
| 1371 | * as the current vcpu paging mode. This is nearly always true | ||
| 1372 | * (it might be false while changing modes). Note it is verified | ||
| 1373 | * later by update_pte(). | ||
| 1374 | */ | ||
| 1375 | if (is_pae(vcpu)) { | ||
| 1376 | /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ | ||
| 1377 | if ((bytes == 4) && (gpa % 4 == 0)) { | ||
| 1378 | r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8); | ||
| 1379 | if (r) | ||
| 1380 | return; | ||
| 1381 | memcpy((void *)&gpte + (gpa % 8), new, 4); | ||
| 1382 | } else if ((bytes == 8) && (gpa % 8 == 0)) { | ||
| 1383 | memcpy((void *)&gpte, new, 8); | ||
| 1384 | } | ||
| 1385 | } else { | ||
| 1386 | if ((bytes == 4) && (gpa % 4 == 0)) | ||
| 1387 | memcpy((void *)&gpte, new, 4); | ||
| 1388 | } | ||
| 1389 | if (!is_present_pte(gpte)) | ||
| 1390 | return; | ||
| 1391 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
| 1392 | vcpu->arch.update_pte.gfn = gfn; | ||
| 1393 | vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn); | ||
| 1394 | } | ||
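When a PAE guest updates a 64-bit gpte with two 4-byte writes, mmu_guess_page_from_pte_write() reads the resident half back and overlays the new half, as above. A standalone model of that reassembly; read_old_qword() is a hypothetical stand-in for kvm_read_guest():

```c
/* Model of the gpte reassembly: fetch the old qword, overlay the half
 * the guest just wrote. Little-endian layout assumed, as on x86. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint64_t read_old_qword(uint64_t gpa)	/* hypothetical placeholder */
{
	(void)gpa;
	return 0x00000000aabbccddULL;		/* pretend old gpte */
}

static uint64_t guess_gpte(uint64_t gpa, const uint8_t *new_bytes, int bytes)
{
	uint64_t gpte = 0;

	if (bytes == 4 && gpa % 4 == 0) {
		gpte = read_old_qword(gpa & ~7ULL);	/* fetch both halves */
		memcpy((uint8_t *)&gpte + gpa % 8, new_bytes, 4); /* new half */
	} else if (bytes == 8 && gpa % 8 == 0) {
		memcpy(&gpte, new_bytes, 8);
	}
	return gpte;
}

int main(void)
{
	uint32_t hi = 0x11223344;

	/* A 4-byte write to gpa 0x1004 updates the high half of the gpte
	 * at 0x1000; expect 0x11223344aabbccdd. */
	printf("%#llx\n", (unsigned long long)
	       guess_gpte(0x1004, (const uint8_t *)&hi, 4));
	return 0;
}
```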
| 1395 | |||
| 1396 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
| 1397 | const u8 *new, int bytes) | ||
| 1398 | { | ||
| 1399 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
| 1400 | struct kvm_mmu_page *sp; | ||
| 1401 | struct hlist_node *node, *n; | ||
| 1402 | struct hlist_head *bucket; | ||
| 1403 | unsigned index; | ||
| 1404 | u64 entry; | ||
| 1405 | u64 *spte; | ||
| 1406 | unsigned offset = offset_in_page(gpa); | ||
| 1407 | unsigned pte_size; | ||
| 1408 | unsigned page_offset; | ||
| 1409 | unsigned misaligned; | ||
| 1410 | unsigned quadrant; | ||
| 1411 | int level; | ||
| 1412 | int flooded = 0; | ||
| 1413 | int npte; | ||
| 1414 | |||
| 1415 | pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); | ||
| 1416 | mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); | ||
| 1417 | spin_lock(&vcpu->kvm->mmu_lock); | ||
| 1418 | kvm_mmu_free_some_pages(vcpu); | ||
| 1419 | ++vcpu->kvm->stat.mmu_pte_write; | ||
| 1420 | kvm_mmu_audit(vcpu, "pre pte write"); | ||
| 1421 | if (gfn == vcpu->arch.last_pt_write_gfn | ||
| 1422 | && !last_updated_pte_accessed(vcpu)) { | ||
| 1423 | ++vcpu->arch.last_pt_write_count; | ||
| 1424 | if (vcpu->arch.last_pt_write_count >= 3) | ||
| 1425 | flooded = 1; | ||
| 1426 | } else { | ||
| 1427 | vcpu->arch.last_pt_write_gfn = gfn; | ||
| 1428 | vcpu->arch.last_pt_write_count = 1; | ||
| 1429 | vcpu->arch.last_pte_updated = NULL; | ||
| 1430 | } | ||
| 1431 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
| 1432 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | ||
| 1433 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { | ||
| 1434 | if (sp->gfn != gfn || sp->role.metaphysical) | ||
| 1435 | continue; | ||
| 1436 | pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; | ||
| 1437 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | ||
| 1438 | misaligned |= bytes < 4; | ||
| 1439 | if (misaligned || flooded) { | ||
| 1440 | /* | ||
| 1441 | * Misaligned accesses are too much trouble to fix | ||
| 1442 | * up; also, they usually indicate a page is not used | ||
| 1443 | * as a page table. | ||
| 1444 | * | ||
| 1445 | * If we're seeing too many writes to a page, | ||
| 1446 | * it may no longer be a page table, or we may be | ||
| 1447 | * forking, in which case it is better to unmap the | ||
| 1448 | * page. | ||
| 1449 | */ | ||
| 1450 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | ||
| 1451 | gpa, bytes, sp->role.word); | ||
| 1452 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
| 1453 | ++vcpu->kvm->stat.mmu_flooded; | ||
| 1454 | continue; | ||
| 1455 | } | ||
| 1456 | page_offset = offset; | ||
| 1457 | level = sp->role.level; | ||
| 1458 | npte = 1; | ||
| 1459 | if (sp->role.glevels == PT32_ROOT_LEVEL) { | ||
| 1460 | page_offset <<= 1; /* 32->64 */ | ||
| 1461 | /* | ||
| 1462 | * A 32-bit pde maps 4MB while the shadow pdes map | ||
| 1463 | * only 2MB. So we need to double the offset again | ||
| 1464 | * and zap two pdes instead of one. | ||
| 1465 | */ | ||
| 1466 | if (level == PT32_ROOT_LEVEL) { | ||
| 1467 | page_offset &= ~7; /* kill rounding error */ | ||
| 1468 | page_offset <<= 1; | ||
| 1469 | npte = 2; | ||
| 1470 | } | ||
| 1471 | quadrant = page_offset >> PAGE_SHIFT; | ||
| 1472 | page_offset &= ~PAGE_MASK; | ||
| 1473 | if (quadrant != sp->role.quadrant) | ||
| 1474 | continue; | ||
| 1475 | } | ||
| 1476 | spte = &sp->spt[page_offset / sizeof(*spte)]; | ||
| 1477 | while (npte--) { | ||
| 1478 | entry = *spte; | ||
| 1479 | mmu_pte_write_zap_pte(vcpu, sp, spte); | ||
| 1480 | mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, | ||
| 1481 | page_offset & (pte_size - 1)); | ||
| 1482 | mmu_pte_write_flush_tlb(vcpu, entry, *spte); | ||
| 1483 | ++spte; | ||
| 1484 | } | ||
| 1485 | } | ||
| 1486 | kvm_mmu_audit(vcpu, "post pte write"); | ||
| 1487 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
| 1488 | if (vcpu->arch.update_pte.page) { | ||
| 1489 | kvm_release_page_clean(vcpu->arch.update_pte.page); | ||
| 1490 | vcpu->arch.update_pte.page = NULL; | ||
| 1491 | } | ||
| 1492 | } | ||
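The misaligned test in kvm_mmu_pte_write() is a compact bit trick: the write touches more than one pte exactly when the offsets of its first and last bytes differ somewhere above the pte-size bits. A standalone check of a few cases:

```c
/* The misalignment test from kvm_mmu_pte_write(), stand-alone:
 * nonzero iff the write spans more than one pte_size-sized entry. */
#include <stdio.h>

static unsigned misaligned(unsigned offset, unsigned bytes, unsigned pte_size)
{
	return (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
}

int main(void)
{
	printf("%u\n", misaligned(0x7f8, 8, 8));  /* 0: exactly one 8-byte pte */
	printf("%u\n", misaligned(0x7fc, 8, 8));  /* nonzero: straddles two ptes */
	printf("%u\n", misaligned(0x7fc, 4, 4));  /* 0: exactly one 4-byte pte */
	return 0;
}
```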
| 1493 | |||
| 1494 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | ||
| 1495 | { | ||
| 1496 | gpa_t gpa; | ||
| 1497 | int r; | ||
| 1498 | |||
| 1499 | down_read(¤t->mm->mmap_sem); | ||
| 1500 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); | ||
| 1501 | up_read(¤t->mm->mmap_sem); | ||
| 1502 | |||
| 1503 | spin_lock(&vcpu->kvm->mmu_lock); | ||
| 1504 | r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
| 1505 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
| 1506 | return r; | ||
| 1507 | } | ||
| 1508 | |||
| 1509 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
| 1510 | { | ||
| 1511 | while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { | ||
| 1512 | struct kvm_mmu_page *sp; | ||
| 1513 | |||
| 1514 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, | ||
| 1515 | struct kvm_mmu_page, link); | ||
| 1516 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
| 1517 | ++vcpu->kvm->stat.mmu_recycled; | ||
| 1518 | } | ||
| 1519 | } | ||
| 1520 | |||
| 1521 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | ||
| 1522 | { | ||
| 1523 | int r; | ||
| 1524 | enum emulation_result er; | ||
| 1525 | |||
| 1526 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); | ||
| 1527 | if (r < 0) | ||
| 1528 | goto out; | ||
| 1529 | |||
| 1530 | if (!r) { | ||
| 1531 | r = 1; | ||
| 1532 | goto out; | ||
| 1533 | } | ||
| 1534 | |||
| 1535 | r = mmu_topup_memory_caches(vcpu); | ||
| 1536 | if (r) | ||
| 1537 | goto out; | ||
| 1538 | |||
| 1539 | er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); | ||
| 1540 | |||
| 1541 | switch (er) { | ||
| 1542 | case EMULATE_DONE: | ||
| 1543 | return 1; | ||
| 1544 | case EMULATE_DO_MMIO: | ||
| 1545 | ++vcpu->stat.mmio_exits; | ||
| 1546 | return 0; | ||
| 1547 | case EMULATE_FAIL: | ||
| 1548 | kvm_report_emulation_failure(vcpu, "pagetable"); | ||
| 1549 | return 1; | ||
| 1550 | default: | ||
| 1551 | BUG(); | ||
| 1552 | } | ||
| 1553 | out: | ||
| 1554 | return r; | ||
| 1555 | } | ||
| 1556 | EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); | ||
| 1557 | |||
| 1558 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | ||
| 1559 | { | ||
| 1560 | struct kvm_mmu_page *sp; | ||
| 1561 | |||
| 1562 | while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { | ||
| 1563 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, | ||
| 1564 | struct kvm_mmu_page, link); | ||
| 1565 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
| 1566 | } | ||
| 1567 | free_page((unsigned long)vcpu->arch.mmu.pae_root); | ||
| 1568 | } | ||
| 1569 | |||
| 1570 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | ||
| 1571 | { | ||
| 1572 | struct page *page; | ||
| 1573 | int i; | ||
| 1574 | |||
| 1575 | ASSERT(vcpu); | ||
| 1576 | |||
| 1577 | if (vcpu->kvm->arch.n_requested_mmu_pages) | ||
| 1578 | vcpu->kvm->arch.n_free_mmu_pages = | ||
| 1579 | vcpu->kvm->arch.n_requested_mmu_pages; | ||
| 1580 | else | ||
| 1581 | vcpu->kvm->arch.n_free_mmu_pages = | ||
| 1582 | vcpu->kvm->arch.n_alloc_mmu_pages; | ||
| 1583 | /* | ||
| 1584 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. | ||
| 1585 | * Therefore we need to allocate shadow page tables in the first | ||
| 1586 | * 4GB of memory, which happens to fit the DMA32 zone. | ||
| 1587 | */ | ||
| 1588 | page = alloc_page(GFP_KERNEL | __GFP_DMA32); | ||
| 1589 | if (!page) | ||
| 1590 | goto error_1; | ||
| 1591 | vcpu->arch.mmu.pae_root = page_address(page); | ||
| 1592 | for (i = 0; i < 4; ++i) | ||
| 1593 | vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; | ||
| 1594 | |||
| 1595 | return 0; | ||
| 1596 | |||
| 1597 | error_1: | ||
| 1598 | free_mmu_pages(vcpu); | ||
| 1599 | return -ENOMEM; | ||
| 1600 | } | ||
| 1601 | |||
| 1602 | int kvm_mmu_create(struct kvm_vcpu *vcpu) | ||
| 1603 | { | ||
| 1604 | ASSERT(vcpu); | ||
| 1605 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
| 1606 | |||
| 1607 | return alloc_mmu_pages(vcpu); | ||
| 1608 | } | ||
| 1609 | |||
| 1610 | int kvm_mmu_setup(struct kvm_vcpu *vcpu) | ||
| 1611 | { | ||
| 1612 | ASSERT(vcpu); | ||
| 1613 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
| 1614 | |||
| 1615 | return init_kvm_mmu(vcpu); | ||
| 1616 | } | ||
| 1617 | |||
| 1618 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | ||
| 1619 | { | ||
| 1620 | ASSERT(vcpu); | ||
| 1621 | |||
| 1622 | destroy_kvm_mmu(vcpu); | ||
| 1623 | free_mmu_pages(vcpu); | ||
| 1624 | mmu_free_memory_caches(vcpu); | ||
| 1625 | } | ||
| 1626 | |||
| 1627 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | ||
| 1628 | { | ||
| 1629 | struct kvm_mmu_page *sp; | ||
| 1630 | |||
| 1631 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { | ||
| 1632 | int i; | ||
| 1633 | u64 *pt; | ||
| 1634 | |||
| 1635 | if (!test_bit(slot, &sp->slot_bitmap)) | ||
| 1636 | continue; | ||
| 1637 | |||
| 1638 | pt = sp->spt; | ||
| 1639 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
| 1640 | /* avoid RMW */ | ||
| 1641 | if (pt[i] & PT_WRITABLE_MASK) | ||
| 1642 | pt[i] &= ~PT_WRITABLE_MASK; | ||
| 1643 | } | ||
| 1644 | } | ||
| 1645 | |||
| 1646 | void kvm_mmu_zap_all(struct kvm *kvm) | ||
| 1647 | { | ||
| 1648 | struct kvm_mmu_page *sp, *node; | ||
| 1649 | |||
| 1650 | spin_lock(&kvm->mmu_lock); | ||
| 1651 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) | ||
| 1652 | kvm_mmu_zap_page(kvm, sp); | ||
| 1653 | spin_unlock(&kvm->mmu_lock); | ||
| 1654 | |||
| 1655 | kvm_flush_remote_tlbs(kvm); | ||
| 1656 | } | ||
| 1657 | |||
| 1658 | void kvm_mmu_module_exit(void) | ||
| 1659 | { | ||
| 1660 | if (pte_chain_cache) | ||
| 1661 | kmem_cache_destroy(pte_chain_cache); | ||
| 1662 | if (rmap_desc_cache) | ||
| 1663 | kmem_cache_destroy(rmap_desc_cache); | ||
| 1664 | if (mmu_page_header_cache) | ||
| 1665 | kmem_cache_destroy(mmu_page_header_cache); | ||
| 1666 | } | ||
| 1667 | |||
| 1668 | int kvm_mmu_module_init(void) | ||
| 1669 | { | ||
| 1670 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | ||
| 1671 | sizeof(struct kvm_pte_chain), | ||
| 1672 | 0, 0, NULL); | ||
| 1673 | if (!pte_chain_cache) | ||
| 1674 | goto nomem; | ||
| 1675 | rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", | ||
| 1676 | sizeof(struct kvm_rmap_desc), | ||
| 1677 | 0, 0, NULL); | ||
| 1678 | if (!rmap_desc_cache) | ||
| 1679 | goto nomem; | ||
| 1680 | |||
| 1681 | mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", | ||
| 1682 | sizeof(struct kvm_mmu_page), | ||
| 1683 | 0, 0, NULL); | ||
| 1684 | if (!mmu_page_header_cache) | ||
| 1685 | goto nomem; | ||
| 1686 | |||
| 1687 | return 0; | ||
| 1688 | |||
| 1689 | nomem: | ||
| 1690 | kvm_mmu_module_exit(); | ||
| 1691 | return -ENOMEM; | ||
| 1692 | } | ||
| 1693 | |||
| 1694 | /* | ||
| 1695 | * Calculate mmu pages needed for kvm. | ||
| 1696 | */ | ||
| 1697 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) | ||
| 1698 | { | ||
| 1699 | int i; | ||
| 1700 | unsigned int nr_mmu_pages; | ||
| 1701 | unsigned int nr_pages = 0; | ||
| 1702 | |||
| 1703 | for (i = 0; i < kvm->nmemslots; i++) | ||
| 1704 | nr_pages += kvm->memslots[i].npages; | ||
| 1705 | |||
| 1706 | nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; | ||
| 1707 | nr_mmu_pages = max(nr_mmu_pages, | ||
| 1708 | (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); | ||
| 1709 | |||
| 1710 | return nr_mmu_pages; | ||
| 1711 | } | ||
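A quick worked example of the sizing rule above, assuming the usual defaults of KVM_PERMILLE_MMU_PAGES = 20 and KVM_MIN_ALLOC_MMU_PAGES = 64; both constants live outside this hunk, so treat the values as assumptions:

	/* A guest with 256 MB of memory has 65536 4 KB pages; 20 permille
	 * of that is 65536 * 20 / 1000 = 1310 shadow pages, comfortably
	 * above the assumed 64-page floor. */
	unsigned int nr_pages = 65536;
	unsigned int nr_mmu_pages = nr_pages * 20 / 1000;	/* 1310 */
	if (nr_mmu_pages < 64)
		nr_mmu_pages = 64;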
| 1712 | |||
| 1713 | #ifdef AUDIT | ||
| 1714 | |||
| 1715 | static const char *audit_msg; | ||
| 1716 | |||
| 1717 | static gva_t canonicalize(gva_t gva) | ||
| 1718 | { | ||
| 1719 | #ifdef CONFIG_X86_64 | ||
| 1720 | gva = (long long)(gva << 16) >> 16; | ||
| 1721 | #endif | ||
| 1722 | return gva; | ||
| 1723 | } | ||
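The shift pair above sign-extends bit 47, the top bit of a 48-bit x86-64 virtual address; a minimal standalone illustration with an invented value:

	/* 0x0000800000000000 has bit 47 set; after the signed shift pair
	 * it becomes the canonical form 0xffff800000000000. */
	unsigned long gva = 0x0000800000000000ul;
	gva = (long long)(gva << 16) >> 16;	/* 0xffff800000000000 */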
| 1724 | |||
| 1725 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | ||
| 1726 | gva_t va, int level) | ||
| 1727 | { | ||
| 1728 | u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); | ||
| 1729 | int i; | ||
| 1730 | gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); | ||
| 1731 | |||
| 1732 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { | ||
| 1733 | u64 ent = pt[i]; | ||
| 1734 | |||
| 1735 | if (ent == shadow_trap_nonpresent_pte) | ||
| 1736 | continue; | ||
| 1737 | |||
| 1738 | va = canonicalize(va); | ||
| 1739 | if (level > 1) { | ||
| 1740 | if (ent == shadow_notrap_nonpresent_pte) | ||
| 1741 | printk(KERN_ERR "audit: (%s) nontrapping pte" | ||
| 1742 | " in nonleaf level: levels %d gva %lx" | ||
| 1743 | " level %d pte %llx\n", audit_msg, | ||
| 1744 | vcpu->arch.mmu.root_level, va, level, ent); | ||
| 1745 | |||
| 1746 | audit_mappings_page(vcpu, ent, va, level - 1); | ||
| 1747 | } else { | ||
| 1748 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); | ||
| 1749 | struct page *page = gpa_to_page(vcpu, gpa); | ||
| 1750 | hpa_t hpa = page_to_phys(page); | ||
| 1751 | |||
| 1752 | if (is_shadow_present_pte(ent) | ||
| 1753 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | ||
| 1754 | printk(KERN_ERR "xx audit error: (%s) levels %d" | ||
| 1755 | " gva %lx gpa %llx hpa %llx ent %llx %d\n", | ||
| 1756 | audit_msg, vcpu->arch.mmu.root_level, | ||
| 1757 | va, gpa, hpa, ent, | ||
| 1758 | is_shadow_present_pte(ent)); | ||
| 1759 | else if (ent == shadow_notrap_nonpresent_pte | ||
| 1760 | && !is_error_hpa(hpa)) | ||
| 1761 | printk(KERN_ERR "audit: (%s) notrap shadow," | ||
| 1762 | " valid guest gva %lx\n", audit_msg, va); | ||
| 1763 | kvm_release_page_clean(page); | ||
| 1764 | |||
| 1765 | } | ||
| 1766 | } | ||
| 1767 | } | ||
| 1768 | |||
| 1769 | static void audit_mappings(struct kvm_vcpu *vcpu) | ||
| 1770 | { | ||
| 1771 | unsigned i; | ||
| 1772 | |||
| 1773 | if (vcpu->arch.mmu.root_level == 4) | ||
| 1774 | audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); | ||
| 1775 | else | ||
| 1776 | for (i = 0; i < 4; ++i) | ||
| 1777 | if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) | ||
| 1778 | audit_mappings_page(vcpu, | ||
| 1779 | vcpu->arch.mmu.pae_root[i], | ||
| 1780 | i << 30, | ||
| 1781 | 2); | ||
| 1782 | } | ||
| 1783 | |||
| 1784 | static int count_rmaps(struct kvm_vcpu *vcpu) | ||
| 1785 | { | ||
| 1786 | int nmaps = 0; | ||
| 1787 | int i, j, k; | ||
| 1788 | |||
| 1789 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
| 1790 | struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; | ||
| 1791 | struct kvm_rmap_desc *d; | ||
| 1792 | |||
| 1793 | for (j = 0; j < m->npages; ++j) { | ||
| 1794 | unsigned long *rmapp = &m->rmap[j]; | ||
| 1795 | |||
| 1796 | if (!*rmapp) | ||
| 1797 | continue; | ||
| 1798 | if (!(*rmapp & 1)) { | ||
| 1799 | ++nmaps; | ||
| 1800 | continue; | ||
| 1801 | } | ||
| 1802 | d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
| 1803 | while (d) { | ||
| 1804 | for (k = 0; k < RMAP_EXT; ++k) | ||
| 1805 | if (d->shadow_ptes[k]) | ||
| 1806 | ++nmaps; | ||
| 1807 | else | ||
| 1808 | break; | ||
| 1809 | d = d->more; | ||
| 1810 | } | ||
| 1811 | } | ||
| 1812 | } | ||
| 1813 | return nmaps; | ||
| 1814 | } | ||
| 1815 | |||
| 1816 | static int count_writable_mappings(struct kvm_vcpu *vcpu) | ||
| 1817 | { | ||
| 1818 | int nmaps = 0; | ||
| 1819 | struct kvm_mmu_page *sp; | ||
| 1820 | int i; | ||
| 1821 | |||
| 1822 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | ||
| 1823 | u64 *pt = sp->spt; | ||
| 1824 | |||
| 1825 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) | ||
| 1826 | continue; | ||
| 1827 | |||
| 1828 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
| 1829 | u64 ent = pt[i]; | ||
| 1830 | |||
| 1831 | if (!(ent & PT_PRESENT_MASK)) | ||
| 1832 | continue; | ||
| 1833 | if (!(ent & PT_WRITABLE_MASK)) | ||
| 1834 | continue; | ||
| 1835 | ++nmaps; | ||
| 1836 | } | ||
| 1837 | } | ||
| 1838 | return nmaps; | ||
| 1839 | } | ||
| 1840 | |||
| 1841 | static void audit_rmap(struct kvm_vcpu *vcpu) | ||
| 1842 | { | ||
| 1843 | int n_rmap = count_rmaps(vcpu); | ||
| 1844 | int n_actual = count_writable_mappings(vcpu); | ||
| 1845 | |||
| 1846 | if (n_rmap != n_actual) | ||
| 1847 | printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", | ||
| 1848 | __FUNCTION__, audit_msg, n_rmap, n_actual); | ||
| 1849 | } | ||
| 1850 | |||
| 1851 | static void audit_write_protection(struct kvm_vcpu *vcpu) | ||
| 1852 | { | ||
| 1853 | struct kvm_mmu_page *sp; | ||
| 1854 | struct kvm_memory_slot *slot; | ||
| 1855 | unsigned long *rmapp; | ||
| 1856 | gfn_t gfn; | ||
| 1857 | |||
| 1858 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | ||
| 1859 | if (sp->role.metaphysical) | ||
| 1860 | continue; | ||
| 1861 | |||
| 1862 | slot = gfn_to_memslot(vcpu->kvm, sp->gfn); | ||
| 1863 | gfn = unalias_gfn(vcpu->kvm, sp->gfn); | ||
| 1864 | rmapp = &slot->rmap[gfn - slot->base_gfn]; | ||
| 1865 | if (*rmapp) | ||
| 1866 | printk(KERN_ERR "%s: (%s) shadow page has writable" | ||
| 1867 | " mappings: gfn %lx role %x\n", | ||
| 1868 | __FUNCTION__, audit_msg, sp->gfn, | ||
| 1869 | sp->role.word); | ||
| 1870 | } | ||
| 1871 | } | ||
| 1872 | |||
| 1873 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) | ||
| 1874 | { | ||
| 1875 | int olddbg = dbg; | ||
| 1876 | |||
| 1877 | dbg = 0; | ||
| 1878 | audit_msg = msg; | ||
| 1879 | audit_rmap(vcpu); | ||
| 1880 | audit_write_protection(vcpu); | ||
| 1881 | audit_mappings(vcpu); | ||
| 1882 | dbg = olddbg; | ||
| 1883 | } | ||
| 1884 | |||
| 1885 | #endif | ||
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h new file mode 100644 index 000000000000..1fce19ec7a23 --- /dev/null +++ b/arch/x86/kvm/mmu.h | |||
| @@ -0,0 +1,44 @@ | |||
| 1 | #ifndef __KVM_X86_MMU_H | ||
| 2 | #define __KVM_X86_MMU_H | ||
| 3 | |||
| 4 | #include <linux/kvm_host.h> | ||
| 5 | |||
| 6 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
| 7 | { | ||
| 8 | if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) | ||
| 9 | __kvm_mmu_free_some_pages(vcpu); | ||
| 10 | } | ||
| 11 | |||
| 12 | static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) | ||
| 13 | { | ||
| 14 | if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) | ||
| 15 | return 0; | ||
| 16 | |||
| 17 | return kvm_mmu_load(vcpu); | ||
| 18 | } | ||
| 19 | |||
| 20 | static inline int is_long_mode(struct kvm_vcpu *vcpu) | ||
| 21 | { | ||
| 22 | #ifdef CONFIG_X86_64 | ||
| 23 | return vcpu->arch.shadow_efer & EFER_LME; | ||
| 24 | #else | ||
| 25 | return 0; | ||
| 26 | #endif | ||
| 27 | } | ||
| 28 | |||
| 29 | static inline int is_pae(struct kvm_vcpu *vcpu) | ||
| 30 | { | ||
| 31 | return vcpu->arch.cr4 & X86_CR4_PAE; | ||
| 32 | } | ||
| 33 | |||
| 34 | static inline int is_pse(struct kvm_vcpu *vcpu) | ||
| 35 | { | ||
| 36 | return vcpu->arch.cr4 & X86_CR4_PSE; | ||
| 37 | } | ||
| 38 | |||
| 39 | static inline int is_paging(struct kvm_vcpu *vcpu) | ||
| 40 | { | ||
| 41 | return vcpu->arch.cr0 & X86_CR0_PG; | ||
| 42 | } | ||
| 43 | |||
| 44 | #endif | ||
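Taken together, the predicates in this header describe which paging mode the guest is in. The helper below is illustrative only; the real decision is made by init_kvm_mmu, which is not part of this hunk, but it shows how the predicates compose:

	/* Illustrative only: map the CPU-state predicates onto the number
	 * of guest page-table levels the shadow code must mirror. */
	static int example_guest_levels(struct kvm_vcpu *vcpu)
	{
		if (!is_paging(vcpu))
			return 0;	/* nonpaging: no guest walk at all */
		if (is_long_mode(vcpu))
			return 4;	/* 64-bit PML4 walk */
		if (is_pae(vcpu))
			return 3;	/* PAE PDPT walk */
		return 2;		/* classic 32-bit two-level walk */
	}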
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h new file mode 100644 index 000000000000..03ba8608fe0f --- /dev/null +++ b/arch/x86/kvm/paging_tmpl.h | |||
| @@ -0,0 +1,484 @@ | |||
| 1 | /* | ||
| 2 | * Kernel-based Virtual Machine driver for Linux | ||
| 3 | * | ||
| 4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
| 5 | * machines without emulation or binary translation. | ||
| 6 | * | ||
| 7 | * MMU support | ||
| 8 | * | ||
| 9 | * Copyright (C) 2006 Qumranet, Inc. | ||
| 10 | * | ||
| 11 | * Authors: | ||
| 12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
| 13 | * Avi Kivity <avi@qumranet.com> | ||
| 14 | * | ||
| 15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
| 16 | * the COPYING file in the top-level directory. | ||
| 17 | * | ||
| 18 | */ | ||
| 19 | |||
| 20 | /* | ||
| 21 | * We need the mmu code to access both 32-bit and 64-bit guest ptes, | ||
| 22 | * so the code in this file is compiled twice, once per pte size. | ||
| 23 | */ | ||
| 24 | |||
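A sketch of how such a template is pulled in twice by the including file; mmu.c in this series does essentially this, though the include site itself is outside this hunk:

	/* Each pass defines PTTYPE first, so FNAME(walk_addr) expands to
	 * paging64_walk_addr on the first pass and paging32_walk_addr on
	 * the second. */
	#define PTTYPE 64
	#include "paging_tmpl.h"
	#undef PTTYPE

	#define PTTYPE 32
	#include "paging_tmpl.h"
	#undef PTTYPE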
| 25 | #if PTTYPE == 64 | ||
| 26 | #define pt_element_t u64 | ||
| 27 | #define guest_walker guest_walker64 | ||
| 28 | #define FNAME(name) paging##64_##name | ||
| 29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | ||
| 30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | ||
| 31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
| 32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
| 33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | ||
| 34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS | ||
| 35 | #ifdef CONFIG_X86_64 | ||
| 36 | #define PT_MAX_FULL_LEVELS 4 | ||
| 37 | #define CMPXCHG cmpxchg | ||
| 38 | #else | ||
| 39 | #define CMPXCHG cmpxchg64 | ||
| 40 | #define PT_MAX_FULL_LEVELS 2 | ||
| 41 | #endif | ||
| 42 | #elif PTTYPE == 32 | ||
| 43 | #define pt_element_t u32 | ||
| 44 | #define guest_walker guest_walker32 | ||
| 45 | #define FNAME(name) paging##32_##name | ||
| 46 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | ||
| 47 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | ||
| 48 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | ||
| 49 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
| 50 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | ||
| 51 | #define PT_LEVEL_BITS PT32_LEVEL_BITS | ||
| 52 | #define PT_MAX_FULL_LEVELS 2 | ||
| 53 | #define CMPXCHG cmpxchg | ||
| 54 | #else | ||
| 55 | #error Invalid PTTYPE value | ||
| 56 | #endif | ||
| 57 | |||
| 58 | #define gpte_to_gfn FNAME(gpte_to_gfn) | ||
| 59 | #define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) | ||
| 60 | |||
| 61 | /* | ||
| 62 | * The guest_walker structure emulates the behavior of the hardware page | ||
| 63 | * table walker. | ||
| 64 | */ | ||
| 65 | struct guest_walker { | ||
| 66 | int level; | ||
| 67 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; | ||
| 68 | pt_element_t ptes[PT_MAX_FULL_LEVELS]; | ||
| 69 | gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; | ||
| 70 | unsigned pt_access; | ||
| 71 | unsigned pte_access; | ||
| 72 | gfn_t gfn; | ||
| 73 | u32 error_code; | ||
| 74 | }; | ||
| 75 | |||
| 76 | static gfn_t gpte_to_gfn(pt_element_t gpte) | ||
| 77 | { | ||
| 78 | return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
| 79 | } | ||
| 80 | |||
| 81 | static gfn_t gpte_to_gfn_pde(pt_element_t gpte) | ||
| 82 | { | ||
| 83 | return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
| 84 | } | ||
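With an invented pte value, the mask-and-shift these helpers perform looks like this; PT64_BASE_ADDR_MASK is approximated with ~0xfff purely for the sketch:

	/* A gpte of 0x0000000012345067 keeps its flags in the low 12 bits;
	 * masking them off and shifting by PAGE_SHIFT (12) yields
	 * gfn 0x12345. */
	u64 gpte = 0x0000000012345067ull;
	gfn_t gfn = (gpte & ~0xfffull) >> 12;	/* 0x12345 */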
| 85 | |||
| 86 | static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, | ||
| 87 | gfn_t table_gfn, unsigned index, | ||
| 88 | pt_element_t orig_pte, pt_element_t new_pte) | ||
| 89 | { | ||
| 90 | pt_element_t ret; | ||
| 91 | pt_element_t *table; | ||
| 92 | struct page *page; | ||
| 93 | |||
| 94 | page = gfn_to_page(kvm, table_gfn); | ||
| 95 | table = kmap_atomic(page, KM_USER0); | ||
| 96 | |||
| 97 | ret = CMPXCHG(&table[index], orig_pte, new_pte); | ||
| 98 | |||
| 99 | kunmap_atomic(table, KM_USER0); | ||
| 100 | |||
| 101 | kvm_release_page_dirty(page); | ||
| 102 | |||
| 103 | return (ret != orig_pte); | ||
| 104 | } | ||
| 105 | |||
| 106 | static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) | ||
| 107 | { | ||
| 108 | unsigned access; | ||
| 109 | |||
| 110 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; | ||
| 111 | #if PTTYPE == 64 | ||
| 112 | if (is_nx(vcpu)) | ||
| 113 | access &= ~(gpte >> PT64_NX_SHIFT); | ||
| 114 | #endif | ||
| 115 | return access; | ||
| 116 | } | ||
| 117 | |||
| 118 | /* | ||
| 119 | * Fetch a guest pte for a guest virtual address | ||
| 120 | */ | ||
| 121 | static int FNAME(walk_addr)(struct guest_walker *walker, | ||
| 122 | struct kvm_vcpu *vcpu, gva_t addr, | ||
| 123 | int write_fault, int user_fault, int fetch_fault) | ||
| 124 | { | ||
| 125 | pt_element_t pte; | ||
| 126 | gfn_t table_gfn; | ||
| 127 | unsigned index, pt_access, pte_access; | ||
| 128 | gpa_t pte_gpa; | ||
| 129 | |||
| 130 | pgprintk("%s: addr %lx\n", __FUNCTION__, addr); | ||
| 131 | walk: | ||
| 132 | walker->level = vcpu->arch.mmu.root_level; | ||
| 133 | pte = vcpu->arch.cr3; | ||
| 134 | #if PTTYPE == 64 | ||
| 135 | if (!is_long_mode(vcpu)) { | ||
| 136 | pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; | ||
| 137 | if (!is_present_pte(pte)) | ||
| 138 | goto not_present; | ||
| 139 | --walker->level; | ||
| 140 | } | ||
| 141 | #endif | ||
| 142 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || | ||
| 143 | (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); | ||
| 144 | |||
| 145 | pt_access = ACC_ALL; | ||
| 146 | |||
| 147 | for (;;) { | ||
| 148 | index = PT_INDEX(addr, walker->level); | ||
| 149 | |||
| 150 | table_gfn = gpte_to_gfn(pte); | ||
| 151 | pte_gpa = gfn_to_gpa(table_gfn); | ||
| 152 | pte_gpa += index * sizeof(pt_element_t); | ||
| 153 | walker->table_gfn[walker->level - 1] = table_gfn; | ||
| 154 | walker->pte_gpa[walker->level - 1] = pte_gpa; | ||
| 155 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
| 156 | walker->level - 1, table_gfn); | ||
| 157 | |||
| 158 | kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); | ||
| 159 | |||
| 160 | if (!is_present_pte(pte)) | ||
| 161 | goto not_present; | ||
| 162 | |||
| 163 | if (write_fault && !is_writeble_pte(pte)) | ||
| 164 | if (user_fault || is_write_protection(vcpu)) | ||
| 165 | goto access_error; | ||
| 166 | |||
| 167 | if (user_fault && !(pte & PT_USER_MASK)) | ||
| 168 | goto access_error; | ||
| 169 | |||
| 170 | #if PTTYPE == 64 | ||
| 171 | if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) | ||
| 172 | goto access_error; | ||
| 173 | #endif | ||
| 174 | |||
| 175 | if (!(pte & PT_ACCESSED_MASK)) { | ||
| 176 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
| 177 | if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, | ||
| 178 | index, pte, pte|PT_ACCESSED_MASK)) | ||
| 179 | goto walk; | ||
| 180 | pte |= PT_ACCESSED_MASK; | ||
| 181 | } | ||
| 182 | |||
| 183 | pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); | ||
| 184 | |||
| 185 | walker->ptes[walker->level - 1] = pte; | ||
| 186 | |||
| 187 | if (walker->level == PT_PAGE_TABLE_LEVEL) { | ||
| 188 | walker->gfn = gpte_to_gfn(pte); | ||
| 189 | break; | ||
| 190 | } | ||
| 191 | |||
| 192 | if (walker->level == PT_DIRECTORY_LEVEL | ||
| 193 | && (pte & PT_PAGE_SIZE_MASK) | ||
| 194 | && (PTTYPE == 64 || is_pse(vcpu))) { | ||
| 195 | walker->gfn = gpte_to_gfn_pde(pte); | ||
| 196 | walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); | ||
| 197 | if (PTTYPE == 32 && is_cpuid_PSE36()) | ||
| 198 | walker->gfn += pse36_gfn_delta(pte); | ||
| 199 | break; | ||
| 200 | } | ||
| 201 | |||
| 202 | pt_access = pte_access; | ||
| 203 | --walker->level; | ||
| 204 | } | ||
| 205 | |||
| 206 | if (write_fault && !is_dirty_pte(pte)) { | ||
| 207 | bool ret; | ||
| 208 | |||
| 209 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
| 210 | ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, | ||
| 211 | pte|PT_DIRTY_MASK); | ||
| 212 | if (ret) | ||
| 213 | goto walk; | ||
| 214 | pte |= PT_DIRTY_MASK; | ||
| 215 | kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); | ||
| 216 | walker->ptes[walker->level - 1] = pte; | ||
| 217 | } | ||
| 218 | |||
| 219 | walker->pt_access = pt_access; | ||
| 220 | walker->pte_access = pte_access; | ||
| 221 | pgprintk("%s: pte %llx pte_access %x pt_access %x\n", | ||
| 222 | __FUNCTION__, (u64)pte, pt_access, pte_access); | ||
| 223 | return 1; | ||
| 224 | |||
| 225 | not_present: | ||
| 226 | walker->error_code = 0; | ||
| 227 | goto err; | ||
| 228 | |||
| 229 | access_error: | ||
| 230 | walker->error_code = PFERR_PRESENT_MASK; | ||
| 231 | |||
| 232 | err: | ||
| 233 | if (write_fault) | ||
| 234 | walker->error_code |= PFERR_WRITE_MASK; | ||
| 235 | if (user_fault) | ||
| 236 | walker->error_code |= PFERR_USER_MASK; | ||
| 237 | if (fetch_fault) | ||
| 238 | walker->error_code |= PFERR_FETCH_MASK; | ||
| 239 | return 0; | ||
| 240 | } | ||
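The loop above peels one level per iteration, indexing each table with PT_INDEX. For 64-bit ptes that index is just a 9-bit field of the address; the helper name below is invented for illustration:

	/* Each 4 KB table holds 512 entries, so level N consumes address
	 * bits [12 + 9*(N-1), 12 + 9*N). For addr 0x12345000, the level-1
	 * (page table) index is (addr >> 12) & 0x1ff = 0x145. */
	static inline unsigned example_pt64_index(unsigned long addr, int level)
	{
		return (addr >> (12 + 9 * (level - 1))) & 0x1ff;
	}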
| 241 | |||
| 242 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | ||
| 243 | u64 *spte, const void *pte, int bytes, | ||
| 244 | int offset_in_pte) | ||
| 245 | { | ||
| 246 | pt_element_t gpte; | ||
| 247 | unsigned pte_access; | ||
| 248 | struct page *npage; | ||
| 249 | |||
| 250 | gpte = *(const pt_element_t *)pte; | ||
| 251 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { | ||
| 252 | if (!offset_in_pte && !is_present_pte(gpte)) | ||
| 253 | set_shadow_pte(spte, shadow_notrap_nonpresent_pte); | ||
| 254 | return; | ||
| 255 | } | ||
| 256 | if (bytes < sizeof(pt_element_t)) | ||
| 257 | return; | ||
| 258 | pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); | ||
| 259 | pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); | ||
| 260 | if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) | ||
| 261 | return; | ||
| 262 | npage = vcpu->arch.update_pte.page; | ||
| 263 | if (!npage) | ||
| 264 | return; | ||
| 265 | get_page(npage); | ||
| 266 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, | ||
| 267 | gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage); | ||
| 268 | } | ||
| 269 | |||
| 270 | /* | ||
| 271 | * Fetch a shadow pte for a specific level in the paging hierarchy. | ||
| 272 | */ | ||
| 273 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | ||
| 274 | struct guest_walker *walker, | ||
| 275 | int user_fault, int write_fault, int *ptwrite, | ||
| 276 | struct page *page) | ||
| 277 | { | ||
| 278 | hpa_t shadow_addr; | ||
| 279 | int level; | ||
| 280 | u64 *shadow_ent; | ||
| 281 | unsigned access = walker->pt_access; | ||
| 282 | |||
| 283 | if (!is_present_pte(walker->ptes[walker->level - 1])) | ||
| 284 | return NULL; | ||
| 285 | |||
| 286 | shadow_addr = vcpu->arch.mmu.root_hpa; | ||
| 287 | level = vcpu->arch.mmu.shadow_root_level; | ||
| 288 | if (level == PT32E_ROOT_LEVEL) { | ||
| 289 | shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | ||
| 290 | shadow_addr &= PT64_BASE_ADDR_MASK; | ||
| 291 | --level; | ||
| 292 | } | ||
| 293 | |||
| 294 | for (; ; level--) { | ||
| 295 | u32 index = SHADOW_PT_INDEX(addr, level); | ||
| 296 | struct kvm_mmu_page *shadow_page; | ||
| 297 | u64 shadow_pte; | ||
| 298 | int metaphysical; | ||
| 299 | gfn_t table_gfn; | ||
| 300 | bool new_page = 0; | ||
| 301 | |||
| 302 | shadow_ent = ((u64 *)__va(shadow_addr)) + index; | ||
| 303 | if (level == PT_PAGE_TABLE_LEVEL) | ||
| 304 | break; | ||
| 305 | if (is_shadow_present_pte(*shadow_ent)) { | ||
| 306 | shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; | ||
| 307 | continue; | ||
| 308 | } | ||
| 309 | |||
| 310 | if (level - 1 == PT_PAGE_TABLE_LEVEL | ||
| 311 | && walker->level == PT_DIRECTORY_LEVEL) { | ||
| 312 | metaphysical = 1; | ||
| 313 | if (!is_dirty_pte(walker->ptes[level - 1])) | ||
| 314 | access &= ~ACC_WRITE_MASK; | ||
| 315 | table_gfn = gpte_to_gfn(walker->ptes[level - 1]); | ||
| 316 | } else { | ||
| 317 | metaphysical = 0; | ||
| 318 | table_gfn = walker->table_gfn[level - 2]; | ||
| 319 | } | ||
| 320 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | ||
| 321 | metaphysical, access, | ||
| 322 | shadow_ent, &new_page); | ||
| 323 | if (new_page && !metaphysical) { | ||
| 324 | int r; | ||
| 325 | pt_element_t curr_pte; | ||
| 326 | r = kvm_read_guest_atomic(vcpu->kvm, | ||
| 327 | walker->pte_gpa[level - 2], | ||
| 328 | &curr_pte, sizeof(curr_pte)); | ||
| 329 | if (r || curr_pte != walker->ptes[level - 2]) { | ||
| 330 | kvm_release_page_clean(page); | ||
| 331 | return NULL; | ||
| 332 | } | ||
| 333 | } | ||
| 334 | shadow_addr = __pa(shadow_page->spt); | ||
| 335 | shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | ||
| 336 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
| 337 | *shadow_ent = shadow_pte; | ||
| 338 | } | ||
| 339 | |||
| 340 | mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, | ||
| 341 | user_fault, write_fault, | ||
| 342 | walker->ptes[walker->level-1] & PT_DIRTY_MASK, | ||
| 343 | ptwrite, walker->gfn, page); | ||
| 344 | |||
| 345 | return shadow_ent; | ||
| 346 | } | ||
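When the shadow root is PT32E (three-level PAE), the top table is only four entries wide, so fetch() selects the pae_root slot with (addr >> 30) & 3 before entering the generic loop. With an invented address:

	/* addr 0xc0001000 has bits 31:30 == 3, so its translation hangs
	 * off the fourth PDPTE. */
	unsigned long addr = 0xc0001000ul;
	unsigned pdpte_index = (addr >> 30) & 3;	/* 3 */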
| 347 | |||
| 348 | /* | ||
| 349 | * Page fault handler. There are several causes for a page fault: | ||
| 350 | * - there is no shadow pte for the guest pte | ||
| 351 | * - write access through a shadow pte marked read only so that we can set | ||
| 352 | * the dirty bit | ||
| 353 | * - write access to a shadow pte marked read only so we can update the page | ||
| 354 | * dirty bitmap, when userspace requests it | ||
| 355 | * - mmio access; in this case we will never install a present shadow pte | ||
| 356 | * - normal guest page fault due to the guest pte marked not present, not | ||
| 357 | * writable, or not executable | ||
| 358 | * | ||
| 359 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or | ||
| 360 | * a negative value on error. | ||
| 361 | */ | ||
| 362 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | ||
| 363 | u32 error_code) | ||
| 364 | { | ||
| 365 | int write_fault = error_code & PFERR_WRITE_MASK; | ||
| 366 | int user_fault = error_code & PFERR_USER_MASK; | ||
| 367 | int fetch_fault = error_code & PFERR_FETCH_MASK; | ||
| 368 | struct guest_walker walker; | ||
| 369 | u64 *shadow_pte; | ||
| 370 | int write_pt = 0; | ||
| 371 | int r; | ||
| 372 | struct page *page; | ||
| 373 | |||
| 374 | pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); | ||
| 375 | kvm_mmu_audit(vcpu, "pre page fault"); | ||
| 376 | |||
| 377 | r = mmu_topup_memory_caches(vcpu); | ||
| 378 | if (r) | ||
| 379 | return r; | ||
| 380 | |||
| 381 | down_read(¤t->mm->mmap_sem); | ||
| 382 | /* | ||
| 383 | * Look up the guest pte for the faulting address. | ||
| 384 | */ | ||
| 385 | r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, | ||
| 386 | fetch_fault); | ||
| 387 | |||
| 388 | /* | ||
| 389 | * The page is not mapped by the guest. Let the guest handle it. | ||
| 390 | */ | ||
| 391 | if (!r) { | ||
| 392 | pgprintk("%s: guest page fault\n", __FUNCTION__); | ||
| 393 | inject_page_fault(vcpu, addr, walker.error_code); | ||
| 394 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | ||
| 395 | up_read(¤t->mm->mmap_sem); | ||
| 396 | return 0; | ||
| 397 | } | ||
| 398 | |||
| 399 | page = gfn_to_page(vcpu->kvm, walker.gfn); | ||
| 400 | |||
| 401 | spin_lock(&vcpu->kvm->mmu_lock); | ||
| 402 | kvm_mmu_free_some_pages(vcpu); | ||
| 403 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | ||
| 404 | &write_pt, page); | ||
| 405 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, | ||
| 406 | shadow_pte, *shadow_pte, write_pt); | ||
| 407 | |||
| 408 | if (!write_pt) | ||
| 409 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | ||
| 410 | |||
| 411 | /* | ||
| 412 | * mmio: emulate if accessible, otherwise it's a guest fault. | ||
| 413 | */ | ||
| 414 | if (shadow_pte && is_io_pte(*shadow_pte)) { | ||
| 415 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
| 416 | up_read(¤t->mm->mmap_sem); | ||
| 417 | return 1; | ||
| 418 | } | ||
| 419 | |||
| 420 | ++vcpu->stat.pf_fixed; | ||
| 421 | kvm_mmu_audit(vcpu, "post page fault (fixed)"); | ||
| 422 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
| 423 | up_read(¤t->mm->mmap_sem); | ||
| 424 | |||
| 425 | return write_pt; | ||
| 426 | } | ||
| 427 | |||
| 428 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
| 429 | { | ||
| 430 | struct guest_walker walker; | ||
| 431 | gpa_t gpa = UNMAPPED_GVA; | ||
| 432 | int r; | ||
| 433 | |||
| 434 | r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); | ||
| 435 | |||
| 436 | if (r) { | ||
| 437 | gpa = gfn_to_gpa(walker.gfn); | ||
| 438 | gpa |= vaddr & ~PAGE_MASK; | ||
| 439 | } | ||
| 440 | |||
| 441 | return gpa; | ||
| 442 | } | ||
| 443 | |||
| 444 | static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | ||
| 445 | struct kvm_mmu_page *sp) | ||
| 446 | { | ||
| 447 | int i, offset = 0, r = 0; | ||
| 448 | pt_element_t pt; | ||
| 449 | |||
| 450 | if (sp->role.metaphysical | ||
| 451 | || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { | ||
| 452 | nonpaging_prefetch_page(vcpu, sp); | ||
| 453 | return; | ||
| 454 | } | ||
| 455 | |||
| 456 | if (PTTYPE == 32) | ||
| 457 | offset = sp->role.quadrant << PT64_LEVEL_BITS; | ||
| 458 | |||
| 459 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
| 460 | gpa_t pte_gpa = gfn_to_gpa(sp->gfn); | ||
| 461 | pte_gpa += (i+offset) * sizeof(pt_element_t); | ||
| 462 | |||
| 463 | r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, | ||
| 464 | sizeof(pt_element_t)); | ||
| 465 | if (r || is_present_pte(pt)) | ||
| 466 | sp->spt[i] = shadow_trap_nonpresent_pte; | ||
| 467 | else | ||
| 468 | sp->spt[i] = shadow_notrap_nonpresent_pte; | ||
| 469 | } | ||
| 470 | } | ||
| 471 | |||
| 472 | #undef pt_element_t | ||
| 473 | #undef guest_walker | ||
| 474 | #undef FNAME | ||
| 475 | #undef PT_BASE_ADDR_MASK | ||
| 476 | #undef PT_INDEX | ||
| 477 | #undef SHADOW_PT_INDEX | ||
| 478 | #undef PT_LEVEL_MASK | ||
| 479 | #undef PT_DIR_BASE_ADDR_MASK | ||
| 480 | #undef PT_LEVEL_BITS | ||
| 481 | #undef PT_MAX_FULL_LEVELS | ||
| 482 | #undef gpte_to_gfn | ||
| 483 | #undef gpte_to_gfn_pde | ||
| 484 | #undef CMPXCHG | ||
diff --git a/drivers/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h index 71fdf458619a..56fc4c873389 100644 --- a/drivers/kvm/segment_descriptor.h +++ b/arch/x86/kvm/segment_descriptor.h | |||
| @@ -1,3 +1,6 @@ | |||
| 1 | #ifndef __SEGMENT_DESCRIPTOR_H | ||
| 2 | #define __SEGMENT_DESCRIPTOR_H | ||
| 3 | |||
| 1 | struct segment_descriptor { | 4 | struct segment_descriptor { |
| 2 | u16 limit_low; | 5 | u16 limit_low; |
| 3 | u16 base_low; | 6 | u16 base_low; |
| @@ -14,4 +17,13 @@ struct segment_descriptor { | |||
| 14 | u8 base_high; | 17 | u8 base_high; |
| 15 | } __attribute__((packed)); | 18 | } __attribute__((packed)); |
| 16 | 19 | ||
| 20 | #ifdef CONFIG_X86_64 | ||
| 21 | /* LDT or TSS descriptor in the GDT. 16 bytes. */ | ||
| 22 | struct segment_descriptor_64 { | ||
| 23 | struct segment_descriptor s; | ||
| 24 | u32 base_higher; | ||
| 25 | u32 pad_zero; | ||
| 26 | }; | ||
| 17 | 27 | ||
| 28 | #endif | ||
| 29 | #endif | ||
diff --git a/drivers/kvm/svm.c b/arch/x86/kvm/svm.c index ced4ac1955db..de755cb1431d 100644 --- a/drivers/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
| @@ -13,10 +13,11 @@ | |||
| 13 | * the COPYING file in the top-level directory. | 13 | * the COPYING file in the top-level directory. |
| 14 | * | 14 | * |
| 15 | */ | 15 | */ |
| 16 | #include <linux/kvm_host.h> | ||
| 16 | 17 | ||
| 17 | #include "kvm_svm.h" | 18 | #include "kvm_svm.h" |
| 18 | #include "x86_emulate.h" | ||
| 19 | #include "irq.h" | 19 | #include "irq.h" |
| 20 | #include "mmu.h" | ||
| 20 | 21 | ||
| 21 | #include <linux/module.h> | 22 | #include <linux/module.h> |
| 22 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
| @@ -42,9 +43,6 @@ MODULE_LICENSE("GPL"); | |||
| 42 | #define SEG_TYPE_LDT 2 | 43 | #define SEG_TYPE_LDT 2 |
| 43 | #define SEG_TYPE_BUSY_TSS16 3 | 44 | #define SEG_TYPE_BUSY_TSS16 3 |
| 44 | 45 | ||
| 45 | #define KVM_EFER_LMA (1 << 10) | ||
| 46 | #define KVM_EFER_LME (1 << 8) | ||
| 47 | |||
| 48 | #define SVM_FEATURE_NPT (1 << 0) | 46 | #define SVM_FEATURE_NPT (1 << 0) |
| 49 | #define SVM_FEATURE_LBRV (1 << 1) | 47 | #define SVM_FEATURE_LBRV (1 << 1) |
| 50 | #define SVM_DEATURE_SVML (1 << 2) | 48 | #define SVM_DEATURE_SVML (1 << 2) |
| @@ -102,20 +100,20 @@ static inline u32 svm_has(u32 feat) | |||
| 102 | 100 | ||
| 103 | static inline u8 pop_irq(struct kvm_vcpu *vcpu) | 101 | static inline u8 pop_irq(struct kvm_vcpu *vcpu) |
| 104 | { | 102 | { |
| 105 | int word_index = __ffs(vcpu->irq_summary); | 103 | int word_index = __ffs(vcpu->arch.irq_summary); |
| 106 | int bit_index = __ffs(vcpu->irq_pending[word_index]); | 104 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); |
| 107 | int irq = word_index * BITS_PER_LONG + bit_index; | 105 | int irq = word_index * BITS_PER_LONG + bit_index; |
| 108 | 106 | ||
| 109 | clear_bit(bit_index, &vcpu->irq_pending[word_index]); | 107 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); |
| 110 | if (!vcpu->irq_pending[word_index]) | 108 | if (!vcpu->arch.irq_pending[word_index]) |
| 111 | clear_bit(word_index, &vcpu->irq_summary); | 109 | clear_bit(word_index, &vcpu->arch.irq_summary); |
| 112 | return irq; | 110 | return irq; |
| 113 | } | 111 | } |
| 114 | 112 | ||
| 115 | static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) | 113 | static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) |
| 116 | { | 114 | { |
| 117 | set_bit(irq, vcpu->irq_pending); | 115 | set_bit(irq, vcpu->arch.irq_pending); |
| 118 | set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); | 116 | set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); |
| 119 | } | 117 | } |
| 120 | 118 | ||
| 121 | static inline void clgi(void) | 119 | static inline void clgi(void) |
| @@ -184,35 +182,30 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) | |||
| 184 | 182 | ||
| 185 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | 183 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) |
| 186 | { | 184 | { |
| 187 | if (!(efer & KVM_EFER_LMA)) | 185 | if (!(efer & EFER_LMA)) |
| 188 | efer &= ~KVM_EFER_LME; | 186 | efer &= ~EFER_LME; |
| 189 | 187 | ||
| 190 | to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; | 188 | to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; |
| 191 | vcpu->shadow_efer = efer; | 189 | vcpu->arch.shadow_efer = efer; |
| 192 | } | 190 | } |
| 193 | 191 | ||
| 194 | static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) | 192 | static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
| 193 | bool has_error_code, u32 error_code) | ||
| 195 | { | 194 | { |
| 196 | struct vcpu_svm *svm = to_svm(vcpu); | 195 | struct vcpu_svm *svm = to_svm(vcpu); |
| 197 | 196 | ||
| 198 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | 197 | svm->vmcb->control.event_inj = nr |
| 199 | SVM_EVTINJ_VALID_ERR | | 198 | | SVM_EVTINJ_VALID |
| 200 | SVM_EVTINJ_TYPE_EXEPT | | 199 | | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) |
| 201 | GP_VECTOR; | 200 | | SVM_EVTINJ_TYPE_EXEPT; |
| 202 | svm->vmcb->control.event_inj_err = error_code; | 201 | svm->vmcb->control.event_inj_err = error_code; |
| 203 | } | 202 | } |
| 204 | 203 | ||
| 205 | static void inject_ud(struct kvm_vcpu *vcpu) | 204 | static bool svm_exception_injected(struct kvm_vcpu *vcpu) |
| 206 | { | 205 | { |
| 207 | to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID | | 206 | struct vcpu_svm *svm = to_svm(vcpu); |
| 208 | SVM_EVTINJ_TYPE_EXEPT | | ||
| 209 | UD_VECTOR; | ||
| 210 | } | ||
| 211 | 207 | ||
| 212 | static int is_page_fault(uint32_t info) | 208 | return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); |
| 213 | { | ||
| 214 | info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; | ||
| 215 | return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT); | ||
| 216 | } | 209 | } |
| 217 | 210 | ||
| 218 | static int is_external_interrupt(u32 info) | 211 | static int is_external_interrupt(u32 info) |
| @@ -229,17 +222,16 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
| 229 | printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); | 222 | printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); |
| 230 | return; | 223 | return; |
| 231 | } | 224 | } |
| 232 | if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) { | 225 | if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) |
| 233 | printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", | 226 | printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", |
| 234 | __FUNCTION__, | 227 | __FUNCTION__, |
| 235 | svm->vmcb->save.rip, | 228 | svm->vmcb->save.rip, |
| 236 | svm->next_rip); | 229 | svm->next_rip); |
| 237 | } | ||
| 238 | 230 | ||
| 239 | vcpu->rip = svm->vmcb->save.rip = svm->next_rip; | 231 | vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; |
| 240 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; | 232 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; |
| 241 | 233 | ||
| 242 | vcpu->interrupt_window_open = 1; | 234 | vcpu->arch.interrupt_window_open = 1; |
| 243 | } | 235 | } |
| 244 | 236 | ||
| 245 | static int has_svm(void) | 237 | static int has_svm(void) |
| @@ -312,7 +304,7 @@ static void svm_hardware_enable(void *garbage) | |||
| 312 | svm_data->next_asid = svm_data->max_asid + 1; | 304 | svm_data->next_asid = svm_data->max_asid + 1; |
| 313 | svm_features = cpuid_edx(SVM_CPUID_FUNC); | 305 | svm_features = cpuid_edx(SVM_CPUID_FUNC); |
| 314 | 306 | ||
| 315 | asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); | 307 | asm volatile ("sgdt %0" : "=m"(gdt_descr)); |
| 316 | gdt = (struct desc_struct *)gdt_descr.address; | 308 | gdt = (struct desc_struct *)gdt_descr.address; |
| 317 | svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); | 309 | svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); |
| 318 | 310 | ||
| @@ -458,11 +450,13 @@ static void init_vmcb(struct vmcb *vmcb) | |||
| 458 | 450 | ||
| 459 | control->intercept_cr_read = INTERCEPT_CR0_MASK | | 451 | control->intercept_cr_read = INTERCEPT_CR0_MASK | |
| 460 | INTERCEPT_CR3_MASK | | 452 | INTERCEPT_CR3_MASK | |
| 461 | INTERCEPT_CR4_MASK; | 453 | INTERCEPT_CR4_MASK | |
| 454 | INTERCEPT_CR8_MASK; | ||
| 462 | 455 | ||
| 463 | control->intercept_cr_write = INTERCEPT_CR0_MASK | | 456 | control->intercept_cr_write = INTERCEPT_CR0_MASK | |
| 464 | INTERCEPT_CR3_MASK | | 457 | INTERCEPT_CR3_MASK | |
| 465 | INTERCEPT_CR4_MASK; | 458 | INTERCEPT_CR4_MASK | |
| 459 | INTERCEPT_CR8_MASK; | ||
| 466 | 460 | ||
| 467 | control->intercept_dr_read = INTERCEPT_DR0_MASK | | 461 | control->intercept_dr_read = INTERCEPT_DR0_MASK | |
| 468 | INTERCEPT_DR1_MASK | | 462 | INTERCEPT_DR1_MASK | |
| @@ -476,7 +470,8 @@ static void init_vmcb(struct vmcb *vmcb) | |||
| 476 | INTERCEPT_DR5_MASK | | 470 | INTERCEPT_DR5_MASK | |
| 477 | INTERCEPT_DR7_MASK; | 471 | INTERCEPT_DR7_MASK; |
| 478 | 472 | ||
| 479 | control->intercept_exceptions = 1 << PF_VECTOR; | 473 | control->intercept_exceptions = (1 << PF_VECTOR) | |
| 474 | (1 << UD_VECTOR); | ||
| 480 | 475 | ||
| 481 | 476 | ||
| 482 | control->intercept = (1ULL << INTERCEPT_INTR) | | 477 | control->intercept = (1ULL << INTERCEPT_INTR) | |
| @@ -543,8 +538,7 @@ static void init_vmcb(struct vmcb *vmcb) | |||
| 543 | init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); | 538 | init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); |
| 544 | 539 | ||
| 545 | save->efer = MSR_EFER_SVME_MASK; | 540 | save->efer = MSR_EFER_SVME_MASK; |
| 546 | 541 | save->dr6 = 0xffff0ff0; | |
| 547 | save->dr6 = 0xffff0ff0; | ||
| 548 | save->dr7 = 0x400; | 542 | save->dr7 = 0x400; |
| 549 | save->rflags = 2; | 543 | save->rflags = 2; |
| 550 | save->rip = 0x0000fff0; | 544 | save->rip = 0x0000fff0; |
| @@ -558,7 +552,7 @@ static void init_vmcb(struct vmcb *vmcb) | |||
| 558 | /* rdx = ?? */ | 552 | /* rdx = ?? */ |
| 559 | } | 553 | } |
| 560 | 554 | ||
| 561 | static void svm_vcpu_reset(struct kvm_vcpu *vcpu) | 555 | static int svm_vcpu_reset(struct kvm_vcpu *vcpu) |
| 562 | { | 556 | { |
| 563 | struct vcpu_svm *svm = to_svm(vcpu); | 557 | struct vcpu_svm *svm = to_svm(vcpu); |
| 564 | 558 | ||
| @@ -566,9 +560,11 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu) | |||
| 566 | 560 | ||
| 567 | if (vcpu->vcpu_id != 0) { | 561 | if (vcpu->vcpu_id != 0) { |
| 568 | svm->vmcb->save.rip = 0; | 562 | svm->vmcb->save.rip = 0; |
| 569 | svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12; | 563 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; |
| 570 | svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8; | 564 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; |
| 571 | } | 565 | } |
| 566 | |||
| 567 | return 0; | ||
| 572 | } | 568 | } |
| 573 | 569 | ||
| 574 | static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | 570 | static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) |
| @@ -587,12 +583,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
| 587 | if (err) | 583 | if (err) |
| 588 | goto free_svm; | 584 | goto free_svm; |
| 589 | 585 | ||
| 590 | if (irqchip_in_kernel(kvm)) { | ||
| 591 | err = kvm_create_lapic(&svm->vcpu); | ||
| 592 | if (err < 0) | ||
| 593 | goto free_svm; | ||
| 594 | } | ||
| 595 | |||
| 596 | page = alloc_page(GFP_KERNEL); | 586 | page = alloc_page(GFP_KERNEL); |
| 597 | if (!page) { | 587 | if (!page) { |
| 598 | err = -ENOMEM; | 588 | err = -ENOMEM; |
| @@ -608,9 +598,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
| 608 | 598 | ||
| 609 | fx_init(&svm->vcpu); | 599 | fx_init(&svm->vcpu); |
| 610 | svm->vcpu.fpu_active = 1; | 600 | svm->vcpu.fpu_active = 1; |
| 611 | svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | 601 | svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; |
| 612 | if (svm->vcpu.vcpu_id == 0) | 602 | if (svm->vcpu.vcpu_id == 0) |
| 613 | svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP; | 603 | svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; |
| 614 | 604 | ||
| 615 | return &svm->vcpu; | 605 | return &svm->vcpu; |
| 616 | 606 | ||
| @@ -644,7 +634,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
| 644 | * increasing TSC. | 634 | * increasing TSC. |
| 645 | */ | 635 | */ |
| 646 | rdtscll(tsc_this); | 636 | rdtscll(tsc_this); |
| 647 | delta = vcpu->host_tsc - tsc_this; | 637 | delta = vcpu->arch.host_tsc - tsc_this; |
| 648 | svm->vmcb->control.tsc_offset += delta; | 638 | svm->vmcb->control.tsc_offset += delta; |
| 649 | vcpu->cpu = cpu; | 639 | vcpu->cpu = cpu; |
| 650 | kvm_migrate_apic_timer(vcpu); | 640 | kvm_migrate_apic_timer(vcpu); |
| @@ -659,11 +649,11 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) | |||
| 659 | struct vcpu_svm *svm = to_svm(vcpu); | 649 | struct vcpu_svm *svm = to_svm(vcpu); |
| 660 | int i; | 650 | int i; |
| 661 | 651 | ||
| 652 | ++vcpu->stat.host_state_reload; | ||
| 662 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | 653 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) |
| 663 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); | 654 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); |
| 664 | 655 | ||
| 665 | rdtscll(vcpu->host_tsc); | 656 | rdtscll(vcpu->arch.host_tsc); |
| 666 | kvm_put_guest_fpu(vcpu); | ||
| 667 | } | 657 | } |
| 668 | 658 | ||
| 669 | static void svm_vcpu_decache(struct kvm_vcpu *vcpu) | 659 | static void svm_vcpu_decache(struct kvm_vcpu *vcpu) |
| @@ -674,17 +664,17 @@ static void svm_cache_regs(struct kvm_vcpu *vcpu) | |||
| 674 | { | 664 | { |
| 675 | struct vcpu_svm *svm = to_svm(vcpu); | 665 | struct vcpu_svm *svm = to_svm(vcpu); |
| 676 | 666 | ||
| 677 | vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | 667 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; |
| 678 | vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | 668 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; |
| 679 | vcpu->rip = svm->vmcb->save.rip; | 669 | vcpu->arch.rip = svm->vmcb->save.rip; |
| 680 | } | 670 | } |
| 681 | 671 | ||
| 682 | static void svm_decache_regs(struct kvm_vcpu *vcpu) | 672 | static void svm_decache_regs(struct kvm_vcpu *vcpu) |
| 683 | { | 673 | { |
| 684 | struct vcpu_svm *svm = to_svm(vcpu); | 674 | struct vcpu_svm *svm = to_svm(vcpu); |
| 685 | svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; | 675 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; |
| 686 | svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; | 676 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; |
| 687 | svm->vmcb->save.rip = vcpu->rip; | 677 | svm->vmcb->save.rip = vcpu->arch.rip; |
| 688 | } | 678 | } |
| 689 | 679 | ||
| 690 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) | 680 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) |
| @@ -782,24 +772,24 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
| 782 | struct vcpu_svm *svm = to_svm(vcpu); | 772 | struct vcpu_svm *svm = to_svm(vcpu); |
| 783 | 773 | ||
| 784 | #ifdef CONFIG_X86_64 | 774 | #ifdef CONFIG_X86_64 |
| 785 | if (vcpu->shadow_efer & KVM_EFER_LME) { | 775 | if (vcpu->arch.shadow_efer & EFER_LME) { |
| 786 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { | 776 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { |
| 787 | vcpu->shadow_efer |= KVM_EFER_LMA; | 777 | vcpu->arch.shadow_efer |= EFER_LMA; |
| 788 | svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; | 778 | svm->vmcb->save.efer |= EFER_LMA | EFER_LME; |
| 789 | } | 779 | } |
| 790 | 780 | ||
| 791 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) { | 781 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { |
| 792 | vcpu->shadow_efer &= ~KVM_EFER_LMA; | 782 | vcpu->arch.shadow_efer &= ~EFER_LMA; |
| 793 | svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); | 783 | svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); |
| 794 | } | 784 | } |
| 795 | } | 785 | } |
| 796 | #endif | 786 | #endif |
| 797 | if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { | 787 | if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { |
| 798 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); | 788 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); |
| 799 | vcpu->fpu_active = 1; | 789 | vcpu->fpu_active = 1; |
| 800 | } | 790 | } |
| 801 | 791 | ||
| 802 | vcpu->cr0 = cr0; | 792 | vcpu->arch.cr0 = cr0; |
| 803 | cr0 |= X86_CR0_PG | X86_CR0_WP; | 793 | cr0 |= X86_CR0_PG | X86_CR0_WP; |
| 804 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); | 794 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); |
| 805 | svm->vmcb->save.cr0 = cr0; | 795 | svm->vmcb->save.cr0 = cr0; |
| @@ -807,7 +797,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
| 807 | 797 | ||
| 808 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 798 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
| 809 | { | 799 | { |
| 810 | vcpu->cr4 = cr4; | 800 | vcpu->arch.cr4 = cr4; |
| 811 | to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; | 801 | to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; |
| 812 | } | 802 | } |
| 813 | 803 | ||
| @@ -912,7 +902,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, | |||
| 912 | svm->db_regs[dr] = value; | 902 | svm->db_regs[dr] = value; |
| 913 | return; | 903 | return; |
| 914 | case 4 ... 5: | 904 | case 4 ... 5: |
| 915 | if (vcpu->cr4 & X86_CR4_DE) { | 905 | if (vcpu->arch.cr4 & X86_CR4_DE) { |
| 916 | *exception = UD_VECTOR; | 906 | *exception = UD_VECTOR; |
| 917 | return; | 907 | return; |
| 918 | } | 908 | } |
| @@ -938,51 +928,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 938 | struct kvm *kvm = svm->vcpu.kvm; | 928 | struct kvm *kvm = svm->vcpu.kvm; |
| 939 | u64 fault_address; | 929 | u64 fault_address; |
| 940 | u32 error_code; | 930 | u32 error_code; |
| 941 | enum emulation_result er; | ||
| 942 | int r; | ||
| 943 | 931 | ||
| 944 | if (!irqchip_in_kernel(kvm) && | 932 | if (!irqchip_in_kernel(kvm) && |
| 945 | is_external_interrupt(exit_int_info)) | 933 | is_external_interrupt(exit_int_info)) |
| 946 | push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); | 934 | push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); |
| 947 | 935 | ||
| 948 | mutex_lock(&kvm->lock); | ||
| 949 | |||
| 950 | fault_address = svm->vmcb->control.exit_info_2; | 936 | fault_address = svm->vmcb->control.exit_info_2; |
| 951 | error_code = svm->vmcb->control.exit_info_1; | 937 | error_code = svm->vmcb->control.exit_info_1; |
| 952 | r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); | 938 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); |
| 953 | if (r < 0) { | 939 | } |
| 954 | mutex_unlock(&kvm->lock); | ||
| 955 | return r; | ||
| 956 | } | ||
| 957 | if (!r) { | ||
| 958 | mutex_unlock(&kvm->lock); | ||
| 959 | return 1; | ||
| 960 | } | ||
| 961 | er = emulate_instruction(&svm->vcpu, kvm_run, fault_address, | ||
| 962 | error_code); | ||
| 963 | mutex_unlock(&kvm->lock); | ||
| 964 | 940 | ||
| 965 | switch (er) { | 941 | static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
| 966 | case EMULATE_DONE: | 942 | { |
| 967 | return 1; | 943 | int er; |
| 968 | case EMULATE_DO_MMIO: | ||
| 969 | ++svm->vcpu.stat.mmio_exits; | ||
| 970 | return 0; | ||
| 971 | case EMULATE_FAIL: | ||
| 972 | kvm_report_emulation_failure(&svm->vcpu, "pagetable"); | ||
| 973 | break; | ||
| 974 | default: | ||
| 975 | BUG(); | ||
| 976 | } | ||
| 977 | 944 | ||
| 978 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | 945 | er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); |
| 979 | return 0; | 946 | if (er != EMULATE_DONE) |
| 947 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | ||
| 948 | return 1; | ||
| 980 | } | 949 | } |
| 981 | 950 | ||
| 982 | static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 951 | static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
| 983 | { | 952 | { |
| 984 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); | 953 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); |
| 985 | if (!(svm->vcpu.cr0 & X86_CR0_TS)) | 954 | if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) |
| 986 | svm->vmcb->save.cr0 &= ~X86_CR0_TS; | 955 | svm->vmcb->save.cr0 &= ~X86_CR0_TS; |
| 987 | svm->vcpu.fpu_active = 1; | 956 | svm->vcpu.fpu_active = 1; |
| 988 | 957 | ||
| @@ -1004,7 +973,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 1004 | 973 | ||
| 1005 | static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 974 | static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
| 1006 | { | 975 | { |
| 1007 | u32 io_info = svm->vmcb->control.exit_info_1; //address size bug? | 976 | u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ |
| 1008 | int size, down, in, string, rep; | 977 | int size, down, in, string, rep; |
| 1009 | unsigned port; | 978 | unsigned port; |
| 1010 | 979 | ||
| @@ -1015,7 +984,8 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 1015 | string = (io_info & SVM_IOIO_STR_MASK) != 0; | 984 | string = (io_info & SVM_IOIO_STR_MASK) != 0; |
| 1016 | 985 | ||
| 1017 | if (string) { | 986 | if (string) { |
| 1018 | if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) | 987 | if (emulate_instruction(&svm->vcpu, |
| 988 | kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) | ||
| 1019 | return 0; | 989 | return 0; |
| 1020 | return 1; | 990 | return 1; |
| 1021 | } | 991 | } |
| @@ -1045,13 +1015,14 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 1045 | { | 1015 | { |
| 1046 | svm->next_rip = svm->vmcb->save.rip + 3; | 1016 | svm->next_rip = svm->vmcb->save.rip + 3; |
| 1047 | skip_emulated_instruction(&svm->vcpu); | 1017 | skip_emulated_instruction(&svm->vcpu); |
| 1048 | return kvm_hypercall(&svm->vcpu, kvm_run); | 1018 | kvm_emulate_hypercall(&svm->vcpu); |
| 1019 | return 1; | ||
| 1049 | } | 1020 | } |
| 1050 | 1021 | ||
| 1051 | static int invalid_op_interception(struct vcpu_svm *svm, | 1022 | static int invalid_op_interception(struct vcpu_svm *svm, |
| 1052 | struct kvm_run *kvm_run) | 1023 | struct kvm_run *kvm_run) |
| 1053 | { | 1024 | { |
| 1054 | inject_ud(&svm->vcpu); | 1025 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); |
| 1055 | return 1; | 1026 | return 1; |
| 1056 | } | 1027 | } |
| 1057 | 1028 | ||
| @@ -1073,11 +1044,20 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 1073 | static int emulate_on_interception(struct vcpu_svm *svm, | 1044 | static int emulate_on_interception(struct vcpu_svm *svm, |
| 1074 | struct kvm_run *kvm_run) | 1045 | struct kvm_run *kvm_run) |
| 1075 | { | 1046 | { |
| 1076 | if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE) | 1047 | if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) |
| 1077 | pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); | 1048 | pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); |
| 1078 | return 1; | 1049 | return 1; |
| 1079 | } | 1050 | } |
| 1080 | 1051 | ||
| 1052 | static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
| 1053 | { | ||
| 1054 | emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); | ||
| 1055 | if (irqchip_in_kernel(svm->vcpu.kvm)) | ||
| 1056 | return 1; | ||
| 1057 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; | ||
| 1058 | return 0; | ||
| 1059 | } | ||
| 1060 | |||
| 1081 | static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | 1061 | static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) |
| 1082 | { | 1062 | { |
| 1083 | struct vcpu_svm *svm = to_svm(vcpu); | 1063 | struct vcpu_svm *svm = to_svm(vcpu); |
| @@ -1124,14 +1104,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | |||
| 1124 | 1104 | ||
| 1125 | static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1105 | static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
| 1126 | { | 1106 | { |
| 1127 | u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; | 1107 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; |
| 1128 | u64 data; | 1108 | u64 data; |
| 1129 | 1109 | ||
| 1130 | if (svm_get_msr(&svm->vcpu, ecx, &data)) | 1110 | if (svm_get_msr(&svm->vcpu, ecx, &data)) |
| 1131 | svm_inject_gp(&svm->vcpu, 0); | 1111 | kvm_inject_gp(&svm->vcpu, 0); |
| 1132 | else { | 1112 | else { |
| 1133 | svm->vmcb->save.rax = data & 0xffffffff; | 1113 | svm->vmcb->save.rax = data & 0xffffffff; |
| 1134 | svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32; | 1114 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; |
| 1135 | svm->next_rip = svm->vmcb->save.rip + 2; | 1115 | svm->next_rip = svm->vmcb->save.rip + 2; |
| 1136 | skip_emulated_instruction(&svm->vcpu); | 1116 | skip_emulated_instruction(&svm->vcpu); |
| 1137 | } | 1117 | } |
| @@ -1176,7 +1156,20 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
| 1176 | case MSR_IA32_SYSENTER_ESP: | 1156 | case MSR_IA32_SYSENTER_ESP: |
| 1177 | svm->vmcb->save.sysenter_esp = data; | 1157 | svm->vmcb->save.sysenter_esp = data; |
| 1178 | break; | 1158 | break; |
| 1159 | case MSR_K7_EVNTSEL0: | ||
| 1160 | case MSR_K7_EVNTSEL1: | ||
| 1161 | case MSR_K7_EVNTSEL2: | ||
| 1162 | case MSR_K7_EVNTSEL3: | ||
| 1163 | /* | ||
| 1164 | * Only writing 0 to the performance counters is supported | ||
| 1165 | * for now, to keep Windows happy. This should be replaced by | ||
| 1166 | * real performance counter emulation later. | ||
| 1167 | */ | ||
| 1168 | if (data != 0) | ||
| 1169 | goto unhandled; | ||
| 1170 | break; | ||
| 1179 | default: | 1171 | default: |
| 1172 | unhandled: | ||
| 1180 | return kvm_set_msr_common(vcpu, ecx, data); | 1173 | return kvm_set_msr_common(vcpu, ecx, data); |
| 1181 | } | 1174 | } |
| 1182 | return 0; | 1175 | return 0; |
| @@ -1184,12 +1177,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
| 1184 | 1177 | ||
| 1185 | static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1178 | static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
| 1186 | { | 1179 | { |
| 1187 | u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; | 1180 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; |
| 1188 | u64 data = (svm->vmcb->save.rax & -1u) | 1181 | u64 data = (svm->vmcb->save.rax & -1u) |
| 1189 | | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32); | 1182 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
| 1190 | svm->next_rip = svm->vmcb->save.rip + 2; | 1183 | svm->next_rip = svm->vmcb->save.rip + 2; |
| 1191 | if (svm_set_msr(&svm->vcpu, ecx, data)) | 1184 | if (svm_set_msr(&svm->vcpu, ecx, data)) |
| 1192 | svm_inject_gp(&svm->vcpu, 0); | 1185 | kvm_inject_gp(&svm->vcpu, 0); |
| 1193 | else | 1186 | else |
| 1194 | skip_emulated_instruction(&svm->vcpu); | 1187 | skip_emulated_instruction(&svm->vcpu); |
| 1195 | return 1; | 1188 | return 1; |
| @@ -1213,7 +1206,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm, | |||
| 1213 | * possible | 1206 | * possible |
| 1214 | */ | 1207 | */ |
| 1215 | if (kvm_run->request_interrupt_window && | 1208 | if (kvm_run->request_interrupt_window && |
| 1216 | !svm->vcpu.irq_summary) { | 1209 | !svm->vcpu.arch.irq_summary) { |
| 1217 | ++svm->vcpu.stat.irq_window_exits; | 1210 | ++svm->vcpu.stat.irq_window_exits; |
| 1218 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | 1211 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; |
| 1219 | return 0; | 1212 | return 0; |
| @@ -1227,10 +1220,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | |||
| 1227 | [SVM_EXIT_READ_CR0] = emulate_on_interception, | 1220 | [SVM_EXIT_READ_CR0] = emulate_on_interception, |
| 1228 | [SVM_EXIT_READ_CR3] = emulate_on_interception, | 1221 | [SVM_EXIT_READ_CR3] = emulate_on_interception, |
| 1229 | [SVM_EXIT_READ_CR4] = emulate_on_interception, | 1222 | [SVM_EXIT_READ_CR4] = emulate_on_interception, |
| 1223 | [SVM_EXIT_READ_CR8] = emulate_on_interception, | ||
| 1230 | /* for now: */ | 1224 | /* for now: */ |
| 1231 | [SVM_EXIT_WRITE_CR0] = emulate_on_interception, | 1225 | [SVM_EXIT_WRITE_CR0] = emulate_on_interception, |
| 1232 | [SVM_EXIT_WRITE_CR3] = emulate_on_interception, | 1226 | [SVM_EXIT_WRITE_CR3] = emulate_on_interception, |
| 1233 | [SVM_EXIT_WRITE_CR4] = emulate_on_interception, | 1227 | [SVM_EXIT_WRITE_CR4] = emulate_on_interception, |
| 1228 | [SVM_EXIT_WRITE_CR8] = cr8_write_interception, | ||
| 1234 | [SVM_EXIT_READ_DR0] = emulate_on_interception, | 1229 | [SVM_EXIT_READ_DR0] = emulate_on_interception, |
| 1235 | [SVM_EXIT_READ_DR1] = emulate_on_interception, | 1230 | [SVM_EXIT_READ_DR1] = emulate_on_interception, |
| 1236 | [SVM_EXIT_READ_DR2] = emulate_on_interception, | 1231 | [SVM_EXIT_READ_DR2] = emulate_on_interception, |
| @@ -1241,6 +1236,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | |||
| 1241 | [SVM_EXIT_WRITE_DR3] = emulate_on_interception, | 1236 | [SVM_EXIT_WRITE_DR3] = emulate_on_interception, |
| 1242 | [SVM_EXIT_WRITE_DR5] = emulate_on_interception, | 1237 | [SVM_EXIT_WRITE_DR5] = emulate_on_interception, |
| 1243 | [SVM_EXIT_WRITE_DR7] = emulate_on_interception, | 1238 | [SVM_EXIT_WRITE_DR7] = emulate_on_interception, |
| 1239 | [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, | ||
| 1244 | [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, | 1240 | [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, |
| 1245 | [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, | 1241 | [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, |
| 1246 | [SVM_EXIT_INTR] = nop_on_interception, | 1242 | [SVM_EXIT_INTR] = nop_on_interception, |
| @@ -1293,7 +1289,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
| 1293 | exit_code); | 1289 | exit_code); |
| 1294 | 1290 | ||
| 1295 | if (exit_code >= ARRAY_SIZE(svm_exit_handlers) | 1291 | if (exit_code >= ARRAY_SIZE(svm_exit_handlers) |
| 1296 | || svm_exit_handlers[exit_code] == 0) { | 1292 | || !svm_exit_handlers[exit_code]) { |
| 1297 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | 1293 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; |
| 1298 | kvm_run->hw.hardware_exit_reason = exit_code; | 1294 | kvm_run->hw.hardware_exit_reason = exit_code; |
| 1299 | return 0; | 1295 | return 0; |
| @@ -1307,7 +1303,7 @@ static void reload_tss(struct kvm_vcpu *vcpu) | |||
| 1307 | int cpu = raw_smp_processor_id(); | 1303 | int cpu = raw_smp_processor_id(); |
| 1308 | 1304 | ||
| 1309 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); | 1305 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); |
| 1310 | svm_data->tss_desc->type = 9; //available 32/64-bit TSS | 1306 | svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ |
| 1311 | load_TR_desc(); | 1307 | load_TR_desc(); |
| 1312 | } | 1308 | } |
| 1313 | 1309 | ||
| @@ -1348,7 +1344,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) | |||
| 1348 | struct vmcb *vmcb = svm->vmcb; | 1344 | struct vmcb *vmcb = svm->vmcb; |
| 1349 | int intr_vector = -1; | 1345 | int intr_vector = -1; |
| 1350 | 1346 | ||
| 1351 | kvm_inject_pending_timer_irqs(vcpu); | ||
| 1352 | if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && | 1347 | if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && |
| 1353 | ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { | 1348 | ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { |
| 1354 | intr_vector = vmcb->control.exit_int_info & | 1349 | intr_vector = vmcb->control.exit_int_info & |
| @@ -1388,20 +1383,20 @@ static void kvm_reput_irq(struct vcpu_svm *svm) | |||
| 1388 | push_irq(&svm->vcpu, control->int_vector); | 1383 | push_irq(&svm->vcpu, control->int_vector); |
| 1389 | } | 1384 | } |
| 1390 | 1385 | ||
| 1391 | svm->vcpu.interrupt_window_open = | 1386 | svm->vcpu.arch.interrupt_window_open = |
| 1392 | !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); | 1387 | !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); |
| 1393 | } | 1388 | } |
| 1394 | 1389 | ||
| 1395 | static void svm_do_inject_vector(struct vcpu_svm *svm) | 1390 | static void svm_do_inject_vector(struct vcpu_svm *svm) |
| 1396 | { | 1391 | { |
| 1397 | struct kvm_vcpu *vcpu = &svm->vcpu; | 1392 | struct kvm_vcpu *vcpu = &svm->vcpu; |
| 1398 | int word_index = __ffs(vcpu->irq_summary); | 1393 | int word_index = __ffs(vcpu->arch.irq_summary); |
| 1399 | int bit_index = __ffs(vcpu->irq_pending[word_index]); | 1394 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); |
| 1400 | int irq = word_index * BITS_PER_LONG + bit_index; | 1395 | int irq = word_index * BITS_PER_LONG + bit_index; |
| 1401 | 1396 | ||
| 1402 | clear_bit(bit_index, &vcpu->irq_pending[word_index]); | 1397 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); |
| 1403 | if (!vcpu->irq_pending[word_index]) | 1398 | if (!vcpu->arch.irq_pending[word_index]) |
| 1404 | clear_bit(word_index, &vcpu->irq_summary); | 1399 | clear_bit(word_index, &vcpu->arch.irq_summary); |
| 1405 | svm_inject_irq(svm, irq); | 1400 | svm_inject_irq(svm, irq); |
| 1406 | } | 1401 | } |
| 1407 | 1402 | ||
| @@ -1411,11 +1406,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, | |||
| 1411 | struct vcpu_svm *svm = to_svm(vcpu); | 1406 | struct vcpu_svm *svm = to_svm(vcpu); |
| 1412 | struct vmcb_control_area *control = &svm->vmcb->control; | 1407 | struct vmcb_control_area *control = &svm->vmcb->control; |
| 1413 | 1408 | ||
| 1414 | svm->vcpu.interrupt_window_open = | 1409 | svm->vcpu.arch.interrupt_window_open = |
| 1415 | (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && | 1410 | (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && |
| 1416 | (svm->vmcb->save.rflags & X86_EFLAGS_IF)); | 1411 | (svm->vmcb->save.rflags & X86_EFLAGS_IF)); |
| 1417 | 1412 | ||
| 1418 | if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary) | 1413 | if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) |
| 1419 | /* | 1414 | /* |
| 1420 | * If interrupts enabled, and not blocked by sti or mov ss. Good. | 1415 | * If interrupts enabled, and not blocked by sti or mov ss. Good. |
| 1421 | */ | 1416 | */ |
| @@ -1424,13 +1419,18 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, | |||
| 1424 | /* | 1419 | /* |
| 1425 | * Interrupts blocked. Wait for unblock. | 1420 | * Interrupts blocked. Wait for unblock. |
| 1426 | */ | 1421 | */ |
| 1427 | if (!svm->vcpu.interrupt_window_open && | 1422 | if (!svm->vcpu.arch.interrupt_window_open && |
| 1428 | (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) { | 1423 | (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) |
| 1429 | control->intercept |= 1ULL << INTERCEPT_VINTR; | 1424 | control->intercept |= 1ULL << INTERCEPT_VINTR; |
| 1430 | } else | 1425 | else |
| 1431 | control->intercept &= ~(1ULL << INTERCEPT_VINTR); | 1426 | control->intercept &= ~(1ULL << INTERCEPT_VINTR); |
| 1432 | } | 1427 | } |
| 1433 | 1428 | ||
| 1429 | static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) | ||
| 1430 | { | ||
| 1431 | return 0; | ||
| 1432 | } | ||
| 1433 | |||
| 1434 | static void save_db_regs(unsigned long *db_regs) | 1434 | static void save_db_regs(unsigned long *db_regs) |
| 1435 | { | 1435 | { |
| 1436 | asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); | 1436 | asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); |
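
The simplified branch above (braces dropped around the single statement) is also the stock pattern for arming one intercept in SVM's 64-bit control word. A minimal sketch, with an illustrative bit position:

    #include <stdint.h>

    #define INTERCEPT_VINTR 4   /* illustrative bit position */

    /* Arm or disarm a single intercept, as do_interrupt_requests()
     * does with control->intercept. */
    static inline void set_vintr_intercept(uint64_t *intercept, int want)
    {
            if (want)
                    *intercept |= 1ULL << INTERCEPT_VINTR;
            else
                    *intercept &= ~(1ULL << INTERCEPT_VINTR);
    }
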
| @@ -1472,7 +1472,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1472 | svm->host_cr2 = kvm_read_cr2(); | 1472 | svm->host_cr2 = kvm_read_cr2(); |
| 1473 | svm->host_dr6 = read_dr6(); | 1473 | svm->host_dr6 = read_dr6(); |
| 1474 | svm->host_dr7 = read_dr7(); | 1474 | svm->host_dr7 = read_dr7(); |
| 1475 | svm->vmcb->save.cr2 = vcpu->cr2; | 1475 | svm->vmcb->save.cr2 = vcpu->arch.cr2; |
| 1476 | 1476 | ||
| 1477 | if (svm->vmcb->save.dr7 & 0xff) { | 1477 | if (svm->vmcb->save.dr7 & 0xff) { |
| 1478 | write_dr7(0); | 1478 | write_dr7(0); |
| @@ -1486,13 +1486,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1486 | 1486 | ||
| 1487 | asm volatile ( | 1487 | asm volatile ( |
| 1488 | #ifdef CONFIG_X86_64 | 1488 | #ifdef CONFIG_X86_64 |
| 1489 | "push %%rbx; push %%rcx; push %%rdx;" | 1489 | "push %%rbp; \n\t" |
| 1490 | "push %%rsi; push %%rdi; push %%rbp;" | ||
| 1491 | "push %%r8; push %%r9; push %%r10; push %%r11;" | ||
| 1492 | "push %%r12; push %%r13; push %%r14; push %%r15;" | ||
| 1493 | #else | 1490 | #else |
| 1494 | "push %%ebx; push %%ecx; push %%edx;" | 1491 | "push %%ebp; \n\t" |
| 1495 | "push %%esi; push %%edi; push %%ebp;" | ||
| 1496 | #endif | 1492 | #endif |
| 1497 | 1493 | ||
| 1498 | #ifdef CONFIG_X86_64 | 1494 | #ifdef CONFIG_X86_64 |
| @@ -1554,10 +1550,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1554 | "mov %%r14, %c[r14](%[svm]) \n\t" | 1550 | "mov %%r14, %c[r14](%[svm]) \n\t" |
| 1555 | "mov %%r15, %c[r15](%[svm]) \n\t" | 1551 | "mov %%r15, %c[r15](%[svm]) \n\t" |
| 1556 | 1552 | ||
| 1557 | "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" | 1553 | "pop %%rbp; \n\t" |
| 1558 | "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" | ||
| 1559 | "pop %%rbp; pop %%rdi; pop %%rsi;" | ||
| 1560 | "pop %%rdx; pop %%rcx; pop %%rbx; \n\t" | ||
| 1561 | #else | 1554 | #else |
| 1562 | "mov %%ebx, %c[rbx](%[svm]) \n\t" | 1555 | "mov %%ebx, %c[rbx](%[svm]) \n\t" |
| 1563 | "mov %%ecx, %c[rcx](%[svm]) \n\t" | 1556 | "mov %%ecx, %c[rcx](%[svm]) \n\t" |
| @@ -1566,34 +1559,40 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1566 | "mov %%edi, %c[rdi](%[svm]) \n\t" | 1559 | "mov %%edi, %c[rdi](%[svm]) \n\t" |
| 1567 | "mov %%ebp, %c[rbp](%[svm]) \n\t" | 1560 | "mov %%ebp, %c[rbp](%[svm]) \n\t" |
| 1568 | 1561 | ||
| 1569 | "pop %%ebp; pop %%edi; pop %%esi;" | 1562 | "pop %%ebp; \n\t" |
| 1570 | "pop %%edx; pop %%ecx; pop %%ebx; \n\t" | ||
| 1571 | #endif | 1563 | #endif |
| 1572 | : | 1564 | : |
| 1573 | : [svm]"a"(svm), | 1565 | : [svm]"a"(svm), |
| 1574 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), | 1566 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), |
| 1575 | [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])), | 1567 | [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), |
| 1576 | [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])), | 1568 | [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), |
| 1577 | [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])), | 1569 | [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), |
| 1578 | [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])), | 1570 | [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), |
| 1579 | [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])), | 1571 | [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])), |
| 1580 | [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP])) | 1572 | [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP])) |
| 1581 | #ifdef CONFIG_X86_64 | 1573 | #ifdef CONFIG_X86_64 |
| 1582 | ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])), | 1574 | , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), |
| 1583 | [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])), | 1575 | [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), |
| 1584 | [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])), | 1576 | [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), |
| 1585 | [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])), | 1577 | [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), |
| 1586 | [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])), | 1578 | [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), |
| 1587 | [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])), | 1579 | [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), |
| 1588 | [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])), | 1580 | [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), |
| 1589 | [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15])) | 1581 | [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) |
| 1590 | #endif | 1582 | #endif |
| 1591 | : "cc", "memory" ); | 1583 | : "cc", "memory" |
| 1584 | #ifdef CONFIG_X86_64 | ||
| 1585 | , "rbx", "rcx", "rdx", "rsi", "rdi" | ||
| 1586 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" | ||
| 1587 | #else | ||
| 1588 | , "ebx", "ecx", "edx", "esi", "edi" | ||
| 1589 | #endif | ||
| 1590 | ); | ||
| 1592 | 1591 | ||
| 1593 | if ((svm->vmcb->save.dr7 & 0xff)) | 1592 | if ((svm->vmcb->save.dr7 & 0xff)) |
| 1594 | load_db_regs(svm->host_db_regs); | 1593 | load_db_regs(svm->host_db_regs); |
| 1595 | 1594 | ||
| 1596 | vcpu->cr2 = svm->vmcb->save.cr2; | 1595 | vcpu->arch.cr2 = svm->vmcb->save.cr2; |
| 1597 | 1596 | ||
| 1598 | write_dr6(svm->host_dr6); | 1597 | write_dr6(svm->host_dr6); |
| 1599 | write_dr7(svm->host_dr7); | 1598 | write_dr7(svm->host_dr7); |
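
The vmexit path above no longer pushes and pops every general-purpose register by hand: the registers are declared as clobbers instead, so the compiler spills only what is actually live, while %rbp keeps its manual push/pop because GCC does not accept the frame pointer in a clobber list. A minimal sketch of the technique, with a stand-in asm body:

    /* x86-64 only; rcx is declared clobbered, so the compiler keeps
     * nothing live in it across the asm statement. */
    static unsigned long demo_clobber(unsigned long a, unsigned long b)
    {
            unsigned long out;

            asm volatile("mov %1, %%rcx \n\t"
                         "add %2, %%rcx \n\t"
                         "mov %%rcx, %0 \n\t"
                         : "=r"(out)
                         : "r"(a), "r"(b)
                         : "rcx", "cc");
            return out;
    }
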
| @@ -1627,34 +1626,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | |||
| 1627 | } | 1626 | } |
| 1628 | } | 1627 | } |
| 1629 | 1628 | ||
| 1630 | static void svm_inject_page_fault(struct kvm_vcpu *vcpu, | ||
| 1631 | unsigned long addr, | ||
| 1632 | uint32_t err_code) | ||
| 1633 | { | ||
| 1634 | struct vcpu_svm *svm = to_svm(vcpu); | ||
| 1635 | uint32_t exit_int_info = svm->vmcb->control.exit_int_info; | ||
| 1636 | |||
| 1637 | ++vcpu->stat.pf_guest; | ||
| 1638 | |||
| 1639 | if (is_page_fault(exit_int_info)) { | ||
| 1640 | |||
| 1641 | svm->vmcb->control.event_inj_err = 0; | ||
| 1642 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | ||
| 1643 | SVM_EVTINJ_VALID_ERR | | ||
| 1644 | SVM_EVTINJ_TYPE_EXEPT | | ||
| 1645 | DF_VECTOR; | ||
| 1646 | return; | ||
| 1647 | } | ||
| 1648 | vcpu->cr2 = addr; | ||
| 1649 | svm->vmcb->save.cr2 = addr; | ||
| 1650 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | ||
| 1651 | SVM_EVTINJ_VALID_ERR | | ||
| 1652 | SVM_EVTINJ_TYPE_EXEPT | | ||
| 1653 | PF_VECTOR; | ||
| 1654 | svm->vmcb->control.event_inj_err = err_code; | ||
| 1655 | } | ||
| 1656 | |||
| 1657 | |||
| 1658 | static int is_disabled(void) | 1629 | static int is_disabled(void) |
| 1659 | { | 1630 | { |
| 1660 | u64 vm_cr; | 1631 | u64 vm_cr; |
| @@ -1675,7 +1646,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
| 1675 | hypercall[0] = 0x0f; | 1646 | hypercall[0] = 0x0f; |
| 1676 | hypercall[1] = 0x01; | 1647 | hypercall[1] = 0x01; |
| 1677 | hypercall[2] = 0xd9; | 1648 | hypercall[2] = 0xd9; |
| 1678 | hypercall[3] = 0xc3; | ||
| 1679 | } | 1649 | } |
| 1680 | 1650 | ||
| 1681 | static void svm_check_processor_compat(void *rtn) | 1651 | static void svm_check_processor_compat(void *rtn) |
| @@ -1683,6 +1653,11 @@ static void svm_check_processor_compat(void *rtn) | |||
| 1683 | *(int *)rtn = 0; | 1653 | *(int *)rtn = 0; |
| 1684 | } | 1654 | } |
| 1685 | 1655 | ||
| 1656 | static bool svm_cpu_has_accelerated_tpr(void) | ||
| 1657 | { | ||
| 1658 | return false; | ||
| 1659 | } | ||
| 1660 | |||
| 1686 | static struct kvm_x86_ops svm_x86_ops = { | 1661 | static struct kvm_x86_ops svm_x86_ops = { |
| 1687 | .cpu_has_kvm_support = has_svm, | 1662 | .cpu_has_kvm_support = has_svm, |
| 1688 | .disabled_by_bios = is_disabled, | 1663 | .disabled_by_bios = is_disabled, |
| @@ -1691,6 +1666,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
| 1691 | .check_processor_compatibility = svm_check_processor_compat, | 1666 | .check_processor_compatibility = svm_check_processor_compat, |
| 1692 | .hardware_enable = svm_hardware_enable, | 1667 | .hardware_enable = svm_hardware_enable, |
| 1693 | .hardware_disable = svm_hardware_disable, | 1668 | .hardware_disable = svm_hardware_disable, |
| 1669 | .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, | ||
| 1694 | 1670 | ||
| 1695 | .vcpu_create = svm_create_vcpu, | 1671 | .vcpu_create = svm_create_vcpu, |
| 1696 | .vcpu_free = svm_free_vcpu, | 1672 | .vcpu_free = svm_free_vcpu, |
| @@ -1725,9 +1701,6 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
| 1725 | .set_rflags = svm_set_rflags, | 1701 | .set_rflags = svm_set_rflags, |
| 1726 | 1702 | ||
| 1727 | .tlb_flush = svm_flush_tlb, | 1703 | .tlb_flush = svm_flush_tlb, |
| 1728 | .inject_page_fault = svm_inject_page_fault, | ||
| 1729 | |||
| 1730 | .inject_gp = svm_inject_gp, | ||
| 1731 | 1704 | ||
| 1732 | .run = svm_vcpu_run, | 1705 | .run = svm_vcpu_run, |
| 1733 | .handle_exit = handle_exit, | 1706 | .handle_exit = handle_exit, |
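
svm_x86_ops is a designated-initializer ops table: the generic KVM core calls through these pointers without knowing which backend filled them in. An illustrative sketch of the pattern, with made-up types and fields:

    #include <stdio.h>

    struct backend_ops {
            int (*queue_exception)(int vector);
            int (*set_tss_addr)(unsigned int addr);
    };

    static int my_queue_exception(int vector)
    {
            return printf("queue exception %d\n", vector);
    }

    static int my_set_tss_addr(unsigned int addr)
    {
            (void)addr;     /* SVM-style backend: no fixed TSS needed */
            return 0;
    }

    /* Unset members default to NULL, which callers must tolerate. */
    static struct backend_ops ops = {
            .queue_exception = my_queue_exception,
            .set_tss_addr    = my_set_tss_addr,
    };
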
| @@ -1735,19 +1708,23 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
| 1735 | .patch_hypercall = svm_patch_hypercall, | 1708 | .patch_hypercall = svm_patch_hypercall, |
| 1736 | .get_irq = svm_get_irq, | 1709 | .get_irq = svm_get_irq, |
| 1737 | .set_irq = svm_set_irq, | 1710 | .set_irq = svm_set_irq, |
| 1711 | .queue_exception = svm_queue_exception, | ||
| 1712 | .exception_injected = svm_exception_injected, | ||
| 1738 | .inject_pending_irq = svm_intr_assist, | 1713 | .inject_pending_irq = svm_intr_assist, |
| 1739 | .inject_pending_vectors = do_interrupt_requests, | 1714 | .inject_pending_vectors = do_interrupt_requests, |
| 1715 | |||
| 1716 | .set_tss_addr = svm_set_tss_addr, | ||
| 1740 | }; | 1717 | }; |
| 1741 | 1718 | ||
| 1742 | static int __init svm_init(void) | 1719 | static int __init svm_init(void) |
| 1743 | { | 1720 | { |
| 1744 | return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm), | 1721 | return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), |
| 1745 | THIS_MODULE); | 1722 | THIS_MODULE); |
| 1746 | } | 1723 | } |
| 1747 | 1724 | ||
| 1748 | static void __exit svm_exit(void) | 1725 | static void __exit svm_exit(void) |
| 1749 | { | 1726 | { |
| 1750 | kvm_exit_x86(); | 1727 | kvm_exit(); |
| 1751 | } | 1728 | } |
| 1752 | 1729 | ||
| 1753 | module_init(svm_init) | 1730 | module_init(svm_init) |
diff --git a/drivers/kvm/svm.h b/arch/x86/kvm/svm.h index 3b1b0f35b6cb..5fd50491b555 100644 --- a/drivers/kvm/svm.h +++ b/arch/x86/kvm/svm.h | |||
| @@ -204,6 +204,7 @@ struct __attribute__ ((__packed__)) vmcb { | |||
| 204 | #define INTERCEPT_CR0_MASK 1 | 204 | #define INTERCEPT_CR0_MASK 1 |
| 205 | #define INTERCEPT_CR3_MASK (1 << 3) | 205 | #define INTERCEPT_CR3_MASK (1 << 3) |
| 206 | #define INTERCEPT_CR4_MASK (1 << 4) | 206 | #define INTERCEPT_CR4_MASK (1 << 4) |
| 207 | #define INTERCEPT_CR8_MASK (1 << 8) | ||
| 207 | 208 | ||
| 208 | #define INTERCEPT_DR0_MASK 1 | 209 | #define INTERCEPT_DR0_MASK 1 |
| 209 | #define INTERCEPT_DR1_MASK (1 << 1) | 210 | #define INTERCEPT_DR1_MASK (1 << 1) |
| @@ -311,7 +312,7 @@ struct __attribute__ ((__packed__)) vmcb { | |||
| 311 | 312 | ||
| 312 | #define SVM_EXIT_ERR -1 | 313 | #define SVM_EXIT_ERR -1 |
| 313 | 314 | ||
| 314 | #define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP | 315 | #define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ |
| 315 | 316 | ||
| 316 | #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" | 317 | #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" |
| 317 | #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" | 318 | #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" |
diff --git a/drivers/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5b397b6c9f93..ad36447e696e 100644 --- a/drivers/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
| @@ -15,17 +15,18 @@ | |||
| 15 | * | 15 | * |
| 16 | */ | 16 | */ |
| 17 | 17 | ||
| 18 | #include "kvm.h" | ||
| 19 | #include "x86_emulate.h" | ||
| 20 | #include "irq.h" | 18 | #include "irq.h" |
| 21 | #include "vmx.h" | 19 | #include "vmx.h" |
| 22 | #include "segment_descriptor.h" | 20 | #include "segment_descriptor.h" |
| 21 | #include "mmu.h" | ||
| 23 | 22 | ||
| 23 | #include <linux/kvm_host.h> | ||
| 24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
| 25 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
| 26 | #include <linux/mm.h> | 26 | #include <linux/mm.h> |
| 27 | #include <linux/highmem.h> | 27 | #include <linux/highmem.h> |
| 28 | #include <linux/sched.h> | 28 | #include <linux/sched.h> |
| 29 | #include <linux/moduleparam.h> | ||
| 29 | 30 | ||
| 30 | #include <asm/io.h> | 31 | #include <asm/io.h> |
| 31 | #include <asm/desc.h> | 32 | #include <asm/desc.h> |
| @@ -33,6 +34,9 @@ | |||
| 33 | MODULE_AUTHOR("Qumranet"); | 34 | MODULE_AUTHOR("Qumranet"); |
| 34 | MODULE_LICENSE("GPL"); | 35 | MODULE_LICENSE("GPL"); |
| 35 | 36 | ||
| 37 | static int bypass_guest_pf = 1; | ||
| 38 | module_param(bypass_guest_pf, bool, 0); | ||
| 39 | |||
| 36 | struct vmcs { | 40 | struct vmcs { |
| 37 | u32 revision_id; | 41 | u32 revision_id; |
| 38 | u32 abort; | 42 | u32 abort; |
| @@ -43,6 +47,7 @@ struct vcpu_vmx { | |||
| 43 | struct kvm_vcpu vcpu; | 47 | struct kvm_vcpu vcpu; |
| 44 | int launched; | 48 | int launched; |
| 45 | u8 fail; | 49 | u8 fail; |
| 50 | u32 idt_vectoring_info; | ||
| 46 | struct kvm_msr_entry *guest_msrs; | 51 | struct kvm_msr_entry *guest_msrs; |
| 47 | struct kvm_msr_entry *host_msrs; | 52 | struct kvm_msr_entry *host_msrs; |
| 48 | int nmsrs; | 53 | int nmsrs; |
| @@ -57,8 +62,15 @@ struct vcpu_vmx { | |||
| 57 | u16 fs_sel, gs_sel, ldt_sel; | 62 | u16 fs_sel, gs_sel, ldt_sel; |
| 58 | int gs_ldt_reload_needed; | 63 | int gs_ldt_reload_needed; |
| 59 | int fs_reload_needed; | 64 | int fs_reload_needed; |
| 60 | }host_state; | 65 | int guest_efer_loaded; |
| 61 | 66 | } host_state; | |
| 67 | struct { | ||
| 68 | struct { | ||
| 69 | bool pending; | ||
| 70 | u8 vector; | ||
| 71 | unsigned rip; | ||
| 72 | } irq; | ||
| 73 | } rmode; | ||
| 62 | }; | 74 | }; |
| 63 | 75 | ||
| 64 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | 76 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) |
| @@ -74,14 +86,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | |||
| 74 | static struct page *vmx_io_bitmap_a; | 86 | static struct page *vmx_io_bitmap_a; |
| 75 | static struct page *vmx_io_bitmap_b; | 87 | static struct page *vmx_io_bitmap_b; |
| 76 | 88 | ||
| 77 | #define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE) | ||
| 78 | |||
| 79 | static struct vmcs_config { | 89 | static struct vmcs_config { |
| 80 | int size; | 90 | int size; |
| 81 | int order; | 91 | int order; |
| 82 | u32 revision_id; | 92 | u32 revision_id; |
| 83 | u32 pin_based_exec_ctrl; | 93 | u32 pin_based_exec_ctrl; |
| 84 | u32 cpu_based_exec_ctrl; | 94 | u32 cpu_based_exec_ctrl; |
| 95 | u32 cpu_based_2nd_exec_ctrl; | ||
| 85 | u32 vmexit_ctrl; | 96 | u32 vmexit_ctrl; |
| 86 | u32 vmentry_ctrl; | 97 | u32 vmentry_ctrl; |
| 87 | } vmcs_config; | 98 | } vmcs_config; |
| @@ -138,18 +149,6 @@ static void save_msrs(struct kvm_msr_entry *e, int n) | |||
| 138 | rdmsrl(e[i].index, e[i].data); | 149 | rdmsrl(e[i].index, e[i].data); |
| 139 | } | 150 | } |
| 140 | 151 | ||
| 141 | static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr) | ||
| 142 | { | ||
| 143 | return (u64)msr.data & EFER_SAVE_RESTORE_BITS; | ||
| 144 | } | ||
| 145 | |||
| 146 | static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx) | ||
| 147 | { | ||
| 148 | int efer_offset = vmx->msr_offset_efer; | ||
| 149 | return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) != | ||
| 150 | msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); | ||
| 151 | } | ||
| 152 | |||
| 153 | static inline int is_page_fault(u32 intr_info) | 152 | static inline int is_page_fault(u32 intr_info) |
| 154 | { | 153 | { |
| 155 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | 154 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | |
| @@ -164,6 +163,13 @@ static inline int is_no_device(u32 intr_info) | |||
| 164 | (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); | 163 | (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); |
| 165 | } | 164 | } |
| 166 | 165 | ||
| 166 | static inline int is_invalid_opcode(u32 intr_info) | ||
| 167 | { | ||
| 168 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
| 169 | INTR_INFO_VALID_MASK)) == | ||
| 170 | (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); | ||
| 171 | } | ||
| 172 | |||
| 167 | static inline int is_external_interrupt(u32 intr_info) | 173 | static inline int is_external_interrupt(u32 intr_info) |
| 168 | { | 174 | { |
| 169 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | 175 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) |
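
is_invalid_opcode() follows the same recipe as its siblings: mask the interruption-information field down to the type, vector, and valid bits, then compare against one exact encoding. A self-contained sketch with illustrative mask values:

    #include <stdint.h>

    #define INFO_VECTOR_MASK  0xffu
    #define INFO_TYPE_MASK    (7u << 8)
    #define INFO_VALID_MASK   (1u << 31)
    #define TYPE_EXCEPTION    (3u << 8)
    #define UD_VECTOR         6

    static inline int is_ud_exception(uint32_t intr_info)
    {
            return (intr_info & (INFO_TYPE_MASK | INFO_VECTOR_MASK |
                                 INFO_VALID_MASK))
                    == (TYPE_EXCEPTION | UD_VECTOR | INFO_VALID_MASK);
    }
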
| @@ -180,6 +186,24 @@ static inline int vm_need_tpr_shadow(struct kvm *kvm) | |||
| 180 | return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); | 186 | return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); |
| 181 | } | 187 | } |
| 182 | 188 | ||
| 189 | static inline int cpu_has_secondary_exec_ctrls(void) | ||
| 190 | { | ||
| 191 | return (vmcs_config.cpu_based_exec_ctrl & | ||
| 192 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); | ||
| 193 | } | ||
| 194 | |||
| 195 | static inline bool cpu_has_vmx_virtualize_apic_accesses(void) | ||
| 196 | { | ||
| 197 | return (vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 198 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | ||
| 199 | } | ||
| 200 | |||
| 201 | static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) | ||
| 202 | { | ||
| 203 | return ((cpu_has_vmx_virtualize_apic_accesses()) && | ||
| 204 | (irqchip_in_kernel(kvm))); | ||
| 205 | } | ||
| 206 | |||
| 183 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) | 207 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) |
| 184 | { | 208 | { |
| 185 | int i; | 209 | int i; |
| @@ -222,16 +246,14 @@ static void __vcpu_clear(void *arg) | |||
| 222 | vmcs_clear(vmx->vmcs); | 246 | vmcs_clear(vmx->vmcs); |
| 223 | if (per_cpu(current_vmcs, cpu) == vmx->vmcs) | 247 | if (per_cpu(current_vmcs, cpu) == vmx->vmcs) |
| 224 | per_cpu(current_vmcs, cpu) = NULL; | 248 | per_cpu(current_vmcs, cpu) = NULL; |
| 225 | rdtscll(vmx->vcpu.host_tsc); | 249 | rdtscll(vmx->vcpu.arch.host_tsc); |
| 226 | } | 250 | } |
| 227 | 251 | ||
| 228 | static void vcpu_clear(struct vcpu_vmx *vmx) | 252 | static void vcpu_clear(struct vcpu_vmx *vmx) |
| 229 | { | 253 | { |
| 230 | if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1) | 254 | if (vmx->vcpu.cpu == -1) |
| 231 | smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, | 255 | return; |
| 232 | vmx, 0, 1); | 256 | smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); |
| 233 | else | ||
| 234 | __vcpu_clear(vmx); | ||
| 235 | vmx->launched = 0; | 257 | vmx->launched = 0; |
| 236 | } | 258 | } |
| 237 | 259 | ||
| @@ -275,7 +297,7 @@ static void vmcs_writel(unsigned long field, unsigned long value) | |||
| 275 | u8 error; | 297 | u8 error; |
| 276 | 298 | ||
| 277 | asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" | 299 | asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" |
| 278 | : "=q"(error) : "a"(value), "d"(field) : "cc" ); | 300 | : "=q"(error) : "a"(value), "d"(field) : "cc"); |
| 279 | if (unlikely(error)) | 301 | if (unlikely(error)) |
| 280 | vmwrite_error(field, value); | 302 | vmwrite_error(field, value); |
| 281 | } | 303 | } |
| @@ -315,12 +337,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) | |||
| 315 | { | 337 | { |
| 316 | u32 eb; | 338 | u32 eb; |
| 317 | 339 | ||
| 318 | eb = 1u << PF_VECTOR; | 340 | eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); |
| 319 | if (!vcpu->fpu_active) | 341 | if (!vcpu->fpu_active) |
| 320 | eb |= 1u << NM_VECTOR; | 342 | eb |= 1u << NM_VECTOR; |
| 321 | if (vcpu->guest_debug.enabled) | 343 | if (vcpu->guest_debug.enabled) |
| 322 | eb |= 1u << 1; | 344 | eb |= 1u << 1; |
| 323 | if (vcpu->rmode.active) | 345 | if (vcpu->arch.rmode.active) |
| 324 | eb = ~0; | 346 | eb = ~0; |
| 325 | vmcs_write32(EXCEPTION_BITMAP, eb); | 347 | vmcs_write32(EXCEPTION_BITMAP, eb); |
| 326 | } | 348 | } |
| @@ -344,16 +366,42 @@ static void reload_tss(void) | |||
| 344 | 366 | ||
| 345 | static void load_transition_efer(struct vcpu_vmx *vmx) | 367 | static void load_transition_efer(struct vcpu_vmx *vmx) |
| 346 | { | 368 | { |
| 347 | u64 trans_efer; | ||
| 348 | int efer_offset = vmx->msr_offset_efer; | 369 | int efer_offset = vmx->msr_offset_efer; |
| 370 | u64 host_efer = vmx->host_msrs[efer_offset].data; | ||
| 371 | u64 guest_efer = vmx->guest_msrs[efer_offset].data; | ||
| 372 | u64 ignore_bits; | ||
| 349 | 373 | ||
| 350 | trans_efer = vmx->host_msrs[efer_offset].data; | 374 | if (efer_offset < 0) |
| 351 | trans_efer &= ~EFER_SAVE_RESTORE_BITS; | 375 | return; |
| 352 | trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); | 376 | /* |
| 353 | wrmsrl(MSR_EFER, trans_efer); | 377 | * NX is emulated; LMA and LME handled by hardware; SCE meaningless |
| 378 | * outside long mode | ||
| 379 | */ | ||
| 380 | ignore_bits = EFER_NX | EFER_SCE; | ||
| 381 | #ifdef CONFIG_X86_64 | ||
| 382 | ignore_bits |= EFER_LMA | EFER_LME; | ||
| 383 | /* SCE is meaningful only in long mode on Intel */ | ||
| 384 | if (guest_efer & EFER_LMA) | ||
| 385 | ignore_bits &= ~(u64)EFER_SCE; | ||
| 386 | #endif | ||
| 387 | if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) | ||
| 388 | return; | ||
| 389 | |||
| 390 | vmx->host_state.guest_efer_loaded = 1; | ||
| 391 | guest_efer &= ~ignore_bits; | ||
| 392 | guest_efer |= host_efer & ignore_bits; | ||
| 393 | wrmsrl(MSR_EFER, guest_efer); | ||
| 354 | vmx->vcpu.stat.efer_reload++; | 394 | vmx->vcpu.stat.efer_reload++; |
| 355 | } | 395 | } |
| 356 | 396 | ||
| 397 | static void reload_host_efer(struct vcpu_vmx *vmx) | ||
| 398 | { | ||
| 399 | if (vmx->host_state.guest_efer_loaded) { | ||
| 400 | vmx->host_state.guest_efer_loaded = 0; | ||
| 401 | load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); | ||
| 402 | } | ||
| 403 | } | ||
| 404 | |||
| 357 | static void vmx_save_host_state(struct kvm_vcpu *vcpu) | 405 | static void vmx_save_host_state(struct kvm_vcpu *vcpu) |
| 358 | { | 406 | { |
| 359 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 407 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
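
The rewritten load_transition_efer() skips the WRMSR entirely when guest and host EFER agree on every bit outside the ignorable set. A worked sketch of that decision, using the architectural EFER bit positions:

    #include <stdbool.h>
    #include <stdint.h>

    #define EFER_SCE (1ULL << 0)
    #define EFER_LME (1ULL << 8)
    #define EFER_LMA (1ULL << 10)
    #define EFER_NX  (1ULL << 11)

    static bool efer_switch_needed(uint64_t host_efer, uint64_t guest_efer)
    {
            /* NX is emulated; LMA/LME are handled by hardware. */
            uint64_t ignore_bits = EFER_NX | EFER_SCE | EFER_LMA | EFER_LME;

            if (guest_efer & EFER_LMA)      /* SCE matters in long mode */
                    ignore_bits &= ~EFER_SCE;

            return (guest_efer & ~ignore_bits) != (host_efer & ~ignore_bits);
    }
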
| @@ -393,14 +441,13 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) | |||
| 393 | #endif | 441 | #endif |
| 394 | 442 | ||
| 395 | #ifdef CONFIG_X86_64 | 443 | #ifdef CONFIG_X86_64 |
| 396 | if (is_long_mode(&vmx->vcpu)) { | 444 | if (is_long_mode(&vmx->vcpu)) |
| 397 | save_msrs(vmx->host_msrs + | 445 | save_msrs(vmx->host_msrs + |
| 398 | vmx->msr_offset_kernel_gs_base, 1); | 446 | vmx->msr_offset_kernel_gs_base, 1); |
| 399 | } | 447 | |
| 400 | #endif | 448 | #endif |
| 401 | load_msrs(vmx->guest_msrs, vmx->save_nmsrs); | 449 | load_msrs(vmx->guest_msrs, vmx->save_nmsrs); |
| 402 | if (msr_efer_need_save_restore(vmx)) | 450 | load_transition_efer(vmx); |
| 403 | load_transition_efer(vmx); | ||
| 404 | } | 451 | } |
| 405 | 452 | ||
| 406 | static void vmx_load_host_state(struct vcpu_vmx *vmx) | 453 | static void vmx_load_host_state(struct vcpu_vmx *vmx) |
| @@ -410,6 +457,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) | |||
| 410 | if (!vmx->host_state.loaded) | 457 | if (!vmx->host_state.loaded) |
| 411 | return; | 458 | return; |
| 412 | 459 | ||
| 460 | ++vmx->vcpu.stat.host_state_reload; | ||
| 413 | vmx->host_state.loaded = 0; | 461 | vmx->host_state.loaded = 0; |
| 414 | if (vmx->host_state.fs_reload_needed) | 462 | if (vmx->host_state.fs_reload_needed) |
| 415 | load_fs(vmx->host_state.fs_sel); | 463 | load_fs(vmx->host_state.fs_sel); |
| @@ -429,8 +477,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) | |||
| 429 | reload_tss(); | 477 | reload_tss(); |
| 430 | save_msrs(vmx->guest_msrs, vmx->save_nmsrs); | 478 | save_msrs(vmx->guest_msrs, vmx->save_nmsrs); |
| 431 | load_msrs(vmx->host_msrs, vmx->save_nmsrs); | 479 | load_msrs(vmx->host_msrs, vmx->save_nmsrs); |
| 432 | if (msr_efer_need_save_restore(vmx)) | 480 | reload_host_efer(vmx); |
| 433 | load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); | ||
| 434 | } | 481 | } |
| 435 | 482 | ||
| 436 | /* | 483 | /* |
| @@ -480,7 +527,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
| 480 | * Make sure the time stamp counter is monotonous. | 527 | * Make sure the time stamp counter is monotonous. |
| 481 | */ | 528 | */ |
| 482 | rdtscll(tsc_this); | 529 | rdtscll(tsc_this); |
| 483 | delta = vcpu->host_tsc - tsc_this; | 530 | delta = vcpu->arch.host_tsc - tsc_this; |
| 484 | vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); | 531 | vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); |
| 485 | } | 532 | } |
| 486 | } | 533 | } |
| @@ -488,7 +535,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
| 488 | static void vmx_vcpu_put(struct kvm_vcpu *vcpu) | 535 | static void vmx_vcpu_put(struct kvm_vcpu *vcpu) |
| 489 | { | 536 | { |
| 490 | vmx_load_host_state(to_vmx(vcpu)); | 537 | vmx_load_host_state(to_vmx(vcpu)); |
| 491 | kvm_put_guest_fpu(vcpu); | ||
| 492 | } | 538 | } |
| 493 | 539 | ||
| 494 | static void vmx_fpu_activate(struct kvm_vcpu *vcpu) | 540 | static void vmx_fpu_activate(struct kvm_vcpu *vcpu) |
| @@ -497,7 +543,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) | |||
| 497 | return; | 543 | return; |
| 498 | vcpu->fpu_active = 1; | 544 | vcpu->fpu_active = 1; |
| 499 | vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); | 545 | vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); |
| 500 | if (vcpu->cr0 & X86_CR0_TS) | 546 | if (vcpu->arch.cr0 & X86_CR0_TS) |
| 501 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS); | 547 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS); |
| 502 | update_exception_bitmap(vcpu); | 548 | update_exception_bitmap(vcpu); |
| 503 | } | 549 | } |
| @@ -523,7 +569,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | |||
| 523 | 569 | ||
| 524 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | 570 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) |
| 525 | { | 571 | { |
| 526 | if (vcpu->rmode.active) | 572 | if (vcpu->arch.rmode.active) |
| 527 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | 573 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; |
| 528 | vmcs_writel(GUEST_RFLAGS, rflags); | 574 | vmcs_writel(GUEST_RFLAGS, rflags); |
| 529 | } | 575 | } |
| @@ -545,19 +591,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
| 545 | if (interruptibility & 3) | 591 | if (interruptibility & 3) |
| 546 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | 592 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, |
| 547 | interruptibility & ~3); | 593 | interruptibility & ~3); |
| 548 | vcpu->interrupt_window_open = 1; | 594 | vcpu->arch.interrupt_window_open = 1; |
| 549 | } | 595 | } |
| 550 | 596 | ||
| 551 | static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) | 597 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
| 598 | bool has_error_code, u32 error_code) | ||
| 552 | { | 599 | { |
| 553 | printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n", | ||
| 554 | vmcs_readl(GUEST_RIP)); | ||
| 555 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
| 556 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 600 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
| 557 | GP_VECTOR | | 601 | nr | INTR_TYPE_EXCEPTION |
| 558 | INTR_TYPE_EXCEPTION | | 602 | | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0) |
| 559 | INTR_INFO_DELIEVER_CODE_MASK | | 603 | | INTR_INFO_VALID_MASK); |
| 560 | INTR_INFO_VALID_MASK); | 604 | if (has_error_code) |
| 605 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
| 606 | } | ||
| 607 | |||
| 608 | static bool vmx_exception_injected(struct kvm_vcpu *vcpu) | ||
| 609 | { | ||
| 610 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 611 | |||
| 612 | return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); | ||
| 561 | } | 613 | } |
| 562 | 614 | ||
| 563 | /* | 615 | /* |
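
vmx_queue_exception() generalizes the old GP-only injector: any vector, with or without an error code, is encoded into one interruption-information word. A sketch of the composition, with illustrative mask values:

    #include <stdint.h>

    #define INTR_TYPE_EXCEPTION    (3u << 8)
    #define INTR_INFO_DELIVER_CODE (1u << 11)
    #define INTR_INFO_VALID        (1u << 31)

    static uint32_t entry_intr_info(unsigned nr, int has_error_code)
    {
            return nr | INTR_TYPE_EXCEPTION
                      | (has_error_code ? INTR_INFO_DELIVER_CODE : 0)
                      | INTR_INFO_VALID;
    }
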
| @@ -608,7 +660,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) | |||
| 608 | * if efer.sce is enabled. | 660 | * if efer.sce is enabled. |
| 609 | */ | 661 | */ |
| 610 | index = __find_msr_index(vmx, MSR_K6_STAR); | 662 | index = __find_msr_index(vmx, MSR_K6_STAR); |
| 611 | if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE)) | 663 | if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) |
| 612 | move_msr_up(vmx, index, save_nmsrs++); | 664 | move_msr_up(vmx, index, save_nmsrs++); |
| 613 | } | 665 | } |
| 614 | #endif | 666 | #endif |
| @@ -712,8 +764,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
| 712 | #ifdef CONFIG_X86_64 | 764 | #ifdef CONFIG_X86_64 |
| 713 | case MSR_EFER: | 765 | case MSR_EFER: |
| 714 | ret = kvm_set_msr_common(vcpu, msr_index, data); | 766 | ret = kvm_set_msr_common(vcpu, msr_index, data); |
| 715 | if (vmx->host_state.loaded) | 767 | if (vmx->host_state.loaded) { |
| 768 | reload_host_efer(vmx); | ||
| 716 | load_transition_efer(vmx); | 769 | load_transition_efer(vmx); |
| 770 | } | ||
| 717 | break; | 771 | break; |
| 718 | case MSR_FS_BASE: | 772 | case MSR_FS_BASE: |
| 719 | vmcs_writel(GUEST_FS_BASE, data); | 773 | vmcs_writel(GUEST_FS_BASE, data); |
| @@ -750,12 +804,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
| 750 | 804 | ||
| 751 | /* | 805 | /* |
| 752 | * Sync the rsp and rip registers into the vcpu structure. This allows | 806 | * Sync the rsp and rip registers into the vcpu structure. This allows |
| 753 | * registers to be accessed by indexing vcpu->regs. | 807 | * registers to be accessed by indexing vcpu->arch.regs. |
| 754 | */ | 808 | */ |
| 755 | static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) | 809 | static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) |
| 756 | { | 810 | { |
| 757 | vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); | 811 | vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); |
| 758 | vcpu->rip = vmcs_readl(GUEST_RIP); | 812 | vcpu->arch.rip = vmcs_readl(GUEST_RIP); |
| 759 | } | 813 | } |
| 760 | 814 | ||
| 761 | /* | 815 | /* |
| @@ -764,8 +818,8 @@ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) | |||
| 764 | */ | 818 | */ |
| 765 | static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) | 819 | static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) |
| 766 | { | 820 | { |
| 767 | vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); | 821 | vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); |
| 768 | vmcs_writel(GUEST_RIP, vcpu->rip); | 822 | vmcs_writel(GUEST_RIP, vcpu->arch.rip); |
| 769 | } | 823 | } |
| 770 | 824 | ||
| 771 | static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | 825 | static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) |
| @@ -808,14 +862,15 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | |||
| 808 | 862 | ||
| 809 | static int vmx_get_irq(struct kvm_vcpu *vcpu) | 863 | static int vmx_get_irq(struct kvm_vcpu *vcpu) |
| 810 | { | 864 | { |
| 865 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 811 | u32 idtv_info_field; | 866 | u32 idtv_info_field; |
| 812 | 867 | ||
| 813 | idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 868 | idtv_info_field = vmx->idt_vectoring_info; |
| 814 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | 869 | if (idtv_info_field & INTR_INFO_VALID_MASK) { |
| 815 | if (is_external_interrupt(idtv_info_field)) | 870 | if (is_external_interrupt(idtv_info_field)) |
| 816 | return idtv_info_field & VECTORING_INFO_VECTOR_MASK; | 871 | return idtv_info_field & VECTORING_INFO_VECTOR_MASK; |
| 817 | else | 872 | else |
| 818 | printk("pending exception: not handled yet\n"); | 873 | printk(KERN_DEBUG "pending exception: not handled yet\n"); |
| 819 | } | 874 | } |
| 820 | return -1; | 875 | return -1; |
| 821 | } | 876 | } |
| @@ -863,7 +918,7 @@ static void hardware_disable(void *garbage) | |||
| 863 | } | 918 | } |
| 864 | 919 | ||
| 865 | static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, | 920 | static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, |
| 866 | u32 msr, u32* result) | 921 | u32 msr, u32 *result) |
| 867 | { | 922 | { |
| 868 | u32 vmx_msr_low, vmx_msr_high; | 923 | u32 vmx_msr_low, vmx_msr_high; |
| 869 | u32 ctl = ctl_min | ctl_opt; | 924 | u32 ctl = ctl_min | ctl_opt; |
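
Only the signature of adjust_vmx_controls() changes in this hunk, but the negotiation it performs is worth spelling out: a VMX capability MSR reports must-be-one settings in its low 32 bits and may-be-one settings in its high 32 bits. A sketch of that adjustment, under those layout assumptions:

    #include <stdint.h>

    static int adjust_controls(uint32_t ctl_min, uint32_t ctl_opt,
                               uint64_t cap_msr, uint32_t *result)
    {
            uint32_t allowed0 = (uint32_t)cap_msr;         /* must be one */
            uint32_t allowed1 = (uint32_t)(cap_msr >> 32); /* may be one  */
            uint32_t ctl = ctl_min | ctl_opt;

            ctl &= allowed1;        /* drop optional bits the CPU lacks */
            ctl |= allowed0;        /* force the required-one bits on   */

            if ((ctl & ctl_min) != ctl_min) /* a mandatory bit is missing */
                    return -1;
            *result = ctl;
            return 0;
    }
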
| @@ -887,6 +942,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
| 887 | u32 min, opt; | 942 | u32 min, opt; |
| 888 | u32 _pin_based_exec_control = 0; | 943 | u32 _pin_based_exec_control = 0; |
| 889 | u32 _cpu_based_exec_control = 0; | 944 | u32 _cpu_based_exec_control = 0; |
| 945 | u32 _cpu_based_2nd_exec_control = 0; | ||
| 890 | u32 _vmexit_control = 0; | 946 | u32 _vmexit_control = 0; |
| 891 | u32 _vmentry_control = 0; | 947 | u32 _vmentry_control = 0; |
| 892 | 948 | ||
| @@ -904,11 +960,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
| 904 | CPU_BASED_USE_IO_BITMAPS | | 960 | CPU_BASED_USE_IO_BITMAPS | |
| 905 | CPU_BASED_MOV_DR_EXITING | | 961 | CPU_BASED_MOV_DR_EXITING | |
| 906 | CPU_BASED_USE_TSC_OFFSETING; | 962 | CPU_BASED_USE_TSC_OFFSETING; |
| 907 | #ifdef CONFIG_X86_64 | 963 | opt = CPU_BASED_TPR_SHADOW | |
| 908 | opt = CPU_BASED_TPR_SHADOW; | 964 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; |
| 909 | #else | ||
| 910 | opt = 0; | ||
| 911 | #endif | ||
| 912 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, | 965 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, |
| 913 | &_cpu_based_exec_control) < 0) | 966 | &_cpu_based_exec_control) < 0) |
| 914 | return -EIO; | 967 | return -EIO; |
| @@ -917,6 +970,19 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
| 917 | _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & | 970 | _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & |
| 918 | ~CPU_BASED_CR8_STORE_EXITING; | 971 | ~CPU_BASED_CR8_STORE_EXITING; |
| 919 | #endif | 972 | #endif |
| 973 | if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { | ||
| 974 | min = 0; | ||
| 975 | opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | ||
| 976 | SECONDARY_EXEC_WBINVD_EXITING; | ||
| 977 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, | ||
| 978 | &_cpu_based_2nd_exec_control) < 0) | ||
| 979 | return -EIO; | ||
| 980 | } | ||
| 981 | #ifndef CONFIG_X86_64 | ||
| 982 | if (!(_cpu_based_2nd_exec_control & | ||
| 983 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | ||
| 984 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
| 985 | #endif | ||
| 920 | 986 | ||
| 921 | min = 0; | 987 | min = 0; |
| 922 | #ifdef CONFIG_X86_64 | 988 | #ifdef CONFIG_X86_64 |
| @@ -954,6 +1020,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
| 954 | 1020 | ||
| 955 | vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; | 1021 | vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; |
| 956 | vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; | 1022 | vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; |
| 1023 | vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; | ||
| 957 | vmcs_conf->vmexit_ctrl = _vmexit_control; | 1024 | vmcs_conf->vmexit_ctrl = _vmexit_control; |
| 958 | vmcs_conf->vmentry_ctrl = _vmentry_control; | 1025 | vmcs_conf->vmentry_ctrl = _vmentry_control; |
| 959 | 1026 | ||
| @@ -1043,15 +1110,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
| 1043 | { | 1110 | { |
| 1044 | unsigned long flags; | 1111 | unsigned long flags; |
| 1045 | 1112 | ||
| 1046 | vcpu->rmode.active = 0; | 1113 | vcpu->arch.rmode.active = 0; |
| 1047 | 1114 | ||
| 1048 | vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); | 1115 | vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); |
| 1049 | vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); | 1116 | vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); |
| 1050 | vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); | 1117 | vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); |
| 1051 | 1118 | ||
| 1052 | flags = vmcs_readl(GUEST_RFLAGS); | 1119 | flags = vmcs_readl(GUEST_RFLAGS); |
| 1053 | flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); | 1120 | flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); |
| 1054 | flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); | 1121 | flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); |
| 1055 | vmcs_writel(GUEST_RFLAGS, flags); | 1122 | vmcs_writel(GUEST_RFLAGS, flags); |
| 1056 | 1123 | ||
| 1057 | vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | | 1124 | vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | |
| @@ -1059,10 +1126,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
| 1059 | 1126 | ||
| 1060 | update_exception_bitmap(vcpu); | 1127 | update_exception_bitmap(vcpu); |
| 1061 | 1128 | ||
| 1062 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es); | 1129 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); |
| 1063 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds); | 1130 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); |
| 1064 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs); | 1131 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); |
| 1065 | fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs); | 1132 | fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); |
| 1066 | 1133 | ||
| 1067 | vmcs_write16(GUEST_SS_SELECTOR, 0); | 1134 | vmcs_write16(GUEST_SS_SELECTOR, 0); |
| 1068 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); | 1135 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); |
| @@ -1072,10 +1139,14 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
| 1072 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | 1139 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); |
| 1073 | } | 1140 | } |
| 1074 | 1141 | ||
| 1075 | static gva_t rmode_tss_base(struct kvm* kvm) | 1142 | static gva_t rmode_tss_base(struct kvm *kvm) |
| 1076 | { | 1143 | { |
| 1077 | gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; | 1144 | if (!kvm->arch.tss_addr) { |
| 1078 | return base_gfn << PAGE_SHIFT; | 1145 | gfn_t base_gfn = kvm->memslots[0].base_gfn + |
| 1146 | kvm->memslots[0].npages - 3; | ||
| 1147 | return base_gfn << PAGE_SHIFT; | ||
| 1148 | } | ||
| 1149 | return kvm->arch.tss_addr; | ||
| 1079 | } | 1150 | } |
| 1080 | 1151 | ||
| 1081 | static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | 1152 | static void fix_rmode_seg(int seg, struct kvm_save_segment *save) |
| @@ -1086,7 +1157,8 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | |||
| 1086 | save->base = vmcs_readl(sf->base); | 1157 | save->base = vmcs_readl(sf->base); |
| 1087 | save->limit = vmcs_read32(sf->limit); | 1158 | save->limit = vmcs_read32(sf->limit); |
| 1088 | save->ar = vmcs_read32(sf->ar_bytes); | 1159 | save->ar = vmcs_read32(sf->ar_bytes); |
| 1089 | vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4); | 1160 | vmcs_write16(sf->selector, save->base >> 4); |
| 1161 | vmcs_write32(sf->base, save->base & 0xfffff); | ||
| 1090 | vmcs_write32(sf->limit, 0xffff); | 1162 | vmcs_write32(sf->limit, 0xffff); |
| 1091 | vmcs_write32(sf->ar_bytes, 0xf3); | 1163 | vmcs_write32(sf->ar_bytes, 0xf3); |
| 1092 | } | 1164 | } |
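
The fix above restores the real-mode invariant that a segment selector is its base shifted right by four, and clamps the base to the 20 bits vm86 can address. In sketch form:

    #include <stdint.h>

    /* e.g. base 0xf0000 yields selector 0xf000 */
    static inline uint16_t rm_selector(uint32_t base) { return base >> 4; }
    static inline uint32_t rm_base(uint32_t base)     { return base & 0xfffff; }
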
| @@ -1095,19 +1167,20 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
| 1095 | { | 1167 | { |
| 1096 | unsigned long flags; | 1168 | unsigned long flags; |
| 1097 | 1169 | ||
| 1098 | vcpu->rmode.active = 1; | 1170 | vcpu->arch.rmode.active = 1; |
| 1099 | 1171 | ||
| 1100 | vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); | 1172 | vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); |
| 1101 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); | 1173 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); |
| 1102 | 1174 | ||
| 1103 | vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); | 1175 | vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); |
| 1104 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); | 1176 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); |
| 1105 | 1177 | ||
| 1106 | vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); | 1178 | vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); |
| 1107 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | 1179 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); |
| 1108 | 1180 | ||
| 1109 | flags = vmcs_readl(GUEST_RFLAGS); | 1181 | flags = vmcs_readl(GUEST_RFLAGS); |
| 1110 | vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | 1182 | vcpu->arch.rmode.save_iopl |
| 1183 | = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | ||
| 1111 | 1184 | ||
| 1112 | flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | 1185 | flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; |
| 1113 | 1186 | ||
| @@ -1125,10 +1198,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
| 1125 | vmcs_writel(GUEST_CS_BASE, 0xf0000); | 1198 | vmcs_writel(GUEST_CS_BASE, 0xf0000); |
| 1126 | vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); | 1199 | vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); |
| 1127 | 1200 | ||
| 1128 | fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); | 1201 | fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); |
| 1129 | fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); | 1202 | fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); |
| 1130 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); | 1203 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); |
| 1131 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); | 1204 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); |
| 1132 | 1205 | ||
| 1133 | kvm_mmu_reset_context(vcpu); | 1206 | kvm_mmu_reset_context(vcpu); |
| 1134 | init_rmode_tss(vcpu->kvm); | 1207 | init_rmode_tss(vcpu->kvm); |
| @@ -1149,7 +1222,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) | |||
| 1149 | | AR_TYPE_BUSY_64_TSS); | 1222 | | AR_TYPE_BUSY_64_TSS); |
| 1150 | } | 1223 | } |
| 1151 | 1224 | ||
| 1152 | vcpu->shadow_efer |= EFER_LMA; | 1225 | vcpu->arch.shadow_efer |= EFER_LMA; |
| 1153 | 1226 | ||
| 1154 | find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; | 1227 | find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; |
| 1155 | vmcs_write32(VM_ENTRY_CONTROLS, | 1228 | vmcs_write32(VM_ENTRY_CONTROLS, |
| @@ -1159,7 +1232,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) | |||
| 1159 | 1232 | ||
| 1160 | static void exit_lmode(struct kvm_vcpu *vcpu) | 1233 | static void exit_lmode(struct kvm_vcpu *vcpu) |
| 1161 | { | 1234 | { |
| 1162 | vcpu->shadow_efer &= ~EFER_LMA; | 1235 | vcpu->arch.shadow_efer &= ~EFER_LMA; |
| 1163 | 1236 | ||
| 1164 | vmcs_write32(VM_ENTRY_CONTROLS, | 1237 | vmcs_write32(VM_ENTRY_CONTROLS, |
| 1165 | vmcs_read32(VM_ENTRY_CONTROLS) | 1238 | vmcs_read32(VM_ENTRY_CONTROLS) |
| @@ -1170,22 +1243,22 @@ static void exit_lmode(struct kvm_vcpu *vcpu) | |||
| 1170 | 1243 | ||
| 1171 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | 1244 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) |
| 1172 | { | 1245 | { |
| 1173 | vcpu->cr4 &= KVM_GUEST_CR4_MASK; | 1246 | vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; |
| 1174 | vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; | 1247 | vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; |
| 1175 | } | 1248 | } |
| 1176 | 1249 | ||
| 1177 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | 1250 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
| 1178 | { | 1251 | { |
| 1179 | vmx_fpu_deactivate(vcpu); | 1252 | vmx_fpu_deactivate(vcpu); |
| 1180 | 1253 | ||
| 1181 | if (vcpu->rmode.active && (cr0 & X86_CR0_PE)) | 1254 | if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) |
| 1182 | enter_pmode(vcpu); | 1255 | enter_pmode(vcpu); |
| 1183 | 1256 | ||
| 1184 | if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE)) | 1257 | if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) |
| 1185 | enter_rmode(vcpu); | 1258 | enter_rmode(vcpu); |
| 1186 | 1259 | ||
| 1187 | #ifdef CONFIG_X86_64 | 1260 | #ifdef CONFIG_X86_64 |
| 1188 | if (vcpu->shadow_efer & EFER_LME) { | 1261 | if (vcpu->arch.shadow_efer & EFER_LME) { |
| 1189 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) | 1262 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) |
| 1190 | enter_lmode(vcpu); | 1263 | enter_lmode(vcpu); |
| 1191 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) | 1264 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) |
| @@ -1196,7 +1269,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
| 1196 | vmcs_writel(CR0_READ_SHADOW, cr0); | 1269 | vmcs_writel(CR0_READ_SHADOW, cr0); |
| 1197 | vmcs_writel(GUEST_CR0, | 1270 | vmcs_writel(GUEST_CR0, |
| 1198 | (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); | 1271 | (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); |
| 1199 | vcpu->cr0 = cr0; | 1272 | vcpu->arch.cr0 = cr0; |
| 1200 | 1273 | ||
| 1201 | if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) | 1274 | if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) |
| 1202 | vmx_fpu_activate(vcpu); | 1275 | vmx_fpu_activate(vcpu); |
| @@ -1205,16 +1278,16 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
| 1205 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | 1278 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
| 1206 | { | 1279 | { |
| 1207 | vmcs_writel(GUEST_CR3, cr3); | 1280 | vmcs_writel(GUEST_CR3, cr3); |
| 1208 | if (vcpu->cr0 & X86_CR0_PE) | 1281 | if (vcpu->arch.cr0 & X86_CR0_PE) |
| 1209 | vmx_fpu_deactivate(vcpu); | 1282 | vmx_fpu_deactivate(vcpu); |
| 1210 | } | 1283 | } |
| 1211 | 1284 | ||
| 1212 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 1285 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
| 1213 | { | 1286 | { |
| 1214 | vmcs_writel(CR4_READ_SHADOW, cr4); | 1287 | vmcs_writel(CR4_READ_SHADOW, cr4); |
| 1215 | vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? | 1288 | vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ? |
| 1216 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); | 1289 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); |
| 1217 | vcpu->cr4 = cr4; | 1290 | vcpu->arch.cr4 = cr4; |
| 1218 | } | 1291 | } |
| 1219 | 1292 | ||
| 1220 | #ifdef CONFIG_X86_64 | 1293 | #ifdef CONFIG_X86_64 |
| @@ -1224,7 +1297,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) | |||
| 1224 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1297 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
| 1225 | struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); | 1298 | struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); |
| 1226 | 1299 | ||
| 1227 | vcpu->shadow_efer = efer; | 1300 | vcpu->arch.shadow_efer = efer; |
| 1228 | if (efer & EFER_LMA) { | 1301 | if (efer & EFER_LMA) { |
| 1229 | vmcs_write32(VM_ENTRY_CONTROLS, | 1302 | vmcs_write32(VM_ENTRY_CONTROLS, |
| 1230 | vmcs_read32(VM_ENTRY_CONTROLS) | | 1303 | vmcs_read32(VM_ENTRY_CONTROLS) | |
| @@ -1301,17 +1374,17 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
| 1301 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 1374 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
| 1302 | u32 ar; | 1375 | u32 ar; |
| 1303 | 1376 | ||
| 1304 | if (vcpu->rmode.active && seg == VCPU_SREG_TR) { | 1377 | if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { |
| 1305 | vcpu->rmode.tr.selector = var->selector; | 1378 | vcpu->arch.rmode.tr.selector = var->selector; |
| 1306 | vcpu->rmode.tr.base = var->base; | 1379 | vcpu->arch.rmode.tr.base = var->base; |
| 1307 | vcpu->rmode.tr.limit = var->limit; | 1380 | vcpu->arch.rmode.tr.limit = var->limit; |
| 1308 | vcpu->rmode.tr.ar = vmx_segment_access_rights(var); | 1381 | vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); |
| 1309 | return; | 1382 | return; |
| 1310 | } | 1383 | } |
| 1311 | vmcs_writel(sf->base, var->base); | 1384 | vmcs_writel(sf->base, var->base); |
| 1312 | vmcs_write32(sf->limit, var->limit); | 1385 | vmcs_write32(sf->limit, var->limit); |
| 1313 | vmcs_write16(sf->selector, var->selector); | 1386 | vmcs_write16(sf->selector, var->selector); |
| 1314 | if (vcpu->rmode.active && var->s) { | 1387 | if (vcpu->arch.rmode.active && var->s) { |
| 1315 | /* | 1388 | /* |
| 1316 | * Hack real-mode segments into vm86 compatibility. | 1389 | * Hack real-mode segments into vm86 compatibility. |
| 1317 | */ | 1390 | */ |
| @@ -1355,36 +1428,38 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | |||
| 1355 | vmcs_writel(GUEST_GDTR_BASE, dt->base); | 1428 | vmcs_writel(GUEST_GDTR_BASE, dt->base); |
| 1356 | } | 1429 | } |
| 1357 | 1430 | ||
| 1358 | static int init_rmode_tss(struct kvm* kvm) | 1431 | static int init_rmode_tss(struct kvm *kvm) |
| 1359 | { | 1432 | { |
| 1360 | struct page *p1, *p2, *p3; | ||
| 1361 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; | 1433 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; |
| 1362 | char *page; | 1434 | u16 data = 0; |
| 1363 | 1435 | int ret = 0; | |
| 1364 | p1 = gfn_to_page(kvm, fn++); | 1436 | int r; |
| 1365 | p2 = gfn_to_page(kvm, fn++); | ||
| 1366 | p3 = gfn_to_page(kvm, fn); | ||
| 1367 | |||
| 1368 | if (!p1 || !p2 || !p3) { | ||
| 1369 | kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__); | ||
| 1370 | return 0; | ||
| 1371 | } | ||
| 1372 | |||
| 1373 | page = kmap_atomic(p1, KM_USER0); | ||
| 1374 | clear_page(page); | ||
| 1375 | *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; | ||
| 1376 | kunmap_atomic(page, KM_USER0); | ||
| 1377 | |||
| 1378 | page = kmap_atomic(p2, KM_USER0); | ||
| 1379 | clear_page(page); | ||
| 1380 | kunmap_atomic(page, KM_USER0); | ||
| 1381 | 1437 | ||
| 1382 | page = kmap_atomic(p3, KM_USER0); | 1438 | down_read(¤t->mm->mmap_sem); |
| 1383 | clear_page(page); | 1439 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); |
| 1384 | *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; | 1440 | if (r < 0) |
| 1385 | kunmap_atomic(page, KM_USER0); | 1441 | goto out; |
| 1442 | data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; | ||
| 1443 | r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); | ||
| 1444 | if (r < 0) | ||
| 1445 | goto out; | ||
| 1446 | r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); | ||
| 1447 | if (r < 0) | ||
| 1448 | goto out; | ||
| 1449 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); | ||
| 1450 | if (r < 0) | ||
| 1451 | goto out; | ||
| 1452 | data = ~0; | ||
| 1453 | r = kvm_write_guest_page(kvm, fn, &data, | ||
| 1454 | RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, | ||
| 1455 | sizeof(u8)); | ||
| 1456 | if (r < 0) | ||
| 1457 | goto out; | ||
| 1386 | 1458 | ||
| 1387 | return 1; | 1459 | ret = 1; |
| 1460 | out: | ||
| 1461 | up_read(¤t->mm->mmap_sem); | ||
| 1462 | return ret; | ||
| 1388 | } | 1463 | } |
| 1389 | 1464 | ||
| 1390 | static void seg_setup(int seg) | 1465 | static void seg_setup(int seg) |
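
The rewritten init_rmode_tss() drops the direct gfn_to_page()/kmap_atomic() juggling in favor of the guest-page write helpers, and funnels every failure through one unwind label so mmap_sem is always released. A user-space sketch of that single-unwind shape, with stand-ins for the lock and the page writes:

    #include <stdio.h>

    static int step(int n)   { return n == 2 ? -1 : 0; }  /* stand-in */
    static void lock(void)   { puts("lock"); }
    static void unlock(void) { puts("unlock"); }

    static int setup(void)
    {
            int ret = 0;    /* "failed" until every step succeeds */
            int i;

            lock();
            for (i = 0; i < 4; i++)
                    if (step(i) < 0)
                            goto out;
            ret = 1;
    out:
            unlock();
            return ret;
    }
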
| @@ -1397,6 +1472,27 @@ static void seg_setup(int seg) | |||
| 1397 | vmcs_write32(sf->ar_bytes, 0x93); | 1472 | vmcs_write32(sf->ar_bytes, 0x93); |
| 1398 | } | 1473 | } |
| 1399 | 1474 | ||
| 1475 | static int alloc_apic_access_page(struct kvm *kvm) | ||
| 1476 | { | ||
| 1477 | struct kvm_userspace_memory_region kvm_userspace_mem; | ||
| 1478 | int r = 0; | ||
| 1479 | |||
| 1480 | down_write(¤t->mm->mmap_sem); | ||
| 1481 | if (kvm->arch.apic_access_page) | ||
| 1482 | goto out; | ||
| 1483 | kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; | ||
| 1484 | kvm_userspace_mem.flags = 0; | ||
| 1485 | kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; | ||
| 1486 | kvm_userspace_mem.memory_size = PAGE_SIZE; | ||
| 1487 | r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); | ||
| 1488 | if (r) | ||
| 1489 | goto out; | ||
| 1490 | kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); | ||
| 1491 | out: | ||
| 1492 | up_write(¤t->mm->mmap_sem); | ||
| 1493 | return r; | ||
| 1494 | } | ||
| 1495 | |||
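The hard-coded gfn is just the xAPIC MMIO base scaled by the page size: 0xfee00000 >> 12 = 0xfee00, matching the gfn_to_page() call. Note the locking split this hunk relies on: the double-underscore __kvm_set_memory_region() expects the caller to hold mmap_sem (taken for writing here), while the plain kvm_set_memory_region() used by vmx_set_tss_addr() further down takes it internally.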
| 1400 | /* | 1496 | /* |
| 1401 | * Sets up the vmcs for emulated real mode. | 1497 | * Sets up the vmcs for emulated real mode. |
| 1402 | */ | 1498 | */ |
| @@ -1407,92 +1503,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
| 1407 | unsigned long a; | 1503 | unsigned long a; |
| 1408 | struct descriptor_table dt; | 1504 | struct descriptor_table dt; |
| 1409 | int i; | 1505 | int i; |
| 1410 | int ret = 0; | ||
| 1411 | unsigned long kvm_vmx_return; | 1506 | unsigned long kvm_vmx_return; |
| 1412 | u64 msr; | ||
| 1413 | u32 exec_control; | 1507 | u32 exec_control; |
| 1414 | 1508 | ||
| 1415 | if (!init_rmode_tss(vmx->vcpu.kvm)) { | ||
| 1416 | ret = -ENOMEM; | ||
| 1417 | goto out; | ||
| 1418 | } | ||
| 1419 | |||
| 1420 | vmx->vcpu.rmode.active = 0; | ||
| 1421 | |||
| 1422 | vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val(); | ||
| 1423 | set_cr8(&vmx->vcpu, 0); | ||
| 1424 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | ||
| 1425 | if (vmx->vcpu.vcpu_id == 0) | ||
| 1426 | msr |= MSR_IA32_APICBASE_BSP; | ||
| 1427 | kvm_set_apic_base(&vmx->vcpu, msr); | ||
| 1428 | |||
| 1429 | fx_init(&vmx->vcpu); | ||
| 1430 | |||
| 1431 | /* | ||
| 1432 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | ||
| 1433 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. | ||
| 1434 | */ | ||
| 1435 | if (vmx->vcpu.vcpu_id == 0) { | ||
| 1436 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); | ||
| 1437 | vmcs_writel(GUEST_CS_BASE, 0x000f0000); | ||
| 1438 | } else { | ||
| 1439 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8); | ||
| 1440 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12); | ||
| 1441 | } | ||
| 1442 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
| 1443 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
| 1444 | |||
| 1445 | seg_setup(VCPU_SREG_DS); | ||
| 1446 | seg_setup(VCPU_SREG_ES); | ||
| 1447 | seg_setup(VCPU_SREG_FS); | ||
| 1448 | seg_setup(VCPU_SREG_GS); | ||
| 1449 | seg_setup(VCPU_SREG_SS); | ||
| 1450 | |||
| 1451 | vmcs_write16(GUEST_TR_SELECTOR, 0); | ||
| 1452 | vmcs_writel(GUEST_TR_BASE, 0); | ||
| 1453 | vmcs_write32(GUEST_TR_LIMIT, 0xffff); | ||
| 1454 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
| 1455 | |||
| 1456 | vmcs_write16(GUEST_LDTR_SELECTOR, 0); | ||
| 1457 | vmcs_writel(GUEST_LDTR_BASE, 0); | ||
| 1458 | vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); | ||
| 1459 | vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); | ||
| 1460 | |||
| 1461 | vmcs_write32(GUEST_SYSENTER_CS, 0); | ||
| 1462 | vmcs_writel(GUEST_SYSENTER_ESP, 0); | ||
| 1463 | vmcs_writel(GUEST_SYSENTER_EIP, 0); | ||
| 1464 | |||
| 1465 | vmcs_writel(GUEST_RFLAGS, 0x02); | ||
| 1466 | if (vmx->vcpu.vcpu_id == 0) | ||
| 1467 | vmcs_writel(GUEST_RIP, 0xfff0); | ||
| 1468 | else | ||
| 1469 | vmcs_writel(GUEST_RIP, 0); | ||
| 1470 | vmcs_writel(GUEST_RSP, 0); | ||
| 1471 | |||
| 1472 | //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 | ||
| 1473 | vmcs_writel(GUEST_DR7, 0x400); | ||
| 1474 | |||
| 1475 | vmcs_writel(GUEST_GDTR_BASE, 0); | ||
| 1476 | vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); | ||
| 1477 | |||
| 1478 | vmcs_writel(GUEST_IDTR_BASE, 0); | ||
| 1479 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); | ||
| 1480 | |||
| 1481 | vmcs_write32(GUEST_ACTIVITY_STATE, 0); | ||
| 1482 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); | ||
| 1483 | vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); | ||
| 1484 | |||
| 1485 | /* I/O */ | 1509 | /* I/O */ |
| 1486 | vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); | 1510 | vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); |
| 1487 | vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); | 1511 | vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); |
| 1488 | 1512 | ||
| 1489 | guest_write_tsc(0); | ||
| 1490 | |||
| 1491 | vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ | 1513 | vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ |
| 1492 | 1514 | ||
| 1493 | /* Special registers */ | ||
| 1494 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | ||
| 1495 | |||
| 1496 | /* Control */ | 1515 | /* Control */ |
| 1497 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, | 1516 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, |
| 1498 | vmcs_config.pin_based_exec_ctrl); | 1517 | vmcs_config.pin_based_exec_ctrl); |
| @@ -1507,8 +1526,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
| 1507 | } | 1526 | } |
| 1508 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | 1527 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); |
| 1509 | 1528 | ||
| 1510 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); | 1529 | if (cpu_has_secondary_exec_ctrls()) { |
| 1511 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); | 1530 | exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; |
| 1531 | if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) | ||
| 1532 | exec_control &= | ||
| 1533 | ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
| 1534 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | ||
| 1535 | } | ||
| 1536 | |||
| 1537 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); | ||
| 1538 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); | ||
| 1512 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ | 1539 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ |
| 1513 | 1540 | ||
| 1514 | vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ | 1541 | vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ |
| @@ -1536,7 +1563,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
| 1536 | get_idt(&dt); | 1563 | get_idt(&dt); |
| 1537 | vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ | 1564 | vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ |
| 1538 | 1565 | ||
| 1539 | asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); | 1566 | asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); |
| 1540 | vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ | 1567 | vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ |
| 1541 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); | 1568 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); |
| 1542 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | 1569 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); |
| @@ -1567,97 +1594,145 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
| 1567 | ++vmx->nmsrs; | 1594 | ++vmx->nmsrs; |
| 1568 | } | 1595 | } |
| 1569 | 1596 | ||
| 1570 | setup_msrs(vmx); | ||
| 1571 | |||
| 1572 | vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); | 1597 | vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); |
| 1573 | 1598 | ||
| 1574 | /* 22.2.1, 20.8.1 */ | 1599 | /* 22.2.1, 20.8.1 */ |
| 1575 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); | 1600 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); |
| 1576 | 1601 | ||
| 1577 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ | ||
| 1578 | |||
| 1579 | #ifdef CONFIG_X86_64 | ||
| 1580 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); | ||
| 1581 | if (vm_need_tpr_shadow(vmx->vcpu.kvm)) | ||
| 1582 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, | ||
| 1583 | page_to_phys(vmx->vcpu.apic->regs_page)); | ||
| 1584 | vmcs_write32(TPR_THRESHOLD, 0); | ||
| 1585 | #endif | ||
| 1586 | |||
| 1587 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); | 1602 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); |
| 1588 | vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); | 1603 | vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); |
| 1589 | 1604 | ||
| 1590 | vmx->vcpu.cr0 = 0x60000010; | 1605 | if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) |
| 1591 | vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode | 1606 | if (alloc_apic_access_page(vmx->vcpu.kvm) != 0) |
| 1592 | vmx_set_cr4(&vmx->vcpu, 0); | 1607 | return -ENOMEM; |
| 1593 | #ifdef CONFIG_X86_64 | ||
| 1594 | vmx_set_efer(&vmx->vcpu, 0); | ||
| 1595 | #endif | ||
| 1596 | vmx_fpu_activate(&vmx->vcpu); | ||
| 1597 | update_exception_bitmap(&vmx->vcpu); | ||
| 1598 | 1608 | ||
| 1599 | return 0; | 1609 | return 0; |
| 1600 | |||
| 1601 | out: | ||
| 1602 | return ret; | ||
| 1603 | } | 1610 | } |
| 1604 | 1611 | ||
| 1605 | static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) | 1612 | static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) |
| 1606 | { | 1613 | { |
| 1607 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1614 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
| 1615 | u64 msr; | ||
| 1616 | int ret; | ||
| 1608 | 1617 | ||
| 1609 | vmx_vcpu_setup(vmx); | 1618 | if (!init_rmode_tss(vmx->vcpu.kvm)) { |
| 1610 | } | 1619 | ret = -ENOMEM; |
| 1611 | 1620 | goto out; | |
| 1612 | static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) | ||
| 1613 | { | ||
| 1614 | u16 ent[2]; | ||
| 1615 | u16 cs; | ||
| 1616 | u16 ip; | ||
| 1617 | unsigned long flags; | ||
| 1618 | unsigned long ss_base = vmcs_readl(GUEST_SS_BASE); | ||
| 1619 | u16 sp = vmcs_readl(GUEST_RSP); | ||
| 1620 | u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT); | ||
| 1621 | |||
| 1622 | if (sp > ss_limit || sp < 6 ) { | ||
| 1623 | vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", | ||
| 1624 | __FUNCTION__, | ||
| 1625 | vmcs_readl(GUEST_RSP), | ||
| 1626 | vmcs_readl(GUEST_SS_BASE), | ||
| 1627 | vmcs_read32(GUEST_SS_LIMIT)); | ||
| 1628 | return; | ||
| 1629 | } | 1621 | } |
| 1630 | 1622 | ||
| 1631 | if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) != | 1623 | vmx->vcpu.arch.rmode.active = 0; |
| 1632 | X86EMUL_CONTINUE) { | 1624 | |
| 1633 | vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); | 1625 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); |
| 1634 | return; | 1626 | set_cr8(&vmx->vcpu, 0); |
| 1627 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | ||
| 1628 | if (vmx->vcpu.vcpu_id == 0) | ||
| 1629 | msr |= MSR_IA32_APICBASE_BSP; | ||
| 1630 | kvm_set_apic_base(&vmx->vcpu, msr); | ||
| 1631 | |||
| 1632 | fx_init(&vmx->vcpu); | ||
| 1633 | |||
| 1634 | /* | ||
| 1635 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | ||
| 1636 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. | ||
| 1637 | */ | ||
| 1638 | if (vmx->vcpu.vcpu_id == 0) { | ||
| 1639 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); | ||
| 1640 | vmcs_writel(GUEST_CS_BASE, 0x000f0000); | ||
| 1641 | } else { | ||
| 1642 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); | ||
| 1643 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); | ||
| 1635 | } | 1644 | } |
| 1645 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
| 1646 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
| 1647 | |||
| 1648 | seg_setup(VCPU_SREG_DS); | ||
| 1649 | seg_setup(VCPU_SREG_ES); | ||
| 1650 | seg_setup(VCPU_SREG_FS); | ||
| 1651 | seg_setup(VCPU_SREG_GS); | ||
| 1652 | seg_setup(VCPU_SREG_SS); | ||
| 1653 | |||
| 1654 | vmcs_write16(GUEST_TR_SELECTOR, 0); | ||
| 1655 | vmcs_writel(GUEST_TR_BASE, 0); | ||
| 1656 | vmcs_write32(GUEST_TR_LIMIT, 0xffff); | ||
| 1657 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
| 1636 | 1658 | ||
| 1637 | flags = vmcs_readl(GUEST_RFLAGS); | 1659 | vmcs_write16(GUEST_LDTR_SELECTOR, 0); |
| 1638 | cs = vmcs_readl(GUEST_CS_BASE) >> 4; | 1660 | vmcs_writel(GUEST_LDTR_BASE, 0); |
| 1639 | ip = vmcs_readl(GUEST_RIP); | 1661 | vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); |
| 1662 | vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); | ||
| 1640 | 1663 | ||
| 1664 | vmcs_write32(GUEST_SYSENTER_CS, 0); | ||
| 1665 | vmcs_writel(GUEST_SYSENTER_ESP, 0); | ||
| 1666 | vmcs_writel(GUEST_SYSENTER_EIP, 0); | ||
| 1641 | 1667 | ||
| 1642 | if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE || | 1668 | vmcs_writel(GUEST_RFLAGS, 0x02); |
| 1643 | emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE || | 1669 | if (vmx->vcpu.vcpu_id == 0) |
| 1644 | emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) { | 1670 | vmcs_writel(GUEST_RIP, 0xfff0); |
| 1645 | vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); | 1671 | else |
| 1646 | return; | 1672 | vmcs_writel(GUEST_RIP, 0); |
| 1673 | vmcs_writel(GUEST_RSP, 0); | ||
| 1674 | |||
| 1675 | /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ | ||
| 1676 | vmcs_writel(GUEST_DR7, 0x400); | ||
| 1677 | |||
| 1678 | vmcs_writel(GUEST_GDTR_BASE, 0); | ||
| 1679 | vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); | ||
| 1680 | |||
| 1681 | vmcs_writel(GUEST_IDTR_BASE, 0); | ||
| 1682 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); | ||
| 1683 | |||
| 1684 | vmcs_write32(GUEST_ACTIVITY_STATE, 0); | ||
| 1685 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); | ||
| 1686 | vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); | ||
| 1687 | |||
| 1688 | guest_write_tsc(0); | ||
| 1689 | |||
| 1690 | /* Special registers */ | ||
| 1691 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | ||
| 1692 | |||
| 1693 | setup_msrs(vmx); | ||
| 1694 | |||
| 1695 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ | ||
| 1696 | |||
| 1697 | if (cpu_has_vmx_tpr_shadow()) { | ||
| 1698 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); | ||
| 1699 | if (vm_need_tpr_shadow(vmx->vcpu.kvm)) | ||
| 1700 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, | ||
| 1701 | page_to_phys(vmx->vcpu.arch.apic->regs_page)); | ||
| 1702 | vmcs_write32(TPR_THRESHOLD, 0); | ||
| 1647 | } | 1703 | } |
| 1648 | 1704 | ||
| 1649 | vmcs_writel(GUEST_RFLAGS, flags & | 1705 | if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) |
| 1650 | ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); | 1706 | vmcs_write64(APIC_ACCESS_ADDR, |
| 1651 | vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ; | 1707 | page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); |
| 1652 | vmcs_writel(GUEST_CS_BASE, ent[1] << 4); | 1708 | |
| 1653 | vmcs_writel(GUEST_RIP, ent[0]); | 1709 | vmx->vcpu.arch.cr0 = 0x60000010; |
| 1654 | vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); | 1710 | vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ |
| 1711 | vmx_set_cr4(&vmx->vcpu, 0); | ||
| 1712 | #ifdef CONFIG_X86_64 | ||
| 1713 | vmx_set_efer(&vmx->vcpu, 0); | ||
| 1714 | #endif | ||
| 1715 | vmx_fpu_activate(&vmx->vcpu); | ||
| 1716 | update_exception_bitmap(&vmx->vcpu); | ||
| 1717 | |||
| 1718 | return 0; | ||
| 1719 | |||
| 1720 | out: | ||
| 1721 | return ret; | ||
| 1655 | } | 1722 | } |
| 1656 | 1723 | ||
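The CS setup in vmx_vcpu_reset() encodes the x86 reset and SIPI conventions under the vm86 constraint that base = selector << 4: for the BSP, 0xf000 << 4 = 0xf0000, plus RIP 0xfff0, gives 0x000ffff0, the classic reset vector (the comment notes the architected base would really be 0xffff0000). For an AP started with SIPI vector V, the selector is V << 8 and the base V << 12, so execution begins at physical address V * 4096, exactly as the SIPI protocol requires.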
| 1657 | static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | 1724 | static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) |
| 1658 | { | 1725 | { |
| 1659 | if (vcpu->rmode.active) { | 1726 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
| 1660 | inject_rmode_irq(vcpu, irq); | 1727 | |
| 1728 | if (vcpu->arch.rmode.active) { | ||
| 1729 | vmx->rmode.irq.pending = true; | ||
| 1730 | vmx->rmode.irq.vector = irq; | ||
| 1731 | vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); | ||
| 1732 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
| 1733 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); | ||
| 1734 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | ||
| 1735 | vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); | ||
| 1661 | return; | 1736 | return; |
| 1662 | } | 1737 | } |
| 1663 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 1738 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
| @@ -1666,13 +1741,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | |||
| 1666 | 1741 | ||
| 1667 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | 1742 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) |
| 1668 | { | 1743 | { |
| 1669 | int word_index = __ffs(vcpu->irq_summary); | 1744 | int word_index = __ffs(vcpu->arch.irq_summary); |
| 1670 | int bit_index = __ffs(vcpu->irq_pending[word_index]); | 1745 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); |
| 1671 | int irq = word_index * BITS_PER_LONG + bit_index; | 1746 | int irq = word_index * BITS_PER_LONG + bit_index; |
| 1672 | 1747 | ||
| 1673 | clear_bit(bit_index, &vcpu->irq_pending[word_index]); | 1748 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); |
| 1674 | if (!vcpu->irq_pending[word_index]) | 1749 | if (!vcpu->arch.irq_pending[word_index]) |
| 1675 | clear_bit(word_index, &vcpu->irq_summary); | 1750 | clear_bit(word_index, &vcpu->arch.irq_summary); |
| 1676 | vmx_inject_irq(vcpu, irq); | 1751 | vmx_inject_irq(vcpu, irq); |
| 1677 | } | 1752 | } |
| 1678 | 1753 | ||
| @@ -1682,12 +1757,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, | |||
| 1682 | { | 1757 | { |
| 1683 | u32 cpu_based_vm_exec_control; | 1758 | u32 cpu_based_vm_exec_control; |
| 1684 | 1759 | ||
| 1685 | vcpu->interrupt_window_open = | 1760 | vcpu->arch.interrupt_window_open = |
| 1686 | ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | 1761 | ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && |
| 1687 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); | 1762 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); |
| 1688 | 1763 | ||
| 1689 | if (vcpu->interrupt_window_open && | 1764 | if (vcpu->arch.interrupt_window_open && |
| 1690 | vcpu->irq_summary && | 1765 | vcpu->arch.irq_summary && |
| 1691 | !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) | 1766 | !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) |
| 1692 | /* | 1767 | /* |
| 1693 | * If interrupts enabled, and not blocked by sti or mov ss. Good. | 1768 | * If interrupts enabled, and not blocked by sti or mov ss. Good. |
| @@ -1695,8 +1770,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, | |||
| 1695 | kvm_do_inject_irq(vcpu); | 1770 | kvm_do_inject_irq(vcpu); |
| 1696 | 1771 | ||
| 1697 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 1772 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
| 1698 | if (!vcpu->interrupt_window_open && | 1773 | if (!vcpu->arch.interrupt_window_open && |
| 1699 | (vcpu->irq_summary || kvm_run->request_interrupt_window)) | 1774 | (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) |
| 1700 | /* | 1775 | /* |
| 1701 | * Interrupts blocked. Wait for unblock. | 1776 | * Interrupts blocked. Wait for unblock. |
| 1702 | */ | 1777 | */ |
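Masking GUEST_INTERRUPTIBILITY_INFO with 3 tests the two low interruptibility-state bits, blocking-by-STI (bit 0) and blocking-by-MOV-SS (bit 1), so the window counts as open only when RFLAGS.IF is set and neither instruction shadow is active; when it is closed but an interrupt is pending, the code below turns on interrupt-window exiting to get a VM exit as soon as the guest can take the event.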
| @@ -1706,6 +1781,23 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, | |||
| 1706 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 1781 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
| 1707 | } | 1782 | } |
| 1708 | 1783 | ||
| 1784 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) | ||
| 1785 | { | ||
| 1786 | int ret; | ||
| 1787 | struct kvm_userspace_memory_region tss_mem = { | ||
| 1788 | .slot = 8, | ||
| 1789 | .guest_phys_addr = addr, | ||
| 1790 | .memory_size = PAGE_SIZE * 3, | ||
| 1791 | .flags = 0, | ||
| 1792 | }; | ||
| 1793 | |||
| 1794 | ret = kvm_set_memory_region(kvm, &tss_mem, 0); | ||
| 1795 | if (ret) | ||
| 1796 | return ret; | ||
| 1797 | kvm->arch.tss_addr = addr; | ||
| 1798 | return 0; | ||
| 1799 | } | ||
| 1800 | |||
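Slot 8 is the first slot past the eight user-visible memory slots of this era, and the three pages match RMODE_TSS_SIZE above. The function backs the KVM_SET_TSS_ADDR ioctl; a minimal userspace call might look like the following sketch (the address is illustrative, anything that stays clear of guest RAM works):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_tss(int vm_fd)
{
	/* Three pages tucked just below the BIOS area. */
	return ioctl(vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000UL);
}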
| 1709 | static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) | 1801 | static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) |
| 1710 | { | 1802 | { |
| 1711 | struct kvm_guest_debug *dbg = &vcpu->guest_debug; | 1803 | struct kvm_guest_debug *dbg = &vcpu->guest_debug; |
| @@ -1727,7 +1819,7 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) | |||
| 1727 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, | 1819 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, |
| 1728 | int vec, u32 err_code) | 1820 | int vec, u32 err_code) |
| 1729 | { | 1821 | { |
| 1730 | if (!vcpu->rmode.active) | 1822 | if (!vcpu->arch.rmode.active) |
| 1731 | return 0; | 1823 | return 0; |
| 1732 | 1824 | ||
| 1733 | /* | 1825 | /* |
| @@ -1735,32 +1827,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, | |||
| 1735 | * Cause the #SS fault with 0 error code in VM86 mode. | 1827 | * Cause the #SS fault with 0 error code in VM86 mode. |
| 1736 | */ | 1828 | */ |
| 1737 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) | 1829 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) |
| 1738 | if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) | 1830 | if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) |
| 1739 | return 1; | 1831 | return 1; |
| 1740 | return 0; | 1832 | return 0; |
| 1741 | } | 1833 | } |
| 1742 | 1834 | ||
| 1743 | static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 1835 | static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| 1744 | { | 1836 | { |
| 1837 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 1745 | u32 intr_info, error_code; | 1838 | u32 intr_info, error_code; |
| 1746 | unsigned long cr2, rip; | 1839 | unsigned long cr2, rip; |
| 1747 | u32 vect_info; | 1840 | u32 vect_info; |
| 1748 | enum emulation_result er; | 1841 | enum emulation_result er; |
| 1749 | int r; | ||
| 1750 | 1842 | ||
| 1751 | vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 1843 | vect_info = vmx->idt_vectoring_info; |
| 1752 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | 1844 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
| 1753 | 1845 | ||
| 1754 | if ((vect_info & VECTORING_INFO_VALID_MASK) && | 1846 | if ((vect_info & VECTORING_INFO_VALID_MASK) && |
| 1755 | !is_page_fault(intr_info)) { | 1847 | !is_page_fault(intr_info)) |
| 1756 | printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " | 1848 | printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " |
| 1757 | "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); | 1849 | "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); |
| 1758 | } | ||
| 1759 | 1850 | ||
| 1760 | if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { | 1851 | if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { |
| 1761 | int irq = vect_info & VECTORING_INFO_VECTOR_MASK; | 1852 | int irq = vect_info & VECTORING_INFO_VECTOR_MASK; |
| 1762 | set_bit(irq, vcpu->irq_pending); | 1853 | set_bit(irq, vcpu->arch.irq_pending); |
| 1763 | set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); | 1854 | set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); |
| 1764 | } | 1855 | } |
| 1765 | 1856 | ||
| 1766 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ | 1857 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ |
| @@ -1771,52 +1862,34 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1771 | return 1; | 1862 | return 1; |
| 1772 | } | 1863 | } |
| 1773 | 1864 | ||
| 1865 | if (is_invalid_opcode(intr_info)) { | ||
| 1866 | er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); | ||
| 1867 | if (er != EMULATE_DONE) | ||
| 1868 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 1869 | return 1; | ||
| 1870 | } | ||
| 1871 | |||
| 1774 | error_code = 0; | 1872 | error_code = 0; |
| 1775 | rip = vmcs_readl(GUEST_RIP); | 1873 | rip = vmcs_readl(GUEST_RIP); |
| 1776 | if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) | 1874 | if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) |
| 1777 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | 1875 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); |
| 1778 | if (is_page_fault(intr_info)) { | 1876 | if (is_page_fault(intr_info)) { |
| 1779 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | 1877 | cr2 = vmcs_readl(EXIT_QUALIFICATION); |
| 1780 | 1878 | return kvm_mmu_page_fault(vcpu, cr2, error_code); | |
| 1781 | mutex_lock(&vcpu->kvm->lock); | ||
| 1782 | r = kvm_mmu_page_fault(vcpu, cr2, error_code); | ||
| 1783 | if (r < 0) { | ||
| 1784 | mutex_unlock(&vcpu->kvm->lock); | ||
| 1785 | return r; | ||
| 1786 | } | ||
| 1787 | if (!r) { | ||
| 1788 | mutex_unlock(&vcpu->kvm->lock); | ||
| 1789 | return 1; | ||
| 1790 | } | ||
| 1791 | |||
| 1792 | er = emulate_instruction(vcpu, kvm_run, cr2, error_code); | ||
| 1793 | mutex_unlock(&vcpu->kvm->lock); | ||
| 1794 | |||
| 1795 | switch (er) { | ||
| 1796 | case EMULATE_DONE: | ||
| 1797 | return 1; | ||
| 1798 | case EMULATE_DO_MMIO: | ||
| 1799 | ++vcpu->stat.mmio_exits; | ||
| 1800 | return 0; | ||
| 1801 | case EMULATE_FAIL: | ||
| 1802 | kvm_report_emulation_failure(vcpu, "pagetable"); | ||
| 1803 | break; | ||
| 1804 | default: | ||
| 1805 | BUG(); | ||
| 1806 | } | ||
| 1807 | } | 1879 | } |
| 1808 | 1880 | ||
| 1809 | if (vcpu->rmode.active && | 1881 | if (vcpu->arch.rmode.active && |
| 1810 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, | 1882 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, |
| 1811 | error_code)) { | 1883 | error_code)) { |
| 1812 | if (vcpu->halt_request) { | 1884 | if (vcpu->arch.halt_request) { |
| 1813 | vcpu->halt_request = 0; | 1885 | vcpu->arch.halt_request = 0; |
| 1814 | return kvm_emulate_halt(vcpu); | 1886 | return kvm_emulate_halt(vcpu); |
| 1815 | } | 1887 | } |
| 1816 | return 1; | 1888 | return 1; |
| 1817 | } | 1889 | } |
| 1818 | 1890 | ||
| 1819 | if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { | 1891 | if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == |
| 1892 | (INTR_TYPE_EXCEPTION | 1)) { | ||
| 1820 | kvm_run->exit_reason = KVM_EXIT_DEBUG; | 1893 | kvm_run->exit_reason = KVM_EXIT_DEBUG; |
| 1821 | return 0; | 1894 | return 0; |
| 1822 | } | 1895 | } |
| @@ -1850,7 +1923,8 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1850 | string = (exit_qualification & 16) != 0; | 1923 | string = (exit_qualification & 16) != 0; |
| 1851 | 1924 | ||
| 1852 | if (string) { | 1925 | if (string) { |
| 1853 | if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) | 1926 | if (emulate_instruction(vcpu, |
| 1927 | kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) | ||
| 1854 | return 0; | 1928 | return 0; |
| 1855 | return 1; | 1929 | return 1; |
| 1856 | } | 1930 | } |
| @@ -1873,7 +1947,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
| 1873 | hypercall[0] = 0x0f; | 1947 | hypercall[0] = 0x0f; |
| 1874 | hypercall[1] = 0x01; | 1948 | hypercall[1] = 0x01; |
| 1875 | hypercall[2] = 0xc1; | 1949 | hypercall[2] = 0xc1; |
| 1876 | hypercall[3] = 0xc3; | ||
| 1877 | } | 1950 | } |
| 1878 | 1951 | ||
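The three bytes left in place decode to 0f 01 c1, the vmcall instruction; the dropped fourth byte was 0xc3 (ret). The patch is written over the guest's own hypercall instruction and executed in place, so a trailing return no longer belongs there.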
| 1879 | static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 1952 | static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| @@ -1890,23 +1963,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1890 | switch (cr) { | 1963 | switch (cr) { |
| 1891 | case 0: | 1964 | case 0: |
| 1892 | vcpu_load_rsp_rip(vcpu); | 1965 | vcpu_load_rsp_rip(vcpu); |
| 1893 | set_cr0(vcpu, vcpu->regs[reg]); | 1966 | set_cr0(vcpu, vcpu->arch.regs[reg]); |
| 1894 | skip_emulated_instruction(vcpu); | 1967 | skip_emulated_instruction(vcpu); |
| 1895 | return 1; | 1968 | return 1; |
| 1896 | case 3: | 1969 | case 3: |
| 1897 | vcpu_load_rsp_rip(vcpu); | 1970 | vcpu_load_rsp_rip(vcpu); |
| 1898 | set_cr3(vcpu, vcpu->regs[reg]); | 1971 | set_cr3(vcpu, vcpu->arch.regs[reg]); |
| 1899 | skip_emulated_instruction(vcpu); | 1972 | skip_emulated_instruction(vcpu); |
| 1900 | return 1; | 1973 | return 1; |
| 1901 | case 4: | 1974 | case 4: |
| 1902 | vcpu_load_rsp_rip(vcpu); | 1975 | vcpu_load_rsp_rip(vcpu); |
| 1903 | set_cr4(vcpu, vcpu->regs[reg]); | 1976 | set_cr4(vcpu, vcpu->arch.regs[reg]); |
| 1904 | skip_emulated_instruction(vcpu); | 1977 | skip_emulated_instruction(vcpu); |
| 1905 | return 1; | 1978 | return 1; |
| 1906 | case 8: | 1979 | case 8: |
| 1907 | vcpu_load_rsp_rip(vcpu); | 1980 | vcpu_load_rsp_rip(vcpu); |
| 1908 | set_cr8(vcpu, vcpu->regs[reg]); | 1981 | set_cr8(vcpu, vcpu->arch.regs[reg]); |
| 1909 | skip_emulated_instruction(vcpu); | 1982 | skip_emulated_instruction(vcpu); |
| 1983 | if (irqchip_in_kernel(vcpu->kvm)) | ||
| 1984 | return 1; | ||
| 1910 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; | 1985 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; |
| 1911 | return 0; | 1986 | return 0; |
| 1912 | }; | 1987 | }; |
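The early return added to the cr8 case reflects the in-kernel irqchip: when the local APIC is emulated in the kernel, set_cr8() updates the TPR directly and there is nothing for userspace to do, so the KVM_EXIT_SET_TPR exit is only taken on userspace-irqchip configurations.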
| @@ -1914,8 +1989,8 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1914 | case 2: /* clts */ | 1989 | case 2: /* clts */ |
| 1915 | vcpu_load_rsp_rip(vcpu); | 1990 | vcpu_load_rsp_rip(vcpu); |
| 1916 | vmx_fpu_deactivate(vcpu); | 1991 | vmx_fpu_deactivate(vcpu); |
| 1917 | vcpu->cr0 &= ~X86_CR0_TS; | 1992 | vcpu->arch.cr0 &= ~X86_CR0_TS; |
| 1918 | vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); | 1993 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); |
| 1919 | vmx_fpu_activate(vcpu); | 1994 | vmx_fpu_activate(vcpu); |
| 1920 | skip_emulated_instruction(vcpu); | 1995 | skip_emulated_instruction(vcpu); |
| 1921 | return 1; | 1996 | return 1; |
| @@ -1923,13 +1998,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1923 | switch (cr) { | 1998 | switch (cr) { |
| 1924 | case 3: | 1999 | case 3: |
| 1925 | vcpu_load_rsp_rip(vcpu); | 2000 | vcpu_load_rsp_rip(vcpu); |
| 1926 | vcpu->regs[reg] = vcpu->cr3; | 2001 | vcpu->arch.regs[reg] = vcpu->arch.cr3; |
| 1927 | vcpu_put_rsp_rip(vcpu); | 2002 | vcpu_put_rsp_rip(vcpu); |
| 1928 | skip_emulated_instruction(vcpu); | 2003 | skip_emulated_instruction(vcpu); |
| 1929 | return 1; | 2004 | return 1; |
| 1930 | case 8: | 2005 | case 8: |
| 1931 | vcpu_load_rsp_rip(vcpu); | 2006 | vcpu_load_rsp_rip(vcpu); |
| 1932 | vcpu->regs[reg] = get_cr8(vcpu); | 2007 | vcpu->arch.regs[reg] = get_cr8(vcpu); |
| 1933 | vcpu_put_rsp_rip(vcpu); | 2008 | vcpu_put_rsp_rip(vcpu); |
| 1934 | skip_emulated_instruction(vcpu); | 2009 | skip_emulated_instruction(vcpu); |
| 1935 | return 1; | 2010 | return 1; |
| @@ -1975,7 +2050,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1975 | default: | 2050 | default: |
| 1976 | val = 0; | 2051 | val = 0; |
| 1977 | } | 2052 | } |
| 1978 | vcpu->regs[reg] = val; | 2053 | vcpu->arch.regs[reg] = val; |
| 1979 | } else { | 2054 | } else { |
| 1980 | /* mov to dr */ | 2055 | /* mov to dr */ |
| 1981 | } | 2056 | } |
| @@ -1992,29 +2067,29 @@ static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1992 | 2067 | ||
| 1993 | static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2068 | static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| 1994 | { | 2069 | { |
| 1995 | u32 ecx = vcpu->regs[VCPU_REGS_RCX]; | 2070 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; |
| 1996 | u64 data; | 2071 | u64 data; |
| 1997 | 2072 | ||
| 1998 | if (vmx_get_msr(vcpu, ecx, &data)) { | 2073 | if (vmx_get_msr(vcpu, ecx, &data)) { |
| 1999 | vmx_inject_gp(vcpu, 0); | 2074 | kvm_inject_gp(vcpu, 0); |
| 2000 | return 1; | 2075 | return 1; |
| 2001 | } | 2076 | } |
| 2002 | 2077 | ||
| 2003 | /* FIXME: handling of bits 32:63 of rax, rdx */ | 2078 | /* FIXME: handling of bits 32:63 of rax, rdx */ |
| 2004 | vcpu->regs[VCPU_REGS_RAX] = data & -1u; | 2079 | vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; |
| 2005 | vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; | 2080 | vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; |
| 2006 | skip_emulated_instruction(vcpu); | 2081 | skip_emulated_instruction(vcpu); |
| 2007 | return 1; | 2082 | return 1; |
| 2008 | } | 2083 | } |
| 2009 | 2084 | ||
| 2010 | static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2085 | static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| 2011 | { | 2086 | { |
| 2012 | u32 ecx = vcpu->regs[VCPU_REGS_RCX]; | 2087 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; |
| 2013 | u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u) | 2088 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) |
| 2014 | | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); | 2089 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
| 2015 | 2090 | ||
| 2016 | if (vmx_set_msr(vcpu, ecx, data) != 0) { | 2091 | if (vmx_set_msr(vcpu, ecx, data) != 0) { |
| 2017 | vmx_inject_gp(vcpu, 0); | 2092 | kvm_inject_gp(vcpu, 0); |
| 2018 | return 1; | 2093 | return 1; |
| 2019 | } | 2094 | } |
| 2020 | 2095 | ||
| @@ -2042,7 +2117,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, | |||
| 2042 | * possible | 2117 | * possible |
| 2043 | */ | 2118 | */ |
| 2044 | if (kvm_run->request_interrupt_window && | 2119 | if (kvm_run->request_interrupt_window && |
| 2045 | !vcpu->irq_summary) { | 2120 | !vcpu->arch.irq_summary) { |
| 2046 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | 2121 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; |
| 2047 | ++vcpu->stat.irq_window_exits; | 2122 | ++vcpu->stat.irq_window_exits; |
| 2048 | return 0; | 2123 | return 0; |
| @@ -2059,7 +2134,35 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2059 | static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2134 | static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| 2060 | { | 2135 | { |
| 2061 | skip_emulated_instruction(vcpu); | 2136 | skip_emulated_instruction(vcpu); |
| 2062 | return kvm_hypercall(vcpu, kvm_run); | 2137 | kvm_emulate_hypercall(vcpu); |
| 2138 | return 1; | ||
| 2139 | } | ||
| 2140 | |||
| 2141 | static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
| 2142 | { | ||
| 2143 | skip_emulated_instruction(vcpu); | ||
| 2144 | /* TODO: Add support for VT-d/pass-through device */ | ||
| 2145 | return 1; | ||
| 2146 | } | ||
| 2147 | |||
| 2148 | static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
| 2149 | { | ||
| 2150 | u64 exit_qualification; | ||
| 2151 | enum emulation_result er; | ||
| 2152 | unsigned long offset; | ||
| 2153 | |||
| 2154 | exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | ||
| 2155 | offset = exit_qualification & 0xffful; | ||
| 2156 | |||
| 2157 | er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); | ||
| 2158 | |||
| 2159 | if (er != EMULATE_DONE) { | ||
| 2160 | printk(KERN_ERR | ||
| 2161 | "Fail to handle apic access vmexit! Offset is 0x%lx\n", | ||
| 2162 | offset); | ||
| 2163 | return -ENOTSUPP; | ||
| 2164 | } | ||
| 2165 | return 1; | ||
| 2063 | } | 2166 | } |
| 2064 | 2167 | ||
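On an APIC-access exit the low 12 bits of the exit qualification carry the page offset of the faulting access; a guest write to the TPR at 0xfee00080, say, arrives with offset 0x80. The access itself is resolved by re-running the instruction through the emulator, whose MMIO path lands in the virtual APIC, which is why the offset is only reported in the failure message.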
| 2065 | /* | 2168 | /* |
| @@ -2081,7 +2184,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | |||
| 2081 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, | 2184 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, |
| 2082 | [EXIT_REASON_HLT] = handle_halt, | 2185 | [EXIT_REASON_HLT] = handle_halt, |
| 2083 | [EXIT_REASON_VMCALL] = handle_vmcall, | 2186 | [EXIT_REASON_VMCALL] = handle_vmcall, |
| 2084 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold | 2187 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, |
| 2188 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | ||
| 2189 | [EXIT_REASON_WBINVD] = handle_wbinvd, | ||
| 2085 | }; | 2190 | }; |
| 2086 | 2191 | ||
| 2087 | static const int kvm_vmx_max_exit_handlers = | 2192 | static const int kvm_vmx_max_exit_handlers = |
| @@ -2093,9 +2198,9 @@ static const int kvm_vmx_max_exit_handlers = | |||
| 2093 | */ | 2198 | */ |
| 2094 | static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | 2199 | static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) |
| 2095 | { | 2200 | { |
| 2096 | u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
| 2097 | u32 exit_reason = vmcs_read32(VM_EXIT_REASON); | 2201 | u32 exit_reason = vmcs_read32(VM_EXIT_REASON); |
| 2098 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2202 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
| 2203 | u32 vectoring_info = vmx->idt_vectoring_info; | ||
| 2099 | 2204 | ||
| 2100 | if (unlikely(vmx->fail)) { | 2205 | if (unlikely(vmx->fail)) { |
| 2101 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 2206 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
| @@ -2104,8 +2209,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
| 2104 | return 0; | 2209 | return 0; |
| 2105 | } | 2210 | } |
| 2106 | 2211 | ||
| 2107 | if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && | 2212 | if ((vectoring_info & VECTORING_INFO_VALID_MASK) && |
| 2108 | exit_reason != EXIT_REASON_EXCEPTION_NMI ) | 2213 | exit_reason != EXIT_REASON_EXCEPTION_NMI) |
| 2109 | printk(KERN_WARNING "%s: unexpected, valid vectoring info and " | 2214 | printk(KERN_WARNING "%s: unexpected, valid vectoring info and " |
| 2110 | "exit reason is 0x%x\n", __FUNCTION__, exit_reason); | 2215 | "exit reason is 0x%x\n", __FUNCTION__, exit_reason); |
| 2111 | if (exit_reason < kvm_vmx_max_exit_handlers | 2216 | if (exit_reason < kvm_vmx_max_exit_handlers |
| @@ -2150,26 +2255,38 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) | |||
| 2150 | 2255 | ||
| 2151 | static void vmx_intr_assist(struct kvm_vcpu *vcpu) | 2256 | static void vmx_intr_assist(struct kvm_vcpu *vcpu) |
| 2152 | { | 2257 | { |
| 2258 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2153 | u32 idtv_info_field, intr_info_field; | 2259 | u32 idtv_info_field, intr_info_field; |
| 2154 | int has_ext_irq, interrupt_window_open; | 2260 | int has_ext_irq, interrupt_window_open; |
| 2155 | int vector; | 2261 | int vector; |
| 2156 | 2262 | ||
| 2157 | kvm_inject_pending_timer_irqs(vcpu); | ||
| 2158 | update_tpr_threshold(vcpu); | 2263 | update_tpr_threshold(vcpu); |
| 2159 | 2264 | ||
| 2160 | has_ext_irq = kvm_cpu_has_interrupt(vcpu); | 2265 | has_ext_irq = kvm_cpu_has_interrupt(vcpu); |
| 2161 | intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); | 2266 | intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); |
| 2162 | idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 2267 | idtv_info_field = vmx->idt_vectoring_info; |
| 2163 | if (intr_info_field & INTR_INFO_VALID_MASK) { | 2268 | if (intr_info_field & INTR_INFO_VALID_MASK) { |
| 2164 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | 2269 | if (idtv_info_field & INTR_INFO_VALID_MASK) { |
| 2165 | /* TODO: fault when IDT_Vectoring */ | 2270 | /* TODO: fault when IDT_Vectoring */ |
| 2166 | printk(KERN_ERR "Fault when IDT_Vectoring\n"); | 2271 | if (printk_ratelimit()) |
| 2272 | printk(KERN_ERR "Fault when IDT_Vectoring\n"); | ||
| 2167 | } | 2273 | } |
| 2168 | if (has_ext_irq) | 2274 | if (has_ext_irq) |
| 2169 | enable_irq_window(vcpu); | 2275 | enable_irq_window(vcpu); |
| 2170 | return; | 2276 | return; |
| 2171 | } | 2277 | } |
| 2172 | if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { | 2278 | if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { |
| 2279 | if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) | ||
| 2280 | == INTR_TYPE_EXT_INTR | ||
| 2281 | && vcpu->arch.rmode.active) { | ||
| 2282 | u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; | ||
| 2283 | |||
| 2284 | vmx_inject_irq(vcpu, vect); | ||
| 2285 | if (unlikely(has_ext_irq)) | ||
| 2286 | enable_irq_window(vcpu); | ||
| 2287 | return; | ||
| 2288 | } | ||
| 2289 | |||
| 2173 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); | 2290 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); |
| 2174 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | 2291 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, |
| 2175 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); | 2292 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); |
| @@ -2194,6 +2311,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) | |||
| 2194 | enable_irq_window(vcpu); | 2311 | enable_irq_window(vcpu); |
| 2195 | } | 2312 | } |
| 2196 | 2313 | ||
| 2314 | /* | ||
| 2315 | * Failure to inject an interrupt should give us the information | ||
| 2316 | * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs | ||
| 2317 | * when fetching the interrupt redirection bitmap in the real-mode | ||
| 2318 | * tss, this doesn't happen. So we do it ourselves. | ||
| 2319 | */ | ||
| 2320 | static void fixup_rmode_irq(struct vcpu_vmx *vmx) | ||
| 2321 | { | ||
| 2322 | vmx->rmode.irq.pending = 0; | ||
| 2323 | if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) | ||
| 2324 | return; | ||
| 2325 | vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); | ||
| 2326 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { | ||
| 2327 | vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; | ||
| 2328 | vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; | ||
| 2329 | return; | ||
| 2330 | } | ||
| 2331 | vmx->idt_vectoring_info = | ||
| 2332 | VECTORING_INFO_VALID_MASK | ||
| 2333 | | INTR_TYPE_EXT_INTR | ||
| 2334 | | vmx->rmode.irq.vector; | ||
| 2335 | } | ||
| 2336 | |||
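This fixup is the other half of the rewind trick in vmx_inject_irq(): the real-mode path claims VM_ENTRY_INSTRUCTION_LEN of 1 and backs GUEST_RIP up one byte, so a delivered event leaves the guest past the saved rip while a swallowed one leaves GUEST_RIP + 1 equal to it. The invariant, as an illustrative (non-kernel) predicate:

/* Nonzero when the injected soft interrupt was actually delivered and
 * no replay through idt_vectoring_info is needed. */
static int rmode_injection_delivered(unsigned long guest_rip,
				     unsigned long saved_rip)
{
	return guest_rip + 1 != saved_rip;	/* mirrors the early return above */
}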
| 2197 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2337 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| 2198 | { | 2338 | { |
| 2199 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2339 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
| @@ -2204,50 +2344,47 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2204 | */ | 2344 | */ |
| 2205 | vmcs_writel(HOST_CR0, read_cr0()); | 2345 | vmcs_writel(HOST_CR0, read_cr0()); |
| 2206 | 2346 | ||
| 2207 | asm ( | 2347 | asm( |
| 2208 | /* Store host registers */ | 2348 | /* Store host registers */ |
| 2209 | #ifdef CONFIG_X86_64 | 2349 | #ifdef CONFIG_X86_64 |
| 2210 | "push %%rax; push %%rbx; push %%rdx;" | 2350 | "push %%rdx; push %%rbp;" |
| 2211 | "push %%rsi; push %%rdi; push %%rbp;" | ||
| 2212 | "push %%r8; push %%r9; push %%r10; push %%r11;" | ||
| 2213 | "push %%r12; push %%r13; push %%r14; push %%r15;" | ||
| 2214 | "push %%rcx \n\t" | 2351 | "push %%rcx \n\t" |
| 2215 | ASM_VMX_VMWRITE_RSP_RDX "\n\t" | ||
| 2216 | #else | 2352 | #else |
| 2217 | "pusha; push %%ecx \n\t" | 2353 | "push %%edx; push %%ebp;" |
| 2218 | ASM_VMX_VMWRITE_RSP_RDX "\n\t" | 2354 | "push %%ecx \n\t" |
| 2219 | #endif | 2355 | #endif |
| 2356 | ASM_VMX_VMWRITE_RSP_RDX "\n\t" | ||
| 2220 | /* Check if vmlaunch or vmresume is needed */ | 2357 | /* Check if vmlaunch or vmresume is needed */ |
| 2221 | "cmp $0, %1 \n\t" | 2358 | "cmpl $0, %c[launched](%0) \n\t" |
| 2222 | /* Load guest registers. Don't clobber flags. */ | 2359 | /* Load guest registers. Don't clobber flags. */ |
| 2223 | #ifdef CONFIG_X86_64 | 2360 | #ifdef CONFIG_X86_64 |
| 2224 | "mov %c[cr2](%3), %%rax \n\t" | 2361 | "mov %c[cr2](%0), %%rax \n\t" |
| 2225 | "mov %%rax, %%cr2 \n\t" | 2362 | "mov %%rax, %%cr2 \n\t" |
| 2226 | "mov %c[rax](%3), %%rax \n\t" | 2363 | "mov %c[rax](%0), %%rax \n\t" |
| 2227 | "mov %c[rbx](%3), %%rbx \n\t" | 2364 | "mov %c[rbx](%0), %%rbx \n\t" |
| 2228 | "mov %c[rdx](%3), %%rdx \n\t" | 2365 | "mov %c[rdx](%0), %%rdx \n\t" |
| 2229 | "mov %c[rsi](%3), %%rsi \n\t" | 2366 | "mov %c[rsi](%0), %%rsi \n\t" |
| 2230 | "mov %c[rdi](%3), %%rdi \n\t" | 2367 | "mov %c[rdi](%0), %%rdi \n\t" |
| 2231 | "mov %c[rbp](%3), %%rbp \n\t" | 2368 | "mov %c[rbp](%0), %%rbp \n\t" |
| 2232 | "mov %c[r8](%3), %%r8 \n\t" | 2369 | "mov %c[r8](%0), %%r8 \n\t" |
| 2233 | "mov %c[r9](%3), %%r9 \n\t" | 2370 | "mov %c[r9](%0), %%r9 \n\t" |
| 2234 | "mov %c[r10](%3), %%r10 \n\t" | 2371 | "mov %c[r10](%0), %%r10 \n\t" |
| 2235 | "mov %c[r11](%3), %%r11 \n\t" | 2372 | "mov %c[r11](%0), %%r11 \n\t" |
| 2236 | "mov %c[r12](%3), %%r12 \n\t" | 2373 | "mov %c[r12](%0), %%r12 \n\t" |
| 2237 | "mov %c[r13](%3), %%r13 \n\t" | 2374 | "mov %c[r13](%0), %%r13 \n\t" |
| 2238 | "mov %c[r14](%3), %%r14 \n\t" | 2375 | "mov %c[r14](%0), %%r14 \n\t" |
| 2239 | "mov %c[r15](%3), %%r15 \n\t" | 2376 | "mov %c[r15](%0), %%r15 \n\t" |
| 2240 | "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ | 2377 | "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ |
| 2241 | #else | 2378 | #else |
| 2242 | "mov %c[cr2](%3), %%eax \n\t" | 2379 | "mov %c[cr2](%0), %%eax \n\t" |
| 2243 | "mov %%eax, %%cr2 \n\t" | 2380 | "mov %%eax, %%cr2 \n\t" |
| 2244 | "mov %c[rax](%3), %%eax \n\t" | 2381 | "mov %c[rax](%0), %%eax \n\t" |
| 2245 | "mov %c[rbx](%3), %%ebx \n\t" | 2382 | "mov %c[rbx](%0), %%ebx \n\t" |
| 2246 | "mov %c[rdx](%3), %%edx \n\t" | 2383 | "mov %c[rdx](%0), %%edx \n\t" |
| 2247 | "mov %c[rsi](%3), %%esi \n\t" | 2384 | "mov %c[rsi](%0), %%esi \n\t" |
| 2248 | "mov %c[rdi](%3), %%edi \n\t" | 2385 | "mov %c[rdi](%0), %%edi \n\t" |
| 2249 | "mov %c[rbp](%3), %%ebp \n\t" | 2386 | "mov %c[rbp](%0), %%ebp \n\t" |
| 2250 | "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ | 2387 | "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ |
| 2251 | #endif | 2388 | #endif |
| 2252 | /* Enter guest mode */ | 2389 | /* Enter guest mode */ |
| 2253 | "jne .Llaunched \n\t" | 2390 | "jne .Llaunched \n\t" |
| @@ -2257,72 +2394,79 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2257 | ".Lkvm_vmx_return: " | 2394 | ".Lkvm_vmx_return: " |
| 2258 | /* Save guest registers, load host registers, keep flags */ | 2395 | /* Save guest registers, load host registers, keep flags */ |
| 2259 | #ifdef CONFIG_X86_64 | 2396 | #ifdef CONFIG_X86_64 |
| 2260 | "xchg %3, (%%rsp) \n\t" | 2397 | "xchg %0, (%%rsp) \n\t" |
| 2261 | "mov %%rax, %c[rax](%3) \n\t" | 2398 | "mov %%rax, %c[rax](%0) \n\t" |
| 2262 | "mov %%rbx, %c[rbx](%3) \n\t" | 2399 | "mov %%rbx, %c[rbx](%0) \n\t" |
| 2263 | "pushq (%%rsp); popq %c[rcx](%3) \n\t" | 2400 | "pushq (%%rsp); popq %c[rcx](%0) \n\t" |
| 2264 | "mov %%rdx, %c[rdx](%3) \n\t" | 2401 | "mov %%rdx, %c[rdx](%0) \n\t" |
| 2265 | "mov %%rsi, %c[rsi](%3) \n\t" | 2402 | "mov %%rsi, %c[rsi](%0) \n\t" |
| 2266 | "mov %%rdi, %c[rdi](%3) \n\t" | 2403 | "mov %%rdi, %c[rdi](%0) \n\t" |
| 2267 | "mov %%rbp, %c[rbp](%3) \n\t" | 2404 | "mov %%rbp, %c[rbp](%0) \n\t" |
| 2268 | "mov %%r8, %c[r8](%3) \n\t" | 2405 | "mov %%r8, %c[r8](%0) \n\t" |
| 2269 | "mov %%r9, %c[r9](%3) \n\t" | 2406 | "mov %%r9, %c[r9](%0) \n\t" |
| 2270 | "mov %%r10, %c[r10](%3) \n\t" | 2407 | "mov %%r10, %c[r10](%0) \n\t" |
| 2271 | "mov %%r11, %c[r11](%3) \n\t" | 2408 | "mov %%r11, %c[r11](%0) \n\t" |
| 2272 | "mov %%r12, %c[r12](%3) \n\t" | 2409 | "mov %%r12, %c[r12](%0) \n\t" |
| 2273 | "mov %%r13, %c[r13](%3) \n\t" | 2410 | "mov %%r13, %c[r13](%0) \n\t" |
| 2274 | "mov %%r14, %c[r14](%3) \n\t" | 2411 | "mov %%r14, %c[r14](%0) \n\t" |
| 2275 | "mov %%r15, %c[r15](%3) \n\t" | 2412 | "mov %%r15, %c[r15](%0) \n\t" |
| 2276 | "mov %%cr2, %%rax \n\t" | 2413 | "mov %%cr2, %%rax \n\t" |
| 2277 | "mov %%rax, %c[cr2](%3) \n\t" | 2414 | "mov %%rax, %c[cr2](%0) \n\t" |
| 2278 | "mov (%%rsp), %3 \n\t" | ||
| 2279 | 2415 | ||
| 2280 | "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" | 2416 | "pop %%rbp; pop %%rbp; pop %%rdx \n\t" |
| 2281 | "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" | ||
| 2282 | "pop %%rbp; pop %%rdi; pop %%rsi;" | ||
| 2283 | "pop %%rdx; pop %%rbx; pop %%rax \n\t" | ||
| 2284 | #else | 2417 | #else |
| 2285 | "xchg %3, (%%esp) \n\t" | 2418 | "xchg %0, (%%esp) \n\t" |
| 2286 | "mov %%eax, %c[rax](%3) \n\t" | 2419 | "mov %%eax, %c[rax](%0) \n\t" |
| 2287 | "mov %%ebx, %c[rbx](%3) \n\t" | 2420 | "mov %%ebx, %c[rbx](%0) \n\t" |
| 2288 | "pushl (%%esp); popl %c[rcx](%3) \n\t" | 2421 | "pushl (%%esp); popl %c[rcx](%0) \n\t" |
| 2289 | "mov %%edx, %c[rdx](%3) \n\t" | 2422 | "mov %%edx, %c[rdx](%0) \n\t" |
| 2290 | "mov %%esi, %c[rsi](%3) \n\t" | 2423 | "mov %%esi, %c[rsi](%0) \n\t" |
| 2291 | "mov %%edi, %c[rdi](%3) \n\t" | 2424 | "mov %%edi, %c[rdi](%0) \n\t" |
| 2292 | "mov %%ebp, %c[rbp](%3) \n\t" | 2425 | "mov %%ebp, %c[rbp](%0) \n\t" |
| 2293 | "mov %%cr2, %%eax \n\t" | 2426 | "mov %%cr2, %%eax \n\t" |
| 2294 | "mov %%eax, %c[cr2](%3) \n\t" | 2427 | "mov %%eax, %c[cr2](%0) \n\t" |
| 2295 | "mov (%%esp), %3 \n\t" | ||
| 2296 | 2428 | ||
| 2297 | "pop %%ecx; popa \n\t" | 2429 | "pop %%ebp; pop %%ebp; pop %%edx \n\t" |
| 2430 | #endif | ||
| 2431 | "setbe %c[fail](%0) \n\t" | ||
| 2432 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), | ||
| 2433 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), | ||
| 2434 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | ||
| 2435 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), | ||
| 2436 | [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), | ||
| 2437 | [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), | ||
| 2438 | [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), | ||
| 2439 | [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), | ||
| 2440 | [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), | ||
| 2441 | [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), | ||
| 2442 | #ifdef CONFIG_X86_64 | ||
| 2443 | [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), | ||
| 2444 | [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), | ||
| 2445 | [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), | ||
| 2446 | [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), | ||
| 2447 | [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), | ||
| 2448 | [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), | ||
| 2449 | [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), | ||
| 2450 | [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), | ||
| 2298 | #endif | 2451 | #endif |
| 2299 | "setbe %0 \n\t" | 2452 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) |
| 2300 | : "=q" (vmx->fail) | 2453 | : "cc", "memory" |
| 2301 | : "r"(vmx->launched), "d"((unsigned long)HOST_RSP), | ||
| 2302 | "c"(vcpu), | ||
| 2303 | [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])), | ||
| 2304 | [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), | ||
| 2305 | [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), | ||
| 2306 | [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), | ||
| 2307 | [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), | ||
| 2308 | [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), | ||
| 2309 | [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])), | ||
| 2310 | #ifdef CONFIG_X86_64 | 2454 | #ifdef CONFIG_X86_64 |
| 2311 | [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), | 2455 | , "rbx", "rdi", "rsi" |
| 2312 | [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), | 2456 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" |
| 2313 | [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), | 2457 | #else |
| 2314 | [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), | 2458 | , "ebx", "edi", "rsi" |
| 2315 | [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), | ||
| 2316 | [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), | ||
| 2317 | [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), | ||
| 2318 | [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])), | ||
| 2319 | #endif | 2459 | #endif |
| 2320 | [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) | 2460 | ); |
| 2321 | : "cc", "memory" ); | 2461 | |
| 2462 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
| 2463 | if (vmx->rmode.irq.pending) | ||
| 2464 | fixup_rmode_irq(vmx); | ||
| 2322 | 2465 | ||
| 2323 | vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; | 2466 | vcpu->arch.interrupt_window_open = |
| 2467 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; | ||
| 2324 | 2468 | ||
| 2325 | asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | 2469 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); |
| 2326 | vmx->launched = 1; | 2470 | vmx->launched = 1; |
| 2327 | 2471 | ||
| 2328 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | 2472 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
| @@ -2332,36 +2476,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2332 | asm("int $2"); | 2476 | asm("int $2"); |
| 2333 | } | 2477 | } |
| 2334 | 2478 | ||
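The rewritten entry asm addresses all guest state relative to the vmx pointer in %0, folding each structure offset in as an immediate through the %c operand modifier, and replaces the old push/pop ladder by naming the touched host registers as clobbers so the compiler preserves them. A minimal standalone illustration of that constraint style, using a hypothetical struct:

#include <stddef.h>

struct demo {
	unsigned long pad;
	unsigned long rax;
};

/* Load d->rax with the field offset folded into the addressing mode;
 * "%c[rax]" prints the constant without the usual '$' prefix. */
static unsigned long load_rax(struct demo *d)
{
	unsigned long v;

	asm("mov %c[rax](%1), %0"
	    : "=r"(v)
	    : "r"(d), [rax]"i"(offsetof(struct demo, rax)));
	return v;
}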
| 2335 | static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, | ||
| 2336 | unsigned long addr, | ||
| 2337 | u32 err_code) | ||
| 2338 | { | ||
| 2339 | u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
| 2340 | |||
| 2341 | ++vcpu->stat.pf_guest; | ||
| 2342 | |||
| 2343 | if (is_page_fault(vect_info)) { | ||
| 2344 | printk(KERN_DEBUG "inject_page_fault: " | ||
| 2345 | "double fault 0x%lx @ 0x%lx\n", | ||
| 2346 | addr, vmcs_readl(GUEST_RIP)); | ||
| 2347 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0); | ||
| 2348 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
| 2349 | DF_VECTOR | | ||
| 2350 | INTR_TYPE_EXCEPTION | | ||
| 2351 | INTR_INFO_DELIEVER_CODE_MASK | | ||
| 2352 | INTR_INFO_VALID_MASK); | ||
| 2353 | return; | ||
| 2354 | } | ||
| 2355 | vcpu->cr2 = addr; | ||
| 2356 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code); | ||
| 2357 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
| 2358 | PF_VECTOR | | ||
| 2359 | INTR_TYPE_EXCEPTION | | ||
| 2360 | INTR_INFO_DELIEVER_CODE_MASK | | ||
| 2361 | INTR_INFO_VALID_MASK); | ||
| 2362 | |||
| 2363 | } | ||
| 2364 | |||
| 2365 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) | 2479 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) |
| 2366 | { | 2480 | { |
| 2367 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2481 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
| @@ -2397,12 +2511,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
| 2397 | if (err) | 2511 | if (err) |
| 2398 | goto free_vcpu; | 2512 | goto free_vcpu; |
| 2399 | 2513 | ||
| 2400 | if (irqchip_in_kernel(kvm)) { | ||
| 2401 | err = kvm_create_lapic(&vmx->vcpu); | ||
| 2402 | if (err < 0) | ||
| 2403 | goto free_vcpu; | ||
| 2404 | } | ||
| 2405 | |||
| 2406 | vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | 2514 | vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); |
| 2407 | if (!vmx->guest_msrs) { | 2515 | if (!vmx->guest_msrs) { |
| 2408 | err = -ENOMEM; | 2516 | err = -ENOMEM; |
| @@ -2464,6 +2572,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
| 2464 | .check_processor_compatibility = vmx_check_processor_compat, | 2572 | .check_processor_compatibility = vmx_check_processor_compat, |
| 2465 | .hardware_enable = hardware_enable, | 2573 | .hardware_enable = hardware_enable, |
| 2466 | .hardware_disable = hardware_disable, | 2574 | .hardware_disable = hardware_disable, |
| 2575 | .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, | ||
| 2467 | 2576 | ||
| 2468 | .vcpu_create = vmx_create_vcpu, | 2577 | .vcpu_create = vmx_create_vcpu, |
| 2469 | .vcpu_free = vmx_free_vcpu, | 2578 | .vcpu_free = vmx_free_vcpu, |
| @@ -2499,9 +2608,6 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
| 2499 | .set_rflags = vmx_set_rflags, | 2608 | .set_rflags = vmx_set_rflags, |
| 2500 | 2609 | ||
| 2501 | .tlb_flush = vmx_flush_tlb, | 2610 | .tlb_flush = vmx_flush_tlb, |
| 2502 | .inject_page_fault = vmx_inject_page_fault, | ||
| 2503 | |||
| 2504 | .inject_gp = vmx_inject_gp, | ||
| 2505 | 2611 | ||
| 2506 | .run = vmx_vcpu_run, | 2612 | .run = vmx_vcpu_run, |
| 2507 | .handle_exit = kvm_handle_exit, | 2613 | .handle_exit = kvm_handle_exit, |
| @@ -2509,8 +2615,12 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
| 2509 | .patch_hypercall = vmx_patch_hypercall, | 2615 | .patch_hypercall = vmx_patch_hypercall, |
| 2510 | .get_irq = vmx_get_irq, | 2616 | .get_irq = vmx_get_irq, |
| 2511 | .set_irq = vmx_inject_irq, | 2617 | .set_irq = vmx_inject_irq, |
| 2618 | .queue_exception = vmx_queue_exception, | ||
| 2619 | .exception_injected = vmx_exception_injected, | ||
| 2512 | .inject_pending_irq = vmx_intr_assist, | 2620 | .inject_pending_irq = vmx_intr_assist, |
| 2513 | .inject_pending_vectors = do_interrupt_requests, | 2621 | .inject_pending_vectors = do_interrupt_requests, |
| 2622 | |||
| 2623 | .set_tss_addr = vmx_set_tss_addr, | ||
| 2514 | }; | 2624 | }; |
| 2515 | 2625 | ||
| 2516 | static int __init vmx_init(void) | 2626 | static int __init vmx_init(void) |
| @@ -2541,10 +2651,13 @@ static int __init vmx_init(void) | |||
| 2541 | memset(iova, 0xff, PAGE_SIZE); | 2651 | memset(iova, 0xff, PAGE_SIZE); |
| 2542 | kunmap(vmx_io_bitmap_b); | 2652 | kunmap(vmx_io_bitmap_b); |
| 2543 | 2653 | ||
| 2544 | r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); | 2654 | r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); |
| 2545 | if (r) | 2655 | if (r) |
| 2546 | goto out1; | 2656 | goto out1; |
| 2547 | 2657 | ||
| 2658 | if (bypass_guest_pf) | ||
| 2659 | kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); | ||
| 2660 | |||
| 2548 | return 0; | 2661 | return 0; |
| 2549 | 2662 | ||
| 2550 | out1: | 2663 | out1: |
| @@ -2559,7 +2672,7 @@ static void __exit vmx_exit(void) | |||
| 2559 | __free_page(vmx_io_bitmap_b); | 2672 | __free_page(vmx_io_bitmap_b); |
| 2560 | __free_page(vmx_io_bitmap_a); | 2673 | __free_page(vmx_io_bitmap_a); |
| 2561 | 2674 | ||
| 2562 | kvm_exit_x86(); | 2675 | kvm_exit(); |
| 2563 | } | 2676 | } |
| 2564 | 2677 | ||
| 2565 | module_init(vmx_init) | 2678 | module_init(vmx_init) |
diff --git a/drivers/kvm/vmx.h b/arch/x86/kvm/vmx.h index fd4e14666088..d52ae8d7303d 100644 --- a/drivers/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h | |||
| @@ -25,6 +25,9 @@ | |||
| 25 | * | 25 | * |
| 26 | */ | 26 | */ |
| 27 | 27 | ||
| 28 | /* | ||
| 29 | * Definitions of Primary Processor-Based VM-Execution Controls. | ||
| 30 | */ | ||
| 28 | #define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 | 31 | #define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 |
| 29 | #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 | 32 | #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 |
| 30 | #define CPU_BASED_HLT_EXITING 0x00000080 | 33 | #define CPU_BASED_HLT_EXITING 0x00000080 |
| @@ -42,6 +45,12 @@ | |||
| 42 | #define CPU_BASED_MONITOR_EXITING 0x20000000 | 45 | #define CPU_BASED_MONITOR_EXITING 0x20000000 |
| 43 | #define CPU_BASED_PAUSE_EXITING 0x40000000 | 46 | #define CPU_BASED_PAUSE_EXITING 0x40000000 |
| 44 | #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 | 47 | #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 |
| 48 | /* | ||
| 49 | * Definitions of Secondary Processor-Based VM-Execution Controls. | ||
| 50 | */ | ||
| 51 | #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 | ||
| 52 | #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 | ||
| 53 | |||
| 45 | 54 | ||
| 46 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 | 55 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 |
| 47 | #define PIN_BASED_NMI_EXITING 0x00000008 | 56 | #define PIN_BASED_NMI_EXITING 0x00000008 |
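A secondary execution control only takes effect when CPU_BASED_ACTIVATE_SECONDARY_CONTROLS is set in the primary controls, and each bit must first be probed from the VMX capability MSR. A hedged sketch, assuming the MSR_IA32_VMX_PROCBASED_CTLS2 index (0x48b) defined for this purpose:

/* The capability MSR reports allowed-0 settings in its low 32 bits
 * and allowed-1 settings in the high 32 bits; a control is usable
 * only if its bit is allowed to be 1. */
static inline int cpu_has_secondary_exec_ctrl(u32 ctrl)
{
        u32 vmx_msr_low, vmx_msr_high;

        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, vmx_msr_low, vmx_msr_high);
        return vmx_msr_high & ctrl;
}

/* e.g. cpu_has_secondary_exec_ctrl(SECONDARY_EXEC_WBINVD_EXITING) */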
| @@ -54,8 +63,6 @@ | |||
| 54 | #define VM_ENTRY_SMM 0x00000400 | 63 | #define VM_ENTRY_SMM 0x00000400 |
| 55 | #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 | 64 | #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 |
| 56 | 65 | ||
| 57 | #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 | ||
| 58 | |||
| 59 | /* VMCS Encodings */ | 66 | /* VMCS Encodings */ |
| 60 | enum vmcs_field { | 67 | enum vmcs_field { |
| 61 | GUEST_ES_SELECTOR = 0x00000800, | 68 | GUEST_ES_SELECTOR = 0x00000800, |
| @@ -89,6 +96,8 @@ enum vmcs_field { | |||
| 89 | TSC_OFFSET_HIGH = 0x00002011, | 96 | TSC_OFFSET_HIGH = 0x00002011, |
| 90 | VIRTUAL_APIC_PAGE_ADDR = 0x00002012, | 97 | VIRTUAL_APIC_PAGE_ADDR = 0x00002012, |
| 91 | VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, | 98 | VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, |
| 99 | APIC_ACCESS_ADDR = 0x00002014, | ||
| 100 | APIC_ACCESS_ADDR_HIGH = 0x00002015, | ||
| 92 | VMCS_LINK_POINTER = 0x00002800, | 101 | VMCS_LINK_POINTER = 0x00002800, |
| 93 | VMCS_LINK_POINTER_HIGH = 0x00002801, | 102 | VMCS_LINK_POINTER_HIGH = 0x00002801, |
| 94 | GUEST_IA32_DEBUGCTL = 0x00002802, | 103 | GUEST_IA32_DEBUGCTL = 0x00002802, |
| @@ -214,6 +223,8 @@ enum vmcs_field { | |||
| 214 | #define EXIT_REASON_MSR_WRITE 32 | 223 | #define EXIT_REASON_MSR_WRITE 32 |
| 215 | #define EXIT_REASON_MWAIT_INSTRUCTION 36 | 224 | #define EXIT_REASON_MWAIT_INSTRUCTION 36 |
| 216 | #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 | 225 | #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 |
| 226 | #define EXIT_REASON_APIC_ACCESS 44 | ||
| 227 | #define EXIT_REASON_WBINVD 54 | ||
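Each new exit reason needs a matching entry in the vmx.c dispatch table; the handler names below are reconstructions of that pattern, not quotes from the patch:

/* abridged: only the entries relevant to this hunk are shown */
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
                                      struct kvm_run *kvm_run) = {
        [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
        [EXIT_REASON_APIC_ACCESS]         = handle_apic_access,
        [EXIT_REASON_WBINVD]              = handle_wbinvd,
};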
| 217 | 228 | ||
| 218 | /* | 229 | /* |
| 219 | * Interruption-information format | 230 | * Interruption-information format |
| @@ -230,13 +241,14 @@ enum vmcs_field { | |||
| 230 | 241 | ||
| 231 | #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ | 242 | #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ |
| 232 | #define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ | 243 | #define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ |
| 244 | #define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ | ||
| 233 | 245 | ||
| 234 | /* | 246 | /* |
| 235 | * Exit Qualifications for MOV for Control Register Access | 247 | * Exit Qualifications for MOV for Control Register Access |
| 236 | */ | 248 | */ |
| 237 | #define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */ | 249 | #define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/ |
| 238 | #define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ | 250 | #define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ |
| 239 | #define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */ | 251 | #define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */ |
| 240 | #define LMSW_SOURCE_DATA_SHIFT 16 | 252 | #define LMSW_SOURCE_DATA_SHIFT 16 |
| 241 | #define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ | 253 | #define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ |
| 242 | #define REG_EAX (0 << 8) | 254 | #define REG_EAX (0 << 8) |
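Handlers pull these fields out of the exit qualification with the masks above; a small worked example of the decode (a sketch, not the vmx.c handle_cr() code):

static void decode_cr_access(unsigned long exit_qualification)
{
        int cr   = exit_qualification & CONTROL_REG_ACCESS_NUM;
        int reg  = (exit_qualification & CONTROL_REG_ACCESS_REG) >> 8;
        int type = (exit_qualification & CONTROL_REG_ACCESS_TYPE) >> 4;

        if (type == 0)          /* mov to cr */
                printk(KERN_DEBUG "mov reg%d -> cr%d\n", reg, cr);
        else if (type == 1)     /* mov from cr */
                printk(KERN_DEBUG "mov cr%d -> reg%d\n", cr, reg);
        /* type 2 is clts; type 3 is lmsw, source in bits 31:16 */
}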
| @@ -259,11 +271,11 @@ enum vmcs_field { | |||
| 259 | /* | 271 | /* |
| 260 | * Exit Qualifications for MOV for Debug Register Access | 272 | * Exit Qualifications for MOV for Debug Register Access |
| 261 | */ | 273 | */ |
| 262 | #define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */ | 274 | #define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */ |
| 263 | #define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ | 275 | #define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ |
| 264 | #define TYPE_MOV_TO_DR (0 << 4) | 276 | #define TYPE_MOV_TO_DR (0 << 4) |
| 265 | #define TYPE_MOV_FROM_DR (1 << 4) | 277 | #define TYPE_MOV_FROM_DR (1 << 4) |
| 266 | #define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */ | 278 | #define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */ |
| 267 | 279 | ||
| 268 | 280 | ||
| 269 | /* segment AR */ | 281 | /* segment AR */ |
| @@ -307,4 +319,6 @@ enum vmcs_field { | |||
| 307 | #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 | 319 | #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 |
| 308 | #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 | 320 | #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 |
| 309 | 321 | ||
| 322 | #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 | ||
| 323 | |||
| 310 | #endif | 324 | #endif |
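APIC_ACCESS_PAGE_PRIVATE_MEMSLOT names a memory slot above the range userspace can address, used to back the page whose physical address is written into APIC_ACCESS_ADDR. A sketch of the expected allocation path; the field names follow struct kvm_userspace_memory_region, and the call is a reconstruction rather than the literal patch code:

static int alloc_apic_access_page(struct kvm *kvm)
{
        struct kvm_userspace_memory_region mem = {
                .slot            = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
                .guest_phys_addr = 0xfee00000ULL,  /* APIC MMIO base */
                .memory_size     = PAGE_SIZE,
        };

        /* kernel-internal registration, bypassing the ioctl path */
        return __kvm_set_memory_region(kvm, &mem, 0);
}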
diff --git a/drivers/kvm/kvm_main.c b/arch/x86/kvm/x86.c index c0f372f1d761..8f94a0b89dff 100644 --- a/drivers/kvm/kvm_main.c +++ b/arch/x86/kvm/x86.c | |||
| @@ -1,8 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Kernel-based Virtual Machine driver for Linux | 2 | * Kernel-based Virtual Machine driver for Linux |
| 3 | * | 3 | * |
| 4 | * This module enables machines with Intel VT-x extensions to run virtual | 4 | * derived from drivers/kvm/kvm_main.c |
| 5 | * machines without emulation or binary translation. | ||
| 6 | * | 5 | * |
| 7 | * Copyright (C) 2006 Qumranet, Inc. | 6 | * Copyright (C) 2006 Qumranet, Inc. |
| 8 | * | 7 | * |
| @@ -15,80 +14,22 @@ | |||
| 15 | * | 14 | * |
| 16 | */ | 15 | */ |
| 17 | 16 | ||
| 18 | #include "kvm.h" | 17 | #include <linux/kvm_host.h> |
| 19 | #include "x86_emulate.h" | ||
| 20 | #include "segment_descriptor.h" | 18 | #include "segment_descriptor.h" |
| 21 | #include "irq.h" | 19 | #include "irq.h" |
| 20 | #include "mmu.h" | ||
| 22 | 21 | ||
| 23 | #include <linux/kvm.h> | 22 | #include <linux/kvm.h> |
| 24 | #include <linux/module.h> | 23 | #include <linux/fs.h> |
| 25 | #include <linux/errno.h> | ||
| 26 | #include <linux/percpu.h> | ||
| 27 | #include <linux/gfp.h> | ||
| 28 | #include <linux/mm.h> | ||
| 29 | #include <linux/miscdevice.h> | ||
| 30 | #include <linux/vmalloc.h> | 24 | #include <linux/vmalloc.h> |
| 31 | #include <linux/reboot.h> | 25 | #include <linux/module.h> |
| 32 | #include <linux/debugfs.h> | 26 | #include <linux/mman.h> |
| 33 | #include <linux/highmem.h> | 27 | #include <linux/highmem.h> |
| 34 | #include <linux/file.h> | ||
| 35 | #include <linux/sysdev.h> | ||
| 36 | #include <linux/cpu.h> | ||
| 37 | #include <linux/sched.h> | ||
| 38 | #include <linux/cpumask.h> | ||
| 39 | #include <linux/smp.h> | ||
| 40 | #include <linux/anon_inodes.h> | ||
| 41 | #include <linux/profile.h> | ||
| 42 | |||
| 43 | #include <asm/processor.h> | ||
| 44 | #include <asm/msr.h> | ||
| 45 | #include <asm/io.h> | ||
| 46 | #include <asm/uaccess.h> | ||
| 47 | #include <asm/desc.h> | ||
| 48 | |||
| 49 | MODULE_AUTHOR("Qumranet"); | ||
| 50 | MODULE_LICENSE("GPL"); | ||
| 51 | 28 | ||
| 52 | static DEFINE_SPINLOCK(kvm_lock); | 29 | #include <asm/uaccess.h> |
| 53 | static LIST_HEAD(vm_list); | 30 | #include <asm/msr.h> |
| 54 | |||
| 55 | static cpumask_t cpus_hardware_enabled; | ||
| 56 | |||
| 57 | struct kvm_x86_ops *kvm_x86_ops; | ||
| 58 | struct kmem_cache *kvm_vcpu_cache; | ||
| 59 | EXPORT_SYMBOL_GPL(kvm_vcpu_cache); | ||
| 60 | |||
| 61 | static __read_mostly struct preempt_ops kvm_preempt_ops; | ||
| 62 | |||
| 63 | #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) | ||
| 64 | |||
| 65 | static struct kvm_stats_debugfs_item { | ||
| 66 | const char *name; | ||
| 67 | int offset; | ||
| 68 | struct dentry *dentry; | ||
| 69 | } debugfs_entries[] = { | ||
| 70 | { "pf_fixed", STAT_OFFSET(pf_fixed) }, | ||
| 71 | { "pf_guest", STAT_OFFSET(pf_guest) }, | ||
| 72 | { "tlb_flush", STAT_OFFSET(tlb_flush) }, | ||
| 73 | { "invlpg", STAT_OFFSET(invlpg) }, | ||
| 74 | { "exits", STAT_OFFSET(exits) }, | ||
| 75 | { "io_exits", STAT_OFFSET(io_exits) }, | ||
| 76 | { "mmio_exits", STAT_OFFSET(mmio_exits) }, | ||
| 77 | { "signal_exits", STAT_OFFSET(signal_exits) }, | ||
| 78 | { "irq_window", STAT_OFFSET(irq_window_exits) }, | ||
| 79 | { "halt_exits", STAT_OFFSET(halt_exits) }, | ||
| 80 | { "halt_wakeup", STAT_OFFSET(halt_wakeup) }, | ||
| 81 | { "request_irq", STAT_OFFSET(request_irq_exits) }, | ||
| 82 | { "irq_exits", STAT_OFFSET(irq_exits) }, | ||
| 83 | { "light_exits", STAT_OFFSET(light_exits) }, | ||
| 84 | { "efer_reload", STAT_OFFSET(efer_reload) }, | ||
| 85 | { NULL } | ||
| 86 | }; | ||
| 87 | |||
| 88 | static struct dentry *debugfs_dir; | ||
| 89 | 31 | ||
| 90 | #define MAX_IO_MSRS 256 | 32 | #define MAX_IO_MSRS 256 |
| 91 | |||
| 92 | #define CR0_RESERVED_BITS \ | 33 | #define CR0_RESERVED_BITS \ |
| 93 | (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | 34 | (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ |
| 94 | | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | 35 | | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ |
| @@ -102,317 +43,151 @@ static struct dentry *debugfs_dir; | |||
| 102 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) | 43 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) |
| 103 | #define EFER_RESERVED_BITS 0xfffffffffffff2fe | 44 | #define EFER_RESERVED_BITS 0xfffffffffffff2fe |
| 104 | 45 | ||
| 105 | #ifdef CONFIG_X86_64 | 46 | #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM |
| 106 | // LDT or TSS descriptor in the GDT. 16 bytes. | 47 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU |
| 107 | struct segment_descriptor_64 { | ||
| 108 | struct segment_descriptor s; | ||
| 109 | u32 base_higher; | ||
| 110 | u32 pad_zero; | ||
| 111 | }; | ||
| 112 | 48 | ||
| 113 | #endif | 49 | struct kvm_x86_ops *kvm_x86_ops; |
| 50 | |||
| 51 | struct kvm_stats_debugfs_item debugfs_entries[] = { | ||
| 52 | { "pf_fixed", VCPU_STAT(pf_fixed) }, | ||
| 53 | { "pf_guest", VCPU_STAT(pf_guest) }, | ||
| 54 | { "tlb_flush", VCPU_STAT(tlb_flush) }, | ||
| 55 | { "invlpg", VCPU_STAT(invlpg) }, | ||
| 56 | { "exits", VCPU_STAT(exits) }, | ||
| 57 | { "io_exits", VCPU_STAT(io_exits) }, | ||
| 58 | { "mmio_exits", VCPU_STAT(mmio_exits) }, | ||
| 59 | { "signal_exits", VCPU_STAT(signal_exits) }, | ||
| 60 | { "irq_window", VCPU_STAT(irq_window_exits) }, | ||
| 61 | { "halt_exits", VCPU_STAT(halt_exits) }, | ||
| 62 | { "halt_wakeup", VCPU_STAT(halt_wakeup) }, | ||
| 63 | { "request_irq", VCPU_STAT(request_irq_exits) }, | ||
| 64 | { "irq_exits", VCPU_STAT(irq_exits) }, | ||
| 65 | { "host_state_reload", VCPU_STAT(host_state_reload) }, | ||
| 66 | { "efer_reload", VCPU_STAT(efer_reload) }, | ||
| 67 | { "fpu_reload", VCPU_STAT(fpu_reload) }, | ||
| 68 | { "insn_emulation", VCPU_STAT(insn_emulation) }, | ||
| 69 | { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, | ||
| 70 | { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, | ||
| 71 | { "mmu_pte_write", VM_STAT(mmu_pte_write) }, | ||
| 72 | { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, | ||
| 73 | { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, | ||
| 74 | { "mmu_flooded", VM_STAT(mmu_flooded) }, | ||
| 75 | { "mmu_recycled", VM_STAT(mmu_recycled) }, | ||
| 76 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, | ||
| 77 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, | ||
| 78 | { NULL } | ||
| 79 | }; | ||
| 114 | 80 | ||
| 115 | static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, | ||
| 116 | unsigned long arg); | ||
| 117 | 81 | ||
| 118 | unsigned long segment_base(u16 selector) | 82 | unsigned long segment_base(u16 selector) |
| 119 | { | 83 | { |
| 120 | struct descriptor_table gdt; | 84 | struct descriptor_table gdt; |
| 121 | struct segment_descriptor *d; | 85 | struct segment_descriptor *d; |
| 122 | unsigned long table_base; | 86 | unsigned long table_base; |
| 123 | typedef unsigned long ul; | ||
| 124 | unsigned long v; | 87 | unsigned long v; |
| 125 | 88 | ||
| 126 | if (selector == 0) | 89 | if (selector == 0) |
| 127 | return 0; | 90 | return 0; |
| 128 | 91 | ||
| 129 | asm ("sgdt %0" : "=m"(gdt)); | 92 | asm("sgdt %0" : "=m"(gdt)); |
| 130 | table_base = gdt.base; | 93 | table_base = gdt.base; |
| 131 | 94 | ||
| 132 | if (selector & 4) { /* from ldt */ | 95 | if (selector & 4) { /* from ldt */ |
| 133 | u16 ldt_selector; | 96 | u16 ldt_selector; |
| 134 | 97 | ||
| 135 | asm ("sldt %0" : "=g"(ldt_selector)); | 98 | asm("sldt %0" : "=g"(ldt_selector)); |
| 136 | table_base = segment_base(ldt_selector); | 99 | table_base = segment_base(ldt_selector); |
| 137 | } | 100 | } |
| 138 | d = (struct segment_descriptor *)(table_base + (selector & ~7)); | 101 | d = (struct segment_descriptor *)(table_base + (selector & ~7)); |
| 139 | v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); | 102 | v = d->base_low | ((unsigned long)d->base_mid << 16) | |
| 103 | ((unsigned long)d->base_high << 24); | ||
| 140 | #ifdef CONFIG_X86_64 | 104 | #ifdef CONFIG_X86_64 |
| 141 | if (d->system == 0 | 105 | if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) |
| 142 | && (d->type == 2 || d->type == 9 || d->type == 11)) | 106 | v |= ((unsigned long) \ |
| 143 | v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; | 107 | ((struct segment_descriptor_64 *)d)->base_higher) << 32; |
| 144 | #endif | 108 | #endif |
| 145 | return v; | 109 | return v; |
| 146 | } | 110 | } |
| 147 | EXPORT_SYMBOL_GPL(segment_base); | 111 | EXPORT_SYMBOL_GPL(segment_base); |
| 148 | 112 | ||
| 149 | static inline int valid_vcpu(int n) | 113 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) |
| 150 | { | ||
| 151 | return likely(n >= 0 && n < KVM_MAX_VCPUS); | ||
| 152 | } | ||
| 153 | |||
| 154 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) | ||
| 155 | { | ||
| 156 | if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) | ||
| 157 | return; | ||
| 158 | |||
| 159 | vcpu->guest_fpu_loaded = 1; | ||
| 160 | fx_save(&vcpu->host_fx_image); | ||
| 161 | fx_restore(&vcpu->guest_fx_image); | ||
| 162 | } | ||
| 163 | EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); | ||
| 164 | |||
| 165 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) | ||
| 166 | { | ||
| 167 | if (!vcpu->guest_fpu_loaded) | ||
| 168 | return; | ||
| 169 | |||
| 170 | vcpu->guest_fpu_loaded = 0; | ||
| 171 | fx_save(&vcpu->guest_fx_image); | ||
| 172 | fx_restore(&vcpu->host_fx_image); | ||
| 173 | } | ||
| 174 | EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); | ||
| 175 | |||
| 176 | /* | ||
| 177 | * Switches to specified vcpu, until a matching vcpu_put() | ||
| 178 | */ | ||
| 179 | static void vcpu_load(struct kvm_vcpu *vcpu) | ||
| 180 | { | ||
| 181 | int cpu; | ||
| 182 | |||
| 183 | mutex_lock(&vcpu->mutex); | ||
| 184 | cpu = get_cpu(); | ||
| 185 | preempt_notifier_register(&vcpu->preempt_notifier); | ||
| 186 | kvm_x86_ops->vcpu_load(vcpu, cpu); | ||
| 187 | put_cpu(); | ||
| 188 | } | ||
| 189 | |||
| 190 | static void vcpu_put(struct kvm_vcpu *vcpu) | ||
| 191 | { | ||
| 192 | preempt_disable(); | ||
| 193 | kvm_x86_ops->vcpu_put(vcpu); | ||
| 194 | preempt_notifier_unregister(&vcpu->preempt_notifier); | ||
| 195 | preempt_enable(); | ||
| 196 | mutex_unlock(&vcpu->mutex); | ||
| 197 | } | ||
| 198 | |||
| 199 | static void ack_flush(void *_completed) | ||
| 200 | { | ||
| 201 | } | ||
| 202 | |||
| 203 | void kvm_flush_remote_tlbs(struct kvm *kvm) | ||
| 204 | { | ||
| 205 | int i, cpu; | ||
| 206 | cpumask_t cpus; | ||
| 207 | struct kvm_vcpu *vcpu; | ||
| 208 | |||
| 209 | cpus_clear(cpus); | ||
| 210 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
| 211 | vcpu = kvm->vcpus[i]; | ||
| 212 | if (!vcpu) | ||
| 213 | continue; | ||
| 214 | if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests)) | ||
| 215 | continue; | ||
| 216 | cpu = vcpu->cpu; | ||
| 217 | if (cpu != -1 && cpu != raw_smp_processor_id()) | ||
| 218 | cpu_set(cpu, cpus); | ||
| 219 | } | ||
| 220 | smp_call_function_mask(cpus, ack_flush, NULL, 1); | ||
| 221 | } | ||
| 222 | |||
| 223 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) | ||
| 224 | { | 114 | { |
| 225 | struct page *page; | 115 | if (irqchip_in_kernel(vcpu->kvm)) |
| 226 | int r; | 116 | return vcpu->arch.apic_base; |
| 227 | |||
| 228 | mutex_init(&vcpu->mutex); | ||
| 229 | vcpu->cpu = -1; | ||
| 230 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
| 231 | vcpu->kvm = kvm; | ||
| 232 | vcpu->vcpu_id = id; | ||
| 233 | if (!irqchip_in_kernel(kvm) || id == 0) | ||
| 234 | vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; | ||
| 235 | else | 117 | else |
| 236 | vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED; | 118 | return vcpu->arch.apic_base; |
| 237 | init_waitqueue_head(&vcpu->wq); | ||
| 238 | |||
| 239 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
| 240 | if (!page) { | ||
| 241 | r = -ENOMEM; | ||
| 242 | goto fail; | ||
| 243 | } | ||
| 244 | vcpu->run = page_address(page); | ||
| 245 | |||
| 246 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
| 247 | if (!page) { | ||
| 248 | r = -ENOMEM; | ||
| 249 | goto fail_free_run; | ||
| 250 | } | ||
| 251 | vcpu->pio_data = page_address(page); | ||
| 252 | |||
| 253 | r = kvm_mmu_create(vcpu); | ||
| 254 | if (r < 0) | ||
| 255 | goto fail_free_pio_data; | ||
| 256 | |||
| 257 | return 0; | ||
| 258 | |||
| 259 | fail_free_pio_data: | ||
| 260 | free_page((unsigned long)vcpu->pio_data); | ||
| 261 | fail_free_run: | ||
| 262 | free_page((unsigned long)vcpu->run); | ||
| 263 | fail: | ||
| 264 | return -ENOMEM; | ||
| 265 | } | ||
| 266 | EXPORT_SYMBOL_GPL(kvm_vcpu_init); | ||
| 267 | |||
| 268 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) | ||
| 269 | { | ||
| 270 | kvm_mmu_destroy(vcpu); | ||
| 271 | if (vcpu->apic) | ||
| 272 | hrtimer_cancel(&vcpu->apic->timer.dev); | ||
| 273 | kvm_free_apic(vcpu->apic); | ||
| 274 | free_page((unsigned long)vcpu->pio_data); | ||
| 275 | free_page((unsigned long)vcpu->run); | ||
| 276 | } | ||
| 277 | EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); | ||
| 278 | |||
| 279 | static struct kvm *kvm_create_vm(void) | ||
| 280 | { | ||
| 281 | struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); | ||
| 282 | |||
| 283 | if (!kvm) | ||
| 284 | return ERR_PTR(-ENOMEM); | ||
| 285 | |||
| 286 | kvm_io_bus_init(&kvm->pio_bus); | ||
| 287 | mutex_init(&kvm->lock); | ||
| 288 | INIT_LIST_HEAD(&kvm->active_mmu_pages); | ||
| 289 | kvm_io_bus_init(&kvm->mmio_bus); | ||
| 290 | spin_lock(&kvm_lock); | ||
| 291 | list_add(&kvm->vm_list, &vm_list); | ||
| 292 | spin_unlock(&kvm_lock); | ||
| 293 | return kvm; | ||
| 294 | } | ||
| 295 | |||
| 296 | /* | ||
| 297 | * Free any memory in @free but not in @dont. | ||
| 298 | */ | ||
| 299 | static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | ||
| 300 | struct kvm_memory_slot *dont) | ||
| 301 | { | ||
| 302 | int i; | ||
| 303 | |||
| 304 | if (!dont || free->phys_mem != dont->phys_mem) | ||
| 305 | if (free->phys_mem) { | ||
| 306 | for (i = 0; i < free->npages; ++i) | ||
| 307 | if (free->phys_mem[i]) | ||
| 308 | __free_page(free->phys_mem[i]); | ||
| 309 | vfree(free->phys_mem); | ||
| 310 | } | ||
| 311 | |||
| 312 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) | ||
| 313 | vfree(free->dirty_bitmap); | ||
| 314 | |||
| 315 | free->phys_mem = NULL; | ||
| 316 | free->npages = 0; | ||
| 317 | free->dirty_bitmap = NULL; | ||
| 318 | } | ||
| 319 | |||
| 320 | static void kvm_free_physmem(struct kvm *kvm) | ||
| 321 | { | ||
| 322 | int i; | ||
| 323 | |||
| 324 | for (i = 0; i < kvm->nmemslots; ++i) | ||
| 325 | kvm_free_physmem_slot(&kvm->memslots[i], NULL); | ||
| 326 | } | 119 | } |
| 120 | EXPORT_SYMBOL_GPL(kvm_get_apic_base); | ||
| 327 | 121 | ||
| 328 | static void free_pio_guest_pages(struct kvm_vcpu *vcpu) | 122 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) |
| 329 | { | 123 | { |
| 330 | int i; | 124 | /* TODO: reserve bits check */ |
| 331 | 125 | if (irqchip_in_kernel(vcpu->kvm)) | |
| 332 | for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i) | 126 | kvm_lapic_set_base(vcpu, data); |
| 333 | if (vcpu->pio.guest_pages[i]) { | 127 | else |
| 334 | __free_page(vcpu->pio.guest_pages[i]); | 128 | vcpu->arch.apic_base = data; |
| 335 | vcpu->pio.guest_pages[i] = NULL; | ||
| 336 | } | ||
| 337 | } | 129 | } |
| 130 | EXPORT_SYMBOL_GPL(kvm_set_apic_base); | ||
| 338 | 131 | ||
| 339 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) | 132 | void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) |
| 340 | { | 133 | { |
| 341 | vcpu_load(vcpu); | 134 | WARN_ON(vcpu->arch.exception.pending); |
| 342 | kvm_mmu_unload(vcpu); | 135 | vcpu->arch.exception.pending = true; |
| 343 | vcpu_put(vcpu); | 136 | vcpu->arch.exception.has_error_code = false; |
| 137 | vcpu->arch.exception.nr = nr; | ||
| 344 | } | 138 | } |
| 139 | EXPORT_SYMBOL_GPL(kvm_queue_exception); | ||
| 345 | 140 | ||
| 346 | static void kvm_free_vcpus(struct kvm *kvm) | 141 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, |
| 142 | u32 error_code) | ||
| 347 | { | 143 | { |
| 348 | unsigned int i; | 144 | ++vcpu->stat.pf_guest; |
| 349 | 145 | if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) { | |
| 350 | /* | 146 | printk(KERN_DEBUG "kvm: inject_page_fault:" |
| 351 | * Unpin any mmu pages first. | 147 | " double fault 0x%lx\n", addr); |
| 352 | */ | 148 | vcpu->arch.exception.nr = DF_VECTOR; |
| 353 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | 149 | vcpu->arch.exception.error_code = 0; |
| 354 | if (kvm->vcpus[i]) | 150 | return; |
| 355 | kvm_unload_vcpu_mmu(kvm->vcpus[i]); | ||
| 356 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
| 357 | if (kvm->vcpus[i]) { | ||
| 358 | kvm_x86_ops->vcpu_free(kvm->vcpus[i]); | ||
| 359 | kvm->vcpus[i] = NULL; | ||
| 360 | } | ||
| 361 | } | 151 | } |
| 362 | 152 | vcpu->arch.cr2 = addr; | |
| 153 | kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); | ||
| 363 | } | 154 | } |
| 364 | 155 | ||
| 365 | static void kvm_destroy_vm(struct kvm *kvm) | 156 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) |
| 366 | { | 157 | { |
| 367 | spin_lock(&kvm_lock); | 158 | WARN_ON(vcpu->arch.exception.pending); |
| 368 | list_del(&kvm->vm_list); | 159 | vcpu->arch.exception.pending = true; |
| 369 | spin_unlock(&kvm_lock); | 160 | vcpu->arch.exception.has_error_code = true; |
| 370 | kvm_io_bus_destroy(&kvm->pio_bus); | 161 | vcpu->arch.exception.nr = nr; |
| 371 | kvm_io_bus_destroy(&kvm->mmio_bus); | 162 | vcpu->arch.exception.error_code = error_code; |
| 372 | kfree(kvm->vpic); | ||
| 373 | kfree(kvm->vioapic); | ||
| 374 | kvm_free_vcpus(kvm); | ||
| 375 | kvm_free_physmem(kvm); | ||
| 376 | kfree(kvm); | ||
| 377 | } | 163 | } |
| 164 | EXPORT_SYMBOL_GPL(kvm_queue_exception_e); | ||
| 378 | 165 | ||
| 379 | static int kvm_vm_release(struct inode *inode, struct file *filp) | 166 | static void __queue_exception(struct kvm_vcpu *vcpu) |
| 380 | { | 167 | { |
| 381 | struct kvm *kvm = filp->private_data; | 168 | kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, |
| 382 | 169 | vcpu->arch.exception.has_error_code, | |
| 383 | kvm_destroy_vm(kvm); | 170 | vcpu->arch.exception.error_code); |
| 384 | return 0; | ||
| 385 | } | ||
| 386 | |||
| 387 | static void inject_gp(struct kvm_vcpu *vcpu) | ||
| 388 | { | ||
| 389 | kvm_x86_ops->inject_gp(vcpu, 0); | ||
| 390 | } | 171 | } |
| 391 | 172 | ||
| 392 | /* | 173 | /* |
| 393 | * Load the pae pdptrs. Return true if they are all valid. | 174 | * Load the pae pdptrs. Return true if they are all valid. |
| 394 | */ | 175 | */ |
| 395 | static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | 176 | int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) |
| 396 | { | 177 | { |
| 397 | gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; | 178 | gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; |
| 398 | unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; | 179 | unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; |
| 399 | int i; | 180 | int i; |
| 400 | u64 *pdpt; | ||
| 401 | int ret; | 181 | int ret; |
| 402 | struct page *page; | 182 | u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; |
| 403 | u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)]; | ||
| 404 | 183 | ||
| 405 | mutex_lock(&vcpu->kvm->lock); | 184 | down_read(¤t->mm->mmap_sem); |
| 406 | page = gfn_to_page(vcpu->kvm, pdpt_gfn); | 185 | ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, |
| 407 | if (!page) { | 186 | offset * sizeof(u64), sizeof(pdpte)); |
| 187 | if (ret < 0) { | ||
| 408 | ret = 0; | 188 | ret = 0; |
| 409 | goto out; | 189 | goto out; |
| 410 | } | 190 | } |
| 411 | |||
| 412 | pdpt = kmap_atomic(page, KM_USER0); | ||
| 413 | memcpy(pdpte, pdpt+offset, sizeof(pdpte)); | ||
| 414 | kunmap_atomic(pdpt, KM_USER0); | ||
| 415 | |||
| 416 | for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { | 191 | for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { |
| 417 | if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { | 192 | if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { |
| 418 | ret = 0; | 193 | ret = 0; |
| @@ -421,78 +196,96 @@ static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
| 421 | } | 196 | } |
| 422 | ret = 1; | 197 | ret = 1; |
| 423 | 198 | ||
| 424 | memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs)); | 199 | memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); |
| 425 | out: | 200 | out: |
| 426 | mutex_unlock(&vcpu->kvm->lock); | 201 | up_read(¤t->mm->mmap_sem); |
| 427 | 202 | ||
| 428 | return ret; | 203 | return ret; |
| 429 | } | 204 | } |
| 430 | 205 | ||
| 206 | static bool pdptrs_changed(struct kvm_vcpu *vcpu) | ||
| 207 | { | ||
| 208 | u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; | ||
| 209 | bool changed = true; | ||
| 210 | int r; | ||
| 211 | |||
| 212 | if (is_long_mode(vcpu) || !is_pae(vcpu)) | ||
| 213 | return false; | ||
| 214 | |||
| 215 | down_read(¤t->mm->mmap_sem); | ||
| 216 | r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); | ||
| 217 | if (r < 0) | ||
| 218 | goto out; | ||
| 219 | changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; | ||
| 220 | out: | ||
| 221 | up_read(¤t->mm->mmap_sem); | ||
| 222 | |||
| 223 | return changed; | ||
| 224 | } | ||
| 225 | |||
| 431 | void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | 226 | void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
| 432 | { | 227 | { |
| 433 | if (cr0 & CR0_RESERVED_BITS) { | 228 | if (cr0 & CR0_RESERVED_BITS) { |
| 434 | printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", | 229 | printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", |
| 435 | cr0, vcpu->cr0); | 230 | cr0, vcpu->arch.cr0); |
| 436 | inject_gp(vcpu); | 231 | kvm_inject_gp(vcpu, 0); |
| 437 | return; | 232 | return; |
| 438 | } | 233 | } |
| 439 | 234 | ||
| 440 | if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { | 235 | if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { |
| 441 | printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); | 236 | printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); |
| 442 | inject_gp(vcpu); | 237 | kvm_inject_gp(vcpu, 0); |
| 443 | return; | 238 | return; |
| 444 | } | 239 | } |
| 445 | 240 | ||
| 446 | if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { | 241 | if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { |
| 447 | printk(KERN_DEBUG "set_cr0: #GP, set PG flag " | 242 | printk(KERN_DEBUG "set_cr0: #GP, set PG flag " |
| 448 | "and a clear PE flag\n"); | 243 | "and a clear PE flag\n"); |
| 449 | inject_gp(vcpu); | 244 | kvm_inject_gp(vcpu, 0); |
| 450 | return; | 245 | return; |
| 451 | } | 246 | } |
| 452 | 247 | ||
| 453 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { | 248 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { |
| 454 | #ifdef CONFIG_X86_64 | 249 | #ifdef CONFIG_X86_64 |
| 455 | if ((vcpu->shadow_efer & EFER_LME)) { | 250 | if ((vcpu->arch.shadow_efer & EFER_LME)) { |
| 456 | int cs_db, cs_l; | 251 | int cs_db, cs_l; |
| 457 | 252 | ||
| 458 | if (!is_pae(vcpu)) { | 253 | if (!is_pae(vcpu)) { |
| 459 | printk(KERN_DEBUG "set_cr0: #GP, start paging " | 254 | printk(KERN_DEBUG "set_cr0: #GP, start paging " |
| 460 | "in long mode while PAE is disabled\n"); | 255 | "in long mode while PAE is disabled\n"); |
| 461 | inject_gp(vcpu); | 256 | kvm_inject_gp(vcpu, 0); |
| 462 | return; | 257 | return; |
| 463 | } | 258 | } |
| 464 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | 259 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); |
| 465 | if (cs_l) { | 260 | if (cs_l) { |
| 466 | printk(KERN_DEBUG "set_cr0: #GP, start paging " | 261 | printk(KERN_DEBUG "set_cr0: #GP, start paging " |
| 467 | "in long mode while CS.L == 1\n"); | 262 | "in long mode while CS.L == 1\n"); |
| 468 | inject_gp(vcpu); | 263 | kvm_inject_gp(vcpu, 0); |
| 469 | return; | 264 | return; |
| 470 | 265 | ||
| 471 | } | 266 | } |
| 472 | } else | 267 | } else |
| 473 | #endif | 268 | #endif |
| 474 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) { | 269 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { |
| 475 | printk(KERN_DEBUG "set_cr0: #GP, pdptrs " | 270 | printk(KERN_DEBUG "set_cr0: #GP, pdptrs " |
| 476 | "reserved bits\n"); | 271 | "reserved bits\n"); |
| 477 | inject_gp(vcpu); | 272 | kvm_inject_gp(vcpu, 0); |
| 478 | return; | 273 | return; |
| 479 | } | 274 | } |
| 480 | 275 | ||
| 481 | } | 276 | } |
| 482 | 277 | ||
| 483 | kvm_x86_ops->set_cr0(vcpu, cr0); | 278 | kvm_x86_ops->set_cr0(vcpu, cr0); |
| 484 | vcpu->cr0 = cr0; | 279 | vcpu->arch.cr0 = cr0; |
| 485 | 280 | ||
| 486 | mutex_lock(&vcpu->kvm->lock); | ||
| 487 | kvm_mmu_reset_context(vcpu); | 281 | kvm_mmu_reset_context(vcpu); |
| 488 | mutex_unlock(&vcpu->kvm->lock); | ||
| 489 | return; | 282 | return; |
| 490 | } | 283 | } |
| 491 | EXPORT_SYMBOL_GPL(set_cr0); | 284 | EXPORT_SYMBOL_GPL(set_cr0); |
| 492 | 285 | ||
| 493 | void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) | 286 | void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) |
| 494 | { | 287 | { |
| 495 | set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); | 288 | set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); |
| 496 | } | 289 | } |
| 497 | EXPORT_SYMBOL_GPL(lmsw); | 290 | EXPORT_SYMBOL_GPL(lmsw); |
| 498 | 291 | ||
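Every former inject_gp(vcpu) call in this file becomes kvm_inject_gp(vcpu, 0). Under the new exception-queue model that helper presumably reduces to a one-liner over the API introduced above; a minimal sketch:

static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
{
        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}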
| @@ -500,7 +293,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
| 500 | { | 293 | { |
| 501 | if (cr4 & CR4_RESERVED_BITS) { | 294 | if (cr4 & CR4_RESERVED_BITS) { |
| 502 | printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); | 295 | printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); |
| 503 | inject_gp(vcpu); | 296 | kvm_inject_gp(vcpu, 0); |
| 504 | return; | 297 | return; |
| 505 | } | 298 | } |
| 506 | 299 | ||
| @@ -508,35 +301,38 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
| 508 | if (!(cr4 & X86_CR4_PAE)) { | 301 | if (!(cr4 & X86_CR4_PAE)) { |
| 509 | printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " | 302 | printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " |
| 510 | "in long mode\n"); | 303 | "in long mode\n"); |
| 511 | inject_gp(vcpu); | 304 | kvm_inject_gp(vcpu, 0); |
| 512 | return; | 305 | return; |
| 513 | } | 306 | } |
| 514 | } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) | 307 | } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) |
| 515 | && !load_pdptrs(vcpu, vcpu->cr3)) { | 308 | && !load_pdptrs(vcpu, vcpu->arch.cr3)) { |
| 516 | printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); | 309 | printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); |
| 517 | inject_gp(vcpu); | 310 | kvm_inject_gp(vcpu, 0); |
| 518 | return; | 311 | return; |
| 519 | } | 312 | } |
| 520 | 313 | ||
| 521 | if (cr4 & X86_CR4_VMXE) { | 314 | if (cr4 & X86_CR4_VMXE) { |
| 522 | printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); | 315 | printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); |
| 523 | inject_gp(vcpu); | 316 | kvm_inject_gp(vcpu, 0); |
| 524 | return; | 317 | return; |
| 525 | } | 318 | } |
| 526 | kvm_x86_ops->set_cr4(vcpu, cr4); | 319 | kvm_x86_ops->set_cr4(vcpu, cr4); |
| 527 | vcpu->cr4 = cr4; | 320 | vcpu->arch.cr4 = cr4; |
| 528 | mutex_lock(&vcpu->kvm->lock); | ||
| 529 | kvm_mmu_reset_context(vcpu); | 321 | kvm_mmu_reset_context(vcpu); |
| 530 | mutex_unlock(&vcpu->kvm->lock); | ||
| 531 | } | 322 | } |
| 532 | EXPORT_SYMBOL_GPL(set_cr4); | 323 | EXPORT_SYMBOL_GPL(set_cr4); |
| 533 | 324 | ||
| 534 | void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | 325 | void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
| 535 | { | 326 | { |
| 327 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { | ||
| 328 | kvm_mmu_flush_tlb(vcpu); | ||
| 329 | return; | ||
| 330 | } | ||
| 331 | |||
| 536 | if (is_long_mode(vcpu)) { | 332 | if (is_long_mode(vcpu)) { |
| 537 | if (cr3 & CR3_L_MODE_RESERVED_BITS) { | 333 | if (cr3 & CR3_L_MODE_RESERVED_BITS) { |
| 538 | printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); | 334 | printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); |
| 539 | inject_gp(vcpu); | 335 | kvm_inject_gp(vcpu, 0); |
| 540 | return; | 336 | return; |
| 541 | } | 337 | } |
| 542 | } else { | 338 | } else { |
| @@ -544,26 +340,23 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
| 544 | if (cr3 & CR3_PAE_RESERVED_BITS) { | 340 | if (cr3 & CR3_PAE_RESERVED_BITS) { |
| 545 | printk(KERN_DEBUG | 341 | printk(KERN_DEBUG |
| 546 | "set_cr3: #GP, reserved bits\n"); | 342 | "set_cr3: #GP, reserved bits\n"); |
| 547 | inject_gp(vcpu); | 343 | kvm_inject_gp(vcpu, 0); |
| 548 | return; | 344 | return; |
| 549 | } | 345 | } |
| 550 | if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { | 346 | if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { |
| 551 | printk(KERN_DEBUG "set_cr3: #GP, pdptrs " | 347 | printk(KERN_DEBUG "set_cr3: #GP, pdptrs " |
| 552 | "reserved bits\n"); | 348 | "reserved bits\n"); |
| 553 | inject_gp(vcpu); | 349 | kvm_inject_gp(vcpu, 0); |
| 554 | return; | ||
| 555 | } | ||
| 556 | } else { | ||
| 557 | if (cr3 & CR3_NONPAE_RESERVED_BITS) { | ||
| 558 | printk(KERN_DEBUG | ||
| 559 | "set_cr3: #GP, reserved bits\n"); | ||
| 560 | inject_gp(vcpu); | ||
| 561 | return; | 350 | return; |
| 562 | } | 351 | } |
| 563 | } | 352 | } |
| 353 | /* | ||
| 354 | * We don't check reserved bits in nonpae mode, because | ||
| 355 | * this isn't enforced, and VMware depends on this. | ||
| 356 | */ | ||
| 564 | } | 357 | } |
| 565 | 358 | ||
| 566 | mutex_lock(&vcpu->kvm->lock); | 359 | down_read(¤t->mm->mmap_sem); |
| 567 | /* | 360 | /* |
| 568 | * Does the new cr3 value map to physical memory? (Note, we | 361 | * Does the new cr3 value map to physical memory? (Note, we |
| 569 | * catch an invalid cr3 even in real-mode, because it would | 362 | * catch an invalid cr3 even in real-mode, because it would |
| @@ -574,12 +367,12 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
| 574 | * to debug) behavior on the guest side. | 367 | * to debug) behavior on the guest side. |
| 575 | */ | 368 | */ |
| 576 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) | 369 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) |
| 577 | inject_gp(vcpu); | 370 | kvm_inject_gp(vcpu, 0); |
| 578 | else { | 371 | else { |
| 579 | vcpu->cr3 = cr3; | 372 | vcpu->arch.cr3 = cr3; |
| 580 | vcpu->mmu.new_cr3(vcpu); | 373 | vcpu->arch.mmu.new_cr3(vcpu); |
| 581 | } | 374 | } |
| 582 | mutex_unlock(&vcpu->kvm->lock); | 375 | up_read(¤t->mm->mmap_sem); |
| 583 | } | 376 | } |
| 584 | EXPORT_SYMBOL_GPL(set_cr3); | 377 | EXPORT_SYMBOL_GPL(set_cr3); |
| 585 | 378 | ||
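The other recurring change in these hunks is the locking: guest memory is now backed by ordinary userspace mappings, so code that walks guest pages takes current->mm->mmap_sem for reading instead of the old kvm->lock. The convention, as a hypothetical helper rather than code from the patch:

static int read_guest_locked(struct kvm_vcpu *vcpu, gpa_t gpa,
                             void *data, unsigned long len)
{
        int r;

        down_read(&current->mm->mmap_sem);      /* may fault pages in */
        r = kvm_read_guest(vcpu->kvm, gpa, data, len);
        up_read(&current->mm->mmap_sem);
        return r;
}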
| @@ -587,13 +380,13 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | |||
| 587 | { | 380 | { |
| 588 | if (cr8 & CR8_RESERVED_BITS) { | 381 | if (cr8 & CR8_RESERVED_BITS) { |
| 589 | printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); | 382 | printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); |
| 590 | inject_gp(vcpu); | 383 | kvm_inject_gp(vcpu, 0); |
| 591 | return; | 384 | return; |
| 592 | } | 385 | } |
| 593 | if (irqchip_in_kernel(vcpu->kvm)) | 386 | if (irqchip_in_kernel(vcpu->kvm)) |
| 594 | kvm_lapic_set_tpr(vcpu, cr8); | 387 | kvm_lapic_set_tpr(vcpu, cr8); |
| 595 | else | 388 | else |
| 596 | vcpu->cr8 = cr8; | 389 | vcpu->arch.cr8 = cr8; |
| 597 | } | 390 | } |
| 598 | EXPORT_SYMBOL_GPL(set_cr8); | 391 | EXPORT_SYMBOL_GPL(set_cr8); |
| 599 | 392 | ||
| @@ -602,210 +395,846 @@ unsigned long get_cr8(struct kvm_vcpu *vcpu) | |||
| 602 | if (irqchip_in_kernel(vcpu->kvm)) | 395 | if (irqchip_in_kernel(vcpu->kvm)) |
| 603 | return kvm_lapic_get_cr8(vcpu); | 396 | return kvm_lapic_get_cr8(vcpu); |
| 604 | else | 397 | else |
| 605 | return vcpu->cr8; | 398 | return vcpu->arch.cr8; |
| 606 | } | 399 | } |
| 607 | EXPORT_SYMBOL_GPL(get_cr8); | 400 | EXPORT_SYMBOL_GPL(get_cr8); |
| 608 | 401 | ||
| 609 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) | 402 | /* |
| 403 | * List of msr numbers which we expose to userspace through KVM_GET_MSRS | ||
| 404 | * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. | ||
| 405 | * | ||
| 406 | * This list is modified at module load time to reflect the | ||
| 407 | * capabilities of the host cpu. | ||
| 408 | */ | ||
| 409 | static u32 msrs_to_save[] = { | ||
| 410 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
| 411 | MSR_K6_STAR, | ||
| 412 | #ifdef CONFIG_X86_64 | ||
| 413 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | ||
| 414 | #endif | ||
| 415 | MSR_IA32_TIME_STAMP_COUNTER, | ||
| 416 | }; | ||
| 417 | |||
| 418 | static unsigned num_msrs_to_save; | ||
| 419 | |||
| 420 | static u32 emulated_msrs[] = { | ||
| 421 | MSR_IA32_MISC_ENABLE, | ||
| 422 | }; | ||
| 423 | |||
| 424 | #ifdef CONFIG_X86_64 | ||
| 425 | |||
| 426 | static void set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
| 610 | { | 427 | { |
| 611 | if (irqchip_in_kernel(vcpu->kvm)) | 428 | if (efer & EFER_RESERVED_BITS) { |
| 612 | return vcpu->apic_base; | 429 | printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", |
| 613 | else | 430 | efer); |
| 614 | return vcpu->apic_base; | 431 | kvm_inject_gp(vcpu, 0); |
| 432 | return; | ||
| 433 | } | ||
| 434 | |||
| 435 | if (is_paging(vcpu) | ||
| 436 | && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { | ||
| 437 | printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); | ||
| 438 | kvm_inject_gp(vcpu, 0); | ||
| 439 | return; | ||
| 440 | } | ||
| 441 | |||
| 442 | kvm_x86_ops->set_efer(vcpu, efer); | ||
| 443 | |||
| 444 | efer &= ~EFER_LMA; | ||
| 445 | efer |= vcpu->arch.shadow_efer & EFER_LMA; | ||
| 446 | |||
| 447 | vcpu->arch.shadow_efer = efer; | ||
| 615 | } | 448 | } |
| 616 | EXPORT_SYMBOL_GPL(kvm_get_apic_base); | ||
| 617 | 449 | ||
| 618 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) | 450 | #endif |
| 451 | |||
| 452 | /* | ||
| 453 | * Writes msr value into the appropriate "register". ||
| 454 | * Returns 0 on success, non-0 otherwise. | ||
| 455 | * Assumes vcpu_load() was already called. | ||
| 456 | */ | ||
| 457 | int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
| 619 | { | 458 | { |
| 620 | /* TODO: reserve bits check */ | 459 | return kvm_x86_ops->set_msr(vcpu, msr_index, data); |
| 621 | if (irqchip_in_kernel(vcpu->kvm)) | ||
| 622 | kvm_lapic_set_base(vcpu, data); | ||
| 623 | else | ||
| 624 | vcpu->apic_base = data; | ||
| 625 | } | 460 | } |
| 626 | EXPORT_SYMBOL_GPL(kvm_set_apic_base); | ||
| 627 | 461 | ||
| 628 | void fx_init(struct kvm_vcpu *vcpu) | 462 | /* |
| 463 | * Adapt set_msr() to msr_io()'s calling convention | ||
| 464 | */ | ||
| 465 | static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) | ||
| 629 | { | 466 | { |
| 630 | unsigned after_mxcsr_mask; | 467 | return kvm_set_msr(vcpu, index, *data); |
| 468 | } | ||
| 631 | 469 | ||
| 632 | /* Initialize guest FPU by resetting ours and saving into guest's */ | ||
| 633 | preempt_disable(); | ||
| 634 | fx_save(&vcpu->host_fx_image); | ||
| 635 | fpu_init(); | ||
| 636 | fx_save(&vcpu->guest_fx_image); | ||
| 637 | fx_restore(&vcpu->host_fx_image); | ||
| 638 | preempt_enable(); | ||
| 639 | 470 | ||
| 640 | vcpu->cr0 |= X86_CR0_ET; | 471 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) |
| 641 | after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); | 472 | { |
| 642 | vcpu->guest_fx_image.mxcsr = 0x1f80; | 473 | switch (msr) { |
| 643 | memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask, | 474 | #ifdef CONFIG_X86_64 |
| 644 | 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); | 475 | case MSR_EFER: |
| 476 | set_efer(vcpu, data); | ||
| 477 | break; | ||
| 478 | #endif | ||
| 479 | case MSR_IA32_MC0_STATUS: | ||
| 480 | pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", | ||
| 481 | __FUNCTION__, data); | ||
| 482 | break; | ||
| 483 | case MSR_IA32_MCG_STATUS: | ||
| 484 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", | ||
| 485 | __FUNCTION__, data); | ||
| 486 | break; | ||
| 487 | case MSR_IA32_UCODE_REV: | ||
| 488 | case MSR_IA32_UCODE_WRITE: | ||
| 489 | case 0x200 ... 0x2ff: /* MTRRs */ | ||
| 490 | break; | ||
| 491 | case MSR_IA32_APICBASE: | ||
| 492 | kvm_set_apic_base(vcpu, data); | ||
| 493 | break; | ||
| 494 | case MSR_IA32_MISC_ENABLE: | ||
| 495 | vcpu->arch.ia32_misc_enable_msr = data; | ||
| 496 | break; | ||
| 497 | default: | ||
| 498 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); | ||
| 499 | return 1; | ||
| 500 | } | ||
| 501 | return 0; | ||
| 645 | } | 502 | } |
| 646 | EXPORT_SYMBOL_GPL(fx_init); | 503 | EXPORT_SYMBOL_GPL(kvm_set_msr_common); |
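kvm_set_msr_common() is exported so the vendor modules can fall back on it for MSRs they do not virtualize themselves. A sketch of the expected pattern on the VMX side (abridged; the real vmx_set_msr() handles more registers):

static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
        switch (msr_index) {
        case MSR_IA32_SYSENTER_CS:
                vmcs_write32(GUEST_SYSENTER_CS, data);
                return 0;
        default:
                /* anything VMX-agnostic is handled generically */
                return kvm_set_msr_common(vcpu, msr_index, data);
        }
}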
| 504 | |||
| 505 | |||
| 506 | /* | ||
| 507 | * Reads an msr value (of 'msr_index') into 'pdata'. | ||
| 508 | * Returns 0 on success, non-0 otherwise. | ||
| 509 | * Assumes vcpu_load() was already called. | ||
| 510 | */ | ||
| 511 | int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | ||
| 512 | { | ||
| 513 | return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); | ||
| 514 | } | ||
| 515 | |||
| 516 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | ||
| 517 | { | ||
| 518 | u64 data; | ||
| 519 | |||
| 520 | switch (msr) { | ||
| 521 | case 0xc0010010: /* SYSCFG */ | ||
| 522 | case 0xc0010015: /* HWCR */ | ||
| 523 | case MSR_IA32_PLATFORM_ID: | ||
| 524 | case MSR_IA32_P5_MC_ADDR: | ||
| 525 | case MSR_IA32_P5_MC_TYPE: | ||
| 526 | case MSR_IA32_MC0_CTL: | ||
| 527 | case MSR_IA32_MCG_STATUS: | ||
| 528 | case MSR_IA32_MCG_CAP: | ||
| 529 | case MSR_IA32_MC0_MISC: | ||
| 530 | case MSR_IA32_MC0_MISC+4: | ||
| 531 | case MSR_IA32_MC0_MISC+8: | ||
| 532 | case MSR_IA32_MC0_MISC+12: | ||
| 533 | case MSR_IA32_MC0_MISC+16: | ||
| 534 | case MSR_IA32_UCODE_REV: | ||
| 535 | case MSR_IA32_PERF_STATUS: | ||
| 536 | case MSR_IA32_EBL_CR_POWERON: | ||
| 537 | /* MTRR registers */ | ||
| 538 | case 0xfe: | ||
| 539 | case 0x200 ... 0x2ff: | ||
| 540 | data = 0; | ||
| 541 | break; | ||
| 542 | case 0xcd: /* fsb frequency */ | ||
| 543 | data = 3; | ||
| 544 | break; | ||
| 545 | case MSR_IA32_APICBASE: | ||
| 546 | data = kvm_get_apic_base(vcpu); | ||
| 547 | break; | ||
| 548 | case MSR_IA32_MISC_ENABLE: | ||
| 549 | data = vcpu->arch.ia32_misc_enable_msr; | ||
| 550 | break; | ||
| 551 | #ifdef CONFIG_X86_64 | ||
| 552 | case MSR_EFER: | ||
| 553 | data = vcpu->arch.shadow_efer; | ||
| 554 | break; | ||
| 555 | #endif | ||
| 556 | default: | ||
| 557 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); | ||
| 558 | return 1; | ||
| 559 | } | ||
| 560 | *pdata = data; | ||
| 561 | return 0; | ||
| 562 | } | ||
| 563 | EXPORT_SYMBOL_GPL(kvm_get_msr_common); | ||
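From userspace, this pair of handlers is reached through the KVM_GET_MSRS/KVM_SET_MSRS vcpu ioctls served by msr_io() below. A sketch of the caller's side, assuming an already-open vcpu fd; error handling is omitted:

struct {
        struct kvm_msrs      header;
        struct kvm_msr_entry entry;
} msr = {
        .header = { .nmsrs = 1 },
        .entry  = { .index = 0x1b },    /* MSR_IA32_APICBASE */
};

/* returns the number of MSRs processed, so 1 on success */
if (ioctl(vcpu_fd, KVM_GET_MSRS, &msr) == 1)
        printf("APIC base: 0x%llx\n", (unsigned long long)msr.entry.data);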
| 647 | 564 | ||
| 648 | /* | 565 | /* |
| 649 | * Allocate some memory and give it an address in the guest physical address | 566 | * Read or write a bunch of msrs. All parameters are kernel addresses. |
| 650 | * space. | ||
| 651 | * | 567 | * |
| 652 | * Discontiguous memory is allowed, mostly for framebuffers. | 568 | * @return number of msrs set successfully. |
| 653 | */ | 569 | */ |
| 654 | static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, | 570 | static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, |
| 655 | struct kvm_memory_region *mem) | 571 | struct kvm_msr_entry *entries, |
| 572 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
| 573 | unsigned index, u64 *data)) | ||
| 656 | { | 574 | { |
| 657 | int r; | 575 | int i; |
| 658 | gfn_t base_gfn; | ||
| 659 | unsigned long npages; | ||
| 660 | unsigned long i; | ||
| 661 | struct kvm_memory_slot *memslot; | ||
| 662 | struct kvm_memory_slot old, new; | ||
| 663 | 576 | ||
| 664 | r = -EINVAL; | 577 | vcpu_load(vcpu); |
| 665 | /* General sanity checks */ | 578 | |
| 666 | if (mem->memory_size & (PAGE_SIZE - 1)) | 579 | for (i = 0; i < msrs->nmsrs; ++i) |
| 667 | goto out; | 580 | if (do_msr(vcpu, entries[i].index, &entries[i].data)) |
| 668 | if (mem->guest_phys_addr & (PAGE_SIZE - 1)) | 581 | break; |
| 582 | |||
| 583 | vcpu_put(vcpu); | ||
| 584 | |||
| 585 | return i; | ||
| 586 | } | ||
| 587 | |||
| 588 | /* | ||
| 589 | * Read or write a bunch of msrs. Parameters are user addresses. | ||
| 590 | * | ||
| 591 | * @return number of msrs set successfully. | ||
| 592 | */ | ||
| 593 | static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, | ||
| 594 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
| 595 | unsigned index, u64 *data), | ||
| 596 | int writeback) | ||
| 597 | { | ||
| 598 | struct kvm_msrs msrs; | ||
| 599 | struct kvm_msr_entry *entries; | ||
| 600 | int r, n; | ||
| 601 | unsigned size; | ||
| 602 | |||
| 603 | r = -EFAULT; | ||
| 604 | if (copy_from_user(&msrs, user_msrs, sizeof msrs)) | ||
| 669 | goto out; | 605 | goto out; |
| 670 | if (mem->slot >= KVM_MEMORY_SLOTS) | 606 | |
| 607 | r = -E2BIG; | ||
| 608 | if (msrs.nmsrs >= MAX_IO_MSRS) | ||
| 671 | goto out; | 609 | goto out; |
| 672 | if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) | 610 | |
| 611 | r = -ENOMEM; | ||
| 612 | size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; | ||
| 613 | entries = vmalloc(size); | ||
| 614 | if (!entries) | ||
| 673 | goto out; | 615 | goto out; |
| 674 | 616 | ||
| 675 | memslot = &kvm->memslots[mem->slot]; | 617 | r = -EFAULT; |
| 676 | base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; | 618 | if (copy_from_user(entries, user_msrs->entries, size)) |
| 677 | npages = mem->memory_size >> PAGE_SHIFT; | 619 | goto out_free; |
| 678 | 620 | ||
| 679 | if (!npages) | 621 | r = n = __msr_io(vcpu, &msrs, entries, do_msr); |
| 680 | mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; | 622 | if (r < 0) |
| 623 | goto out_free; | ||
| 681 | 624 | ||
| 682 | mutex_lock(&kvm->lock); | 625 | r = -EFAULT; |
| 626 | if (writeback && copy_to_user(user_msrs->entries, entries, size)) | ||
| 627 | goto out_free; | ||
| 683 | 628 | ||
| 684 | new = old = *memslot; | 629 | r = n; |
| 685 | 630 | ||
| 686 | new.base_gfn = base_gfn; | 631 | out_free: |
| 687 | new.npages = npages; | 632 | vfree(entries); |
| 688 | new.flags = mem->flags; | 633 | out: |
| 634 | return r; | ||
| 635 | } | ||
| 689 | 636 | ||
| 690 | /* Disallow changing a memory slot's size. */ | 637 | /* |
| 691 | r = -EINVAL; | 638 | * Make sure that a cpu that is being hot-unplugged does not have any vcpus |
| 692 | if (npages && old.npages && npages != old.npages) | 639 | * cached on it. |
| 693 | goto out_unlock; | 640 | */ |
| 641 | void decache_vcpus_on_cpu(int cpu) | ||
| 642 | { | ||
| 643 | struct kvm *vm; | ||
| 644 | struct kvm_vcpu *vcpu; | ||
| 645 | int i; | ||
| 694 | 646 | ||
| 695 | /* Check for overlaps */ | 647 | spin_lock(&kvm_lock); |
| 696 | r = -EEXIST; | 648 | list_for_each_entry(vm, &vm_list, vm_list) |
| 697 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | 649 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { |
| 698 | struct kvm_memory_slot *s = &kvm->memslots[i]; | 650 | vcpu = vm->vcpus[i]; |
| 651 | if (!vcpu) | ||
| 652 | continue; | ||
| 653 | /* | ||
| 654 | * If the vcpu is locked, then it is running on some | ||
| 655 | * other cpu and therefore it is not cached on the | ||
| 656 | * cpu in question. | ||
| 657 | * | ||
| 658 | * If it's not locked, check the last cpu it executed | ||
| 659 | * on. | ||
| 660 | */ | ||
| 661 | if (mutex_trylock(&vcpu->mutex)) { | ||
| 662 | if (vcpu->cpu == cpu) { | ||
| 663 | kvm_x86_ops->vcpu_decache(vcpu); | ||
| 664 | vcpu->cpu = -1; | ||
| 665 | } | ||
| 666 | mutex_unlock(&vcpu->mutex); | ||
| 667 | } | ||
| 668 | } | ||
| 669 | spin_unlock(&kvm_lock); | ||
| 670 | } | ||
| 699 | 671 | ||
| 700 | if (s == memslot) | 672 | int kvm_dev_ioctl_check_extension(long ext) |
| 701 | continue; | 673 | { |
| 702 | if (!((base_gfn + npages <= s->base_gfn) || | 674 | int r; |
| 703 | (base_gfn >= s->base_gfn + s->npages))) | 675 | |
| 704 | goto out_unlock; | 676 | switch (ext) { |
| 677 | case KVM_CAP_IRQCHIP: | ||
| 678 | case KVM_CAP_HLT: | ||
| 679 | case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: | ||
| 680 | case KVM_CAP_USER_MEMORY: | ||
| 681 | case KVM_CAP_SET_TSS_ADDR: | ||
| 682 | case KVM_CAP_EXT_CPUID: | ||
| 683 | r = 1; | ||
| 684 | break; | ||
| 685 | case KVM_CAP_VAPIC: | ||
| 686 | r = !kvm_x86_ops->cpu_has_accelerated_tpr(); | ||
| 687 | break; | ||
| 688 | default: | ||
| 689 | r = 0; | ||
| 690 | break; | ||
| 705 | } | 691 | } |
| 692 | return r; | ||
| 706 | 693 | ||
| 707 | /* Deallocate if slot is being removed */ | 694 | } |
| 708 | if (!npages) | ||
| 709 | new.phys_mem = NULL; | ||
| 710 | 695 | ||
| 711 | /* Free page dirty bitmap if unneeded */ | 696 | long kvm_arch_dev_ioctl(struct file *filp, |
| 712 | if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) | 697 | unsigned int ioctl, unsigned long arg) |
| 713 | new.dirty_bitmap = NULL; | 698 | { |
| 699 | void __user *argp = (void __user *)arg; | ||
| 700 | long r; | ||
| 714 | 701 | ||
| 715 | r = -ENOMEM; | 702 | switch (ioctl) { |
| 703 | case KVM_GET_MSR_INDEX_LIST: { | ||
| 704 | struct kvm_msr_list __user *user_msr_list = argp; | ||
| 705 | struct kvm_msr_list msr_list; | ||
| 706 | unsigned n; | ||
| 716 | 707 | ||
| 717 | /* Allocate if a slot is being created */ | 708 | r = -EFAULT; |
| 718 | if (npages && !new.phys_mem) { | 709 | if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) |
| 719 | new.phys_mem = vmalloc(npages * sizeof(struct page *)); | 710 | goto out; |
| 711 | n = msr_list.nmsrs; | ||
| 712 | msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); | ||
| 713 | if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) | ||
| 714 | goto out; | ||
| 715 | r = -E2BIG; | ||
| 716 | if (n < num_msrs_to_save) | ||
| 717 | goto out; | ||
| 718 | r = -EFAULT; | ||
| 719 | if (copy_to_user(user_msr_list->indices, &msrs_to_save, | ||
| 720 | num_msrs_to_save * sizeof(u32))) | ||
| 721 | goto out; | ||
| 722 | if (copy_to_user(user_msr_list->indices | ||
| 723 | + num_msrs_to_save * sizeof(u32), | ||
| 724 | &emulated_msrs, | ||
| 725 | ARRAY_SIZE(emulated_msrs) * sizeof(u32))) | ||
| 726 | goto out; | ||
| 727 | r = 0; | ||
| 728 | break; | ||
| 729 | } | ||
| 730 | default: | ||
| 731 | r = -EINVAL; | ||
| 732 | } | ||
| 733 | out: | ||
| 734 | return r; | ||
| 735 | } | ||
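Because the handler writes the full count back into nmsrs before failing with E2BIG, userspace can size its buffer with two calls. A sketch of that convention (kvm_fd is an assumed open handle on /dev/kvm):

struct kvm_msr_list probe = { .nmsrs = 0 };
struct kvm_msr_list *list;

/* first call fails with errno == E2BIG but fills in probe.nmsrs */
ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);

list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
list->nmsrs = probe.nmsrs;
if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) == 0)
        ;       /* list->indices[0..nmsrs-1] holds the MSR numbers */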
| 736 | |||
| 737 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | ||
| 738 | { | ||
| 739 | kvm_x86_ops->vcpu_load(vcpu, cpu); | ||
| 740 | } | ||
| 720 | 741 | ||
| 721 | if (!new.phys_mem) | 742 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) |
| 722 | goto out_unlock; | 743 | { |
| 744 | kvm_x86_ops->vcpu_put(vcpu); | ||
| 745 | kvm_put_guest_fpu(vcpu); | ||
| 746 | } | ||
| 723 | 747 | ||
| 724 | memset(new.phys_mem, 0, npages * sizeof(struct page *)); | 748 | static int is_efer_nx(void) |
| 725 | for (i = 0; i < npages; ++i) { | 749 | { |
| 726 | new.phys_mem[i] = alloc_page(GFP_HIGHUSER | 750 | u64 efer; |
| 727 | | __GFP_ZERO); | 751 | |
| 728 | if (!new.phys_mem[i]) | 752 | rdmsrl(MSR_EFER, efer); |
| 729 | goto out_unlock; | 753 | return efer & EFER_NX; |
| 730 | set_page_private(new.phys_mem[i],0); | 754 | } |
| 731 | } | ||
| 732 | } | ||
| 733 | 755 | ||
| 734 | /* Allocate page dirty bitmap if needed */ | 756 | static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) |
| 735 | if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { | 757 | { |
| 736 | unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; | 758 | int i; |
| 759 | struct kvm_cpuid_entry2 *e, *entry; | ||
| 737 | 760 | ||
| 738 | new.dirty_bitmap = vmalloc(dirty_bytes); | 761 | entry = NULL; |
| 739 | if (!new.dirty_bitmap) | 762 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { |
| 740 | goto out_unlock; | 763 | e = &vcpu->arch.cpuid_entries[i]; |
| 741 | memset(new.dirty_bitmap, 0, dirty_bytes); | 764 | if (e->function == 0x80000001) { |
| 765 | entry = e; | ||
| 766 | break; | ||
| 767 | } | ||
| 742 | } | 768 | } |
| 769 | if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { | ||
| 770 | entry->edx &= ~(1 << 20); | ||
| 771 | printk(KERN_INFO "kvm: guest NX capability removed\n"); | ||
| 772 | } | ||
| 773 | } | ||
| 743 | 774 | ||
| 744 | if (mem->slot >= kvm->nmemslots) | 775 | /* when an old userspace process fills a new kernel module */ |
| 745 | kvm->nmemslots = mem->slot + 1; | 776 | static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, |
| 777 | struct kvm_cpuid *cpuid, | ||
| 778 | struct kvm_cpuid_entry __user *entries) | ||
| 779 | { | ||
| 780 | int r, i; | ||
| 781 | struct kvm_cpuid_entry *cpuid_entries; | ||
| 746 | 782 | ||
| 747 | *memslot = new; | 783 | r = -E2BIG; |
| 784 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
| 785 | goto out; | ||
| 786 | r = -ENOMEM; | ||
| 787 | cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); | ||
| 788 | if (!cpuid_entries) | ||
| 789 | goto out; | ||
| 790 | r = -EFAULT; | ||
| 791 | if (copy_from_user(cpuid_entries, entries, | ||
| 792 | cpuid->nent * sizeof(struct kvm_cpuid_entry))) | ||
| 793 | goto out_free; | ||
| 794 | for (i = 0; i < cpuid->nent; i++) { | ||
| 795 | vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; | ||
| 796 | vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; | ||
| 797 | vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; | ||
| 798 | vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; | ||
| 799 | vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; | ||
| 800 | vcpu->arch.cpuid_entries[i].index = 0; | ||
| 801 | vcpu->arch.cpuid_entries[i].flags = 0; | ||
| 802 | vcpu->arch.cpuid_entries[i].padding[0] = 0; | ||
| 803 | vcpu->arch.cpuid_entries[i].padding[1] = 0; | ||
| 804 | vcpu->arch.cpuid_entries[i].padding[2] = 0; | ||
| 805 | } | ||
| 806 | vcpu->arch.cpuid_nent = cpuid->nent; | ||
| 807 | cpuid_fix_nx_cap(vcpu); | ||
| 808 | r = 0; | ||
| 748 | 809 | ||
| 749 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | 810 | out_free: |
| 750 | kvm_flush_remote_tlbs(kvm); | 811 | vfree(cpuid_entries); |
| 812 | out: | ||
| 813 | return r; | ||
| 814 | } | ||
| 751 | 815 | ||
| 752 | mutex_unlock(&kvm->lock); | 816 | static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, |
| 817 | struct kvm_cpuid2 *cpuid, | ||
| 818 | struct kvm_cpuid_entry2 __user *entries) | ||
| 819 | { | ||
| 820 | int r; | ||
| 753 | 821 | ||
| 754 | kvm_free_physmem_slot(&old, &new); | 822 | r = -E2BIG; |
| 823 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
| 824 | goto out; | ||
| 825 | r = -EFAULT; | ||
| 826 | if (copy_from_user(&vcpu->arch.cpuid_entries, entries, | ||
| 827 | cpuid->nent * sizeof(struct kvm_cpuid_entry2))) | ||
| 828 | goto out; | ||
| 829 | vcpu->arch.cpuid_nent = cpuid->nent; | ||
| 755 | return 0; | 830 | return 0; |
| 756 | 831 | ||
| 757 | out_unlock: | ||
| 758 | mutex_unlock(&kvm->lock); | ||
| 759 | kvm_free_physmem_slot(&new, &old); | ||
| 760 | out: | 832 | out: |
| 761 | return r; | 833 | return r; |
| 762 | } | 834 | } |
| 763 | 835 | ||
| 764 | /* | 836 | static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, |
| 765 | * Get (and clear) the dirty memory log for a memory slot. | 837 | struct kvm_cpuid2 *cpuid, |
| 766 | */ | 838 | struct kvm_cpuid_entry2 __user *entries) |
| 767 | static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | ||
| 768 | struct kvm_dirty_log *log) | ||
| 769 | { | 839 | { |
| 770 | struct kvm_memory_slot *memslot; | 840 | int r; |
| 771 | int r, i; | ||
| 772 | int n; | ||
| 773 | unsigned long any = 0; | ||
| 774 | |||
| 775 | mutex_lock(&kvm->lock); | ||
| 776 | 841 | ||
| 777 | r = -EINVAL; | 842 | r = -E2BIG; |
| 778 | if (log->slot >= KVM_MEMORY_SLOTS) | 843 | if (cpuid->nent < vcpu->arch.cpuid_nent) |
| 779 | goto out; | 844 | goto out; |
| 780 | 845 | r = -EFAULT; | |
| 781 | memslot = &kvm->memslots[log->slot]; | 846 | if (copy_to_user(entries, &vcpu->arch.cpuid_entries, |
| 782 | r = -ENOENT; | 847 | vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) |
| 783 | if (!memslot->dirty_bitmap) | ||
| 784 | goto out; | 848 | goto out; |
| 849 | return 0; | ||
| 850 | |||
| 851 | out: | ||
| 852 | cpuid->nent = vcpu->arch.cpuid_nent; | ||
| 853 | return r; | ||
| 854 | } | ||
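Note that the -E2BIG path above writes the actual count back into cpuid->nent, so a caller can size its buffer exactly on a second attempt. A hedged userspace sketch; vcpu_fd and the starting size are assumptions:

        #include <errno.h>
        #include <stdlib.h>
        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        static struct kvm_cpuid2 *read_vcpu_cpuid(int vcpu_fd, __u32 nent)
        {
                struct kvm_cpuid2 *c;

                for (;;) {
                        c = calloc(1, sizeof(*c) +
                                      nent * sizeof(struct kvm_cpuid_entry2));
                        if (!c)
                                return NULL;
                        c->nent = nent;
                        if (ioctl(vcpu_fd, KVM_GET_CPUID2, c) == 0)
                                return c;
                        if (errno != E2BIG) {
                                free(c);
                                return NULL;
                        }
                        nent = c->nent;         /* kernel wrote back the real count */
                        free(c);
                }
        }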
| 855 | |||
| 856 | static inline u32 bit(int bitno) | ||
| 857 | { | ||
| 858 | return 1 << (bitno & 31); | ||
| 859 | } | ||
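A worked example of the helper, assuming the cpufeature.h encoding of the era, where X86_FEATURE_NX is word 1, bit 20, i.e. the value 1*32 + 20:

        /* X86_FEATURE_NX == 1*32 + 20 (word 1, bit 20), so: */
        u32 nx_mask = bit(X86_FEATURE_NX);      /* == 1 << (52 & 31) == 1 << 20 */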
| 860 | |||
| 861 | static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
| 862 | u32 index) | ||
| 863 | { | ||
| 864 | entry->function = function; | ||
| 865 | entry->index = index; | ||
| 866 | cpuid_count(entry->function, entry->index, | ||
| 867 | &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); | ||
| 868 | entry->flags = 0; | ||
| 869 | } | ||
| 870 | |||
| 871 | static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
| 872 | u32 index, int *nent, int maxnent) | ||
| 873 | { | ||
| 874 | const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | | ||
| 875 | bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | | ||
| 876 | bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | | ||
| 877 | bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | | ||
| 878 | bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | | ||
| 879 | bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | | ||
| 880 | bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | | ||
| 881 | bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | | ||
| 882 | bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | | ||
| 883 | bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); | ||
| 884 | const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | | ||
| 885 | bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | | ||
| 886 | bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | | ||
| 887 | bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | | ||
| 888 | bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | | ||
| 889 | bit(X86_FEATURE_PGE) | | ||
| 890 | bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | | ||
| 891 | bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | | ||
| 892 | bit(X86_FEATURE_SYSCALL) | | ||
| 893 | (bit(X86_FEATURE_NX) && is_efer_nx()) | | ||
| 894 | #ifdef CONFIG_X86_64 | ||
| 895 | bit(X86_FEATURE_LM) | | ||
| 896 | #endif | ||
| 897 | bit(X86_FEATURE_MMXEXT) | | ||
| 898 | bit(X86_FEATURE_3DNOWEXT) | | ||
| 899 | bit(X86_FEATURE_3DNOW); | ||
| 900 | const u32 kvm_supported_word3_x86_features = | ||
| 901 | bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); | ||
| 902 | const u32 kvm_supported_word6_x86_features = | ||
| 903 | bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); | ||
| 904 | |||
| 905 | /* all func 2 cpuid_count() calls should be made on the same cpu */ | ||
| 906 | get_cpu(); | ||
| 907 | do_cpuid_1_ent(entry, function, index); | ||
| 908 | ++*nent; | ||
| 909 | |||
| 910 | switch (function) { | ||
| 911 | case 0: | ||
| 912 | entry->eax = min(entry->eax, (u32)0xb); | ||
| 913 | break; | ||
| 914 | case 1: | ||
| 915 | entry->edx &= kvm_supported_word0_x86_features; | ||
| 916 | entry->ecx &= kvm_supported_word3_x86_features; | ||
| 917 | break; | ||
| 918 | /* function 2 entries are STATEFUL. That is, repeated cpuid commands | ||
| 919 | * may return different values. This forces us to get_cpu() before | ||
| 920 | * issuing the first command, and also to emulate this annoying behavior | ||
| 921 | * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ | ||
| 922 | case 2: { | ||
| 923 | int t, times = entry->eax & 0xff; | ||
| 924 | |||
| 925 | entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | ||
| 926 | for (t = 1; t < times && *nent < maxnent; ++t) { | ||
| 927 | do_cpuid_1_ent(&entry[t], function, 0); | ||
| 928 | entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | ||
| 929 | ++*nent; | ||
| 930 | } | ||
| 931 | break; | ||
| 932 | } | ||
| 933 | /* functions 4 and 0xb have an additional index. */ | ||
| 934 | case 4: { | ||
| 935 | int index, cache_type; | ||
| 936 | |||
| 937 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
| 938 | /* read more entries until cache_type is zero */ | ||
| 939 | for (index = 1; *nent < maxnent; ++index) { | ||
| 940 | cache_type = entry[index - 1].eax & 0x1f; | ||
| 941 | if (!cache_type) | ||
| 942 | break; | ||
| 943 | do_cpuid_1_ent(&entry[index], function, index); | ||
| 944 | entry[index].flags |= | ||
| 945 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
| 946 | ++*nent; | ||
| 947 | } | ||
| 948 | break; | ||
| 949 | } | ||
| 950 | case 0xb: { | ||
| 951 | int index, level_type; | ||
| 952 | |||
| 953 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
| 954 | /* read more entries until level_type is zero */ | ||
| 955 | for (index = 1; *nent < maxnent; ++index) { | ||
| 956 | level_type = entry[index - 1].ecx & 0xff; | ||
| 957 | if (!level_type) | ||
| 958 | break; | ||
| 959 | do_cpuid_1_ent(&entry[index], function, index); | ||
| 960 | entry[index].flags |= | ||
| 961 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
| 962 | ++*nent; | ||
| 963 | } | ||
| 964 | break; | ||
| 965 | } | ||
| 966 | case 0x80000000: | ||
| 967 | entry->eax = min(entry->eax, 0x8000001a); | ||
| 968 | break; | ||
| 969 | case 0x80000001: | ||
| 970 | entry->edx &= kvm_supported_word1_x86_features; | ||
| 971 | entry->ecx &= kvm_supported_word6_x86_features; | ||
| 972 | break; | ||
| 973 | } | ||
| 974 | put_cpu(); | ||
| 975 | } | ||
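The leaf-2 handling above mirrors how the real hardware protocol works: the low byte of eax from the first invocation says how many times CPUID.2 must be executed. A host-side sketch of that consumption loop, using the same cpuid_count() helper (illustration only):

        static void walk_leaf2(void)
        {
                unsigned int eax, ebx, ecx, edx, times, t;

                cpuid_count(2, 0, &eax, &ebx, &ecx, &edx);
                times = eax & 0xff;     /* al = required invocation count */
                for (t = 1; t < times; t++)
                        cpuid_count(2, 0, &eax, &ebx, &ecx, &edx);
        }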
| 785 | 976 | ||
| 786 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; | 977 | static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm, |
| 978 | struct kvm_cpuid2 *cpuid, | ||
| 979 | struct kvm_cpuid_entry2 __user *entries) | ||
| 980 | { | ||
| 981 | struct kvm_cpuid_entry2 *cpuid_entries; | ||
| 982 | int limit, nent = 0, r = -E2BIG; | ||
| 983 | u32 func; | ||
| 787 | 984 | ||
| 788 | for (i = 0; !any && i < n/sizeof(long); ++i) | 985 | if (cpuid->nent < 1) |
| 789 | any = memslot->dirty_bitmap[i]; | 986 | goto out; |
| 987 | r = -ENOMEM; | ||
| 988 | cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); | ||
| 989 | if (!cpuid_entries) | ||
| 990 | goto out; | ||
| 790 | 991 | ||
| 992 | do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); | ||
| 993 | limit = cpuid_entries[0].eax; | ||
| 994 | for (func = 1; func <= limit && nent < cpuid->nent; ++func) | ||
| 995 | do_cpuid_ent(&cpuid_entries[nent], func, 0, | ||
| 996 | &nent, cpuid->nent); | ||
| 997 | r = -E2BIG; | ||
| 998 | if (nent >= cpuid->nent) | ||
| 999 | goto out_free; | ||
| 1000 | |||
| 1001 | do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); | ||
| 1002 | limit = cpuid_entries[nent - 1].eax; | ||
| 1003 | for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) | ||
| 1004 | do_cpuid_ent(&cpuid_entries[nent], func, 0, | ||
| 1005 | &nent, cpuid->nent); | ||
| 791 | r = -EFAULT; | 1006 | r = -EFAULT; |
| 792 | if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) | 1007 | if (copy_to_user(entries, cpuid_entries, |
| 793 | goto out; | 1008 | nent * sizeof(struct kvm_cpuid_entry2))) |
| 1009 | goto out_free; | ||
| 1010 | cpuid->nent = nent; | ||
| 1011 | r = 0; | ||
| 794 | 1012 | ||
| 795 | /* If nothing is dirty, don't bother messing with page tables. */ | 1013 | out_free: |
| 796 | if (any) { | 1014 | vfree(cpuid_entries); |
| 797 | kvm_mmu_slot_remove_write_access(kvm, log->slot); | 1015 | out: |
| 798 | kvm_flush_remote_tlbs(kvm); | 1016 | return r; |
| 799 | memset(memslot->dirty_bitmap, 0, n); | 1017 | } |
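Unlike KVM_GET_CPUID2, the -E2BIG exit here does not report the required size, so userspace conventionally grows the buffer until the call succeeds. A hedged sketch; note that at this point in the code's history the ioctl lives on the VM fd (later kernels moved it to /dev/kvm):

        #include <errno.h>
        #include <stdlib.h>
        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        static struct kvm_cpuid2 *get_supported_cpuid(int vm_fd)
        {
                struct kvm_cpuid2 *c;
                int nent = 8;

                for (;;) {
                        c = calloc(1, sizeof(*c) +
                                      nent * sizeof(struct kvm_cpuid_entry2));
                        if (!c)
                                return NULL;
                        c->nent = nent;
                        if (ioctl(vm_fd, KVM_GET_SUPPORTED_CPUID, c) == 0)
                                return c;       /* c->nent holds the real count */
                        if (errno != E2BIG) {
                                free(c);
                                return NULL;
                        }
                        free(c);
                        nent *= 2;      /* size not reported; grow and retry */
                }
        }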
| 1018 | |||
| 1019 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, | ||
| 1020 | struct kvm_lapic_state *s) | ||
| 1021 | { | ||
| 1022 | vcpu_load(vcpu); | ||
| 1023 | memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); | ||
| 1024 | vcpu_put(vcpu); | ||
| 1025 | |||
| 1026 | return 0; | ||
| 1027 | } | ||
| 1028 | |||
| 1029 | static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, | ||
| 1030 | struct kvm_lapic_state *s) | ||
| 1031 | { | ||
| 1032 | vcpu_load(vcpu); | ||
| 1033 | memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); | ||
| 1034 | kvm_apic_post_state_restore(vcpu); | ||
| 1035 | vcpu_put(vcpu); | ||
| 1036 | |||
| 1037 | return 0; | ||
| 1038 | } | ||
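These two handlers are the save/restore pair for the in-kernel local APIC; the whole 1 KB register page moves in one memcpy. A userspace sketch (vcpu_fd assumed):

        static void snapshot_lapic(int vcpu_fd)
        {
                struct kvm_lapic_state st;

                if (ioctl(vcpu_fd, KVM_GET_LAPIC, &st) != 0)
                        return;
                /* ... persist or migrate st.regs, then later: */
                ioctl(vcpu_fd, KVM_SET_LAPIC, &st);
        }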
| 1039 | |||
| 1040 | static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | ||
| 1041 | struct kvm_interrupt *irq) | ||
| 1042 | { | ||
| 1043 | if (irq->irq < 0 || irq->irq >= 256) | ||
| 1044 | return -EINVAL; | ||
| 1045 | if (irqchip_in_kernel(vcpu->kvm)) | ||
| 1046 | return -ENXIO; | ||
| 1047 | vcpu_load(vcpu); | ||
| 1048 | |||
| 1049 | set_bit(irq->irq, vcpu->arch.irq_pending); | ||
| 1050 | set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); | ||
| 1051 | |||
| 1052 | vcpu_put(vcpu); | ||
| 1053 | |||
| 1054 | return 0; | ||
| 1055 | } | ||
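KVM_INTERRUPT is only valid when the interrupt controllers are emulated in userspace; with an in-kernel irqchip the handler above fails with -ENXIO. A sketch of injecting a vector (vcpu_fd assumed):

        static int inject_vector(int vcpu_fd, unsigned int vec)
        {
                struct kvm_interrupt irq = { .irq = vec };

                /* returns -1 with errno ENXIO if the kernel owns the PIC/IOAPIC */
                return ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
        }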
| 1056 | |||
| 1057 | static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, | ||
| 1058 | struct kvm_tpr_access_ctl *tac) | ||
| 1059 | { | ||
| 1060 | if (tac->flags) | ||
| 1061 | return -EINVAL; | ||
| 1062 | vcpu->arch.tpr_access_reporting = !!tac->enabled; | ||
| 1063 | return 0; | ||
| 1064 | } | ||
| 1065 | |||
| 1066 | long kvm_arch_vcpu_ioctl(struct file *filp, | ||
| 1067 | unsigned int ioctl, unsigned long arg) | ||
| 1068 | { | ||
| 1069 | struct kvm_vcpu *vcpu = filp->private_data; | ||
| 1070 | void __user *argp = (void __user *)arg; | ||
| 1071 | int r; | ||
| 1072 | |||
| 1073 | switch (ioctl) { | ||
| 1074 | case KVM_GET_LAPIC: { | ||
| 1075 | struct kvm_lapic_state lapic; | ||
| 1076 | |||
| 1077 | memset(&lapic, 0, sizeof lapic); | ||
| 1078 | r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); | ||
| 1079 | if (r) | ||
| 1080 | goto out; | ||
| 1081 | r = -EFAULT; | ||
| 1082 | if (copy_to_user(argp, &lapic, sizeof lapic)) | ||
| 1083 | goto out; | ||
| 1084 | r = 0; | ||
| 1085 | break; | ||
| 800 | } | 1086 | } |
| 1087 | case KVM_SET_LAPIC: { | ||
| 1088 | struct kvm_lapic_state lapic; | ||
| 801 | 1089 | ||
| 802 | r = 0; | 1090 | r = -EFAULT; |
| 1091 | if (copy_from_user(&lapic, argp, sizeof lapic)) | ||
| 1092 | goto out; | ||
| 1093 | r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic); | ||
| 1094 | if (r) | ||
| 1095 | goto out; | ||
| 1096 | r = 0; | ||
| 1097 | break; | ||
| 1098 | } | ||
| 1099 | case KVM_INTERRUPT: { | ||
| 1100 | struct kvm_interrupt irq; | ||
| 1101 | |||
| 1102 | r = -EFAULT; | ||
| 1103 | if (copy_from_user(&irq, argp, sizeof irq)) | ||
| 1104 | goto out; | ||
| 1105 | r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); | ||
| 1106 | if (r) | ||
| 1107 | goto out; | ||
| 1108 | r = 0; | ||
| 1109 | break; | ||
| 1110 | } | ||
| 1111 | case KVM_SET_CPUID: { | ||
| 1112 | struct kvm_cpuid __user *cpuid_arg = argp; | ||
| 1113 | struct kvm_cpuid cpuid; | ||
| 803 | 1114 | ||
| 1115 | r = -EFAULT; | ||
| 1116 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
| 1117 | goto out; | ||
| 1118 | r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); | ||
| 1119 | if (r) | ||
| 1120 | goto out; | ||
| 1121 | break; | ||
| 1122 | } | ||
| 1123 | case KVM_SET_CPUID2: { | ||
| 1124 | struct kvm_cpuid2 __user *cpuid_arg = argp; | ||
| 1125 | struct kvm_cpuid2 cpuid; | ||
| 1126 | |||
| 1127 | r = -EFAULT; | ||
| 1128 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
| 1129 | goto out; | ||
| 1130 | r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, | ||
| 1131 | cpuid_arg->entries); | ||
| 1132 | if (r) | ||
| 1133 | goto out; | ||
| 1134 | break; | ||
| 1135 | } | ||
| 1136 | case KVM_GET_CPUID2: { | ||
| 1137 | struct kvm_cpuid2 __user *cpuid_arg = argp; | ||
| 1138 | struct kvm_cpuid2 cpuid; | ||
| 1139 | |||
| 1140 | r = -EFAULT; | ||
| 1141 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
| 1142 | goto out; | ||
| 1143 | r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, | ||
| 1144 | cpuid_arg->entries); | ||
| 1145 | if (r) | ||
| 1146 | goto out; | ||
| 1147 | r = -EFAULT; | ||
| 1148 | if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) | ||
| 1149 | goto out; | ||
| 1150 | r = 0; | ||
| 1151 | break; | ||
| 1152 | } | ||
| 1153 | case KVM_GET_MSRS: | ||
| 1154 | r = msr_io(vcpu, argp, kvm_get_msr, 1); | ||
| 1155 | break; | ||
| 1156 | case KVM_SET_MSRS: | ||
| 1157 | r = msr_io(vcpu, argp, do_set_msr, 0); | ||
| 1158 | break; | ||
| 1159 | case KVM_TPR_ACCESS_REPORTING: { | ||
| 1160 | struct kvm_tpr_access_ctl tac; | ||
| 1161 | |||
| 1162 | r = -EFAULT; | ||
| 1163 | if (copy_from_user(&tac, argp, sizeof tac)) | ||
| 1164 | goto out; | ||
| 1165 | r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); | ||
| 1166 | if (r) | ||
| 1167 | goto out; | ||
| 1168 | r = -EFAULT; | ||
| 1169 | if (copy_to_user(argp, &tac, sizeof tac)) | ||
| 1170 | goto out; | ||
| 1171 | r = 0; | ||
| 1172 | break; | ||
| 1173 | } | ||
| 1174 | case KVM_SET_VAPIC_ADDR: { | ||
| 1175 | struct kvm_vapic_addr va; | ||
| 1176 | |||
| 1177 | r = -EINVAL; | ||
| 1178 | if (!irqchip_in_kernel(vcpu->kvm)) | ||
| 1179 | goto out; | ||
| 1180 | r = -EFAULT; | ||
| 1181 | if (copy_from_user(&va, argp, sizeof va)) | ||
| 1182 | goto out; | ||
| 1183 | r = 0; | ||
| 1184 | kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); | ||
| 1185 | break; | ||
| 1186 | } | ||
| 1187 | default: | ||
| 1188 | r = -EINVAL; | ||
| 1189 | } | ||
| 804 | out: | 1190 | out: |
| 805 | mutex_unlock(&kvm->lock); | ||
| 806 | return r; | 1191 | return r; |
| 807 | } | 1192 | } |
| 808 | 1193 | ||
| 1194 | static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) | ||
| 1195 | { | ||
| 1196 | int ret; | ||
| 1197 | |||
| 1198 | if (addr > (unsigned int)(-3 * PAGE_SIZE)) | ||
| 1199 | return -1; | ||
| 1200 | ret = kvm_x86_ops->set_tss_addr(kvm, addr); | ||
| 1201 | return ret; | ||
| 1202 | } | ||
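The bound above reads more easily as a worked value: with 4 KiB pages,

        (unsigned int)(-3 * PAGE_SIZE) == (u32)-12288 == 0xffffd000

so any address above 0xffffd000 is rejected, guaranteeing the three pages KVM places at this address (the real-mode TSS) still fit below the 4 GiB boundary.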
| 1203 | |||
| 1204 | static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, | ||
| 1205 | u32 kvm_nr_mmu_pages) | ||
| 1206 | { | ||
| 1207 | if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) | ||
| 1208 | return -EINVAL; | ||
| 1209 | |||
| 1210 | down_write(¤t->mm->mmap_sem); | ||
| 1211 | |||
| 1212 | kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); | ||
| 1213 | kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; | ||
| 1214 | |||
| 1215 | up_write(¤t->mm->mmap_sem); | ||
| 1216 | return 0; | ||
| 1217 | } | ||
| 1218 | |||
| 1219 | static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) | ||
| 1220 | { | ||
| 1221 | return kvm->arch.n_alloc_mmu_pages; | ||
| 1222 | } | ||
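Both ioctls pass the page count in the argument word itself rather than through a pointer, and the GET variant returns the count as the ioctl return value. A sketch (vm_fd assumed):

        static long tune_mmu_pages(int vm_fd)
        {
                ioctl(vm_fd, KVM_SET_NR_MMU_PAGES, 1024);
                return ioctl(vm_fd, KVM_GET_NR_MMU_PAGES, 0);
        }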
| 1223 | |||
| 1224 | gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) | ||
| 1225 | { | ||
| 1226 | int i; | ||
| 1227 | struct kvm_mem_alias *alias; | ||
| 1228 | |||
| 1229 | for (i = 0; i < kvm->arch.naliases; ++i) { | ||
| 1230 | alias = &kvm->arch.aliases[i]; | ||
| 1231 | if (gfn >= alias->base_gfn | ||
| 1232 | && gfn < alias->base_gfn + alias->npages) | ||
| 1233 | return alias->target_gfn + gfn - alias->base_gfn; | ||
| 1234 | } | ||
| 1235 | return gfn; | ||
| 1236 | } | ||
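A worked example of the translation, with values assumed in the spirit of the PC VGA window described just below:

        /* alias slot: base_gfn   = 0xa0000    >> PAGE_SHIFT = 0xa0
         *             npages     = 0x20000    >> PAGE_SHIFT = 0x20
         *             target_gfn = 0xf0000000 >> PAGE_SHIFT = 0xf0000
         * unalias_gfn(kvm, 0xa5) == 0xf0000 + (0xa5 - 0xa0) == 0xf0005;
         * any gfn outside [0xa0, 0xc0) falls through unchanged.
         */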
| 1237 | |||
| 809 | /* | 1238 | /* |
| 810 | * Set a new alias region. Aliases map a portion of physical memory into | 1239 | * Set a new alias region. Aliases map a portion of physical memory into |
| 811 | * another portion. This is useful for memory windows, for example the PC | 1240 | * another portion. This is useful for memory windows, for example the PC |
| @@ -832,21 +1261,21 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, | |||
| 832 | < alias->target_phys_addr) | 1261 | < alias->target_phys_addr) |
| 833 | goto out; | 1262 | goto out; |
| 834 | 1263 | ||
| 835 | mutex_lock(&kvm->lock); | 1264 | down_write(¤t->mm->mmap_sem); |
| 836 | 1265 | ||
| 837 | p = &kvm->aliases[alias->slot]; | 1266 | p = &kvm->arch.aliases[alias->slot]; |
| 838 | p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; | 1267 | p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; |
| 839 | p->npages = alias->memory_size >> PAGE_SHIFT; | 1268 | p->npages = alias->memory_size >> PAGE_SHIFT; |
| 840 | p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; | 1269 | p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; |
| 841 | 1270 | ||
| 842 | for (n = KVM_ALIAS_SLOTS; n > 0; --n) | 1271 | for (n = KVM_ALIAS_SLOTS; n > 0; --n) |
| 843 | if (kvm->aliases[n - 1].npages) | 1272 | if (kvm->arch.aliases[n - 1].npages) |
| 844 | break; | 1273 | break; |
| 845 | kvm->naliases = n; | 1274 | kvm->arch.naliases = n; |
| 846 | 1275 | ||
| 847 | kvm_mmu_zap_all(kvm); | 1276 | kvm_mmu_zap_all(kvm); |
| 848 | 1277 | ||
| 849 | mutex_unlock(&kvm->lock); | 1278 | up_write(¤t->mm->mmap_sem); |
| 850 | 1279 | ||
| 851 | return 0; | 1280 | return 0; |
| 852 | 1281 | ||
| @@ -861,17 +1290,17 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | |||
| 861 | r = 0; | 1290 | r = 0; |
| 862 | switch (chip->chip_id) { | 1291 | switch (chip->chip_id) { |
| 863 | case KVM_IRQCHIP_PIC_MASTER: | 1292 | case KVM_IRQCHIP_PIC_MASTER: |
| 864 | memcpy (&chip->chip.pic, | 1293 | memcpy(&chip->chip.pic, |
| 865 | &pic_irqchip(kvm)->pics[0], | 1294 | &pic_irqchip(kvm)->pics[0], |
| 866 | sizeof(struct kvm_pic_state)); | 1295 | sizeof(struct kvm_pic_state)); |
| 867 | break; | 1296 | break; |
| 868 | case KVM_IRQCHIP_PIC_SLAVE: | 1297 | case KVM_IRQCHIP_PIC_SLAVE: |
| 869 | memcpy (&chip->chip.pic, | 1298 | memcpy(&chip->chip.pic, |
| 870 | &pic_irqchip(kvm)->pics[1], | 1299 | &pic_irqchip(kvm)->pics[1], |
| 871 | sizeof(struct kvm_pic_state)); | 1300 | sizeof(struct kvm_pic_state)); |
| 872 | break; | 1301 | break; |
| 873 | case KVM_IRQCHIP_IOAPIC: | 1302 | case KVM_IRQCHIP_IOAPIC: |
| 874 | memcpy (&chip->chip.ioapic, | 1303 | memcpy(&chip->chip.ioapic, |
| 875 | ioapic_irqchip(kvm), | 1304 | ioapic_irqchip(kvm), |
| 876 | sizeof(struct kvm_ioapic_state)); | 1305 | sizeof(struct kvm_ioapic_state)); |
| 877 | break; | 1306 | break; |
| @@ -889,17 +1318,17 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | |||
| 889 | r = 0; | 1318 | r = 0; |
| 890 | switch (chip->chip_id) { | 1319 | switch (chip->chip_id) { |
| 891 | case KVM_IRQCHIP_PIC_MASTER: | 1320 | case KVM_IRQCHIP_PIC_MASTER: |
| 892 | memcpy (&pic_irqchip(kvm)->pics[0], | 1321 | memcpy(&pic_irqchip(kvm)->pics[0], |
| 893 | &chip->chip.pic, | 1322 | &chip->chip.pic, |
| 894 | sizeof(struct kvm_pic_state)); | 1323 | sizeof(struct kvm_pic_state)); |
| 895 | break; | 1324 | break; |
| 896 | case KVM_IRQCHIP_PIC_SLAVE: | 1325 | case KVM_IRQCHIP_PIC_SLAVE: |
| 897 | memcpy (&pic_irqchip(kvm)->pics[1], | 1326 | memcpy(&pic_irqchip(kvm)->pics[1], |
| 898 | &chip->chip.pic, | 1327 | &chip->chip.pic, |
| 899 | sizeof(struct kvm_pic_state)); | 1328 | sizeof(struct kvm_pic_state)); |
| 900 | break; | 1329 | break; |
| 901 | case KVM_IRQCHIP_IOAPIC: | 1330 | case KVM_IRQCHIP_IOAPIC: |
| 902 | memcpy (ioapic_irqchip(kvm), | 1331 | memcpy(ioapic_irqchip(kvm), |
| 903 | &chip->chip.ioapic, | 1332 | &chip->chip.ioapic, |
| 904 | sizeof(struct kvm_ioapic_state)); | 1333 | sizeof(struct kvm_ioapic_state)); |
| 905 | break; | 1334 | break; |
| @@ -911,110 +1340,191 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | |||
| 911 | return r; | 1340 | return r; |
| 912 | } | 1341 | } |
| 913 | 1342 | ||
| 914 | static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) | 1343 | /* |
| 1344 | * Get (and clear) the dirty memory log for a memory slot. | ||
| 1345 | */ | ||
| 1346 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | ||
| 1347 | struct kvm_dirty_log *log) | ||
| 915 | { | 1348 | { |
| 916 | int i; | 1349 | int r; |
| 917 | struct kvm_mem_alias *alias; | 1350 | int n; |
| 918 | 1351 | struct kvm_memory_slot *memslot; | |
| 919 | for (i = 0; i < kvm->naliases; ++i) { | 1352 | int is_dirty = 0; |
| 920 | alias = &kvm->aliases[i]; | ||
| 921 | if (gfn >= alias->base_gfn | ||
| 922 | && gfn < alias->base_gfn + alias->npages) | ||
| 923 | return alias->target_gfn + gfn - alias->base_gfn; | ||
| 924 | } | ||
| 925 | return gfn; | ||
| 926 | } | ||
| 927 | 1353 | ||
| 928 | static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | 1354 | down_write(¤t->mm->mmap_sem); |
| 929 | { | ||
| 930 | int i; | ||
| 931 | 1355 | ||
| 932 | for (i = 0; i < kvm->nmemslots; ++i) { | 1356 | r = kvm_get_dirty_log(kvm, log, &is_dirty); |
| 933 | struct kvm_memory_slot *memslot = &kvm->memslots[i]; | 1357 | if (r) |
| 1358 | goto out; | ||
| 934 | 1359 | ||
| 935 | if (gfn >= memslot->base_gfn | 1360 | /* If nothing is dirty, don't bother messing with page tables. */ |
| 936 | && gfn < memslot->base_gfn + memslot->npages) | 1361 | if (is_dirty) { |
| 937 | return memslot; | 1362 | kvm_mmu_slot_remove_write_access(kvm, log->slot); |
| 1363 | kvm_flush_remote_tlbs(kvm); | ||
| 1364 | memslot = &kvm->memslots[log->slot]; | ||
| 1365 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; | ||
| 1366 | memset(memslot->dirty_bitmap, 0, n); | ||
| 938 | } | 1367 | } |
| 939 | return NULL; | 1368 | r = 0; |
| 940 | } | 1369 | out: |
| 941 | 1370 | up_write(¤t->mm->mmap_sem); | |
| 942 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | 1371 | return r; |
| 943 | { | ||
| 944 | gfn = unalias_gfn(kvm, gfn); | ||
| 945 | return __gfn_to_memslot(kvm, gfn); | ||
| 946 | } | ||
| 947 | |||
| 948 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) | ||
| 949 | { | ||
| 950 | struct kvm_memory_slot *slot; | ||
| 951 | |||
| 952 | gfn = unalias_gfn(kvm, gfn); | ||
| 953 | slot = __gfn_to_memslot(kvm, gfn); | ||
| 954 | if (!slot) | ||
| 955 | return NULL; | ||
| 956 | return slot->phys_mem[gfn - slot->base_gfn]; | ||
| 957 | } | 1372 | } |
| 958 | EXPORT_SYMBOL_GPL(gfn_to_page); | ||
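On the userspace side, the bitmap that comes back from KVM_GET_DIRTY_LOG is one bit per page of the slot, set if the page was written since the previous call. A consumer sketch; vm_fd, the slot number and page count are assumptions, and 64-bit longs are assumed for the bitmap math:

        #include <stdlib.h>
        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        static void scan_dirty(int vm_fd, int slot, unsigned long npages)
        {
                unsigned long nlongs = (npages + 63) / 64;
                unsigned long *bm = calloc(nlongs, sizeof(*bm));
                struct kvm_dirty_log log = { .slot = slot, .dirty_bitmap = bm };
                unsigned long i;

                if (!bm || ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) != 0)
                        goto out;
                for (i = 0; i < npages; i++) {
                        if (!(bm[i / 64] & (1UL << (i % 64))))
                                continue;
                        /* page i of the slot was written: e.g. retransmit it */
                }
        out:
                free(bm);
        }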
| 959 | 1373 | ||
| 960 | /* WARNING: Does not work on aliased pages. */ | 1374 | long kvm_arch_vm_ioctl(struct file *filp, |
| 961 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn) | 1375 | unsigned int ioctl, unsigned long arg) |
| 962 | { | 1376 | { |
| 963 | struct kvm_memory_slot *memslot; | 1377 | struct kvm *kvm = filp->private_data; |
| 1378 | void __user *argp = (void __user *)arg; | ||
| 1379 | int r = -EINVAL; | ||
| 964 | 1380 | ||
| 965 | memslot = __gfn_to_memslot(kvm, gfn); | 1381 | switch (ioctl) { |
| 966 | if (memslot && memslot->dirty_bitmap) { | 1382 | case KVM_SET_TSS_ADDR: |
| 967 | unsigned long rel_gfn = gfn - memslot->base_gfn; | 1383 | r = kvm_vm_ioctl_set_tss_addr(kvm, arg); |
| 1384 | if (r < 0) | ||
| 1385 | goto out; | ||
| 1386 | break; | ||
| 1387 | case KVM_SET_MEMORY_REGION: { | ||
| 1388 | struct kvm_memory_region kvm_mem; | ||
| 1389 | struct kvm_userspace_memory_region kvm_userspace_mem; | ||
| 968 | 1390 | ||
| 969 | /* avoid RMW */ | 1391 | r = -EFAULT; |
| 970 | if (!test_bit(rel_gfn, memslot->dirty_bitmap)) | 1392 | if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) |
| 971 | set_bit(rel_gfn, memslot->dirty_bitmap); | 1393 | goto out; |
| 1394 | kvm_userspace_mem.slot = kvm_mem.slot; | ||
| 1395 | kvm_userspace_mem.flags = kvm_mem.flags; | ||
| 1396 | kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; | ||
| 1397 | kvm_userspace_mem.memory_size = kvm_mem.memory_size; | ||
| 1398 | r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); | ||
| 1399 | if (r) | ||
| 1400 | goto out; | ||
| 1401 | break; | ||
| 972 | } | 1402 | } |
| 973 | } | 1403 | case KVM_SET_NR_MMU_PAGES: |
| 1404 | r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); | ||
| 1405 | if (r) | ||
| 1406 | goto out; | ||
| 1407 | break; | ||
| 1408 | case KVM_GET_NR_MMU_PAGES: | ||
| 1409 | r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); | ||
| 1410 | break; | ||
| 1411 | case KVM_SET_MEMORY_ALIAS: { | ||
| 1412 | struct kvm_memory_alias alias; | ||
| 974 | 1413 | ||
| 975 | int emulator_read_std(unsigned long addr, | 1414 | r = -EFAULT; |
| 976 | void *val, | 1415 | if (copy_from_user(&alias, argp, sizeof alias)) |
| 977 | unsigned int bytes, | 1416 | goto out; |
| 978 | struct kvm_vcpu *vcpu) | 1417 | r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); |
| 979 | { | 1418 | if (r) |
| 980 | void *data = val; | 1419 | goto out; |
| 1420 | break; | ||
| 1421 | } | ||
| 1422 | case KVM_CREATE_IRQCHIP: | ||
| 1423 | r = -ENOMEM; | ||
| 1424 | kvm->arch.vpic = kvm_create_pic(kvm); | ||
| 1425 | if (kvm->arch.vpic) { | ||
| 1426 | r = kvm_ioapic_init(kvm); | ||
| 1427 | if (r) { | ||
| 1428 | kfree(kvm->arch.vpic); | ||
| 1429 | kvm->arch.vpic = NULL; | ||
| 1430 | goto out; | ||
| 1431 | } | ||
| 1432 | } else | ||
| 1433 | goto out; | ||
| 1434 | break; | ||
| 1435 | case KVM_IRQ_LINE: { | ||
| 1436 | struct kvm_irq_level irq_event; | ||
| 981 | 1437 | ||
| 982 | while (bytes) { | 1438 | r = -EFAULT; |
| 983 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); | 1439 | if (copy_from_user(&irq_event, argp, sizeof irq_event)) |
| 984 | unsigned offset = addr & (PAGE_SIZE-1); | 1440 | goto out; |
| 985 | unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); | 1441 | if (irqchip_in_kernel(kvm)) { |
| 986 | unsigned long pfn; | 1442 | mutex_lock(&kvm->lock); |
| 987 | struct page *page; | 1443 | if (irq_event.irq < 16) |
| 988 | void *page_virt; | 1444 | kvm_pic_set_irq(pic_irqchip(kvm), |
| 1445 | irq_event.irq, | ||
| 1446 | irq_event.level); | ||
| 1447 | kvm_ioapic_set_irq(kvm->arch.vioapic, | ||
| 1448 | irq_event.irq, | ||
| 1449 | irq_event.level); | ||
| 1450 | mutex_unlock(&kvm->lock); | ||
| 1451 | r = 0; | ||
| 1452 | } | ||
| 1453 | break; | ||
| 1454 | } | ||
| 1455 | case KVM_GET_IRQCHIP: { | ||
| 1456 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | ||
| 1457 | struct kvm_irqchip chip; | ||
| 989 | 1458 | ||
| 990 | if (gpa == UNMAPPED_GVA) | 1459 | r = -EFAULT; |
| 991 | return X86EMUL_PROPAGATE_FAULT; | 1460 | if (copy_from_user(&chip, argp, sizeof chip)) |
| 992 | pfn = gpa >> PAGE_SHIFT; | 1461 | goto out; |
| 993 | page = gfn_to_page(vcpu->kvm, pfn); | 1462 | r = -ENXIO; |
| 994 | if (!page) | 1463 | if (!irqchip_in_kernel(kvm)) |
| 995 | return X86EMUL_UNHANDLEABLE; | 1464 | goto out; |
| 996 | page_virt = kmap_atomic(page, KM_USER0); | 1465 | r = kvm_vm_ioctl_get_irqchip(kvm, &chip); |
| 1466 | if (r) | ||
| 1467 | goto out; | ||
| 1468 | r = -EFAULT; | ||
| 1469 | if (copy_to_user(argp, &chip, sizeof chip)) | ||
| 1470 | goto out; | ||
| 1471 | r = 0; | ||
| 1472 | break; | ||
| 1473 | } | ||
| 1474 | case KVM_SET_IRQCHIP: { | ||
| 1475 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | ||
| 1476 | struct kvm_irqchip chip; | ||
| 997 | 1477 | ||
| 998 | memcpy(data, page_virt + offset, tocopy); | 1478 | r = -EFAULT; |
| 1479 | if (copy_from_user(&chip, argp, sizeof chip)) | ||
| 1480 | goto out; | ||
| 1481 | r = -ENXIO; | ||
| 1482 | if (!irqchip_in_kernel(kvm)) | ||
| 1483 | goto out; | ||
| 1484 | r = kvm_vm_ioctl_set_irqchip(kvm, &chip); | ||
| 1485 | if (r) | ||
| 1486 | goto out; | ||
| 1487 | r = 0; | ||
| 1488 | break; | ||
| 1489 | } | ||
| 1490 | case KVM_GET_SUPPORTED_CPUID: { | ||
| 1491 | struct kvm_cpuid2 __user *cpuid_arg = argp; | ||
| 1492 | struct kvm_cpuid2 cpuid; | ||
| 999 | 1493 | ||
| 1000 | kunmap_atomic(page_virt, KM_USER0); | 1494 | r = -EFAULT; |
| 1495 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
| 1496 | goto out; | ||
| 1497 | r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid, | ||
| 1498 | cpuid_arg->entries); | ||
| 1499 | if (r) | ||
| 1500 | goto out; | ||
| 1001 | 1501 | ||
| 1002 | bytes -= tocopy; | 1502 | r = -EFAULT; |
| 1003 | data += tocopy; | 1503 | if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) |
| 1004 | addr += tocopy; | 1504 | goto out; |
| 1505 | r = 0; | ||
| 1506 | break; | ||
| 1005 | } | 1507 | } |
| 1006 | 1508 | default: | |
| 1007 | return X86EMUL_CONTINUE; | 1509 | ; |
| 1510 | } | ||
| 1511 | out: | ||
| 1512 | return r; | ||
| 1008 | } | 1513 | } |
| 1009 | EXPORT_SYMBOL_GPL(emulator_read_std); | ||
| 1010 | 1514 | ||
| 1011 | static int emulator_write_std(unsigned long addr, | 1515 | static void kvm_init_msr_list(void) |
| 1012 | const void *val, | ||
| 1013 | unsigned int bytes, | ||
| 1014 | struct kvm_vcpu *vcpu) | ||
| 1015 | { | 1516 | { |
| 1016 | pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes); | 1517 | u32 dummy[2]; |
| 1017 | return X86EMUL_UNHANDLEABLE; | 1518 | unsigned i, j; |
| 1519 | |||
| 1520 | for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { | ||
| 1521 | if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) | ||
| 1522 | continue; | ||
| 1523 | if (j < i) | ||
| 1524 | msrs_to_save[j] = msrs_to_save[i]; | ||
| 1525 | j++; | ||
| 1526 | } | ||
| 1527 | num_msrs_to_save = j; | ||
| 1018 | } | 1528 | } |
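The loop above is the standard two-index in-place filter: rdmsr_safe() probes whether each MSR actually exists on this host, and survivors slide down so msrs_to_save stays dense. The same idiom on a plain array, for illustration:

        /* keep only even values, preserving order, no scratch storage */
        static int compact_even(int *a, int n)
        {
                int i, j;

                for (i = j = 0; i < n; i++) {
                        if (a[i] & 1)
                                continue;
                        if (j < i)
                                a[j] = a[i];
                        j++;
                }
                return j;       /* the new length */
        }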
| 1019 | 1529 | ||
| 1020 | /* | 1530 | /* |
| @@ -1025,14 +1535,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, | |||
| 1025 | { | 1535 | { |
| 1026 | struct kvm_io_device *dev; | 1536 | struct kvm_io_device *dev; |
| 1027 | 1537 | ||
| 1028 | if (vcpu->apic) { | 1538 | if (vcpu->arch.apic) { |
| 1029 | dev = &vcpu->apic->dev; | 1539 | dev = &vcpu->arch.apic->dev; |
| 1030 | if (dev->in_range(dev, addr)) | 1540 | if (dev->in_range(dev, addr)) |
| 1031 | return dev; | 1541 | return dev; |
| 1032 | } | 1542 | } |
| 1033 | return NULL; | 1543 | return NULL; |
| 1034 | } | 1544 | } |
| 1035 | 1545 | ||
| 1546 | |||
| 1036 | static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, | 1547 | static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, |
| 1037 | gpa_t addr) | 1548 | gpa_t addr) |
| 1038 | { | 1549 | { |
| @@ -1044,11 +1555,40 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, | |||
| 1044 | return dev; | 1555 | return dev; |
| 1045 | } | 1556 | } |
| 1046 | 1557 | ||
| 1047 | static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, | 1558 | int emulator_read_std(unsigned long addr, |
| 1048 | gpa_t addr) | 1559 | void *val, |
| 1560 | unsigned int bytes, | ||
| 1561 | struct kvm_vcpu *vcpu) | ||
| 1049 | { | 1562 | { |
| 1050 | return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); | 1563 | void *data = val; |
| 1564 | int r = X86EMUL_CONTINUE; | ||
| 1565 | |||
| 1566 | down_read(¤t->mm->mmap_sem); | ||
| 1567 | while (bytes) { | ||
| 1568 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | ||
| 1569 | unsigned offset = addr & (PAGE_SIZE-1); | ||
| 1570 | unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); | ||
| 1571 | int ret; | ||
| 1572 | |||
| 1573 | if (gpa == UNMAPPED_GVA) { | ||
| 1574 | r = X86EMUL_PROPAGATE_FAULT; | ||
| 1575 | goto out; | ||
| 1576 | } | ||
| 1577 | ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); | ||
| 1578 | if (ret < 0) { | ||
| 1579 | r = X86EMUL_UNHANDLEABLE; | ||
| 1580 | goto out; | ||
| 1581 | } | ||
| 1582 | |||
| 1583 | bytes -= tocopy; | ||
| 1584 | data += tocopy; | ||
| 1585 | addr += tocopy; | ||
| 1586 | } | ||
| 1587 | out: | ||
| 1588 | up_read(¤t->mm->mmap_sem); | ||
| 1589 | return r; | ||
| 1051 | } | 1590 | } |
| 1591 | EXPORT_SYMBOL_GPL(emulator_read_std); | ||
| 1052 | 1592 | ||
| 1053 | static int emulator_read_emulated(unsigned long addr, | 1593 | static int emulator_read_emulated(unsigned long addr, |
| 1054 | void *val, | 1594 | void *val, |
| @@ -1062,22 +1602,34 @@ static int emulator_read_emulated(unsigned long addr, | |||
| 1062 | memcpy(val, vcpu->mmio_data, bytes); | 1602 | memcpy(val, vcpu->mmio_data, bytes); |
| 1063 | vcpu->mmio_read_completed = 0; | 1603 | vcpu->mmio_read_completed = 0; |
| 1064 | return X86EMUL_CONTINUE; | 1604 | return X86EMUL_CONTINUE; |
| 1065 | } else if (emulator_read_std(addr, val, bytes, vcpu) | 1605 | } |
| 1066 | == X86EMUL_CONTINUE) | 1606 | |
| 1067 | return X86EMUL_CONTINUE; | 1607 | down_read(¤t->mm->mmap_sem); |
| 1608 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | ||
| 1609 | up_read(¤t->mm->mmap_sem); | ||
| 1610 | |||
| 1611 | /* For APIC access vmexit */ | ||
| 1612 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | ||
| 1613 | goto mmio; | ||
| 1068 | 1614 | ||
| 1069 | gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); | 1615 | if (emulator_read_std(addr, val, bytes, vcpu) |
| 1616 | == X86EMUL_CONTINUE) | ||
| 1617 | return X86EMUL_CONTINUE; | ||
| 1070 | if (gpa == UNMAPPED_GVA) | 1618 | if (gpa == UNMAPPED_GVA) |
| 1071 | return X86EMUL_PROPAGATE_FAULT; | 1619 | return X86EMUL_PROPAGATE_FAULT; |
| 1072 | 1620 | ||
| 1621 | mmio: | ||
| 1073 | /* | 1622 | /* |
| 1074 | * Is this MMIO handled locally? | 1623 | * Is this MMIO handled locally? |
| 1075 | */ | 1624 | */ |
| 1625 | mutex_lock(&vcpu->kvm->lock); | ||
| 1076 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); | 1626 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); |
| 1077 | if (mmio_dev) { | 1627 | if (mmio_dev) { |
| 1078 | kvm_iodevice_read(mmio_dev, gpa, bytes, val); | 1628 | kvm_iodevice_read(mmio_dev, gpa, bytes, val); |
| 1629 | mutex_unlock(&vcpu->kvm->lock); | ||
| 1079 | return X86EMUL_CONTINUE; | 1630 | return X86EMUL_CONTINUE; |
| 1080 | } | 1631 | } |
| 1632 | mutex_unlock(&vcpu->kvm->lock); | ||
| 1081 | 1633 | ||
| 1082 | vcpu->mmio_needed = 1; | 1634 | vcpu->mmio_needed = 1; |
| 1083 | vcpu->mmio_phys_addr = gpa; | 1635 | vcpu->mmio_phys_addr = gpa; |
| @@ -1090,19 +1642,16 @@ static int emulator_read_emulated(unsigned long addr, | |||
| 1090 | static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | 1642 | static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, |
| 1091 | const void *val, int bytes) | 1643 | const void *val, int bytes) |
| 1092 | { | 1644 | { |
| 1093 | struct page *page; | 1645 | int ret; |
| 1094 | void *virt; | ||
| 1095 | 1646 | ||
| 1096 | if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) | 1647 | down_read(¤t->mm->mmap_sem); |
| 1648 | ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); | ||
| 1649 | if (ret < 0) { | ||
| 1650 | up_read(¤t->mm->mmap_sem); | ||
| 1097 | return 0; | 1651 | return 0; |
| 1098 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | 1652 | } |
| 1099 | if (!page) | ||
| 1100 | return 0; | ||
| 1101 | mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
| 1102 | virt = kmap_atomic(page, KM_USER0); | ||
| 1103 | kvm_mmu_pte_write(vcpu, gpa, val, bytes); | 1653 | kvm_mmu_pte_write(vcpu, gpa, val, bytes); |
| 1104 | memcpy(virt + offset_in_page(gpa), val, bytes); | 1654 | up_read(¤t->mm->mmap_sem); |
| 1105 | kunmap_atomic(virt, KM_USER0); | ||
| 1106 | return 1; | 1655 | return 1; |
| 1107 | } | 1656 | } |
| 1108 | 1657 | ||
| @@ -1112,24 +1661,36 @@ static int emulator_write_emulated_onepage(unsigned long addr, | |||
| 1112 | struct kvm_vcpu *vcpu) | 1661 | struct kvm_vcpu *vcpu) |
| 1113 | { | 1662 | { |
| 1114 | struct kvm_io_device *mmio_dev; | 1663 | struct kvm_io_device *mmio_dev; |
| 1115 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); | 1664 | gpa_t gpa; |
| 1665 | |||
| 1666 | down_read(¤t->mm->mmap_sem); | ||
| 1667 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | ||
| 1668 | up_read(¤t->mm->mmap_sem); | ||
| 1116 | 1669 | ||
| 1117 | if (gpa == UNMAPPED_GVA) { | 1670 | if (gpa == UNMAPPED_GVA) { |
| 1118 | kvm_x86_ops->inject_page_fault(vcpu, addr, 2); | 1671 | kvm_inject_page_fault(vcpu, addr, 2); |
| 1119 | return X86EMUL_PROPAGATE_FAULT; | 1672 | return X86EMUL_PROPAGATE_FAULT; |
| 1120 | } | 1673 | } |
| 1121 | 1674 | ||
| 1675 | /* For APIC access vmexit */ | ||
| 1676 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | ||
| 1677 | goto mmio; | ||
| 1678 | |||
| 1122 | if (emulator_write_phys(vcpu, gpa, val, bytes)) | 1679 | if (emulator_write_phys(vcpu, gpa, val, bytes)) |
| 1123 | return X86EMUL_CONTINUE; | 1680 | return X86EMUL_CONTINUE; |
| 1124 | 1681 | ||
| 1682 | mmio: | ||
| 1125 | /* | 1683 | /* |
| 1126 | * Is this MMIO handled locally? | 1684 | * Is this MMIO handled locally? |
| 1127 | */ | 1685 | */ |
| 1686 | mutex_lock(&vcpu->kvm->lock); | ||
| 1128 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); | 1687 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); |
| 1129 | if (mmio_dev) { | 1688 | if (mmio_dev) { |
| 1130 | kvm_iodevice_write(mmio_dev, gpa, bytes, val); | 1689 | kvm_iodevice_write(mmio_dev, gpa, bytes, val); |
| 1690 | mutex_unlock(&vcpu->kvm->lock); | ||
| 1131 | return X86EMUL_CONTINUE; | 1691 | return X86EMUL_CONTINUE; |
| 1132 | } | 1692 | } |
| 1693 | mutex_unlock(&vcpu->kvm->lock); | ||
| 1133 | 1694 | ||
| 1134 | vcpu->mmio_needed = 1; | 1695 | vcpu->mmio_needed = 1; |
| 1135 | vcpu->mmio_phys_addr = gpa; | 1696 | vcpu->mmio_phys_addr = gpa; |
| @@ -1173,6 +1734,35 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
| 1173 | reported = 1; | 1734 | reported = 1; |
| 1174 | printk(KERN_WARNING "kvm: emulating exchange as write\n"); | 1735 | printk(KERN_WARNING "kvm: emulating exchange as write\n"); |
| 1175 | } | 1736 | } |
| 1737 | #ifndef CONFIG_X86_64 | ||
| 1738 | /* a guest's cmpxchg8b has to be emulated atomically */ | ||
| 1739 | if (bytes == 8) { | ||
| 1740 | gpa_t gpa; | ||
| 1741 | struct page *page; | ||
| 1742 | char *addr; | ||
| 1743 | u64 val; | ||
| 1744 | |||
| 1745 | down_read(¤t->mm->mmap_sem); | ||
| 1746 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | ||
| 1747 | |||
| 1748 | if (gpa == UNMAPPED_GVA || | ||
| 1749 | (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | ||
| 1750 | goto emul_write; | ||
| 1751 | |||
| 1752 | if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) | ||
| 1753 | goto emul_write; | ||
| 1754 | |||
| 1755 | val = *(u64 *)new; | ||
| 1756 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
| 1757 | addr = kmap_atomic(page, KM_USER0); | ||
| 1758 | set_64bit((u64 *)(addr + offset_in_page(gpa)), val); | ||
| 1759 | kunmap_atomic(addr, KM_USER0); | ||
| 1760 | kvm_release_page_dirty(page); | ||
| 1761 | emul_write: | ||
| 1762 | up_read(¤t->mm->mmap_sem); | ||
| 1763 | } | ||
| 1764 | #endif | ||
| 1765 | |||
| 1176 | return emulator_write_emulated(addr, new, bytes, vcpu); | 1766 | return emulator_write_emulated(addr, new, bytes, vcpu); |
| 1177 | } | 1767 | } |
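The point of the set_64bit() path above: on a 32-bit host a plain copy of the new value would be two 4-byte stores, and another vcpu could observe a torn 8-byte value in guest memory. A sketch of the underlying idea using a GCC builtin, illustrative only (the kernel helper is hand-written assembly, and i386 needs the cmpxchg8b instruction for this to work):

        static inline void atomic_store64(volatile unsigned long long *p,
                                          unsigned long long v)
        {
                unsigned long long old = *p, cur;

                for (;;) {
                        cur = __sync_val_compare_and_swap(p, old, v);
                        if (cur == old)
                                break;          /* swap succeeded */
                        old = cur;              /* lost a race; retry */
                }
        }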
| 1178 | 1768 | ||
| @@ -1188,11 +1778,11 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) | |||
| 1188 | 1778 | ||
| 1189 | int emulate_clts(struct kvm_vcpu *vcpu) | 1779 | int emulate_clts(struct kvm_vcpu *vcpu) |
| 1190 | { | 1780 | { |
| 1191 | kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS); | 1781 | kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); |
| 1192 | return X86EMUL_CONTINUE; | 1782 | return X86EMUL_CONTINUE; |
| 1193 | } | 1783 | } |
| 1194 | 1784 | ||
| 1195 | int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) | 1785 | int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) |
| 1196 | { | 1786 | { |
| 1197 | struct kvm_vcpu *vcpu = ctxt->vcpu; | 1787 | struct kvm_vcpu *vcpu = ctxt->vcpu; |
| 1198 | 1788 | ||
| @@ -1223,7 +1813,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) | |||
| 1223 | { | 1813 | { |
| 1224 | static int reported; | 1814 | static int reported; |
| 1225 | u8 opcodes[4]; | 1815 | u8 opcodes[4]; |
| 1226 | unsigned long rip = vcpu->rip; | 1816 | unsigned long rip = vcpu->arch.rip; |
| 1227 | unsigned long rip_linear; | 1817 | unsigned long rip_linear; |
| 1228 | 1818 | ||
| 1229 | rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); | 1819 | rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); |
| @@ -1241,7 +1831,6 @@ EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); | |||
| 1241 | 1831 | ||
| 1242 | struct x86_emulate_ops emulate_ops = { | 1832 | struct x86_emulate_ops emulate_ops = { |
| 1243 | .read_std = emulator_read_std, | 1833 | .read_std = emulator_read_std, |
| 1244 | .write_std = emulator_write_std, | ||
| 1245 | .read_emulated = emulator_read_emulated, | 1834 | .read_emulated = emulator_read_emulated, |
| 1246 | .write_emulated = emulator_write_emulated, | 1835 | .write_emulated = emulator_write_emulated, |
| 1247 | .cmpxchg_emulated = emulator_cmpxchg_emulated, | 1836 | .cmpxchg_emulated = emulator_cmpxchg_emulated, |
| @@ -1250,44 +1839,74 @@ struct x86_emulate_ops emulate_ops = { | |||
| 1250 | int emulate_instruction(struct kvm_vcpu *vcpu, | 1839 | int emulate_instruction(struct kvm_vcpu *vcpu, |
| 1251 | struct kvm_run *run, | 1840 | struct kvm_run *run, |
| 1252 | unsigned long cr2, | 1841 | unsigned long cr2, |
| 1253 | u16 error_code) | 1842 | u16 error_code, |
| 1843 | int emulation_type) | ||
| 1254 | { | 1844 | { |
| 1255 | struct x86_emulate_ctxt emulate_ctxt; | ||
| 1256 | int r; | 1845 | int r; |
| 1257 | int cs_db, cs_l; | 1846 | struct decode_cache *c; |
| 1258 | 1847 | ||
| 1259 | vcpu->mmio_fault_cr2 = cr2; | 1848 | vcpu->arch.mmio_fault_cr2 = cr2; |
| 1260 | kvm_x86_ops->cache_regs(vcpu); | 1849 | kvm_x86_ops->cache_regs(vcpu); |
| 1261 | 1850 | ||
| 1262 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | 1851 | vcpu->mmio_is_write = 0; |
| 1263 | 1852 | vcpu->arch.pio.string = 0; | |
| 1264 | emulate_ctxt.vcpu = vcpu; | 1853 | |
| 1265 | emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); | 1854 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { |
| 1266 | emulate_ctxt.cr2 = cr2; | 1855 | int cs_db, cs_l; |
| 1267 | emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) | 1856 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); |
| 1268 | ? X86EMUL_MODE_REAL : cs_l | 1857 | |
| 1269 | ? X86EMUL_MODE_PROT64 : cs_db | 1858 | vcpu->arch.emulate_ctxt.vcpu = vcpu; |
| 1270 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | 1859 | vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); |
| 1271 | 1860 | vcpu->arch.emulate_ctxt.mode = | |
| 1272 | if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { | 1861 | (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) |
| 1273 | emulate_ctxt.cs_base = 0; | 1862 | ? X86EMUL_MODE_REAL : cs_l |
| 1274 | emulate_ctxt.ds_base = 0; | 1863 | ? X86EMUL_MODE_PROT64 : cs_db |
| 1275 | emulate_ctxt.es_base = 0; | 1864 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; |
| 1276 | emulate_ctxt.ss_base = 0; | 1865 | |
| 1277 | } else { | 1866 | if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) { |
| 1278 | emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS); | 1867 | vcpu->arch.emulate_ctxt.cs_base = 0; |
| 1279 | emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS); | 1868 | vcpu->arch.emulate_ctxt.ds_base = 0; |
| 1280 | emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES); | 1869 | vcpu->arch.emulate_ctxt.es_base = 0; |
| 1281 | emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS); | 1870 | vcpu->arch.emulate_ctxt.ss_base = 0; |
| 1871 | } else { | ||
| 1872 | vcpu->arch.emulate_ctxt.cs_base = | ||
| 1873 | get_segment_base(vcpu, VCPU_SREG_CS); | ||
| 1874 | vcpu->arch.emulate_ctxt.ds_base = | ||
| 1875 | get_segment_base(vcpu, VCPU_SREG_DS); | ||
| 1876 | vcpu->arch.emulate_ctxt.es_base = | ||
| 1877 | get_segment_base(vcpu, VCPU_SREG_ES); | ||
| 1878 | vcpu->arch.emulate_ctxt.ss_base = | ||
| 1879 | get_segment_base(vcpu, VCPU_SREG_SS); | ||
| 1880 | } | ||
| 1881 | |||
| 1882 | vcpu->arch.emulate_ctxt.gs_base = | ||
| 1883 | get_segment_base(vcpu, VCPU_SREG_GS); | ||
| 1884 | vcpu->arch.emulate_ctxt.fs_base = | ||
| 1885 | get_segment_base(vcpu, VCPU_SREG_FS); | ||
| 1886 | |||
| 1887 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); | ||
| 1888 | |||
| 1889 | /* Reject the instructions other than VMCALL/VMMCALL when | ||
| 1890 | * try to emulate invalid opcode */ | ||
| 1891 | c = &vcpu->arch.emulate_ctxt.decode; | ||
| 1892 | if ((emulation_type & EMULTYPE_TRAP_UD) && | ||
| 1893 | (!(c->twobyte && c->b == 0x01 && | ||
| 1894 | (c->modrm_reg == 0 || c->modrm_reg == 3) && | ||
| 1895 | c->modrm_mod == 3 && c->modrm_rm == 1))) | ||
| 1896 | return EMULATE_FAIL; | ||
| 1897 | |||
| 1898 | ++vcpu->stat.insn_emulation; | ||
| 1899 | if (r) { | ||
| 1900 | ++vcpu->stat.insn_emulation_fail; | ||
| 1901 | if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) | ||
| 1902 | return EMULATE_DONE; | ||
| 1903 | return EMULATE_FAIL; | ||
| 1904 | } | ||
| 1282 | } | 1905 | } |
| 1283 | 1906 | ||
| 1284 | emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS); | 1907 | r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); |
| 1285 | emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS); | ||
| 1286 | 1908 | ||
| 1287 | vcpu->mmio_is_write = 0; | 1909 | if (vcpu->arch.pio.string) |
| 1288 | vcpu->pio.string = 0; | ||
| 1289 | r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); | ||
| 1290 | if (vcpu->pio.string) | ||
| 1291 | return EMULATE_DO_MMIO; | 1910 | return EMULATE_DO_MMIO; |
| 1292 | 1911 | ||
| 1293 | if ((r || vcpu->mmio_is_write) && run) { | 1912 | if ((r || vcpu->mmio_is_write) && run) { |
| @@ -1309,7 +1928,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
| 1309 | } | 1928 | } |
| 1310 | 1929 | ||
| 1311 | kvm_x86_ops->decache_regs(vcpu); | 1930 | kvm_x86_ops->decache_regs(vcpu); |
| 1312 | kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags); | 1931 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); |
| 1313 | 1932 | ||
| 1314 | if (vcpu->mmio_is_write) { | 1933 | if (vcpu->mmio_is_write) { |
| 1315 | vcpu->mmio_needed = 0; | 1934 | vcpu->mmio_needed = 0; |
| @@ -1320,439 +1939,45 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
| 1320 | } | 1939 | } |
| 1321 | EXPORT_SYMBOL_GPL(emulate_instruction); | 1940 | EXPORT_SYMBOL_GPL(emulate_instruction); |
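A sketch of a call site for the reworked entry point: a #UD intercept can ask the emulator to accept only the VMCALL/VMMCALL patterns filtered above. Names follow this file; kvm_queue_exception() is assumed from the same patch series:

        static void handle_ud(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        {
                int er;

                er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
                if (er != EMULATE_DONE)
                        kvm_queue_exception(vcpu, UD_VECTOR);   /* reflect the #UD */
        }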
| 1322 | 1941 | ||
| 1323 | /* | 1942 | static void free_pio_guest_pages(struct kvm_vcpu *vcpu) |
| 1324 | * The vCPU has executed a HLT instruction with in-kernel mode enabled. | ||
| 1325 | */ | ||
| 1326 | static void kvm_vcpu_block(struct kvm_vcpu *vcpu) | ||
| 1327 | { | ||
| 1328 | DECLARE_WAITQUEUE(wait, current); | ||
| 1329 | |||
| 1330 | add_wait_queue(&vcpu->wq, &wait); | ||
| 1331 | |||
| 1332 | /* | ||
| 1333 | * We will block until either an interrupt or a signal wakes us up | ||
| 1334 | */ | ||
| 1335 | while (!kvm_cpu_has_interrupt(vcpu) | ||
| 1336 | && !signal_pending(current) | ||
| 1337 | && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE | ||
| 1338 | && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) { | ||
| 1339 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 1340 | vcpu_put(vcpu); | ||
| 1341 | schedule(); | ||
| 1342 | vcpu_load(vcpu); | ||
| 1343 | } | ||
| 1344 | |||
| 1345 | __set_current_state(TASK_RUNNING); | ||
| 1346 | remove_wait_queue(&vcpu->wq, &wait); | ||
| 1347 | } | ||
| 1348 | |||
| 1349 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) | ||
| 1350 | { | ||
| 1351 | ++vcpu->stat.halt_exits; | ||
| 1352 | if (irqchip_in_kernel(vcpu->kvm)) { | ||
| 1353 | vcpu->mp_state = VCPU_MP_STATE_HALTED; | ||
| 1354 | kvm_vcpu_block(vcpu); | ||
| 1355 | if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE) | ||
| 1356 | return -EINTR; | ||
| 1357 | return 1; | ||
| 1358 | } else { | ||
| 1359 | vcpu->run->exit_reason = KVM_EXIT_HLT; | ||
| 1360 | return 0; | ||
| 1361 | } | ||
| 1362 | } | ||
| 1363 | EXPORT_SYMBOL_GPL(kvm_emulate_halt); | ||
| 1364 | |||
| 1365 | int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) | ||
| 1366 | { | ||
| 1367 | unsigned long nr, a0, a1, a2, a3, a4, a5, ret; | ||
| 1368 | |||
| 1369 | kvm_x86_ops->cache_regs(vcpu); | ||
| 1370 | ret = -KVM_EINVAL; | ||
| 1371 | #ifdef CONFIG_X86_64 | ||
| 1372 | if (is_long_mode(vcpu)) { | ||
| 1373 | nr = vcpu->regs[VCPU_REGS_RAX]; | ||
| 1374 | a0 = vcpu->regs[VCPU_REGS_RDI]; | ||
| 1375 | a1 = vcpu->regs[VCPU_REGS_RSI]; | ||
| 1376 | a2 = vcpu->regs[VCPU_REGS_RDX]; | ||
| 1377 | a3 = vcpu->regs[VCPU_REGS_RCX]; | ||
| 1378 | a4 = vcpu->regs[VCPU_REGS_R8]; | ||
| 1379 | a5 = vcpu->regs[VCPU_REGS_R9]; | ||
| 1380 | } else | ||
| 1381 | #endif | ||
| 1382 | { | ||
| 1383 | nr = vcpu->regs[VCPU_REGS_RBX] & -1u; | ||
| 1384 | a0 = vcpu->regs[VCPU_REGS_RAX] & -1u; | ||
| 1385 | a1 = vcpu->regs[VCPU_REGS_RCX] & -1u; | ||
| 1386 | a2 = vcpu->regs[VCPU_REGS_RDX] & -1u; | ||
| 1387 | a3 = vcpu->regs[VCPU_REGS_RSI] & -1u; | ||
| 1388 | a4 = vcpu->regs[VCPU_REGS_RDI] & -1u; | ||
| 1389 | a5 = vcpu->regs[VCPU_REGS_RBP] & -1u; | ||
| 1390 | } | ||
| 1391 | switch (nr) { | ||
| 1392 | default: | ||
| 1393 | run->hypercall.nr = nr; | ||
| 1394 | run->hypercall.args[0] = a0; | ||
| 1395 | run->hypercall.args[1] = a1; | ||
| 1396 | run->hypercall.args[2] = a2; | ||
| 1397 | run->hypercall.args[3] = a3; | ||
| 1398 | run->hypercall.args[4] = a4; | ||
| 1399 | run->hypercall.args[5] = a5; | ||
| 1400 | run->hypercall.ret = ret; | ||
| 1401 | run->hypercall.longmode = is_long_mode(vcpu); | ||
| 1402 | kvm_x86_ops->decache_regs(vcpu); | ||
| 1403 | return 0; | ||
| 1404 | } | ||
| 1405 | vcpu->regs[VCPU_REGS_RAX] = ret; | ||
| 1406 | kvm_x86_ops->decache_regs(vcpu); | ||
| 1407 | return 1; | ||
| 1408 | } | ||
| 1409 | EXPORT_SYMBOL_GPL(kvm_hypercall); | ||
| 1410 | |||
| 1411 | static u64 mk_cr_64(u64 curr_cr, u32 new_val) | ||
| 1412 | { | ||
| 1413 | return (curr_cr & ~((1ULL << 32) - 1)) | new_val; | ||
| 1414 | } | ||
| 1415 | |||
| 1416 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
| 1417 | { | ||
| 1418 | struct descriptor_table dt = { limit, base }; | ||
| 1419 | |||
| 1420 | kvm_x86_ops->set_gdt(vcpu, &dt); | ||
| 1421 | } | ||
| 1422 | |||
| 1423 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
| 1424 | { | ||
| 1425 | struct descriptor_table dt = { limit, base }; | ||
| 1426 | |||
| 1427 | kvm_x86_ops->set_idt(vcpu, &dt); | ||
| 1428 | } | ||
| 1429 | |||
| 1430 | void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, | ||
| 1431 | unsigned long *rflags) | ||
| 1432 | { | ||
| 1433 | lmsw(vcpu, msw); | ||
| 1434 | *rflags = kvm_x86_ops->get_rflags(vcpu); | ||
| 1435 | } | ||
| 1436 | |||
| 1437 | unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) | ||
| 1438 | { | ||
| 1439 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | ||
| 1440 | switch (cr) { | ||
| 1441 | case 0: | ||
| 1442 | return vcpu->cr0; | ||
| 1443 | case 2: | ||
| 1444 | return vcpu->cr2; | ||
| 1445 | case 3: | ||
| 1446 | return vcpu->cr3; | ||
| 1447 | case 4: | ||
| 1448 | return vcpu->cr4; | ||
| 1449 | default: | ||
| 1450 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
| 1451 | return 0; | ||
| 1452 | } | ||
| 1453 | } | ||
| 1454 | |||
| 1455 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, | ||
| 1456 | unsigned long *rflags) | ||
| 1457 | { | ||
| 1458 | switch (cr) { | ||
| 1459 | case 0: | ||
| 1460 | set_cr0(vcpu, mk_cr_64(vcpu->cr0, val)); | ||
| 1461 | *rflags = kvm_x86_ops->get_rflags(vcpu); | ||
| 1462 | break; | ||
| 1463 | case 2: | ||
| 1464 | vcpu->cr2 = val; | ||
| 1465 | break; | ||
| 1466 | case 3: | ||
| 1467 | set_cr3(vcpu, val); | ||
| 1468 | break; | ||
| 1469 | case 4: | ||
| 1470 | set_cr4(vcpu, mk_cr_64(vcpu->cr4, val)); | ||
| 1471 | break; | ||
| 1472 | default: | ||
| 1473 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
| 1474 | } | ||
| 1475 | } | ||
| 1476 | |||
| 1477 | /* | ||
| 1478 | * Register the para guest with the host: | ||
| 1479 | */ | ||
| 1480 | static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa) | ||
| 1481 | { | ||
| 1482 | struct kvm_vcpu_para_state *para_state; | ||
| 1483 | hpa_t para_state_hpa, hypercall_hpa; | ||
| 1484 | struct page *para_state_page; | ||
| 1485 | unsigned char *hypercall; | ||
| 1486 | gpa_t hypercall_gpa; | ||
| 1487 | |||
| 1488 | printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n"); | ||
| 1489 | printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa); | ||
| 1490 | |||
| 1491 | /* | ||
| 1492 | * Needs to be page aligned: | ||
| 1493 | */ | ||
| 1494 | if (para_state_gpa != PAGE_ALIGN(para_state_gpa)) | ||
| 1495 | goto err_gp; | ||
| 1496 | |||
| 1497 | para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa); | ||
| 1498 | printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa); | ||
| 1499 | if (is_error_hpa(para_state_hpa)) | ||
| 1500 | goto err_gp; | ||
| 1501 | |||
| 1502 | mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT); | ||
| 1503 | para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT); | ||
| 1504 | para_state = kmap(para_state_page); | ||
| 1505 | |||
| 1506 | printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version); | ||
| 1507 | printk(KERN_DEBUG ".... size: %d\n", para_state->size); | ||
| 1508 | |||
| 1509 | para_state->host_version = KVM_PARA_API_VERSION; | ||
| 1510 | /* | ||
| 1511 | * We cannot support guests that try to register themselves | ||
| 1512 | * with a newer API version than the host supports: | ||
| 1513 | */ | ||
| 1514 | if (para_state->guest_version > KVM_PARA_API_VERSION) { | ||
| 1515 | para_state->ret = -KVM_EINVAL; | ||
| 1516 | goto err_kunmap_skip; | ||
| 1517 | } | ||
| 1518 | |||
| 1519 | hypercall_gpa = para_state->hypercall_gpa; | ||
| 1520 | hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa); | ||
| 1521 | printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa); | ||
| 1522 | if (is_error_hpa(hypercall_hpa)) { | ||
| 1523 | para_state->ret = -KVM_EINVAL; | ||
| 1524 | goto err_kunmap_skip; | ||
| 1525 | } | ||
| 1526 | |||
| 1527 | printk(KERN_DEBUG "kvm: para guest successfully registered.\n"); | ||
| 1528 | vcpu->para_state_page = para_state_page; | ||
| 1529 | vcpu->para_state_gpa = para_state_gpa; | ||
| 1530 | vcpu->hypercall_gpa = hypercall_gpa; | ||
| 1531 | |||
| 1532 | mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT); | ||
| 1533 | hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT), | ||
| 1534 | KM_USER1) + (hypercall_hpa & ~PAGE_MASK); | ||
| 1535 | kvm_x86_ops->patch_hypercall(vcpu, hypercall); | ||
| 1536 | kunmap_atomic(hypercall, KM_USER1); | ||
| 1537 | |||
| 1538 | para_state->ret = 0; | ||
| 1539 | err_kunmap_skip: | ||
| 1540 | kunmap(para_state_page); | ||
| 1541 | return 0; | ||
| 1542 | err_gp: | ||
| 1543 | return 1; | ||
| 1544 | } | ||
| 1545 | |||
| 1546 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | ||
| 1547 | { | ||
| 1548 | u64 data; | ||
| 1549 | |||
| 1550 | switch (msr) { | ||
| 1551 | case 0xc0010010: /* SYSCFG */ | ||
| 1552 | case 0xc0010015: /* HWCR */ | ||
| 1553 | case MSR_IA32_PLATFORM_ID: | ||
| 1554 | case MSR_IA32_P5_MC_ADDR: | ||
| 1555 | case MSR_IA32_P5_MC_TYPE: | ||
| 1556 | case MSR_IA32_MC0_CTL: | ||
| 1557 | case MSR_IA32_MCG_STATUS: | ||
| 1558 | case MSR_IA32_MCG_CAP: | ||
| 1559 | case MSR_IA32_MC0_MISC: | ||
| 1560 | case MSR_IA32_MC0_MISC+4: | ||
| 1561 | case MSR_IA32_MC0_MISC+8: | ||
| 1562 | case MSR_IA32_MC0_MISC+12: | ||
| 1563 | case MSR_IA32_MC0_MISC+16: | ||
| 1564 | case MSR_IA32_UCODE_REV: | ||
| 1565 | case MSR_IA32_PERF_STATUS: | ||
| 1566 | case MSR_IA32_EBL_CR_POWERON: | ||
| 1567 | /* MTRR registers */ | ||
| 1568 | case 0xfe: | ||
| 1569 | case 0x200 ... 0x2ff: | ||
| 1570 | data = 0; | ||
| 1571 | break; | ||
| 1572 | case 0xcd: /* fsb frequency */ | ||
| 1573 | data = 3; | ||
| 1574 | break; | ||
| 1575 | case MSR_IA32_APICBASE: | ||
| 1576 | data = kvm_get_apic_base(vcpu); | ||
| 1577 | break; | ||
| 1578 | case MSR_IA32_MISC_ENABLE: | ||
| 1579 | data = vcpu->ia32_misc_enable_msr; | ||
| 1580 | break; | ||
| 1581 | #ifdef CONFIG_X86_64 | ||
| 1582 | case MSR_EFER: | ||
| 1583 | data = vcpu->shadow_efer; | ||
| 1584 | break; | ||
| 1585 | #endif | ||
| 1586 | default: | ||
| 1587 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); | ||
| 1588 | return 1; | ||
| 1589 | } | ||
| 1590 | *pdata = data; | ||
| 1591 | return 0; | ||
| 1592 | } | ||
| 1593 | EXPORT_SYMBOL_GPL(kvm_get_msr_common); | ||
| 1594 | |||
| 1595 | /* | ||
| 1596 | * Reads an msr value (of 'msr_index') into 'pdata'. | ||
| 1597 | * Returns 0 on success, non-0 otherwise. | ||
| 1598 | * Assumes vcpu_load() was already called. | ||
| 1599 | */ | ||
| 1600 | int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | ||
| 1601 | { | ||
| 1602 | return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); | ||
| 1603 | } | ||
| 1604 | |||
| 1605 | #ifdef CONFIG_X86_64 | ||
| 1606 | |||
| 1607 | static void set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
| 1608 | { | ||
| 1609 | if (efer & EFER_RESERVED_BITS) { | ||
| 1610 | printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", | ||
| 1611 | efer); | ||
| 1612 | inject_gp(vcpu); | ||
| 1613 | return; | ||
| 1614 | } | ||
| 1615 | |||
| 1616 | if (is_paging(vcpu) | ||
| 1617 | && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) { | ||
| 1618 | printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); | ||
| 1619 | inject_gp(vcpu); | ||
| 1620 | return; | ||
| 1621 | } | ||
| 1622 | |||
| 1623 | kvm_x86_ops->set_efer(vcpu, efer); | ||
| 1624 | |||
| 1625 | efer &= ~EFER_LMA; | ||
| 1626 | efer |= vcpu->shadow_efer & EFER_LMA; | ||
| 1627 | |||
| 1628 | vcpu->shadow_efer = efer; | ||
| 1629 | } | ||
| 1630 | |||
| 1631 | #endif | ||
| 1632 | |||
| 1633 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | ||
| 1634 | { | ||
| 1635 | switch (msr) { | ||
| 1636 | #ifdef CONFIG_X86_64 | ||
| 1637 | case MSR_EFER: | ||
| 1638 | set_efer(vcpu, data); | ||
| 1639 | break; | ||
| 1640 | #endif | ||
| 1641 | case MSR_IA32_MC0_STATUS: | ||
| 1642 | pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", | ||
| 1643 | __FUNCTION__, data); | ||
| 1644 | break; | ||
| 1645 | case MSR_IA32_MCG_STATUS: | ||
| 1646 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", | ||
| 1647 | __FUNCTION__, data); | ||
| 1648 | break; | ||
| 1649 | case MSR_IA32_UCODE_REV: | ||
| 1650 | case MSR_IA32_UCODE_WRITE: | ||
| 1651 | case 0x200 ... 0x2ff: /* MTRRs */ | ||
| 1652 | break; | ||
| 1653 | case MSR_IA32_APICBASE: | ||
| 1654 | kvm_set_apic_base(vcpu, data); | ||
| 1655 | break; | ||
| 1656 | case MSR_IA32_MISC_ENABLE: | ||
| 1657 | vcpu->ia32_misc_enable_msr = data; | ||
| 1658 | break; | ||
| 1659 | /* | ||
| 1660 | * This is the 'probe whether the host is KVM' logic: | ||
| 1661 | */ | ||
| 1662 | case MSR_KVM_API_MAGIC: | ||
| 1663 | return vcpu_register_para(vcpu, data); | ||
| 1664 | |||
| 1665 | default: | ||
| 1666 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr); | ||
| 1667 | return 1; | ||
| 1668 | } | ||
| 1669 | return 0; | ||
| 1670 | } | ||
| 1671 | EXPORT_SYMBOL_GPL(kvm_set_msr_common); | ||
| 1672 | |||
| 1673 | /* | ||
| 1674 | * Writes msr value into the appropriate "register". | ||
| 1675 | * Returns 0 on success, non-0 otherwise. | ||
| 1676 | * Assumes vcpu_load() was already called. | ||
| 1677 | */ | ||
| 1678 | int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
| 1679 | { | ||
| 1680 | return kvm_x86_ops->set_msr(vcpu, msr_index, data); | ||
| 1681 | } | ||
| 1682 | |||
| 1683 | void kvm_resched(struct kvm_vcpu *vcpu) | ||
| 1684 | { | ||
| 1685 | if (!need_resched()) | ||
| 1686 | return; | ||
| 1687 | cond_resched(); | ||
| 1688 | } | ||
| 1689 | EXPORT_SYMBOL_GPL(kvm_resched); | ||
| 1690 | |||
| 1691 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | ||
| 1692 | { | 1943 | { |
| 1693 | int i; | 1944 | int i; |
| 1694 | u32 function; | ||
| 1695 | struct kvm_cpuid_entry *e, *best; | ||
| 1696 | 1945 | ||
| 1697 | kvm_x86_ops->cache_regs(vcpu); | 1946 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) |
| 1698 | function = vcpu->regs[VCPU_REGS_RAX]; | 1947 | if (vcpu->arch.pio.guest_pages[i]) { |
| 1699 | vcpu->regs[VCPU_REGS_RAX] = 0; | 1948 | kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); |
| 1700 | vcpu->regs[VCPU_REGS_RBX] = 0; | 1949 | vcpu->arch.pio.guest_pages[i] = NULL; |
| 1701 | vcpu->regs[VCPU_REGS_RCX] = 0; | ||
| 1702 | vcpu->regs[VCPU_REGS_RDX] = 0; | ||
| 1703 | best = NULL; | ||
| 1704 | for (i = 0; i < vcpu->cpuid_nent; ++i) { | ||
| 1705 | e = &vcpu->cpuid_entries[i]; | ||
| 1706 | if (e->function == function) { | ||
| 1707 | best = e; | ||
| 1708 | break; | ||
| 1709 | } | 1950 | } |
| 1710 | /* | ||
| 1711 | * Both basic or both extended? | ||
| 1712 | */ | ||
| 1713 | if (((e->function ^ function) & 0x80000000) == 0) | ||
| 1714 | if (!best || e->function > best->function) | ||
| 1715 | best = e; | ||
| 1716 | } | ||
| 1717 | if (best) { | ||
| 1718 | vcpu->regs[VCPU_REGS_RAX] = best->eax; | ||
| 1719 | vcpu->regs[VCPU_REGS_RBX] = best->ebx; | ||
| 1720 | vcpu->regs[VCPU_REGS_RCX] = best->ecx; | ||
| 1721 | vcpu->regs[VCPU_REGS_RDX] = best->edx; | ||
| 1722 | } | ||
| 1723 | kvm_x86_ops->decache_regs(vcpu); | ||
| 1724 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
| 1725 | } | 1951 | } |
| 1726 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | ||
| 1727 | 1952 | ||
| 1728 | static int pio_copy_data(struct kvm_vcpu *vcpu) | 1953 | static int pio_copy_data(struct kvm_vcpu *vcpu) |
| 1729 | { | 1954 | { |
| 1730 | void *p = vcpu->pio_data; | 1955 | void *p = vcpu->arch.pio_data; |
| 1731 | void *q; | 1956 | void *q; |
| 1732 | unsigned bytes; | 1957 | unsigned bytes; |
| 1733 | int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; | 1958 | int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; |
| 1734 | 1959 | ||
| 1735 | q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, | 1960 | q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, |
| 1736 | PAGE_KERNEL); | 1961 | PAGE_KERNEL); |
| 1737 | if (!q) { | 1962 | if (!q) { |
| 1738 | free_pio_guest_pages(vcpu); | 1963 | free_pio_guest_pages(vcpu); |
| 1739 | return -ENOMEM; | 1964 | return -ENOMEM; |
| 1740 | } | 1965 | } |
| 1741 | q += vcpu->pio.guest_page_offset; | 1966 | q += vcpu->arch.pio.guest_page_offset; |
| 1742 | bytes = vcpu->pio.size * vcpu->pio.cur_count; | 1967 | bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; |
| 1743 | if (vcpu->pio.in) | 1968 | if (vcpu->arch.pio.in) |
| 1744 | memcpy(q, p, bytes); | 1969 | memcpy(q, p, bytes); |
| 1745 | else | 1970 | else |
| 1746 | memcpy(p, q, bytes); | 1971 | memcpy(p, q, bytes); |
| 1747 | q -= vcpu->pio.guest_page_offset; | 1972 | q -= vcpu->arch.pio.guest_page_offset; |
| 1748 | vunmap(q); | 1973 | vunmap(q); |
| 1749 | free_pio_guest_pages(vcpu); | 1974 | free_pio_guest_pages(vcpu); |
| 1750 | return 0; | 1975 | return 0; |
| 1751 | } | 1976 | } |
| 1752 | 1977 | ||
| 1753 | static int complete_pio(struct kvm_vcpu *vcpu) | 1978 | int complete_pio(struct kvm_vcpu *vcpu) |
| 1754 | { | 1979 | { |
| 1755 | struct kvm_pio_request *io = &vcpu->pio; | 1980 | struct kvm_pio_request *io = &vcpu->arch.pio; |
| 1756 | long delta; | 1981 | long delta; |
| 1757 | int r; | 1982 | int r; |
| 1758 | 1983 | ||
| @@ -1760,7 +1985,7 @@ static int complete_pio(struct kvm_vcpu *vcpu) | |||
| 1760 | 1985 | ||
| 1761 | if (!io->string) { | 1986 | if (!io->string) { |
| 1762 | if (io->in) | 1987 | if (io->in) |
| 1763 | memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data, | 1988 | memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, |
| 1764 | io->size); | 1989 | io->size); |
| 1765 | } else { | 1990 | } else { |
| 1766 | if (io->in) { | 1991 | if (io->in) { |
| @@ -1778,15 +2003,15 @@ static int complete_pio(struct kvm_vcpu *vcpu) | |||
| 1778 | * The size of the register should really depend on | 2003 | * The size of the register should really depend on |
| 1779 | * current address size. | 2004 | * current address size. |
| 1780 | */ | 2005 | */ |
| 1781 | vcpu->regs[VCPU_REGS_RCX] -= delta; | 2006 | vcpu->arch.regs[VCPU_REGS_RCX] -= delta; |
| 1782 | } | 2007 | } |
| 1783 | if (io->down) | 2008 | if (io->down) |
| 1784 | delta = -delta; | 2009 | delta = -delta; |
| 1785 | delta *= io->size; | 2010 | delta *= io->size; |
| 1786 | if (io->in) | 2011 | if (io->in) |
| 1787 | vcpu->regs[VCPU_REGS_RDI] += delta; | 2012 | vcpu->arch.regs[VCPU_REGS_RDI] += delta; |
| 1788 | else | 2013 | else |
| 1789 | vcpu->regs[VCPU_REGS_RSI] += delta; | 2014 | vcpu->arch.regs[VCPU_REGS_RSI] += delta; |
| 1790 | } | 2015 | } |
| 1791 | 2016 | ||
| 1792 | kvm_x86_ops->decache_regs(vcpu); | 2017 | kvm_x86_ops->decache_regs(vcpu); |
| @@ -1804,13 +2029,13 @@ static void kernel_pio(struct kvm_io_device *pio_dev, | |||
| 1804 | /* TODO: String I/O for in kernel device */ | 2029 | /* TODO: String I/O for in kernel device */ |
| 1805 | 2030 | ||
| 1806 | mutex_lock(&vcpu->kvm->lock); | 2031 | mutex_lock(&vcpu->kvm->lock); |
| 1807 | if (vcpu->pio.in) | 2032 | if (vcpu->arch.pio.in) |
| 1808 | kvm_iodevice_read(pio_dev, vcpu->pio.port, | 2033 | kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, |
| 1809 | vcpu->pio.size, | 2034 | vcpu->arch.pio.size, |
| 1810 | pd); | 2035 | pd); |
| 1811 | else | 2036 | else |
| 1812 | kvm_iodevice_write(pio_dev, vcpu->pio.port, | 2037 | kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, |
| 1813 | vcpu->pio.size, | 2038 | vcpu->arch.pio.size, |
| 1814 | pd); | 2039 | pd); |
| 1815 | mutex_unlock(&vcpu->kvm->lock); | 2040 | mutex_unlock(&vcpu->kvm->lock); |
| 1816 | } | 2041 | } |
| @@ -1818,8 +2043,8 @@ static void kernel_pio(struct kvm_io_device *pio_dev, | |||
| 1818 | static void pio_string_write(struct kvm_io_device *pio_dev, | 2043 | static void pio_string_write(struct kvm_io_device *pio_dev, |
| 1819 | struct kvm_vcpu *vcpu) | 2044 | struct kvm_vcpu *vcpu) |
| 1820 | { | 2045 | { |
| 1821 | struct kvm_pio_request *io = &vcpu->pio; | 2046 | struct kvm_pio_request *io = &vcpu->arch.pio; |
| 1822 | void *pd = vcpu->pio_data; | 2047 | void *pd = vcpu->arch.pio_data; |
| 1823 | int i; | 2048 | int i; |
| 1824 | 2049 | ||
| 1825 | mutex_lock(&vcpu->kvm->lock); | 2050 | mutex_lock(&vcpu->kvm->lock); |
| @@ -1832,32 +2057,38 @@ static void pio_string_write(struct kvm_io_device *pio_dev, | |||
| 1832 | mutex_unlock(&vcpu->kvm->lock); | 2057 | mutex_unlock(&vcpu->kvm->lock); |
| 1833 | } | 2058 | } |
| 1834 | 2059 | ||
| 1835 | int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | 2060 | static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, |
| 2061 | gpa_t addr) | ||
| 2062 | { | ||
| 2063 | return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); | ||
| 2064 | } | ||
| 2065 | |||
| 2066 | int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | ||
| 1836 | int size, unsigned port) | 2067 | int size, unsigned port) |
| 1837 | { | 2068 | { |
| 1838 | struct kvm_io_device *pio_dev; | 2069 | struct kvm_io_device *pio_dev; |
| 1839 | 2070 | ||
| 1840 | vcpu->run->exit_reason = KVM_EXIT_IO; | 2071 | vcpu->run->exit_reason = KVM_EXIT_IO; |
| 1841 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | 2072 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; |
| 1842 | vcpu->run->io.size = vcpu->pio.size = size; | 2073 | vcpu->run->io.size = vcpu->arch.pio.size = size; |
| 1843 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; | 2074 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; |
| 1844 | vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1; | 2075 | vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; |
| 1845 | vcpu->run->io.port = vcpu->pio.port = port; | 2076 | vcpu->run->io.port = vcpu->arch.pio.port = port; |
| 1846 | vcpu->pio.in = in; | 2077 | vcpu->arch.pio.in = in; |
| 1847 | vcpu->pio.string = 0; | 2078 | vcpu->arch.pio.string = 0; |
| 1848 | vcpu->pio.down = 0; | 2079 | vcpu->arch.pio.down = 0; |
| 1849 | vcpu->pio.guest_page_offset = 0; | 2080 | vcpu->arch.pio.guest_page_offset = 0; |
| 1850 | vcpu->pio.rep = 0; | 2081 | vcpu->arch.pio.rep = 0; |
| 1851 | 2082 | ||
| 1852 | kvm_x86_ops->cache_regs(vcpu); | 2083 | kvm_x86_ops->cache_regs(vcpu); |
| 1853 | memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); | 2084 | memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); |
| 1854 | kvm_x86_ops->decache_regs(vcpu); | 2085 | kvm_x86_ops->decache_regs(vcpu); |
| 1855 | 2086 | ||
| 1856 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2087 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
| 1857 | 2088 | ||
| 1858 | pio_dev = vcpu_find_pio_dev(vcpu, port); | 2089 | pio_dev = vcpu_find_pio_dev(vcpu, port); |
| 1859 | if (pio_dev) { | 2090 | if (pio_dev) { |
| 1860 | kernel_pio(pio_dev, vcpu, vcpu->pio_data); | 2091 | kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); |
| 1861 | complete_pio(vcpu); | 2092 | complete_pio(vcpu); |
| 1862 | return 1; | 2093 | return 1; |
| 1863 | } | 2094 | } |
| @@ -1877,15 +2108,15 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
| 1877 | 2108 | ||
| 1878 | vcpu->run->exit_reason = KVM_EXIT_IO; | 2109 | vcpu->run->exit_reason = KVM_EXIT_IO; |
| 1879 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | 2110 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; |
| 1880 | vcpu->run->io.size = vcpu->pio.size = size; | 2111 | vcpu->run->io.size = vcpu->arch.pio.size = size; |
| 1881 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; | 2112 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; |
| 1882 | vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count; | 2113 | vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; |
| 1883 | vcpu->run->io.port = vcpu->pio.port = port; | 2114 | vcpu->run->io.port = vcpu->arch.pio.port = port; |
| 1884 | vcpu->pio.in = in; | 2115 | vcpu->arch.pio.in = in; |
| 1885 | vcpu->pio.string = 1; | 2116 | vcpu->arch.pio.string = 1; |
| 1886 | vcpu->pio.down = down; | 2117 | vcpu->arch.pio.down = down; |
| 1887 | vcpu->pio.guest_page_offset = offset_in_page(address); | 2118 | vcpu->arch.pio.guest_page_offset = offset_in_page(address); |
| 1888 | vcpu->pio.rep = rep; | 2119 | vcpu->arch.pio.rep = rep; |
| 1889 | 2120 | ||
| 1890 | if (!count) { | 2121 | if (!count) { |
| 1891 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2122 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
| @@ -1911,37 +2142,35 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
| 1911 | * String I/O in reverse. Yuck. Kill the guest, fix later. | 2142 | * String I/O in reverse. Yuck. Kill the guest, fix later. |
| 1912 | */ | 2143 | */ |
| 1913 | pr_unimpl(vcpu, "guest string pio down\n"); | 2144 | pr_unimpl(vcpu, "guest string pio down\n"); |
| 1914 | inject_gp(vcpu); | 2145 | kvm_inject_gp(vcpu, 0); |
| 1915 | return 1; | 2146 | return 1; |
| 1916 | } | 2147 | } |
| 1917 | vcpu->run->io.count = now; | 2148 | vcpu->run->io.count = now; |
| 1918 | vcpu->pio.cur_count = now; | 2149 | vcpu->arch.pio.cur_count = now; |
| 1919 | 2150 | ||
| 1920 | if (vcpu->pio.cur_count == vcpu->pio.count) | 2151 | if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) |
| 1921 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2152 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
| 1922 | 2153 | ||
| 1923 | for (i = 0; i < nr_pages; ++i) { | 2154 | for (i = 0; i < nr_pages; ++i) { |
| 1924 | mutex_lock(&vcpu->kvm->lock); | 2155 | down_read(¤t->mm->mmap_sem); |
| 1925 | page = gva_to_page(vcpu, address + i * PAGE_SIZE); | 2156 | page = gva_to_page(vcpu, address + i * PAGE_SIZE); |
| 1926 | if (page) | 2157 | vcpu->arch.pio.guest_pages[i] = page; |
| 1927 | get_page(page); | 2158 | up_read(¤t->mm->mmap_sem); |
| 1928 | vcpu->pio.guest_pages[i] = page; | ||
| 1929 | mutex_unlock(&vcpu->kvm->lock); | ||
| 1930 | if (!page) { | 2159 | if (!page) { |
| 1931 | inject_gp(vcpu); | 2160 | kvm_inject_gp(vcpu, 0); |
| 1932 | free_pio_guest_pages(vcpu); | 2161 | free_pio_guest_pages(vcpu); |
| 1933 | return 1; | 2162 | return 1; |
| 1934 | } | 2163 | } |
| 1935 | } | 2164 | } |
| 1936 | 2165 | ||
| 1937 | pio_dev = vcpu_find_pio_dev(vcpu, port); | 2166 | pio_dev = vcpu_find_pio_dev(vcpu, port); |
| 1938 | if (!vcpu->pio.in) { | 2167 | if (!vcpu->arch.pio.in) { |
| 1939 | /* string PIO write */ | 2168 | /* string PIO write */ |
| 1940 | ret = pio_copy_data(vcpu); | 2169 | ret = pio_copy_data(vcpu); |
| 1941 | if (ret >= 0 && pio_dev) { | 2170 | if (ret >= 0 && pio_dev) { |
| 1942 | pio_string_write(pio_dev, vcpu); | 2171 | pio_string_write(pio_dev, vcpu); |
| 1943 | complete_pio(vcpu); | 2172 | complete_pio(vcpu); |
| 1944 | if (vcpu->pio.count == 0) | 2173 | if (vcpu->arch.pio.count == 0) |
| 1945 | ret = 1; | 2174 | ret = 1; |
| 1946 | } | 2175 | } |
| 1947 | } else if (pio_dev) | 2176 | } else if (pio_dev) |
| @@ -1953,6 +2182,263 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
| 1953 | } | 2182 | } |
| 1954 | EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); | 2183 | EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); |
| 1955 | 2184 | ||
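When no in-kernel device claims the port, the KVM_EXIT_IO state filled in by kvm_emulate_pio() and kvm_emulate_pio_string() above is what userspace sees. A sketch of the matching userspace half; 'run' is the mmap()ed kvm_run page, and handle_port_io() is a hypothetical device-model hook:

/* The PIO payload lives inside the same mapping as kvm_run, at
 * run->io.data_offset, which the kernel set to
 * KVM_PIO_PAGE_OFFSET * PAGE_SIZE above. */
#include <linux/kvm.h>
#include <stdint.h>

extern void handle_port_io(uint16_t port, int is_write,
			   void *data, int size);

void handle_io_exit(struct kvm_run *run)
{
	uint8_t *data = (uint8_t *)run + run->io.data_offset;
	uint32_t i;

	/* run->io.count mirrors vcpu->arch.pio.cur_count: string PIO
	 * can batch several elements into a single exit. */
	for (i = 0; i < run->io.count; i++, data += run->io.size)
		handle_port_io(run->io.port,
			       run->io.direction == KVM_EXIT_IO_OUT,
			       data, run->io.size);
}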
| 2185 | int kvm_arch_init(void *opaque) | ||
| 2186 | { | ||
| 2187 | int r; | ||
| 2188 | struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; | ||
| 2189 | |||
| 2190 | if (kvm_x86_ops) { | ||
| 2191 | printk(KERN_ERR "kvm: already loaded the other module\n"); | ||
| 2192 | r = -EEXIST; | ||
| 2193 | goto out; | ||
| 2194 | } | ||
| 2195 | |||
| 2196 | if (!ops->cpu_has_kvm_support()) { | ||
| 2197 | printk(KERN_ERR "kvm: no hardware support\n"); | ||
| 2198 | r = -EOPNOTSUPP; | ||
| 2199 | goto out; | ||
| 2200 | } | ||
| 2201 | if (ops->disabled_by_bios()) { | ||
| 2202 | printk(KERN_ERR "kvm: disabled by bios\n"); | ||
| 2203 | r = -EOPNOTSUPP; | ||
| 2204 | goto out; | ||
| 2205 | } | ||
| 2206 | |||
| 2207 | r = kvm_mmu_module_init(); | ||
| 2208 | if (r) | ||
| 2209 | goto out; | ||
| 2210 | |||
| 2211 | kvm_init_msr_list(); | ||
| 2212 | |||
| 2213 | kvm_x86_ops = ops; | ||
| 2214 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); | ||
| 2215 | return 0; | ||
| 2216 | |||
| 2217 | out: | ||
| 2218 | return r; | ||
| 2219 | } | ||
| 2220 | |||
| 2221 | void kvm_arch_exit(void) | ||
| 2222 | { | ||
| 2223 | kvm_x86_ops = NULL; | ||
| 2224 | kvm_mmu_module_exit(); | ||
| 2225 | } | ||
| 2226 | |||
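kvm_arch_init() receives the vendor ops as an opaque cookie because it is invoked from the generic kvm_init() entry point, which is also why it refuses a second registration. A sketch of the vendor-module pairing, modeled loosely on the era's vmx.c; vmx_x86_ops and struct vcpu_vmx belong to the vendor module and are assumed here:

/* Hypothetical vendor module init/exit: kvm_init() forwards the
 * ops cookie to kvm_arch_init() above, and kvm_exit() unwinds
 * through kvm_arch_exit(). vcpu_size sizes the per-vcpu
 * allocation. */
static int __init vmx_init(void)
{
	return kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
			THIS_MODULE);
}

static void __exit vmx_exit(void)
{
	kvm_exit();
}

module_init(vmx_init);
module_exit(vmx_exit);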
| 2227 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) | ||
| 2228 | { | ||
| 2229 | ++vcpu->stat.halt_exits; | ||
| 2230 | if (irqchip_in_kernel(vcpu->kvm)) { | ||
| 2231 | vcpu->arch.mp_state = VCPU_MP_STATE_HALTED; | ||
| 2232 | kvm_vcpu_block(vcpu); | ||
| 2233 | if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE) | ||
| 2234 | return -EINTR; | ||
| 2235 | return 1; | ||
| 2236 | } else { | ||
| 2237 | vcpu->run->exit_reason = KVM_EXIT_HLT; | ||
| 2238 | return 0; | ||
| 2239 | } | ||
| 2240 | } | ||
| 2241 | EXPORT_SYMBOL_GPL(kvm_emulate_halt); | ||
| 2242 | |||
| 2243 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | ||
| 2244 | { | ||
| 2245 | unsigned long nr, a0, a1, a2, a3, ret; | ||
| 2246 | |||
| 2247 | kvm_x86_ops->cache_regs(vcpu); | ||
| 2248 | |||
| 2249 | nr = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
| 2250 | a0 = vcpu->arch.regs[VCPU_REGS_RBX]; | ||
| 2251 | a1 = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
| 2252 | a2 = vcpu->arch.regs[VCPU_REGS_RDX]; | ||
| 2253 | a3 = vcpu->arch.regs[VCPU_REGS_RSI]; | ||
| 2254 | |||
| 2255 | if (!is_long_mode(vcpu)) { | ||
| 2256 | nr &= 0xFFFFFFFF; | ||
| 2257 | a0 &= 0xFFFFFFFF; | ||
| 2258 | a1 &= 0xFFFFFFFF; | ||
| 2259 | a2 &= 0xFFFFFFFF; | ||
| 2260 | a3 &= 0xFFFFFFFF; | ||
| 2261 | } | ||
| 2262 | |||
| 2263 | switch (nr) { | ||
| 2264 | case KVM_HC_VAPIC_POLL_IRQ: | ||
| 2265 | ret = 0; | ||
| 2266 | break; | ||
| 2267 | default: | ||
| 2268 | ret = -KVM_ENOSYS; | ||
| 2269 | break; | ||
| 2270 | } | ||
| 2271 | vcpu->arch.regs[VCPU_REGS_RAX] = ret; | ||
| 2272 | kvm_x86_ops->decache_regs(vcpu); | ||
| 2273 | return 0; | ||
| 2274 | } | ||
| 2275 | EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); | ||
| 2276 | |||
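The register convention decoded above is the guest-visible hypercall ABI: call number in RAX, arguments in RBX, RCX, RDX and RSI, result returned in RAX, all truncated to 32 bits outside long mode. A minimal guest-side sketch, assuming an Intel host where the patched instruction is vmcall (on SVM it is vmmcall, which is why guests normally execute the patched sequence instead of hardcoding either):

/* Issue a no-argument KVM hypercall from the guest; e.g.
 * kvm_hypercall0(KVM_HC_VAPIC_POLL_IRQ) lands in the switch above
 * and returns 0. */
static inline long kvm_hypercall0(unsigned int nr)
{
	long ret;

	asm volatile("vmcall"
		     : "=a"(ret)
		     : "a"(nr)
		     : "memory");
	return ret;
}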
| 2277 | int kvm_fix_hypercall(struct kvm_vcpu *vcpu) | ||
| 2278 | { | ||
| 2279 | char instruction[3]; | ||
| 2280 | int ret = 0; | ||
| 2281 | |||
| 2282 | |||
| 2283 | /* | ||
| 2284 | * Blow out the MMU to ensure that no other VCPU has an active mapping, | ||
| 2285 | * so that the updated hypercall appears atomically across all | ||
| 2286 | * VCPUs. | ||
| 2287 | */ | ||
| 2288 | kvm_mmu_zap_all(vcpu->kvm); | ||
| 2289 | |||
| 2290 | kvm_x86_ops->cache_regs(vcpu); | ||
| 2291 | kvm_x86_ops->patch_hypercall(vcpu, instruction); | ||
| 2292 | if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) | ||
| 2293 | != X86EMUL_CONTINUE) | ||
| 2294 | ret = -EFAULT; | ||
| 2295 | |||
| 2296 | return ret; | ||
| 2297 | } | ||
| 2298 | |||
| 2299 | static u64 mk_cr_64(u64 curr_cr, u32 new_val) | ||
| 2300 | { | ||
| 2301 | return (curr_cr & ~((1ULL << 32) - 1)) | new_val; | ||
| 2302 | } | ||
| 2303 | |||
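mk_cr_64() splices a 32-bit value written by the emulator into the low half of a 64-bit control register while keeping the upper half intact. A standalone worked check of the masking:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t curr = 0xffffffff00000050ULL;	/* upper half must survive */
	uint32_t new_val = 0x80050033;		/* 32-bit mov to CRn */

	assert(((curr & ~((1ULL << 32) - 1)) | new_val)
	       == 0xffffffff80050033ULL);
	return 0;
}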
| 2304 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
| 2305 | { | ||
| 2306 | struct descriptor_table dt = { limit, base }; | ||
| 2307 | |||
| 2308 | kvm_x86_ops->set_gdt(vcpu, &dt); | ||
| 2309 | } | ||
| 2310 | |||
| 2311 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
| 2312 | { | ||
| 2313 | struct descriptor_table dt = { limit, base }; | ||
| 2314 | |||
| 2315 | kvm_x86_ops->set_idt(vcpu, &dt); | ||
| 2316 | } | ||
| 2317 | |||
| 2318 | void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, | ||
| 2319 | unsigned long *rflags) | ||
| 2320 | { | ||
| 2321 | lmsw(vcpu, msw); | ||
| 2322 | *rflags = kvm_x86_ops->get_rflags(vcpu); | ||
| 2323 | } | ||
| 2324 | |||
| 2325 | unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) | ||
| 2326 | { | ||
| 2327 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | ||
| 2328 | switch (cr) { | ||
| 2329 | case 0: | ||
| 2330 | return vcpu->arch.cr0; | ||
| 2331 | case 2: | ||
| 2332 | return vcpu->arch.cr2; | ||
| 2333 | case 3: | ||
| 2334 | return vcpu->arch.cr3; | ||
| 2335 | case 4: | ||
| 2336 | return vcpu->arch.cr4; | ||
| 2337 | case 8: | ||
| 2338 | return get_cr8(vcpu); | ||
| 2339 | default: | ||
| 2340 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
| 2341 | return 0; | ||
| 2342 | } | ||
| 2343 | } | ||
| 2344 | |||
| 2345 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, | ||
| 2346 | unsigned long *rflags) | ||
| 2347 | { | ||
| 2348 | switch (cr) { | ||
| 2349 | case 0: | ||
| 2350 | set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); | ||
| 2351 | *rflags = kvm_x86_ops->get_rflags(vcpu); | ||
| 2352 | break; | ||
| 2353 | case 2: | ||
| 2354 | vcpu->arch.cr2 = val; | ||
| 2355 | break; | ||
| 2356 | case 3: | ||
| 2357 | set_cr3(vcpu, val); | ||
| 2358 | break; | ||
| 2359 | case 4: | ||
| 2360 | set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); | ||
| 2361 | break; | ||
| 2362 | case 8: | ||
| 2363 | set_cr8(vcpu, val & 0xfUL); | ||
| 2364 | break; | ||
| 2365 | default: | ||
| 2366 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
| 2367 | } | ||
| 2368 | } | ||
| 2369 | |||
| 2370 | static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) | ||
| 2371 | { | ||
| 2372 | struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; | ||
| 2373 | int j, nent = vcpu->arch.cpuid_nent; | ||
| 2374 | |||
| 2375 | e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
| 2376 | /* when no next entry is found, the current entry[i] is reselected */ | ||
| 2377 | for (j = i + 1; ; j = (j + 1) % nent) { | ||
| 2378 | struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; | ||
| 2379 | if (ej->function == e->function) { | ||
| 2380 | ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
| 2381 | return j; | ||
| 2382 | } | ||
| 2383 | } | ||
| 2384 | return 0; /* silence gcc, even though control never reaches here */ | ||
| 2385 | } | ||
| 2386 | |||
| 2387 | /* find an entry with matching function, matching index (if needed), and that | ||
| 2388 | * should be read next (if it's stateful) */ | ||
| 2389 | static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, | ||
| 2390 | u32 function, u32 index) | ||
| 2391 | { | ||
| 2392 | if (e->function != function) | ||
| 2393 | return 0; | ||
| 2394 | if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) | ||
| 2395 | return 0; | ||
| 2396 | if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && | ||
| 2397 | !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) | ||
| 2398 | return 0; | ||
| 2399 | return 1; | ||
| 2400 | } | ||
| 2401 | |||
| 2402 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | ||
| 2403 | { | ||
| 2404 | int i; | ||
| 2405 | u32 function, index; | ||
| 2406 | struct kvm_cpuid_entry2 *e, *best; | ||
| 2407 | |||
| 2408 | kvm_x86_ops->cache_regs(vcpu); | ||
| 2409 | function = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
| 2410 | index = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
| 2411 | vcpu->arch.regs[VCPU_REGS_RAX] = 0; | ||
| 2412 | vcpu->arch.regs[VCPU_REGS_RBX] = 0; | ||
| 2413 | vcpu->arch.regs[VCPU_REGS_RCX] = 0; | ||
| 2414 | vcpu->arch.regs[VCPU_REGS_RDX] = 0; | ||
| 2415 | best = NULL; | ||
| 2416 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { | ||
| 2417 | e = &vcpu->arch.cpuid_entries[i]; | ||
| 2418 | if (is_matching_cpuid_entry(e, function, index)) { | ||
| 2419 | if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) | ||
| 2420 | move_to_next_stateful_cpuid_entry(vcpu, i); | ||
| 2421 | best = e; | ||
| 2422 | break; | ||
| 2423 | } | ||
| 2424 | /* | ||
| 2425 | * Both basic or both extended? | ||
| 2426 | */ | ||
| 2427 | if (((e->function ^ function) & 0x80000000) == 0) | ||
| 2428 | if (!best || e->function > best->function) | ||
| 2429 | best = e; | ||
| 2430 | } | ||
| 2431 | if (best) { | ||
| 2432 | vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; | ||
| 2433 | vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; | ||
| 2434 | vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; | ||
| 2435 | vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; | ||
| 2436 | } | ||
| 2437 | kvm_x86_ops->decache_regs(vcpu); | ||
| 2438 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
| 2439 | } | ||
| 2440 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | ||
| 2441 | |||
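kvm_emulate_cpuid() only ever returns entries that userspace installed beforehand; the kernel synthesizes nothing. A sketch of seeding vcpu->arch.cpuid_entries through KVM_SET_CPUID2; a real VMM would start from KVM_GET_SUPPORTED_CPUID and trim, rather than invent a leaf as done here:

/* Install a single CPUID leaf (function 0: highest standard leaf
 * plus vendor string) for the vcpu. Assumes vcpu_fd from
 * KVM_CREATE_VCPU. */
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int set_one_cpuid_leaf(int vcpu_fd)
{
	struct {
		struct kvm_cpuid2 header;
		struct kvm_cpuid_entry2 entry;
	} cpuid;

	memset(&cpuid, 0, sizeof(cpuid));
	cpuid.header.nent = 1;
	cpuid.entry.function = 0;
	cpuid.entry.eax = 1;		/* highest standard leaf */
	cpuid.entry.ebx = 0x756e6547;	/* "Genu" */
	cpuid.entry.edx = 0x49656e69;	/* "ineI" */
	cpuid.entry.ecx = 0x6c65746e;	/* "ntel" */

	return ioctl(vcpu_fd, KVM_SET_CPUID2, &cpuid);
}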
| 1956 | /* | 2442 | /* |
| 1957 | * Check if userspace requested an interrupt window, and that the | 2443 | * Check if userspace requested an interrupt window, and that the |
| 1958 | * interrupt window is open. | 2444 | * interrupt window is open. |
| @@ -1962,9 +2448,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); | |||
| 1962 | static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, | 2448 | static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, |
| 1963 | struct kvm_run *kvm_run) | 2449 | struct kvm_run *kvm_run) |
| 1964 | { | 2450 | { |
| 1965 | return (!vcpu->irq_summary && | 2451 | return (!vcpu->arch.irq_summary && |
| 1966 | kvm_run->request_interrupt_window && | 2452 | kvm_run->request_interrupt_window && |
| 1967 | vcpu->interrupt_window_open && | 2453 | vcpu->arch.interrupt_window_open && |
| 1968 | (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); | 2454 | (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); |
| 1969 | } | 2455 | } |
| 1970 | 2456 | ||
| @@ -1978,22 +2464,51 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, | |||
| 1978 | kvm_run->ready_for_interrupt_injection = 1; | 2464 | kvm_run->ready_for_interrupt_injection = 1; |
| 1979 | else | 2465 | else |
| 1980 | kvm_run->ready_for_interrupt_injection = | 2466 | kvm_run->ready_for_interrupt_injection = |
| 1981 | (vcpu->interrupt_window_open && | 2467 | (vcpu->arch.interrupt_window_open && |
| 1982 | vcpu->irq_summary == 0); | 2468 | vcpu->arch.irq_summary == 0); |
| 2469 | } | ||
| 2470 | |||
| 2471 | static void vapic_enter(struct kvm_vcpu *vcpu) | ||
| 2472 | { | ||
| 2473 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
| 2474 | struct page *page; | ||
| 2475 | |||
| 2476 | if (!apic || !apic->vapic_addr) | ||
| 2477 | return; | ||
| 2478 | |||
| 2479 | down_read(¤t->mm->mmap_sem); | ||
| 2480 | page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); | ||
| 2481 | vcpu->arch.apic->vapic_page = page; | ||
| 2482 | up_read(¤t->mm->mmap_sem); | ||
| 2483 | } | ||
| 2484 | |||
| 2485 | static void vapic_exit(struct kvm_vcpu *vcpu) | ||
| 2486 | { | ||
| 2487 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
| 2488 | |||
| 2489 | if (!apic || !apic->vapic_addr) | ||
| 2490 | return; | ||
| 2491 | |||
| 2492 | kvm_release_page_dirty(apic->vapic_page); | ||
| 2493 | mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); | ||
| 1983 | } | 2494 | } |
| 1984 | 2495 | ||
| 1985 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2496 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| 1986 | { | 2497 | { |
| 1987 | int r; | 2498 | int r; |
| 1988 | 2499 | ||
| 1989 | if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { | 2500 | if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { |
| 1990 | printk("vcpu %d received sipi with vector # %x\n", | 2501 | pr_debug("vcpu %d received sipi with vector # %x\n", |
| 1991 | vcpu->vcpu_id, vcpu->sipi_vector); | 2502 | vcpu->vcpu_id, vcpu->arch.sipi_vector); |
| 1992 | kvm_lapic_reset(vcpu); | 2503 | kvm_lapic_reset(vcpu); |
| 1993 | kvm_x86_ops->vcpu_reset(vcpu); | 2504 | r = kvm_x86_ops->vcpu_reset(vcpu); |
| 1994 | vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; | 2505 | if (r) |
| 2506 | return r; | ||
| 2507 | vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; | ||
| 1995 | } | 2508 | } |
| 1996 | 2509 | ||
| 2510 | vapic_enter(vcpu); | ||
| 2511 | |||
| 1997 | preempted: | 2512 | preempted: |
| 1998 | if (vcpu->guest_debug.enabled) | 2513 | if (vcpu->guest_debug.enabled) |
| 1999 | kvm_x86_ops->guest_debug_pre(vcpu); | 2514 | kvm_x86_ops->guest_debug_pre(vcpu); |
| @@ -2003,6 +2518,19 @@ again: | |||
| 2003 | if (unlikely(r)) | 2518 | if (unlikely(r)) |
| 2004 | goto out; | 2519 | goto out; |
| 2005 | 2520 | ||
| 2521 | if (vcpu->requests) { | ||
| 2522 | if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) | ||
| 2523 | __kvm_migrate_apic_timer(vcpu); | ||
| 2524 | if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, | ||
| 2525 | &vcpu->requests)) { | ||
| 2526 | kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; | ||
| 2527 | r = 0; | ||
| 2528 | goto out; | ||
| 2529 | } | ||
| 2530 | } | ||
| 2531 | |||
| 2532 | kvm_inject_pending_timer_irqs(vcpu); | ||
| 2533 | |||
| 2006 | preempt_disable(); | 2534 | preempt_disable(); |
| 2007 | 2535 | ||
| 2008 | kvm_x86_ops->prepare_guest_switch(vcpu); | 2536 | kvm_x86_ops->prepare_guest_switch(vcpu); |
| @@ -2010,6 +2538,13 @@ again: | |||
| 2010 | 2538 | ||
| 2011 | local_irq_disable(); | 2539 | local_irq_disable(); |
| 2012 | 2540 | ||
| 2541 | if (need_resched()) { | ||
| 2542 | local_irq_enable(); | ||
| 2543 | preempt_enable(); | ||
| 2544 | r = 1; | ||
| 2545 | goto out; | ||
| 2546 | } | ||
| 2547 | |||
| 2013 | if (signal_pending(current)) { | 2548 | if (signal_pending(current)) { |
| 2014 | local_irq_enable(); | 2549 | local_irq_enable(); |
| 2015 | preempt_enable(); | 2550 | preempt_enable(); |
| @@ -2019,16 +2554,20 @@ again: | |||
| 2019 | goto out; | 2554 | goto out; |
| 2020 | } | 2555 | } |
| 2021 | 2556 | ||
| 2022 | if (irqchip_in_kernel(vcpu->kvm)) | 2557 | if (vcpu->arch.exception.pending) |
| 2558 | __queue_exception(vcpu); | ||
| 2559 | else if (irqchip_in_kernel(vcpu->kvm)) | ||
| 2023 | kvm_x86_ops->inject_pending_irq(vcpu); | 2560 | kvm_x86_ops->inject_pending_irq(vcpu); |
| 2024 | else if (!vcpu->mmio_read_completed) | 2561 | else |
| 2025 | kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); | 2562 | kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); |
| 2026 | 2563 | ||
| 2564 | kvm_lapic_sync_to_vapic(vcpu); | ||
| 2565 | |||
| 2027 | vcpu->guest_mode = 1; | 2566 | vcpu->guest_mode = 1; |
| 2028 | kvm_guest_enter(); | 2567 | kvm_guest_enter(); |
| 2029 | 2568 | ||
| 2030 | if (vcpu->requests) | 2569 | if (vcpu->requests) |
| 2031 | if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) | 2570 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) |
| 2032 | kvm_x86_ops->tlb_flush(vcpu); | 2571 | kvm_x86_ops->tlb_flush(vcpu); |
| 2033 | 2572 | ||
| 2034 | kvm_x86_ops->run(vcpu, kvm_run); | 2573 | kvm_x86_ops->run(vcpu, kvm_run); |
| @@ -2055,9 +2594,14 @@ again: | |||
| 2055 | */ | 2594 | */ |
| 2056 | if (unlikely(prof_on == KVM_PROFILING)) { | 2595 | if (unlikely(prof_on == KVM_PROFILING)) { |
| 2057 | kvm_x86_ops->cache_regs(vcpu); | 2596 | kvm_x86_ops->cache_regs(vcpu); |
| 2058 | profile_hit(KVM_PROFILING, (void *)vcpu->rip); | 2597 | profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); |
| 2059 | } | 2598 | } |
| 2060 | 2599 | ||
| 2600 | if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) | ||
| 2601 | vcpu->arch.exception.pending = false; | ||
| 2602 | |||
| 2603 | kvm_lapic_sync_from_vapic(vcpu); | ||
| 2604 | |||
| 2061 | r = kvm_x86_ops->handle_exit(kvm_run, vcpu); | 2605 | r = kvm_x86_ops->handle_exit(kvm_run, vcpu); |
| 2062 | 2606 | ||
| 2063 | if (r > 0) { | 2607 | if (r > 0) { |
| @@ -2067,10 +2611,8 @@ again: | |||
| 2067 | ++vcpu->stat.request_irq_exits; | 2611 | ++vcpu->stat.request_irq_exits; |
| 2068 | goto out; | 2612 | goto out; |
| 2069 | } | 2613 | } |
| 2070 | if (!need_resched()) { | 2614 | if (!need_resched()) |
| 2071 | ++vcpu->stat.light_exits; | ||
| 2072 | goto again; | 2615 | goto again; |
| 2073 | } | ||
| 2074 | } | 2616 | } |
| 2075 | 2617 | ||
| 2076 | out: | 2618 | out: |
| @@ -2081,18 +2623,19 @@ out: | |||
| 2081 | 2623 | ||
| 2082 | post_kvm_run_save(vcpu, kvm_run); | 2624 | post_kvm_run_save(vcpu, kvm_run); |
| 2083 | 2625 | ||
| 2626 | vapic_exit(vcpu); | ||
| 2627 | |||
| 2084 | return r; | 2628 | return r; |
| 2085 | } | 2629 | } |
| 2086 | 2630 | ||
| 2087 | 2631 | int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |
| 2088 | static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
| 2089 | { | 2632 | { |
| 2090 | int r; | 2633 | int r; |
| 2091 | sigset_t sigsaved; | 2634 | sigset_t sigsaved; |
| 2092 | 2635 | ||
| 2093 | vcpu_load(vcpu); | 2636 | vcpu_load(vcpu); |
| 2094 | 2637 | ||
| 2095 | if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) { | 2638 | if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) { |
| 2096 | kvm_vcpu_block(vcpu); | 2639 | kvm_vcpu_block(vcpu); |
| 2097 | vcpu_put(vcpu); | 2640 | vcpu_put(vcpu); |
| 2098 | return -EAGAIN; | 2641 | return -EAGAIN; |
| @@ -2105,18 +2648,19 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2105 | if (!irqchip_in_kernel(vcpu->kvm)) | 2648 | if (!irqchip_in_kernel(vcpu->kvm)) |
| 2106 | set_cr8(vcpu, kvm_run->cr8); | 2649 | set_cr8(vcpu, kvm_run->cr8); |
| 2107 | 2650 | ||
| 2108 | if (vcpu->pio.cur_count) { | 2651 | if (vcpu->arch.pio.cur_count) { |
| 2109 | r = complete_pio(vcpu); | 2652 | r = complete_pio(vcpu); |
| 2110 | if (r) | 2653 | if (r) |
| 2111 | goto out; | 2654 | goto out; |
| 2112 | } | 2655 | } |
| 2113 | 2656 | #ifdef CONFIG_HAS_IOMEM | |
| 2114 | if (vcpu->mmio_needed) { | 2657 | if (vcpu->mmio_needed) { |
| 2115 | memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); | 2658 | memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); |
| 2116 | vcpu->mmio_read_completed = 1; | 2659 | vcpu->mmio_read_completed = 1; |
| 2117 | vcpu->mmio_needed = 0; | 2660 | vcpu->mmio_needed = 0; |
| 2118 | r = emulate_instruction(vcpu, kvm_run, | 2661 | r = emulate_instruction(vcpu, kvm_run, |
| 2119 | vcpu->mmio_fault_cr2, 0); | 2662 | vcpu->arch.mmio_fault_cr2, 0, |
| 2663 | EMULTYPE_NO_DECODE); | ||
| 2120 | if (r == EMULATE_DO_MMIO) { | 2664 | if (r == EMULATE_DO_MMIO) { |
| 2121 | /* | 2665 | /* |
| 2122 | * Read-modify-write. Back to userspace. | 2666 | * Read-modify-write. Back to userspace. |
| @@ -2125,10 +2669,10 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2125 | goto out; | 2669 | goto out; |
| 2126 | } | 2670 | } |
| 2127 | } | 2671 | } |
| 2128 | 2672 | #endif | |
| 2129 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { | 2673 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { |
| 2130 | kvm_x86_ops->cache_regs(vcpu); | 2674 | kvm_x86_ops->cache_regs(vcpu); |
| 2131 | vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; | 2675 | vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; |
| 2132 | kvm_x86_ops->decache_regs(vcpu); | 2676 | kvm_x86_ops->decache_regs(vcpu); |
| 2133 | } | 2677 | } |
| 2134 | 2678 | ||
| @@ -2142,33 +2686,32 @@ out: | |||
| 2142 | return r; | 2686 | return r; |
| 2143 | } | 2687 | } |
| 2144 | 2688 | ||
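kvm_arch_vcpu_ioctl_run() is the backend of the KVM_RUN ioctl: it spins in __vcpu_run() until the guest needs userspace, then reports why through the mmap()ed kvm_run page. A sketch of the driving loop on the userspace side, assuming vcpu_fd and the mapped 'run' page are already set up:

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

void handle_io_exit(struct kvm_run *run);	/* PIO sketch above */

static void run_loop(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
			perror("KVM_RUN");	/* -EINTR, -EAGAIN, ... */
			return;
		}
		switch (run->exit_reason) {
		case KVM_EXIT_HLT:
			return;			/* guest halted */
		case KVM_EXIT_IO:
			handle_io_exit(run);
			break;
		default:
			fprintf(stderr, "unhandled exit %u\n",
				run->exit_reason);
			return;
		}
	}
}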
| 2145 | static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, | 2689 | int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) |
| 2146 | struct kvm_regs *regs) | ||
| 2147 | { | 2690 | { |
| 2148 | vcpu_load(vcpu); | 2691 | vcpu_load(vcpu); |
| 2149 | 2692 | ||
| 2150 | kvm_x86_ops->cache_regs(vcpu); | 2693 | kvm_x86_ops->cache_regs(vcpu); |
| 2151 | 2694 | ||
| 2152 | regs->rax = vcpu->regs[VCPU_REGS_RAX]; | 2695 | regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; |
| 2153 | regs->rbx = vcpu->regs[VCPU_REGS_RBX]; | 2696 | regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; |
| 2154 | regs->rcx = vcpu->regs[VCPU_REGS_RCX]; | 2697 | regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; |
| 2155 | regs->rdx = vcpu->regs[VCPU_REGS_RDX]; | 2698 | regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; |
| 2156 | regs->rsi = vcpu->regs[VCPU_REGS_RSI]; | 2699 | regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; |
| 2157 | regs->rdi = vcpu->regs[VCPU_REGS_RDI]; | 2700 | regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; |
| 2158 | regs->rsp = vcpu->regs[VCPU_REGS_RSP]; | 2701 | regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; |
| 2159 | regs->rbp = vcpu->regs[VCPU_REGS_RBP]; | 2702 | regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; |
| 2160 | #ifdef CONFIG_X86_64 | 2703 | #ifdef CONFIG_X86_64 |
| 2161 | regs->r8 = vcpu->regs[VCPU_REGS_R8]; | 2704 | regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; |
| 2162 | regs->r9 = vcpu->regs[VCPU_REGS_R9]; | 2705 | regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; |
| 2163 | regs->r10 = vcpu->regs[VCPU_REGS_R10]; | 2706 | regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; |
| 2164 | regs->r11 = vcpu->regs[VCPU_REGS_R11]; | 2707 | regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; |
| 2165 | regs->r12 = vcpu->regs[VCPU_REGS_R12]; | 2708 | regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; |
| 2166 | regs->r13 = vcpu->regs[VCPU_REGS_R13]; | 2709 | regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; |
| 2167 | regs->r14 = vcpu->regs[VCPU_REGS_R14]; | 2710 | regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; |
| 2168 | regs->r15 = vcpu->regs[VCPU_REGS_R15]; | 2711 | regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; |
| 2169 | #endif | 2712 | #endif |
| 2170 | 2713 | ||
| 2171 | regs->rip = vcpu->rip; | 2714 | regs->rip = vcpu->arch.rip; |
| 2172 | regs->rflags = kvm_x86_ops->get_rflags(vcpu); | 2715 | regs->rflags = kvm_x86_ops->get_rflags(vcpu); |
| 2173 | 2716 | ||
| 2174 | /* | 2717 | /* |
| @@ -2182,31 +2725,30 @@ static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, | |||
| 2182 | return 0; | 2725 | return 0; |
| 2183 | } | 2726 | } |
| 2184 | 2727 | ||
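The register block copied out above is exposed to userspace as struct kvm_regs via the KVM_GET_REGS ioctl. A small sketch, assuming a vcpu fd:

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Dump a few of the GPRs that kvm_arch_vcpu_ioctl_get_regs()
 * fills in. */
static void dump_regs(int vcpu_fd)
{
	struct kvm_regs regs;

	if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0) {
		perror("KVM_GET_REGS");
		return;
	}
	printf("rip=0x%llx rsp=0x%llx rflags=0x%llx\n",
	       regs.rip, regs.rsp, regs.rflags);
}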
| 2185 | static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, | 2728 | int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) |
| 2186 | struct kvm_regs *regs) | ||
| 2187 | { | 2729 | { |
| 2188 | vcpu_load(vcpu); | 2730 | vcpu_load(vcpu); |
| 2189 | 2731 | ||
| 2190 | vcpu->regs[VCPU_REGS_RAX] = regs->rax; | 2732 | vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; |
| 2191 | vcpu->regs[VCPU_REGS_RBX] = regs->rbx; | 2733 | vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; |
| 2192 | vcpu->regs[VCPU_REGS_RCX] = regs->rcx; | 2734 | vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; |
| 2193 | vcpu->regs[VCPU_REGS_RDX] = regs->rdx; | 2735 | vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; |
| 2194 | vcpu->regs[VCPU_REGS_RSI] = regs->rsi; | 2736 | vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; |
| 2195 | vcpu->regs[VCPU_REGS_RDI] = regs->rdi; | 2737 | vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; |
| 2196 | vcpu->regs[VCPU_REGS_RSP] = regs->rsp; | 2738 | vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; |
| 2197 | vcpu->regs[VCPU_REGS_RBP] = regs->rbp; | 2739 | vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; |
| 2198 | #ifdef CONFIG_X86_64 | 2740 | #ifdef CONFIG_X86_64 |
| 2199 | vcpu->regs[VCPU_REGS_R8] = regs->r8; | 2741 | vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; |
| 2200 | vcpu->regs[VCPU_REGS_R9] = regs->r9; | 2742 | vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; |
| 2201 | vcpu->regs[VCPU_REGS_R10] = regs->r10; | 2743 | vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; |
| 2202 | vcpu->regs[VCPU_REGS_R11] = regs->r11; | 2744 | vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; |
| 2203 | vcpu->regs[VCPU_REGS_R12] = regs->r12; | 2745 | vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; |
| 2204 | vcpu->regs[VCPU_REGS_R13] = regs->r13; | 2746 | vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; |
| 2205 | vcpu->regs[VCPU_REGS_R14] = regs->r14; | 2747 | vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; |
| 2206 | vcpu->regs[VCPU_REGS_R15] = regs->r15; | 2748 | vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; |
| 2207 | #endif | 2749 | #endif |
| 2208 | 2750 | ||
| 2209 | vcpu->rip = regs->rip; | 2751 | vcpu->arch.rip = regs->rip; |
| 2210 | kvm_x86_ops->set_rflags(vcpu, regs->rflags); | 2752 | kvm_x86_ops->set_rflags(vcpu, regs->rflags); |
| 2211 | 2753 | ||
| 2212 | kvm_x86_ops->decache_regs(vcpu); | 2754 | kvm_x86_ops->decache_regs(vcpu); |
| @@ -2222,8 +2764,18 @@ static void get_segment(struct kvm_vcpu *vcpu, | |||
| 2222 | return kvm_x86_ops->get_segment(vcpu, var, seg); | 2764 | return kvm_x86_ops->get_segment(vcpu, var, seg); |
| 2223 | } | 2765 | } |
| 2224 | 2766 | ||
| 2225 | static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | 2767 | void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) |
| 2226 | struct kvm_sregs *sregs) | 2768 | { |
| 2769 | struct kvm_segment cs; | ||
| 2770 | |||
| 2771 | get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
| 2772 | *db = cs.db; | ||
| 2773 | *l = cs.l; | ||
| 2774 | } | ||
| 2775 | EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); | ||
| 2776 | |||
| 2777 | int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | ||
| 2778 | struct kvm_sregs *sregs) | ||
| 2227 | { | 2779 | { |
| 2228 | struct descriptor_table dt; | 2780 | struct descriptor_table dt; |
| 2229 | int pending_vec; | 2781 | int pending_vec; |
| @@ -2248,12 +2800,12 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
| 2248 | sregs->gdt.base = dt.base; | 2800 | sregs->gdt.base = dt.base; |
| 2249 | 2801 | ||
| 2250 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | 2802 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); |
| 2251 | sregs->cr0 = vcpu->cr0; | 2803 | sregs->cr0 = vcpu->arch.cr0; |
| 2252 | sregs->cr2 = vcpu->cr2; | 2804 | sregs->cr2 = vcpu->arch.cr2; |
| 2253 | sregs->cr3 = vcpu->cr3; | 2805 | sregs->cr3 = vcpu->arch.cr3; |
| 2254 | sregs->cr4 = vcpu->cr4; | 2806 | sregs->cr4 = vcpu->arch.cr4; |
| 2255 | sregs->cr8 = get_cr8(vcpu); | 2807 | sregs->cr8 = get_cr8(vcpu); |
| 2256 | sregs->efer = vcpu->shadow_efer; | 2808 | sregs->efer = vcpu->arch.shadow_efer; |
| 2257 | sregs->apic_base = kvm_get_apic_base(vcpu); | 2809 | sregs->apic_base = kvm_get_apic_base(vcpu); |
| 2258 | 2810 | ||
| 2259 | if (irqchip_in_kernel(vcpu->kvm)) { | 2811 | if (irqchip_in_kernel(vcpu->kvm)) { |
| @@ -2261,9 +2813,10 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
| 2261 | sizeof sregs->interrupt_bitmap); | 2813 | sizeof sregs->interrupt_bitmap); |
| 2262 | pending_vec = kvm_x86_ops->get_irq(vcpu); | 2814 | pending_vec = kvm_x86_ops->get_irq(vcpu); |
| 2263 | if (pending_vec >= 0) | 2815 | if (pending_vec >= 0) |
| 2264 | set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap); | 2816 | set_bit(pending_vec, |
| 2817 | (unsigned long *)sregs->interrupt_bitmap); | ||
| 2265 | } else | 2818 | } else |
| 2266 | memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, | 2819 | memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, |
| 2267 | sizeof sregs->interrupt_bitmap); | 2820 | sizeof sregs->interrupt_bitmap); |
| 2268 | 2821 | ||
| 2269 | vcpu_put(vcpu); | 2822 | vcpu_put(vcpu); |
| @@ -2277,8 +2830,8 @@ static void set_segment(struct kvm_vcpu *vcpu, | |||
| 2277 | return kvm_x86_ops->set_segment(vcpu, var, seg); | 2830 | return kvm_x86_ops->set_segment(vcpu, var, seg); |
| 2278 | } | 2831 | } |
| 2279 | 2832 | ||
| 2280 | static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | 2833 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, |
| 2281 | struct kvm_sregs *sregs) | 2834 | struct kvm_sregs *sregs) |
| 2282 | { | 2835 | { |
| 2283 | int mmu_reset_needed = 0; | 2836 | int mmu_reset_needed = 0; |
| 2284 | int i, pending_vec, max_bits; | 2837 | int i, pending_vec, max_bits; |
| @@ -2293,13 +2846,13 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
| 2293 | dt.base = sregs->gdt.base; | 2846 | dt.base = sregs->gdt.base; |
| 2294 | kvm_x86_ops->set_gdt(vcpu, &dt); | 2847 | kvm_x86_ops->set_gdt(vcpu, &dt); |
| 2295 | 2848 | ||
| 2296 | vcpu->cr2 = sregs->cr2; | 2849 | vcpu->arch.cr2 = sregs->cr2; |
| 2297 | mmu_reset_needed |= vcpu->cr3 != sregs->cr3; | 2850 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; |
| 2298 | vcpu->cr3 = sregs->cr3; | 2851 | vcpu->arch.cr3 = sregs->cr3; |
| 2299 | 2852 | ||
| 2300 | set_cr8(vcpu, sregs->cr8); | 2853 | set_cr8(vcpu, sregs->cr8); |
| 2301 | 2854 | ||
| 2302 | mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; | 2855 | mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; |
| 2303 | #ifdef CONFIG_X86_64 | 2856 | #ifdef CONFIG_X86_64 |
| 2304 | kvm_x86_ops->set_efer(vcpu, sregs->efer); | 2857 | kvm_x86_ops->set_efer(vcpu, sregs->efer); |
| 2305 | #endif | 2858 | #endif |
| @@ -2307,25 +2860,25 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
| 2307 | 2860 | ||
| 2308 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | 2861 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); |
| 2309 | 2862 | ||
| 2310 | mmu_reset_needed |= vcpu->cr0 != sregs->cr0; | 2863 | mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; |
| 2311 | vcpu->cr0 = sregs->cr0; | 2864 | vcpu->arch.cr0 = sregs->cr0; |
| 2312 | kvm_x86_ops->set_cr0(vcpu, sregs->cr0); | 2865 | kvm_x86_ops->set_cr0(vcpu, sregs->cr0); |
| 2313 | 2866 | ||
| 2314 | mmu_reset_needed |= vcpu->cr4 != sregs->cr4; | 2867 | mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; |
| 2315 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); | 2868 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); |
| 2316 | if (!is_long_mode(vcpu) && is_pae(vcpu)) | 2869 | if (!is_long_mode(vcpu) && is_pae(vcpu)) |
| 2317 | load_pdptrs(vcpu, vcpu->cr3); | 2870 | load_pdptrs(vcpu, vcpu->arch.cr3); |
| 2318 | 2871 | ||
| 2319 | if (mmu_reset_needed) | 2872 | if (mmu_reset_needed) |
| 2320 | kvm_mmu_reset_context(vcpu); | 2873 | kvm_mmu_reset_context(vcpu); |
| 2321 | 2874 | ||
| 2322 | if (!irqchip_in_kernel(vcpu->kvm)) { | 2875 | if (!irqchip_in_kernel(vcpu->kvm)) { |
| 2323 | memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, | 2876 | memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, |
| 2324 | sizeof vcpu->irq_pending); | 2877 | sizeof vcpu->arch.irq_pending); |
| 2325 | vcpu->irq_summary = 0; | 2878 | vcpu->arch.irq_summary = 0; |
| 2326 | for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i) | 2879 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) |
| 2327 | if (vcpu->irq_pending[i]) | 2880 | if (vcpu->arch.irq_pending[i]) |
| 2328 | __set_bit(i, &vcpu->irq_summary); | 2881 | __set_bit(i, &vcpu->arch.irq_summary); |
| 2329 | } else { | 2882 | } else { |
| 2330 | max_bits = (sizeof sregs->interrupt_bitmap) << 3; | 2883 | max_bits = (sizeof sregs->interrupt_bitmap) << 3; |
| 2331 | pending_vec = find_first_bit( | 2884 | pending_vec = find_first_bit( |
| @@ -2334,7 +2887,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
| 2334 | /* Only pending external irq is handled here */ | 2887 | /* Only pending external irq is handled here */ |
| 2335 | if (pending_vec < max_bits) { | 2888 | if (pending_vec < max_bits) { |
| 2336 | kvm_x86_ops->set_irq(vcpu, pending_vec); | 2889 | kvm_x86_ops->set_irq(vcpu, pending_vec); |
| 2337 | printk("Set back pending irq %d\n", pending_vec); | 2890 | pr_debug("Set back pending irq %d\n", |
| 2891 | pending_vec); | ||
| 2338 | } | 2892 | } |
| 2339 | } | 2893 | } |
| 2340 | 2894 | ||
| @@ -2353,174 +2907,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
| 2353 | return 0; | 2907 | return 0; |
| 2354 | } | 2908 | } |
| 2355 | 2909 | ||
| 2356 | void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | 2910 | int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, |
| 2357 | { | 2911 | struct kvm_debug_guest *dbg) |
| 2358 | struct kvm_segment cs; | ||
| 2359 | |||
| 2360 | get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
| 2361 | *db = cs.db; | ||
| 2362 | *l = cs.l; | ||
| 2363 | } | ||
| 2364 | EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); | ||
| 2365 | |||
| 2366 | /* | ||
| 2367 | * List of msr numbers which we expose to userspace through KVM_GET_MSRS | ||
| 2368 | * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. | ||
| 2369 | * | ||
| 2370 | * This list is modified at module load time to reflect the | ||
| 2371 | * capabilities of the host cpu. | ||
| 2372 | */ | ||
| 2373 | static u32 msrs_to_save[] = { | ||
| 2374 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
| 2375 | MSR_K6_STAR, | ||
| 2376 | #ifdef CONFIG_X86_64 | ||
| 2377 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | ||
| 2378 | #endif | ||
| 2379 | MSR_IA32_TIME_STAMP_COUNTER, | ||
| 2380 | }; | ||
| 2381 | |||
| 2382 | static unsigned num_msrs_to_save; | ||
| 2383 | |||
| 2384 | static u32 emulated_msrs[] = { | ||
| 2385 | MSR_IA32_MISC_ENABLE, | ||
| 2386 | }; | ||
| 2387 | |||
| 2388 | static __init void kvm_init_msr_list(void) | ||
| 2389 | { | ||
| 2390 | u32 dummy[2]; | ||
| 2391 | unsigned i, j; | ||
| 2392 | |||
| 2393 | for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { | ||
| 2394 | if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) | ||
| 2395 | continue; | ||
| 2396 | if (j < i) | ||
| 2397 | msrs_to_save[j] = msrs_to_save[i]; | ||
| 2398 | j++; | ||
| 2399 | } | ||
| 2400 | num_msrs_to_save = j; | ||
| 2401 | } | ||
| 2402 | |||
| 2403 | /* | ||
| 2404 | * Adapt set_msr() to msr_io()'s calling convention | ||
| 2405 | */ | ||
| 2406 | static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) | ||
| 2407 | { | ||
| 2408 | return kvm_set_msr(vcpu, index, *data); | ||
| 2409 | } | ||
| 2410 | |||
| 2411 | /* | ||
| 2412 | * Read or write a bunch of msrs. All parameters are kernel addresses. | ||
| 2413 | * | ||
| 2414 | * @return number of msrs set successfully. | ||
| 2415 | */ | ||
| 2416 | static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, | ||
| 2417 | struct kvm_msr_entry *entries, | ||
| 2418 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
| 2419 | unsigned index, u64 *data)) | ||
| 2420 | { | ||
| 2421 | int i; | ||
| 2422 | |||
| 2423 | vcpu_load(vcpu); | ||
| 2424 | |||
| 2425 | for (i = 0; i < msrs->nmsrs; ++i) | ||
| 2426 | if (do_msr(vcpu, entries[i].index, &entries[i].data)) | ||
| 2427 | break; | ||
| 2428 | |||
| 2429 | vcpu_put(vcpu); | ||
| 2430 | |||
| 2431 | return i; | ||
| 2432 | } | ||
| 2433 | |||
| 2434 | /* | ||
| 2435 | * Read or write a bunch of msrs. Parameters are user addresses. | ||
| 2436 | * | ||
| 2437 | * @return number of msrs set successfully. | ||
| 2438 | */ | ||
| 2439 | static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, | ||
| 2440 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
| 2441 | unsigned index, u64 *data), | ||
| 2442 | int writeback) | ||
| 2443 | { | ||
| 2444 | struct kvm_msrs msrs; | ||
| 2445 | struct kvm_msr_entry *entries; | ||
| 2446 | int r, n; | ||
| 2447 | unsigned size; | ||
| 2448 | |||
| 2449 | r = -EFAULT; | ||
| 2450 | if (copy_from_user(&msrs, user_msrs, sizeof msrs)) | ||
| 2451 | goto out; | ||
| 2452 | |||
| 2453 | r = -E2BIG; | ||
| 2454 | if (msrs.nmsrs >= MAX_IO_MSRS) | ||
| 2455 | goto out; | ||
| 2456 | |||
| 2457 | r = -ENOMEM; | ||
| 2458 | size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; | ||
| 2459 | entries = vmalloc(size); | ||
| 2460 | if (!entries) | ||
| 2461 | goto out; | ||
| 2462 | |||
| 2463 | r = -EFAULT; | ||
| 2464 | if (copy_from_user(entries, user_msrs->entries, size)) | ||
| 2465 | goto out_free; | ||
| 2466 | |||
| 2467 | r = n = __msr_io(vcpu, &msrs, entries, do_msr); | ||
| 2468 | if (r < 0) | ||
| 2469 | goto out_free; | ||
| 2470 | |||
| 2471 | r = -EFAULT; | ||
| 2472 | if (writeback && copy_to_user(user_msrs->entries, entries, size)) | ||
| 2473 | goto out_free; | ||
| 2474 | |||
| 2475 | r = n; | ||
| 2476 | |||
| 2477 | out_free: | ||
| 2478 | vfree(entries); | ||
| 2479 | out: | ||
| 2480 | return r; | ||
| 2481 | } | ||
| 2482 | |||
| 2483 | /* | ||
| 2484 | * Translate a guest virtual address to a guest physical address. | ||
| 2485 | */ | ||
| 2486 | static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, | ||
| 2487 | struct kvm_translation *tr) | ||
| 2488 | { | ||
| 2489 | unsigned long vaddr = tr->linear_address; | ||
| 2490 | gpa_t gpa; | ||
| 2491 | |||
| 2492 | vcpu_load(vcpu); | ||
| 2493 | mutex_lock(&vcpu->kvm->lock); | ||
| 2494 | gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); | ||
| 2495 | tr->physical_address = gpa; | ||
| 2496 | tr->valid = gpa != UNMAPPED_GVA; | ||
| 2497 | tr->writeable = 1; | ||
| 2498 | tr->usermode = 0; | ||
| 2499 | mutex_unlock(&vcpu->kvm->lock); | ||
| 2500 | vcpu_put(vcpu); | ||
| 2501 | |||
| 2502 | return 0; | ||
| 2503 | } | ||
| 2504 | |||
| 2505 | static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | ||
| 2506 | struct kvm_interrupt *irq) | ||
| 2507 | { | ||
| 2508 | if (irq->irq < 0 || irq->irq >= 256) | ||
| 2509 | return -EINVAL; | ||
| 2510 | if (irqchip_in_kernel(vcpu->kvm)) | ||
| 2511 | return -ENXIO; | ||
| 2512 | vcpu_load(vcpu); | ||
| 2513 | |||
| 2514 | set_bit(irq->irq, vcpu->irq_pending); | ||
| 2515 | set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); | ||
| 2516 | |||
| 2517 | vcpu_put(vcpu); | ||
| 2518 | |||
| 2519 | return 0; | ||
| 2520 | } | ||
| 2521 | |||
| 2522 | static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, | ||
| 2523 | struct kvm_debug_guest *dbg) | ||
| 2524 | { | 2912 | { |
| 2525 | int r; | 2913 | int r; |
| 2526 | 2914 | ||
| @@ -2533,179 +2921,6 @@ static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, | |||
| 2533 | return r; | 2921 | return r; |
| 2534 | } | 2922 | } |
| 2535 | 2923 | ||
| 2536 | static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, | ||
| 2537 | unsigned long address, | ||
| 2538 | int *type) | ||
| 2539 | { | ||
| 2540 | struct kvm_vcpu *vcpu = vma->vm_file->private_data; | ||
| 2541 | unsigned long pgoff; | ||
| 2542 | struct page *page; | ||
| 2543 | |||
| 2544 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
| 2545 | if (pgoff == 0) | ||
| 2546 | page = virt_to_page(vcpu->run); | ||
| 2547 | else if (pgoff == KVM_PIO_PAGE_OFFSET) | ||
| 2548 | page = virt_to_page(vcpu->pio_data); | ||
| 2549 | else | ||
| 2550 | return NOPAGE_SIGBUS; | ||
| 2551 | get_page(page); | ||
| 2552 | if (type != NULL) | ||
| 2553 | *type = VM_FAULT_MINOR; | ||
| 2554 | |||
| 2555 | return page; | ||
| 2556 | } | ||
| 2557 | |||
| 2558 | static struct vm_operations_struct kvm_vcpu_vm_ops = { | ||
| 2559 | .nopage = kvm_vcpu_nopage, | ||
| 2560 | }; | ||
| 2561 | |||
| 2562 | static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) | ||
| 2563 | { | ||
| 2564 | vma->vm_ops = &kvm_vcpu_vm_ops; | ||
| 2565 | return 0; | ||
| 2566 | } | ||
| 2567 | |||
| 2568 | static int kvm_vcpu_release(struct inode *inode, struct file *filp) | ||
| 2569 | { | ||
| 2570 | struct kvm_vcpu *vcpu = filp->private_data; | ||
| 2571 | |||
| 2572 | fput(vcpu->kvm->filp); | ||
| 2573 | return 0; | ||
| 2574 | } | ||
| 2575 | |||
| 2576 | static struct file_operations kvm_vcpu_fops = { | ||
| 2577 | .release = kvm_vcpu_release, | ||
| 2578 | .unlocked_ioctl = kvm_vcpu_ioctl, | ||
| 2579 | .compat_ioctl = kvm_vcpu_ioctl, | ||
| 2580 | .mmap = kvm_vcpu_mmap, | ||
| 2581 | }; | ||
| 2582 | |||
| 2583 | /* | ||
| 2584 | * Allocates an inode for the vcpu. | ||
| 2585 | */ | ||
| 2586 | static int create_vcpu_fd(struct kvm_vcpu *vcpu) | ||
| 2587 | { | ||
| 2588 | int fd, r; | ||
| 2589 | struct inode *inode; | ||
| 2590 | struct file *file; | ||
| 2591 | |||
| 2592 | r = anon_inode_getfd(&fd, &inode, &file, | ||
| 2593 | "kvm-vcpu", &kvm_vcpu_fops, vcpu); | ||
| 2594 | if (r) | ||
| 2595 | return r; | ||
| 2596 | atomic_inc(&vcpu->kvm->filp->f_count); | ||
| 2597 | return fd; | ||
| 2598 | } | ||
| 2599 | |||
| 2600 | /* | ||
| 2601 | * Creates some virtual cpus. Good luck creating more than one. | ||
| 2602 | */ | ||
| 2603 | static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) | ||
| 2604 | { | ||
| 2605 | int r; | ||
| 2606 | struct kvm_vcpu *vcpu; | ||
| 2607 | |||
| 2608 | if (!valid_vcpu(n)) | ||
| 2609 | return -EINVAL; | ||
| 2610 | |||
| 2611 | vcpu = kvm_x86_ops->vcpu_create(kvm, n); | ||
| 2612 | if (IS_ERR(vcpu)) | ||
| 2613 | return PTR_ERR(vcpu); | ||
| 2614 | |||
| 2615 | preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); | ||
| 2616 | |||
| 2617 | /* We do fxsave: this must be aligned. */ | ||
| 2618 | BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF); | ||
| 2619 | |||
| 2620 | vcpu_load(vcpu); | ||
| 2621 | r = kvm_mmu_setup(vcpu); | ||
| 2622 | vcpu_put(vcpu); | ||
| 2623 | if (r < 0) | ||
| 2624 | goto free_vcpu; | ||
| 2625 | |||
| 2626 | mutex_lock(&kvm->lock); | ||
| 2627 | if (kvm->vcpus[n]) { | ||
| 2628 | r = -EEXIST; | ||
| 2629 | mutex_unlock(&kvm->lock); | ||
| 2630 | goto mmu_unload; | ||
| 2631 | } | ||
| 2632 | kvm->vcpus[n] = vcpu; | ||
| 2633 | mutex_unlock(&kvm->lock); | ||
| 2634 | |||
| 2635 | /* Now it's all set up, let userspace reach it */ | ||
| 2636 | r = create_vcpu_fd(vcpu); | ||
| 2637 | if (r < 0) | ||
| 2638 | goto unlink; | ||
| 2639 | return r; | ||
| 2640 | |||
| 2641 | unlink: | ||
| 2642 | mutex_lock(&kvm->lock); | ||
| 2643 | kvm->vcpus[n] = NULL; | ||
| 2644 | mutex_unlock(&kvm->lock); | ||
| 2645 | |||
| 2646 | mmu_unload: | ||
| 2647 | vcpu_load(vcpu); | ||
| 2648 | kvm_mmu_unload(vcpu); | ||
| 2649 | vcpu_put(vcpu); | ||
| 2650 | |||
| 2651 | free_vcpu: | ||
| 2652 | kvm_x86_ops->vcpu_free(vcpu); | ||
| 2653 | return r; | ||
| 2654 | } | ||
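For orientation, a minimal userspace sketch (not part of this patch) of how the fd returned here is consumed, assuming <linux/kvm.h>, <sys/ioctl.h>, <sys/mman.h> and already-open kvm_fd (/dev/kvm) and vm_fd descriptors. Page 0 of the vcpu mapping is the kvm_run structure and page KVM_PIO_PAGE_OFFSET is the PIO bounce buffer, matching the nopage handler above:

	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
	long sz = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);	/* two pages in this version */
	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
				   MAP_SHARED, vcpu_fd, 0);

	while (ioctl(vcpu_fd, KVM_RUN, 0) == 0) {
		if (run->exit_reason == KVM_EXIT_IO)
			/* PIO data sits at (char *)run + run->io.data_offset */
			break;
	}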
| 2655 | |||
| 2656 | static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) | ||
| 2657 | { | ||
| 2658 | u64 efer; | ||
| 2659 | int i; | ||
| 2660 | struct kvm_cpuid_entry *e, *entry; | ||
| 2661 | |||
| 2662 | rdmsrl(MSR_EFER, efer); | ||
| 2663 | entry = NULL; | ||
| 2664 | for (i = 0; i < vcpu->cpuid_nent; ++i) { | ||
| 2665 | e = &vcpu->cpuid_entries[i]; | ||
| 2666 | if (e->function == 0x80000001) { | ||
| 2667 | entry = e; | ||
| 2668 | break; | ||
| 2669 | } | ||
| 2670 | } | ||
| 2671 | if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) { | ||
| 2672 | entry->edx &= ~(1 << 20); | ||
| 2673 | printk(KERN_INFO "kvm: guest NX capability removed\n"); | ||
| 2674 | } | ||
| 2675 | } | ||
| 2676 | |||
| 2677 | static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, | ||
| 2678 | struct kvm_cpuid *cpuid, | ||
| 2679 | struct kvm_cpuid_entry __user *entries) | ||
| 2680 | { | ||
| 2681 | int r; | ||
| 2682 | |||
| 2683 | r = -E2BIG; | ||
| 2684 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
| 2685 | goto out; | ||
| 2686 | r = -EFAULT; | ||
| 2687 | if (copy_from_user(&vcpu->cpuid_entries, entries, | ||
| 2688 | cpuid->nent * sizeof(struct kvm_cpuid_entry))) | ||
| 2689 | goto out; | ||
| 2690 | vcpu->cpuid_nent = cpuid->nent; | ||
| 2691 | cpuid_fix_nx_cap(vcpu); | ||
| 2692 | return 0; | ||
| 2693 | |||
| 2694 | out: | ||
| 2695 | return r; | ||
| 2696 | } | ||
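A sketch of the matching userspace call, assuming the vcpu_fd from the sketch above; edx_host is a hypothetical value read from the host's CPUID leaf 0x80000001. If the host runs with EFER.NX clear, cpuid_fix_nx_cap() above strips bit 20 before the guest ever sees it:

	struct {
		struct kvm_cpuid cpuid;
		struct kvm_cpuid_entry ent[1];
	} c = { .cpuid = { .nent = 1 } };

	c.ent[0].function = 0x80000001;
	c.ent[0].edx = edx_host;		/* hypothetical host leaf; may carry NX (bit 20) */
	if (ioctl(vcpu_fd, KVM_SET_CPUID, &c) < 0)
		err(1, "KVM_SET_CPUID");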
| 2697 | |||
| 2698 | static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) | ||
| 2699 | { | ||
| 2700 | if (sigset) { | ||
| 2701 | sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
| 2702 | vcpu->sigset_active = 1; | ||
| 2703 | vcpu->sigset = *sigset; | ||
| 2704 | } else | ||
| 2705 | vcpu->sigset_active = 0; | ||
| 2706 | return 0; | ||
| 2707 | } | ||
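Note that the ioctl dispatcher below checks kvm_sigmask.len against the kernel's sigset_t, which is 8 bytes on x86-64, not glibc's much larger sigset_t. A hedged userspace sketch of the usual pattern, blocking everything during KVM_RUN except one signal used to kick the vcpu out of guest mode:

	struct kvm_signal_mask *m = malloc(sizeof(*m) + 8);
	sigset_t set;

	sigfillset(&set);
	sigdelset(&set, SIGUSR1);	/* leave one signal able to interrupt KVM_RUN */
	m->len = 8;			/* kernel sigset size, not sizeof(sigset_t) */
	memcpy(m->sigset, &set, 8);
	ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, m);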
| 2708 | |||
| 2709 | /* | 2924 | /* |
| 2710 | * fxsave fpu state. Taken from x86_64/processor.h. To be killed when | 2925 | * fxsave fpu state. Taken from x86_64/processor.h. To be killed when |
| 2711 | * we have asm/x86/processor.h | 2926 | * we have asm/x86/processor.h |
| @@ -2727,9 +2942,31 @@ struct fxsave { | |||
| 2727 | #endif | 2942 | #endif |
| 2728 | }; | 2943 | }; |
| 2729 | 2944 | ||
| 2730 | static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | 2945 | /* |
| 2946 | * Translate a guest virtual address to a guest physical address. | ||
| 2947 | */ | ||
| 2948 | int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, | ||
| 2949 | struct kvm_translation *tr) | ||
| 2731 | { | 2950 | { |
| 2732 | struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; | 2951 | unsigned long vaddr = tr->linear_address; |
| 2952 | gpa_t gpa; | ||
| 2953 | |||
| 2954 | vcpu_load(vcpu); | ||
| 2955 | down_read(¤t->mm->mmap_sem); | ||
| 2956 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); | ||
| 2957 | up_read(¤t->mm->mmap_sem); | ||
| 2958 | tr->physical_address = gpa; | ||
| 2959 | tr->valid = gpa != UNMAPPED_GVA; | ||
| 2960 | tr->writeable = 1; | ||
| 2961 | tr->usermode = 0; | ||
| 2962 | vcpu_put(vcpu); | ||
| 2963 | |||
| 2964 | return 0; | ||
| 2965 | } | ||
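Userspace drives this through the KVM_TRANSLATE ioctl; a small sketch, where gva is a hypothetical guest virtual address to probe:

	struct kvm_translation tr = { .linear_address = gva };

	if (ioctl(vcpu_fd, KVM_TRANSLATE, &tr) == 0 && tr.valid)
		printf("gva 0x%llx -> gpa 0x%llx\n",
		       (unsigned long long)gva,
		       (unsigned long long)tr.physical_address);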
| 2966 | |||
| 2967 | int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | ||
| 2968 | { | ||
| 2969 | struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; | ||
| 2733 | 2970 | ||
| 2734 | vcpu_load(vcpu); | 2971 | vcpu_load(vcpu); |
| 2735 | 2972 | ||
| @@ -2747,9 +2984,9 @@ static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | |||
| 2747 | return 0; | 2984 | return 0; |
| 2748 | } | 2985 | } |
| 2749 | 2986 | ||
| 2750 | static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | 2987 | int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) |
| 2751 | { | 2988 | { |
| 2752 | struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; | 2989 | struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; |
| 2753 | 2990 | ||
| 2754 | vcpu_load(vcpu); | 2991 | vcpu_load(vcpu); |
| 2755 | 2992 | ||
| @@ -2767,862 +3004,284 @@ static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | |||
| 2767 | return 0; | 3004 | return 0; |
| 2768 | } | 3005 | } |
| 2769 | 3006 | ||
| 2770 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, | 3007 | void fx_init(struct kvm_vcpu *vcpu) |
| 2771 | struct kvm_lapic_state *s) | ||
| 2772 | { | 3008 | { |
| 2773 | vcpu_load(vcpu); | 3009 | unsigned after_mxcsr_mask; |
| 2774 | memcpy(s->regs, vcpu->apic->regs, sizeof *s); | ||
| 2775 | vcpu_put(vcpu); | ||
| 2776 | |||
| 2777 | return 0; | ||
| 2778 | } | ||
| 2779 | 3010 | ||
| 2780 | static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, | 3011 | /* Initialize guest FPU by resetting ours and saving into guest's */ |
| 2781 | struct kvm_lapic_state *s) | 3012 | preempt_disable(); |
| 2782 | { | 3013 | fx_save(&vcpu->arch.host_fx_image); |
| 2783 | vcpu_load(vcpu); | 3014 | fpu_init(); |
| 2784 | memcpy(vcpu->apic->regs, s->regs, sizeof *s); | 3015 | fx_save(&vcpu->arch.guest_fx_image); |
| 2785 | kvm_apic_post_state_restore(vcpu); | 3016 | fx_restore(&vcpu->arch.host_fx_image); |
| 2786 | vcpu_put(vcpu); | 3017 | preempt_enable(); |
| 2787 | 3018 | ||
| 2788 | return 0; | 3019 | vcpu->arch.cr0 |= X86_CR0_ET; |
| 3020 | after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); | ||
| 3021 | vcpu->arch.guest_fx_image.mxcsr = 0x1f80; | ||
| 3022 | memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, | ||
| 3023 | 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); | ||
| 2789 | } | 3024 | } |
| 3025 | EXPORT_SYMBOL_GPL(fx_init); | ||
| 2790 | 3026 | ||
| 2791 | static long kvm_vcpu_ioctl(struct file *filp, | 3027 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) |
| 2792 | unsigned int ioctl, unsigned long arg) | ||
| 2793 | { | 3028 | { |
| 2794 | struct kvm_vcpu *vcpu = filp->private_data; | 3029 | if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) |
| 2795 | void __user *argp = (void __user *)arg; | 3030 | return; |
| 2796 | int r = -EINVAL; | ||
| 2797 | |||
| 2798 | switch (ioctl) { | ||
| 2799 | case KVM_RUN: | ||
| 2800 | r = -EINVAL; | ||
| 2801 | if (arg) | ||
| 2802 | goto out; | ||
| 2803 | r = kvm_vcpu_ioctl_run(vcpu, vcpu->run); | ||
| 2804 | break; | ||
| 2805 | case KVM_GET_REGS: { | ||
| 2806 | struct kvm_regs kvm_regs; | ||
| 2807 | |||
| 2808 | memset(&kvm_regs, 0, sizeof kvm_regs); | ||
| 2809 | r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs); | ||
| 2810 | if (r) | ||
| 2811 | goto out; | ||
| 2812 | r = -EFAULT; | ||
| 2813 | if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs)) | ||
| 2814 | goto out; | ||
| 2815 | r = 0; | ||
| 2816 | break; | ||
| 2817 | } | ||
| 2818 | case KVM_SET_REGS: { | ||
| 2819 | struct kvm_regs kvm_regs; | ||
| 2820 | |||
| 2821 | r = -EFAULT; | ||
| 2822 | if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs)) | ||
| 2823 | goto out; | ||
| 2824 | r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs); | ||
| 2825 | if (r) | ||
| 2826 | goto out; | ||
| 2827 | r = 0; | ||
| 2828 | break; | ||
| 2829 | } | ||
| 2830 | case KVM_GET_SREGS: { | ||
| 2831 | struct kvm_sregs kvm_sregs; | ||
| 2832 | |||
| 2833 | memset(&kvm_sregs, 0, sizeof kvm_sregs); | ||
| 2834 | r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); | ||
| 2835 | if (r) | ||
| 2836 | goto out; | ||
| 2837 | r = -EFAULT; | ||
| 2838 | if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs)) | ||
| 2839 | goto out; | ||
| 2840 | r = 0; | ||
| 2841 | break; | ||
| 2842 | } | ||
| 2843 | case KVM_SET_SREGS: { | ||
| 2844 | struct kvm_sregs kvm_sregs; | ||
| 2845 | |||
| 2846 | r = -EFAULT; | ||
| 2847 | if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) | ||
| 2848 | goto out; | ||
| 2849 | r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs); | ||
| 2850 | if (r) | ||
| 2851 | goto out; | ||
| 2852 | r = 0; | ||
| 2853 | break; | ||
| 2854 | } | ||
| 2855 | case KVM_TRANSLATE: { | ||
| 2856 | struct kvm_translation tr; | ||
| 2857 | |||
| 2858 | r = -EFAULT; | ||
| 2859 | if (copy_from_user(&tr, argp, sizeof tr)) | ||
| 2860 | goto out; | ||
| 2861 | r = kvm_vcpu_ioctl_translate(vcpu, &tr); | ||
| 2862 | if (r) | ||
| 2863 | goto out; | ||
| 2864 | r = -EFAULT; | ||
| 2865 | if (copy_to_user(argp, &tr, sizeof tr)) | ||
| 2866 | goto out; | ||
| 2867 | r = 0; | ||
| 2868 | break; | ||
| 2869 | } | ||
| 2870 | case KVM_INTERRUPT: { | ||
| 2871 | struct kvm_interrupt irq; | ||
| 2872 | |||
| 2873 | r = -EFAULT; | ||
| 2874 | if (copy_from_user(&irq, argp, sizeof irq)) | ||
| 2875 | goto out; | ||
| 2876 | r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); | ||
| 2877 | if (r) | ||
| 2878 | goto out; | ||
| 2879 | r = 0; | ||
| 2880 | break; | ||
| 2881 | } | ||
| 2882 | case KVM_DEBUG_GUEST: { | ||
| 2883 | struct kvm_debug_guest dbg; | ||
| 2884 | |||
| 2885 | r = -EFAULT; | ||
| 2886 | if (copy_from_user(&dbg, argp, sizeof dbg)) | ||
| 2887 | goto out; | ||
| 2888 | r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg); | ||
| 2889 | if (r) | ||
| 2890 | goto out; | ||
| 2891 | r = 0; | ||
| 2892 | break; | ||
| 2893 | } | ||
| 2894 | case KVM_GET_MSRS: | ||
| 2895 | r = msr_io(vcpu, argp, kvm_get_msr, 1); | ||
| 2896 | break; | ||
| 2897 | case KVM_SET_MSRS: | ||
| 2898 | r = msr_io(vcpu, argp, do_set_msr, 0); | ||
| 2899 | break; | ||
| 2900 | case KVM_SET_CPUID: { | ||
| 2901 | struct kvm_cpuid __user *cpuid_arg = argp; | ||
| 2902 | struct kvm_cpuid cpuid; | ||
| 2903 | |||
| 2904 | r = -EFAULT; | ||
| 2905 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
| 2906 | goto out; | ||
| 2907 | r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); | ||
| 2908 | if (r) | ||
| 2909 | goto out; | ||
| 2910 | break; | ||
| 2911 | } | ||
| 2912 | case KVM_SET_SIGNAL_MASK: { | ||
| 2913 | struct kvm_signal_mask __user *sigmask_arg = argp; | ||
| 2914 | struct kvm_signal_mask kvm_sigmask; | ||
| 2915 | sigset_t sigset, *p; | ||
| 2916 | |||
| 2917 | p = NULL; | ||
| 2918 | if (argp) { | ||
| 2919 | r = -EFAULT; | ||
| 2920 | if (copy_from_user(&kvm_sigmask, argp, | ||
| 2921 | sizeof kvm_sigmask)) | ||
| 2922 | goto out; | ||
| 2923 | r = -EINVAL; | ||
| 2924 | if (kvm_sigmask.len != sizeof sigset) | ||
| 2925 | goto out; | ||
| 2926 | r = -EFAULT; | ||
| 2927 | if (copy_from_user(&sigset, sigmask_arg->sigset, | ||
| 2928 | sizeof sigset)) | ||
| 2929 | goto out; | ||
| 2930 | p = &sigset; | ||
| 2931 | } | ||
| 2932 | r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); | ||
| 2933 | break; | ||
| 2934 | } | ||
| 2935 | case KVM_GET_FPU: { | ||
| 2936 | struct kvm_fpu fpu; | ||
| 2937 | |||
| 2938 | memset(&fpu, 0, sizeof fpu); | ||
| 2939 | r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu); | ||
| 2940 | if (r) | ||
| 2941 | goto out; | ||
| 2942 | r = -EFAULT; | ||
| 2943 | if (copy_to_user(argp, &fpu, sizeof fpu)) | ||
| 2944 | goto out; | ||
| 2945 | r = 0; | ||
| 2946 | break; | ||
| 2947 | } | ||
| 2948 | case KVM_SET_FPU: { | ||
| 2949 | struct kvm_fpu fpu; | ||
| 2950 | |||
| 2951 | r = -EFAULT; | ||
| 2952 | if (copy_from_user(&fpu, argp, sizeof fpu)) | ||
| 2953 | goto out; | ||
| 2954 | r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu); | ||
| 2955 | if (r) | ||
| 2956 | goto out; | ||
| 2957 | r = 0; | ||
| 2958 | break; | ||
| 2959 | } | ||
| 2960 | case KVM_GET_LAPIC: { | ||
| 2961 | struct kvm_lapic_state lapic; | ||
| 2962 | |||
| 2963 | memset(&lapic, 0, sizeof lapic); | ||
| 2964 | r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); | ||
| 2965 | if (r) | ||
| 2966 | goto out; | ||
| 2967 | r = -EFAULT; | ||
| 2968 | if (copy_to_user(argp, &lapic, sizeof lapic)) | ||
| 2969 | goto out; | ||
| 2970 | r = 0; | ||
| 2971 | break; | ||
| 2972 | } | ||
| 2973 | case KVM_SET_LAPIC: { | ||
| 2974 | struct kvm_lapic_state lapic; | ||
| 2975 | 3031 | ||
| 2976 | r = -EFAULT; | 3032 | vcpu->guest_fpu_loaded = 1; |
| 2977 | if (copy_from_user(&lapic, argp, sizeof lapic)) | 3033 | fx_save(&vcpu->arch.host_fx_image); |
| 2978 | goto out; | 3034 | fx_restore(&vcpu->arch.guest_fx_image); |
| 2979 | r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic); | ||
| 2980 | if (r) | ||
| 2981 | goto out; | ||
| 2982 | r = 0; | ||
| 2983 | break; | ||
| 2984 | } | ||
| 2985 | default: | ||
| 2986 | ; | ||
| 2987 | } | ||
| 2988 | out: | ||
| 2989 | return r; | ||
| 2990 | } | 3035 | } |
| 3036 | EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); | ||
| 2991 | 3037 | ||
| 2992 | static long kvm_vm_ioctl(struct file *filp, | 3038 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) |
| 2993 | unsigned int ioctl, unsigned long arg) | ||
| 2994 | { | 3039 | { |
| 2995 | struct kvm *kvm = filp->private_data; | 3040 | if (!vcpu->guest_fpu_loaded) |
| 2996 | void __user *argp = (void __user *)arg; | 3041 | return; |
| 2997 | int r = -EINVAL; | ||
| 2998 | |||
| 2999 | switch (ioctl) { | ||
| 3000 | case KVM_CREATE_VCPU: | ||
| 3001 | r = kvm_vm_ioctl_create_vcpu(kvm, arg); | ||
| 3002 | if (r < 0) | ||
| 3003 | goto out; | ||
| 3004 | break; | ||
| 3005 | case KVM_SET_MEMORY_REGION: { | ||
| 3006 | struct kvm_memory_region kvm_mem; | ||
| 3007 | |||
| 3008 | r = -EFAULT; | ||
| 3009 | if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) | ||
| 3010 | goto out; | ||
| 3011 | r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem); | ||
| 3012 | if (r) | ||
| 3013 | goto out; | ||
| 3014 | break; | ||
| 3015 | } | ||
| 3016 | case KVM_GET_DIRTY_LOG: { | ||
| 3017 | struct kvm_dirty_log log; | ||
| 3018 | |||
| 3019 | r = -EFAULT; | ||
| 3020 | if (copy_from_user(&log, argp, sizeof log)) | ||
| 3021 | goto out; | ||
| 3022 | r = kvm_vm_ioctl_get_dirty_log(kvm, &log); | ||
| 3023 | if (r) | ||
| 3024 | goto out; | ||
| 3025 | break; | ||
| 3026 | } | ||
| 3027 | case KVM_SET_MEMORY_ALIAS: { | ||
| 3028 | struct kvm_memory_alias alias; | ||
| 3029 | |||
| 3030 | r = -EFAULT; | ||
| 3031 | if (copy_from_user(&alias, argp, sizeof alias)) | ||
| 3032 | goto out; | ||
| 3033 | r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); | ||
| 3034 | if (r) | ||
| 3035 | goto out; | ||
| 3036 | break; | ||
| 3037 | } | ||
| 3038 | case KVM_CREATE_IRQCHIP: | ||
| 3039 | r = -ENOMEM; | ||
| 3040 | kvm->vpic = kvm_create_pic(kvm); | ||
| 3041 | if (kvm->vpic) { | ||
| 3042 | r = kvm_ioapic_init(kvm); | ||
| 3043 | if (r) { | ||
| 3044 | kfree(kvm->vpic); | ||
| 3045 | kvm->vpic = NULL; | ||
| 3046 | goto out; | ||
| 3047 | } | ||
| 3048 | } | ||
| 3049 | else | ||
| 3050 | goto out; | ||
| 3051 | break; | ||
| 3052 | case KVM_IRQ_LINE: { | ||
| 3053 | struct kvm_irq_level irq_event; | ||
| 3054 | |||
| 3055 | r = -EFAULT; | ||
| 3056 | if (copy_from_user(&irq_event, argp, sizeof irq_event)) | ||
| 3057 | goto out; | ||
| 3058 | if (irqchip_in_kernel(kvm)) { | ||
| 3059 | mutex_lock(&kvm->lock); | ||
| 3060 | if (irq_event.irq < 16) | ||
| 3061 | kvm_pic_set_irq(pic_irqchip(kvm), | ||
| 3062 | irq_event.irq, | ||
| 3063 | irq_event.level); | ||
| 3064 | kvm_ioapic_set_irq(kvm->vioapic, | ||
| 3065 | irq_event.irq, | ||
| 3066 | irq_event.level); | ||
| 3067 | mutex_unlock(&kvm->lock); | ||
| 3068 | r = 0; | ||
| 3069 | } | ||
| 3070 | break; | ||
| 3071 | } | ||
| 3072 | case KVM_GET_IRQCHIP: { | ||
| 3073 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | ||
| 3074 | struct kvm_irqchip chip; | ||
| 3075 | |||
| 3076 | r = -EFAULT; | ||
| 3077 | if (copy_from_user(&chip, argp, sizeof chip)) | ||
| 3078 | goto out; | ||
| 3079 | r = -ENXIO; | ||
| 3080 | if (!irqchip_in_kernel(kvm)) | ||
| 3081 | goto out; | ||
| 3082 | r = kvm_vm_ioctl_get_irqchip(kvm, &chip); | ||
| 3083 | if (r) | ||
| 3084 | goto out; | ||
| 3085 | r = -EFAULT; | ||
| 3086 | if (copy_to_user(argp, &chip, sizeof chip)) | ||
| 3087 | goto out; | ||
| 3088 | r = 0; | ||
| 3089 | break; | ||
| 3090 | } | ||
| 3091 | case KVM_SET_IRQCHIP: { | ||
| 3092 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | ||
| 3093 | struct kvm_irqchip chip; | ||
| 3094 | 3042 | ||
| 3095 | r = -EFAULT; | 3043 | vcpu->guest_fpu_loaded = 0; |
| 3096 | if (copy_from_user(&chip, argp, sizeof chip)) | 3044 | fx_save(&vcpu->arch.guest_fx_image); |
| 3097 | goto out; | 3045 | fx_restore(&vcpu->arch.host_fx_image); |
| 3098 | r = -ENXIO; | 3046 | ++vcpu->stat.fpu_reload; |
| 3099 | if (!irqchip_in_kernel(kvm)) | ||
| 3100 | goto out; | ||
| 3101 | r = kvm_vm_ioctl_set_irqchip(kvm, &chip); | ||
| 3102 | if (r) | ||
| 3103 | goto out; | ||
| 3104 | r = 0; | ||
| 3105 | break; | ||
| 3106 | } | ||
| 3107 | default: | ||
| 3108 | ; | ||
| 3109 | } | ||
| 3110 | out: | ||
| 3111 | return r; | ||
| 3112 | } | 3047 | } |
| 3048 | EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); | ||
| 3113 | 3049 | ||
| 3114 | static struct page *kvm_vm_nopage(struct vm_area_struct *vma, | 3050 | void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) |
| 3115 | unsigned long address, | ||
| 3116 | int *type) | ||
| 3117 | { | 3051 | { |
| 3118 | struct kvm *kvm = vma->vm_file->private_data; | 3052 | kvm_x86_ops->vcpu_free(vcpu); |
| 3119 | unsigned long pgoff; | ||
| 3120 | struct page *page; | ||
| 3121 | |||
| 3122 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
| 3123 | page = gfn_to_page(kvm, pgoff); | ||
| 3124 | if (!page) | ||
| 3125 | return NOPAGE_SIGBUS; | ||
| 3126 | get_page(page); | ||
| 3127 | if (type != NULL) | ||
| 3128 | *type = VM_FAULT_MINOR; | ||
| 3129 | |||
| 3130 | return page; | ||
| 3131 | } | 3053 | } |
| 3132 | 3054 | ||
| 3133 | static struct vm_operations_struct kvm_vm_vm_ops = { | 3055 | struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, |
| 3134 | .nopage = kvm_vm_nopage, | 3056 | unsigned int id) |
| 3135 | }; | ||
| 3136 | |||
| 3137 | static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) | ||
| 3138 | { | 3057 | { |
| 3139 | vma->vm_ops = &kvm_vm_vm_ops; | 3058 | return kvm_x86_ops->vcpu_create(kvm, id); |
| 3140 | return 0; | ||
| 3141 | } | 3059 | } |
| 3142 | 3060 | ||
| 3143 | static struct file_operations kvm_vm_fops = { | 3061 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) |
| 3144 | .release = kvm_vm_release, | ||
| 3145 | .unlocked_ioctl = kvm_vm_ioctl, | ||
| 3146 | .compat_ioctl = kvm_vm_ioctl, | ||
| 3147 | .mmap = kvm_vm_mmap, | ||
| 3148 | }; | ||
| 3149 | |||
| 3150 | static int kvm_dev_ioctl_create_vm(void) | ||
| 3151 | { | 3062 | { |
| 3152 | int fd, r; | 3063 | int r; |
| 3153 | struct inode *inode; | ||
| 3154 | struct file *file; | ||
| 3155 | struct kvm *kvm; | ||
| 3156 | 3064 | ||
| 3157 | kvm = kvm_create_vm(); | 3065 | /* We do fxsave: this must be aligned. */ |
| 3158 | if (IS_ERR(kvm)) | 3066 | BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); |
| 3159 | return PTR_ERR(kvm); | ||
| 3160 | r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm); | ||
| 3161 | if (r) { | ||
| 3162 | kvm_destroy_vm(kvm); | ||
| 3163 | return r; | ||
| 3164 | } | ||
| 3165 | 3067 | ||
| 3166 | kvm->filp = file; | 3068 | vcpu_load(vcpu); |
| 3069 | r = kvm_arch_vcpu_reset(vcpu); | ||
| 3070 | if (r == 0) | ||
| 3071 | r = kvm_mmu_setup(vcpu); | ||
| 3072 | vcpu_put(vcpu); | ||
| 3073 | if (r < 0) | ||
| 3074 | goto free_vcpu; | ||
| 3167 | 3075 | ||
| 3168 | return fd; | 3076 | return 0; |
| 3077 | free_vcpu: | ||
| 3078 | kvm_x86_ops->vcpu_free(vcpu); | ||
| 3079 | return r; | ||
| 3169 | } | 3080 | } |
| 3170 | 3081 | ||
| 3171 | static long kvm_dev_ioctl(struct file *filp, | 3082 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) |
| 3172 | unsigned int ioctl, unsigned long arg) | ||
| 3173 | { | 3083 | { |
| 3174 | void __user *argp = (void __user *)arg; | 3084 | vcpu_load(vcpu); |
| 3175 | long r = -EINVAL; | 3085 | kvm_mmu_unload(vcpu); |
| 3176 | 3086 | vcpu_put(vcpu); | |
| 3177 | switch (ioctl) { | ||
| 3178 | case KVM_GET_API_VERSION: | ||
| 3179 | r = -EINVAL; | ||
| 3180 | if (arg) | ||
| 3181 | goto out; | ||
| 3182 | r = KVM_API_VERSION; | ||
| 3183 | break; | ||
| 3184 | case KVM_CREATE_VM: | ||
| 3185 | r = -EINVAL; | ||
| 3186 | if (arg) | ||
| 3187 | goto out; | ||
| 3188 | r = kvm_dev_ioctl_create_vm(); | ||
| 3189 | break; | ||
| 3190 | case KVM_GET_MSR_INDEX_LIST: { | ||
| 3191 | struct kvm_msr_list __user *user_msr_list = argp; | ||
| 3192 | struct kvm_msr_list msr_list; | ||
| 3193 | unsigned n; | ||
| 3194 | |||
| 3195 | r = -EFAULT; | ||
| 3196 | if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) | ||
| 3197 | goto out; | ||
| 3198 | n = msr_list.nmsrs; | ||
| 3199 | msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); | ||
| 3200 | if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) | ||
| 3201 | goto out; | ||
| 3202 | r = -E2BIG; | ||
| 3203 | if (n < num_msrs_to_save) | ||
| 3204 | goto out; | ||
| 3205 | r = -EFAULT; | ||
| 3206 | if (copy_to_user(user_msr_list->indices, &msrs_to_save, | ||
| 3207 | num_msrs_to_save * sizeof(u32))) | ||
| 3208 | goto out; | ||
| 3209 | if (copy_to_user(user_msr_list->indices | ||
| 3210 | + num_msrs_to_save * sizeof(u32), | ||
| 3211 | &emulated_msrs, | ||
| 3212 | ARRAY_SIZE(emulated_msrs) * sizeof(u32))) | ||
| 3213 | goto out; | ||
| 3214 | r = 0; | ||
| 3215 | break; | ||
| 3216 | } | ||
| 3217 | case KVM_CHECK_EXTENSION: { | ||
| 3218 | int ext = (long)argp; | ||
| 3219 | 3087 | ||
| 3220 | switch (ext) { | 3088 | kvm_x86_ops->vcpu_free(vcpu); |
| 3221 | case KVM_CAP_IRQCHIP: | ||
| 3222 | case KVM_CAP_HLT: | ||
| 3223 | r = 1; | ||
| 3224 | break; | ||
| 3225 | default: | ||
| 3226 | r = 0; | ||
| 3227 | break; | ||
| 3228 | } | ||
| 3229 | break; | ||
| 3230 | } | ||
| 3231 | case KVM_GET_VCPU_MMAP_SIZE: | ||
| 3232 | r = -EINVAL; | ||
| 3233 | if (arg) | ||
| 3234 | goto out; | ||
| 3235 | r = 2 * PAGE_SIZE; | ||
| 3236 | break; | ||
| 3237 | default: | ||
| 3238 | ; | ||
| 3239 | } | ||
| 3240 | out: | ||
| 3241 | return r; | ||
| 3242 | } | 3089 | } |
| 3243 | 3090 | ||
| 3244 | static struct file_operations kvm_chardev_ops = { | 3091 | int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) |
| 3245 | .unlocked_ioctl = kvm_dev_ioctl, | ||
| 3246 | .compat_ioctl = kvm_dev_ioctl, | ||
| 3247 | }; | ||
| 3248 | |||
| 3249 | static struct miscdevice kvm_dev = { | ||
| 3250 | KVM_MINOR, | ||
| 3251 | "kvm", | ||
| 3252 | &kvm_chardev_ops, | ||
| 3253 | }; | ||
| 3254 | |||
| 3255 | /* | ||
| 3256 | * Make sure that a cpu that is being hot-unplugged does not have any vcpus | ||
| 3257 | * cached on it. | ||
| 3258 | */ | ||
| 3259 | static void decache_vcpus_on_cpu(int cpu) | ||
| 3260 | { | 3092 | { |
| 3261 | struct kvm *vm; | 3093 | return kvm_x86_ops->vcpu_reset(vcpu); |
| 3262 | struct kvm_vcpu *vcpu; | ||
| 3263 | int i; | ||
| 3264 | |||
| 3265 | spin_lock(&kvm_lock); | ||
| 3266 | list_for_each_entry(vm, &vm_list, vm_list) | ||
| 3267 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
| 3268 | vcpu = vm->vcpus[i]; | ||
| 3269 | if (!vcpu) | ||
| 3270 | continue; | ||
| 3271 | /* | ||
| 3272 | * If the vcpu is locked, then it is running on some | ||
| 3273 | * other cpu and therefore it is not cached on the | ||
| 3274 | * cpu in question. | ||
| 3275 | * | ||
| 3276 | * If it's not locked, check the last cpu it executed | ||
| 3277 | * on. | ||
| 3278 | */ | ||
| 3279 | if (mutex_trylock(&vcpu->mutex)) { | ||
| 3280 | if (vcpu->cpu == cpu) { | ||
| 3281 | kvm_x86_ops->vcpu_decache(vcpu); | ||
| 3282 | vcpu->cpu = -1; | ||
| 3283 | } | ||
| 3284 | mutex_unlock(&vcpu->mutex); | ||
| 3285 | } | ||
| 3286 | } | ||
| 3287 | spin_unlock(&kvm_lock); | ||
| 3288 | } | 3094 | } |
| 3289 | 3095 | ||
| 3290 | static void hardware_enable(void *junk) | 3096 | void kvm_arch_hardware_enable(void *garbage) |
| 3291 | { | 3097 | { |
| 3292 | int cpu = raw_smp_processor_id(); | 3098 | kvm_x86_ops->hardware_enable(garbage); |
| 3293 | |||
| 3294 | if (cpu_isset(cpu, cpus_hardware_enabled)) | ||
| 3295 | return; | ||
| 3296 | cpu_set(cpu, cpus_hardware_enabled); | ||
| 3297 | kvm_x86_ops->hardware_enable(NULL); | ||
| 3298 | } | 3099 | } |
| 3299 | 3100 | ||
| 3300 | static void hardware_disable(void *junk) | 3101 | void kvm_arch_hardware_disable(void *garbage) |
| 3301 | { | 3102 | { |
| 3302 | int cpu = raw_smp_processor_id(); | 3103 | kvm_x86_ops->hardware_disable(garbage); |
| 3303 | |||
| 3304 | if (!cpu_isset(cpu, cpus_hardware_enabled)) | ||
| 3305 | return; | ||
| 3306 | cpu_clear(cpu, cpus_hardware_enabled); | ||
| 3307 | decache_vcpus_on_cpu(cpu); | ||
| 3308 | kvm_x86_ops->hardware_disable(NULL); | ||
| 3309 | } | 3104 | } |
| 3310 | 3105 | ||
| 3311 | static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, | 3106 | int kvm_arch_hardware_setup(void) |
| 3312 | void *v) | ||
| 3313 | { | 3107 | { |
| 3314 | int cpu = (long)v; | 3108 | return kvm_x86_ops->hardware_setup(); |
| 3315 | |||
| 3316 | switch (val) { | ||
| 3317 | case CPU_DYING: | ||
| 3318 | case CPU_DYING_FROZEN: | ||
| 3319 | printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", | ||
| 3320 | cpu); | ||
| 3321 | hardware_disable(NULL); | ||
| 3322 | break; | ||
| 3323 | case CPU_UP_CANCELED: | ||
| 3324 | case CPU_UP_CANCELED_FROZEN: | ||
| 3325 | printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", | ||
| 3326 | cpu); | ||
| 3327 | smp_call_function_single(cpu, hardware_disable, NULL, 0, 1); | ||
| 3328 | break; | ||
| 3329 | case CPU_ONLINE: | ||
| 3330 | case CPU_ONLINE_FROZEN: | ||
| 3331 | printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", | ||
| 3332 | cpu); | ||
| 3333 | smp_call_function_single(cpu, hardware_enable, NULL, 0, 1); | ||
| 3334 | break; | ||
| 3335 | } | ||
| 3336 | return NOTIFY_OK; | ||
| 3337 | } | 3109 | } |
| 3338 | 3110 | ||
| 3339 | static int kvm_reboot(struct notifier_block *notifier, unsigned long val, | 3111 | void kvm_arch_hardware_unsetup(void) |
| 3340 | void *v) | ||
| 3341 | { | 3112 | { |
| 3342 | if (val == SYS_RESTART) { | 3113 | kvm_x86_ops->hardware_unsetup(); |
| 3343 | /* | ||
| 3344 | * Some (well, at least mine) BIOSes hang on reboot if | ||
| 3345 | * in vmx root mode. | ||
| 3346 | */ | ||
| 3347 | printk(KERN_INFO "kvm: exiting hardware virtualization\n"); | ||
| 3348 | on_each_cpu(hardware_disable, NULL, 0, 1); | ||
| 3349 | } | ||
| 3350 | return NOTIFY_OK; | ||
| 3351 | } | 3114 | } |
| 3352 | 3115 | ||
| 3353 | static struct notifier_block kvm_reboot_notifier = { | 3116 | void kvm_arch_check_processor_compat(void *rtn) |
| 3354 | .notifier_call = kvm_reboot, | ||
| 3355 | .priority = 0, | ||
| 3356 | }; | ||
| 3357 | |||
| 3358 | void kvm_io_bus_init(struct kvm_io_bus *bus) | ||
| 3359 | { | 3117 | { |
| 3360 | memset(bus, 0, sizeof(*bus)); | 3118 | kvm_x86_ops->check_processor_compatibility(rtn); |
| 3361 | } | 3119 | } |
| 3362 | 3120 | ||
| 3363 | void kvm_io_bus_destroy(struct kvm_io_bus *bus) | 3121 | int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) |
| 3364 | { | 3122 | { |
| 3365 | int i; | 3123 | struct page *page; |
| 3124 | struct kvm *kvm; | ||
| 3125 | int r; | ||
| 3366 | 3126 | ||
| 3367 | for (i = 0; i < bus->dev_count; i++) { | 3127 | BUG_ON(vcpu->kvm == NULL); |
| 3368 | struct kvm_io_device *pos = bus->devs[i]; | 3128 | kvm = vcpu->kvm; |
| 3369 | 3129 | ||
| 3370 | kvm_iodevice_destructor(pos); | 3130 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; |
| 3371 | } | 3131 | if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) |
| 3372 | } | 3132 | vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; |
| 3133 | else | ||
| 3134 | vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED; | ||
| 3373 | 3135 | ||
| 3374 | struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr) | 3136 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); |
| 3375 | { | 3137 | if (!page) { |
| 3376 | int i; | 3138 | r = -ENOMEM; |
| 3139 | goto fail; | ||
| 3140 | } | ||
| 3141 | vcpu->arch.pio_data = page_address(page); | ||
| 3377 | 3142 | ||
| 3378 | for (i = 0; i < bus->dev_count; i++) { | 3143 | r = kvm_mmu_create(vcpu); |
| 3379 | struct kvm_io_device *pos = bus->devs[i]; | 3144 | if (r < 0) |
| 3145 | goto fail_free_pio_data; | ||
| 3380 | 3146 | ||
| 3381 | if (pos->in_range(pos, addr)) | 3147 | if (irqchip_in_kernel(kvm)) { |
| 3382 | return pos; | 3148 | r = kvm_create_lapic(vcpu); |
| 3149 | if (r < 0) | ||
| 3150 | goto fail_mmu_destroy; | ||
| 3383 | } | 3151 | } |
| 3384 | 3152 | ||
| 3385 | return NULL; | 3153 | return 0; |
| 3386 | } | ||
| 3387 | |||
| 3388 | void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) | ||
| 3389 | { | ||
| 3390 | BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); | ||
| 3391 | 3154 | ||
| 3392 | bus->devs[bus->dev_count++] = dev; | 3155 | fail_mmu_destroy: |
| 3156 | kvm_mmu_destroy(vcpu); | ||
| 3157 | fail_free_pio_data: | ||
| 3158 | free_page((unsigned long)vcpu->arch.pio_data); | ||
| 3159 | fail: | ||
| 3160 | return r; | ||
| 3393 | } | 3161 | } |
| 3394 | 3162 | ||
| 3395 | static struct notifier_block kvm_cpu_notifier = { | 3163 | void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) |
| 3396 | .notifier_call = kvm_cpu_hotplug, | ||
| 3397 | .priority = 20, /* must be > scheduler priority */ | ||
| 3398 | }; | ||
| 3399 | |||
| 3400 | static u64 stat_get(void *_offset) | ||
| 3401 | { | 3164 | { |
| 3402 | unsigned offset = (long)_offset; | 3165 | kvm_free_lapic(vcpu); |
| 3403 | u64 total = 0; | 3166 | kvm_mmu_destroy(vcpu); |
| 3404 | struct kvm *kvm; | 3167 | free_page((unsigned long)vcpu->arch.pio_data); |
| 3405 | struct kvm_vcpu *vcpu; | ||
| 3406 | int i; | ||
| 3407 | |||
| 3408 | spin_lock(&kvm_lock); | ||
| 3409 | list_for_each_entry(kvm, &vm_list, vm_list) | ||
| 3410 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
| 3411 | vcpu = kvm->vcpus[i]; | ||
| 3412 | if (vcpu) | ||
| 3413 | total += *(u32 *)((void *)vcpu + offset); | ||
| 3414 | } | ||
| 3415 | spin_unlock(&kvm_lock); | ||
| 3416 | return total; | ||
| 3417 | } | 3168 | } |
| 3418 | 3169 | ||
| 3419 | DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n"); | 3170 | struct kvm *kvm_arch_create_vm(void) |
| 3420 | |||
| 3421 | static __init void kvm_init_debug(void) | ||
| 3422 | { | 3171 | { |
| 3423 | struct kvm_stats_debugfs_item *p; | 3172 | struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); |
| 3424 | |||
| 3425 | debugfs_dir = debugfs_create_dir("kvm", NULL); | ||
| 3426 | for (p = debugfs_entries; p->name; ++p) | ||
| 3427 | p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir, | ||
| 3428 | (void *)(long)p->offset, | ||
| 3429 | &stat_fops); | ||
| 3430 | } | ||
| 3431 | 3173 | ||
| 3432 | static void kvm_exit_debug(void) | 3174 | if (!kvm) |
| 3433 | { | 3175 | return ERR_PTR(-ENOMEM); |
| 3434 | struct kvm_stats_debugfs_item *p; | ||
| 3435 | 3176 | ||
| 3436 | for (p = debugfs_entries; p->name; ++p) | 3177 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
| 3437 | debugfs_remove(p->dentry); | ||
| 3438 | debugfs_remove(debugfs_dir); | ||
| 3439 | } | ||
| 3440 | 3178 | ||
| 3441 | static int kvm_suspend(struct sys_device *dev, pm_message_t state) | 3179 | return kvm; |
| 3442 | { | ||
| 3443 | hardware_disable(NULL); | ||
| 3444 | return 0; | ||
| 3445 | } | 3180 | } |
| 3446 | 3181 | ||
| 3447 | static int kvm_resume(struct sys_device *dev) | 3182 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) |
| 3448 | { | 3183 | { |
| 3449 | hardware_enable(NULL); | 3184 | vcpu_load(vcpu); |
| 3450 | return 0; | 3185 | kvm_mmu_unload(vcpu); |
| 3186 | vcpu_put(vcpu); | ||
| 3451 | } | 3187 | } |
| 3452 | 3188 | ||
| 3453 | static struct sysdev_class kvm_sysdev_class = { | 3189 | static void kvm_free_vcpus(struct kvm *kvm) |
| 3454 | .name = "kvm", | ||
| 3455 | .suspend = kvm_suspend, | ||
| 3456 | .resume = kvm_resume, | ||
| 3457 | }; | ||
| 3458 | |||
| 3459 | static struct sys_device kvm_sysdev = { | ||
| 3460 | .id = 0, | ||
| 3461 | .cls = &kvm_sysdev_class, | ||
| 3462 | }; | ||
| 3463 | |||
| 3464 | hpa_t bad_page_address; | ||
| 3465 | |||
| 3466 | static inline | ||
| 3467 | struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) | ||
| 3468 | { | 3190 | { |
| 3469 | return container_of(pn, struct kvm_vcpu, preempt_notifier); | 3191 | unsigned int i; |
| 3470 | } | ||
| 3471 | 3192 | ||
| 3472 | static void kvm_sched_in(struct preempt_notifier *pn, int cpu) | 3193 | /* |
| 3473 | { | 3194 | * Unpin any mmu pages first. |
| 3474 | struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); | 3195 | */ |
| 3196 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | ||
| 3197 | if (kvm->vcpus[i]) | ||
| 3198 | kvm_unload_vcpu_mmu(kvm->vcpus[i]); | ||
| 3199 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
| 3200 | if (kvm->vcpus[i]) { | ||
| 3201 | kvm_arch_vcpu_free(kvm->vcpus[i]); | ||
| 3202 | kvm->vcpus[i] = NULL; | ||
| 3203 | } | ||
| 3204 | } | ||
| 3475 | 3205 | ||
| 3476 | kvm_x86_ops->vcpu_load(vcpu, cpu); | ||
| 3477 | } | 3206 | } |
| 3478 | 3207 | ||
| 3479 | static void kvm_sched_out(struct preempt_notifier *pn, | 3208 | void kvm_arch_destroy_vm(struct kvm *kvm) |
| 3480 | struct task_struct *next) | ||
| 3481 | { | 3209 | { |
| 3482 | struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); | 3210 | kfree(kvm->arch.vpic); |
| 3483 | 3211 | kfree(kvm->arch.vioapic); | |
| 3484 | kvm_x86_ops->vcpu_put(vcpu); | 3212 | kvm_free_vcpus(kvm); |
| 3213 | kvm_free_physmem(kvm); | ||
| 3214 | kfree(kvm); | ||
| 3485 | } | 3215 | } |
| 3486 | 3216 | ||
| 3487 | int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, | 3217 | int kvm_arch_set_memory_region(struct kvm *kvm, |
| 3488 | struct module *module) | 3218 | struct kvm_userspace_memory_region *mem, |
| 3219 | struct kvm_memory_slot old, | ||
| 3220 | int user_alloc) | ||
| 3489 | { | 3221 | { |
| 3490 | int r; | 3222 | int npages = mem->memory_size >> PAGE_SHIFT; |
| 3491 | int cpu; | 3223 | struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; |
| 3492 | |||
| 3493 | if (kvm_x86_ops) { | ||
| 3494 | printk(KERN_ERR "kvm: already loaded the other module\n"); | ||
| 3495 | return -EEXIST; | ||
| 3496 | } | ||
| 3497 | 3224 | ||
| 3498 | if (!ops->cpu_has_kvm_support()) { | 3225 | /* To keep backward compatibility with older userspace,
| 3499 | printk(KERN_ERR "kvm: no hardware support\n"); | 3226 | * x86 needs to handle the !user_alloc case.
| 3500 | return -EOPNOTSUPP; | 3227 | */ |
| 3501 | } | 3228 | if (!user_alloc) { |
| 3502 | if (ops->disabled_by_bios()) { | 3229 | if (npages && !old.rmap) { |
| 3503 | printk(KERN_ERR "kvm: disabled by bios\n"); | 3230 | memslot->userspace_addr = do_mmap(NULL, 0, |
| 3504 | return -EOPNOTSUPP; | 3231 | npages * PAGE_SIZE, |
| 3505 | } | 3232 | PROT_READ | PROT_WRITE, |
| 3506 | 3233 | MAP_SHARED | MAP_ANONYMOUS, | |
| 3507 | kvm_x86_ops = ops; | 3234 | 0); |
| 3508 | 3235 | ||
| 3509 | r = kvm_x86_ops->hardware_setup(); | 3236 | if (IS_ERR((void *)memslot->userspace_addr)) |
| 3510 | if (r < 0) | 3237 | return PTR_ERR((void *)memslot->userspace_addr); |
| 3511 | goto out; | 3238 | } else { |
| 3512 | 3239 | if (!old.user_alloc && old.rmap) { | |
| 3513 | for_each_online_cpu(cpu) { | 3240 | int ret; |
| 3514 | smp_call_function_single(cpu, | 3241 | |
| 3515 | kvm_x86_ops->check_processor_compatibility, | 3242 | ret = do_munmap(current->mm, old.userspace_addr, |
| 3516 | &r, 0, 1); | 3243 | old.npages * PAGE_SIZE); |
| 3517 | if (r < 0) | 3244 | if (ret < 0) |
| 3518 | goto out_free_0; | 3245 | printk(KERN_WARNING |
| 3519 | } | 3246 | "kvm_vm_ioctl_set_memory_region: " |
| 3520 | 3247 | "failed to munmap memory\n"); | |
| 3521 | on_each_cpu(hardware_enable, NULL, 0, 1); | 3248 | } |
| 3522 | r = register_cpu_notifier(&kvm_cpu_notifier); | 3249 | } |
| 3523 | if (r) | ||
| 3524 | goto out_free_1; | ||
| 3525 | register_reboot_notifier(&kvm_reboot_notifier); | ||
| 3526 | |||
| 3527 | r = sysdev_class_register(&kvm_sysdev_class); | ||
| 3528 | if (r) | ||
| 3529 | goto out_free_2; | ||
| 3530 | |||
| 3531 | r = sysdev_register(&kvm_sysdev); | ||
| 3532 | if (r) | ||
| 3533 | goto out_free_3; | ||
| 3534 | |||
| 3535 | /* A kmem cache lets us meet the alignment requirements of fx_save. */ | ||
| 3536 | kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, | ||
| 3537 | __alignof__(struct kvm_vcpu), 0, 0); | ||
| 3538 | if (!kvm_vcpu_cache) { | ||
| 3539 | r = -ENOMEM; | ||
| 3540 | goto out_free_4; | ||
| 3541 | } | 3250 | } |
| 3542 | 3251 | ||
| 3543 | kvm_chardev_ops.owner = module; | 3252 | if (!kvm->arch.n_requested_mmu_pages) { |
| 3544 | 3253 | unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); | |
| 3545 | r = misc_register(&kvm_dev); | 3254 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); |
| 3546 | if (r) { | ||
| 3547 | printk(KERN_ERR "kvm: misc device register failed\n"); | ||
| 3548 | goto out_free; | ||
| 3549 | } | 3255 | } |
| 3550 | 3256 | ||
| 3551 | kvm_preempt_ops.sched_in = kvm_sched_in; | 3257 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); |
| 3552 | kvm_preempt_ops.sched_out = kvm_sched_out; | 3258 | kvm_flush_remote_tlbs(kvm); |
| 3553 | |||
| 3554 | return r; | ||
| 3555 | 3259 | ||
| 3556 | out_free: | 3260 | return 0; |
| 3557 | kmem_cache_destroy(kvm_vcpu_cache); | ||
| 3558 | out_free_4: | ||
| 3559 | sysdev_unregister(&kvm_sysdev); | ||
| 3560 | out_free_3: | ||
| 3561 | sysdev_class_unregister(&kvm_sysdev_class); | ||
| 3562 | out_free_2: | ||
| 3563 | unregister_reboot_notifier(&kvm_reboot_notifier); | ||
| 3564 | unregister_cpu_notifier(&kvm_cpu_notifier); | ||
| 3565 | out_free_1: | ||
| 3566 | on_each_cpu(hardware_disable, NULL, 0, 1); | ||
| 3567 | out_free_0: | ||
| 3568 | kvm_x86_ops->hardware_unsetup(); | ||
| 3569 | out: | ||
| 3570 | kvm_x86_ops = NULL; | ||
| 3571 | return r; | ||
| 3572 | } | 3261 | } |
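The user_alloc path is the normal one for current userspace: guest RAM is allocated in the caller's address space and handed to the kernel by address, while the do_mmap() fallback above exists only for pre-userspace_addr callers. A minimal sketch, with ram_size chosen arbitrarily:

	size_t ram_size = 64 << 20;		/* hypothetical: 64 MB of guest RAM */
	void *ram = mmap(NULL, ram_size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region mem = {
		.slot            = 0,
		.guest_phys_addr = 0,
		.memory_size     = ram_size,
		.userspace_addr  = (unsigned long)ram,
	};

	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);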
| 3573 | 3262 | ||
| 3574 | void kvm_exit_x86(void) | 3263 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) |
| 3575 | { | 3264 | { |
| 3576 | misc_deregister(&kvm_dev); | 3265 | return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE |
| 3577 | kmem_cache_destroy(kvm_vcpu_cache); | 3266 | || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED; |
| 3578 | sysdev_unregister(&kvm_sysdev); | ||
| 3579 | sysdev_class_unregister(&kvm_sysdev_class); | ||
| 3580 | unregister_reboot_notifier(&kvm_reboot_notifier); | ||
| 3581 | unregister_cpu_notifier(&kvm_cpu_notifier); | ||
| 3582 | on_each_cpu(hardware_disable, NULL, 0, 1); | ||
| 3583 | kvm_x86_ops->hardware_unsetup(); | ||
| 3584 | kvm_x86_ops = NULL; | ||
| 3585 | } | 3267 | } |
| 3586 | 3268 | ||
| 3587 | static __init int kvm_init(void) | 3269 | static void vcpu_kick_intr(void *info) |
| 3588 | { | 3270 | { |
| 3589 | static struct page *bad_page; | 3271 | #ifdef DEBUG |
| 3590 | int r; | 3272 | struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; |
| 3591 | 3273 | printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); | |
| 3592 | r = kvm_mmu_module_init(); | 3274 | #endif |
| 3593 | if (r) | ||
| 3594 | goto out4; | ||
| 3595 | |||
| 3596 | kvm_init_debug(); | ||
| 3597 | |||
| 3598 | kvm_init_msr_list(); | ||
| 3599 | |||
| 3600 | if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) { | ||
| 3601 | r = -ENOMEM; | ||
| 3602 | goto out; | ||
| 3603 | } | ||
| 3604 | |||
| 3605 | bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT; | ||
| 3606 | memset(__va(bad_page_address), 0, PAGE_SIZE); | ||
| 3607 | |||
| 3608 | return 0; | ||
| 3609 | |||
| 3610 | out: | ||
| 3611 | kvm_exit_debug(); | ||
| 3612 | kvm_mmu_module_exit(); | ||
| 3613 | out4: | ||
| 3614 | return r; | ||
| 3615 | } | 3275 | } |
| 3616 | 3276 | ||
| 3617 | static __exit void kvm_exit(void) | 3277 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) |
| 3618 | { | 3278 | { |
| 3619 | kvm_exit_debug(); | 3279 | int ipi_pcpu = vcpu->cpu; |
| 3620 | __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); | ||
| 3621 | kvm_mmu_module_exit(); | ||
| 3622 | } | ||
| 3623 | |||
| 3624 | module_init(kvm_init) | ||
| 3625 | module_exit(kvm_exit) | ||
| 3626 | 3280 | ||
| 3627 | EXPORT_SYMBOL_GPL(kvm_init_x86); | 3281 | if (waitqueue_active(&vcpu->wq)) { |
| 3628 | EXPORT_SYMBOL_GPL(kvm_exit_x86); | 3282 | wake_up_interruptible(&vcpu->wq); |
| 3283 | ++vcpu->stat.halt_wakeup; | ||
| 3284 | } | ||
| 3285 | if (vcpu->guest_mode) | ||
| 3286 | smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); | ||
| 3287 | } | ||
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c new file mode 100644 index 000000000000..79586003397a --- /dev/null +++ b/arch/x86/kvm/x86_emulate.c | |||
| @@ -0,0 +1,1912 @@ | |||
| 1 | /****************************************************************************** | ||
| 2 | * x86_emulate.c | ||
| 3 | * | ||
| 4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. | ||
| 5 | * | ||
| 6 | * Copyright (c) 2005 Keir Fraser | ||
| 7 | * | ||
| 8 | * Linux coding style, mod r/m decoder, segment base fixes, real-mode | ||
| 9 | * privileged instructions: | ||
| 10 | * | ||
| 11 | * Copyright (C) 2006 Qumranet | ||
| 12 | * | ||
| 13 | * Avi Kivity <avi@qumranet.com> | ||
| 14 | * Yaniv Kamay <yaniv@qumranet.com> | ||
| 15 | * | ||
| 16 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
| 17 | * the COPYING file in the top-level directory. | ||
| 18 | * | ||
| 19 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 | ||
| 20 | */ | ||
| 21 | |||
| 22 | #ifndef __KERNEL__ | ||
| 23 | #include <stdio.h> | ||
| 24 | #include <stdint.h> | ||
| 25 | #include <public/xen.h> | ||
| 26 | #define DPRINTF(_f, _a ...) printf(_f , ## _a) | ||
| 27 | #else | ||
| 28 | #include <linux/kvm_host.h> | ||
| 29 | #define DPRINTF(x...) do {} while (0) | ||
| 30 | #endif | ||
| 31 | #include <linux/module.h> | ||
| 32 | #include <asm/kvm_x86_emulate.h> | ||
| 33 | |||
| 34 | /* | ||
| 35 | * Opcode effective-address decode tables. | ||
| 36 | * Note that we only emulate instructions that have at least one memory | ||
| 37 | * operand (excluding implicit stack references). We assume that stack | ||
| 38 | * references and instruction fetches will never occur in special memory | ||
| 39 | * areas that require emulation. So, for example, 'mov <imm>,<reg>' need | ||
| 40 | * not be handled. | ||
| 41 | */ | ||
| 42 | |||
| 43 | /* Operand sizes: 8-bit operands or specified/overridden size. */ | ||
| 44 | #define ByteOp (1<<0) /* 8-bit operands. */ | ||
| 45 | /* Destination operand type. */ | ||
| 46 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ | ||
| 47 | #define DstReg (2<<1) /* Register operand. */ | ||
| 48 | #define DstMem (3<<1) /* Memory operand. */ | ||
| 49 | #define DstMask (3<<1) | ||
| 50 | /* Source operand type. */ | ||
| 51 | #define SrcNone (0<<3) /* No source operand. */ | ||
| 52 | #define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ | ||
| 53 | #define SrcReg (1<<3) /* Register operand. */ | ||
| 54 | #define SrcMem (2<<3) /* Memory operand. */ | ||
| 55 | #define SrcMem16 (3<<3) /* Memory operand (16-bit). */ | ||
| 56 | #define SrcMem32 (4<<3) /* Memory operand (32-bit). */ | ||
| 57 | #define SrcImm (5<<3) /* Immediate operand. */ | ||
| 58 | #define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ | ||
| 59 | #define SrcMask (7<<3) | ||
| 60 | /* Generic ModRM decode. */ | ||
| 61 | #define ModRM (1<<6) | ||
| 62 | /* Destination is only written; never read. */ | ||
| 63 | #define Mov (1<<7) | ||
| 64 | #define BitOp (1<<8) | ||
| 65 | #define MemAbs (1<<9) /* Memory operand is absolute displacement */ | ||
| 66 | #define String (1<<10) /* String instruction (rep capable) */ | ||
| 67 | #define Stack (1<<11) /* Stack instruction (push/pop) */ | ||
| 68 | |||
| 69 | static u16 opcode_table[256] = { | ||
| 70 | /* 0x00 - 0x07 */ | ||
| 71 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 72 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 73 | 0, 0, 0, 0, | ||
| 74 | /* 0x08 - 0x0F */ | ||
| 75 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 76 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 77 | 0, 0, 0, 0, | ||
| 78 | /* 0x10 - 0x17 */ | ||
| 79 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 80 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 81 | 0, 0, 0, 0, | ||
| 82 | /* 0x18 - 0x1F */ | ||
| 83 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 84 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 85 | 0, 0, 0, 0, | ||
| 86 | /* 0x20 - 0x27 */ | ||
| 87 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 88 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 89 | SrcImmByte, SrcImm, 0, 0, | ||
| 90 | /* 0x28 - 0x2F */ | ||
| 91 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 92 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 93 | 0, 0, 0, 0, | ||
| 94 | /* 0x30 - 0x37 */ | ||
| 95 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 96 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 97 | 0, 0, 0, 0, | ||
| 98 | /* 0x38 - 0x3F */ | ||
| 99 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 100 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 101 | 0, 0, 0, 0, | ||
| 102 | /* 0x40 - 0x47 */ | ||
| 103 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
| 104 | /* 0x48 - 0x4F */ | ||
| 105 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
| 106 | /* 0x50 - 0x57 */ | ||
| 107 | SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, | ||
| 108 | SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, | ||
| 109 | /* 0x58 - 0x5F */ | ||
| 110 | DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, | ||
| 111 | DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, | ||
| 112 | /* 0x60 - 0x67 */ | ||
| 113 | 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , | ||
| 114 | 0, 0, 0, 0, | ||
| 115 | /* 0x68 - 0x6F */ | ||
| 116 | 0, 0, ImplicitOps | Mov | Stack, 0, | ||
| 117 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ | ||
| 118 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ | ||
| 119 | /* 0x70 - 0x77 */ | ||
| 120 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 121 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 122 | /* 0x78 - 0x7F */ | ||
| 123 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 124 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 125 | /* 0x80 - 0x87 */ | ||
| 126 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
| 127 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
| 128 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 129 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 130 | /* 0x88 - 0x8F */ | ||
| 131 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, | ||
| 132 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 133 | 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, | ||
| 134 | /* 0x90 - 0x9F */ | ||
| 135 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 136 | 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, | ||
| 137 | /* 0xA0 - 0xA7 */ | ||
| 138 | ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, | ||
| 139 | ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, | ||
| 140 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | ||
| 141 | ByteOp | ImplicitOps | String, ImplicitOps | String, | ||
| 142 | /* 0xA8 - 0xAF */ | ||
| 143 | 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | ||
| 144 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | ||
| 145 | ByteOp | ImplicitOps | String, ImplicitOps | String, | ||
| 146 | /* 0xB0 - 0xBF */ | ||
| 147 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 148 | /* 0xC0 - 0xC7 */ | ||
| 149 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
| 150 | 0, ImplicitOps | Stack, 0, 0, | ||
| 151 | ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, | ||
| 152 | /* 0xC8 - 0xCF */ | ||
| 153 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 154 | /* 0xD0 - 0xD7 */ | ||
| 155 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
| 156 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
| 157 | 0, 0, 0, 0, | ||
| 158 | /* 0xD8 - 0xDF */ | ||
| 159 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 160 | /* 0xE0 - 0xE7 */ | ||
| 161 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 162 | /* 0xE8 - 0xEF */ | ||
| 163 | ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, | ||
| 164 | 0, 0, 0, 0, | ||
| 165 | /* 0xF0 - 0xF7 */ | ||
| 166 | 0, 0, 0, 0, | ||
| 167 | ImplicitOps, ImplicitOps, | ||
| 168 | ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, | ||
| 169 | /* 0xF8 - 0xFF */ | ||
| 170 | ImplicitOps, 0, ImplicitOps, ImplicitOps, | ||
| 171 | 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM | ||
| 172 | }; | ||
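Each table entry packs the attribute bits defined above into one u16 per opcode. A sketch (not in the patch) of how a hypothetical decode helper would unpack an entry, using 0x88 ("mov r/m8, reg8", stored above as ByteOp | DstMem | SrcReg | ModRM | Mov):

	u16 d = opcode_table[0x88];

	if (d == 0)
		return -1;		/* not emulated: punt back to the caller */
	if (d & ModRM)
		;			/* a ModRM byte follows: operands decode from it */
	if ((d & DstMask) == DstMem)
		;			/* destination is a memory operand */
	if ((d & SrcMask) == SrcReg)
		;			/* source is a register */
	if (d & Mov)
		;			/* write-only destination: skip the read-back */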
| 173 | |||
| 174 | static u16 twobyte_table[256] = { | ||
| 175 | /* 0x00 - 0x0F */ | ||
| 176 | 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, | ||
| 177 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, | ||
| 178 | /* 0x10 - 0x1F */ | ||
| 179 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, | ||
| 180 | /* 0x20 - 0x2F */ | ||
| 181 | ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, | ||
| 182 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 183 | /* 0x30 - 0x3F */ | ||
| 184 | ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 185 | /* 0x40 - 0x47 */ | ||
| 186 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 187 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 188 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 189 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 190 | /* 0x48 - 0x4F */ | ||
| 191 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 192 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 193 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 194 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 195 | /* 0x50 - 0x5F */ | ||
| 196 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 197 | /* 0x60 - 0x6F */ | ||
| 198 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 199 | /* 0x70 - 0x7F */ | ||
| 200 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 201 | /* 0x80 - 0x8F */ | ||
| 202 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 203 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 204 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 205 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 206 | /* 0x90 - 0x9F */ | ||
| 207 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 208 | /* 0xA0 - 0xA7 */ | ||
| 209 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
| 210 | /* 0xA8 - 0xAF */ | ||
| 211 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
| 212 | /* 0xB0 - 0xB7 */ | ||
| 213 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, | ||
| 214 | DstMem | SrcReg | ModRM | BitOp, | ||
| 215 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
| 216 | DstReg | SrcMem16 | ModRM | Mov, | ||
| 217 | /* 0xB8 - 0xBF */ | ||
| 218 | 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, | ||
| 219 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
| 220 | DstReg | SrcMem16 | ModRM | Mov, | ||
| 221 | /* 0xC0 - 0xCF */ | ||
| 222 | 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, | ||
| 223 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 224 | /* 0xD0 - 0xDF */ | ||
| 225 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 226 | /* 0xE0 - 0xEF */ | ||
| 227 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 228 | /* 0xF0 - 0xFF */ | ||
| 229 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
| 230 | }; | ||
| 231 | |||
| 232 | /* EFLAGS bit definitions. */ | ||
| 233 | #define EFLG_OF (1<<11) | ||
| 234 | #define EFLG_DF (1<<10) | ||
| 235 | #define EFLG_SF (1<<7) | ||
| 236 | #define EFLG_ZF (1<<6) | ||
| 237 | #define EFLG_AF (1<<4) | ||
| 238 | #define EFLG_PF (1<<2) | ||
| 239 | #define EFLG_CF (1<<0) | ||
| 240 | |||
| 241 | /* | ||
| 242 | * Instruction emulation: | ||
| 243 | * Most instructions are emulated directly via a fragment of inline assembly | ||
| 244 | * code. This allows us to save/restore EFLAGS and thus very easily pick up | ||
| 245 | * any modified flags. | ||
| 246 | */ | ||
| 247 | |||
| 248 | #if defined(CONFIG_X86_64) | ||
| 249 | #define _LO32 "k" /* force 32-bit operand */ | ||
| 250 | #define _STK "%%rsp" /* stack pointer */ | ||
| 251 | #elif defined(__i386__) | ||
| 252 | #define _LO32 "" /* force 32-bit operand */ | ||
| 253 | #define _STK "%%esp" /* stack pointer */ | ||
| 254 | #endif | ||
| 255 | |||
| 256 | /* | ||
| 257 | * These EFLAGS bits are restored from saved value during emulation, and | ||
| 258 | * any changes are written back to the saved value after emulation. | ||
| 259 | */ | ||
| 260 | #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) | ||
| 261 | |||
| 262 | /* Before executing instruction: restore necessary bits in EFLAGS. */ | ||
| 263 | #define _PRE_EFLAGS(_sav, _msk, _tmp) \ | ||
| 264 | /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ | ||
| 265 | "movl %"_sav",%"_LO32 _tmp"; " \ | ||
| 266 | "push %"_tmp"; " \ | ||
| 267 | "push %"_tmp"; " \ | ||
| 268 | "movl %"_msk",%"_LO32 _tmp"; " \ | ||
| 269 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
| 270 | "pushf; " \ | ||
| 271 | "notl %"_LO32 _tmp"; " \ | ||
| 272 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
| 273 | "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ | ||
| 274 | "pop %"_tmp"; " \ | ||
| 275 | "orl %"_LO32 _tmp",("_STK"); " \ | ||
| 276 | "popf; " \ | ||
| 277 | "pop %"_sav"; " | ||
| 278 | |||
| 279 | /* After executing instruction: write-back necessary bits in EFLAGS. */ | ||
| 280 | #define _POST_EFLAGS(_sav, _msk, _tmp) \ | ||
| 281 | /* _sav |= EFLAGS & _msk; */ \ | ||
| 282 | "pushf; " \ | ||
| 283 | "pop %"_tmp"; " \ | ||
| 284 | "andl %"_msk",%"_LO32 _tmp"; " \ | ||
| 285 | "orl %"_LO32 _tmp",%"_sav"; " | ||
| 286 | |||
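To make the pushf/pop trick concrete, a standalone userspace sketch (x86-64, GCC inline asm, AT&T syntax); unlike _PRE_EFLAGS above it does not seed EFLAGS from the guest's saved value first, it only captures what the operation produced:

#include <stdio.h>

static unsigned long emulate_add(unsigned long dst, unsigned long src,
				 unsigned long *eflags)
{
	unsigned long flags;

	__asm__ __volatile__ (
		"add %2, %0\n\t"	/* the operation being emulated */
		"pushf\n\t"		/* push the flags it produced */
		"pop %1"		/* pull them into a register */
		: "+r" (dst), "=r" (flags)
		: "r" (src)
		: "cc");
	*eflags = flags;
	return dst;
}

int main(void)
{
	unsigned long fl;
	unsigned long r = emulate_add(~0UL, 1, &fl);

	printf("result=%lu CF=%lu\n", r, fl & 1);	/* wrap past zero sets CF */
	return 0;
}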
| 287 | /* Raw emulation: instruction has two explicit operands. */ | ||
| 288 | #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
| 289 | do { \ | ||
| 290 | unsigned long _tmp; \ | ||
| 291 | \ | ||
| 292 | switch ((_dst).bytes) { \ | ||
| 293 | case 2: \ | ||
| 294 | __asm__ __volatile__ ( \ | ||
| 295 | _PRE_EFLAGS("0", "4", "2") \ | ||
| 296 | _op"w %"_wx"3,%1; " \ | ||
| 297 | _POST_EFLAGS("0", "4", "2") \ | ||
| 298 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
| 299 | "=&r" (_tmp) \ | ||
| 300 | : _wy ((_src).val), "i" (EFLAGS_MASK)); \ | ||
| 301 | break; \ | ||
| 302 | case 4: \ | ||
| 303 | __asm__ __volatile__ ( \ | ||
| 304 | _PRE_EFLAGS("0", "4", "2") \ | ||
| 305 | _op"l %"_lx"3,%1; " \ | ||
| 306 | _POST_EFLAGS("0", "4", "2") \ | ||
| 307 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
| 308 | "=&r" (_tmp) \ | ||
| 309 | : _ly ((_src).val), "i" (EFLAGS_MASK)); \ | ||
| 310 | break; \ | ||
| 311 | case 8: \ | ||
| 312 | __emulate_2op_8byte(_op, _src, _dst, \ | ||
| 313 | _eflags, _qx, _qy); \ | ||
| 314 | break; \ | ||
| 315 | } \ | ||
| 316 | } while (0) | ||
| 317 | |||
| 318 | #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
| 319 | do { \ | ||
| 320 | unsigned long _tmp; \ | ||
| 321 | switch ((_dst).bytes) { \ | ||
| 322 | case 1: \ | ||
| 323 | __asm__ __volatile__ ( \ | ||
| 324 | _PRE_EFLAGS("0", "4", "2") \ | ||
| 325 | _op"b %"_bx"3,%1; " \ | ||
| 326 | _POST_EFLAGS("0", "4", "2") \ | ||
| 327 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
| 328 | "=&r" (_tmp) \ | ||
| 329 | : _by ((_src).val), "i" (EFLAGS_MASK)); \ | ||
| 330 | break; \ | ||
| 331 | default: \ | ||
| 332 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
| 333 | _wx, _wy, _lx, _ly, _qx, _qy); \ | ||
| 334 | break; \ | ||
| 335 | } \ | ||
| 336 | } while (0) | ||
| 337 | |||
| 338 | /* Source operand is byte-sized and may be restricted to just %cl. */ | ||
| 339 | #define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ | ||
| 340 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
| 341 | "b", "c", "b", "c", "b", "c", "b", "c") | ||
| 342 | |||
| 343 | /* Source operand is byte, word, long or quad sized. */ | ||
| 344 | #define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ | ||
| 345 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
| 346 | "b", "q", "w", "r", _LO32, "r", "", "r") | ||
| 347 | |||
| 348 | /* Source operand is word, long or quad sized. */ | ||
| 349 | #define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ | ||
| 350 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
| 351 | "w", "r", _LO32, "r", "", "r") | ||
| 352 | |||
| 353 | /* Instruction has only one explicit operand (no source operand). */ | ||
| 354 | #define emulate_1op(_op, _dst, _eflags) \ | ||
| 355 | do { \ | ||
| 356 | unsigned long _tmp; \ | ||
| 357 | \ | ||
| 358 | switch ((_dst).bytes) { \ | ||
| 359 | case 1: \ | ||
| 360 | __asm__ __volatile__ ( \ | ||
| 361 | _PRE_EFLAGS("0", "3", "2") \ | ||
| 362 | _op"b %1; " \ | ||
| 363 | _POST_EFLAGS("0", "3", "2") \ | ||
| 364 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
| 365 | "=&r" (_tmp) \ | ||
| 366 | : "i" (EFLAGS_MASK)); \ | ||
| 367 | break; \ | ||
| 368 | case 2: \ | ||
| 369 | __asm__ __volatile__ ( \ | ||
| 370 | _PRE_EFLAGS("0", "3", "2") \ | ||
| 371 | _op"w %1; " \ | ||
| 372 | _POST_EFLAGS("0", "3", "2") \ | ||
| 373 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
| 374 | "=&r" (_tmp) \ | ||
| 375 | : "i" (EFLAGS_MASK)); \ | ||
| 376 | break; \ | ||
| 377 | case 4: \ | ||
| 378 | __asm__ __volatile__ ( \ | ||
| 379 | _PRE_EFLAGS("0", "3", "2") \ | ||
| 380 | _op"l %1; " \ | ||
| 381 | _POST_EFLAGS("0", "3", "2") \ | ||
| 382 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
| 383 | "=&r" (_tmp) \ | ||
| 384 | : "i" (EFLAGS_MASK)); \ | ||
| 385 | break; \ | ||
| 386 | case 8: \ | ||
| 387 | __emulate_1op_8byte(_op, _dst, _eflags); \ | ||
| 388 | break; \ | ||
| 389 | } \ | ||
| 390 | } while (0) | ||
| 391 | |||
| 392 | /* Emulate an instruction with quadword operands (x86/64 only). */ | ||
| 393 | #if defined(CONFIG_X86_64) | ||
| 394 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ | ||
| 395 | do { \ | ||
| 396 | __asm__ __volatile__ ( \ | ||
| 397 | _PRE_EFLAGS("0", "4", "2") \ | ||
| 398 | _op"q %"_qx"3,%1; " \ | ||
| 399 | _POST_EFLAGS("0", "4", "2") \ | ||
| 400 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
| 401 | : _qy ((_src).val), "i" (EFLAGS_MASK)); \ | ||
| 402 | } while (0) | ||
| 403 | |||
| 404 | #define __emulate_1op_8byte(_op, _dst, _eflags) \ | ||
| 405 | do { \ | ||
| 406 | __asm__ __volatile__ ( \ | ||
| 407 | _PRE_EFLAGS("0", "3", "2") \ | ||
| 408 | _op"q %1; " \ | ||
| 409 | _POST_EFLAGS("0", "3", "2") \ | ||
| 410 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
| 411 | : "i" (EFLAGS_MASK)); \ | ||
| 412 | } while (0) | ||
| 413 | |||
| 414 | #elif defined(__i386__) | ||
| 415 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) | ||
| 416 | #define __emulate_1op_8byte(_op, _dst, _eflags) | ||
| 417 | #endif /* __i386__ */ | ||
| 418 | |||
| 419 | /* Fetch next part of the instruction being emulated. */ | ||
| 420 | #define insn_fetch(_type, _size, _eip) \ | ||
| 421 | ({ unsigned long _x; \ | ||
| 422 | rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ | ||
| 423 | if (rc != 0) \ | ||
| 424 | goto done; \ | ||
| 425 | (_eip) += (_size); \ | ||
| 426 | (_type)_x; \ | ||
| 427 | }) | ||
| 428 | |||
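insn_fetch() leans on GNU statement expressions: the ({ ... }) block can run several statements, jump to the caller's done label on error, and still yield a value where a plain expression is expected. A toy buffer-backed fetcher built the same way; FETCH and the sample instruction bytes are hypothetical:

/* Sketch of the statement-expression fetch pattern.  Bailing out via
 * goto from inside an initializer is the same trick insn_fetch()
 * plays with its enclosing function's done label. */
#include <stdio.h>
#include <string.h>

#define FETCH(_type, _size, _pos, _buf, _len)				\
({	_type _x = 0;							\
	if ((_pos) + (_size) > (_len))					\
		goto done;						\
	memcpy(&_x, (_buf) + (_pos), (_size));				\
	(_pos) += (_size);						\
	_x;								\
})

int main(void)
{
	unsigned char insn[] = { 0x66, 0xb8, 0x34, 0x12 };
	unsigned pos = 0;

	unsigned char prefix = FETCH(unsigned char, 1, pos, insn, sizeof(insn));
	unsigned char opcode = FETCH(unsigned char, 1, pos, insn, sizeof(insn));
	unsigned short imm   = FETCH(unsigned short, 2, pos, insn, sizeof(insn));

	printf("prefix=%02x opcode=%02x imm=%04x\n", prefix, opcode, imm);
	return 0;
done:
	fprintf(stderr, "ran off the end of the instruction\n");
	return 1;
}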
| 429 | /* Access/update address held in a register, based on addressing mode. */ | ||
| 430 | #define address_mask(reg) \ | ||
| 431 | ((c->ad_bytes == sizeof(unsigned long)) ? \ | ||
| 432 | (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1))) | ||
| 433 | #define register_address(base, reg) \ | ||
| 434 | ((base) + address_mask(reg)) | ||
| 435 | #define register_address_increment(reg, inc) \ | ||
| 436 | do { \ | ||
| 437 | /* signed type ensures sign extension to long */ \ | ||
| 438 | int _inc = (inc); \ | ||
| 439 | if (c->ad_bytes == sizeof(unsigned long)) \ | ||
| 440 | (reg) += _inc; \ | ||
| 441 | else \ | ||
| 442 | (reg) = ((reg) & \ | ||
| 443 | ~((1UL << (c->ad_bytes << 3)) - 1)) | \ | ||
| 444 | (((reg) + _inc) & \ | ||
| 445 | ((1UL << (c->ad_bytes << 3)) - 1)); \ | ||
| 446 | } while (0) | ||
| 447 | |||
| 448 | #define JMP_REL(rel) \ | ||
| 449 | do { \ | ||
| 450 | register_address_increment(c->eip, rel); \ | ||
| 451 | } while (0) | ||
| 452 | |||
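register_address_increment() confines the addition to the low ad_bytes * 8 bits and preserves the rest of the register, which is what real 16- and 32-bit addressing requires: a 16-bit SI wraps at 0xffff without touching the upper bits. A self-contained model of that masking; reg_inc is illustrative and assumes a 64-bit unsigned long:

#include <stdio.h>

/* Same arithmetic as register_address_increment(), minus the
 * decode_cache plumbing. */
static unsigned long reg_inc(unsigned long reg, int inc, int ad_bytes)
{
	unsigned long mask;

	if (ad_bytes == sizeof(unsigned long))
		return reg + inc;
	mask = (1UL << (ad_bytes << 3)) - 1;
	return (reg & ~mask) | ((reg + inc) & mask);
}

int main(void)
{
	/* 16-bit wrap: low word 0xffff -> 0x0000, high bits intact. */
	printf("%#lx\n", reg_inc(0x12340000ffffUL, 1, 2));
	return 0;
}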
| 453 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | ||
| 454 | struct x86_emulate_ops *ops, | ||
| 455 | unsigned long linear, u8 *dest) | ||
| 456 | { | ||
| 457 | struct fetch_cache *fc = &ctxt->decode.fetch; | ||
| 458 | int rc; | ||
| 459 | int size; | ||
| 460 | |||
| 461 | if (linear < fc->start || linear >= fc->end) { | ||
| 462 | size = min(15UL, PAGE_SIZE - offset_in_page(linear)); | ||
| 463 | rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); | ||
| 464 | if (rc) | ||
| 465 | return rc; | ||
| 466 | fc->start = linear; | ||
| 467 | fc->end = linear + size; | ||
| 468 | } | ||
| 469 | *dest = fc->data[linear - fc->start]; | ||
| 470 | return 0; | ||
| 471 | } | ||
| 472 | |||
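The refill above caps each read at both the architectural 15-byte instruction-length limit and the end of the current page, so a prefetch never strays into a possibly-unmapped neighbouring page. A quick check of that size computation, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define offset_in_page(p)	((unsigned long)(p) & (PAGE_SIZE - 1))
#define min(a, b)		((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long page_start = 0x1000;	/* start of a page */
	unsigned long page_tail  = 0x1ffb;	/* 5 bytes before the end */

	printf("%lu\n", min(15UL, PAGE_SIZE - offset_in_page(page_start)));	/* 15 */
	printf("%lu\n", min(15UL, PAGE_SIZE - offset_in_page(page_tail)));	/* 5 */
	return 0;
}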
| 473 | static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, | ||
| 474 | struct x86_emulate_ops *ops, | ||
| 475 | unsigned long eip, void *dest, unsigned size) | ||
| 476 | { | ||
| 477 | int rc = 0; | ||
| 478 | |||
| 479 | eip += ctxt->cs_base; | ||
| 480 | while (size--) { | ||
| 481 | rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); | ||
| 482 | if (rc) | ||
| 483 | return rc; | ||
| 484 | } | ||
| 485 | return 0; | ||
| 486 | } | ||
| 487 | |||
| 488 | /* | ||
| 489 | * Given the 'reg' portion of a ModRM byte, and a register block, return a | ||
| 490 | * pointer into the block that addresses the relevant register. | ||
| 491 | * @highbyte_regs specifies whether to decode AH,CH,DH,BH. | ||
| 492 | */ | ||
| 493 | static void *decode_register(u8 modrm_reg, unsigned long *regs, | ||
| 494 | int highbyte_regs) | ||
| 495 | { | ||
| 496 | void *p; | ||
| 497 | |||
| 498 | p = ®s[modrm_reg]; | ||
| 499 | if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) | ||
| 500 | p = (unsigned char *)®s[modrm_reg & 3] + 1; | ||
| 501 | return p; | ||
| 502 | } | ||
| 503 | |||
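With no REX prefix, register numbers 4-7 select the legacy high-byte registers AH/CH/DH/BH, which live at byte offset 1 of the first four GPR slots; that is why the code masks with & 3 and adds one. A little-endian demonstration, with a plain array standing in for the register file:

#include <stdio.h>

int main(void)
{
	unsigned long regs[8] = { 0x1122334455667788UL };	/* "RAX" */
	/* reg number 4 without REX means AH: byte 1 of regs[4 & 3]. */
	unsigned char *ah = (unsigned char *)&regs[4 & 3] + 1;

	printf("AH = %02x\n", *ah);	/* 0x77 on little-endian hosts */
	return 0;
}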
| 504 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, | ||
| 505 | struct x86_emulate_ops *ops, | ||
| 506 | void *ptr, | ||
| 507 | u16 *size, unsigned long *address, int op_bytes) | ||
| 508 | { | ||
| 509 | int rc; | ||
| 510 | |||
| 511 | if (op_bytes == 2) | ||
| 512 | op_bytes = 3; | ||
| 513 | *address = 0; | ||
| 514 | rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, | ||
| 515 | ctxt->vcpu); | ||
| 516 | if (rc) | ||
| 517 | return rc; | ||
| 518 | rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, | ||
| 519 | ctxt->vcpu); | ||
| 520 | return rc; | ||
| 521 | } | ||
| 522 | |||
| 523 | static int test_cc(unsigned int condition, unsigned int flags) | ||
| 524 | { | ||
| 525 | int rc = 0; | ||
| 526 | |||
| 527 | switch ((condition & 15) >> 1) { | ||
| 528 | case 0: /* o */ | ||
| 529 | rc |= (flags & EFLG_OF); | ||
| 530 | break; | ||
| 531 | case 1: /* b/c/nae */ | ||
| 532 | rc |= (flags & EFLG_CF); | ||
| 533 | break; | ||
| 534 | case 2: /* z/e */ | ||
| 535 | rc |= (flags & EFLG_ZF); | ||
| 536 | break; | ||
| 537 | case 3: /* be/na */ | ||
| 538 | rc |= (flags & (EFLG_CF|EFLG_ZF)); | ||
| 539 | break; | ||
| 540 | case 4: /* s */ | ||
| 541 | rc |= (flags & EFLG_SF); | ||
| 542 | break; | ||
| 543 | case 5: /* p/pe */ | ||
| 544 | rc |= (flags & EFLG_PF); | ||
| 545 | break; | ||
| 546 | case 7: /* le/ng */ | ||
| 547 | rc |= (flags & EFLG_ZF); | ||
| 548 | /* fall through */ | ||
| 549 | case 6: /* l/nge */ | ||
| 550 | rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); | ||
| 551 | break; | ||
| 552 | } | ||
| 553 | |||
| 554 | /* Odd condition identifiers (lsb == 1) have inverted sense. */ | ||
| 555 | return (!!rc ^ (condition & 1)); | ||
| 556 | } | ||
| 557 | |||
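test_cc() folds the sixteen x86 condition codes into eight flag tests: bits 3:1 of the condition pick the test and the low bit inverts its sense, which is why JZ (0x4) and JNZ (0x5) share one case. A minimal re-implementation of just the z/e pair; my_test_cc and its EFLG_ZF constant mirror, but are not, the kernel's definitions:

#include <stdio.h>

#define EFLG_ZF	(1 << 6)

static int my_test_cc(unsigned cond, unsigned flags)
{
	int rc = 0;

	if (((cond & 15) >> 1) == 2)	/* the z/e pair only */
		rc = flags & EFLG_ZF;
	return !!rc ^ (cond & 1);	/* odd condition: inverted sense */
}

int main(void)
{
	printf("jz  with ZF set:   %d\n", my_test_cc(0x4, EFLG_ZF));	/* 1 */
	printf("jnz with ZF set:   %d\n", my_test_cc(0x5, EFLG_ZF));	/* 0 */
	printf("jnz with ZF clear: %d\n", my_test_cc(0x5, 0));		/* 1 */
	return 0;
}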
| 558 | static void decode_register_operand(struct operand *op, | ||
| 559 | struct decode_cache *c, | ||
| 560 | int inhibit_bytereg) | ||
| 561 | { | ||
| 562 | unsigned reg = c->modrm_reg; | ||
| 563 | int highbyte_regs = c->rex_prefix == 0; | ||
| 564 | |||
| 565 | if (!(c->d & ModRM)) | ||
| 566 | reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); | ||
| 567 | op->type = OP_REG; | ||
| 568 | if ((c->d & ByteOp) && !inhibit_bytereg) { | ||
| 569 | op->ptr = decode_register(reg, c->regs, highbyte_regs); | ||
| 570 | op->val = *(u8 *)op->ptr; | ||
| 571 | op->bytes = 1; | ||
| 572 | } else { | ||
| 573 | op->ptr = decode_register(reg, c->regs, 0); | ||
| 574 | op->bytes = c->op_bytes; | ||
| 575 | switch (op->bytes) { | ||
| 576 | case 2: | ||
| 577 | op->val = *(u16 *)op->ptr; | ||
| 578 | break; | ||
| 579 | case 4: | ||
| 580 | op->val = *(u32 *)op->ptr; | ||
| 581 | break; | ||
| 582 | case 8: | ||
| 583 | op->val = *(u64 *) op->ptr; | ||
| 584 | break; | ||
| 585 | } | ||
| 586 | } | ||
| 587 | op->orig_val = op->val; | ||
| 588 | } | ||
| 589 | |||
| 590 | static int decode_modrm(struct x86_emulate_ctxt *ctxt, | ||
| 591 | struct x86_emulate_ops *ops) | ||
| 592 | { | ||
| 593 | struct decode_cache *c = &ctxt->decode; | ||
| 594 | u8 sib; | ||
| 595 | int index_reg = 0, base_reg = 0, scale, rip_relative = 0; | ||
| 596 | int rc = 0; | ||
| 597 | |||
| 598 | if (c->rex_prefix) { | ||
| 599 | c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ | ||
| 600 | index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ | ||
| 601 | c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */ | ||
| 602 | } | ||
| 603 | |||
| 604 | c->modrm = insn_fetch(u8, 1, c->eip); | ||
| 605 | c->modrm_mod |= (c->modrm & 0xc0) >> 6; | ||
| 606 | c->modrm_reg |= (c->modrm & 0x38) >> 3; | ||
| 607 | c->modrm_rm |= (c->modrm & 0x07); | ||
| 608 | c->modrm_ea = 0; | ||
| 609 | c->use_modrm_ea = 1; | ||
| 610 | |||
| 611 | if (c->modrm_mod == 3) { | ||
| 612 | c->modrm_val = *(unsigned long *) | ||
| 613 | decode_register(c->modrm_rm, c->regs, c->d & ByteOp); | ||
| 614 | return rc; | ||
| 615 | } | ||
| 616 | |||
| 617 | if (c->ad_bytes == 2) { | ||
| 618 | unsigned bx = c->regs[VCPU_REGS_RBX]; | ||
| 619 | unsigned bp = c->regs[VCPU_REGS_RBP]; | ||
| 620 | unsigned si = c->regs[VCPU_REGS_RSI]; | ||
| 621 | unsigned di = c->regs[VCPU_REGS_RDI]; | ||
| 622 | |||
| 623 | /* 16-bit ModR/M decode. */ | ||
| 624 | switch (c->modrm_mod) { | ||
| 625 | case 0: | ||
| 626 | if (c->modrm_rm == 6) | ||
| 627 | c->modrm_ea += insn_fetch(u16, 2, c->eip); | ||
| 628 | break; | ||
| 629 | case 1: | ||
| 630 | c->modrm_ea += insn_fetch(s8, 1, c->eip); | ||
| 631 | break; | ||
| 632 | case 2: | ||
| 633 | c->modrm_ea += insn_fetch(u16, 2, c->eip); | ||
| 634 | break; | ||
| 635 | } | ||
| 636 | switch (c->modrm_rm) { | ||
| 637 | case 0: | ||
| 638 | c->modrm_ea += bx + si; | ||
| 639 | break; | ||
| 640 | case 1: | ||
| 641 | c->modrm_ea += bx + di; | ||
| 642 | break; | ||
| 643 | case 2: | ||
| 644 | c->modrm_ea += bp + si; | ||
| 645 | break; | ||
| 646 | case 3: | ||
| 647 | c->modrm_ea += bp + di; | ||
| 648 | break; | ||
| 649 | case 4: | ||
| 650 | c->modrm_ea += si; | ||
| 651 | break; | ||
| 652 | case 5: | ||
| 653 | c->modrm_ea += di; | ||
| 654 | break; | ||
| 655 | case 6: | ||
| 656 | if (c->modrm_mod != 0) | ||
| 657 | c->modrm_ea += bp; | ||
| 658 | break; | ||
| 659 | case 7: | ||
| 660 | c->modrm_ea += bx; | ||
| 661 | break; | ||
| 662 | } | ||
| 663 | if (c->modrm_rm == 2 || c->modrm_rm == 3 || | ||
| 664 | (c->modrm_rm == 6 && c->modrm_mod != 0)) | ||
| 665 | if (!c->override_base) | ||
| 666 | c->override_base = &ctxt->ss_base; | ||
| 667 | c->modrm_ea = (u16)c->modrm_ea; | ||
| 668 | } else { | ||
| 669 | /* 32/64-bit ModR/M decode. */ | ||
| 670 | switch (c->modrm_rm) { | ||
| 671 | case 4: | ||
| 672 | case 12: | ||
| 673 | sib = insn_fetch(u8, 1, c->eip); | ||
| 674 | index_reg |= (sib >> 3) & 7; | ||
| 675 | base_reg |= sib & 7; | ||
| 676 | scale = sib >> 6; | ||
| 677 | |||
| 678 | switch (base_reg) { | ||
| 679 | case 5: | ||
| 680 | if (c->modrm_mod != 0) | ||
| 681 | c->modrm_ea += c->regs[base_reg]; | ||
| 682 | else | ||
| 683 | c->modrm_ea += | ||
| 684 | insn_fetch(s32, 4, c->eip); | ||
| 685 | break; | ||
| 686 | default: | ||
| 687 | c->modrm_ea += c->regs[base_reg]; | ||
| 688 | } | ||
| 689 | switch (index_reg) { | ||
| 690 | case 4: | ||
| 691 | break; | ||
| 692 | default: | ||
| 693 | c->modrm_ea += c->regs[index_reg] << scale; | ||
| 694 | } | ||
| 695 | break; | ||
| 696 | case 5: | ||
| 697 | if (c->modrm_mod != 0) | ||
| 698 | c->modrm_ea += c->regs[c->modrm_rm]; | ||
| 699 | else if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
| 700 | rip_relative = 1; | ||
| 701 | break; | ||
| 702 | default: | ||
| 703 | c->modrm_ea += c->regs[c->modrm_rm]; | ||
| 704 | break; | ||
| 705 | } | ||
| 706 | switch (c->modrm_mod) { | ||
| 707 | case 0: | ||
| 708 | if (c->modrm_rm == 5) | ||
| 709 | c->modrm_ea += insn_fetch(s32, 4, c->eip); | ||
| 710 | break; | ||
| 711 | case 1: | ||
| 712 | c->modrm_ea += insn_fetch(s8, 1, c->eip); | ||
| 713 | break; | ||
| 714 | case 2: | ||
| 715 | c->modrm_ea += insn_fetch(s32, 4, c->eip); | ||
| 716 | break; | ||
| 717 | } | ||
| 718 | } | ||
| 719 | if (rip_relative) { | ||
| 720 | c->modrm_ea += c->eip; | ||
| 721 | switch (c->d & SrcMask) { | ||
| 722 | case SrcImmByte: | ||
| 723 | c->modrm_ea += 1; | ||
| 724 | break; | ||
| 725 | case SrcImm: | ||
| 726 | if (c->d & ByteOp) | ||
| 727 | c->modrm_ea += 1; | ||
| 728 | else | ||
| 729 | if (c->op_bytes == 8) | ||
| 730 | c->modrm_ea += 4; | ||
| 731 | else | ||
| 732 | c->modrm_ea += c->op_bytes; | ||
| 733 | } | ||
| 734 | } | ||
| 735 | done: | ||
| 736 | return rc; | ||
| 737 | } | ||
| 738 | |||
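For reference, the ModRM byte decoded above packs three fields: mod in bits 7:6, reg in bits 5:3 and rm in bits 2:0, with REX.R, REX.X and REX.B widening reg, index and base to four bits each. Field extraction on a sample byte:

#include <stdio.h>

int main(void)
{
	unsigned char modrm = 0x55;	/* binary 01 010 101 */

	printf("mod=%u reg=%u rm=%u\n",
	       (modrm & 0xc0) >> 6,	/* 1: disp8 follows */
	       (modrm & 0x38) >> 3,	/* 2 */
	       modrm & 0x07);		/* 5 */
	return 0;
}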
| 739 | static int decode_abs(struct x86_emulate_ctxt *ctxt, | ||
| 740 | struct x86_emulate_ops *ops) | ||
| 741 | { | ||
| 742 | struct decode_cache *c = &ctxt->decode; | ||
| 743 | int rc = 0; | ||
| 744 | |||
| 745 | switch (c->ad_bytes) { | ||
| 746 | case 2: | ||
| 747 | c->modrm_ea = insn_fetch(u16, 2, c->eip); | ||
| 748 | break; | ||
| 749 | case 4: | ||
| 750 | c->modrm_ea = insn_fetch(u32, 4, c->eip); | ||
| 751 | break; | ||
| 752 | case 8: | ||
| 753 | c->modrm_ea = insn_fetch(u64, 8, c->eip); | ||
| 754 | break; | ||
| 755 | } | ||
| 756 | done: | ||
| 757 | return rc; | ||
| 758 | } | ||
| 759 | |||
| 760 | int | ||
| 761 | x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
| 762 | { | ||
| 763 | struct decode_cache *c = &ctxt->decode; | ||
| 764 | int rc = 0; | ||
| 765 | int mode = ctxt->mode; | ||
| 766 | int def_op_bytes, def_ad_bytes; | ||
| 767 | |||
| 768 | /* Shadow copy of register state. Committed on successful emulation. */ | ||
| 769 | |||
| 770 | memset(c, 0, sizeof(struct decode_cache)); | ||
| 771 | c->eip = ctxt->vcpu->arch.rip; | ||
| 772 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); | ||
| 773 | |||
| 774 | switch (mode) { | ||
| 775 | case X86EMUL_MODE_REAL: | ||
| 776 | case X86EMUL_MODE_PROT16: | ||
| 777 | def_op_bytes = def_ad_bytes = 2; | ||
| 778 | break; | ||
| 779 | case X86EMUL_MODE_PROT32: | ||
| 780 | def_op_bytes = def_ad_bytes = 4; | ||
| 781 | break; | ||
| 782 | #ifdef CONFIG_X86_64 | ||
| 783 | case X86EMUL_MODE_PROT64: | ||
| 784 | def_op_bytes = 4; | ||
| 785 | def_ad_bytes = 8; | ||
| 786 | break; | ||
| 787 | #endif | ||
| 788 | default: | ||
| 789 | return -1; | ||
| 790 | } | ||
| 791 | |||
| 792 | c->op_bytes = def_op_bytes; | ||
| 793 | c->ad_bytes = def_ad_bytes; | ||
| 794 | |||
| 795 | /* Legacy prefixes. */ | ||
| 796 | for (;;) { | ||
| 797 | switch (c->b = insn_fetch(u8, 1, c->eip)) { | ||
| 798 | case 0x66: /* operand-size override */ | ||
| 799 | /* switch between 2/4 bytes */ | ||
| 800 | c->op_bytes = def_op_bytes ^ 6; | ||
| 801 | break; | ||
| 802 | case 0x67: /* address-size override */ | ||
| 803 | if (mode == X86EMUL_MODE_PROT64) | ||
| 804 | /* switch between 4/8 bytes */ | ||
| 805 | c->ad_bytes = def_ad_bytes ^ 12; | ||
| 806 | else | ||
| 807 | /* switch between 2/4 bytes */ | ||
| 808 | c->ad_bytes = def_ad_bytes ^ 6; | ||
| 809 | break; | ||
| 810 | case 0x2e: /* CS override */ | ||
| 811 | c->override_base = &ctxt->cs_base; | ||
| 812 | break; | ||
| 813 | case 0x3e: /* DS override */ | ||
| 814 | c->override_base = &ctxt->ds_base; | ||
| 815 | break; | ||
| 816 | case 0x26: /* ES override */ | ||
| 817 | c->override_base = &ctxt->es_base; | ||
| 818 | break; | ||
| 819 | case 0x64: /* FS override */ | ||
| 820 | c->override_base = &ctxt->fs_base; | ||
| 821 | break; | ||
| 822 | case 0x65: /* GS override */ | ||
| 823 | c->override_base = &ctxt->gs_base; | ||
| 824 | break; | ||
| 825 | case 0x36: /* SS override */ | ||
| 826 | c->override_base = &ctxt->ss_base; | ||
| 827 | break; | ||
| 828 | case 0x40 ... 0x4f: /* REX */ | ||
| 829 | if (mode != X86EMUL_MODE_PROT64) | ||
| 830 | goto done_prefixes; | ||
| 831 | c->rex_prefix = c->b; | ||
| 832 | continue; | ||
| 833 | case 0xf0: /* LOCK */ | ||
| 834 | c->lock_prefix = 1; | ||
| 835 | break; | ||
| 836 | case 0xf2: /* REPNE/REPNZ */ | ||
| 837 | c->rep_prefix = REPNE_PREFIX; | ||
| 838 | break; | ||
| 839 | case 0xf3: /* REP/REPE/REPZ */ | ||
| 840 | c->rep_prefix = REPE_PREFIX; | ||
| 841 | break; | ||
| 842 | default: | ||
| 843 | goto done_prefixes; | ||
| 844 | } | ||
| 845 | |||
| 846 | /* Any legacy prefix after a REX prefix nullifies its effect. */ | ||
| 847 | |||
| 848 | c->rex_prefix = 0; | ||
| 849 | } | ||
| 850 | |||
| 851 | done_prefixes: | ||
| 852 | |||
| 853 | /* REX prefix. */ | ||
| 854 | if (c->rex_prefix) | ||
| 855 | if (c->rex_prefix & 8) | ||
| 856 | c->op_bytes = 8; /* REX.W */ | ||
| 857 | |||
| 858 | /* Opcode byte(s). */ | ||
| 859 | c->d = opcode_table[c->b]; | ||
| 860 | if (c->d == 0) { | ||
| 861 | /* Two-byte opcode? */ | ||
| 862 | if (c->b == 0x0f) { | ||
| 863 | c->twobyte = 1; | ||
| 864 | c->b = insn_fetch(u8, 1, c->eip); | ||
| 865 | c->d = twobyte_table[c->b]; | ||
| 866 | } | ||
| 867 | |||
| 868 | /* Unrecognised? */ | ||
| 869 | if (c->d == 0) { | ||
| 870 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
| 871 | return -1; | ||
| 872 | } | ||
| 873 | } | ||
| 874 | |||
| 875 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) | ||
| 876 | c->op_bytes = 8; | ||
| 877 | |||
| 878 | /* ModRM and SIB bytes. */ | ||
| 879 | if (c->d & ModRM) | ||
| 880 | rc = decode_modrm(ctxt, ops); | ||
| 881 | else if (c->d & MemAbs) | ||
| 882 | rc = decode_abs(ctxt, ops); | ||
| 883 | if (rc) | ||
| 884 | goto done; | ||
| 885 | |||
| 886 | if (!c->override_base) | ||
| 887 | c->override_base = &ctxt->ds_base; | ||
| 888 | if (mode == X86EMUL_MODE_PROT64 && | ||
| 889 | c->override_base != &ctxt->fs_base && | ||
| 890 | c->override_base != &ctxt->gs_base) | ||
| 891 | c->override_base = NULL; | ||
| 892 | |||
| 893 | if (c->override_base) | ||
| 894 | c->modrm_ea += *c->override_base; | ||
| 895 | |||
| 896 | if (c->ad_bytes != 8) | ||
| 897 | c->modrm_ea = (u32)c->modrm_ea; | ||
| 898 | /* | ||
| 899 | * Decode and fetch the source operand: register, memory | ||
| 900 | * or immediate. | ||
| 901 | */ | ||
| 902 | switch (c->d & SrcMask) { | ||
| 903 | case SrcNone: | ||
| 904 | break; | ||
| 905 | case SrcReg: | ||
| 906 | decode_register_operand(&c->src, c, 0); | ||
| 907 | break; | ||
| 908 | case SrcMem16: | ||
| 909 | c->src.bytes = 2; | ||
| 910 | goto srcmem_common; | ||
| 911 | case SrcMem32: | ||
| 912 | c->src.bytes = 4; | ||
| 913 | goto srcmem_common; | ||
| 914 | case SrcMem: | ||
| 915 | c->src.bytes = (c->d & ByteOp) ? 1 : | ||
| 916 | c->op_bytes; | ||
| 917 | /* Don't fetch the address for invlpg: it could be unmapped. */ | ||
| 918 | if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) | ||
| 919 | break; | ||
| 920 | srcmem_common: | ||
| 921 | /* | ||
| 922 | * For instructions with a ModR/M byte, switch to register | ||
| 923 | * access if Mod = 3. | ||
| 924 | */ | ||
| 925 | if ((c->d & ModRM) && c->modrm_mod == 3) { | ||
| 926 | c->src.type = OP_REG; | ||
| 927 | break; | ||
| 928 | } | ||
| 929 | c->src.type = OP_MEM; | ||
| 930 | break; | ||
| 931 | case SrcImm: | ||
| 932 | c->src.type = OP_IMM; | ||
| 933 | c->src.ptr = (unsigned long *)c->eip; | ||
| 934 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
| 935 | if (c->src.bytes == 8) | ||
| 936 | c->src.bytes = 4; | ||
| 937 | /* NB. Immediates are sign-extended as necessary. */ | ||
| 938 | switch (c->src.bytes) { | ||
| 939 | case 1: | ||
| 940 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
| 941 | break; | ||
| 942 | case 2: | ||
| 943 | c->src.val = insn_fetch(s16, 2, c->eip); | ||
| 944 | break; | ||
| 945 | case 4: | ||
| 946 | c->src.val = insn_fetch(s32, 4, c->eip); | ||
| 947 | break; | ||
| 948 | } | ||
| 949 | break; | ||
| 950 | case SrcImmByte: | ||
| 951 | c->src.type = OP_IMM; | ||
| 952 | c->src.ptr = (unsigned long *)c->eip; | ||
| 953 | c->src.bytes = 1; | ||
| 954 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
| 955 | break; | ||
| 956 | } | ||
| 957 | |||
| 958 | /* Decode and fetch the destination operand: register or memory. */ | ||
| 959 | switch (c->d & DstMask) { | ||
| 960 | case ImplicitOps: | ||
| 961 | /* Special instructions do their own operand decoding. */ | ||
| 962 | return 0; | ||
| 963 | case DstReg: | ||
| 964 | decode_register_operand(&c->dst, c, | ||
| 965 | c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); | ||
| 966 | break; | ||
| 967 | case DstMem: | ||
| 968 | if ((c->d & ModRM) && c->modrm_mod == 3) { | ||
| 969 | c->dst.type = OP_REG; | ||
| 970 | break; | ||
| 971 | } | ||
| 972 | c->dst.type = OP_MEM; | ||
| 973 | break; | ||
| 974 | } | ||
| 975 | |||
| 976 | done: | ||
| 977 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | ||
| 978 | } | ||
| 979 | |||
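The prefix loop's size switches rely on XOR toggles: 2 ^ 6 == 4 and 4 ^ 6 == 2 for the 0x66 operand-size override, 4 ^ 12 == 8 and 8 ^ 12 == 4 for 0x67 in long mode. Spelled out:

#include <stdio.h>

int main(void)
{
	int def;

	for (def = 2; def <= 4; def += 2)
		printf("op_bytes %d -> %d under 0x66\n", def, def ^ 6);
	for (def = 4; def <= 8; def += 4)
		printf("ad_bytes %d -> %d under 0x67\n", def, def ^ 12);
	return 0;
}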
| 980 | static inline void emulate_push(struct x86_emulate_ctxt *ctxt) | ||
| 981 | { | ||
| 982 | struct decode_cache *c = &ctxt->decode; | ||
| 983 | |||
| 984 | c->dst.type = OP_MEM; | ||
| 985 | c->dst.bytes = c->op_bytes; | ||
| 986 | c->dst.val = c->src.val; | ||
| 987 | register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes); | ||
| 988 | c->dst.ptr = (void *) register_address(ctxt->ss_base, | ||
| 989 | c->regs[VCPU_REGS_RSP]); | ||
| 990 | } | ||
| 991 | |||
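emulate_push() follows the architectural PUSH sequence: decrement RSP by the operand size first, then store the value at SS:RSP. The same two steps on a toy byte stack:

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char stack[64];
	unsigned long rsp = sizeof(stack);	/* empty; grows downwards */
	unsigned long val = 0xdeadbeef, top;

	rsp -= sizeof(val);			/* decrement first ... */
	memcpy(&stack[rsp], &val, sizeof(val));	/* ... then store at SS:RSP */

	memcpy(&top, &stack[rsp], sizeof(top));
	printf("rsp=%lu top=%#lx\n", rsp, top);
	return 0;
}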
| 992 | static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, | ||
| 993 | struct x86_emulate_ops *ops) | ||
| 994 | { | ||
| 995 | struct decode_cache *c = &ctxt->decode; | ||
| 996 | int rc; | ||
| 997 | |||
| 998 | rc = ops->read_std(register_address(ctxt->ss_base, | ||
| 999 | c->regs[VCPU_REGS_RSP]), | ||
| 1000 | &c->dst.val, c->dst.bytes, ctxt->vcpu); | ||
| 1001 | if (rc != 0) | ||
| 1002 | return rc; | ||
| 1003 | |||
| 1004 | register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes); | ||
| 1005 | |||
| 1006 | return 0; | ||
| 1007 | } | ||
| 1008 | |||
| 1009 | static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) | ||
| 1010 | { | ||
| 1011 | struct decode_cache *c = &ctxt->decode; | ||
| 1012 | switch (c->modrm_reg) { | ||
| 1013 | case 0: /* rol */ | ||
| 1014 | emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); | ||
| 1015 | break; | ||
| 1016 | case 1: /* ror */ | ||
| 1017 | emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); | ||
| 1018 | break; | ||
| 1019 | case 2: /* rcl */ | ||
| 1020 | emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); | ||
| 1021 | break; | ||
| 1022 | case 3: /* rcr */ | ||
| 1023 | emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); | ||
| 1024 | break; | ||
| 1025 | case 4: /* sal/shl */ | ||
| 1026 | case 6: /* sal/shl */ | ||
| 1027 | emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); | ||
| 1028 | break; | ||
| 1029 | case 5: /* shr */ | ||
| 1030 | emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); | ||
| 1031 | break; | ||
| 1032 | case 7: /* sar */ | ||
| 1033 | emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); | ||
| 1034 | break; | ||
| 1035 | } | ||
| 1036 | } | ||
| 1037 | |||
| 1038 | static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, | ||
| 1039 | struct x86_emulate_ops *ops) | ||
| 1040 | { | ||
| 1041 | struct decode_cache *c = &ctxt->decode; | ||
| 1042 | int rc = 0; | ||
| 1043 | |||
| 1044 | switch (c->modrm_reg) { | ||
| 1045 | case 0 ... 1: /* test */ | ||
| 1046 | /* | ||
| 1047 | * Special case in Grp3: test has an immediate | ||
| 1048 | * source operand. | ||
| 1049 | */ | ||
| 1050 | c->src.type = OP_IMM; | ||
| 1051 | c->src.ptr = (unsigned long *)c->eip; | ||
| 1052 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
| 1053 | if (c->src.bytes == 8) | ||
| 1054 | c->src.bytes = 4; | ||
| 1055 | switch (c->src.bytes) { | ||
| 1056 | case 1: | ||
| 1057 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
| 1058 | break; | ||
| 1059 | case 2: | ||
| 1060 | c->src.val = insn_fetch(s16, 2, c->eip); | ||
| 1061 | break; | ||
| 1062 | case 4: | ||
| 1063 | c->src.val = insn_fetch(s32, 4, c->eip); | ||
| 1064 | break; | ||
| 1065 | } | ||
| 1066 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); | ||
| 1067 | break; | ||
| 1068 | case 2: /* not */ | ||
| 1069 | c->dst.val = ~c->dst.val; | ||
| 1070 | break; | ||
| 1071 | case 3: /* neg */ | ||
| 1072 | emulate_1op("neg", c->dst, ctxt->eflags); | ||
| 1073 | break; | ||
| 1074 | default: | ||
| 1075 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
| 1076 | rc = X86EMUL_UNHANDLEABLE; | ||
| 1077 | break; | ||
| 1078 | } | ||
| 1079 | done: | ||
| 1080 | return rc; | ||
| 1081 | } | ||
| 1082 | |||
| 1083 | static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, | ||
| 1084 | struct x86_emulate_ops *ops) | ||
| 1085 | { | ||
| 1086 | struct decode_cache *c = &ctxt->decode; | ||
| 1087 | int rc; | ||
| 1088 | |||
| 1089 | switch (c->modrm_reg) { | ||
| 1090 | case 0: /* inc */ | ||
| 1091 | emulate_1op("inc", c->dst, ctxt->eflags); | ||
| 1092 | break; | ||
| 1093 | case 1: /* dec */ | ||
| 1094 | emulate_1op("dec", c->dst, ctxt->eflags); | ||
| 1095 | break; | ||
| 1096 | case 4: /* jmp abs */ | ||
| 1097 | if (c->b == 0xff) | ||
| 1098 | c->eip = c->dst.val; | ||
| 1099 | else { | ||
| 1100 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
| 1101 | return X86EMUL_UNHANDLEABLE; | ||
| 1102 | } | ||
| 1103 | break; | ||
| 1104 | case 6: /* push */ | ||
| 1105 | |||
| 1106 | /* 64-bit mode: PUSH always pushes a 64-bit operand. */ | ||
| 1107 | |||
| 1108 | if (ctxt->mode == X86EMUL_MODE_PROT64) { | ||
| 1109 | c->dst.bytes = 8; | ||
| 1110 | rc = ops->read_std((unsigned long)c->dst.ptr, | ||
| 1111 | &c->dst.val, 8, ctxt->vcpu); | ||
| 1112 | if (rc != 0) | ||
| 1113 | return rc; | ||
| 1114 | } | ||
| 1115 | register_address_increment(c->regs[VCPU_REGS_RSP], | ||
| 1116 | -c->dst.bytes); | ||
| 1117 | rc = ops->write_emulated(register_address(ctxt->ss_base, | ||
| 1118 | c->regs[VCPU_REGS_RSP]), &c->dst.val, | ||
| 1119 | c->dst.bytes, ctxt->vcpu); | ||
| 1120 | if (rc != 0) | ||
| 1121 | return rc; | ||
| 1122 | c->dst.type = OP_NONE; | ||
| 1123 | break; | ||
| 1124 | default: | ||
| 1125 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
| 1126 | return X86EMUL_UNHANDLEABLE; | ||
| 1127 | } | ||
| 1128 | return 0; | ||
| 1129 | } | ||
| 1130 | |||
| 1131 | static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, | ||
| 1132 | struct x86_emulate_ops *ops, | ||
| 1133 | unsigned long memop) | ||
| 1134 | { | ||
| 1135 | struct decode_cache *c = &ctxt->decode; | ||
| 1136 | u64 old, new; | ||
| 1137 | int rc; | ||
| 1138 | |||
| 1139 | rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); | ||
| 1140 | if (rc != 0) | ||
| 1141 | return rc; | ||
| 1142 | |||
| 1143 | if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || | ||
| 1144 | ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { | ||
| 1145 | |||
| 1146 | c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); | ||
| 1147 | c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); | ||
| 1148 | ctxt->eflags &= ~EFLG_ZF; | ||
| 1149 | |||
| 1150 | } else { | ||
| 1151 | new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | | ||
| 1152 | (u32) c->regs[VCPU_REGS_RBX]; | ||
| 1153 | |||
| 1154 | rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); | ||
| 1155 | if (rc != 0) | ||
| 1156 | return rc; | ||
| 1157 | ctxt->eflags |= EFLG_ZF; | ||
| 1158 | } | ||
| 1159 | return 0; | ||
| 1160 | } | ||
| 1161 | |||
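emulate_grp9() implements CMPXCHG8B: compare the 64-bit memory operand with EDX:EAX; on a match store ECX:EBX there and set ZF, otherwise load EDX:EAX from memory and clear ZF. A plain-C model of those semantics; the cmpxchg8b helper below is an illustration, not the emulator's:

#include <stdio.h>
#include <stdint.h>

static int cmpxchg8b(uint64_t *mem, uint32_t *eax, uint32_t *edx,
		     uint32_t ebx, uint32_t ecx)
{
	uint64_t expect = ((uint64_t)*edx << 32) | *eax;

	if (*mem == expect) {
		*mem = ((uint64_t)ecx << 32) | ebx;
		return 1;			/* ZF set */
	}
	*eax = (uint32_t)*mem;
	*edx = (uint32_t)(*mem >> 32);
	return 0;				/* ZF clear */
}

int main(void)
{
	uint64_t mem = 0x1111111122222222ULL;
	uint32_t eax = 0x22222222, edx = 0x11111111;

	printf("zf=%d mem=%#llx\n",
	       cmpxchg8b(&mem, &eax, &edx, 0xbbbbbbbb, 0xaaaaaaaa),
	       (unsigned long long)mem);	/* zf=1 mem=0xaaaaaaaabbbbbbbb */
	return 0;
}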
| 1162 | static inline int writeback(struct x86_emulate_ctxt *ctxt, | ||
| 1163 | struct x86_emulate_ops *ops) | ||
| 1164 | { | ||
| 1165 | int rc; | ||
| 1166 | struct decode_cache *c = &ctxt->decode; | ||
| 1167 | |||
| 1168 | switch (c->dst.type) { | ||
| 1169 | case OP_REG: | ||
| 1170 | /* The 4-byte case *is* correct: | ||
| 1171 | * in 64-bit mode we zero-extend. | ||
| 1172 | */ | ||
| 1173 | switch (c->dst.bytes) { | ||
| 1174 | case 1: | ||
| 1175 | *(u8 *)c->dst.ptr = (u8)c->dst.val; | ||
| 1176 | break; | ||
| 1177 | case 2: | ||
| 1178 | *(u16 *)c->dst.ptr = (u16)c->dst.val; | ||
| 1179 | break; | ||
| 1180 | case 4: | ||
| 1181 | *c->dst.ptr = (u32)c->dst.val; | ||
| 1182 | break; /* 64b: zero-ext */ | ||
| 1183 | case 8: | ||
| 1184 | *c->dst.ptr = c->dst.val; | ||
| 1185 | break; | ||
| 1186 | } | ||
| 1187 | break; | ||
| 1188 | case OP_MEM: | ||
| 1189 | if (c->lock_prefix) | ||
| 1190 | rc = ops->cmpxchg_emulated( | ||
| 1191 | (unsigned long)c->dst.ptr, | ||
| 1192 | &c->dst.orig_val, | ||
| 1193 | &c->dst.val, | ||
| 1194 | c->dst.bytes, | ||
| 1195 | ctxt->vcpu); | ||
| 1196 | else | ||
| 1197 | rc = ops->write_emulated( | ||
| 1198 | (unsigned long)c->dst.ptr, | ||
| 1199 | &c->dst.val, | ||
| 1200 | c->dst.bytes, | ||
| 1201 | ctxt->vcpu); | ||
| 1202 | if (rc != 0) | ||
| 1203 | return rc; | ||
| 1204 | break; | ||
| 1205 | case OP_NONE: | ||
| 1206 | /* no writeback */ | ||
| 1207 | break; | ||
| 1208 | default: | ||
| 1209 | break; | ||
| 1210 | } | ||
| 1211 | return 0; | ||
| 1212 | } | ||
| 1213 | |||
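The 4-byte case in writeback() stores through the full-width pointer because an x86-64 write to a 32-bit register implicitly zero-extends into the upper half; storing through a u32 pointer would leave stale upper bits behind. The hardware rule, demonstrated with inline asm (x86-64 only; illustrative, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long r = 0xffffffffffffffffUL;

	/* A 32-bit mov clears bits 63:32 of the destination register. */
	__asm__("movl $0x12345678, %k0" : "+r" (r));
	printf("%#lx\n", r);	/* 0x12345678 */
	return 0;
}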
| 1214 | int | ||
| 1215 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
| 1216 | { | ||
| 1217 | unsigned long memop = 0; | ||
| 1218 | u64 msr_data; | ||
| 1219 | unsigned long saved_eip = 0; | ||
| 1220 | struct decode_cache *c = &ctxt->decode; | ||
| 1221 | int rc = 0; | ||
| 1222 | |||
| 1223 | /* Shadow copy of register state. Committed on successful emulation. | ||
| 1224 | * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't | ||
| 1225 | * modify them. | ||
| 1226 | */ | ||
| 1227 | |||
| 1228 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); | ||
| 1229 | saved_eip = c->eip; | ||
| 1230 | |||
| 1231 | if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) | ||
| 1232 | memop = c->modrm_ea; | ||
| 1233 | |||
| 1234 | if (c->rep_prefix && (c->d & String)) { | ||
| 1235 | /* All REP prefixes have the same first termination condition */ | ||
| 1236 | if (c->regs[VCPU_REGS_RCX] == 0) { | ||
| 1237 | ctxt->vcpu->arch.rip = c->eip; | ||
| 1238 | goto done; | ||
| 1239 | } | ||
| 1240 | /* The second termination condition only applies for REPE | ||
| 1241 | * and REPNE. Test if the repeat string operation prefix is | ||
| 1242 | * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the | ||
| 1243 | * corresponding termination condition according to: | ||
| 1244 | * - if REPE/REPZ and ZF = 0 then done | ||
| 1245 | * - if REPNE/REPNZ and ZF = 1 then done | ||
| 1246 | */ | ||
| 1247 | if ((c->b == 0xa6) || (c->b == 0xa7) || | ||
| 1248 | (c->b == 0xae) || (c->b == 0xaf)) { | ||
| 1249 | if ((c->rep_prefix == REPE_PREFIX) && | ||
| 1250 | ((ctxt->eflags & EFLG_ZF) == 0)) { | ||
| 1251 | ctxt->vcpu->arch.rip = c->eip; | ||
| 1252 | goto done; | ||
| 1253 | } | ||
| 1254 | if ((c->rep_prefix == REPNE_PREFIX) && | ||
| 1255 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { | ||
| 1256 | ctxt->vcpu->arch.rip = c->eip; | ||
| 1257 | goto done; | ||
| 1258 | } | ||
| 1259 | } | ||
| 1260 | c->regs[VCPU_REGS_RCX]--; | ||
| 1261 | c->eip = ctxt->vcpu->arch.rip; | ||
| 1262 | } | ||
| 1263 | |||
| 1264 | if (c->src.type == OP_MEM) { | ||
| 1265 | c->src.ptr = (unsigned long *)memop; | ||
| 1266 | c->src.val = 0; | ||
| 1267 | rc = ops->read_emulated((unsigned long)c->src.ptr, | ||
| 1268 | &c->src.val, | ||
| 1269 | c->src.bytes, | ||
| 1270 | ctxt->vcpu); | ||
| 1271 | if (rc != 0) | ||
| 1272 | goto done; | ||
| 1273 | c->src.orig_val = c->src.val; | ||
| 1274 | } | ||
| 1275 | |||
| 1276 | if ((c->d & DstMask) == ImplicitOps) | ||
| 1277 | goto special_insn; | ||
| 1278 | |||
| 1279 | |||
| 1280 | if (c->dst.type == OP_MEM) { | ||
| 1281 | c->dst.ptr = (unsigned long *)memop; | ||
| 1282 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
| 1283 | c->dst.val = 0; | ||
| 1284 | if (c->d & BitOp) { | ||
| 1285 | unsigned long mask = ~(c->dst.bytes * 8 - 1); | ||
| 1286 | |||
| 1287 | c->dst.ptr = (void *)c->dst.ptr + | ||
| 1288 | (c->src.val & mask) / 8; | ||
| 1289 | } | ||
| 1290 | if (!(c->d & Mov) && | ||
| 1291 | /* optimisation - avoid slow emulated read */ | ||
| 1292 | ((rc = ops->read_emulated((unsigned long)c->dst.ptr, | ||
| 1293 | &c->dst.val, | ||
| 1294 | c->dst.bytes, ctxt->vcpu)) != 0)) | ||
| 1295 | goto done; | ||
| 1296 | } | ||
| 1297 | c->dst.orig_val = c->dst.val; | ||
| 1298 | |||
| 1299 | special_insn: | ||
| 1300 | |||
| 1301 | if (c->twobyte) | ||
| 1302 | goto twobyte_insn; | ||
| 1303 | |||
| 1304 | switch (c->b) { | ||
| 1305 | case 0x00 ... 0x05: | ||
| 1306 | add: /* add */ | ||
| 1307 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); | ||
| 1308 | break; | ||
| 1309 | case 0x08 ... 0x0d: | ||
| 1310 | or: /* or */ | ||
| 1311 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); | ||
| 1312 | break; | ||
| 1313 | case 0x10 ... 0x15: | ||
| 1314 | adc: /* adc */ | ||
| 1315 | emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); | ||
| 1316 | break; | ||
| 1317 | case 0x18 ... 0x1d: | ||
| 1318 | sbb: /* sbb */ | ||
| 1319 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); | ||
| 1320 | break; | ||
| 1321 | case 0x20 ... 0x23: | ||
| 1322 | and: /* and */ | ||
| 1323 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); | ||
| 1324 | break; | ||
| 1325 | case 0x24: /* and al imm8 */ | ||
| 1326 | c->dst.type = OP_REG; | ||
| 1327 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
| 1328 | c->dst.val = *(u8 *)c->dst.ptr; | ||
| 1329 | c->dst.bytes = 1; | ||
| 1330 | c->dst.orig_val = c->dst.val; | ||
| 1331 | goto and; | ||
| 1332 | case 0x25: /* and ax imm16, or eax imm32 */ | ||
| 1333 | c->dst.type = OP_REG; | ||
| 1334 | c->dst.bytes = c->op_bytes; | ||
| 1335 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
| 1336 | if (c->op_bytes == 2) | ||
| 1337 | c->dst.val = *(u16 *)c->dst.ptr; | ||
| 1338 | else | ||
| 1339 | c->dst.val = *(u32 *)c->dst.ptr; | ||
| 1340 | c->dst.orig_val = c->dst.val; | ||
| 1341 | goto and; | ||
| 1342 | case 0x28 ... 0x2d: | ||
| 1343 | sub: /* sub */ | ||
| 1344 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); | ||
| 1345 | break; | ||
| 1346 | case 0x30 ... 0x35: | ||
| 1347 | xor: /* xor */ | ||
| 1348 | emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); | ||
| 1349 | break; | ||
| 1350 | case 0x38 ... 0x3d: | ||
| 1351 | cmp: /* cmp */ | ||
| 1352 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
| 1353 | break; | ||
| 1354 | case 0x40 ... 0x47: /* inc r16/r32 */ | ||
| 1355 | emulate_1op("inc", c->dst, ctxt->eflags); | ||
| 1356 | break; | ||
| 1357 | case 0x48 ... 0x4f: /* dec r16/r32 */ | ||
| 1358 | emulate_1op("dec", c->dst, ctxt->eflags); | ||
| 1359 | break; | ||
| 1360 | case 0x50 ... 0x57: /* push reg */ | ||
| 1361 | c->dst.type = OP_MEM; | ||
| 1362 | c->dst.bytes = c->op_bytes; | ||
| 1363 | c->dst.val = c->src.val; | ||
| 1364 | register_address_increment(c->regs[VCPU_REGS_RSP], | ||
| 1365 | -c->op_bytes); | ||
| 1366 | c->dst.ptr = (void *) register_address( | ||
| 1367 | ctxt->ss_base, c->regs[VCPU_REGS_RSP]); | ||
| 1368 | break; | ||
| 1369 | case 0x58 ... 0x5f: /* pop reg */ | ||
| 1370 | pop_instruction: | ||
| 1371 | if ((rc = ops->read_std(register_address(ctxt->ss_base, | ||
| 1372 | c->regs[VCPU_REGS_RSP]), c->dst.ptr, | ||
| 1373 | c->op_bytes, ctxt->vcpu)) != 0) | ||
| 1374 | goto done; | ||
| 1375 | |||
| 1376 | register_address_increment(c->regs[VCPU_REGS_RSP], | ||
| 1377 | c->op_bytes); | ||
| 1378 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
| 1379 | break; | ||
| 1380 | case 0x63: /* movsxd */ | ||
| 1381 | if (ctxt->mode != X86EMUL_MODE_PROT64) | ||
| 1382 | goto cannot_emulate; | ||
| 1383 | c->dst.val = (s32) c->src.val; | ||
| 1384 | break; | ||
| 1385 | case 0x6a: /* push imm8 */ | ||
| 1386 | c->src.val = 0L; | ||
| 1387 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
| 1388 | emulate_push(ctxt); | ||
| 1389 | break; | ||
| 1390 | case 0x6c: /* insb */ | ||
| 1391 | case 0x6d: /* insw/insd */ | ||
| 1392 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
| 1393 | 1, | ||
| 1394 | (c->d & ByteOp) ? 1 : c->op_bytes, | ||
| 1395 | c->rep_prefix ? | ||
| 1396 | address_mask(c->regs[VCPU_REGS_RCX]) : 1, | ||
| 1397 | (ctxt->eflags & EFLG_DF), | ||
| 1398 | register_address(ctxt->es_base, | ||
| 1399 | c->regs[VCPU_REGS_RDI]), | ||
| 1400 | c->rep_prefix, | ||
| 1401 | c->regs[VCPU_REGS_RDX]) == 0) { | ||
| 1402 | c->eip = saved_eip; | ||
| 1403 | return -1; | ||
| 1404 | } | ||
| 1405 | return 0; | ||
| 1406 | case 0x6e: /* outsb */ | ||
| 1407 | case 0x6f: /* outsw/outsd */ | ||
| 1408 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
| 1409 | 0, | ||
| 1410 | (c->d & ByteOp) ? 1 : c->op_bytes, | ||
| 1411 | c->rep_prefix ? | ||
| 1412 | address_mask(c->regs[VCPU_REGS_RCX]) : 1, | ||
| 1413 | (ctxt->eflags & EFLG_DF), | ||
| 1414 | register_address(c->override_base ? | ||
| 1415 | *c->override_base : | ||
| 1416 | ctxt->ds_base, | ||
| 1417 | c->regs[VCPU_REGS_RSI]), | ||
| 1418 | c->rep_prefix, | ||
| 1419 | c->regs[VCPU_REGS_RDX]) == 0) { | ||
| 1420 | c->eip = saved_eip; | ||
| 1421 | return -1; | ||
| 1422 | } | ||
| 1423 | return 0; | ||
| 1424 | case 0x70 ... 0x7f: /* jcc (short) */ { | ||
| 1425 | int rel = insn_fetch(s8, 1, c->eip); | ||
| 1426 | |||
| 1427 | if (test_cc(c->b, ctxt->eflags)) | ||
| 1428 | JMP_REL(rel); | ||
| 1429 | break; | ||
| 1430 | } | ||
| 1431 | case 0x80 ... 0x83: /* Grp1 */ | ||
| 1432 | switch (c->modrm_reg) { | ||
| 1433 | case 0: | ||
| 1434 | goto add; | ||
| 1435 | case 1: | ||
| 1436 | goto or; | ||
| 1437 | case 2: | ||
| 1438 | goto adc; | ||
| 1439 | case 3: | ||
| 1440 | goto sbb; | ||
| 1441 | case 4: | ||
| 1442 | goto and; | ||
| 1443 | case 5: | ||
| 1444 | goto sub; | ||
| 1445 | case 6: | ||
| 1446 | goto xor; | ||
| 1447 | case 7: | ||
| 1448 | goto cmp; | ||
| 1449 | } | ||
| 1450 | break; | ||
| 1451 | case 0x84 ... 0x85: | ||
| 1452 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); | ||
| 1453 | break; | ||
| 1454 | case 0x86 ... 0x87: /* xchg */ | ||
| 1455 | /* Write back the register source. */ | ||
| 1456 | switch (c->dst.bytes) { | ||
| 1457 | case 1: | ||
| 1458 | *(u8 *) c->src.ptr = (u8) c->dst.val; | ||
| 1459 | break; | ||
| 1460 | case 2: | ||
| 1461 | *(u16 *) c->src.ptr = (u16) c->dst.val; | ||
| 1462 | break; | ||
| 1463 | case 4: | ||
| 1464 | *c->src.ptr = (u32) c->dst.val; | ||
| 1465 | break; /* 64b reg: zero-extend */ | ||
| 1466 | case 8: | ||
| 1467 | *c->src.ptr = c->dst.val; | ||
| 1468 | break; | ||
| 1469 | } | ||
| 1470 | /* | ||
| 1471 | * Write back the memory destination with implicit LOCK | ||
| 1472 | * prefix. | ||
| 1473 | */ | ||
| 1474 | c->dst.val = c->src.val; | ||
| 1475 | c->lock_prefix = 1; | ||
| 1476 | break; | ||
| 1477 | case 0x88 ... 0x8b: /* mov */ | ||
| 1478 | goto mov; | ||
| 1479 | case 0x8d: /* lea r16/r32, m */ | ||
| 1480 | c->dst.val = c->modrm_val; | ||
| 1481 | break; | ||
| 1482 | case 0x8f: /* pop (sole member of Grp1a) */ | ||
| 1483 | rc = emulate_grp1a(ctxt, ops); | ||
| 1484 | if (rc != 0) | ||
| 1485 | goto done; | ||
| 1486 | break; | ||
| 1487 | case 0x9c: /* pushf */ | ||
| 1488 | c->src.val = (unsigned long) ctxt->eflags; | ||
| 1489 | emulate_push(ctxt); | ||
| 1490 | break; | ||
| 1491 | case 0x9d: /* popf */ | ||
| 1492 | c->dst.ptr = (unsigned long *) &ctxt->eflags; | ||
| 1493 | goto pop_instruction; | ||
| 1494 | case 0xa0 ... 0xa1: /* mov */ | ||
| 1495 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | ||
| 1496 | c->dst.val = c->src.val; | ||
| 1497 | break; | ||
| 1498 | case 0xa2 ... 0xa3: /* mov */ | ||
| 1499 | c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; | ||
| 1500 | break; | ||
| 1501 | case 0xa4 ... 0xa5: /* movs */ | ||
| 1502 | c->dst.type = OP_MEM; | ||
| 1503 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
| 1504 | c->dst.ptr = (unsigned long *)register_address( | ||
| 1505 | ctxt->es_base, | ||
| 1506 | c->regs[VCPU_REGS_RDI]); | ||
| 1507 | if ((rc = ops->read_emulated(register_address( | ||
| 1508 | c->override_base ? *c->override_base : | ||
| 1509 | ctxt->ds_base, | ||
| 1510 | c->regs[VCPU_REGS_RSI]), | ||
| 1511 | &c->dst.val, | ||
| 1512 | c->dst.bytes, ctxt->vcpu)) != 0) | ||
| 1513 | goto done; | ||
| 1514 | register_address_increment(c->regs[VCPU_REGS_RSI], | ||
| 1515 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
| 1516 | : c->dst.bytes); | ||
| 1517 | register_address_increment(c->regs[VCPU_REGS_RDI], | ||
| 1518 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
| 1519 | : c->dst.bytes); | ||
| 1520 | break; | ||
| 1521 | case 0xa6 ... 0xa7: /* cmps */ | ||
| 1522 | c->src.type = OP_NONE; /* Disable writeback. */ | ||
| 1523 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
| 1524 | c->src.ptr = (unsigned long *)register_address( | ||
| 1525 | c->override_base ? *c->override_base : | ||
| 1526 | ctxt->ds_base, | ||
| 1527 | c->regs[VCPU_REGS_RSI]); | ||
| 1528 | if ((rc = ops->read_emulated((unsigned long)c->src.ptr, | ||
| 1529 | &c->src.val, | ||
| 1530 | c->src.bytes, | ||
| 1531 | ctxt->vcpu)) != 0) | ||
| 1532 | goto done; | ||
| 1533 | |||
| 1534 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
| 1535 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
| 1536 | c->dst.ptr = (unsigned long *)register_address( | ||
| 1537 | ctxt->es_base, | ||
| 1538 | c->regs[VCPU_REGS_RDI]); | ||
| 1539 | if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, | ||
| 1540 | &c->dst.val, | ||
| 1541 | c->dst.bytes, | ||
| 1542 | ctxt->vcpu)) != 0) | ||
| 1543 | goto done; | ||
| 1544 | |||
| 1545 | DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); | ||
| 1546 | |||
| 1547 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
| 1548 | |||
| 1549 | register_address_increment(c->regs[VCPU_REGS_RSI], | ||
| 1550 | (ctxt->eflags & EFLG_DF) ? -c->src.bytes | ||
| 1551 | : c->src.bytes); | ||
| 1552 | register_address_increment(c->regs[VCPU_REGS_RDI], | ||
| 1553 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
| 1554 | : c->dst.bytes); | ||
| 1555 | |||
| 1556 | break; | ||
| 1557 | case 0xaa ... 0xab: /* stos */ | ||
| 1558 | c->dst.type = OP_MEM; | ||
| 1559 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
| 1560 | c->dst.ptr = (unsigned long *)register_address( | ||
| 1561 | ctxt->es_base, | ||
| 1562 | c->regs[VCPU_REGS_RDI]); | ||
| 1563 | c->dst.val = c->regs[VCPU_REGS_RAX]; | ||
| 1564 | register_address_increment(c->regs[VCPU_REGS_RDI], | ||
| 1565 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
| 1566 | : c->dst.bytes); | ||
| 1567 | break; | ||
| 1568 | case 0xac ... 0xad: /* lods */ | ||
| 1569 | c->dst.type = OP_REG; | ||
| 1570 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
| 1571 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | ||
| 1572 | if ((rc = ops->read_emulated(register_address( | ||
| 1573 | c->override_base ? *c->override_base : | ||
| 1574 | ctxt->ds_base, | ||
| 1575 | c->regs[VCPU_REGS_RSI]), | ||
| 1576 | &c->dst.val, | ||
| 1577 | c->dst.bytes, | ||
| 1578 | ctxt->vcpu)) != 0) | ||
| 1579 | goto done; | ||
| 1580 | register_address_increment(c->regs[VCPU_REGS_RSI], | ||
| 1581 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
| 1582 | : c->dst.bytes); | ||
| 1583 | break; | ||
| 1584 | case 0xae ... 0xaf: /* scas */ | ||
| 1585 | DPRINTF("Urk! I don't handle SCAS.\n"); | ||
| 1586 | goto cannot_emulate; | ||
| 1587 | case 0xc0 ... 0xc1: | ||
| 1588 | emulate_grp2(ctxt); | ||
| 1589 | break; | ||
| 1590 | case 0xc3: /* ret */ | ||
| 1591 | c->dst.ptr = &c->eip; | ||
| 1592 | goto pop_instruction; | ||
| 1593 | case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ | ||
| 1594 | mov: | ||
| 1595 | c->dst.val = c->src.val; | ||
| 1596 | break; | ||
| 1597 | case 0xd0 ... 0xd1: /* Grp2 */ | ||
| 1598 | c->src.val = 1; | ||
| 1599 | emulate_grp2(ctxt); | ||
| 1600 | break; | ||
| 1601 | case 0xd2 ... 0xd3: /* Grp2 */ | ||
| 1602 | c->src.val = c->regs[VCPU_REGS_RCX]; | ||
| 1603 | emulate_grp2(ctxt); | ||
| 1604 | break; | ||
| 1605 | case 0xe8: /* call (near) */ { | ||
| 1606 | long int rel; | ||
| 1607 | switch (c->op_bytes) { | ||
| 1608 | case 2: | ||
| 1609 | rel = insn_fetch(s16, 2, c->eip); | ||
| 1610 | break; | ||
| 1611 | case 4: | ||
| 1612 | rel = insn_fetch(s32, 4, c->eip); | ||
| 1613 | break; | ||
| 1614 | default: | ||
| 1615 | DPRINTF("Call: Invalid op_bytes\n"); | ||
| 1616 | goto cannot_emulate; | ||
| 1617 | } | ||
| 1618 | c->src.val = (unsigned long) c->eip; | ||
| 1619 | JMP_REL(rel); | ||
| 1620 | c->op_bytes = c->ad_bytes; | ||
| 1621 | emulate_push(ctxt); | ||
| 1622 | break; | ||
| 1623 | } | ||
| 1624 | case 0xe9: /* jmp rel */ | ||
| 1625 | case 0xeb: /* jmp rel short */ | ||
| 1626 | JMP_REL(c->src.val); | ||
| 1627 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
| 1628 | break; | ||
| 1629 | case 0xf4: /* hlt */ | ||
| 1630 | ctxt->vcpu->arch.halt_request = 1; | ||
| 1631 | goto done; | ||
| 1632 | case 0xf5: /* cmc */ | ||
| 1633 | /* complement carry flag from eflags reg */ | ||
| 1634 | ctxt->eflags ^= EFLG_CF; | ||
| 1635 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
| 1636 | break; | ||
| 1637 | case 0xf6 ... 0xf7: /* Grp3 */ | ||
| 1638 | rc = emulate_grp3(ctxt, ops); | ||
| 1639 | if (rc != 0) | ||
| 1640 | goto done; | ||
| 1641 | break; | ||
| 1642 | case 0xf8: /* clc */ | ||
| 1643 | ctxt->eflags &= ~EFLG_CF; | ||
| 1644 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
| 1645 | break; | ||
| 1646 | case 0xfa: /* cli */ | ||
| 1647 | ctxt->eflags &= ~X86_EFLAGS_IF; | ||
| 1648 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
| 1649 | break; | ||
| 1650 | case 0xfb: /* sti */ | ||
| 1651 | ctxt->eflags |= X86_EFLAGS_IF; | ||
| 1652 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
| 1653 | break; | ||
| 1654 | case 0xfe ... 0xff: /* Grp4/Grp5 */ | ||
| 1655 | rc = emulate_grp45(ctxt, ops); | ||
| 1656 | if (rc != 0) | ||
| 1657 | goto done; | ||
| 1658 | break; | ||
| 1659 | } | ||
| 1660 | |||
| 1661 | writeback: | ||
| 1662 | rc = writeback(ctxt, ops); | ||
| 1663 | if (rc != 0) | ||
| 1664 | goto done; | ||
| 1665 | |||
| 1666 | /* Commit shadow register state. */ | ||
| 1667 | memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); | ||
| 1668 | ctxt->vcpu->arch.rip = c->eip; | ||
| 1669 | |||
| 1670 | done: | ||
| 1671 | if (rc == X86EMUL_UNHANDLEABLE) { | ||
| 1672 | c->eip = saved_eip; | ||
| 1673 | return -1; | ||
| 1674 | } | ||
| 1675 | return 0; | ||
| 1676 | |||
| 1677 | twobyte_insn: | ||
| 1678 | switch (c->b) { | ||
| 1679 | case 0x01: /* lgdt, lidt, lmsw */ | ||
| 1680 | switch (c->modrm_reg) { | ||
| 1681 | u16 size; | ||
| 1682 | unsigned long address; | ||
| 1683 | |||
| 1684 | case 0: /* vmcall */ | ||
| 1685 | if (c->modrm_mod != 3 || c->modrm_rm != 1) | ||
| 1686 | goto cannot_emulate; | ||
| 1687 | |||
| 1688 | rc = kvm_fix_hypercall(ctxt->vcpu); | ||
| 1689 | if (rc) | ||
| 1690 | goto done; | ||
| 1691 | |||
| 1692 | kvm_emulate_hypercall(ctxt->vcpu); | ||
| 1693 | break; | ||
| 1694 | case 2: /* lgdt */ | ||
| 1695 | rc = read_descriptor(ctxt, ops, c->src.ptr, | ||
| 1696 | &size, &address, c->op_bytes); | ||
| 1697 | if (rc) | ||
| 1698 | goto done; | ||
| 1699 | realmode_lgdt(ctxt->vcpu, size, address); | ||
| 1700 | break; | ||
| 1701 | case 3: /* lidt/vmmcall */ | ||
| 1702 | if (c->modrm_mod == 3 && c->modrm_rm == 1) { | ||
| 1703 | rc = kvm_fix_hypercall(ctxt->vcpu); | ||
| 1704 | if (rc) | ||
| 1705 | goto done; | ||
| 1706 | kvm_emulate_hypercall(ctxt->vcpu); | ||
| 1707 | } else { | ||
| 1708 | rc = read_descriptor(ctxt, ops, c->src.ptr, | ||
| 1709 | &size, &address, | ||
| 1710 | c->op_bytes); | ||
| 1711 | if (rc) | ||
| 1712 | goto done; | ||
| 1713 | realmode_lidt(ctxt->vcpu, size, address); | ||
| 1714 | } | ||
| 1715 | break; | ||
| 1716 | case 4: /* smsw */ | ||
| 1717 | if (c->modrm_mod != 3) | ||
| 1718 | goto cannot_emulate; | ||
| 1719 | *(u16 *)&c->regs[c->modrm_rm] | ||
| 1720 | = realmode_get_cr(ctxt->vcpu, 0); | ||
| 1721 | break; | ||
| 1722 | case 6: /* lmsw */ | ||
| 1723 | if (c->modrm_mod != 3) | ||
| 1724 | goto cannot_emulate; | ||
| 1725 | realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val, | ||
| 1726 | &ctxt->eflags); | ||
| 1727 | break; | ||
| 1728 | case 7: /* invlpg */ | ||
| 1729 | emulate_invlpg(ctxt->vcpu, memop); | ||
| 1730 | break; | ||
| 1731 | default: | ||
| 1732 | goto cannot_emulate; | ||
| 1733 | } | ||
| 1734 | /* Disable writeback. */ | ||
| 1735 | c->dst.type = OP_NONE; | ||
| 1736 | break; | ||
| 1737 | case 0x06: | ||
| 1738 | emulate_clts(ctxt->vcpu); | ||
| 1739 | c->dst.type = OP_NONE; | ||
| 1740 | break; | ||
| 1741 | case 0x08: /* invd */ | ||
| 1742 | case 0x09: /* wbinvd */ | ||
| 1743 | case 0x0d: /* GrpP (prefetch) */ | ||
| 1744 | case 0x18: /* Grp16 (prefetch/nop) */ | ||
| 1745 | c->dst.type = OP_NONE; | ||
| 1746 | break; | ||
| 1747 | case 0x20: /* mov cr, reg */ | ||
| 1748 | if (c->modrm_mod != 3) | ||
| 1749 | goto cannot_emulate; | ||
| 1750 | c->regs[c->modrm_rm] = | ||
| 1751 | realmode_get_cr(ctxt->vcpu, c->modrm_reg); | ||
| 1752 | c->dst.type = OP_NONE; /* no writeback */ | ||
| 1753 | break; | ||
| 1754 | case 0x21: /* mov from dr to reg */ | ||
| 1755 | if (c->modrm_mod != 3) | ||
| 1756 | goto cannot_emulate; | ||
| 1757 | rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); | ||
| 1758 | if (rc) | ||
| 1759 | goto cannot_emulate; | ||
| 1760 | c->dst.type = OP_NONE; /* no writeback */ | ||
| 1761 | break; | ||
| 1762 | case 0x22: /* mov reg, cr */ | ||
| 1763 | if (c->modrm_mod != 3) | ||
| 1764 | goto cannot_emulate; | ||
| 1765 | realmode_set_cr(ctxt->vcpu, | ||
| 1766 | c->modrm_reg, c->modrm_val, &ctxt->eflags); | ||
| 1767 | c->dst.type = OP_NONE; | ||
| 1768 | break; | ||
| 1769 | case 0x23: /* mov from reg to dr */ | ||
| 1770 | if (c->modrm_mod != 3) | ||
| 1771 | goto cannot_emulate; | ||
| 1772 | rc = emulator_set_dr(ctxt, c->modrm_reg, | ||
| 1773 | c->regs[c->modrm_rm]); | ||
| 1774 | if (rc) | ||
| 1775 | goto cannot_emulate; | ||
| 1776 | c->dst.type = OP_NONE; /* no writeback */ | ||
| 1777 | break; | ||
| 1778 | case 0x30: | ||
| 1779 | /* wrmsr */ | ||
| 1780 | msr_data = (u32)c->regs[VCPU_REGS_RAX] | ||
| 1781 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); | ||
| 1782 | rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); | ||
| 1783 | if (rc) { | ||
| 1784 | kvm_inject_gp(ctxt->vcpu, 0); | ||
| 1785 | c->eip = ctxt->vcpu->arch.rip; | ||
| 1786 | } | ||
| 1787 | rc = X86EMUL_CONTINUE; | ||
| 1788 | c->dst.type = OP_NONE; | ||
| 1789 | break; | ||
| 1790 | case 0x32: | ||
| 1791 | /* rdmsr */ | ||
| 1792 | rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); | ||
| 1793 | if (rc) { | ||
| 1794 | kvm_inject_gp(ctxt->vcpu, 0); | ||
| 1795 | c->eip = ctxt->vcpu->arch.rip; | ||
| 1796 | } else { | ||
| 1797 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; | ||
| 1798 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; | ||
| 1799 | } | ||
| 1800 | rc = X86EMUL_CONTINUE; | ||
| 1801 | c->dst.type = OP_NONE; | ||
| 1802 | break; | ||
| 1803 | case 0x40 ... 0x4f: /* cmov */ | ||
| 1804 | c->dst.val = c->dst.orig_val = c->src.val; | ||
| 1805 | if (!test_cc(c->b, ctxt->eflags)) | ||
| 1806 | c->dst.type = OP_NONE; /* no writeback */ | ||
| 1807 | break; | ||
| 1808 | case 0x80 ... 0x8f: /* jnz rel, etc */ { | ||
| 1809 | long int rel; | ||
| 1810 | |||
| 1811 | switch (c->op_bytes) { | ||
| 1812 | case 2: | ||
| 1813 | rel = insn_fetch(s16, 2, c->eip); | ||
| 1814 | break; | ||
| 1815 | case 4: | ||
| 1816 | rel = insn_fetch(s32, 4, c->eip); | ||
| 1817 | break; | ||
| 1818 | case 8: | ||
| 1819 | rel = insn_fetch(s64, 8, c->eip); | ||
| 1820 | break; | ||
| 1821 | default: | ||
| 1822 | DPRINTF("jnz: Invalid op_bytes\n"); | ||
| 1823 | goto cannot_emulate; | ||
| 1824 | } | ||
| 1825 | if (test_cc(c->b, ctxt->eflags)) | ||
| 1826 | JMP_REL(rel); | ||
| 1827 | c->dst.type = OP_NONE; | ||
| 1828 | break; | ||
| 1829 | } | ||
| 1830 | case 0xa3: | ||
| 1831 | bt: /* bt */ | ||
| 1832 | c->dst.type = OP_NONE; | ||
| 1833 | /* only subword offset */ | ||
| 1834 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
| 1835 | emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); | ||
| 1836 | break; | ||
| 1837 | case 0xab: | ||
| 1838 | bts: /* bts */ | ||
| 1839 | /* only subword offset */ | ||
| 1840 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
| 1841 | emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); | ||
| 1842 | break; | ||
| 1843 | case 0xb0 ... 0xb1: /* cmpxchg */ | ||
| 1844 | /* | ||
| 1845 | * Save real source value, then compare EAX against | ||
| 1846 | * destination. | ||
| 1847 | */ | ||
| 1848 | c->src.orig_val = c->src.val; | ||
| 1849 | c->src.val = c->regs[VCPU_REGS_RAX]; | ||
| 1850 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
| 1851 | if (ctxt->eflags & EFLG_ZF) { | ||
| 1852 | /* Success: write back to memory. */ | ||
| 1853 | c->dst.val = c->src.orig_val; | ||
| 1854 | } else { | ||
| 1855 | /* Failure: write the value we saw to EAX. */ | ||
| 1856 | c->dst.type = OP_REG; | ||
| 1857 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | ||
| 1858 | } | ||
| 1859 | break; | ||
| 1860 | case 0xb3: | ||
| 1861 | btr: /* btr */ | ||
| 1862 | /* only subword offset */ | ||
| 1863 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
| 1864 | emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); | ||
| 1865 | break; | ||
| 1866 | case 0xb6 ... 0xb7: /* movzx */ | ||
| 1867 | c->dst.bytes = c->op_bytes; | ||
| 1868 | c->dst.val = (c->d & ByteOp) ? (u8) c->src.val | ||
| 1869 | : (u16) c->src.val; | ||
| 1870 | break; | ||
| 1871 | case 0xba: /* Grp8 */ | ||
| 1872 | switch (c->modrm_reg & 3) { | ||
| 1873 | case 0: | ||
| 1874 | goto bt; | ||
| 1875 | case 1: | ||
| 1876 | goto bts; | ||
| 1877 | case 2: | ||
| 1878 | goto btr; | ||
| 1879 | case 3: | ||
| 1880 | goto btc; | ||
| 1881 | } | ||
| 1882 | break; | ||
| 1883 | case 0xbb: | ||
| 1884 | btc: /* btc */ | ||
| 1885 | /* only subword offset */ | ||
| 1886 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
| 1887 | emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); | ||
| 1888 | break; | ||
| 1889 | case 0xbe ... 0xbf: /* movsx */ | ||
| 1890 | c->dst.bytes = c->op_bytes; | ||
| 1891 | c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : | ||
| 1892 | (s16) c->src.val; | ||
| 1893 | break; | ||
| 1894 | case 0xc3: /* movnti */ | ||
| 1895 | c->dst.bytes = c->op_bytes; | ||
| 1896 | c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : | ||
| 1897 | (u64) c->src.val; | ||
| 1898 | break; | ||
| 1899 | case 0xc7: /* Grp9 (cmpxchg8b) */ | ||
| 1900 | rc = emulate_grp9(ctxt, ops, memop); | ||
| 1901 | if (rc != 0) | ||
| 1902 | goto done; | ||
| 1903 | c->dst.type = OP_NONE; | ||
| 1904 | break; | ||
| 1905 | } | ||
| 1906 | goto writeback; | ||
| 1907 | |||
| 1908 | cannot_emulate: | ||
| 1909 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
| 1910 | c->eip = saved_eip; | ||
| 1911 | return -1; | ||
| 1912 | } | ||
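The REP handling at the top of x86_emulate_insn() re-enters the emulator once per iteration: RCX reaching zero is the first termination test, and for CMPS/SCAS (opcodes 0xa6/0xa7/0xae/0xaf) ZF supplies the second. A condensed model of that loop control; the names are illustrative:

#include <stdio.h>

enum { REPE_PREFIX = 1, REPNE_PREFIX = 2 };
#define EFLG_ZF	(1 << 6)

static int rep_done(unsigned long rcx, int prefix, int is_cmps_scas,
		    unsigned flags)
{
	if (rcx == 0)				/* first condition: count */
		return 1;
	if (!is_cmps_scas)
		return 0;
	if (prefix == REPE_PREFIX && !(flags & EFLG_ZF))
		return 1;			/* REPE stops on ZF = 0 */
	if (prefix == REPNE_PREFIX && (flags & EFLG_ZF))
		return 1;			/* REPNE stops on ZF = 1 */
	return 0;
}

int main(void)
{
	printf("%d\n", rep_done(0, REPE_PREFIX, 1, 0));		/* 1 */
	printf("%d\n", rep_done(5, REPE_PREFIX, 1, 0));		/* 1 */
	printf("%d\n", rep_done(5, REPNE_PREFIX, 1, 0));	/* 0 */
	return 0;
}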
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a63373759f08..5afdde4895dc 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
| @@ -67,6 +67,7 @@ | |||
| 67 | #include <asm/mce.h> | 67 | #include <asm/mce.h> |
| 68 | #include <asm/io.h> | 68 | #include <asm/io.h> |
| 69 | #include <asm/i387.h> | 69 | #include <asm/i387.h> |
| 70 | #include <asm/reboot.h> /* for struct machine_ops */ | ||
| 70 | 71 | ||
| 71 | /*G:010 Welcome to the Guest! | 72 | /*G:010 Welcome to the Guest! |
| 72 | * | 73 | * |
| @@ -813,7 +814,7 @@ static void lguest_safe_halt(void) | |||
| 813 | * rather than virtual addresses, so we use __pa() here. */ | 814 | * rather than virtual addresses, so we use __pa() here. */ |
| 814 | static void lguest_power_off(void) | 815 | static void lguest_power_off(void) |
| 815 | { | 816 | { |
| 816 | hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); | 817 | hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0); |
| 817 | } | 818 | } |
| 818 | 819 | ||
| 819 | /* | 820 | /* |
| @@ -823,7 +824,7 @@ static void lguest_power_off(void) | |||
| 823 | */ | 824 | */ |
| 824 | static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) | 825 | static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) |
| 825 | { | 826 | { |
| 826 | hcall(LHCALL_CRASH, __pa(p), 0, 0); | 827 | hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0); |
| 827 | /* The hcall won't return, but to keep gcc happy, we're "done". */ | 828 | /* The hcall won't return, but to keep gcc happy, we're "done". */ |
| 828 | return NOTIFY_DONE; | 829 | return NOTIFY_DONE; |
| 829 | } | 830 | } |
| @@ -927,6 +928,11 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | |||
| 927 | return insn_len; | 928 | return insn_len; |
| 928 | } | 929 | } |
| 929 | 930 | ||
| 931 | static void lguest_restart(char *reason) | ||
| 932 | { | ||
| 933 | hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); | ||
| 934 | } | ||
| 935 | |||
| 930 | /*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops | 936 | /*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops |
| 931 | * structures in the kernel provide points for (almost) every routine we have | 937 | * structures in the kernel provide points for (almost) every routine we have |
| 932 | * to override to avoid privileged instructions. */ | 938 | * to override to avoid privileged instructions. */ |
| @@ -1060,6 +1066,7 @@ __init void lguest_init(void) | |||
| 1060 | * the Guest routine to power off. */ | 1066 | * the Guest routine to power off. */ |
| 1061 | pm_power_off = lguest_power_off; | 1067 | pm_power_off = lguest_power_off; |
| 1062 | 1068 | ||
| 1069 | machine_ops.restart = lguest_restart; | ||
| 1063 | /* Now we're set up, call start_kernel() in init/main.c and we proceed | 1070 | /* Now we're set up, call start_kernel() in init/main.c and we proceed |
| 1064 | * to boot as normal. It never returns. */ | 1071 | * to boot as normal. It never returns. */ |
| 1065 | start_kernel(); | 1072 | start_kernel(); |
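Taken together, the boot.c hunks replace the old single-purpose LHCALL_CRASH with one LHCALL_SHUTDOWN hypercall that carries a reason string plus a LGUEST_SHUTDOWN_* code, so power-off, panic, and the new restart path all reach the host through the same gate. A hedged sketch of that shared shape (the hcall() wrapper and constants are taken from the call sites above; the helper itself is illustrative):

/* Sketch only: pass the physical address of a human-readable reason plus a
 * shutdown code; the host decides whether to tear the guest down or, for
 * LGUEST_SHUTDOWN_RESTART, relaunch it. Not expected to return. */
static void sketch_shutdown(const char *reason, int restart)
{
        hcall(LHCALL_SHUTDOWN, __pa(reason),
              restart ? LGUEST_SHUTDOWN_RESTART : LGUEST_SHUTDOWN_POWEROFF, 0);
}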
diff --git a/block/bsg.c b/block/bsg.c index 69b0a9d33306..8917c5174dc2 100644 --- a/block/bsg.c +++ b/block/bsg.c | |||
| @@ -279,6 +279,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr) | |||
| 279 | goto out; | 279 | goto out; |
| 280 | } | 280 | } |
| 281 | rq->next_rq = next_rq; | 281 | rq->next_rq = next_rq; |
| 282 | next_rq->cmd_type = rq->cmd_type; | ||
| 282 | 283 | ||
| 283 | dxferp = (void*)(unsigned long)hdr->din_xferp; | 284 | dxferp = (void*)(unsigned long)hdr->din_xferp; |
| 284 | ret = blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len); | 285 | ret = blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len); |
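The one-line bsg fix above matters because a bidirectional SG_IO v4 command is built from two linked struct request objects: rq carries the dout half and next_rq the din half. Without the copy, next_rq keeps the allocator's default cmd_type and the block layer may treat the two halves of one command inconsistently. A hedged sketch of the invariant (assuming struct request from <linux/blkdev.h>; the helper name is hypothetical):

/* Sketch: after pairing the halves of a bidi request, both must agree on
 * how the block layer should interpret them. */
static void sketch_link_bidi(struct request *rq, struct request *next_rq)
{
        rq->next_rq = next_rq;            /* din half hangs off the dout half */
        next_rq->cmd_type = rq->cmd_type; /* keep the pair consistent */
}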
diff --git a/drivers/Kconfig b/drivers/Kconfig index f4076d9e9902..08d4ae201597 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig | |||
| @@ -90,8 +90,6 @@ source "drivers/dca/Kconfig" | |||
| 90 | 90 | ||
| 91 | source "drivers/auxdisplay/Kconfig" | 91 | source "drivers/auxdisplay/Kconfig" |
| 92 | 92 | ||
| 93 | source "drivers/kvm/Kconfig" | ||
| 94 | |||
| 95 | source "drivers/uio/Kconfig" | 93 | source "drivers/uio/Kconfig" |
| 96 | 94 | ||
| 97 | source "drivers/virtio/Kconfig" | 95 | source "drivers/virtio/Kconfig" |
diff --git a/drivers/Makefile b/drivers/Makefile index d92d4d82d001..0ee9a8a4095e 100644 --- a/drivers/Makefile +++ b/drivers/Makefile | |||
| @@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/ | |||
| 47 | obj-$(CONFIG_PCCARD) += pcmcia/ | 47 | obj-$(CONFIG_PCCARD) += pcmcia/ |
| 48 | obj-$(CONFIG_DIO) += dio/ | 48 | obj-$(CONFIG_DIO) += dio/ |
| 49 | obj-$(CONFIG_SBUS) += sbus/ | 49 | obj-$(CONFIG_SBUS) += sbus/ |
| 50 | obj-$(CONFIG_KVM) += kvm/ | ||
| 51 | obj-$(CONFIG_ZORRO) += zorro/ | 50 | obj-$(CONFIG_ZORRO) += zorro/ |
| 52 | obj-$(CONFIG_MAC) += macintosh/ | 51 | obj-$(CONFIG_MAC) += macintosh/ |
| 53 | obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ | 52 | obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ |
| @@ -73,7 +72,7 @@ obj-$(CONFIG_ISDN) += isdn/ | |||
| 73 | obj-$(CONFIG_EDAC) += edac/ | 72 | obj-$(CONFIG_EDAC) += edac/ |
| 74 | obj-$(CONFIG_MCA) += mca/ | 73 | obj-$(CONFIG_MCA) += mca/ |
| 75 | obj-$(CONFIG_EISA) += eisa/ | 74 | obj-$(CONFIG_EISA) += eisa/ |
| 76 | obj-$(CONFIG_LGUEST_GUEST) += lguest/ | 75 | obj-y += lguest/ |
| 77 | obj-$(CONFIG_CPU_FREQ) += cpufreq/ | 76 | obj-$(CONFIG_CPU_FREQ) += cpufreq/ |
| 78 | obj-$(CONFIG_CPU_IDLE) += cpuidle/ | 77 | obj-$(CONFIG_CPU_IDLE) += cpuidle/ |
| 79 | obj-$(CONFIG_MMC) += mmc/ | 78 | obj-$(CONFIG_MMC) += mmc/ |
diff --git a/drivers/base/bus.c b/drivers/base/bus.c index f484495b2ad1..055989e94799 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c | |||
| @@ -163,15 +163,6 @@ static struct kset *bus_kset; | |||
| 163 | 163 | ||
| 164 | #ifdef CONFIG_HOTPLUG | 164 | #ifdef CONFIG_HOTPLUG |
| 165 | /* Manually detach a device from its associated driver. */ | 165 | /* Manually detach a device from its associated driver. */ |
| 166 | static int driver_helper(struct device *dev, void *data) | ||
| 167 | { | ||
| 168 | const char *name = data; | ||
| 169 | |||
| 170 | if (strcmp(name, dev->bus_id) == 0) | ||
| 171 | return 1; | ||
| 172 | return 0; | ||
| 173 | } | ||
| 174 | |||
| 175 | static ssize_t driver_unbind(struct device_driver *drv, | 166 | static ssize_t driver_unbind(struct device_driver *drv, |
| 176 | const char *buf, size_t count) | 167 | const char *buf, size_t count) |
| 177 | { | 168 | { |
| @@ -179,7 +170,7 @@ static ssize_t driver_unbind(struct device_driver *drv, | |||
| 179 | struct device *dev; | 170 | struct device *dev; |
| 180 | int err = -ENODEV; | 171 | int err = -ENODEV; |
| 181 | 172 | ||
| 182 | dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); | 173 | dev = bus_find_device_by_name(bus, NULL, buf); |
| 183 | if (dev && dev->driver == drv) { | 174 | if (dev && dev->driver == drv) { |
| 184 | if (dev->parent) /* Needed for USB */ | 175 | if (dev->parent) /* Needed for USB */ |
| 185 | down(&dev->parent->sem); | 176 | down(&dev->parent->sem); |
| @@ -206,7 +197,7 @@ static ssize_t driver_bind(struct device_driver *drv, | |||
| 206 | struct device *dev; | 197 | struct device *dev; |
| 207 | int err = -ENODEV; | 198 | int err = -ENODEV; |
| 208 | 199 | ||
| 209 | dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); | 200 | dev = bus_find_device_by_name(bus, NULL, buf); |
| 210 | if (dev && dev->driver == NULL) { | 201 | if (dev && dev->driver == NULL) { |
| 211 | if (dev->parent) /* Needed for USB */ | 202 | if (dev->parent) /* Needed for USB */ |
| 212 | down(&dev->parent->sem); | 203 | down(&dev->parent->sem); |
| @@ -250,7 +241,7 @@ static ssize_t store_drivers_probe(struct bus_type *bus, | |||
| 250 | { | 241 | { |
| 251 | struct device *dev; | 242 | struct device *dev; |
| 252 | 243 | ||
| 253 | dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); | 244 | dev = bus_find_device_by_name(bus, NULL, buf); |
| 254 | if (!dev) | 245 | if (!dev) |
| 255 | return -ENODEV; | 246 | return -ENODEV; |
| 256 | if (bus_rescan_devices_helper(dev, NULL) != 0) | 247 | if (bus_rescan_devices_helper(dev, NULL) != 0) |
| @@ -338,6 +329,32 @@ struct device *bus_find_device(struct bus_type *bus, | |||
| 338 | } | 329 | } |
| 339 | EXPORT_SYMBOL_GPL(bus_find_device); | 330 | EXPORT_SYMBOL_GPL(bus_find_device); |
| 340 | 331 | ||
| 332 | static int match_name(struct device *dev, void *data) | ||
| 333 | { | ||
| 334 | const char *name = data; | ||
| 335 | |||
| 336 | if (strcmp(name, dev->bus_id) == 0) | ||
| 337 | return 1; | ||
| 338 | return 0; | ||
| 339 | } | ||
| 340 | |||
| 341 | /** | ||
| 342 | * bus_find_device_by_name - device iterator for locating a particular device of a specific name | ||
| 343 | * @bus: bus type | ||
| 344 | * @start: Device to begin with | ||
| 345 | * @name: name of the device to match | ||
| 346 | * | ||
| 347 | * This is similar to the bus_find_device() function above, but it handles | ||
| 348 | * searching by a name automatically, so there is no need to write another | ||
| 349 | * strcmp matching function. | ||
| 350 | */ | ||
| 351 | struct device *bus_find_device_by_name(struct bus_type *bus, | ||
| 352 | struct device *start, const char *name) | ||
| 353 | { | ||
| 354 | return bus_find_device(bus, start, (void *)name, match_name); | ||
| 355 | } | ||
| 356 | EXPORT_SYMBOL_GPL(bus_find_device_by_name); | ||
| 357 | |||
| 341 | static struct device_driver *next_driver(struct klist_iter *i) | 358 | static struct device_driver *next_driver(struct klist_iter *i) |
| 342 | { | 359 | { |
| 343 | struct klist_node *n = klist_next(i); | 360 | struct klist_node *n = klist_next(i); |
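With the matcher moved next to its only real use, callers that previously threaded driver_helper() through bus_find_device() can call the wrapper directly. A minimal usage sketch (the bus pointer and name are placeholders; the put_device() is needed because bus_find_device() returns a referenced device):

#include <linux/device.h>

static int sketch_poke_device(struct bus_type *bus, const char *name)
{
        struct device *dev;

        dev = bus_find_device_by_name(bus, NULL, name);
        if (!dev)
                return -ENODEV;

        /* ... operate on dev ... */

        put_device(dev);        /* drop the reference the lookup took */
        return 0;
}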
diff --git a/drivers/base/class.c b/drivers/base/class.c index 59cf35894cfc..9d915376c313 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c | |||
| @@ -149,7 +149,7 @@ int class_register(struct class *cls) | |||
| 149 | if (error) | 149 | if (error) |
| 150 | return error; | 150 | return error; |
| 151 | 151 | ||
| 152 | #ifdef CONFIG_SYSFS_DEPRECATED | 152 | #if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK) |
| 153 | /* let the block class directory show up in the root of sysfs */ | 153 | /* let the block class directory show up in the root of sysfs */ |
| 154 | if (cls != &block_class) | 154 | if (cls != &block_class) |
| 155 | cls->subsys.kobj.kset = class_kset; | 155 | cls->subsys.kobj.kset = class_kset; |
| @@ -863,7 +863,7 @@ EXPORT_SYMBOL_GPL(class_for_each_device); | |||
| 863 | * The callback should return 0 if the device doesn't match and non-zero | 863 | * The callback should return 0 if the device doesn't match and non-zero |
| 864 | * if it does. If the callback returns non-zero, this function will | 864 | * if it does. If the callback returns non-zero, this function will |
| 865 | * return to the caller and not iterate over any more devices. | 865 | * return to the caller and not iterate over any more devices. |
| 866 | 866 | * | |
| 867 | * Note, you will need to drop the reference with put_device() after use. | 867 | * Note, you will need to drop the reference with put_device() after use. |
| 868 | * | 868 | * |
| 869 | * We hold class->sem in this function, so it can not be | 869 | * We hold class->sem in this function, so it can not be |
diff --git a/drivers/base/core.c b/drivers/base/core.c index edf3bbeb8d6a..b1727876182c 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c | |||
| @@ -27,9 +27,17 @@ | |||
| 27 | int (*platform_notify)(struct device *dev) = NULL; | 27 | int (*platform_notify)(struct device *dev) = NULL; |
| 28 | int (*platform_notify_remove)(struct device *dev) = NULL; | 28 | int (*platform_notify_remove)(struct device *dev) = NULL; |
| 29 | 29 | ||
| 30 | /* | 30 | #ifdef CONFIG_BLOCK |
| 31 | * sysfs bindings for devices. | 31 | static inline int device_is_not_partition(struct device *dev) |
| 32 | */ | 32 | { |
| 33 | return !(dev->type == &part_type); | ||
| 34 | } | ||
| 35 | #else | ||
| 36 | static inline int device_is_not_partition(struct device *dev) | ||
| 37 | { | ||
| 38 | return 1; | ||
| 39 | } | ||
| 40 | #endif | ||
| 33 | 41 | ||
| 34 | /** | 42 | /** |
| 35 | * dev_driver_string - Return a device's driver name, if at all possible | 43 | * dev_driver_string - Return a device's driver name, if at all possible |
| @@ -652,14 +660,14 @@ static int device_add_class_symlinks(struct device *dev) | |||
| 652 | #ifdef CONFIG_SYSFS_DEPRECATED | 660 | #ifdef CONFIG_SYSFS_DEPRECATED |
| 653 | /* stacked class devices need a symlink in the class directory */ | 661 | /* stacked class devices need a symlink in the class directory */ |
| 654 | if (dev->kobj.parent != &dev->class->subsys.kobj && | 662 | if (dev->kobj.parent != &dev->class->subsys.kobj && |
| 655 | dev->type != &part_type) { | 663 | device_is_not_partition(dev)) { |
| 656 | error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj, | 664 | error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj, |
| 657 | dev->bus_id); | 665 | dev->bus_id); |
| 658 | if (error) | 666 | if (error) |
| 659 | goto out_subsys; | 667 | goto out_subsys; |
| 660 | } | 668 | } |
| 661 | 669 | ||
| 662 | if (dev->parent && dev->type != &part_type) { | 670 | if (dev->parent && device_is_not_partition(dev)) { |
| 663 | struct device *parent = dev->parent; | 671 | struct device *parent = dev->parent; |
| 664 | char *class_name; | 672 | char *class_name; |
| 665 | 673 | ||
| @@ -688,11 +696,11 @@ static int device_add_class_symlinks(struct device *dev) | |||
| 688 | return 0; | 696 | return 0; |
| 689 | 697 | ||
| 690 | out_device: | 698 | out_device: |
| 691 | if (dev->parent && dev->type != &part_type) | 699 | if (dev->parent && device_is_not_partition(dev)) |
| 692 | sysfs_remove_link(&dev->kobj, "device"); | 700 | sysfs_remove_link(&dev->kobj, "device"); |
| 693 | out_busid: | 701 | out_busid: |
| 694 | if (dev->kobj.parent != &dev->class->subsys.kobj && | 702 | if (dev->kobj.parent != &dev->class->subsys.kobj && |
| 695 | dev->type != &part_type) | 703 | device_is_not_partition(dev)) |
| 696 | sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); | 704 | sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); |
| 697 | #else | 705 | #else |
| 698 | /* link in the class directory pointing to the device */ | 706 | /* link in the class directory pointing to the device */ |
| @@ -701,7 +709,7 @@ out_busid: | |||
| 701 | if (error) | 709 | if (error) |
| 702 | goto out_subsys; | 710 | goto out_subsys; |
| 703 | 711 | ||
| 704 | if (dev->parent && dev->type != &part_type) { | 712 | if (dev->parent && device_is_not_partition(dev)) { |
| 705 | error = sysfs_create_link(&dev->kobj, &dev->parent->kobj, | 713 | error = sysfs_create_link(&dev->kobj, &dev->parent->kobj, |
| 706 | "device"); | 714 | "device"); |
| 707 | if (error) | 715 | if (error) |
| @@ -725,7 +733,7 @@ static void device_remove_class_symlinks(struct device *dev) | |||
| 725 | return; | 733 | return; |
| 726 | 734 | ||
| 727 | #ifdef CONFIG_SYSFS_DEPRECATED | 735 | #ifdef CONFIG_SYSFS_DEPRECATED |
| 728 | if (dev->parent && dev->type != &part_type) { | 736 | if (dev->parent && device_is_not_partition(dev)) { |
| 729 | char *class_name; | 737 | char *class_name; |
| 730 | 738 | ||
| 731 | class_name = make_class_name(dev->class->name, &dev->kobj); | 739 | class_name = make_class_name(dev->class->name, &dev->kobj); |
| @@ -737,10 +745,10 @@ static void device_remove_class_symlinks(struct device *dev) | |||
| 737 | } | 745 | } |
| 738 | 746 | ||
| 739 | if (dev->kobj.parent != &dev->class->subsys.kobj && | 747 | if (dev->kobj.parent != &dev->class->subsys.kobj && |
| 740 | dev->type != &part_type) | 748 | device_is_not_partition(dev)) |
| 741 | sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); | 749 | sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); |
| 742 | #else | 750 | #else |
| 743 | if (dev->parent && dev->type != &part_type) | 751 | if (dev->parent && device_is_not_partition(dev)) |
| 744 | sysfs_remove_link(&dev->kobj, "device"); | 752 | sysfs_remove_link(&dev->kobj, "device"); |
| 745 | 753 | ||
| 746 | sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); | 754 | sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); |
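The device_is_not_partition() change is the standard kernel idiom for making a config-dependent test disappear: one real implementation under the #ifdef and a constant-returning stub otherwise, so every call site compiles unchanged and the compiler folds the dead branches away when CONFIG_BLOCK is off. A generic sketch of the pattern (all names hypothetical):

#ifdef CONFIG_FOO
static inline int foo_applies(struct device *dev)
{
        return dev->type == &foo_type;  /* real test when the feature exists */
}
#else
static inline int foo_applies(struct device *dev)
{
        return 0;                       /* feature compiled out: constant */
}
#endif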
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index f2d2c7e2c76b..195ce7c12319 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c | |||
| @@ -1571,7 +1571,6 @@ static struct scsi_host_template srp_template = { | |||
| 1571 | .this_id = -1, | 1571 | .this_id = -1, |
| 1572 | .cmd_per_lun = SRP_SQ_SIZE, | 1572 | .cmd_per_lun = SRP_SQ_SIZE, |
| 1573 | .use_clustering = ENABLE_CLUSTERING, | 1573 | .use_clustering = ENABLE_CLUSTERING, |
| 1574 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1575 | .shost_attrs = srp_host_attrs | 1574 | .shost_attrs = srp_host_attrs |
| 1576 | }; | 1575 | }; |
| 1577 | 1576 | ||
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h deleted file mode 100644 index 11fc014e2b30..000000000000 --- a/drivers/kvm/irq.h +++ /dev/null | |||
| @@ -1,165 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * irq.h: in kernel interrupt controller related definitions | ||
| 3 | * Copyright (c) 2007, Intel Corporation. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify it | ||
| 6 | * under the terms and conditions of the GNU General Public License, | ||
| 7 | * version 2, as published by the Free Software Foundation. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
| 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 12 | * more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License along with | ||
| 15 | * this program; if not, write to the Free Software Foundation, Inc., 59 Temple | ||
| 16 | * Place - Suite 330, Boston, MA 02111-1307 USA. | ||
| 17 | * Authors: | ||
| 18 | * Yaozu (Eddie) Dong <Eddie.dong@intel.com> | ||
| 19 | * | ||
| 20 | */ | ||
| 21 | |||
| 22 | #ifndef __IRQ_H | ||
| 23 | #define __IRQ_H | ||
| 24 | |||
| 25 | #include "kvm.h" | ||
| 26 | |||
| 27 | typedef void irq_request_func(void *opaque, int level); | ||
| 28 | |||
| 29 | struct kvm_kpic_state { | ||
| 30 | u8 last_irr; /* edge detection */ | ||
| 31 | u8 irr; /* interrupt request register */ | ||
| 32 | u8 imr; /* interrupt mask register */ | ||
| 33 | u8 isr; /* interrupt service register */ | ||
| 34 | u8 priority_add; /* highest irq priority */ | ||
| 35 | u8 irq_base; | ||
| 36 | u8 read_reg_select; | ||
| 37 | u8 poll; | ||
| 38 | u8 special_mask; | ||
| 39 | u8 init_state; | ||
| 40 | u8 auto_eoi; | ||
| 41 | u8 rotate_on_auto_eoi; | ||
| 42 | u8 special_fully_nested_mode; | ||
| 43 | u8 init4; /* true if 4 byte init */ | ||
| 44 | u8 elcr; /* PIIX edge/trigger selection */ | ||
| 45 | u8 elcr_mask; | ||
| 46 | struct kvm_pic *pics_state; | ||
| 47 | }; | ||
| 48 | |||
| 49 | struct kvm_pic { | ||
| 50 | struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ | ||
| 51 | irq_request_func *irq_request; | ||
| 52 | void *irq_request_opaque; | ||
| 53 | int output; /* intr from master PIC */ | ||
| 54 | struct kvm_io_device dev; | ||
| 55 | }; | ||
| 56 | |||
| 57 | struct kvm_pic *kvm_create_pic(struct kvm *kvm); | ||
| 58 | void kvm_pic_set_irq(void *opaque, int irq, int level); | ||
| 59 | int kvm_pic_read_irq(struct kvm_pic *s); | ||
| 60 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v); | ||
| 61 | int kvm_cpu_has_interrupt(struct kvm_vcpu *v); | ||
| 62 | void kvm_pic_update_irq(struct kvm_pic *s); | ||
| 63 | |||
| 64 | #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS | ||
| 65 | #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ | ||
| 66 | #define IOAPIC_EDGE_TRIG 0 | ||
| 67 | #define IOAPIC_LEVEL_TRIG 1 | ||
| 68 | |||
| 69 | #define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 | ||
| 70 | #define IOAPIC_MEM_LENGTH 0x100 | ||
| 71 | |||
| 72 | /* Direct registers. */ | ||
| 73 | #define IOAPIC_REG_SELECT 0x00 | ||
| 74 | #define IOAPIC_REG_WINDOW 0x10 | ||
| 75 | #define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ | ||
| 76 | |||
| 77 | /* Indirect registers. */ | ||
| 78 | #define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ | ||
| 79 | #define IOAPIC_REG_VERSION 0x01 | ||
| 80 | #define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ | ||
| 81 | |||
| 82 | struct kvm_ioapic { | ||
| 83 | u64 base_address; | ||
| 84 | u32 ioregsel; | ||
| 85 | u32 id; | ||
| 86 | u32 irr; | ||
| 87 | u32 pad; | ||
| 88 | union ioapic_redir_entry { | ||
| 89 | u64 bits; | ||
| 90 | struct { | ||
| 91 | u8 vector; | ||
| 92 | u8 delivery_mode:3; | ||
| 93 | u8 dest_mode:1; | ||
| 94 | u8 delivery_status:1; | ||
| 95 | u8 polarity:1; | ||
| 96 | u8 remote_irr:1; | ||
| 97 | u8 trig_mode:1; | ||
| 98 | u8 mask:1; | ||
| 99 | u8 reserve:7; | ||
| 100 | u8 reserved[4]; | ||
| 101 | u8 dest_id; | ||
| 102 | } fields; | ||
| 103 | } redirtbl[IOAPIC_NUM_PINS]; | ||
| 104 | struct kvm_io_device dev; | ||
| 105 | struct kvm *kvm; | ||
| 106 | }; | ||
| 107 | |||
| 108 | struct kvm_lapic { | ||
| 109 | unsigned long base_address; | ||
| 110 | struct kvm_io_device dev; | ||
| 111 | struct { | ||
| 112 | atomic_t pending; | ||
| 113 | s64 period; /* unit: ns */ | ||
| 114 | u32 divide_count; | ||
| 115 | ktime_t last_update; | ||
| 116 | struct hrtimer dev; | ||
| 117 | } timer; | ||
| 118 | struct kvm_vcpu *vcpu; | ||
| 119 | struct page *regs_page; | ||
| 120 | void *regs; | ||
| 121 | }; | ||
| 122 | |||
| 123 | #ifdef DEBUG | ||
| 124 | #define ASSERT(x) \ | ||
| 125 | do { \ | ||
| 126 | if (!(x)) { \ | ||
| 127 | printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ | ||
| 128 | __FILE__, __LINE__, #x); \ | ||
| 129 | BUG(); \ | ||
| 130 | } \ | ||
| 131 | } while (0) | ||
| 132 | #else | ||
| 133 | #define ASSERT(x) do { } while (0) | ||
| 134 | #endif | ||
| 135 | |||
| 136 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu); | ||
| 137 | int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); | ||
| 138 | int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); | ||
| 139 | int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); | ||
| 140 | int kvm_create_lapic(struct kvm_vcpu *vcpu); | ||
| 141 | void kvm_lapic_reset(struct kvm_vcpu *vcpu); | ||
| 142 | void kvm_free_apic(struct kvm_lapic *apic); | ||
| 143 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); | ||
| 144 | void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); | ||
| 145 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); | ||
| 146 | struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, | ||
| 147 | unsigned long bitmap); | ||
| 148 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); | ||
| 149 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); | ||
| 150 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); | ||
| 151 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); | ||
| 152 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); | ||
| 153 | int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig); | ||
| 154 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); | ||
| 155 | int kvm_ioapic_init(struct kvm *kvm); | ||
| 156 | void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); | ||
| 157 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu); | ||
| 158 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); | ||
| 159 | void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
| 160 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
| 161 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); | ||
| 162 | void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); | ||
| 163 | void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); | ||
| 164 | |||
| 165 | #endif | ||
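The union ioapic_redir_entry in the header above packs each 64-bit IOAPIC redirection register into named bitfields, accessed through the indirect SELECT/WINDOW pair that the IOAPIC_REG_* constants describe. A hedged sketch of reading entry n that way (the standard IOAPIC layout is assumed, with redirection entry n at indirect registers 0x10 + 2n and 0x10 + 2n + 1; the mmio base and helper are illustrative):

#include <stdint.h>

/* Sketch: select an indirect register, then read it through the window. */
static uint64_t sketch_read_redir(volatile uint32_t *base, unsigned pin)
{
        uint64_t lo, hi;

        base[IOAPIC_REG_SELECT / 4] = 0x10 + 2 * pin;       /* low half  */
        lo = base[IOAPIC_REG_WINDOW / 4];
        base[IOAPIC_REG_SELECT / 4] = 0x10 + 2 * pin + 1;   /* high half */
        hi = base[IOAPIC_REG_WINDOW / 4];
        return (hi << 32) | lo;
}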
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c deleted file mode 100644 index feb5ac986c5d..000000000000 --- a/drivers/kvm/mmu.c +++ /dev/null | |||
| @@ -1,1498 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Kernel-based Virtual Machine driver for Linux | ||
| 3 | * | ||
| 4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
| 5 | * machines without emulation or binary translation. | ||
| 6 | * | ||
| 7 | * MMU support | ||
| 8 | * | ||
| 9 | * Copyright (C) 2006 Qumranet, Inc. | ||
| 10 | * | ||
| 11 | * Authors: | ||
| 12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
| 13 | * Avi Kivity <avi@qumranet.com> | ||
| 14 | * | ||
| 15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
| 16 | * the COPYING file in the top-level directory. | ||
| 17 | * | ||
| 18 | */ | ||
| 19 | |||
| 20 | #include "vmx.h" | ||
| 21 | #include "kvm.h" | ||
| 22 | |||
| 23 | #include <linux/types.h> | ||
| 24 | #include <linux/string.h> | ||
| 25 | #include <linux/mm.h> | ||
| 26 | #include <linux/highmem.h> | ||
| 27 | #include <linux/module.h> | ||
| 28 | |||
| 29 | #include <asm/page.h> | ||
| 30 | #include <asm/cmpxchg.h> | ||
| 31 | |||
| 32 | #undef MMU_DEBUG | ||
| 33 | |||
| 34 | #undef AUDIT | ||
| 35 | |||
| 36 | #ifdef AUDIT | ||
| 37 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); | ||
| 38 | #else | ||
| 39 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} | ||
| 40 | #endif | ||
| 41 | |||
| 42 | #ifdef MMU_DEBUG | ||
| 43 | |||
| 44 | #define pgprintk(x...) do { if (dbg) printk(x); } while (0) | ||
| 45 | #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) | ||
| 46 | |||
| 47 | #else | ||
| 48 | |||
| 49 | #define pgprintk(x...) do { } while (0) | ||
| 50 | #define rmap_printk(x...) do { } while (0) | ||
| 51 | |||
| 52 | #endif | ||
| 53 | |||
| 54 | #if defined(MMU_DEBUG) || defined(AUDIT) | ||
| 55 | static int dbg = 1; | ||
| 56 | #endif | ||
| 57 | |||
| 58 | #ifndef MMU_DEBUG | ||
| 59 | #define ASSERT(x) do { } while (0) | ||
| 60 | #else | ||
| 61 | #define ASSERT(x) \ | ||
| 62 | if (!(x)) { \ | ||
| 63 | printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ | ||
| 64 | __FILE__, __LINE__, #x); \ | ||
| 65 | } | ||
| 66 | #endif | ||
| 67 | |||
| 68 | #define PT64_PT_BITS 9 | ||
| 69 | #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) | ||
| 70 | #define PT32_PT_BITS 10 | ||
| 71 | #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) | ||
| 72 | |||
| 73 | #define PT_WRITABLE_SHIFT 1 | ||
| 74 | |||
| 75 | #define PT_PRESENT_MASK (1ULL << 0) | ||
| 76 | #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) | ||
| 77 | #define PT_USER_MASK (1ULL << 2) | ||
| 78 | #define PT_PWT_MASK (1ULL << 3) | ||
| 79 | #define PT_PCD_MASK (1ULL << 4) | ||
| 80 | #define PT_ACCESSED_MASK (1ULL << 5) | ||
| 81 | #define PT_DIRTY_MASK (1ULL << 6) | ||
| 82 | #define PT_PAGE_SIZE_MASK (1ULL << 7) | ||
| 83 | #define PT_PAT_MASK (1ULL << 7) | ||
| 84 | #define PT_GLOBAL_MASK (1ULL << 8) | ||
| 85 | #define PT64_NX_MASK (1ULL << 63) | ||
| 86 | |||
| 87 | #define PT_PAT_SHIFT 7 | ||
| 88 | #define PT_DIR_PAT_SHIFT 12 | ||
| 89 | #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) | ||
| 90 | |||
| 91 | #define PT32_DIR_PSE36_SIZE 4 | ||
| 92 | #define PT32_DIR_PSE36_SHIFT 13 | ||
| 93 | #define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) | ||
| 94 | |||
| 95 | |||
| 96 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | ||
| 97 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | ||
| 98 | |||
| 99 | #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) | ||
| 100 | |||
| 101 | #define VALID_PAGE(x) ((x) != INVALID_PAGE) | ||
| 102 | |||
| 103 | #define PT64_LEVEL_BITS 9 | ||
| 104 | |||
| 105 | #define PT64_LEVEL_SHIFT(level) \ | ||
| 106 | ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS ) | ||
| 107 | |||
| 108 | #define PT64_LEVEL_MASK(level) \ | ||
| 109 | (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) | ||
| 110 | |||
| 111 | #define PT64_INDEX(address, level)\ | ||
| 112 | (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) | ||
| 113 | |||
| 114 | |||
| 115 | #define PT32_LEVEL_BITS 10 | ||
| 116 | |||
| 117 | #define PT32_LEVEL_SHIFT(level) \ | ||
| 118 | ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS ) | ||
| 119 | |||
| 120 | #define PT32_LEVEL_MASK(level) \ | ||
| 121 | (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) | ||
| 122 | |||
| 123 | #define PT32_INDEX(address, level)\ | ||
| 124 | (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) | ||
| 125 | |||
| 126 | |||
| 127 | #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) | ||
| 128 | #define PT64_DIR_BASE_ADDR_MASK \ | ||
| 129 | (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) | ||
| 130 | |||
| 131 | #define PT32_BASE_ADDR_MASK PAGE_MASK | ||
| 132 | #define PT32_DIR_BASE_ADDR_MASK \ | ||
| 133 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) | ||
| 134 | |||
| 135 | |||
| 136 | #define PFERR_PRESENT_MASK (1U << 0) | ||
| 137 | #define PFERR_WRITE_MASK (1U << 1) | ||
| 138 | #define PFERR_USER_MASK (1U << 2) | ||
| 139 | #define PFERR_FETCH_MASK (1U << 4) | ||
| 140 | |||
| 141 | #define PT64_ROOT_LEVEL 4 | ||
| 142 | #define PT32_ROOT_LEVEL 2 | ||
| 143 | #define PT32E_ROOT_LEVEL 3 | ||
| 144 | |||
| 145 | #define PT_DIRECTORY_LEVEL 2 | ||
| 146 | #define PT_PAGE_TABLE_LEVEL 1 | ||
| 147 | |||
| 148 | #define RMAP_EXT 4 | ||
| 149 | |||
| 150 | struct kvm_rmap_desc { | ||
| 151 | u64 *shadow_ptes[RMAP_EXT]; | ||
| 152 | struct kvm_rmap_desc *more; | ||
| 153 | }; | ||
| 154 | |||
| 155 | static struct kmem_cache *pte_chain_cache; | ||
| 156 | static struct kmem_cache *rmap_desc_cache; | ||
| 157 | static struct kmem_cache *mmu_page_header_cache; | ||
| 158 | |||
| 159 | static int is_write_protection(struct kvm_vcpu *vcpu) | ||
| 160 | { | ||
| 161 | return vcpu->cr0 & X86_CR0_WP; | ||
| 162 | } | ||
| 163 | |||
| 164 | static int is_cpuid_PSE36(void) | ||
| 165 | { | ||
| 166 | return 1; | ||
| 167 | } | ||
| 168 | |||
| 169 | static int is_nx(struct kvm_vcpu *vcpu) | ||
| 170 | { | ||
| 171 | return vcpu->shadow_efer & EFER_NX; | ||
| 172 | } | ||
| 173 | |||
| 174 | static int is_present_pte(unsigned long pte) | ||
| 175 | { | ||
| 176 | return pte & PT_PRESENT_MASK; | ||
| 177 | } | ||
| 178 | |||
| 179 | static int is_writeble_pte(unsigned long pte) | ||
| 180 | { | ||
| 181 | return pte & PT_WRITABLE_MASK; | ||
| 182 | } | ||
| 183 | |||
| 184 | static int is_io_pte(unsigned long pte) | ||
| 185 | { | ||
| 186 | return pte & PT_SHADOW_IO_MARK; | ||
| 187 | } | ||
| 188 | |||
| 189 | static int is_rmap_pte(u64 pte) | ||
| 190 | { | ||
| 191 | return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK)) | ||
| 192 | == (PT_WRITABLE_MASK | PT_PRESENT_MASK); | ||
| 193 | } | ||
| 194 | |||
| 195 | static void set_shadow_pte(u64 *sptep, u64 spte) | ||
| 196 | { | ||
| 197 | #ifdef CONFIG_X86_64 | ||
| 198 | set_64bit((unsigned long *)sptep, spte); | ||
| 199 | #else | ||
| 200 | set_64bit((unsigned long long *)sptep, spte); | ||
| 201 | #endif | ||
| 202 | } | ||
| 203 | |||
| 204 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | ||
| 205 | struct kmem_cache *base_cache, int min) | ||
| 206 | { | ||
| 207 | void *obj; | ||
| 208 | |||
| 209 | if (cache->nobjs >= min) | ||
| 210 | return 0; | ||
| 211 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | ||
| 212 | obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); | ||
| 213 | if (!obj) | ||
| 214 | return -ENOMEM; | ||
| 215 | cache->objects[cache->nobjs++] = obj; | ||
| 216 | } | ||
| 217 | return 0; | ||
| 218 | } | ||
| 219 | |||
| 220 | static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) | ||
| 221 | { | ||
| 222 | while (mc->nobjs) | ||
| 223 | kfree(mc->objects[--mc->nobjs]); | ||
| 224 | } | ||
| 225 | |||
| 226 | static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, | ||
| 227 | int min) | ||
| 228 | { | ||
| 229 | struct page *page; | ||
| 230 | |||
| 231 | if (cache->nobjs >= min) | ||
| 232 | return 0; | ||
| 233 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | ||
| 234 | page = alloc_page(GFP_KERNEL); | ||
| 235 | if (!page) | ||
| 236 | return -ENOMEM; | ||
| 237 | set_page_private(page, 0); | ||
| 238 | cache->objects[cache->nobjs++] = page_address(page); | ||
| 239 | } | ||
| 240 | return 0; | ||
| 241 | } | ||
| 242 | |||
| 243 | static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) | ||
| 244 | { | ||
| 245 | while (mc->nobjs) | ||
| 246 | free_page((unsigned long)mc->objects[--mc->nobjs]); | ||
| 247 | } | ||
| 248 | |||
| 249 | static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) | ||
| 250 | { | ||
| 251 | int r; | ||
| 252 | |||
| 253 | kvm_mmu_free_some_pages(vcpu); | ||
| 254 | r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache, | ||
| 255 | pte_chain_cache, 4); | ||
| 256 | if (r) | ||
| 257 | goto out; | ||
| 258 | r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, | ||
| 259 | rmap_desc_cache, 1); | ||
| 260 | if (r) | ||
| 261 | goto out; | ||
| 262 | r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4); | ||
| 263 | if (r) | ||
| 264 | goto out; | ||
| 265 | r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache, | ||
| 266 | mmu_page_header_cache, 4); | ||
| 267 | out: | ||
| 268 | return r; | ||
| 269 | } | ||
| 270 | |||
| 271 | static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) | ||
| 272 | { | ||
| 273 | mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); | ||
| 274 | mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache); | ||
| 275 | mmu_free_memory_cache_page(&vcpu->mmu_page_cache); | ||
| 276 | mmu_free_memory_cache(&vcpu->mmu_page_header_cache); | ||
| 277 | } | ||
| 278 | |||
| 279 | static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, | ||
| 280 | size_t size) | ||
| 281 | { | ||
| 282 | void *p; | ||
| 283 | |||
| 284 | BUG_ON(!mc->nobjs); | ||
| 285 | p = mc->objects[--mc->nobjs]; | ||
| 286 | memset(p, 0, size); | ||
| 287 | return p; | ||
| 288 | } | ||
| 289 | |||
| 290 | static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) | ||
| 291 | { | ||
| 292 | return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache, | ||
| 293 | sizeof(struct kvm_pte_chain)); | ||
| 294 | } | ||
| 295 | |||
| 296 | static void mmu_free_pte_chain(struct kvm_pte_chain *pc) | ||
| 297 | { | ||
| 298 | kfree(pc); | ||
| 299 | } | ||
| 300 | |||
| 301 | static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) | ||
| 302 | { | ||
| 303 | return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache, | ||
| 304 | sizeof(struct kvm_rmap_desc)); | ||
| 305 | } | ||
| 306 | |||
| 307 | static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) | ||
| 308 | { | ||
| 309 | kfree(rd); | ||
| 310 | } | ||
| 311 | |||
| 312 | /* | ||
| 313 | * Reverse mapping data structures: | ||
| 314 | * | ||
| 315 | * If page->private bit zero is zero, then page->private points to the | ||
| 316 | * shadow page table entry that points to page_address(page). | ||
| 317 | * | ||
| 318 | * If page->private bit zero is one, then (page->private & ~1) points | ||
| 319 | * to a struct kvm_rmap_desc containing more mappings. | ||
| 320 | */ | ||
| 321 | static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte) | ||
| 322 | { | ||
| 323 | struct page *page; | ||
| 324 | struct kvm_rmap_desc *desc; | ||
| 325 | int i; | ||
| 326 | |||
| 327 | if (!is_rmap_pte(*spte)) | ||
| 328 | return; | ||
| 329 | page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); | ||
| 330 | if (!page_private(page)) { | ||
| 331 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); | ||
| 332 | set_page_private(page,(unsigned long)spte); | ||
| 333 | } else if (!(page_private(page) & 1)) { | ||
| 334 | rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); | ||
| 335 | desc = mmu_alloc_rmap_desc(vcpu); | ||
| 336 | desc->shadow_ptes[0] = (u64 *)page_private(page); | ||
| 337 | desc->shadow_ptes[1] = spte; | ||
| 338 | set_page_private(page,(unsigned long)desc | 1); | ||
| 339 | } else { | ||
| 340 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | ||
| 341 | desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); | ||
| 342 | while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) | ||
| 343 | desc = desc->more; | ||
| 344 | if (desc->shadow_ptes[RMAP_EXT-1]) { | ||
| 345 | desc->more = mmu_alloc_rmap_desc(vcpu); | ||
| 346 | desc = desc->more; | ||
| 347 | } | ||
| 348 | for (i = 0; desc->shadow_ptes[i]; ++i) | ||
| 349 | ; | ||
| 350 | desc->shadow_ptes[i] = spte; | ||
| 351 | } | ||
| 352 | } | ||
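/* Editorial aside -- a hedged sketch of the tagged-pointer encoding the
 * comment above rmap_add() documents: bit zero of page->private clear means
 * it holds the lone shadow pte pointer; bit zero set means (private & ~1)
 * is a struct kvm_rmap_desc chaining up to RMAP_EXT sptes per node. The
 * helper name is hypothetical. */
static u64 *sketch_first_rmap_spte(struct page *page)
{
        unsigned long priv = page_private(page);

        if (!priv)
                return NULL;                    /* no reverse mappings */
        if (!(priv & 1))
                return (u64 *)priv;             /* single spte, untagged */
        return ((struct kvm_rmap_desc *)(priv & ~1ul))->shadow_ptes[0];
}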
| 353 | |||
| 354 | static void rmap_desc_remove_entry(struct page *page, | ||
| 355 | struct kvm_rmap_desc *desc, | ||
| 356 | int i, | ||
| 357 | struct kvm_rmap_desc *prev_desc) | ||
| 358 | { | ||
| 359 | int j; | ||
| 360 | |||
| 361 | for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) | ||
| 362 | ; | ||
| 363 | desc->shadow_ptes[i] = desc->shadow_ptes[j]; | ||
| 364 | desc->shadow_ptes[j] = NULL; | ||
| 365 | if (j != 0) | ||
| 366 | return; | ||
| 367 | if (!prev_desc && !desc->more) | ||
| 368 | set_page_private(page,(unsigned long)desc->shadow_ptes[0]); | ||
| 369 | else | ||
| 370 | if (prev_desc) | ||
| 371 | prev_desc->more = desc->more; | ||
| 372 | else | ||
| 373 | set_page_private(page,(unsigned long)desc->more | 1); | ||
| 374 | mmu_free_rmap_desc(desc); | ||
| 375 | } | ||
| 376 | |||
| 377 | static void rmap_remove(u64 *spte) | ||
| 378 | { | ||
| 379 | struct page *page; | ||
| 380 | struct kvm_rmap_desc *desc; | ||
| 381 | struct kvm_rmap_desc *prev_desc; | ||
| 382 | int i; | ||
| 383 | |||
| 384 | if (!is_rmap_pte(*spte)) | ||
| 385 | return; | ||
| 386 | page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); | ||
| 387 | if (!page_private(page)) { | ||
| 388 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); | ||
| 389 | BUG(); | ||
| 390 | } else if (!(page_private(page) & 1)) { | ||
| 391 | rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); | ||
| 392 | if ((u64 *)page_private(page) != spte) { | ||
| 393 | printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", | ||
| 394 | spte, *spte); | ||
| 395 | BUG(); | ||
| 396 | } | ||
| 397 | set_page_private(page,0); | ||
| 398 | } else { | ||
| 399 | rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); | ||
| 400 | desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); | ||
| 401 | prev_desc = NULL; | ||
| 402 | while (desc) { | ||
| 403 | for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) | ||
| 404 | if (desc->shadow_ptes[i] == spte) { | ||
| 405 | rmap_desc_remove_entry(page, | ||
| 406 | desc, i, | ||
| 407 | prev_desc); | ||
| 408 | return; | ||
| 409 | } | ||
| 410 | prev_desc = desc; | ||
| 411 | desc = desc->more; | ||
| 412 | } | ||
| 413 | BUG(); | ||
| 414 | } | ||
| 415 | } | ||
| 416 | |||
| 417 | static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) | ||
| 418 | { | ||
| 419 | struct kvm *kvm = vcpu->kvm; | ||
| 420 | struct page *page; | ||
| 421 | struct kvm_rmap_desc *desc; | ||
| 422 | u64 *spte; | ||
| 423 | |||
| 424 | page = gfn_to_page(kvm, gfn); | ||
| 425 | BUG_ON(!page); | ||
| 426 | |||
| 427 | while (page_private(page)) { | ||
| 428 | if (!(page_private(page) & 1)) | ||
| 429 | spte = (u64 *)page_private(page); | ||
| 430 | else { | ||
| 431 | desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); | ||
| 432 | spte = desc->shadow_ptes[0]; | ||
| 433 | } | ||
| 434 | BUG_ON(!spte); | ||
| 435 | BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT | ||
| 436 | != page_to_pfn(page)); | ||
| 437 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | ||
| 438 | BUG_ON(!(*spte & PT_WRITABLE_MASK)); | ||
| 439 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); | ||
| 440 | rmap_remove(spte); | ||
| 441 | set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); | ||
| 442 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
| 443 | } | ||
| 444 | } | ||
| 445 | |||
| 446 | #ifdef MMU_DEBUG | ||
| 447 | static int is_empty_shadow_page(u64 *spt) | ||
| 448 | { | ||
| 449 | u64 *pos; | ||
| 450 | u64 *end; | ||
| 451 | |||
| 452 | for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) | ||
| 453 | if (*pos != 0) { | ||
| 454 | printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, | ||
| 455 | pos, *pos); | ||
| 456 | return 0; | ||
| 457 | } | ||
| 458 | return 1; | ||
| 459 | } | ||
| 460 | #endif | ||
| 461 | |||
| 462 | static void kvm_mmu_free_page(struct kvm *kvm, | ||
| 463 | struct kvm_mmu_page *page_head) | ||
| 464 | { | ||
| 465 | ASSERT(is_empty_shadow_page(page_head->spt)); | ||
| 466 | list_del(&page_head->link); | ||
| 467 | __free_page(virt_to_page(page_head->spt)); | ||
| 468 | kfree(page_head); | ||
| 469 | ++kvm->n_free_mmu_pages; | ||
| 470 | } | ||
| 471 | |||
| 472 | static unsigned kvm_page_table_hashfn(gfn_t gfn) | ||
| 473 | { | ||
| 474 | return gfn; | ||
| 475 | } | ||
| 476 | |||
| 477 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | ||
| 478 | u64 *parent_pte) | ||
| 479 | { | ||
| 480 | struct kvm_mmu_page *page; | ||
| 481 | |||
| 482 | if (!vcpu->kvm->n_free_mmu_pages) | ||
| 483 | return NULL; | ||
| 484 | |||
| 485 | page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache, | ||
| 486 | sizeof *page); | ||
| 487 | page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE); | ||
| 488 | set_page_private(virt_to_page(page->spt), (unsigned long)page); | ||
| 489 | list_add(&page->link, &vcpu->kvm->active_mmu_pages); | ||
| 490 | ASSERT(is_empty_shadow_page(page->spt)); | ||
| 491 | page->slot_bitmap = 0; | ||
| 492 | page->multimapped = 0; | ||
| 493 | page->parent_pte = parent_pte; | ||
| 494 | --vcpu->kvm->n_free_mmu_pages; | ||
| 495 | return page; | ||
| 496 | } | ||
| 497 | |||
| 498 | static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, | ||
| 499 | struct kvm_mmu_page *page, u64 *parent_pte) | ||
| 500 | { | ||
| 501 | struct kvm_pte_chain *pte_chain; | ||
| 502 | struct hlist_node *node; | ||
| 503 | int i; | ||
| 504 | |||
| 505 | if (!parent_pte) | ||
| 506 | return; | ||
| 507 | if (!page->multimapped) { | ||
| 508 | u64 *old = page->parent_pte; | ||
| 509 | |||
| 510 | if (!old) { | ||
| 511 | page->parent_pte = parent_pte; | ||
| 512 | return; | ||
| 513 | } | ||
| 514 | page->multimapped = 1; | ||
| 515 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
| 516 | INIT_HLIST_HEAD(&page->parent_ptes); | ||
| 517 | hlist_add_head(&pte_chain->link, &page->parent_ptes); | ||
| 518 | pte_chain->parent_ptes[0] = old; | ||
| 519 | } | ||
| 520 | hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) { | ||
| 521 | if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) | ||
| 522 | continue; | ||
| 523 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) | ||
| 524 | if (!pte_chain->parent_ptes[i]) { | ||
| 525 | pte_chain->parent_ptes[i] = parent_pte; | ||
| 526 | return; | ||
| 527 | } | ||
| 528 | } | ||
| 529 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
| 530 | BUG_ON(!pte_chain); | ||
| 531 | hlist_add_head(&pte_chain->link, &page->parent_ptes); | ||
| 532 | pte_chain->parent_ptes[0] = parent_pte; | ||
| 533 | } | ||
| 534 | |||
| 535 | static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page, | ||
| 536 | u64 *parent_pte) | ||
| 537 | { | ||
| 538 | struct kvm_pte_chain *pte_chain; | ||
| 539 | struct hlist_node *node; | ||
| 540 | int i; | ||
| 541 | |||
| 542 | if (!page->multimapped) { | ||
| 543 | BUG_ON(page->parent_pte != parent_pte); | ||
| 544 | page->parent_pte = NULL; | ||
| 545 | return; | ||
| 546 | } | ||
| 547 | hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) | ||
| 548 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
| 549 | if (!pte_chain->parent_ptes[i]) | ||
| 550 | break; | ||
| 551 | if (pte_chain->parent_ptes[i] != parent_pte) | ||
| 552 | continue; | ||
| 553 | while (i + 1 < NR_PTE_CHAIN_ENTRIES | ||
| 554 | && pte_chain->parent_ptes[i + 1]) { | ||
| 555 | pte_chain->parent_ptes[i] | ||
| 556 | = pte_chain->parent_ptes[i + 1]; | ||
| 557 | ++i; | ||
| 558 | } | ||
| 559 | pte_chain->parent_ptes[i] = NULL; | ||
| 560 | if (i == 0) { | ||
| 561 | hlist_del(&pte_chain->link); | ||
| 562 | mmu_free_pte_chain(pte_chain); | ||
| 563 | if (hlist_empty(&page->parent_ptes)) { | ||
| 564 | page->multimapped = 0; | ||
| 565 | page->parent_pte = NULL; | ||
| 566 | } | ||
| 567 | } | ||
| 568 | return; | ||
| 569 | } | ||
| 570 | BUG(); | ||
| 571 | } | ||
| 572 | |||
| 573 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu, | ||
| 574 | gfn_t gfn) | ||
| 575 | { | ||
| 576 | unsigned index; | ||
| 577 | struct hlist_head *bucket; | ||
| 578 | struct kvm_mmu_page *page; | ||
| 579 | struct hlist_node *node; | ||
| 580 | |||
| 581 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
| 582 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
| 583 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
| 584 | hlist_for_each_entry(page, node, bucket, hash_link) | ||
| 585 | if (page->gfn == gfn && !page->role.metaphysical) { | ||
| 586 | pgprintk("%s: found role %x\n", | ||
| 587 | __FUNCTION__, page->role.word); | ||
| 588 | return page; | ||
| 589 | } | ||
| 590 | return NULL; | ||
| 591 | } | ||
| 592 | |||
| 593 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | ||
| 594 | gfn_t gfn, | ||
| 595 | gva_t gaddr, | ||
| 596 | unsigned level, | ||
| 597 | int metaphysical, | ||
| 598 | unsigned hugepage_access, | ||
| 599 | u64 *parent_pte) | ||
| 600 | { | ||
| 601 | union kvm_mmu_page_role role; | ||
| 602 | unsigned index; | ||
| 603 | unsigned quadrant; | ||
| 604 | struct hlist_head *bucket; | ||
| 605 | struct kvm_mmu_page *page; | ||
| 606 | struct hlist_node *node; | ||
| 607 | |||
| 608 | role.word = 0; | ||
| 609 | role.glevels = vcpu->mmu.root_level; | ||
| 610 | role.level = level; | ||
| 611 | role.metaphysical = metaphysical; | ||
| 612 | role.hugepage_access = hugepage_access; | ||
| 613 | if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) { | ||
| 614 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); | ||
| 615 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | ||
| 616 | role.quadrant = quadrant; | ||
| 617 | } | ||
| 618 | pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, | ||
| 619 | gfn, role.word); | ||
| 620 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
| 621 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
| 622 | hlist_for_each_entry(page, node, bucket, hash_link) | ||
| 623 | if (page->gfn == gfn && page->role.word == role.word) { | ||
| 624 | mmu_page_add_parent_pte(vcpu, page, parent_pte); | ||
| 625 | pgprintk("%s: found\n", __FUNCTION__); | ||
| 626 | return page; | ||
| 627 | } | ||
| 628 | page = kvm_mmu_alloc_page(vcpu, parent_pte); | ||
| 629 | if (!page) | ||
| 630 | return page; | ||
| 631 | pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); | ||
| 632 | page->gfn = gfn; | ||
| 633 | page->role = role; | ||
| 634 | hlist_add_head(&page->hash_link, bucket); | ||
| 635 | if (!metaphysical) | ||
| 636 | rmap_write_protect(vcpu, gfn); | ||
| 637 | return page; | ||
| 638 | } | ||
| 639 | |||
| 640 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | ||
| 641 | struct kvm_mmu_page *page) | ||
| 642 | { | ||
| 643 | unsigned i; | ||
| 644 | u64 *pt; | ||
| 645 | u64 ent; | ||
| 646 | |||
| 647 | pt = page->spt; | ||
| 648 | |||
| 649 | if (page->role.level == PT_PAGE_TABLE_LEVEL) { | ||
| 650 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
| 651 | if (pt[i] & PT_PRESENT_MASK) | ||
| 652 | rmap_remove(&pt[i]); | ||
| 653 | pt[i] = 0; | ||
| 654 | } | ||
| 655 | kvm_flush_remote_tlbs(kvm); | ||
| 656 | return; | ||
| 657 | } | ||
| 658 | |||
| 659 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
| 660 | ent = pt[i]; | ||
| 661 | |||
| 662 | pt[i] = 0; | ||
| 663 | if (!(ent & PT_PRESENT_MASK)) | ||
| 664 | continue; | ||
| 665 | ent &= PT64_BASE_ADDR_MASK; | ||
| 666 | mmu_page_remove_parent_pte(page_header(ent), &pt[i]); | ||
| 667 | } | ||
| 668 | kvm_flush_remote_tlbs(kvm); | ||
| 669 | } | ||
| 670 | |||
| 671 | static void kvm_mmu_put_page(struct kvm_mmu_page *page, | ||
| 672 | u64 *parent_pte) | ||
| 673 | { | ||
| 674 | mmu_page_remove_parent_pte(page, parent_pte); | ||
| 675 | } | ||
| 676 | |||
| 677 | static void kvm_mmu_zap_page(struct kvm *kvm, | ||
| 678 | struct kvm_mmu_page *page) | ||
| 679 | { | ||
| 680 | u64 *parent_pte; | ||
| 681 | |||
| 682 | while (page->multimapped || page->parent_pte) { | ||
| 683 | if (!page->multimapped) | ||
| 684 | parent_pte = page->parent_pte; | ||
| 685 | else { | ||
| 686 | struct kvm_pte_chain *chain; | ||
| 687 | |||
| 688 | chain = container_of(page->parent_ptes.first, | ||
| 689 | struct kvm_pte_chain, link); | ||
| 690 | parent_pte = chain->parent_ptes[0]; | ||
| 691 | } | ||
| 692 | BUG_ON(!parent_pte); | ||
| 693 | kvm_mmu_put_page(page, parent_pte); | ||
| 694 | set_shadow_pte(parent_pte, 0); | ||
| 695 | } | ||
| 696 | kvm_mmu_page_unlink_children(kvm, page); | ||
| 697 | if (!page->root_count) { | ||
| 698 | hlist_del(&page->hash_link); | ||
| 699 | kvm_mmu_free_page(kvm, page); | ||
| 700 | } else | ||
| 701 | list_move(&page->link, &kvm->active_mmu_pages); | ||
| 702 | } | ||
| 703 | |||
| 704 | static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
| 705 | { | ||
| 706 | unsigned index; | ||
| 707 | struct hlist_head *bucket; | ||
| 708 | struct kvm_mmu_page *page; | ||
| 709 | struct hlist_node *node, *n; | ||
| 710 | int r; | ||
| 711 | |||
| 712 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
| 713 | r = 0; | ||
| 714 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
| 715 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
| 716 | hlist_for_each_entry_safe(page, node, n, bucket, hash_link) | ||
| 717 | if (page->gfn == gfn && !page->role.metaphysical) { | ||
| 718 | pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, | ||
| 719 | page->role.word); | ||
| 720 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
| 721 | r = 1; | ||
| 722 | } | ||
| 723 | return r; | ||
| 724 | } | ||
| 725 | |||
| 726 | static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
| 727 | { | ||
| 728 | struct kvm_mmu_page *page; | ||
| 729 | |||
| 730 | while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { | ||
| 731 | pgprintk("%s: zap %lx %x\n", | ||
| 732 | __FUNCTION__, gfn, page->role.word); | ||
| 733 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
| 734 | } | ||
| 735 | } | ||
| 736 | |||
| 737 | static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) | ||
| 738 | { | ||
| 739 | int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); | ||
| 740 | struct kvm_mmu_page *page_head = page_header(__pa(pte)); | ||
| 741 | |||
| 742 | __set_bit(slot, &page_head->slot_bitmap); | ||
| 743 | } | ||
| 744 | |||
| 745 | hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) | ||
| 746 | { | ||
| 747 | hpa_t hpa = gpa_to_hpa(vcpu, gpa); | ||
| 748 | |||
| 749 | return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa; | ||
| 750 | } | ||
| 751 | |||
| 752 | hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) | ||
| 753 | { | ||
| 754 | struct page *page; | ||
| 755 | |||
| 756 | ASSERT((gpa & HPA_ERR_MASK) == 0); | ||
| 757 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
| 758 | if (!page) | ||
| 759 | return gpa | HPA_ERR_MASK; | ||
| 760 | return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) | ||
| 761 | | (gpa & (PAGE_SIZE-1)); | ||
| 762 | } | ||
| 763 | |||
| 764 | hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva) | ||
| 765 | { | ||
| 766 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); | ||
| 767 | |||
| 768 | if (gpa == UNMAPPED_GVA) | ||
| 769 | return UNMAPPED_GVA; | ||
| 770 | return gpa_to_hpa(vcpu, gpa); | ||
| 771 | } | ||
| 772 | |||
| 773 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | ||
| 774 | { | ||
| 775 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); | ||
| 776 | |||
| 777 | if (gpa == UNMAPPED_GVA) | ||
| 778 | return NULL; | ||
| 779 | return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT); | ||
| 780 | } | ||
| 781 | |||
| 782 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | ||
| 783 | { | ||
| 784 | } | ||
| 785 | |||
| 786 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) | ||
| 787 | { | ||
| 788 | int level = PT32E_ROOT_LEVEL; | ||
| 789 | hpa_t table_addr = vcpu->mmu.root_hpa; | ||
| 790 | |||
| 791 | for (; ; level--) { | ||
| 792 | u32 index = PT64_INDEX(v, level); | ||
| 793 | u64 *table; | ||
| 794 | u64 pte; | ||
| 795 | |||
| 796 | ASSERT(VALID_PAGE(table_addr)); | ||
| 797 | table = __va(table_addr); | ||
| 798 | |||
| 799 | if (level == 1) { | ||
| 800 | pte = table[index]; | ||
| 801 | if (is_present_pte(pte) && is_writeble_pte(pte)) | ||
| 802 | return 0; | ||
| 803 | mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); | ||
| 804 | page_header_update_slot(vcpu->kvm, table, v); | ||
| 805 | table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | | ||
| 806 | PT_USER_MASK; | ||
| 807 | rmap_add(vcpu, &table[index]); | ||
| 808 | return 0; | ||
| 809 | } | ||
| 810 | |||
| 811 | if (table[index] == 0) { | ||
| 812 | struct kvm_mmu_page *new_table; | ||
| 813 | gfn_t pseudo_gfn; | ||
| 814 | |||
| 815 | pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) | ||
| 816 | >> PAGE_SHIFT; | ||
| 817 | new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, | ||
| 818 | v, level - 1, | ||
| 819 | 1, 0, &table[index]); | ||
| 820 | if (!new_table) { | ||
| 821 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
| 822 | return -ENOMEM; | ||
| 823 | } | ||
| 824 | |||
| 825 | table[index] = __pa(new_table->spt) | PT_PRESENT_MASK | ||
| 826 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
| 827 | } | ||
| 828 | table_addr = table[index] & PT64_BASE_ADDR_MASK; | ||
| 829 | } | ||
| 830 | } | ||
| 831 | |||
| 832 | static void mmu_free_roots(struct kvm_vcpu *vcpu) | ||
| 833 | { | ||
| 834 | int i; | ||
| 835 | struct kvm_mmu_page *page; | ||
| 836 | |||
| 837 | if (!VALID_PAGE(vcpu->mmu.root_hpa)) | ||
| 838 | return; | ||
| 839 | #ifdef CONFIG_X86_64 | ||
| 840 | if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
| 841 | hpa_t root = vcpu->mmu.root_hpa; | ||
| 842 | |||
| 843 | page = page_header(root); | ||
| 844 | --page->root_count; | ||
| 845 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
| 846 | return; | ||
| 847 | } | ||
| 848 | #endif | ||
| 849 | for (i = 0; i < 4; ++i) { | ||
| 850 | hpa_t root = vcpu->mmu.pae_root[i]; | ||
| 851 | |||
| 852 | if (root) { | ||
| 853 | root &= PT64_BASE_ADDR_MASK; | ||
| 854 | page = page_header(root); | ||
| 855 | --page->root_count; | ||
| 856 | } | ||
| 857 | vcpu->mmu.pae_root[i] = INVALID_PAGE; | ||
| 858 | } | ||
| 859 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
| 860 | } | ||
| 861 | |||
| 862 | static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | ||
| 863 | { | ||
| 864 | int i; | ||
| 865 | gfn_t root_gfn; | ||
| 866 | struct kvm_mmu_page *page; | ||
| 867 | |||
| 868 | root_gfn = vcpu->cr3 >> PAGE_SHIFT; | ||
| 869 | |||
| 870 | #ifdef CONFIG_X86_64 | ||
| 871 | if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
| 872 | hpa_t root = vcpu->mmu.root_hpa; | ||
| 873 | |||
| 874 | ASSERT(!VALID_PAGE(root)); | ||
| 875 | page = kvm_mmu_get_page(vcpu, root_gfn, 0, | ||
| 876 | PT64_ROOT_LEVEL, 0, 0, NULL); | ||
| 877 | root = __pa(page->spt); | ||
| 878 | ++page->root_count; | ||
| 879 | vcpu->mmu.root_hpa = root; | ||
| 880 | return; | ||
| 881 | } | ||
| 882 | #endif | ||
| 883 | for (i = 0; i < 4; ++i) { | ||
| 884 | hpa_t root = vcpu->mmu.pae_root[i]; | ||
| 885 | |||
| 886 | ASSERT(!VALID_PAGE(root)); | ||
| 887 | if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) { | ||
| 888 | if (!is_present_pte(vcpu->pdptrs[i])) { | ||
| 889 | vcpu->mmu.pae_root[i] = 0; | ||
| 890 | continue; | ||
| 891 | } | ||
| 892 | root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT; | ||
| 893 | } else if (vcpu->mmu.root_level == 0) | ||
| 894 | root_gfn = 0; | ||
| 895 | page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | ||
| 896 | PT32_ROOT_LEVEL, !is_paging(vcpu), | ||
| 897 | 0, NULL); | ||
| 898 | root = __pa(page->spt); | ||
| 899 | ++page->root_count; | ||
| 900 | vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; | ||
| 901 | } | ||
| 902 | vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); | ||
| 903 | } | ||
| 904 | |||
| 905 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
| 906 | { | ||
| 907 | return vaddr; | ||
| 908 | } | ||
| 909 | |||
| 910 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | ||
| 911 | u32 error_code) | ||
| 912 | { | ||
| 913 | gpa_t addr = gva; | ||
| 914 | hpa_t paddr; | ||
| 915 | int r; | ||
| 916 | |||
| 917 | r = mmu_topup_memory_caches(vcpu); | ||
| 918 | if (r) | ||
| 919 | return r; | ||
| 920 | |||
| 921 | ASSERT(vcpu); | ||
| 922 | ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); | ||
| 923 | |||
| 924 | |||
| 925 | paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK); | ||
| 926 | |||
| 927 | if (is_error_hpa(paddr)) | ||
| 928 | return 1; | ||
| 929 | |||
| 930 | return nonpaging_map(vcpu, addr & PAGE_MASK, paddr); | ||
| 931 | } | ||
| 932 | |||
| 933 | static void nonpaging_free(struct kvm_vcpu *vcpu) | ||
| 934 | { | ||
| 935 | mmu_free_roots(vcpu); | ||
| 936 | } | ||
| 937 | |||
| 938 | static int nonpaging_init_context(struct kvm_vcpu *vcpu) | ||
| 939 | { | ||
| 940 | struct kvm_mmu *context = &vcpu->mmu; | ||
| 941 | |||
| 942 | context->new_cr3 = nonpaging_new_cr3; | ||
| 943 | context->page_fault = nonpaging_page_fault; | ||
| 944 | context->gva_to_gpa = nonpaging_gva_to_gpa; | ||
| 945 | context->free = nonpaging_free; | ||
| 946 | context->root_level = 0; | ||
| 947 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
| 948 | context->root_hpa = INVALID_PAGE; | ||
| 949 | return 0; | ||
| 950 | } | ||
| 951 | |||
| 952 | static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | ||
| 953 | { | ||
| 954 | ++vcpu->stat.tlb_flush; | ||
| 955 | kvm_x86_ops->tlb_flush(vcpu); | ||
| 956 | } | ||
| 957 | |||
| 958 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | ||
| 959 | { | ||
| 960 | pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); | ||
| 961 | mmu_free_roots(vcpu); | ||
| 962 | } | ||
| 963 | |||
| 964 | static void inject_page_fault(struct kvm_vcpu *vcpu, | ||
| 965 | u64 addr, | ||
| 966 | u32 err_code) | ||
| 967 | { | ||
| 968 | kvm_x86_ops->inject_page_fault(vcpu, addr, err_code); | ||
| 969 | } | ||
| 970 | |||
| 971 | static void paging_free(struct kvm_vcpu *vcpu) | ||
| 972 | { | ||
| 973 | nonpaging_free(vcpu); | ||
| 974 | } | ||
| 975 | |||
| 976 | #define PTTYPE 64 | ||
| 977 | #include "paging_tmpl.h" | ||
| 978 | #undef PTTYPE | ||
| 979 | |||
| 980 | #define PTTYPE 32 | ||
| 981 | #include "paging_tmpl.h" | ||
| 982 | #undef PTTYPE | ||
| 983 | |||
| 984 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | ||
| 985 | { | ||
| 986 | struct kvm_mmu *context = &vcpu->mmu; | ||
| 987 | |||
| 988 | ASSERT(is_pae(vcpu)); | ||
| 989 | context->new_cr3 = paging_new_cr3; | ||
| 990 | context->page_fault = paging64_page_fault; | ||
| 991 | context->gva_to_gpa = paging64_gva_to_gpa; | ||
| 992 | context->free = paging_free; | ||
| 993 | context->root_level = level; | ||
| 994 | context->shadow_root_level = level; | ||
| 995 | context->root_hpa = INVALID_PAGE; | ||
| 996 | return 0; | ||
| 997 | } | ||
| 998 | |||
| 999 | static int paging64_init_context(struct kvm_vcpu *vcpu) | ||
| 1000 | { | ||
| 1001 | return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | static int paging32_init_context(struct kvm_vcpu *vcpu) | ||
| 1005 | { | ||
| 1006 | struct kvm_mmu *context = &vcpu->mmu; | ||
| 1007 | |||
| 1008 | context->new_cr3 = paging_new_cr3; | ||
| 1009 | context->page_fault = paging32_page_fault; | ||
| 1010 | context->gva_to_gpa = paging32_gva_to_gpa; | ||
| 1011 | context->free = paging_free; | ||
| 1012 | context->root_level = PT32_ROOT_LEVEL; | ||
| 1013 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
| 1014 | context->root_hpa = INVALID_PAGE; | ||
| 1015 | return 0; | ||
| 1016 | } | ||
| 1017 | |||
| 1018 | static int paging32E_init_context(struct kvm_vcpu *vcpu) | ||
| 1019 | { | ||
| 1020 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); | ||
| 1021 | } | ||
| 1022 | |||
| 1023 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | ||
| 1024 | { | ||
| 1025 | ASSERT(vcpu); | ||
| 1026 | ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); | ||
| 1027 | |||
| 1028 | if (!is_paging(vcpu)) | ||
| 1029 | return nonpaging_init_context(vcpu); | ||
| 1030 | else if (is_long_mode(vcpu)) | ||
| 1031 | return paging64_init_context(vcpu); | ||
| 1032 | else if (is_pae(vcpu)) | ||
| 1033 | return paging32E_init_context(vcpu); | ||
| 1034 | else | ||
| 1035 | return paging32_init_context(vcpu); | ||
| 1036 | } | ||
| 1037 | |||
| 1038 | static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) | ||
| 1039 | { | ||
| 1040 | ASSERT(vcpu); | ||
| 1041 | if (VALID_PAGE(vcpu->mmu.root_hpa)) { | ||
| 1042 | vcpu->mmu.free(vcpu); | ||
| 1043 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
| 1044 | } | ||
| 1045 | } | ||
| 1046 | |||
| 1047 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) | ||
| 1048 | { | ||
| 1049 | destroy_kvm_mmu(vcpu); | ||
| 1050 | return init_kvm_mmu(vcpu); | ||
| 1051 | } | ||
| 1052 | EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); | ||
| 1053 | |||
| 1054 | int kvm_mmu_load(struct kvm_vcpu *vcpu) | ||
| 1055 | { | ||
| 1056 | int r; | ||
| 1057 | |||
| 1058 | mutex_lock(&vcpu->kvm->lock); | ||
| 1059 | r = mmu_topup_memory_caches(vcpu); | ||
| 1060 | if (r) | ||
| 1061 | goto out; | ||
| 1062 | mmu_alloc_roots(vcpu); | ||
| 1063 | kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); | ||
| 1064 | kvm_mmu_flush_tlb(vcpu); | ||
| 1065 | out: | ||
| 1066 | mutex_unlock(&vcpu->kvm->lock); | ||
| 1067 | return r; | ||
| 1068 | } | ||
| 1069 | EXPORT_SYMBOL_GPL(kvm_mmu_load); | ||
| 1070 | |||
| 1071 | void kvm_mmu_unload(struct kvm_vcpu *vcpu) | ||
| 1072 | { | ||
| 1073 | mmu_free_roots(vcpu); | ||
| 1074 | } | ||
| 1075 | |||
| 1076 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | ||
| 1077 | struct kvm_mmu_page *page, | ||
| 1078 | u64 *spte) | ||
| 1079 | { | ||
| 1080 | u64 pte; | ||
| 1081 | struct kvm_mmu_page *child; | ||
| 1082 | |||
| 1083 | pte = *spte; | ||
| 1084 | if (is_present_pte(pte)) { | ||
| 1085 | if (page->role.level == PT_PAGE_TABLE_LEVEL) | ||
| 1086 | rmap_remove(spte); | ||
| 1087 | else { | ||
| 1088 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
| 1089 | mmu_page_remove_parent_pte(child, spte); | ||
| 1090 | } | ||
| 1091 | } | ||
| 1092 | set_shadow_pte(spte, 0); | ||
| 1093 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
| 1094 | } | ||
| 1095 | |||
| 1096 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | ||
| 1097 | struct kvm_mmu_page *page, | ||
| 1098 | u64 *spte, | ||
| 1099 | const void *new, int bytes) | ||
| 1100 | { | ||
| 1101 | if (page->role.level != PT_PAGE_TABLE_LEVEL) | ||
| 1102 | return; | ||
| 1103 | |||
| 1104 | if (page->role.glevels == PT32_ROOT_LEVEL) | ||
| 1105 | paging32_update_pte(vcpu, page, spte, new, bytes); | ||
| 1106 | else | ||
| 1107 | paging64_update_pte(vcpu, page, spte, new, bytes); | ||
| 1108 | } | ||
| 1109 | |||
| 1110 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
| 1111 | const u8 *new, int bytes) | ||
| 1112 | { | ||
| 1113 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
| 1114 | struct kvm_mmu_page *page; | ||
| 1115 | struct hlist_node *node, *n; | ||
| 1116 | struct hlist_head *bucket; | ||
| 1117 | unsigned index; | ||
| 1118 | u64 *spte; | ||
| 1119 | unsigned offset = offset_in_page(gpa); | ||
| 1120 | unsigned pte_size; | ||
| 1121 | unsigned page_offset; | ||
| 1122 | unsigned misaligned; | ||
| 1123 | unsigned quadrant; | ||
| 1124 | int level; | ||
| 1125 | int flooded = 0; | ||
| 1126 | int npte; | ||
| 1127 | |||
| 1128 | pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); | ||
| 1129 | if (gfn == vcpu->last_pt_write_gfn) { | ||
| 1130 | ++vcpu->last_pt_write_count; | ||
| 1131 | if (vcpu->last_pt_write_count >= 3) | ||
| 1132 | flooded = 1; | ||
| 1133 | } else { | ||
| 1134 | vcpu->last_pt_write_gfn = gfn; | ||
| 1135 | vcpu->last_pt_write_count = 1; | ||
| 1136 | } | ||
| 1137 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
| 1138 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
| 1139 | hlist_for_each_entry_safe(page, node, n, bucket, hash_link) { | ||
| 1140 | if (page->gfn != gfn || page->role.metaphysical) | ||
| 1141 | continue; | ||
| 1142 | pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; | ||
| 1143 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | ||
| 1144 | misaligned |= bytes < 4; | ||
| 1145 | if (misaligned || flooded) { | ||
| 1146 | /* | ||
| 1147 | * Misaligned accesses are too much trouble to fix | ||
| 1148 | * up; also, they usually indicate a page is not used | ||
| 1149 | * as a page table. | ||
| 1150 | * | ||
| 1151 | * If we're seeing too many writes to a page, | ||
| 1152 | * it may no longer be a page table, or we may be | ||
| 1153 | * forking, in which case it is better to unmap the | ||
| 1154 | * page. | ||
| 1155 | */ | ||
| 1156 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | ||
| 1157 | gpa, bytes, page->role.word); | ||
| 1158 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
| 1159 | continue; | ||
| 1160 | } | ||
| 1161 | page_offset = offset; | ||
| 1162 | level = page->role.level; | ||
| 1163 | npte = 1; | ||
| 1164 | if (page->role.glevels == PT32_ROOT_LEVEL) { | ||
| 1165 | page_offset <<= 1; /* 32->64 */ | ||
| 1166 | /* | ||
| 1167 | * A 32-bit pde maps 4MB while the shadow pdes map | ||
| 1168 | * only 2MB. So we need to double the offset again | ||
| 1169 | * and zap two pdes instead of one. | ||
| 1170 | */ | ||
| 1171 | if (level == PT32_ROOT_LEVEL) { | ||
| 1172 | page_offset &= ~7; /* kill rounding error */ | ||
| 1173 | page_offset <<= 1; | ||
| 1174 | npte = 2; | ||
| 1175 | } | ||
| 1176 | quadrant = page_offset >> PAGE_SHIFT; | ||
| 1177 | page_offset &= ~PAGE_MASK; | ||
| 1178 | if (quadrant != page->role.quadrant) | ||
| 1179 | continue; | ||
| 1180 | } | ||
| 1181 | spte = &page->spt[page_offset / sizeof(*spte)]; | ||
| 1182 | while (npte--) { | ||
| 1183 | mmu_pte_write_zap_pte(vcpu, page, spte); | ||
| 1184 | mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); | ||
| 1185 | ++spte; | ||
| 1186 | } | ||
| 1187 | } | ||
| 1188 | } | ||
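The misaligned test in kvm_mmu_pte_write() uses an XOR trick: (offset ^ (offset + bytes - 1)) & ~(pte_size - 1) masks off the within-slot bits, so the result is non-zero exactly when the first and last byte of the write land in different pte-sized slots. A standalone sketch (illustration only; pte_size of 8 assumes 64-bit guest ptes):

	#include <stdio.h>

	/* Non-zero iff [offset, offset + bytes) is not contained in one
	 * pte-sized, pte-aligned slot. */
	static unsigned misaligned(unsigned offset, unsigned bytes,
				   unsigned pte_size)
	{
		return (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
	}

	int main(void)
	{
		printf("%u\n", misaligned(0, 8, 8)); /* 0: one aligned pte */
		printf("%u\n", misaligned(4, 8, 8)); /* 8: straddles two ptes */
		return 0;
	}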
| 1189 | |||
| 1190 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | ||
| 1191 | { | ||
| 1192 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); | ||
| 1193 | |||
| 1194 | return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT); | ||
| 1195 | } | ||
| 1196 | |||
| 1197 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
| 1198 | { | ||
| 1199 | while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) { | ||
| 1200 | struct kvm_mmu_page *page; | ||
| 1201 | |||
| 1202 | page = container_of(vcpu->kvm->active_mmu_pages.prev, | ||
| 1203 | struct kvm_mmu_page, link); | ||
| 1204 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
| 1205 | } | ||
| 1206 | } | ||
| 1207 | |||
| 1208 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | ||
| 1209 | { | ||
| 1210 | struct kvm_mmu_page *page; | ||
| 1211 | |||
| 1212 | while (!list_empty(&vcpu->kvm->active_mmu_pages)) { | ||
| 1213 | page = container_of(vcpu->kvm->active_mmu_pages.next, | ||
| 1214 | struct kvm_mmu_page, link); | ||
| 1215 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
| 1216 | } | ||
| 1217 | free_page((unsigned long)vcpu->mmu.pae_root); | ||
| 1218 | } | ||
| 1219 | |||
| 1220 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | ||
| 1221 | { | ||
| 1222 | struct page *page; | ||
| 1223 | int i; | ||
| 1224 | |||
| 1225 | ASSERT(vcpu); | ||
| 1226 | |||
| 1227 | vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES; | ||
| 1228 | |||
| 1229 | /* | ||
| 1230 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. | ||
| 1231 | * Therefore we need to allocate shadow page tables in the first | ||
| 1232 | * 4GB of memory, which happens to fit the DMA32 zone. | ||
| 1233 | */ | ||
| 1234 | page = alloc_page(GFP_KERNEL | __GFP_DMA32); | ||
| 1235 | if (!page) | ||
| 1236 | goto error_1; | ||
| 1237 | vcpu->mmu.pae_root = page_address(page); | ||
| 1238 | for (i = 0; i < 4; ++i) | ||
| 1239 | vcpu->mmu.pae_root[i] = INVALID_PAGE; | ||
| 1240 | |||
| 1241 | return 0; | ||
| 1242 | |||
| 1243 | error_1: | ||
| 1244 | free_mmu_pages(vcpu); | ||
| 1245 | return -ENOMEM; | ||
| 1246 | } | ||
| 1247 | |||
| 1248 | int kvm_mmu_create(struct kvm_vcpu *vcpu) | ||
| 1249 | { | ||
| 1250 | ASSERT(vcpu); | ||
| 1251 | ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); | ||
| 1252 | |||
| 1253 | return alloc_mmu_pages(vcpu); | ||
| 1254 | } | ||
| 1255 | |||
| 1256 | int kvm_mmu_setup(struct kvm_vcpu *vcpu) | ||
| 1257 | { | ||
| 1258 | ASSERT(vcpu); | ||
| 1259 | ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); | ||
| 1260 | |||
| 1261 | return init_kvm_mmu(vcpu); | ||
| 1262 | } | ||
| 1263 | |||
| 1264 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | ||
| 1265 | { | ||
| 1266 | ASSERT(vcpu); | ||
| 1267 | |||
| 1268 | destroy_kvm_mmu(vcpu); | ||
| 1269 | free_mmu_pages(vcpu); | ||
| 1270 | mmu_free_memory_caches(vcpu); | ||
| 1271 | } | ||
| 1272 | |||
| 1273 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | ||
| 1274 | { | ||
| 1275 | struct kvm_mmu_page *page; | ||
| 1276 | |||
| 1277 | list_for_each_entry(page, &kvm->active_mmu_pages, link) { | ||
| 1278 | int i; | ||
| 1279 | u64 *pt; | ||
| 1280 | |||
| 1281 | if (!test_bit(slot, &page->slot_bitmap)) | ||
| 1282 | continue; | ||
| 1283 | |||
| 1284 | pt = page->spt; | ||
| 1285 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
| 1286 | /* avoid RMW: only clear ptes that are actually writable */ | ||
| 1287 | if (pt[i] & PT_WRITABLE_MASK) { | ||
| 1288 | rmap_remove(&pt[i]); | ||
| 1289 | pt[i] &= ~PT_WRITABLE_MASK; | ||
| 1290 | } | ||
| 1291 | } | ||
| 1292 | } | ||
| 1293 | |||
| 1294 | void kvm_mmu_zap_all(struct kvm *kvm) | ||
| 1295 | { | ||
| 1296 | struct kvm_mmu_page *page, *node; | ||
| 1297 | |||
| 1298 | list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link) | ||
| 1299 | kvm_mmu_zap_page(kvm, page); | ||
| 1300 | |||
| 1301 | kvm_flush_remote_tlbs(kvm); | ||
| 1302 | } | ||
| 1303 | |||
| 1304 | void kvm_mmu_module_exit(void) | ||
| 1305 | { | ||
| 1306 | if (pte_chain_cache) | ||
| 1307 | kmem_cache_destroy(pte_chain_cache); | ||
| 1308 | if (rmap_desc_cache) | ||
| 1309 | kmem_cache_destroy(rmap_desc_cache); | ||
| 1310 | if (mmu_page_header_cache) | ||
| 1311 | kmem_cache_destroy(mmu_page_header_cache); | ||
| 1312 | } | ||
| 1313 | |||
| 1314 | int kvm_mmu_module_init(void) | ||
| 1315 | { | ||
| 1316 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | ||
| 1317 | sizeof(struct kvm_pte_chain), | ||
| 1318 | 0, 0, NULL); | ||
| 1319 | if (!pte_chain_cache) | ||
| 1320 | goto nomem; | ||
| 1321 | rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", | ||
| 1322 | sizeof(struct kvm_rmap_desc), | ||
| 1323 | 0, 0, NULL); | ||
| 1324 | if (!rmap_desc_cache) | ||
| 1325 | goto nomem; | ||
| 1326 | |||
| 1327 | mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", | ||
| 1328 | sizeof(struct kvm_mmu_page), | ||
| 1329 | 0, 0, NULL); | ||
| 1330 | if (!mmu_page_header_cache) | ||
| 1331 | goto nomem; | ||
| 1332 | |||
| 1333 | return 0; | ||
| 1334 | |||
| 1335 | nomem: | ||
| 1336 | kvm_mmu_module_exit(); | ||
| 1337 | return -ENOMEM; | ||
| 1338 | } | ||
| 1339 | |||
| 1340 | #ifdef AUDIT | ||
| 1341 | |||
| 1342 | static const char *audit_msg; | ||
| 1343 | |||
| 1344 | static gva_t canonicalize(gva_t gva) | ||
| 1345 | { | ||
| 1346 | #ifdef CONFIG_X86_64 | ||
| 1347 | gva = (long long)(gva << 16) >> 16; | ||
| 1348 | #endif | ||
| 1349 | return gva; | ||
| 1350 | } | ||
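canonicalize() relies on x86-64 virtual addresses having 48 implemented bits: shifting left by 16 and then arithmetic-shifting right by 16 copies bit 47 into the upper 16 bits, yielding the canonical form. A worked example (illustration only):

	#include <stdio.h>

	/* Same operation as canonicalize() above. */
	static unsigned long long canon(unsigned long long gva)
	{
		return (unsigned long long)((long long)(gva << 16) >> 16);
	}

	int main(void)
	{
		/* bit 47 set: upper bits become all-ones */
		printf("%llx\n", canon(0x0000800000000000ULL)); /* ffff800000000000 */
		/* bit 47 clear: upper bits stay zero */
		printf("%llx\n", canon(0x00007fffffffffffULL)); /* 7fffffffffff */
		return 0;
	}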
| 1351 | |||
| 1352 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | ||
| 1353 | gva_t va, int level) | ||
| 1354 | { | ||
| 1355 | u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); | ||
| 1356 | int i; | ||
| 1357 | gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); | ||
| 1358 | |||
| 1359 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { | ||
| 1360 | u64 ent = pt[i]; | ||
| 1361 | |||
| 1362 | if (!(ent & PT_PRESENT_MASK)) | ||
| 1363 | continue; | ||
| 1364 | |||
| 1365 | va = canonicalize(va); | ||
| 1366 | if (level > 1) | ||
| 1367 | audit_mappings_page(vcpu, ent, va, level - 1); | ||
| 1368 | else { | ||
| 1369 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va); | ||
| 1370 | hpa_t hpa = gpa_to_hpa(vcpu, gpa); | ||
| 1371 | |||
| 1372 | if ((ent & PT_PRESENT_MASK) | ||
| 1373 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | ||
| 1374 | printk(KERN_ERR "audit error: (%s) levels %d" | ||
| 1375 | " gva %lx gpa %llx hpa %llx ent %llx\n", | ||
| 1376 | audit_msg, vcpu->mmu.root_level, | ||
| 1377 | va, gpa, hpa, ent); | ||
| 1378 | } | ||
| 1379 | } | ||
| 1380 | } | ||
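va_delta above is the guest-virtual span of one entry at the current level: with 512 entries per table, each level multiplies the span by 512, starting from a 4KB page. A quick check (illustration only):

	#include <stdio.h>

	#define PAGE_SHIFT 12

	int main(void)
	{
		for (int level = 1; level <= 4; ++level)
			printf("level %d entry spans 0x%llx bytes\n",
			       level, 1ULL << (PAGE_SHIFT + 9 * (level - 1)));
		/* 0x1000 (4KB), 0x200000 (2MB), 0x40000000 (1GB),
		 * 0x8000000000 (512GB) */
		return 0;
	}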
| 1381 | |||
| 1382 | static void audit_mappings(struct kvm_vcpu *vcpu) | ||
| 1383 | { | ||
| 1384 | unsigned i; | ||
| 1385 | |||
| 1386 | if (vcpu->mmu.root_level == 4) | ||
| 1387 | audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); | ||
| 1388 | else | ||
| 1389 | for (i = 0; i < 4; ++i) | ||
| 1390 | if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK) | ||
| 1391 | audit_mappings_page(vcpu, | ||
| 1392 | vcpu->mmu.pae_root[i], | ||
| 1393 | i << 30, | ||
| 1394 | 2); | ||
| 1395 | } | ||
| 1396 | |||
| 1397 | static int count_rmaps(struct kvm_vcpu *vcpu) | ||
| 1398 | { | ||
| 1399 | int nmaps = 0; | ||
| 1400 | int i, j, k; | ||
| 1401 | |||
| 1402 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
| 1403 | struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; | ||
| 1404 | struct kvm_rmap_desc *d; | ||
| 1405 | |||
| 1406 | for (j = 0; j < m->npages; ++j) { | ||
| 1407 | struct page *page = m->phys_mem[j]; | ||
| 1408 | |||
| 1409 | if (!page->private) | ||
| 1410 | continue; | ||
| 1411 | if (!(page->private & 1)) { | ||
| 1412 | ++nmaps; | ||
| 1413 | continue; | ||
| 1414 | } | ||
| 1415 | d = (struct kvm_rmap_desc *)(page->private & ~1ul); | ||
| 1416 | while (d) { | ||
| 1417 | for (k = 0; k < RMAP_EXT; ++k) | ||
| 1418 | if (d->shadow_ptes[k]) | ||
| 1419 | ++nmaps; | ||
| 1420 | else | ||
| 1421 | break; | ||
| 1422 | d = d->more; | ||
| 1423 | } | ||
| 1424 | } | ||
| 1425 | } | ||
| 1426 | return nmaps; | ||
| 1427 | } | ||
| 1428 | |||
| 1429 | static int count_writable_mappings(struct kvm_vcpu *vcpu) | ||
| 1430 | { | ||
| 1431 | int nmaps = 0; | ||
| 1432 | struct kvm_mmu_page *page; | ||
| 1433 | int i; | ||
| 1434 | |||
| 1435 | list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { | ||
| 1436 | u64 *pt = page->spt; | ||
| 1437 | |||
| 1438 | if (page->role.level != PT_PAGE_TABLE_LEVEL) | ||
| 1439 | continue; | ||
| 1440 | |||
| 1441 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
| 1442 | u64 ent = pt[i]; | ||
| 1443 | |||
| 1444 | if (!(ent & PT_PRESENT_MASK)) | ||
| 1445 | continue; | ||
| 1446 | if (!(ent & PT_WRITABLE_MASK)) | ||
| 1447 | continue; | ||
| 1448 | ++nmaps; | ||
| 1449 | } | ||
| 1450 | } | ||
| 1451 | return nmaps; | ||
| 1452 | } | ||
| 1453 | |||
| 1454 | static void audit_rmap(struct kvm_vcpu *vcpu) | ||
| 1455 | { | ||
| 1456 | int n_rmap = count_rmaps(vcpu); | ||
| 1457 | int n_actual = count_writable_mappings(vcpu); | ||
| 1458 | |||
| 1459 | if (n_rmap != n_actual) | ||
| 1460 | printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", | ||
| 1461 | __FUNCTION__, audit_msg, n_rmap, n_actual); | ||
| 1462 | } | ||
| 1463 | |||
| 1464 | static void audit_write_protection(struct kvm_vcpu *vcpu) | ||
| 1465 | { | ||
| 1466 | struct kvm_mmu_page *page; | ||
| 1467 | |||
| 1468 | list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { | ||
| 1469 | hfn_t hfn; | ||
| 1470 | struct page *pg; | ||
| 1471 | |||
| 1472 | if (page->role.metaphysical) | ||
| 1473 | continue; | ||
| 1474 | |||
| 1475 | hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT) | ||
| 1476 | >> PAGE_SHIFT; | ||
| 1477 | pg = pfn_to_page(hfn); | ||
| 1478 | if (pg->private) | ||
| 1479 | printk(KERN_ERR "%s: (%s) shadow page has writable" | ||
| 1480 | " mappings: gfn %lx role %x\n", | ||
| 1481 | __FUNCTION__, audit_msg, page->gfn, | ||
| 1482 | page->role.word); | ||
| 1483 | } | ||
| 1484 | } | ||
| 1485 | |||
| 1486 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) | ||
| 1487 | { | ||
| 1488 | int olddbg = dbg; | ||
| 1489 | |||
| 1490 | dbg = 0; | ||
| 1491 | audit_msg = msg; | ||
| 1492 | audit_rmap(vcpu); | ||
| 1493 | audit_write_protection(vcpu); | ||
| 1494 | audit_mappings(vcpu); | ||
| 1495 | dbg = olddbg; | ||
| 1496 | } | ||
| 1497 | |||
| 1498 | #endif | ||
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h deleted file mode 100644 index 6b094b44f8fb..000000000000 --- a/drivers/kvm/paging_tmpl.h +++ /dev/null | |||
| @@ -1,511 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Kernel-based Virtual Machine driver for Linux | ||
| 3 | * | ||
| 4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
| 5 | * machines without emulation or binary translation. | ||
| 6 | * | ||
| 7 | * MMU support | ||
| 8 | * | ||
| 9 | * Copyright (C) 2006 Qumranet, Inc. | ||
| 10 | * | ||
| 11 | * Authors: | ||
| 12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
| 13 | * Avi Kivity <avi@qumranet.com> | ||
| 14 | * | ||
| 15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
| 16 | * the COPYING file in the top-level directory. | ||
| 17 | * | ||
| 18 | */ | ||
| 19 | |||
| 20 | /* | ||
| 21 | * We need the mmu code to access both 32-bit and 64-bit guest ptes, | ||
| 22 | * so the code in this file is compiled twice, once per pte size. | ||
| 23 | */ | ||
| 24 | |||
| 25 | #if PTTYPE == 64 | ||
| 26 | #define pt_element_t u64 | ||
| 27 | #define guest_walker guest_walker64 | ||
| 28 | #define FNAME(name) paging##64_##name | ||
| 29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | ||
| 30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | ||
| 31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
| 32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
| 33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | ||
| 34 | #ifdef CONFIG_X86_64 | ||
| 35 | #define PT_MAX_FULL_LEVELS 4 | ||
| 36 | #else | ||
| 37 | #define PT_MAX_FULL_LEVELS 2 | ||
| 38 | #endif | ||
| 39 | #elif PTTYPE == 32 | ||
| 40 | #define pt_element_t u32 | ||
| 41 | #define guest_walker guest_walker32 | ||
| 42 | #define FNAME(name) paging##32_##name | ||
| 43 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | ||
| 44 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | ||
| 45 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | ||
| 46 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
| 47 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | ||
| 48 | #define PT_MAX_FULL_LEVELS 2 | ||
| 49 | #else | ||
| 50 | #error Invalid PTTYPE value | ||
| 51 | #endif | ||
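This preamble is what lets mmu.c compile the file twice: PTTYPE selects the pte width, and the token-pasting FNAME() macro gives every function a distinct paging64_/paging32_ prefix. A toy analogue of the same pattern, with hypothetical names (illustration only):

	/* min_tmpl.h: a toy analogue of paging_tmpl.h (hypothetical file) */
	#if PTTYPE == 64
	#define pt_element_t unsigned long long
	#define FNAME(name) demo64_##name
	#elif PTTYPE == 32
	#define pt_element_t unsigned int
	#define FNAME(name) demo32_##name
	#endif

	/* One definition, two instantiations with distinct names. */
	static int FNAME(is_present)(pt_element_t pte)
	{
		return pte & 1;
	}

	#undef pt_element_t
	#undef FNAME

	/* A user compiles it twice, exactly as mmu.c does above:
	 *   #define PTTYPE 64
	 *   #include "min_tmpl.h"    -> defines demo64_is_present()
	 *   #undef PTTYPE
	 *   #define PTTYPE 32
	 *   #include "min_tmpl.h"    -> defines demo32_is_present()
	 *   #undef PTTYPE
	 */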
| 52 | |||
| 53 | /* | ||
| 54 | * The guest_walker structure emulates the behavior of the hardware page | ||
| 55 | * table walker. | ||
| 56 | */ | ||
| 57 | struct guest_walker { | ||
| 58 | int level; | ||
| 59 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; | ||
| 60 | pt_element_t *table; | ||
| 61 | pt_element_t pte; | ||
| 62 | pt_element_t *ptep; | ||
| 63 | struct page *page; | ||
| 64 | int index; | ||
| 65 | pt_element_t inherited_ar; | ||
| 66 | gfn_t gfn; | ||
| 67 | u32 error_code; | ||
| 68 | }; | ||
| 69 | |||
| 70 | /* | ||
| 71 | * Fetch a guest pte for a guest virtual address | ||
| 72 | */ | ||
| 73 | static int FNAME(walk_addr)(struct guest_walker *walker, | ||
| 74 | struct kvm_vcpu *vcpu, gva_t addr, | ||
| 75 | int write_fault, int user_fault, int fetch_fault) | ||
| 76 | { | ||
| 77 | hpa_t hpa; | ||
| 78 | struct kvm_memory_slot *slot; | ||
| 79 | pt_element_t *ptep; | ||
| 80 | pt_element_t root; | ||
| 81 | gfn_t table_gfn; | ||
| 82 | |||
| 83 | pgprintk("%s: addr %lx\n", __FUNCTION__, addr); | ||
| 84 | walker->level = vcpu->mmu.root_level; | ||
| 85 | walker->table = NULL; | ||
| 86 | walker->page = NULL; | ||
| 87 | walker->ptep = NULL; | ||
| 88 | root = vcpu->cr3; | ||
| 89 | #if PTTYPE == 64 | ||
| 90 | if (!is_long_mode(vcpu)) { | ||
| 91 | walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3]; | ||
| 92 | root = *walker->ptep; | ||
| 93 | walker->pte = root; | ||
| 94 | if (!(root & PT_PRESENT_MASK)) | ||
| 95 | goto not_present; | ||
| 96 | --walker->level; | ||
| 97 | } | ||
| 98 | #endif | ||
| 99 | table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
| 100 | walker->table_gfn[walker->level - 1] = table_gfn; | ||
| 101 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
| 102 | walker->level - 1, table_gfn); | ||
| 103 | slot = gfn_to_memslot(vcpu->kvm, table_gfn); | ||
| 104 | hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); | ||
| 105 | walker->page = pfn_to_page(hpa >> PAGE_SHIFT); | ||
| 106 | walker->table = kmap_atomic(walker->page, KM_USER0); | ||
| 107 | |||
| 108 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || | ||
| 109 | (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); | ||
| 110 | |||
| 111 | walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; | ||
| 112 | |||
| 113 | for (;;) { | ||
| 114 | int index = PT_INDEX(addr, walker->level); | ||
| 115 | hpa_t paddr; | ||
| 116 | |||
| 117 | ptep = &walker->table[index]; | ||
| 118 | walker->index = index; | ||
| 119 | ASSERT(((unsigned long)walker->table & PAGE_MASK) == | ||
| 120 | ((unsigned long)ptep & PAGE_MASK)); | ||
| 121 | |||
| 122 | if (!is_present_pte(*ptep)) | ||
| 123 | goto not_present; | ||
| 124 | |||
| 125 | if (write_fault && !is_writeble_pte(*ptep)) | ||
| 126 | if (user_fault || is_write_protection(vcpu)) | ||
| 127 | goto access_error; | ||
| 128 | |||
| 129 | if (user_fault && !(*ptep & PT_USER_MASK)) | ||
| 130 | goto access_error; | ||
| 131 | |||
| 132 | #if PTTYPE == 64 | ||
| 133 | if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK)) | ||
| 134 | goto access_error; | ||
| 135 | #endif | ||
| 136 | |||
| 137 | if (!(*ptep & PT_ACCESSED_MASK)) { | ||
| 138 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
| 139 | *ptep |= PT_ACCESSED_MASK; | ||
| 140 | } | ||
| 141 | |||
| 142 | if (walker->level == PT_PAGE_TABLE_LEVEL) { | ||
| 143 | walker->gfn = (*ptep & PT_BASE_ADDR_MASK) | ||
| 144 | >> PAGE_SHIFT; | ||
| 145 | break; | ||
| 146 | } | ||
| 147 | |||
| 148 | if (walker->level == PT_DIRECTORY_LEVEL | ||
| 149 | && (*ptep & PT_PAGE_SIZE_MASK) | ||
| 150 | && (PTTYPE == 64 || is_pse(vcpu))) { | ||
| 151 | walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK) | ||
| 152 | >> PAGE_SHIFT; | ||
| 153 | walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); | ||
| 154 | break; | ||
| 155 | } | ||
| 156 | |||
| 157 | walker->inherited_ar &= walker->table[index]; | ||
| 158 | table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
| 159 | kunmap_atomic(walker->table, KM_USER0); | ||
| 160 | paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT); | ||
| 161 | walker->page = pfn_to_page(paddr >> PAGE_SHIFT); | ||
| 162 | walker->table = kmap_atomic(walker->page, KM_USER0); | ||
| 163 | --walker->level; | ||
| 164 | walker->table_gfn[walker->level - 1] = table_gfn; | ||
| 165 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
| 166 | walker->level - 1, table_gfn); | ||
| 167 | } | ||
| 168 | walker->pte = *ptep; | ||
| 169 | if (walker->page) | ||
| 170 | walker->ptep = NULL; | ||
| 171 | if (walker->table) | ||
| 172 | kunmap_atomic(walker->table, KM_USER0); | ||
| 173 | pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep); | ||
| 174 | return 1; | ||
| 175 | |||
| 176 | not_present: | ||
| 177 | walker->error_code = 0; | ||
| 178 | goto err; | ||
| 179 | |||
| 180 | access_error: | ||
| 181 | walker->error_code = PFERR_PRESENT_MASK; | ||
| 182 | |||
| 183 | err: | ||
| 184 | if (write_fault) | ||
| 185 | walker->error_code |= PFERR_WRITE_MASK; | ||
| 186 | if (user_fault) | ||
| 187 | walker->error_code |= PFERR_USER_MASK; | ||
| 188 | if (fetch_fault) | ||
| 189 | walker->error_code |= PFERR_FETCH_MASK; | ||
| 190 | if (walker->table) | ||
| 191 | kunmap_atomic(walker->table, KM_USER0); | ||
| 192 | return 0; | ||
| 193 | } | ||
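Each iteration of the walk loop above descends one table, and PT_INDEX() selects the entry for addr at the current level; for 64-bit ptes that is nine address bits per level above the 4KB page offset. A standalone sketch of the index math (illustration only):

	#include <stdio.h>

	#define PAGE_SHIFT 12

	/* 9 address bits select one of 512 entries at each level. */
	static unsigned pt_index(unsigned long long addr, int level)
	{
		return (addr >> (PAGE_SHIFT + 9 * (level - 1))) & 0x1ff;
	}

	int main(void)
	{
		unsigned long long addr = 0x00007f1234567000ULL;

		for (int level = 4; level >= 1; --level)
			printf("level %d index %u\n", level,
			       pt_index(addr, level));
		return 0;
	}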
| 194 | |||
| 195 | static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, | ||
| 196 | struct guest_walker *walker) | ||
| 197 | { | ||
| 198 | mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]); | ||
| 199 | } | ||
| 200 | |||
| 201 | static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, | ||
| 202 | u64 *shadow_pte, | ||
| 203 | gpa_t gaddr, | ||
| 204 | pt_element_t gpte, | ||
| 205 | u64 access_bits, | ||
| 206 | int user_fault, | ||
| 207 | int write_fault, | ||
| 208 | int *ptwrite, | ||
| 209 | struct guest_walker *walker, | ||
| 210 | gfn_t gfn) | ||
| 211 | { | ||
| 212 | hpa_t paddr; | ||
| 213 | int dirty = gpte & PT_DIRTY_MASK; | ||
| 214 | u64 spte = *shadow_pte; | ||
| 215 | int was_rmapped = is_rmap_pte(spte); | ||
| 216 | |||
| 217 | pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" | ||
| 218 | " user_fault %d gfn %lx\n", | ||
| 219 | __FUNCTION__, spte, (u64)gpte, access_bits, | ||
| 220 | write_fault, user_fault, gfn); | ||
| 221 | |||
| 222 | if (write_fault && !dirty) { | ||
| 223 | pt_element_t *guest_ent, *tmp = NULL; | ||
| 224 | |||
| 225 | if (walker->ptep) | ||
| 226 | guest_ent = walker->ptep; | ||
| 227 | else { | ||
| 228 | tmp = kmap_atomic(walker->page, KM_USER0); | ||
| 229 | guest_ent = &tmp[walker->index]; | ||
| 230 | } | ||
| 231 | |||
| 232 | *guest_ent |= PT_DIRTY_MASK; | ||
| 233 | if (!walker->ptep) | ||
| 234 | kunmap_atomic(tmp, KM_USER0); | ||
| 235 | dirty = 1; | ||
| 236 | FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); | ||
| 237 | } | ||
| 238 | |||
| 239 | spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; | ||
| 240 | spte |= gpte & PT64_NX_MASK; | ||
| 241 | if (!dirty) | ||
| 242 | access_bits &= ~PT_WRITABLE_MASK; | ||
| 243 | |||
| 244 | paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); | ||
| 245 | |||
| 246 | spte |= PT_PRESENT_MASK; | ||
| 247 | if (access_bits & PT_USER_MASK) | ||
| 248 | spte |= PT_USER_MASK; | ||
| 249 | |||
| 250 | if (is_error_hpa(paddr)) { | ||
| 251 | spte |= gaddr; | ||
| 252 | spte |= PT_SHADOW_IO_MARK; | ||
| 253 | spte &= ~PT_PRESENT_MASK; | ||
| 254 | set_shadow_pte(shadow_pte, spte); | ||
| 255 | return; | ||
| 256 | } | ||
| 257 | |||
| 258 | spte |= paddr; | ||
| 259 | |||
| 260 | if ((access_bits & PT_WRITABLE_MASK) | ||
| 261 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { | ||
| 262 | struct kvm_mmu_page *shadow; | ||
| 263 | |||
| 264 | spte |= PT_WRITABLE_MASK; | ||
| 265 | if (user_fault) { | ||
| 266 | mmu_unshadow(vcpu, gfn); | ||
| 267 | goto unshadowed; | ||
| 268 | } | ||
| 269 | |||
| 270 | shadow = kvm_mmu_lookup_page(vcpu, gfn); | ||
| 271 | if (shadow) { | ||
| 272 | pgprintk("%s: found shadow page for %lx, marking ro\n", | ||
| 273 | __FUNCTION__, gfn); | ||
| 274 | access_bits &= ~PT_WRITABLE_MASK; | ||
| 275 | if (is_writeble_pte(spte)) { | ||
| 276 | spte &= ~PT_WRITABLE_MASK; | ||
| 277 | kvm_x86_ops->tlb_flush(vcpu); | ||
| 278 | } | ||
| 279 | if (write_fault) | ||
| 280 | *ptwrite = 1; | ||
| 281 | } | ||
| 282 | } | ||
| 283 | |||
| 284 | unshadowed: | ||
| 285 | |||
| 286 | if (access_bits & PT_WRITABLE_MASK) | ||
| 287 | mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); | ||
| 288 | |||
| 289 | set_shadow_pte(shadow_pte, spte); | ||
| 290 | page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); | ||
| 291 | if (!was_rmapped) | ||
| 292 | rmap_add(vcpu, shadow_pte); | ||
| 293 | } | ||
| 294 | |||
| 295 | static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte, | ||
| 296 | u64 *shadow_pte, u64 access_bits, | ||
| 297 | int user_fault, int write_fault, int *ptwrite, | ||
| 298 | struct guest_walker *walker, gfn_t gfn) | ||
| 299 | { | ||
| 300 | access_bits &= gpte; | ||
| 301 | FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK, | ||
| 302 | gpte, access_bits, user_fault, write_fault, | ||
| 303 | ptwrite, walker, gfn); | ||
| 304 | } | ||
| 305 | |||
| 306 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | ||
| 307 | u64 *spte, const void *pte, int bytes) | ||
| 308 | { | ||
| 309 | pt_element_t gpte; | ||
| 310 | |||
| 311 | if (bytes < sizeof(pt_element_t)) | ||
| 312 | return; | ||
| 313 | gpte = *(const pt_element_t *)pte; | ||
| 314 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) | ||
| 315 | return; | ||
| 316 | pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); | ||
| 317 | FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, | ||
| 318 | 0, NULL, NULL, | ||
| 319 | (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); | ||
| 320 | } | ||
| 321 | |||
| 322 | static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde, | ||
| 323 | u64 *shadow_pte, u64 access_bits, | ||
| 324 | int user_fault, int write_fault, int *ptwrite, | ||
| 325 | struct guest_walker *walker, gfn_t gfn) | ||
| 326 | { | ||
| 327 | gpa_t gaddr; | ||
| 328 | |||
| 329 | access_bits &= gpde; | ||
| 330 | gaddr = (gpa_t)gfn << PAGE_SHIFT; | ||
| 331 | if (PTTYPE == 32 && is_cpuid_PSE36()) | ||
| 332 | gaddr |= (gpde & PT32_DIR_PSE36_MASK) << | ||
| 333 | (32 - PT32_DIR_PSE36_SHIFT); | ||
| 334 | FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, | ||
| 335 | gpde, access_bits, user_fault, write_fault, | ||
| 336 | ptwrite, walker, gfn); | ||
| 337 | } | ||
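The PSE36 case above recovers a frame above 4GB from a 32-bit large-page pde: architecturally, pde bits 16:13 supply physical address bits 35:32 of the 4MB page, hence the shift by 32 - PT32_DIR_PSE36_SHIFT. A worked example (illustration only; mask and shift values as defined in the KVM headers):

	#include <stdio.h>

	#define PT32_DIR_PSE36_SHIFT 13
	#define PT32_DIR_PSE36_MASK (0xfULL << PT32_DIR_PSE36_SHIFT)

	int main(void)
	{
		/* 4MB pde: frame bits 31:22, PSE36 field (bits 16:13) = 0x3 */
		unsigned long long gpde = 0x00400000ULL | (0x3ULL << 13);
		unsigned long long gaddr = gpde & 0xffc00000ULL;

		gaddr |= (gpde & PT32_DIR_PSE36_MASK)
			 << (32 - PT32_DIR_PSE36_SHIFT);
		printf("%llx\n", gaddr); /* 300400000: bits 33:32 = 0x3 */
		return 0;
	}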
| 338 | |||
| 339 | /* | ||
| 340 | * Fetch a shadow pte for a specific level in the paging hierarchy. | ||
| 341 | */ | ||
| 342 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | ||
| 343 | struct guest_walker *walker, | ||
| 344 | int user_fault, int write_fault, int *ptwrite) | ||
| 345 | { | ||
| 346 | hpa_t shadow_addr; | ||
| 347 | int level; | ||
| 348 | u64 *shadow_ent; | ||
| 349 | u64 *prev_shadow_ent = NULL; | ||
| 350 | |||
| 351 | if (!is_present_pte(walker->pte)) | ||
| 352 | return NULL; | ||
| 353 | |||
| 354 | shadow_addr = vcpu->mmu.root_hpa; | ||
| 355 | level = vcpu->mmu.shadow_root_level; | ||
| 356 | if (level == PT32E_ROOT_LEVEL) { | ||
| 357 | shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3]; | ||
| 358 | shadow_addr &= PT64_BASE_ADDR_MASK; | ||
| 359 | --level; | ||
| 360 | } | ||
| 361 | |||
| 362 | for (; ; level--) { | ||
| 363 | u32 index = SHADOW_PT_INDEX(addr, level); | ||
| 364 | struct kvm_mmu_page *shadow_page; | ||
| 365 | u64 shadow_pte; | ||
| 366 | int metaphysical; | ||
| 367 | gfn_t table_gfn; | ||
| 368 | unsigned hugepage_access = 0; | ||
| 369 | |||
| 370 | shadow_ent = ((u64 *)__va(shadow_addr)) + index; | ||
| 371 | if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { | ||
| 372 | if (level == PT_PAGE_TABLE_LEVEL) | ||
| 373 | break; | ||
| 374 | shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; | ||
| 375 | prev_shadow_ent = shadow_ent; | ||
| 376 | continue; | ||
| 377 | } | ||
| 378 | |||
| 379 | if (level == PT_PAGE_TABLE_LEVEL) | ||
| 380 | break; | ||
| 381 | |||
| 382 | if (level - 1 == PT_PAGE_TABLE_LEVEL | ||
| 383 | && walker->level == PT_DIRECTORY_LEVEL) { | ||
| 384 | metaphysical = 1; | ||
| 385 | hugepage_access = walker->pte; | ||
| 386 | hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK; | ||
| 387 | if (walker->pte & PT64_NX_MASK) | ||
| 388 | hugepage_access |= (1 << 2); | ||
| 389 | hugepage_access >>= PT_WRITABLE_SHIFT; | ||
| 390 | table_gfn = (walker->pte & PT_BASE_ADDR_MASK) | ||
| 391 | >> PAGE_SHIFT; | ||
| 392 | } else { | ||
| 393 | metaphysical = 0; | ||
| 394 | table_gfn = walker->table_gfn[level - 2]; | ||
| 395 | } | ||
| 396 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | ||
| 397 | metaphysical, hugepage_access, | ||
| 398 | shadow_ent); | ||
| 399 | shadow_addr = __pa(shadow_page->spt); | ||
| 400 | shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | ||
| 401 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
| 402 | *shadow_ent = shadow_pte; | ||
| 403 | prev_shadow_ent = shadow_ent; | ||
| 404 | } | ||
| 405 | |||
| 406 | if (walker->level == PT_DIRECTORY_LEVEL) { | ||
| 407 | FNAME(set_pde)(vcpu, walker->pte, shadow_ent, | ||
| 408 | walker->inherited_ar, user_fault, write_fault, | ||
| 409 | ptwrite, walker, walker->gfn); | ||
| 410 | } else { | ||
| 411 | ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); | ||
| 412 | FNAME(set_pte)(vcpu, walker->pte, shadow_ent, | ||
| 413 | walker->inherited_ar, user_fault, write_fault, | ||
| 414 | ptwrite, walker, walker->gfn); | ||
| 415 | } | ||
| 416 | return shadow_ent; | ||
| 417 | } | ||
| 418 | |||
| 419 | /* | ||
| 420 | * Page fault handler. There are several causes for a page fault: | ||
| 421 | * - there is no shadow pte for the guest pte | ||
| 422 | * - write access through a shadow pte marked read only so that we can set | ||
| 423 | * the dirty bit | ||
| 424 | * - write access to a shadow pte marked read only so we can update the page | ||
| 425 | * dirty bitmap, when userspace requests it | ||
| 426 | * - mmio access; in this case we will never install a present shadow pte | ||
| 427 | * - normal guest page fault due to the guest pte marked not present, not | ||
| 428 | * writable, or not executable | ||
| 429 | * | ||
| 430 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or | ||
| 431 | * a negative value on error. | ||
| 432 | */ | ||
| 433 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | ||
| 434 | u32 error_code) | ||
| 435 | { | ||
| 436 | int write_fault = error_code & PFERR_WRITE_MASK; | ||
| 437 | int user_fault = error_code & PFERR_USER_MASK; | ||
| 438 | int fetch_fault = error_code & PFERR_FETCH_MASK; | ||
| 439 | struct guest_walker walker; | ||
| 440 | u64 *shadow_pte; | ||
| 441 | int write_pt = 0; | ||
| 442 | int r; | ||
| 443 | |||
| 444 | pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); | ||
| 445 | kvm_mmu_audit(vcpu, "pre page fault"); | ||
| 446 | |||
| 447 | r = mmu_topup_memory_caches(vcpu); | ||
| 448 | if (r) | ||
| 449 | return r; | ||
| 450 | |||
| 451 | /* | ||
| 452 | * Look up the shadow pte for the faulting address. | ||
| 453 | */ | ||
| 454 | r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, | ||
| 455 | fetch_fault); | ||
| 456 | |||
| 457 | /* | ||
| 458 | * The page is not mapped by the guest. Let the guest handle it. | ||
| 459 | */ | ||
| 460 | if (!r) { | ||
| 461 | pgprintk("%s: guest page fault\n", __FUNCTION__); | ||
| 462 | inject_page_fault(vcpu, addr, walker.error_code); | ||
| 463 | vcpu->last_pt_write_count = 0; /* reset fork detector */ | ||
| 464 | return 0; | ||
| 465 | } | ||
| 466 | |||
| 467 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | ||
| 468 | &write_pt); | ||
| 469 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, | ||
| 470 | shadow_pte, *shadow_pte, write_pt); | ||
| 471 | |||
| 472 | if (!write_pt) | ||
| 473 | vcpu->last_pt_write_count = 0; /* reset fork detector */ | ||
| 474 | |||
| 475 | /* | ||
| 476 | * mmio: emulate if accessible, otherwise it's a guest fault. | ||
| 477 | */ | ||
| 478 | if (is_io_pte(*shadow_pte)) | ||
| 479 | return 1; | ||
| 480 | |||
| 481 | ++vcpu->stat.pf_fixed; | ||
| 482 | kvm_mmu_audit(vcpu, "post page fault (fixed)"); | ||
| 483 | |||
| 484 | return write_pt; | ||
| 485 | } | ||
| 486 | |||
| 487 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
| 488 | { | ||
| 489 | struct guest_walker walker; | ||
| 490 | gpa_t gpa = UNMAPPED_GVA; | ||
| 491 | int r; | ||
| 492 | |||
| 493 | r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); | ||
| 494 | |||
| 495 | if (r) { | ||
| 496 | gpa = (gpa_t)walker.gfn << PAGE_SHIFT; | ||
| 497 | gpa |= vaddr & ~PAGE_MASK; | ||
| 498 | } | ||
| 499 | |||
| 500 | return gpa; | ||
| 501 | } | ||
| 502 | |||
| 503 | #undef pt_element_t | ||
| 504 | #undef guest_walker | ||
| 505 | #undef FNAME | ||
| 506 | #undef PT_BASE_ADDR_MASK | ||
| 507 | #undef PT_INDEX | ||
| 508 | #undef SHADOW_PT_INDEX | ||
| 509 | #undef PT_LEVEL_MASK | ||
| 510 | #undef PT_DIR_BASE_ADDR_MASK | ||
| 511 | #undef PT_MAX_FULL_LEVELS | ||
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c deleted file mode 100644 index bd46de6bf891..000000000000 --- a/drivers/kvm/x86_emulate.c +++ /dev/null | |||
| @@ -1,1662 +0,0 @@ | |||
| 1 | /****************************************************************************** | ||
| 2 | * x86_emulate.c | ||
| 3 | * | ||
| 4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. | ||
| 5 | * | ||
| 6 | * Copyright (c) 2005 Keir Fraser | ||
| 7 | * | ||
| 8 | * Linux coding style, mod r/m decoder, segment base fixes, real-mode | ||
| 9 | * privileged instructions: | ||
| 10 | * | ||
| 11 | * Copyright (C) 2006 Qumranet | ||
| 12 | * | ||
| 13 | * Avi Kivity <avi@qumranet.com> | ||
| 14 | * Yaniv Kamay <yaniv@qumranet.com> | ||
| 15 | * | ||
| 16 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
| 17 | * the COPYING file in the top-level directory. | ||
| 18 | * | ||
| 19 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 | ||
| 20 | */ | ||
| 21 | |||
| 22 | #ifndef __KERNEL__ | ||
| 23 | #include <stdio.h> | ||
| 24 | #include <stdint.h> | ||
| 25 | #include <public/xen.h> | ||
| 26 | #define DPRINTF(_f, _a ...) printf( _f , ## _a ) | ||
| 27 | #else | ||
| 28 | #include "kvm.h" | ||
| 29 | #define DPRINTF(x...) do {} while (0) | ||
| 30 | #endif | ||
| 31 | #include "x86_emulate.h" | ||
| 32 | #include <linux/module.h> | ||
| 33 | |||
| 34 | /* | ||
| 35 | * Opcode effective-address decode tables. | ||
| 36 | * Note that we only emulate instructions that have at least one memory | ||
| 37 | * operand (excluding implicit stack references). We assume that stack | ||
| 38 | * references and instruction fetches will never occur in special memory | ||
| 39 | * areas that require emulation. So, for example, 'mov <imm>,<reg>' need | ||
| 40 | * not be handled. | ||
| 41 | */ | ||
| 42 | |||
| 43 | /* Operand sizes: 8-bit operands or specified/overridden size. */ | ||
| 44 | #define ByteOp (1<<0) /* 8-bit operands. */ | ||
| 45 | /* Destination operand type. */ | ||
| 46 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ | ||
| 47 | #define DstReg (2<<1) /* Register operand. */ | ||
| 48 | #define DstMem (3<<1) /* Memory operand. */ | ||
| 49 | #define DstMask (3<<1) | ||
| 50 | /* Source operand type. */ | ||
| 51 | #define SrcNone (0<<3) /* No source operand. */ | ||
| 52 | #define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ | ||
| 53 | #define SrcReg (1<<3) /* Register operand. */ | ||
| 54 | #define SrcMem (2<<3) /* Memory operand. */ | ||
| 55 | #define SrcMem16 (3<<3) /* Memory operand (16-bit). */ | ||
| 56 | #define SrcMem32 (4<<3) /* Memory operand (32-bit). */ | ||
| 57 | #define SrcImm (5<<3) /* Immediate operand. */ | ||
| 58 | #define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ | ||
| 59 | #define SrcMask (7<<3) | ||
| 60 | /* Generic ModRM decode. */ | ||
| 61 | #define ModRM (1<<6) | ||
| 62 | /* Destination is only written; never read. */ | ||
| 63 | #define Mov (1<<7) | ||
| 64 | #define BitOp (1<<8) | ||
| 65 | |||
| 66 | static u8 opcode_table[256] = { | ||
| 67 | /* 0x00 - 0x07 */ | ||
| 68 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 69 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 70 | 0, 0, 0, 0, | ||
| 71 | /* 0x08 - 0x0F */ | ||
| 72 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 73 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 74 | 0, 0, 0, 0, | ||
| 75 | /* 0x10 - 0x17 */ | ||
| 76 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 77 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 78 | 0, 0, 0, 0, | ||
| 79 | /* 0x18 - 0x1F */ | ||
| 80 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 81 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 82 | 0, 0, 0, 0, | ||
| 83 | /* 0x20 - 0x27 */ | ||
| 84 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 85 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 86 | SrcImmByte, SrcImm, 0, 0, | ||
| 87 | /* 0x28 - 0x2F */ | ||
| 88 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 89 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 90 | 0, 0, 0, 0, | ||
| 91 | /* 0x30 - 0x37 */ | ||
| 92 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 93 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 94 | 0, 0, 0, 0, | ||
| 95 | /* 0x38 - 0x3F */ | ||
| 96 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 97 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
| 98 | 0, 0, 0, 0, | ||
| 99 | /* 0x40 - 0x4F */ | ||
| 100 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 101 | /* 0x50 - 0x57 */ | ||
| 102 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 103 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 104 | /* 0x58 - 0x5F */ | ||
| 105 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 106 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 107 | /* 0x60 - 0x67 */ | ||
| 108 | 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , | ||
| 109 | 0, 0, 0, 0, | ||
| 110 | /* 0x68 - 0x6F */ | ||
| 111 | 0, 0, ImplicitOps|Mov, 0, | ||
| 112 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ | ||
| 113 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ | ||
| 114 | /* 0x70 - 0x77 */ | ||
| 115 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 116 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 117 | /* 0x78 - 0x7F */ | ||
| 118 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 119 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 120 | /* 0x80 - 0x87 */ | ||
| 121 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
| 122 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
| 123 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 124 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
| 125 | /* 0x88 - 0x8F */ | ||
| 126 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, | ||
| 127 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 128 | 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov, | ||
| 129 | /* 0x90 - 0x9F */ | ||
| 130 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0, | ||
| 131 | /* 0xA0 - 0xA7 */ | ||
| 132 | ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov, | ||
| 133 | ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov, | ||
| 134 | ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, | ||
| 135 | ByteOp | ImplicitOps, ImplicitOps, | ||
| 136 | /* 0xA8 - 0xAF */ | ||
| 137 | 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, | ||
| 138 | ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, | ||
| 139 | ByteOp | ImplicitOps, ImplicitOps, | ||
| 140 | /* 0xB0 - 0xBF */ | ||
| 141 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 142 | /* 0xC0 - 0xC7 */ | ||
| 143 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
| 144 | 0, ImplicitOps, 0, 0, | ||
| 145 | ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, | ||
| 146 | /* 0xC8 - 0xCF */ | ||
| 147 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 148 | /* 0xD0 - 0xD7 */ | ||
| 149 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
| 150 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
| 151 | 0, 0, 0, 0, | ||
| 152 | /* 0xD8 - 0xDF */ | ||
| 153 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 154 | /* 0xE0 - 0xE7 */ | ||
| 155 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 156 | /* 0xE8 - 0xEF */ | ||
| 157 | ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0, | ||
| 158 | /* 0xF0 - 0xF7 */ | ||
| 159 | 0, 0, 0, 0, | ||
| 160 | ImplicitOps, 0, | ||
| 161 | ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, | ||
| 162 | /* 0xF8 - 0xFF */ | ||
| 163 | 0, 0, 0, 0, | ||
| 164 | 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM | ||
| 165 | }; | ||
| 166 | |||
| 167 | static u16 twobyte_table[256] = { | ||
| 168 | /* 0x00 - 0x0F */ | ||
| 169 | 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, | ||
| 170 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, | ||
| 171 | /* 0x10 - 0x1F */ | ||
| 172 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, | ||
| 173 | /* 0x20 - 0x2F */ | ||
| 174 | ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, | ||
| 175 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 176 | /* 0x30 - 0x3F */ | ||
| 177 | ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 178 | /* 0x40 - 0x47 */ | ||
| 179 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 180 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 181 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 182 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 183 | /* 0x48 - 0x4F */ | ||
| 184 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 185 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 186 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 187 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
| 188 | /* 0x50 - 0x5F */ | ||
| 189 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 190 | /* 0x60 - 0x6F */ | ||
| 191 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 192 | /* 0x70 - 0x7F */ | ||
| 193 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 194 | /* 0x80 - 0x8F */ | ||
| 195 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 196 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 197 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 198 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
| 199 | /* 0x90 - 0x9F */ | ||
| 200 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 201 | /* 0xA0 - 0xA7 */ | ||
| 202 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
| 203 | /* 0xA8 - 0xAF */ | ||
| 204 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
| 205 | /* 0xB0 - 0xB7 */ | ||
| 206 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, | ||
| 207 | DstMem | SrcReg | ModRM | BitOp, | ||
| 208 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
| 209 | DstReg | SrcMem16 | ModRM | Mov, | ||
| 210 | /* 0xB8 - 0xBF */ | ||
| 211 | 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, | ||
| 212 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
| 213 | DstReg | SrcMem16 | ModRM | Mov, | ||
| 214 | /* 0xC0 - 0xCF */ | ||
| 215 | 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, | ||
| 216 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 217 | /* 0xD0 - 0xDF */ | ||
| 218 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 219 | /* 0xE0 - 0xEF */ | ||
| 220 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 221 | /* 0xF0 - 0xFF */ | ||
| 222 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
| 223 | }; | ||
| 224 | |||
| 225 | /* Type, address-of, and value of an instruction's operand. */ | ||
| 226 | struct operand { | ||
| 227 | enum { OP_REG, OP_MEM, OP_IMM } type; | ||
| 228 | unsigned int bytes; | ||
| 229 | unsigned long val, orig_val, *ptr; | ||
| 230 | }; | ||
| 231 | |||
| 232 | /* EFLAGS bit definitions. */ | ||
| 233 | #define EFLG_OF (1<<11) | ||
| 234 | #define EFLG_DF (1<<10) | ||
| 235 | #define EFLG_SF (1<<7) | ||
| 236 | #define EFLG_ZF (1<<6) | ||
| 237 | #define EFLG_AF (1<<4) | ||
| 238 | #define EFLG_PF (1<<2) | ||
| 239 | #define EFLG_CF (1<<0) | ||
| 240 | |||
| 241 | /* | ||
| 242 | * Instruction emulation: | ||
| 243 | * Most instructions are emulated directly via a fragment of inline assembly | ||
| 244 | * code. This allows us to save/restore EFLAGS and thus very easily pick up | ||
| 245 | * any modified flags. | ||
| 246 | */ | ||
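The _PRE_EFLAGS/_POST_EFLAGS machinery below loads the guest's saved flags before the operation and harvests the result afterwards. Stripped of the masking, the core mechanism is the instruction bracketed by pushf/pop; a minimal sketch in GCC inline asm (illustration only, not the emulator's macros; like the kernel, it assumes push/pop in inline asm is safe, i.e. no red zone):

	/* Add two values and return the resulting EFLAGS (the caller
	 * would mask with EFLAGS_MASK for just the arithmetic bits). */
	static unsigned long add_and_get_flags(unsigned long a,
					       unsigned long b)
	{
		unsigned long flags;

		__asm__ __volatile__(
			"add %2, %0\n\t"	/* the emulated operation */
			"pushf\n\t"		/* save resulting flags */
			"pop %1\n\t"
			: "+r" (a), "=r" (flags)
			: "r" (b)
			: "cc");
		return flags;
	}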
| 247 | |||
| 248 | #if defined(CONFIG_X86_64) | ||
| 249 | #define _LO32 "k" /* force 32-bit operand */ | ||
| 250 | #define _STK "%%rsp" /* stack pointer */ | ||
| 251 | #elif defined(__i386__) | ||
| 252 | #define _LO32 "" /* force 32-bit operand */ | ||
| 253 | #define _STK "%%esp" /* stack pointer */ | ||
| 254 | #endif | ||
| 255 | |||
| 256 | /* | ||
| 257 | * These EFLAGS bits are restored from saved value during emulation, and | ||
| 258 | * any changes are written back to the saved value after emulation. | ||
| 259 | */ | ||
| 260 | #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) | ||
| 261 | |||
| 262 | /* Before executing instruction: restore necessary bits in EFLAGS. */ | ||
| 263 | #define _PRE_EFLAGS(_sav, _msk, _tmp) \ | ||
| 264 | /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \ | ||
| 265 | "push %"_sav"; " \ | ||
| 266 | "movl %"_msk",%"_LO32 _tmp"; " \ | ||
| 267 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
| 268 | "pushf; " \ | ||
| 269 | "notl %"_LO32 _tmp"; " \ | ||
| 270 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
| 271 | "pop %"_tmp"; " \ | ||
| 272 | "orl %"_LO32 _tmp",("_STK"); " \ | ||
| 273 | "popf; " \ | ||
| 274 | /* _sav &= ~msk; */ \ | ||
| 275 | "movl %"_msk",%"_LO32 _tmp"; " \ | ||
| 276 | "notl %"_LO32 _tmp"; " \ | ||
| 277 | "andl %"_LO32 _tmp",%"_sav"; " | ||
| 278 | |||
| 279 | /* After executing instruction: write-back necessary bits in EFLAGS. */ | ||
| 280 | #define _POST_EFLAGS(_sav, _msk, _tmp) \ | ||
| 281 | /* _sav |= EFLAGS & _msk; */ \ | ||
| 282 | "pushf; " \ | ||
| 283 | "pop %"_tmp"; " \ | ||
| 284 | "andl %"_msk",%"_LO32 _tmp"; " \ | ||
| 285 | "orl %"_LO32 _tmp",%"_sav"; " | ||
| 286 | |||
| 287 | /* Raw emulation: instruction has two explicit operands. */ | ||
| 288 | #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
| 289 | do { \ | ||
| 290 | unsigned long _tmp; \ | ||
| 291 | \ | ||
| 292 | switch ((_dst).bytes) { \ | ||
| 293 | case 2: \ | ||
| 294 | __asm__ __volatile__ ( \ | ||
| 295 | _PRE_EFLAGS("0","4","2") \ | ||
| 296 | _op"w %"_wx"3,%1; " \ | ||
| 297 | _POST_EFLAGS("0","4","2") \ | ||
| 298 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
| 299 | "=&r" (_tmp) \ | ||
| 300 | : _wy ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
| 301 | break; \ | ||
| 302 | case 4: \ | ||
| 303 | __asm__ __volatile__ ( \ | ||
| 304 | _PRE_EFLAGS("0","4","2") \ | ||
| 305 | _op"l %"_lx"3,%1; " \ | ||
| 306 | _POST_EFLAGS("0","4","2") \ | ||
| 307 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
| 308 | "=&r" (_tmp) \ | ||
| 309 | : _ly ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
| 310 | break; \ | ||
| 311 | case 8: \ | ||
| 312 | __emulate_2op_8byte(_op, _src, _dst, \ | ||
| 313 | _eflags, _qx, _qy); \ | ||
| 314 | break; \ | ||
| 315 | } \ | ||
| 316 | } while (0) | ||
| 317 | |||
| 318 | #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
| 319 | do { \ | ||
| 320 | unsigned long _tmp; \ | ||
| 321 | switch ( (_dst).bytes ) \ | ||
| 322 | { \ | ||
| 323 | case 1: \ | ||
| 324 | __asm__ __volatile__ ( \ | ||
| 325 | _PRE_EFLAGS("0","4","2") \ | ||
| 326 | _op"b %"_bx"3,%1; " \ | ||
| 327 | _POST_EFLAGS("0","4","2") \ | ||
| 328 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
| 329 | "=&r" (_tmp) \ | ||
| 330 | : _by ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
| 331 | break; \ | ||
| 332 | default: \ | ||
| 333 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
| 334 | _wx, _wy, _lx, _ly, _qx, _qy); \ | ||
| 335 | break; \ | ||
| 336 | } \ | ||
| 337 | } while (0) | ||
| 338 | |||
| 339 | /* Source operand is byte-sized and may be restricted to just %cl. */ | ||
| 340 | #define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ | ||
| 341 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
| 342 | "b", "c", "b", "c", "b", "c", "b", "c") | ||
| 343 | |||
| 344 | /* Source operand is byte, word, long or quad sized. */ | ||
| 345 | #define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ | ||
| 346 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
| 347 | "b", "q", "w", "r", _LO32, "r", "", "r") | ||
| 348 | |||
| 349 | /* Source operand is word, long or quad sized. */ | ||
| 350 | #define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ | ||
| 351 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
| 352 | "w", "r", _LO32, "r", "", "r") | ||
| 353 | |||
| 354 | /* Instruction has only one explicit operand (no source operand). */ | ||
| 355 | #define emulate_1op(_op, _dst, _eflags) \ | ||
| 356 | do { \ | ||
| 357 | unsigned long _tmp; \ | ||
| 358 | \ | ||
| 359 | switch ( (_dst).bytes ) \ | ||
| 360 | { \ | ||
| 361 | case 1: \ | ||
| 362 | __asm__ __volatile__ ( \ | ||
| 363 | _PRE_EFLAGS("0","3","2") \ | ||
| 364 | _op"b %1; " \ | ||
| 365 | _POST_EFLAGS("0","3","2") \ | ||
| 366 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
| 367 | "=&r" (_tmp) \ | ||
| 368 | : "i" (EFLAGS_MASK) ); \ | ||
| 369 | break; \ | ||
| 370 | case 2: \ | ||
| 371 | __asm__ __volatile__ ( \ | ||
| 372 | _PRE_EFLAGS("0","3","2") \ | ||
| 373 | _op"w %1; " \ | ||
| 374 | _POST_EFLAGS("0","3","2") \ | ||
| 375 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
| 376 | "=&r" (_tmp) \ | ||
| 377 | : "i" (EFLAGS_MASK) ); \ | ||
| 378 | break; \ | ||
| 379 | case 4: \ | ||
| 380 | __asm__ __volatile__ ( \ | ||
| 381 | _PRE_EFLAGS("0","3","2") \ | ||
| 382 | _op"l %1; " \ | ||
| 383 | _POST_EFLAGS("0","3","2") \ | ||
| 384 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
| 385 | "=&r" (_tmp) \ | ||
| 386 | : "i" (EFLAGS_MASK) ); \ | ||
| 387 | break; \ | ||
| 388 | case 8: \ | ||
| 389 | __emulate_1op_8byte(_op, _dst, _eflags); \ | ||
| 390 | break; \ | ||
| 391 | } \ | ||
| 392 | } while (0) | ||
| 393 | |||
| 394 | /* Emulate an instruction with quadword operands (x86/64 only). */ | ||
| 395 | #if defined(CONFIG_X86_64) | ||
| 396 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ | ||
| 397 | do { \ | ||
| 398 | __asm__ __volatile__ ( \ | ||
| 399 | _PRE_EFLAGS("0","4","2") \ | ||
| 400 | _op"q %"_qx"3,%1; " \ | ||
| 401 | _POST_EFLAGS("0","4","2") \ | ||
| 402 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
| 403 | : _qy ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
| 404 | } while (0) | ||
| 405 | |||
| 406 | #define __emulate_1op_8byte(_op, _dst, _eflags) \ | ||
| 407 | do { \ | ||
| 408 | __asm__ __volatile__ ( \ | ||
| 409 | _PRE_EFLAGS("0","3","2") \ | ||
| 410 | _op"q %1; " \ | ||
| 411 | _POST_EFLAGS("0","3","2") \ | ||
| 412 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
| 413 | : "i" (EFLAGS_MASK) ); \ | ||
| 414 | } while (0) | ||
| 415 | |||
| 416 | #elif defined(__i386__) | ||
| 417 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) | ||
| 418 | #define __emulate_1op_8byte(_op, _dst, _eflags) | ||
| 419 | #endif /* __i386__ */ | ||
| 420 | |||
| 421 | /* Fetch next part of the instruction being emulated. */ | ||
| 422 | #define insn_fetch(_type, _size, _eip) \ | ||
| 423 | ({ unsigned long _x; \ | ||
| 424 | rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \ | ||
| 425 | (_size), ctxt->vcpu); \ | ||
| 426 | if ( rc != 0 ) \ | ||
| 427 | goto done; \ | ||
| 428 | (_eip) += (_size); \ | ||
| 429 | (_type)_x; \ | ||
| 430 | }) | ||
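
insn_fetch() leans on GCC's statement-expression extension: the ({ ... }) block both performs the guarded read (escaping through the enclosing function's done label on failure) and yields the fetched bytes cast to the requested type. A minimal standalone sketch of the pattern, with a hypothetical buffer-backed read_bytes() standing in for ops->read_std (little-endian host assumed, as for the emulator itself):

    /* Hedged sketch of the statement-expression fetch pattern; stream[]
     * and read_bytes() are illustrative, not part of any KVM interface. */
    #include <stdio.h>
    #include <string.h>

    static unsigned char stream[] = { 0x66, 0x89, 0xd8 };  /* mov ax,bx */
    static int rc;

    static int read_bytes(unsigned long ip, void *dst, unsigned int size)
    {
            if (ip + size > sizeof(stream))
                    return -1;      /* off the end: fail like read_std */
            memcpy(dst, stream + ip, size); /* low bytes of *dst, LE host */
            return 0;
    }

    #define fetch(_type, _size, _ip)                          \
    ({      unsigned long _x = 0;                             \
            rc = read_bytes((_ip), &_x, (_size));             \
            if (rc != 0)                                      \
                    goto done;                                \
            (_ip) += (_size);                                 \
            (_type)_x;                                        \
    })

    int main(void)
    {
            unsigned long ip = 0;
            unsigned char b;

            b = fetch(unsigned char, 1, ip);  /* 0x66, ip advances to 1 */
            printf("first byte %#x, ip now %lu\n", b, ip);
    done:
            return rc;
    }
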
| 431 | |||
| 432 | /* Access/update address held in a register, based on addressing mode. */ | ||
| 433 | #define address_mask(reg) \ | ||
| 434 | ((ad_bytes == sizeof(unsigned long)) ? \ | ||
| 435 | (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1))) | ||
| 436 | #define register_address(base, reg) \ | ||
| 437 | ((base) + address_mask(reg)) | ||
| 438 | #define register_address_increment(reg, inc) \ | ||
| 439 | do { \ | ||
| 440 | /* signed type ensures sign extension to long */ \ | ||
| 441 | int _inc = (inc); \ | ||
| 442 | if ( ad_bytes == sizeof(unsigned long) ) \ | ||
| 443 | (reg) += _inc; \ | ||
| 444 | else \ | ||
| 445 | (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \ | ||
| 446 | (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \ | ||
| 447 | } while (0) | ||
| 448 | |||
| 449 | #define JMP_REL(rel) \ | ||
| 450 | do { \ | ||
| 451 | register_address_increment(_eip, rel); \ | ||
| 452 | } while (0) | ||
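
The point of the masking above is to make 16- and 32-bit address arithmetic wrap correctly inside a wider host register: only the low ad_bytes*8 bits change, the rest are preserved. A standalone sketch of the same arithmetic (semantics as read from the macro; values illustrative):

    /* Standalone sketch of the masked increment; the native-width guard
     * also dodges the undefined full-width shift, as in the macro. */
    #include <stdio.h>

    static unsigned long masked_inc(unsigned long reg, int inc, int ad_bytes)
    {
            unsigned long mask;

            if (ad_bytes == (int)sizeof(unsigned long))
                    return reg + inc;       /* native width: plain add */
            mask = (1UL << (ad_bytes << 3)) - 1;
            return (reg & ~mask) | ((reg + inc) & mask);
    }

    int main(void)
    {
            /* 16-bit addressing: the low word wraps, upper bits survive */
            printf("%#lx\n", masked_inc(0x1234ffffUL, 1, 2)); /* 0x12340000 */
            /* 32-bit addressing on a 64-bit host: low dword wraps to 3 */
            printf("%#lx\n", masked_inc(0xffffffffUL, 4, 4)); /* 0x3 */
            return 0;
    }
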
| 453 | |||
| 454 | /* | ||
| 455 | * Given the 'reg' portion of a ModRM byte, and a register block, return a | ||
| 456 | * pointer into the block that addresses the relevant register. | ||
| 457 | * @highbyte_regs specifies whether to decode AH,CH,DH,BH. | ||
| 458 | */ | ||
| 459 | static void *decode_register(u8 modrm_reg, unsigned long *regs, | ||
| 460 | int highbyte_regs) | ||
| 461 | { | ||
| 462 | void *p; | ||
| 463 | |||
| 464 | p = ®s[modrm_reg]; | ||
| 465 | if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) | ||
| 466 | p = (unsigned char *)®s[modrm_reg & 3] + 1; | ||
| 467 | return p; | ||
| 468 | } | ||
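
The subtle case is the legacy high-byte encoding: without a REX prefix, reg values 4-7 select AH, CH, DH and BH, i.e. byte offset 1 of the first four register slots on a little-endian host. A quick sketch of that aliasing:

    /* Sketch of the high-byte aliasing in decode_register(); little-endian
     * host assumed (as on x86): AH is byte 1 of the RAX slot. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long regs[8] = { 0x11223344 };  /* regs[0] plays RAX */
            int modrm_reg = 4;                       /* encoding 4 = AH */
            unsigned char *p;

            /* highbyte_regs: encodings 4..7 alias byte 1 of regs 0..3 */
            p = (unsigned char *)&regs[modrm_reg & 3] + 1;
            printf("AH = %#x\n", *p);                /* prints 0x33 */
            return 0;
    }
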
| 469 | |||
| 470 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, | ||
| 471 | struct x86_emulate_ops *ops, | ||
| 472 | void *ptr, | ||
| 473 | u16 *size, unsigned long *address, int op_bytes) | ||
| 474 | { | ||
| 475 | int rc; | ||
| 476 | |||
| 477 | if (op_bytes == 2) | ||
| 478 | op_bytes = 3; | ||
| 479 | *address = 0; | ||
| 480 | rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, | ||
| 481 | ctxt->vcpu); | ||
| 482 | if (rc) | ||
| 483 | return rc; | ||
| 484 | rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, | ||
| 485 | ctxt->vcpu); | ||
| 486 | return rc; | ||
| 487 | } | ||
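
The op_bytes bump from 2 to 3 mirrors an architectural quirk: with a 16-bit operand size, lgdt/lidt load only 24 bits of the base address. A hedged little-endian sketch of the resulting split read (buffer contents illustrative):

    /* mem[] is an illustrative 6-byte descriptor image:
     * limit 0x00ff, base stored as 0xaabbccdd. */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            unsigned char mem[6] = { 0xff, 0x00, 0xdd, 0xcc, 0xbb, 0xaa };
            int op_bytes = 2;               /* 16-bit operand size */
            unsigned short size;
            unsigned long address = 0;

            if (op_bytes == 2)
                    op_bytes = 3;           /* only 24 bits of base used */
            memcpy(&size, mem, 2);
            memcpy(&address, mem + 2, op_bytes);
            printf("limit %#x base %#lx\n", size, address); /* 0xff 0xbbccdd */
            return 0;
    }
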
| 488 | |||
| 489 | static int test_cc(unsigned int condition, unsigned int flags) | ||
| 490 | { | ||
| 491 | int rc = 0; | ||
| 492 | |||
| 493 | switch ((condition & 15) >> 1) { | ||
| 494 | case 0: /* o */ | ||
| 495 | rc |= (flags & EFLG_OF); | ||
| 496 | break; | ||
| 497 | case 1: /* b/c/nae */ | ||
| 498 | rc |= (flags & EFLG_CF); | ||
| 499 | break; | ||
| 500 | case 2: /* z/e */ | ||
| 501 | rc |= (flags & EFLG_ZF); | ||
| 502 | break; | ||
| 503 | case 3: /* be/na */ | ||
| 504 | rc |= (flags & (EFLG_CF|EFLG_ZF)); | ||
| 505 | break; | ||
| 506 | case 4: /* s */ | ||
| 507 | rc |= (flags & EFLG_SF); | ||
| 508 | break; | ||
| 509 | case 5: /* p/pe */ | ||
| 510 | rc |= (flags & EFLG_PF); | ||
| 511 | break; | ||
| 512 | case 7: /* le/ng */ | ||
| 513 | rc |= (flags & EFLG_ZF); | ||
| 514 | /* fall through */ | ||
| 515 | case 6: /* l/nge */ | ||
| 516 | rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); | ||
| 517 | break; | ||
| 518 | } | ||
| 519 | |||
| 520 | /* Odd condition identifiers (lsb == 1) have inverted sense. */ | ||
| 521 | return (!!rc ^ (condition & 1)); | ||
| 522 | } | ||
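
The encoding trick here is that the low bit of the condition code inverts the sense, so eight flag tests cover all sixteen conditions. A trimmed-down sketch checking that convention for jz/jnz (only two cases kept; the EFLG_* values mirror the hardware flag bit positions):

    /* Trimmed sketch of test_cc()'s invert-on-odd convention. */
    #include <assert.h>

    #define EFLG_CF (1 << 0)
    #define EFLG_ZF (1 << 6)

    static int test_cc(unsigned int condition, unsigned int flags)
    {
            int rc = 0;

            switch ((condition & 15) >> 1) {
            case 1: rc = flags & EFLG_CF; break;    /* b/c/nae */
            case 2: rc = flags & EFLG_ZF; break;    /* z/e */
            /* ... the remaining six tests elided ... */
            }
            return !!rc ^ (condition & 1);  /* odd encodings invert */
    }

    int main(void)
    {
            assert(test_cc(0x4, EFLG_ZF));   /* jz taken when ZF set */
            assert(test_cc(0x5, 0));         /* jnz taken when ZF clear */
            assert(!test_cc(0x5, EFLG_ZF));  /* jnz not taken otherwise */
            return 0;
    }
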
| 523 | |||
| 524 | int | ||
| 525 | x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
| 526 | { | ||
| 527 | unsigned d; | ||
| 528 | u8 b, sib, twobyte = 0, rex_prefix = 0; | ||
| 529 | u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0; | ||
| 530 | unsigned long *override_base = NULL; | ||
| 531 | unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i; | ||
| 532 | int rc = 0; | ||
| 533 | struct operand src, dst; | ||
| 534 | unsigned long cr2 = ctxt->cr2; | ||
| 535 | int mode = ctxt->mode; | ||
| 536 | unsigned long modrm_ea; | ||
| 537 | int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0; | ||
| 538 | int no_wb = 0; | ||
| 539 | u64 msr_data; | ||
| 540 | |||
| 541 | /* Shadow copy of register state. Committed on successful emulation. */ | ||
| 542 | unsigned long _regs[NR_VCPU_REGS]; | ||
| 543 | unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags; | ||
| 544 | unsigned long modrm_val = 0; | ||
| 545 | |||
| 546 | memcpy(_regs, ctxt->vcpu->regs, sizeof _regs); | ||
| 547 | |||
| 548 | switch (mode) { | ||
| 549 | case X86EMUL_MODE_REAL: | ||
| 550 | case X86EMUL_MODE_PROT16: | ||
| 551 | op_bytes = ad_bytes = 2; | ||
| 552 | break; | ||
| 553 | case X86EMUL_MODE_PROT32: | ||
| 554 | op_bytes = ad_bytes = 4; | ||
| 555 | break; | ||
| 556 | #ifdef CONFIG_X86_64 | ||
| 557 | case X86EMUL_MODE_PROT64: | ||
| 558 | op_bytes = 4; | ||
| 559 | ad_bytes = 8; | ||
| 560 | break; | ||
| 561 | #endif | ||
| 562 | default: | ||
| 563 | return -1; | ||
| 564 | } | ||
| 565 | |||
| 566 | /* Legacy prefixes. */ | ||
| 567 | for (i = 0; i < 8; i++) { | ||
| 568 | switch (b = insn_fetch(u8, 1, _eip)) { | ||
| 569 | case 0x66: /* operand-size override */ | ||
| 570 | op_bytes ^= 6; /* switch between 2/4 bytes */ | ||
| 571 | break; | ||
| 572 | case 0x67: /* address-size override */ | ||
| 573 | if (mode == X86EMUL_MODE_PROT64) | ||
| 574 | ad_bytes ^= 12; /* switch between 4/8 bytes */ | ||
| 575 | else | ||
| 576 | ad_bytes ^= 6; /* switch between 2/4 bytes */ | ||
| 577 | break; | ||
| 578 | case 0x2e: /* CS override */ | ||
| 579 | override_base = &ctxt->cs_base; | ||
| 580 | break; | ||
| 581 | case 0x3e: /* DS override */ | ||
| 582 | override_base = &ctxt->ds_base; | ||
| 583 | break; | ||
| 584 | case 0x26: /* ES override */ | ||
| 585 | override_base = &ctxt->es_base; | ||
| 586 | break; | ||
| 587 | case 0x64: /* FS override */ | ||
| 588 | override_base = &ctxt->fs_base; | ||
| 589 | break; | ||
| 590 | case 0x65: /* GS override */ | ||
| 591 | override_base = &ctxt->gs_base; | ||
| 592 | break; | ||
| 593 | case 0x36: /* SS override */ | ||
| 594 | override_base = &ctxt->ss_base; | ||
| 595 | break; | ||
| 596 | case 0xf0: /* LOCK */ | ||
| 597 | lock_prefix = 1; | ||
| 598 | break; | ||
| 599 | case 0xf2: /* REPNE/REPNZ */ | ||
| 600 | case 0xf3: /* REP/REPE/REPZ */ | ||
| 601 | rep_prefix = 1; | ||
| 602 | break; | ||
| 603 | default: | ||
| 604 | goto done_prefixes; | ||
| 605 | } | ||
| 606 | } | ||
| 607 | |||
| 608 | done_prefixes: | ||
| 609 | |||
| 610 | /* REX prefix. */ | ||
| 611 | if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) { | ||
| 612 | rex_prefix = b; | ||
| 613 | if (b & 8) | ||
| 614 | op_bytes = 8; /* REX.W */ | ||
| 615 | modrm_reg = (b & 4) << 1; /* REX.R */ | ||
| 616 | index_reg = (b & 2) << 2; /* REX.X */ | ||
| 617 | modrm_rm = base_reg = (b & 1) << 3; /* REX.B */ | ||
| 618 | b = insn_fetch(u8, 1, _eip); | ||
| 619 | } | ||
| 620 | |||
| 621 | /* Opcode byte(s). */ | ||
| 622 | d = opcode_table[b]; | ||
| 623 | if (d == 0) { | ||
| 624 | /* Two-byte opcode? */ | ||
| 625 | if (b == 0x0f) { | ||
| 626 | twobyte = 1; | ||
| 627 | b = insn_fetch(u8, 1, _eip); | ||
| 628 | d = twobyte_table[b]; | ||
| 629 | } | ||
| 630 | |||
| 631 | /* Unrecognised? */ | ||
| 632 | if (d == 0) | ||
| 633 | goto cannot_emulate; | ||
| 634 | } | ||
| 635 | |||
| 636 | /* ModRM and SIB bytes. */ | ||
| 637 | if (d & ModRM) { | ||
| 638 | modrm = insn_fetch(u8, 1, _eip); | ||
| 639 | modrm_mod |= (modrm & 0xc0) >> 6; | ||
| 640 | modrm_reg |= (modrm & 0x38) >> 3; | ||
| 641 | modrm_rm |= (modrm & 0x07); | ||
| 642 | modrm_ea = 0; | ||
| 643 | use_modrm_ea = 1; | ||
| 644 | |||
| 645 | if (modrm_mod == 3) { | ||
| 646 | modrm_val = *(unsigned long *) | ||
| 647 | decode_register(modrm_rm, _regs, d & ByteOp); | ||
| 648 | goto modrm_done; | ||
| 649 | } | ||
| 650 | |||
| 651 | if (ad_bytes == 2) { | ||
| 652 | unsigned bx = _regs[VCPU_REGS_RBX]; | ||
| 653 | unsigned bp = _regs[VCPU_REGS_RBP]; | ||
| 654 | unsigned si = _regs[VCPU_REGS_RSI]; | ||
| 655 | unsigned di = _regs[VCPU_REGS_RDI]; | ||
| 656 | |||
| 657 | /* 16-bit ModR/M decode. */ | ||
| 658 | switch (modrm_mod) { | ||
| 659 | case 0: | ||
| 660 | if (modrm_rm == 6) | ||
| 661 | modrm_ea += insn_fetch(u16, 2, _eip); | ||
| 662 | break; | ||
| 663 | case 1: | ||
| 664 | modrm_ea += insn_fetch(s8, 1, _eip); | ||
| 665 | break; | ||
| 666 | case 2: | ||
| 667 | modrm_ea += insn_fetch(u16, 2, _eip); | ||
| 668 | break; | ||
| 669 | } | ||
| 670 | switch (modrm_rm) { | ||
| 671 | case 0: | ||
| 672 | modrm_ea += bx + si; | ||
| 673 | break; | ||
| 674 | case 1: | ||
| 675 | modrm_ea += bx + di; | ||
| 676 | break; | ||
| 677 | case 2: | ||
| 678 | modrm_ea += bp + si; | ||
| 679 | break; | ||
| 680 | case 3: | ||
| 681 | modrm_ea += bp + di; | ||
| 682 | break; | ||
| 683 | case 4: | ||
| 684 | modrm_ea += si; | ||
| 685 | break; | ||
| 686 | case 5: | ||
| 687 | modrm_ea += di; | ||
| 688 | break; | ||
| 689 | case 6: | ||
| 690 | if (modrm_mod != 0) | ||
| 691 | modrm_ea += bp; | ||
| 692 | break; | ||
| 693 | case 7: | ||
| 694 | modrm_ea += bx; | ||
| 695 | break; | ||
| 696 | } | ||
| 697 | if (modrm_rm == 2 || modrm_rm == 3 || | ||
| 698 | (modrm_rm == 6 && modrm_mod != 0)) | ||
| 699 | if (!override_base) | ||
| 700 | override_base = &ctxt->ss_base; | ||
| 701 | modrm_ea = (u16)modrm_ea; | ||
| 702 | } else { | ||
| 703 | /* 32/64-bit ModR/M decode. */ | ||
| 704 | switch (modrm_rm) { | ||
| 705 | case 4: | ||
| 706 | case 12: | ||
| 707 | sib = insn_fetch(u8, 1, _eip); | ||
| 708 | index_reg |= (sib >> 3) & 7; | ||
| 709 | base_reg |= sib & 7; | ||
| 710 | scale = sib >> 6; | ||
| 711 | |||
| 712 | switch (base_reg) { | ||
| 713 | case 5: | ||
| 714 | if (modrm_mod != 0) | ||
| 715 | modrm_ea += _regs[base_reg]; | ||
| 716 | else | ||
| 717 | modrm_ea += insn_fetch(s32, 4, _eip); | ||
| 718 | break; | ||
| 719 | default: | ||
| 720 | modrm_ea += _regs[base_reg]; | ||
| 721 | } | ||
| 722 | switch (index_reg) { | ||
| 723 | case 4: | ||
| 724 | break; | ||
| 725 | default: | ||
| 726 | modrm_ea += _regs[index_reg] << scale; | ||
| 727 | |||
| 728 | } | ||
| 729 | break; | ||
| 730 | case 5: | ||
| 731 | if (modrm_mod != 0) | ||
| 732 | modrm_ea += _regs[modrm_rm]; | ||
| 733 | else if (mode == X86EMUL_MODE_PROT64) | ||
| 734 | rip_relative = 1; | ||
| 735 | break; | ||
| 736 | default: | ||
| 737 | modrm_ea += _regs[modrm_rm]; | ||
| 738 | break; | ||
| 739 | } | ||
| 740 | switch (modrm_mod) { | ||
| 741 | case 0: | ||
| 742 | if (modrm_rm == 5) | ||
| 743 | modrm_ea += insn_fetch(s32, 4, _eip); | ||
| 744 | break; | ||
| 745 | case 1: | ||
| 746 | modrm_ea += insn_fetch(s8, 1, _eip); | ||
| 747 | break; | ||
| 748 | case 2: | ||
| 749 | modrm_ea += insn_fetch(s32, 4, _eip); | ||
| 750 | break; | ||
| 751 | } | ||
| 752 | } | ||
| 753 | if (!override_base) | ||
| 754 | override_base = &ctxt->ds_base; | ||
| 755 | if (mode == X86EMUL_MODE_PROT64 && | ||
| 756 | override_base != &ctxt->fs_base && | ||
| 757 | override_base != &ctxt->gs_base) | ||
| 758 | override_base = NULL; | ||
| 759 | |||
| 760 | if (override_base) | ||
| 761 | modrm_ea += *override_base; | ||
| 762 | |||
| 763 | if (rip_relative) { | ||
| 764 | modrm_ea += _eip; | ||
| 765 | switch (d & SrcMask) { | ||
| 766 | case SrcImmByte: | ||
| 767 | modrm_ea += 1; | ||
| 768 | break; | ||
| 769 | case SrcImm: | ||
| 770 | if (d & ByteOp) | ||
| 771 | modrm_ea += 1; | ||
| 772 | else | ||
| 773 | if (op_bytes == 8) | ||
| 774 | modrm_ea += 4; | ||
| 775 | else | ||
| 776 | modrm_ea += op_bytes; | ||
| 777 | } | ||
| 778 | } | ||
| 779 | if (ad_bytes != 8) | ||
| 780 | modrm_ea = (u32)modrm_ea; | ||
| 781 | cr2 = modrm_ea; | ||
| 782 | modrm_done: | ||
| 783 | ; | ||
| 784 | } | ||
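
To make the 16-bit decode above concrete: mod=1, rm=2 selects bp+si plus a sign-extended 8-bit displacement, truncated to 16 bits and (as the code notes) defaulting to the SS segment. A worked sketch with illustrative register values:

    /* Worked 16-bit effective address for mod=1, rm=2: bp+si+disp8. */
    #include <stdio.h>

    int main(void)
    {
            unsigned int bp = 0x2000, si = 0x0030;
            signed char disp8 = -0x10;      /* fetched via insn_fetch(s8) */
            unsigned short ea = (unsigned short)(bp + si + disp8);

            /* rm=2 also defaults the segment to SS, per the code above */
            printf("ea = %#x\n", ea);       /* 0x2020 */
            return 0;
    }
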
| 785 | |||
| 786 | /* | ||
| 787 | * Decode and fetch the source operand: register, memory | ||
| 788 | * or immediate. | ||
| 789 | */ | ||
| 790 | switch (d & SrcMask) { | ||
| 791 | case SrcNone: | ||
| 792 | break; | ||
| 793 | case SrcReg: | ||
| 794 | src.type = OP_REG; | ||
| 795 | if (d & ByteOp) { | ||
| 796 | src.ptr = decode_register(modrm_reg, _regs, | ||
| 797 | (rex_prefix == 0)); | ||
| 798 | src.val = src.orig_val = *(u8 *) src.ptr; | ||
| 799 | src.bytes = 1; | ||
| 800 | } else { | ||
| 801 | src.ptr = decode_register(modrm_reg, _regs, 0); | ||
| 802 | switch ((src.bytes = op_bytes)) { | ||
| 803 | case 2: | ||
| 804 | src.val = src.orig_val = *(u16 *) src.ptr; | ||
| 805 | break; | ||
| 806 | case 4: | ||
| 807 | src.val = src.orig_val = *(u32 *) src.ptr; | ||
| 808 | break; | ||
| 809 | case 8: | ||
| 810 | src.val = src.orig_val = *(u64 *) src.ptr; | ||
| 811 | break; | ||
| 812 | } | ||
| 813 | } | ||
| 814 | break; | ||
| 815 | case SrcMem16: | ||
| 816 | src.bytes = 2; | ||
| 817 | goto srcmem_common; | ||
| 818 | case SrcMem32: | ||
| 819 | src.bytes = 4; | ||
| 820 | goto srcmem_common; | ||
| 821 | case SrcMem: | ||
| 822 | src.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
| 823 | /* Don't fetch the address for invlpg: it could be unmapped. */ | ||
| 824 | if (twobyte && b == 0x01 && modrm_reg == 7) | ||
| 825 | break; | ||
| 826 | srcmem_common: | ||
| 827 | /* | ||
| 828 | * For instructions with a ModR/M byte, switch to register | ||
| 829 | * access if Mod = 3. | ||
| 830 | */ | ||
| 831 | if ((d & ModRM) && modrm_mod == 3) { | ||
| 832 | src.type = OP_REG; | ||
| 833 | break; | ||
| 834 | } | ||
| 835 | src.type = OP_MEM; | ||
| 836 | src.ptr = (unsigned long *)cr2; | ||
| 837 | src.val = 0; | ||
| 838 | if ((rc = ops->read_emulated((unsigned long)src.ptr, | ||
| 839 | &src.val, src.bytes, ctxt->vcpu)) != 0) | ||
| 840 | goto done; | ||
| 841 | src.orig_val = src.val; | ||
| 842 | break; | ||
| 843 | case SrcImm: | ||
| 844 | src.type = OP_IMM; | ||
| 845 | src.ptr = (unsigned long *)_eip; | ||
| 846 | src.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
| 847 | if (src.bytes == 8) | ||
| 848 | src.bytes = 4; | ||
| 849 | /* NB. Immediates are sign-extended as necessary. */ | ||
| 850 | switch (src.bytes) { | ||
| 851 | case 1: | ||
| 852 | src.val = insn_fetch(s8, 1, _eip); | ||
| 853 | break; | ||
| 854 | case 2: | ||
| 855 | src.val = insn_fetch(s16, 2, _eip); | ||
| 856 | break; | ||
| 857 | case 4: | ||
| 858 | src.val = insn_fetch(s32, 4, _eip); | ||
| 859 | break; | ||
| 860 | } | ||
| 861 | break; | ||
| 862 | case SrcImmByte: | ||
| 863 | src.type = OP_IMM; | ||
| 864 | src.ptr = (unsigned long *)_eip; | ||
| 865 | src.bytes = 1; | ||
| 866 | src.val = insn_fetch(s8, 1, _eip); | ||
| 867 | break; | ||
| 868 | } | ||
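
The sign-extension note is load-bearing: an immediate byte of 0xff fetched as s8 becomes all ones in the unsigned long operand value, which is what forms like add r/m32, imm8 require. In miniature:

    /* The sign extension insn_fetch(s8, ...) performs, in miniature. */
    #include <stdio.h>

    int main(void)
    {
            unsigned char byte = 0xff;
            unsigned long val = (unsigned long)(long)(signed char)byte;

            printf("%#lx\n", val);  /* all ones, e.g. 0xffffffffffffffff */
            return 0;
    }
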
| 869 | |||
| 870 | /* Decode and fetch the destination operand: register or memory. */ | ||
| 871 | switch (d & DstMask) { | ||
| 872 | case ImplicitOps: | ||
| 873 | /* Special instructions do their own operand decoding. */ | ||
| 874 | goto special_insn; | ||
| 875 | case DstReg: | ||
| 876 | dst.type = OP_REG; | ||
| 877 | if ((d & ByteOp) | ||
| 878 | && !(twobyte && (b == 0xb6 || b == 0xb7))) { | ||
| 879 | dst.ptr = decode_register(modrm_reg, _regs, | ||
| 880 | (rex_prefix == 0)); | ||
| 881 | dst.val = *(u8 *) dst.ptr; | ||
| 882 | dst.bytes = 1; | ||
| 883 | } else { | ||
| 884 | dst.ptr = decode_register(modrm_reg, _regs, 0); | ||
| 885 | switch ((dst.bytes = op_bytes)) { | ||
| 886 | case 2: | ||
| 887 | dst.val = *(u16 *)dst.ptr; | ||
| 888 | break; | ||
| 889 | case 4: | ||
| 890 | dst.val = *(u32 *)dst.ptr; | ||
| 891 | break; | ||
| 892 | case 8: | ||
| 893 | dst.val = *(u64 *)dst.ptr; | ||
| 894 | break; | ||
| 895 | } | ||
| 896 | } | ||
| 897 | break; | ||
| 898 | case DstMem: | ||
| 899 | dst.type = OP_MEM; | ||
| 900 | dst.ptr = (unsigned long *)cr2; | ||
| 901 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
| 902 | dst.val = 0; | ||
| 903 | /* | ||
| 904 | * For instructions with a ModR/M byte, switch to register | ||
| 905 | * access if Mod = 3. | ||
| 906 | */ | ||
| 907 | if ((d & ModRM) && modrm_mod == 3) { | ||
| 908 | dst.type = OP_REG; | ||
| 909 | break; | ||
| 910 | } | ||
| 911 | if (d & BitOp) { | ||
| 912 | unsigned long mask = ~(dst.bytes * 8 - 1); | ||
| 913 | |||
| 914 | dst.ptr = (void *)dst.ptr + (src.val & mask) / 8; | ||
| 915 | } | ||
| 916 | if (!(d & Mov) && /* optimisation - avoid slow emulated read */ | ||
| 917 | ((rc = ops->read_emulated((unsigned long)dst.ptr, | ||
| 918 | &dst.val, dst.bytes, ctxt->vcpu)) != 0)) | ||
| 919 | goto done; | ||
| 920 | break; | ||
| 921 | } | ||
| 922 | dst.orig_val = dst.val; | ||
| 923 | |||
| 924 | if (twobyte) | ||
| 925 | goto twobyte_insn; | ||
| 926 | |||
| 927 | switch (b) { | ||
| 928 | case 0x00 ... 0x05: | ||
| 929 | add: /* add */ | ||
| 930 | emulate_2op_SrcV("add", src, dst, _eflags); | ||
| 931 | break; | ||
| 932 | case 0x08 ... 0x0d: | ||
| 933 | or: /* or */ | ||
| 934 | emulate_2op_SrcV("or", src, dst, _eflags); | ||
| 935 | break; | ||
| 936 | case 0x10 ... 0x15: | ||
| 937 | adc: /* adc */ | ||
| 938 | emulate_2op_SrcV("adc", src, dst, _eflags); | ||
| 939 | break; | ||
| 940 | case 0x18 ... 0x1d: | ||
| 941 | sbb: /* sbb */ | ||
| 942 | emulate_2op_SrcV("sbb", src, dst, _eflags); | ||
| 943 | break; | ||
| 944 | case 0x20 ... 0x23: | ||
| 945 | and: /* and */ | ||
| 946 | emulate_2op_SrcV("and", src, dst, _eflags); | ||
| 947 | break; | ||
| 948 | case 0x24: /* and al imm8 */ | ||
| 949 | dst.type = OP_REG; | ||
| 950 | dst.ptr = &_regs[VCPU_REGS_RAX]; | ||
| 951 | dst.val = *(u8 *)dst.ptr; | ||
| 952 | dst.bytes = 1; | ||
| 953 | dst.orig_val = dst.val; | ||
| 954 | goto and; | ||
| 955 | case 0x25: /* and ax imm16, or eax imm32 */ | ||
| 956 | dst.type = OP_REG; | ||
| 957 | dst.bytes = op_bytes; | ||
| 958 | dst.ptr = &_regs[VCPU_REGS_RAX]; | ||
| 959 | if (op_bytes == 2) | ||
| 960 | dst.val = *(u16 *)dst.ptr; | ||
| 961 | else | ||
| 962 | dst.val = *(u32 *)dst.ptr; | ||
| 963 | dst.orig_val = dst.val; | ||
| 964 | goto and; | ||
| 965 | case 0x28 ... 0x2d: | ||
| 966 | sub: /* sub */ | ||
| 967 | emulate_2op_SrcV("sub", src, dst, _eflags); | ||
| 968 | break; | ||
| 969 | case 0x30 ... 0x35: | ||
| 970 | xor: /* xor */ | ||
| 971 | emulate_2op_SrcV("xor", src, dst, _eflags); | ||
| 972 | break; | ||
| 973 | case 0x38 ... 0x3d: | ||
| 974 | cmp: /* cmp */ | ||
| 975 | emulate_2op_SrcV("cmp", src, dst, _eflags); | ||
| 976 | break; | ||
| 977 | case 0x63: /* movsxd */ | ||
| 978 | if (mode != X86EMUL_MODE_PROT64) | ||
| 979 | goto cannot_emulate; | ||
| 980 | dst.val = (s32) src.val; | ||
| 981 | break; | ||
| 982 | case 0x80 ... 0x83: /* Grp1 */ | ||
| 983 | switch (modrm_reg) { | ||
| 984 | case 0: | ||
| 985 | goto add; | ||
| 986 | case 1: | ||
| 987 | goto or; | ||
| 988 | case 2: | ||
| 989 | goto adc; | ||
| 990 | case 3: | ||
| 991 | goto sbb; | ||
| 992 | case 4: | ||
| 993 | goto and; | ||
| 994 | case 5: | ||
| 995 | goto sub; | ||
| 996 | case 6: | ||
| 997 | goto xor; | ||
| 998 | case 7: | ||
| 999 | goto cmp; | ||
| 1000 | } | ||
| 1001 | break; | ||
| 1002 | case 0x84 ... 0x85: | ||
| 1003 | test: /* test */ | ||
| 1004 | emulate_2op_SrcV("test", src, dst, _eflags); | ||
| 1005 | break; | ||
| 1006 | case 0x86 ... 0x87: /* xchg */ | ||
| 1007 | /* Write back the register source. */ | ||
| 1008 | switch (dst.bytes) { | ||
| 1009 | case 1: | ||
| 1010 | *(u8 *) src.ptr = (u8) dst.val; | ||
| 1011 | break; | ||
| 1012 | case 2: | ||
| 1013 | *(u16 *) src.ptr = (u16) dst.val; | ||
| 1014 | break; | ||
| 1015 | case 4: | ||
| 1016 | *src.ptr = (u32) dst.val; | ||
| 1017 | break; /* 64b reg: zero-extend */ | ||
| 1018 | case 8: | ||
| 1019 | *src.ptr = dst.val; | ||
| 1020 | break; | ||
| 1021 | } | ||
| 1022 | /* | ||
| 1023 | * Write back the memory destination with implicit LOCK | ||
| 1024 | * prefix. | ||
| 1025 | */ | ||
| 1026 | dst.val = src.val; | ||
| 1027 | lock_prefix = 1; | ||
| 1028 | break; | ||
| 1029 | case 0x88 ... 0x8b: /* mov */ | ||
| 1030 | goto mov; | ||
| 1031 | case 0x8d: /* lea r16/r32, m */ | ||
| 1032 | dst.val = modrm_val; | ||
| 1033 | break; | ||
| 1034 | case 0x8f: /* pop (sole member of Grp1a) */ | ||
| 1035 | /* 64-bit mode: POP always pops a 64-bit operand. */ | ||
| 1036 | if (mode == X86EMUL_MODE_PROT64) | ||
| 1037 | dst.bytes = 8; | ||
| 1038 | if ((rc = ops->read_std(register_address(ctxt->ss_base, | ||
| 1039 | _regs[VCPU_REGS_RSP]), | ||
| 1040 | &dst.val, dst.bytes, ctxt->vcpu)) != 0) | ||
| 1041 | goto done; | ||
| 1042 | register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); | ||
| 1043 | break; | ||
| 1044 | case 0xa0 ... 0xa1: /* mov */ | ||
| 1045 | dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; | ||
| 1046 | dst.val = src.val; | ||
| 1047 | _eip += ad_bytes; /* skip src displacement */ | ||
| 1048 | break; | ||
| 1049 | case 0xa2 ... 0xa3: /* mov */ | ||
| 1050 | dst.val = (unsigned long)_regs[VCPU_REGS_RAX]; | ||
| 1051 | _eip += ad_bytes; /* skip dst displacement */ | ||
| 1052 | break; | ||
| 1053 | case 0xc0 ... 0xc1: | ||
| 1054 | grp2: /* Grp2 */ | ||
| 1055 | switch (modrm_reg) { | ||
| 1056 | case 0: /* rol */ | ||
| 1057 | emulate_2op_SrcB("rol", src, dst, _eflags); | ||
| 1058 | break; | ||
| 1059 | case 1: /* ror */ | ||
| 1060 | emulate_2op_SrcB("ror", src, dst, _eflags); | ||
| 1061 | break; | ||
| 1062 | case 2: /* rcl */ | ||
| 1063 | emulate_2op_SrcB("rcl", src, dst, _eflags); | ||
| 1064 | break; | ||
| 1065 | case 3: /* rcr */ | ||
| 1066 | emulate_2op_SrcB("rcr", src, dst, _eflags); | ||
| 1067 | break; | ||
| 1068 | case 4: /* sal/shl */ | ||
| 1069 | case 6: /* sal/shl */ | ||
| 1070 | emulate_2op_SrcB("sal", src, dst, _eflags); | ||
| 1071 | break; | ||
| 1072 | case 5: /* shr */ | ||
| 1073 | emulate_2op_SrcB("shr", src, dst, _eflags); | ||
| 1074 | break; | ||
| 1075 | case 7: /* sar */ | ||
| 1076 | emulate_2op_SrcB("sar", src, dst, _eflags); | ||
| 1077 | break; | ||
| 1078 | } | ||
| 1079 | break; | ||
| 1080 | case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ | ||
| 1081 | mov: | ||
| 1082 | dst.val = src.val; | ||
| 1083 | break; | ||
| 1084 | case 0xd0 ... 0xd1: /* Grp2 */ | ||
| 1085 | src.val = 1; | ||
| 1086 | goto grp2; | ||
| 1087 | case 0xd2 ... 0xd3: /* Grp2 */ | ||
| 1088 | src.val = _regs[VCPU_REGS_RCX]; | ||
| 1089 | goto grp2; | ||
| 1090 | case 0xf6 ... 0xf7: /* Grp3 */ | ||
| 1091 | switch (modrm_reg) { | ||
| 1092 | case 0 ... 1: /* test */ | ||
| 1093 | /* | ||
| 1094 | * Special case in Grp3: test has an immediate | ||
| 1095 | * source operand. | ||
| 1096 | */ | ||
| 1097 | src.type = OP_IMM; | ||
| 1098 | src.ptr = (unsigned long *)_eip; | ||
| 1099 | src.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
| 1100 | if (src.bytes == 8) | ||
| 1101 | src.bytes = 4; | ||
| 1102 | switch (src.bytes) { | ||
| 1103 | case 1: | ||
| 1104 | src.val = insn_fetch(s8, 1, _eip); | ||
| 1105 | break; | ||
| 1106 | case 2: | ||
| 1107 | src.val = insn_fetch(s16, 2, _eip); | ||
| 1108 | break; | ||
| 1109 | case 4: | ||
| 1110 | src.val = insn_fetch(s32, 4, _eip); | ||
| 1111 | break; | ||
| 1112 | } | ||
| 1113 | goto test; | ||
| 1114 | case 2: /* not */ | ||
| 1115 | dst.val = ~dst.val; | ||
| 1116 | break; | ||
| 1117 | case 3: /* neg */ | ||
| 1118 | emulate_1op("neg", dst, _eflags); | ||
| 1119 | break; | ||
| 1120 | default: | ||
| 1121 | goto cannot_emulate; | ||
| 1122 | } | ||
| 1123 | break; | ||
| 1124 | case 0xfe ... 0xff: /* Grp4/Grp5 */ | ||
| 1125 | switch (modrm_reg) { | ||
| 1126 | case 0: /* inc */ | ||
| 1127 | emulate_1op("inc", dst, _eflags); | ||
| 1128 | break; | ||
| 1129 | case 1: /* dec */ | ||
| 1130 | emulate_1op("dec", dst, _eflags); | ||
| 1131 | break; | ||
| 1132 | case 4: /* jmp abs */ | ||
| 1133 | if (b == 0xff) | ||
| 1134 | _eip = dst.val; | ||
| 1135 | else | ||
| 1136 | goto cannot_emulate; | ||
| 1137 | break; | ||
| 1138 | case 6: /* push */ | ||
| 1139 | /* 64-bit mode: PUSH always pushes a 64-bit operand. */ | ||
| 1140 | if (mode == X86EMUL_MODE_PROT64) { | ||
| 1141 | dst.bytes = 8; | ||
| 1142 | if ((rc = ops->read_std((unsigned long)dst.ptr, | ||
| 1143 | &dst.val, 8, | ||
| 1144 | ctxt->vcpu)) != 0) | ||
| 1145 | goto done; | ||
| 1146 | } | ||
| 1147 | register_address_increment(_regs[VCPU_REGS_RSP], | ||
| 1148 | -dst.bytes); | ||
| 1149 | if ((rc = ops->write_emulated( | ||
| 1150 | register_address(ctxt->ss_base, | ||
| 1151 | _regs[VCPU_REGS_RSP]), | ||
| 1152 | &dst.val, dst.bytes, ctxt->vcpu)) != 0) | ||
| 1153 | goto done; | ||
| 1154 | no_wb = 1; | ||
| 1155 | break; | ||
| 1156 | default: | ||
| 1157 | goto cannot_emulate; | ||
| 1158 | } | ||
| 1159 | break; | ||
| 1160 | } | ||
| 1161 | |||
| 1162 | writeback: | ||
| 1163 | if (!no_wb) { | ||
| 1164 | switch (dst.type) { | ||
| 1165 | case OP_REG: | ||
| 1166 | /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ | ||
| 1167 | switch (dst.bytes) { | ||
| 1168 | case 1: | ||
| 1169 | *(u8 *)dst.ptr = (u8)dst.val; | ||
| 1170 | break; | ||
| 1171 | case 2: | ||
| 1172 | *(u16 *)dst.ptr = (u16)dst.val; | ||
| 1173 | break; | ||
| 1174 | case 4: | ||
| 1175 | *dst.ptr = (u32)dst.val; | ||
| 1176 | break; /* 64b: zero-ext */ | ||
| 1177 | case 8: | ||
| 1178 | *dst.ptr = dst.val; | ||
| 1179 | break; | ||
| 1180 | } | ||
| 1181 | break; | ||
| 1182 | case OP_MEM: | ||
| 1183 | if (lock_prefix) | ||
| 1184 | rc = ops->cmpxchg_emulated((unsigned long)dst. | ||
| 1185 | ptr, &dst.orig_val, | ||
| 1186 | &dst.val, dst.bytes, | ||
| 1187 | ctxt->vcpu); | ||
| 1188 | else | ||
| 1189 | rc = ops->write_emulated((unsigned long)dst.ptr, | ||
| 1190 | &dst.val, dst.bytes, | ||
| 1191 | ctxt->vcpu); | ||
| 1192 | if (rc != 0) | ||
| 1193 | goto done; | ||
| 1194 | default: | ||
| 1195 | break; | ||
| 1196 | } | ||
| 1197 | } | ||
| 1198 | |||
| 1199 | /* Commit shadow register state. */ | ||
| 1200 | memcpy(ctxt->vcpu->regs, _regs, sizeof _regs); | ||
| 1201 | ctxt->eflags = _eflags; | ||
| 1202 | ctxt->vcpu->rip = _eip; | ||
| 1203 | |||
| 1204 | done: | ||
| 1205 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | ||
| 1206 | |||
| 1207 | special_insn: | ||
| 1208 | if (twobyte) | ||
| 1209 | goto twobyte_special_insn; | ||
| 1210 | switch(b) { | ||
| 1211 | case 0x50 ... 0x57: /* push reg */ | ||
| 1212 | if (op_bytes == 2) | ||
| 1213 | src.val = (u16) _regs[b & 0x7]; | ||
| 1214 | else | ||
| 1215 | src.val = (u32) _regs[b & 0x7]; | ||
| 1216 | dst.type = OP_MEM; | ||
| 1217 | dst.bytes = op_bytes; | ||
| 1218 | dst.val = src.val; | ||
| 1219 | register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); | ||
| 1220 | dst.ptr = (void *) register_address( | ||
| 1221 | ctxt->ss_base, _regs[VCPU_REGS_RSP]); | ||
| 1222 | break; | ||
| 1223 | case 0x58 ... 0x5f: /* pop reg */ | ||
| 1224 | dst.ptr = (unsigned long *)&_regs[b & 0x7]; | ||
| 1225 | pop_instruction: | ||
| 1226 | if ((rc = ops->read_std(register_address(ctxt->ss_base, | ||
| 1227 | _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu)) | ||
| 1228 | != 0) | ||
| 1229 | goto done; | ||
| 1230 | |||
| 1231 | register_address_increment(_regs[VCPU_REGS_RSP], op_bytes); | ||
| 1232 | no_wb = 1; /* Disable writeback. */ | ||
| 1233 | break; | ||
| 1234 | case 0x6a: /* push imm8 */ | ||
| 1235 | src.val = 0L; | ||
| 1236 | src.val = insn_fetch(s8, 1, _eip); | ||
| 1237 | push: | ||
| 1238 | dst.type = OP_MEM; | ||
| 1239 | dst.bytes = op_bytes; | ||
| 1240 | dst.val = src.val; | ||
| 1241 | register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); | ||
| 1242 | dst.ptr = (void *) register_address(ctxt->ss_base, | ||
| 1243 | _regs[VCPU_REGS_RSP]); | ||
| 1244 | break; | ||
| 1245 | case 0x6c: /* insb */ | ||
| 1246 | case 0x6d: /* insw/insd */ | ||
| 1247 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
| 1248 | 1, /* in */ | ||
| 1249 | (d & ByteOp) ? 1 : op_bytes, /* size */ | ||
| 1250 | rep_prefix ? | ||
| 1251 | address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ | ||
| 1252 | (_eflags & EFLG_DF), /* down */ | ||
| 1253 | register_address(ctxt->es_base, | ||
| 1254 | _regs[VCPU_REGS_RDI]), /* address */ | ||
| 1255 | rep_prefix, | ||
| 1256 | _regs[VCPU_REGS_RDX] /* port */ | ||
| 1257 | ) == 0) | ||
| 1258 | return -1; | ||
| 1259 | return 0; | ||
| 1260 | case 0x6e: /* outsb */ | ||
| 1261 | case 0x6f: /* outsw/outsd */ | ||
| 1262 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
| 1263 | 0, /* in */ | ||
| 1264 | (d & ByteOp) ? 1 : op_bytes, /* size */ | ||
| 1265 | rep_prefix ? | ||
| 1266 | address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ | ||
| 1267 | (_eflags & EFLG_DF), /* down */ | ||
| 1268 | register_address(override_base ? | ||
| 1269 | *override_base : ctxt->ds_base, | ||
| 1270 | _regs[VCPU_REGS_RSI]), /* address */ | ||
| 1271 | rep_prefix, | ||
| 1272 | _regs[VCPU_REGS_RDX] /* port */ | ||
| 1273 | ) == 0) | ||
| 1274 | return -1; | ||
| 1275 | return 0; | ||
| 1276 | case 0x70 ... 0x7f: /* jcc (short) */ { | ||
| 1277 | int rel = insn_fetch(s8, 1, _eip); | ||
| 1278 | |||
| 1279 | if (test_cc(b, _eflags)) | ||
| 1280 | JMP_REL(rel); | ||
| 1281 | break; | ||
| 1282 | } | ||
| 1283 | case 0x9c: /* pushf */ | ||
| 1284 | src.val = (unsigned long) _eflags; | ||
| 1285 | goto push; | ||
| 1286 | case 0x9d: /* popf */ | ||
| 1287 | dst.ptr = (unsigned long *) &_eflags; | ||
| 1288 | goto pop_instruction; | ||
| 1289 | case 0xc3: /* ret */ | ||
| 1290 | dst.ptr = &_eip; | ||
| 1291 | goto pop_instruction; | ||
| 1292 | case 0xf4: /* hlt */ | ||
| 1293 | ctxt->vcpu->halt_request = 1; | ||
| 1294 | goto done; | ||
| 1295 | } | ||
| 1296 | if (rep_prefix) { | ||
| 1297 | if (_regs[VCPU_REGS_RCX] == 0) { | ||
| 1298 | ctxt->vcpu->rip = _eip; | ||
| 1299 | goto done; | ||
| 1300 | } | ||
| 1301 | _regs[VCPU_REGS_RCX]--; | ||
| 1302 | _eip = ctxt->vcpu->rip; | ||
| 1303 | } | ||
| 1304 | switch (b) { | ||
| 1305 | case 0xa4 ... 0xa5: /* movs */ | ||
| 1306 | dst.type = OP_MEM; | ||
| 1307 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
| 1308 | dst.ptr = (unsigned long *)register_address(ctxt->es_base, | ||
| 1309 | _regs[VCPU_REGS_RDI]); | ||
| 1310 | if ((rc = ops->read_emulated(register_address( | ||
| 1311 | override_base ? *override_base : ctxt->ds_base, | ||
| 1312 | _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0) | ||
| 1313 | goto done; | ||
| 1314 | register_address_increment(_regs[VCPU_REGS_RSI], | ||
| 1315 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
| 1316 | register_address_increment(_regs[VCPU_REGS_RDI], | ||
| 1317 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
| 1318 | break; | ||
| 1319 | case 0xa6 ... 0xa7: /* cmps */ | ||
| 1320 | DPRINTF("Urk! I don't handle CMPS.\n"); | ||
| 1321 | goto cannot_emulate; | ||
| 1322 | case 0xaa ... 0xab: /* stos */ | ||
| 1323 | dst.type = OP_MEM; | ||
| 1324 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
| 1325 | dst.ptr = (unsigned long *)cr2; | ||
| 1326 | dst.val = _regs[VCPU_REGS_RAX]; | ||
| 1327 | register_address_increment(_regs[VCPU_REGS_RDI], | ||
| 1328 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
| 1329 | break; | ||
| 1330 | case 0xac ... 0xad: /* lods */ | ||
| 1331 | dst.type = OP_REG; | ||
| 1332 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
| 1333 | dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; | ||
| 1334 | if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, | ||
| 1335 | ctxt->vcpu)) != 0) | ||
| 1336 | goto done; | ||
| 1337 | register_address_increment(_regs[VCPU_REGS_RSI], | ||
| 1338 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
| 1339 | break; | ||
| 1340 | case 0xae ... 0xaf: /* scas */ | ||
| 1341 | DPRINTF("Urk! I don't handle SCAS.\n"); | ||
| 1342 | goto cannot_emulate; | ||
| 1343 | case 0xe8: /* call (near) */ { | ||
| 1344 | long int rel; | ||
| 1345 | switch (op_bytes) { | ||
| 1346 | case 2: | ||
| 1347 | rel = insn_fetch(s16, 2, _eip); | ||
| 1348 | break; | ||
| 1349 | case 4: | ||
| 1350 | rel = insn_fetch(s32, 4, _eip); | ||
| 1351 | break; | ||
| 1352 | case 8: | ||
| 1353 | rel = insn_fetch(s64, 8, _eip); | ||
| 1354 | break; | ||
| 1355 | default: | ||
| 1356 | DPRINTF("Call: Invalid op_bytes\n"); | ||
| 1357 | goto cannot_emulate; | ||
| 1358 | } | ||
| 1359 | src.val = (unsigned long) _eip; | ||
| 1360 | JMP_REL(rel); | ||
| 1361 | op_bytes = ad_bytes; | ||
| 1362 | goto push; | ||
| 1363 | } | ||
| 1364 | case 0xe9: /* jmp rel */ | ||
| 1365 | case 0xeb: /* jmp rel short */ | ||
| 1366 | JMP_REL(src.val); | ||
| 1367 | no_wb = 1; /* Disable writeback. */ | ||
| 1368 | break; | ||
| 1369 | |||
| 1370 | |||
| 1371 | } | ||
| 1372 | goto writeback; | ||
| 1373 | |||
| 1374 | twobyte_insn: | ||
| 1375 | switch (b) { | ||
| 1376 | case 0x01: /* lgdt, lidt, lmsw */ | ||
| 1377 | /* Disable writeback. */ | ||
| 1378 | no_wb = 1; | ||
| 1379 | switch (modrm_reg) { | ||
| 1380 | u16 size; | ||
| 1381 | unsigned long address; | ||
| 1382 | |||
| 1383 | case 2: /* lgdt */ | ||
| 1384 | rc = read_descriptor(ctxt, ops, src.ptr, | ||
| 1385 | &size, &address, op_bytes); | ||
| 1386 | if (rc) | ||
| 1387 | goto done; | ||
| 1388 | realmode_lgdt(ctxt->vcpu, size, address); | ||
| 1389 | break; | ||
| 1390 | case 3: /* lidt */ | ||
| 1391 | rc = read_descriptor(ctxt, ops, src.ptr, | ||
| 1392 | &size, &address, op_bytes); | ||
| 1393 | if (rc) | ||
| 1394 | goto done; | ||
| 1395 | realmode_lidt(ctxt->vcpu, size, address); | ||
| 1396 | break; | ||
| 1397 | case 4: /* smsw */ | ||
| 1398 | if (modrm_mod != 3) | ||
| 1399 | goto cannot_emulate; | ||
| 1400 | *(u16 *)&_regs[modrm_rm] | ||
| 1401 | = realmode_get_cr(ctxt->vcpu, 0); | ||
| 1402 | break; | ||
| 1403 | case 6: /* lmsw */ | ||
| 1404 | if (modrm_mod != 3) | ||
| 1405 | goto cannot_emulate; | ||
| 1406 | realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags); | ||
| 1407 | break; | ||
| 1408 | case 7: /* invlpg */ | ||
| 1409 | emulate_invlpg(ctxt->vcpu, cr2); | ||
| 1410 | break; | ||
| 1411 | default: | ||
| 1412 | goto cannot_emulate; | ||
| 1413 | } | ||
| 1414 | break; | ||
| 1415 | case 0x21: /* mov from dr to reg */ | ||
| 1416 | no_wb = 1; | ||
| 1417 | if (modrm_mod != 3) | ||
| 1418 | goto cannot_emulate; | ||
| 1419 | rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]); | ||
| 1420 | break; | ||
| 1421 | case 0x23: /* mov from reg to dr */ | ||
| 1422 | no_wb = 1; | ||
| 1423 | if (modrm_mod != 3) | ||
| 1424 | goto cannot_emulate; | ||
| 1425 | rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]); | ||
| 1426 | break; | ||
| 1427 | case 0x40 ... 0x4f: /* cmov */ | ||
| 1428 | dst.val = dst.orig_val = src.val; | ||
| 1429 | no_wb = 1; | ||
| 1430 | /* | ||
| 1431 | * First, assume we're decoding an even cmov opcode | ||
| 1432 | * (lsb == 0). | ||
| 1433 | */ | ||
| 1434 | switch ((b & 15) >> 1) { | ||
| 1435 | case 0: /* cmovo */ | ||
| 1436 | no_wb = (_eflags & EFLG_OF) ? 0 : 1; | ||
| 1437 | break; | ||
| 1438 | case 1: /* cmovb/cmovc/cmovnae */ | ||
| 1439 | no_wb = (_eflags & EFLG_CF) ? 0 : 1; | ||
| 1440 | break; | ||
| 1441 | case 2: /* cmovz/cmove */ | ||
| 1442 | no_wb = (_eflags & EFLG_ZF) ? 0 : 1; | ||
| 1443 | break; | ||
| 1444 | case 3: /* cmovbe/cmovna */ | ||
| 1445 | no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1; | ||
| 1446 | break; | ||
| 1447 | case 4: /* cmovs */ | ||
| 1448 | no_wb = (_eflags & EFLG_SF) ? 0 : 1; | ||
| 1449 | break; | ||
| 1450 | case 5: /* cmovp/cmovpe */ | ||
| 1451 | no_wb = (_eflags & EFLG_PF) ? 0 : 1; | ||
| 1452 | break; | ||
| 1453 | case 7: /* cmovle/cmovng */ | ||
| 1454 | no_wb = (_eflags & EFLG_ZF) ? 0 : 1; | ||
| 1455 | /* fall through */ | ||
| 1456 | case 6: /* cmovl/cmovnge */ | ||
| 1457 | no_wb &= (!(_eflags & EFLG_SF) != | ||
| 1458 | !(_eflags & EFLG_OF)) ? 0 : 1; | ||
| 1459 | break; | ||
| 1460 | } | ||
| 1461 | /* Odd cmov opcodes (lsb == 1) have inverted sense. */ | ||
| 1462 | no_wb ^= b & 1; | ||
| 1463 | break; | ||
| 1464 | case 0xa3: | ||
| 1465 | bt: /* bt */ | ||
| 1466 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
| 1467 | emulate_2op_SrcV_nobyte("bt", src, dst, _eflags); | ||
| 1468 | break; | ||
| 1469 | case 0xab: | ||
| 1470 | bts: /* bts */ | ||
| 1471 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
| 1472 | emulate_2op_SrcV_nobyte("bts", src, dst, _eflags); | ||
| 1473 | break; | ||
| 1474 | case 0xb0 ... 0xb1: /* cmpxchg */ | ||
| 1475 | /* | ||
| 1476 | * Save real source value, then compare EAX against | ||
| 1477 | * destination. | ||
| 1478 | */ | ||
| 1479 | src.orig_val = src.val; | ||
| 1480 | src.val = _regs[VCPU_REGS_RAX]; | ||
| 1481 | emulate_2op_SrcV("cmp", src, dst, _eflags); | ||
| 1482 | if (_eflags & EFLG_ZF) { | ||
| 1483 | /* Success: write back to memory. */ | ||
| 1484 | dst.val = src.orig_val; | ||
| 1485 | } else { | ||
| 1486 | /* Failure: write the value we saw to EAX. */ | ||
| 1487 | dst.type = OP_REG; | ||
| 1488 | dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; | ||
| 1489 | } | ||
| 1490 | break; | ||
| 1491 | case 0xb3: | ||
| 1492 | btr: /* btr */ | ||
| 1493 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
| 1494 | emulate_2op_SrcV_nobyte("btr", src, dst, _eflags); | ||
| 1495 | break; | ||
| 1496 | case 0xb6 ... 0xb7: /* movzx */ | ||
| 1497 | dst.bytes = op_bytes; | ||
| 1498 | dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val; | ||
| 1499 | break; | ||
| 1500 | case 0xba: /* Grp8 */ | ||
| 1501 | switch (modrm_reg & 3) { | ||
| 1502 | case 0: | ||
| 1503 | goto bt; | ||
| 1504 | case 1: | ||
| 1505 | goto bts; | ||
| 1506 | case 2: | ||
| 1507 | goto btr; | ||
| 1508 | case 3: | ||
| 1509 | goto btc; | ||
| 1510 | } | ||
| 1511 | break; | ||
| 1512 | case 0xbb: | ||
| 1513 | btc: /* btc */ | ||
| 1514 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
| 1515 | emulate_2op_SrcV_nobyte("btc", src, dst, _eflags); | ||
| 1516 | break; | ||
| 1517 | case 0xbe ... 0xbf: /* movsx */ | ||
| 1518 | dst.bytes = op_bytes; | ||
| 1519 | dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val; | ||
| 1520 | break; | ||
| 1521 | case 0xc3: /* movnti */ | ||
| 1522 | dst.bytes = op_bytes; | ||
| 1523 | dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val; | ||
| 1524 | break; | ||
| 1525 | } | ||
| 1526 | goto writeback; | ||
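
For the bt/bts/btr/btc cases, src.val is reduced here to a subword bit offset, while the aligned remainder was already folded into dst.ptr by the BitOp adjustment during destination decode. A sketch of that split for a 16-bit operand (offset 21 lands at byte +2, bit 5):

    /* Sketch of the bit-offset split used by the bt family above. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long op_bits = 16;     /* dst.bytes << 3 */
            unsigned long bitoff = 21;      /* src.val as a bit index */

            unsigned long sub = bitoff & (op_bits - 1);           /* bit 5 */
            unsigned long bytes = (bitoff & ~(op_bits - 1)) / 8;  /* +2 */

            printf("byte offset +%lu, subword bit %lu\n", bytes, sub);
            return 0;
    }
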
| 1527 | |||
| 1528 | twobyte_special_insn: | ||
| 1529 | /* Disable writeback. */ | ||
| 1530 | no_wb = 1; | ||
| 1531 | switch (b) { | ||
| 1532 | case 0x06: | ||
| 1533 | emulate_clts(ctxt->vcpu); | ||
| 1534 | break; | ||
| 1535 | case 0x08: /* invd */ | ||
| 1536 | break; | ||
| 1537 | case 0x09: /* wbinvd */ | ||
| 1538 | break; | ||
| 1539 | case 0x0d: /* GrpP (prefetch) */ | ||
| 1540 | case 0x18: /* Grp16 (prefetch/nop) */ | ||
| 1541 | break; | ||
| 1542 | case 0x20: /* mov cr, reg */ | ||
| 1543 | if (modrm_mod != 3) | ||
| 1544 | goto cannot_emulate; | ||
| 1545 | _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg); | ||
| 1546 | break; | ||
| 1547 | case 0x22: /* mov reg, cr */ | ||
| 1548 | if (modrm_mod != 3) | ||
| 1549 | goto cannot_emulate; | ||
| 1550 | realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags); | ||
| 1551 | break; | ||
| 1552 | case 0x30: | ||
| 1553 | /* wrmsr */ | ||
| 1554 | msr_data = (u32)_regs[VCPU_REGS_RAX] | ||
| 1555 | | ((u64)_regs[VCPU_REGS_RDX] << 32); | ||
| 1556 | rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data); | ||
| 1557 | if (rc) { | ||
| 1558 | kvm_x86_ops->inject_gp(ctxt->vcpu, 0); | ||
| 1559 | _eip = ctxt->vcpu->rip; | ||
| 1560 | } | ||
| 1561 | rc = X86EMUL_CONTINUE; | ||
| 1562 | break; | ||
| 1563 | case 0x32: | ||
| 1564 | /* rdmsr */ | ||
| 1565 | rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data); | ||
| 1566 | if (rc) { | ||
| 1567 | kvm_x86_ops->inject_gp(ctxt->vcpu, 0); | ||
| 1568 | _eip = ctxt->vcpu->rip; | ||
| 1569 | } else { | ||
| 1570 | _regs[VCPU_REGS_RAX] = (u32)msr_data; | ||
| 1571 | _regs[VCPU_REGS_RDX] = msr_data >> 32; | ||
| 1572 | } | ||
| 1573 | rc = X86EMUL_CONTINUE; | ||
| 1574 | break; | ||
| 1575 | case 0x80 ... 0x8f: /* jnz rel, etc. */ { | ||
| 1576 | long int rel; | ||
| 1577 | |||
| 1578 | switch (op_bytes) { | ||
| 1579 | case 2: | ||
| 1580 | rel = insn_fetch(s16, 2, _eip); | ||
| 1581 | break; | ||
| 1582 | case 4: | ||
| 1583 | rel = insn_fetch(s32, 4, _eip); | ||
| 1584 | break; | ||
| 1585 | case 8: | ||
| 1586 | rel = insn_fetch(s64, 8, _eip); | ||
| 1587 | break; | ||
| 1588 | default: | ||
| 1589 | DPRINTF("jnz: Invalid op_bytes\n"); | ||
| 1590 | goto cannot_emulate; | ||
| 1591 | } | ||
| 1592 | if (test_cc(b, _eflags)) | ||
| 1593 | JMP_REL(rel); | ||
| 1594 | break; | ||
| 1595 | } | ||
| 1596 | case 0xc7: /* Grp9 (cmpxchg8b) */ | ||
| 1597 | { | ||
| 1598 | u64 old, new; | ||
| 1599 | if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu)) | ||
| 1600 | != 0) | ||
| 1601 | goto done; | ||
| 1602 | if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || | ||
| 1603 | ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) { | ||
| 1604 | _regs[VCPU_REGS_RAX] = (u32) (old >> 0); | ||
| 1605 | _regs[VCPU_REGS_RDX] = (u32) (old >> 32); | ||
| 1606 | _eflags &= ~EFLG_ZF; | ||
| 1607 | } else { | ||
| 1608 | new = ((u64)_regs[VCPU_REGS_RCX] << 32) | ||
| 1609 | | (u32) _regs[VCPU_REGS_RBX]; | ||
| 1610 | if ((rc = ops->cmpxchg_emulated(cr2, &old, | ||
| 1611 | &new, 8, ctxt->vcpu)) != 0) | ||
| 1612 | goto done; | ||
| 1613 | _eflags |= EFLG_ZF; | ||
| 1614 | } | ||
| 1615 | break; | ||
| 1616 | } | ||
| 1617 | } | ||
| 1618 | goto writeback; | ||
| 1619 | |||
| 1620 | cannot_emulate: | ||
| 1621 | DPRINTF("Cannot emulate %02x\n", b); | ||
| 1622 | return -1; | ||
| 1623 | } | ||
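
One convention worth spelling out from the wrmsr/rdmsr cases above: MSR values travel in the EDX:EAX register pair, so the emulator packs two 32-bit halves into a u64 and splits them back. A sketch of that packing (values illustrative):

    /* EDX:EAX packing used by the wrmsr/rdmsr emulation. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t rax = 0xdeadbeef, rdx = 0x12345678;
            uint64_t msr = (uint32_t)rax | (rdx << 32);   /* wrmsr side */

            printf("msr = %#llx\n", (unsigned long long)msr);
            printf("eax = %#x\n", (uint32_t)msr);         /* rdmsr low */
            printf("edx = %#x\n", (uint32_t)(msr >> 32)); /* rdmsr high */
            return 0;
    }
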
| 1624 | |||
| 1625 | #ifdef __XEN__ | ||
| 1626 | |||
| 1627 | #include <asm/mm.h> | ||
| 1628 | #include <asm/uaccess.h> | ||
| 1629 | |||
| 1630 | int | ||
| 1631 | x86_emulate_read_std(unsigned long addr, | ||
| 1632 | unsigned long *val, | ||
| 1633 | unsigned int bytes, struct x86_emulate_ctxt *ctxt) | ||
| 1634 | { | ||
| 1635 | unsigned int rc; | ||
| 1636 | |||
| 1637 | *val = 0; | ||
| 1638 | |||
| 1639 | if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) { | ||
| 1640 | propagate_page_fault(addr + bytes - rc, 0); /* read fault */ | ||
| 1641 | return X86EMUL_PROPAGATE_FAULT; | ||
| 1642 | } | ||
| 1643 | |||
| 1644 | return X86EMUL_CONTINUE; | ||
| 1645 | } | ||
| 1646 | |||
| 1647 | int | ||
| 1648 | x86_emulate_write_std(unsigned long addr, | ||
| 1649 | unsigned long val, | ||
| 1650 | unsigned int bytes, struct x86_emulate_ctxt *ctxt) | ||
| 1651 | { | ||
| 1652 | unsigned int rc; | ||
| 1653 | |||
| 1654 | if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) { | ||
| 1655 | propagate_page_fault(addr + bytes - rc, PGERR_write_access); | ||
| 1656 | return X86EMUL_PROPAGATE_FAULT; | ||
| 1657 | } | ||
| 1658 | |||
| 1659 | return X86EMUL_CONTINUE; | ||
| 1660 | } | ||
| 1661 | |||
| 1662 | #endif | ||
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index cb4c67025d52..7743d73768df 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c | |||
| @@ -151,43 +151,43 @@ int lguest_address_ok(const struct lguest *lg, | |||
| 151 | /* This routine copies memory from the Guest. Here we can see how useful the | 151 | /* This routine copies memory from the Guest. Here we can see how useful the |
| 152 | * kill_lguest() routine we met in the Launcher can be: we return a random | 152 | * kill_lguest() routine we met in the Launcher can be: we return a random |
| 153 | * value (all zeroes) instead of needing to return an error. */ | 153 | * value (all zeroes) instead of needing to return an error. */ |
| 154 | void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) | 154 | void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) |
| 155 | { | 155 | { |
| 156 | if (!lguest_address_ok(lg, addr, bytes) | 156 | if (!lguest_address_ok(cpu->lg, addr, bytes) |
| 157 | || copy_from_user(b, lg->mem_base + addr, bytes) != 0) { | 157 | || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) { |
| 158 | /* copy_from_user should do this, but as we rely on it... */ | 158 | /* copy_from_user should do this, but as we rely on it... */ |
| 159 | memset(b, 0, bytes); | 159 | memset(b, 0, bytes); |
| 160 | kill_guest(lg, "bad read address %#lx len %u", addr, bytes); | 160 | kill_guest(cpu, "bad read address %#lx len %u", addr, bytes); |
| 161 | } | 161 | } |
| 162 | } | 162 | } |
| 163 | 163 | ||
| 164 | /* This is the write (copy into guest) version. */ | 164 | /* This is the write (copy into guest) version. */ |
| 165 | void __lgwrite(struct lguest *lg, unsigned long addr, const void *b, | 165 | void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, |
| 166 | unsigned bytes) | 166 | unsigned bytes) |
| 167 | { | 167 | { |
| 168 | if (!lguest_address_ok(lg, addr, bytes) | 168 | if (!lguest_address_ok(cpu->lg, addr, bytes) |
| 169 | || copy_to_user(lg->mem_base + addr, b, bytes) != 0) | 169 | || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0) |
| 170 | kill_guest(lg, "bad write address %#lx len %u", addr, bytes); | 170 | kill_guest(cpu, "bad write address %#lx len %u", addr, bytes); |
| 171 | } | 171 | } |
| 172 | /*:*/ | 172 | /*:*/ |
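
Both accessors share one defensive shape: bounds-check first, then copy, and on any failure hand the caller deterministic zeroes instead of stale data. A hedged userspace analogue (the struct and helper names are illustrative, not the lguest interface):

    /* Check-then-copy-or-zero, outside the kernel. */
    #include <string.h>

    struct guest {
            char *mem_base;
            unsigned long mem_size;
    };

    static int addr_ok(const struct guest *g, unsigned long addr,
                       unsigned int len)
    {
            /* wraparound guard, then the upper bound */
            return addr + len >= addr && addr + len <= g->mem_size;
    }

    static void lgread(struct guest *g, void *b, unsigned long addr,
                       unsigned int len)
    {
            if (!addr_ok(g, addr, len)) {
                    memset(b, 0, len); /* "random" but safe: all zeroes */
                    return;            /* a real host kills the guest here */
            }
            memcpy(b, g->mem_base + addr, len);
    }

    int main(void)
    {
            char backing[4096] = "guest memory";
            struct guest g = { backing, sizeof(backing) };
            char buf[8];

            lgread(&g, buf, 0, 5);     /* in range: copies "guest" */
            lgread(&g, buf, 4090, 8);  /* straddles the end: buf zeroed */
            return 0;
    }
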
| 173 | 173 | ||
| 174 | /*H:030 Let's jump straight to the main loop which runs the Guest. | 174 | /*H:030 Let's jump straight to the main loop which runs the Guest. |
| 175 | * Remember, this is called by the Launcher reading /dev/lguest, and we keep | 175 | * Remember, this is called by the Launcher reading /dev/lguest, and we keep |
| 176 | * going around and around until something interesting happens. */ | 176 | * going around and around until something interesting happens. */ |
| 177 | int run_guest(struct lguest *lg, unsigned long __user *user) | 177 | int run_guest(struct lg_cpu *cpu, unsigned long __user *user) |
| 178 | { | 178 | { |
| 179 | /* We stop running once the Guest is dead. */ | 179 | /* We stop running once the Guest is dead. */ |
| 180 | while (!lg->dead) { | 180 | while (!cpu->lg->dead) { |
| 181 | /* First we run any hypercalls the Guest wants done. */ | 181 | /* First we run any hypercalls the Guest wants done. */ |
| 182 | if (lg->hcall) | 182 | if (cpu->hcall) |
| 183 | do_hypercalls(lg); | 183 | do_hypercalls(cpu); |
| 184 | 184 | ||
| 185 | /* It's possible the Guest did a NOTIFY hypercall to the | 185 | /* It's possible the Guest did a NOTIFY hypercall to the |
| 186 | * Launcher, in which case we return from the read() now. */ | 186 | * Launcher, in which case we return from the read() now. */ |
| 187 | if (lg->pending_notify) { | 187 | if (cpu->pending_notify) { |
| 188 | if (put_user(lg->pending_notify, user)) | 188 | if (put_user(cpu->pending_notify, user)) |
| 189 | return -EFAULT; | 189 | return -EFAULT; |
| 190 | return sizeof(lg->pending_notify); | 190 | return sizeof(cpu->pending_notify); |
| 191 | } | 191 | } |
| 192 | 192 | ||
| 193 | /* Check for signals */ | 193 | /* Check for signals */ |
| @@ -195,13 +195,13 @@ int run_guest(struct lguest *lg, unsigned long __user *user) | |||
| 195 | return -ERESTARTSYS; | 195 | return -ERESTARTSYS; |
| 196 | 196 | ||
| 197 | /* If Waker set break_out, return to Launcher. */ | 197 | /* If Waker set break_out, return to Launcher. */ |
| 198 | if (lg->break_out) | 198 | if (cpu->break_out) |
| 199 | return -EAGAIN; | 199 | return -EAGAIN; |
| 200 | 200 | ||
| 201 | /* Check if there are any interrupts which can be delivered | 201 | /* Check if there are any interrupts which can be delivered |
| 202 | * now: if so, this sets up the handler to be executed when we | 202 | * now: if so, this sets up the handler to be executed when we |
| 203 | * next run the Guest. */ | 203 | * next run the Guest. */ |
| 204 | maybe_do_interrupt(lg); | 204 | maybe_do_interrupt(cpu); |
| 205 | 205 | ||
| 206 | /* All long-lived kernel loops need to check with this horrible | 206 | /* All long-lived kernel loops need to check with this horrible |
| 207 | * thing called the freezer. If the Host is trying to suspend, | 207 | * thing called the freezer. If the Host is trying to suspend, |
| @@ -210,12 +210,12 @@ int run_guest(struct lguest *lg, unsigned long __user *user) | |||
| 210 | 210 | ||
| 211 | /* Just make absolutely sure the Guest is still alive. One of | 211 | /* Just make absolutely sure the Guest is still alive. One of |
| 212 | * those hypercalls could have been fatal, for example. */ | 212 | * those hypercalls could have been fatal, for example. */ |
| 213 | if (lg->dead) | 213 | if (cpu->lg->dead) |
| 214 | break; | 214 | break; |
| 215 | 215 | ||
| 216 | /* If the Guest asked to be stopped, we sleep. The Guest's | 216 | /* If the Guest asked to be stopped, we sleep. The Guest's |
| 217 | * clock timer or LHCALL_BREAK from the Waker will wake us. */ | 217 | * clock timer or LHCALL_BREAK from the Waker will wake us. */ |
| 218 | if (lg->halted) { | 218 | if (cpu->halted) { |
| 219 | set_current_state(TASK_INTERRUPTIBLE); | 219 | set_current_state(TASK_INTERRUPTIBLE); |
| 220 | schedule(); | 220 | schedule(); |
| 221 | continue; | 221 | continue; |
| @@ -226,15 +226,17 @@ int run_guest(struct lguest *lg, unsigned long __user *user) | |||
| 226 | local_irq_disable(); | 226 | local_irq_disable(); |
| 227 | 227 | ||
| 228 | /* Actually run the Guest until something happens. */ | 228 | /* Actually run the Guest until something happens. */ |
| 229 | lguest_arch_run_guest(lg); | 229 | lguest_arch_run_guest(cpu); |
| 230 | 230 | ||
| 231 | /* Now we're ready to be interrupted or moved to other CPUs */ | 231 | /* Now we're ready to be interrupted or moved to other CPUs */ |
| 232 | local_irq_enable(); | 232 | local_irq_enable(); |
| 233 | 233 | ||
| 234 | /* Now we deal with whatever happened to the Guest. */ | 234 | /* Now we deal with whatever happened to the Guest. */ |
| 235 | lguest_arch_handle_trap(lg); | 235 | lguest_arch_handle_trap(cpu); |
| 236 | } | 236 | } |
| 237 | 237 | ||
| 238 | if (cpu->lg->dead == ERR_PTR(-ERESTART)) | ||
| 239 | return -ERESTART; | ||
| 238 | /* The Guest is dead => "No such file or directory" */ | 240 | /* The Guest is dead => "No such file or directory" */ |
| 239 | return -ENOENT; | 241 | return -ENOENT; |
| 240 | } | 242 | } |
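
The new -ERESTART return works because lg->dead is overloaded: normally NULL or a message string, it can also smuggle a small negative errno using the kernel's ERR_PTR() convention. A sketch of that convention (MAX_ERRNO as in the kernel; -ERESTART assumed to be -85 here):

    /* ERR_PTR() in miniature: one pointer-sized field carries either a
     * real pointer or a small negative errno. */
    #include <stdio.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    int main(void)
    {
            const char *dead = ERR_PTR(-85);

            if (IS_ERR(dead))
                    printf("encoded errno %ld\n", PTR_ERR(dead)); /* -85 */
            return 0;
    }
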
| @@ -253,7 +255,7 @@ static int __init init(void) | |||
| 253 | 255 | ||
| 254 | /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */ | 256 | /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */ |
| 255 | if (paravirt_enabled()) { | 257 | if (paravirt_enabled()) { |
| 256 | printk("lguest is afraid of %s\n", pv_info.name); | 258 | printk("lguest is afraid of being a guest\n"); |
| 257 | return -EPERM; | 259 | return -EPERM; |
| 258 | } | 260 | } |
| 259 | 261 | ||
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index b478affe8f91..0f2cb4fd7c69 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c | |||
| @@ -23,13 +23,14 @@ | |||
| 23 | #include <linux/uaccess.h> | 23 | #include <linux/uaccess.h> |
| 24 | #include <linux/syscalls.h> | 24 | #include <linux/syscalls.h> |
| 25 | #include <linux/mm.h> | 25 | #include <linux/mm.h> |
| 26 | #include <linux/ktime.h> | ||
| 26 | #include <asm/page.h> | 27 | #include <asm/page.h> |
| 27 | #include <asm/pgtable.h> | 28 | #include <asm/pgtable.h> |
| 28 | #include "lg.h" | 29 | #include "lg.h" |
| 29 | 30 | ||
| 30 | /*H:120 This is the core hypercall routine: where the Guest gets what it wants. | 31 | /*H:120 This is the core hypercall routine: where the Guest gets what it wants. |
| 31 | * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ | 32 | * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ |
| 32 | static void do_hcall(struct lguest *lg, struct hcall_args *args) | 33 | static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) |
| 33 | { | 34 | { |
| 34 | switch (args->arg0) { | 35 | switch (args->arg0) { |
| 35 | case LHCALL_FLUSH_ASYNC: | 36 | case LHCALL_FLUSH_ASYNC: |
| @@ -39,60 +40,62 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args) | |||
| 39 | case LHCALL_LGUEST_INIT: | 40 | case LHCALL_LGUEST_INIT: |
| 40 | /* You can't get here unless you're already initialized. Don't | 41 | /* You can't get here unless you're already initialized. Don't |
| 41 | * do that. */ | 42 | * do that. */ |
| 42 | kill_guest(lg, "already have lguest_data"); | 43 | kill_guest(cpu, "already have lguest_data"); |
| 43 | break; | 44 | break; |
| 44 | case LHCALL_CRASH: { | 45 | case LHCALL_SHUTDOWN: { |
| 45 | /* Crash is such a trivial hypercall that we do it in four | 46 | /* Shutdown is such a trivial hypercall that we do it in four |
| 46 | * lines right here. */ | 47 | * lines right here. */ |
| 47 | char msg[128]; | 48 | char msg[128]; |
| 48 | /* If the lgread fails, it will call kill_guest() itself; the | 49 | /* If the lgread fails, it will call kill_guest() itself; the |
| 49 | * kill_guest() with the message will be ignored. */ | 50 | * kill_guest() with the message will be ignored. */ |
| 50 | __lgread(lg, msg, args->arg1, sizeof(msg)); | 51 | __lgread(cpu, msg, args->arg1, sizeof(msg)); |
| 51 | msg[sizeof(msg)-1] = '\0'; | 52 | msg[sizeof(msg)-1] = '\0'; |
| 52 | kill_guest(lg, "CRASH: %s", msg); | 53 | kill_guest(cpu, "CRASH: %s", msg); |
| 54 | if (args->arg2 == LGUEST_SHUTDOWN_RESTART) | ||
| 55 | cpu->lg->dead = ERR_PTR(-ERESTART); | ||
| 53 | break; | 56 | break; |
| 54 | } | 57 | } |
| 55 | case LHCALL_FLUSH_TLB: | 58 | case LHCALL_FLUSH_TLB: |
| 56 | /* FLUSH_TLB comes in two flavors, depending on the | 59 | /* FLUSH_TLB comes in two flavors, depending on the |
| 57 | * argument: */ | 60 | * argument: */ |
| 58 | if (args->arg1) | 61 | if (args->arg1) |
| 59 | guest_pagetable_clear_all(lg); | 62 | guest_pagetable_clear_all(cpu); |
| 60 | else | 63 | else |
| 61 | guest_pagetable_flush_user(lg); | 64 | guest_pagetable_flush_user(cpu); |
| 62 | break; | 65 | break; |
| 63 | 66 | ||
| 64 | /* All these calls simply pass the arguments through to the right | 67 | /* All these calls simply pass the arguments through to the right |
| 65 | * routines. */ | 68 | * routines. */ |
| 66 | case LHCALL_NEW_PGTABLE: | 69 | case LHCALL_NEW_PGTABLE: |
| 67 | guest_new_pagetable(lg, args->arg1); | 70 | guest_new_pagetable(cpu, args->arg1); |
| 68 | break; | 71 | break; |
| 69 | case LHCALL_SET_STACK: | 72 | case LHCALL_SET_STACK: |
| 70 | guest_set_stack(lg, args->arg1, args->arg2, args->arg3); | 73 | guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); |
| 71 | break; | 74 | break; |
| 72 | case LHCALL_SET_PTE: | 75 | case LHCALL_SET_PTE: |
| 73 | guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3)); | 76 | guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); |
| 74 | break; | 77 | break; |
| 75 | case LHCALL_SET_PMD: | 78 | case LHCALL_SET_PMD: |
| 76 | guest_set_pmd(lg, args->arg1, args->arg2); | 79 | guest_set_pmd(cpu->lg, args->arg1, args->arg2); |
| 77 | break; | 80 | break; |
| 78 | case LHCALL_SET_CLOCKEVENT: | 81 | case LHCALL_SET_CLOCKEVENT: |
| 79 | guest_set_clockevent(lg, args->arg1); | 82 | guest_set_clockevent(cpu, args->arg1); |
| 80 | break; | 83 | break; |
| 81 | case LHCALL_TS: | 84 | case LHCALL_TS: |
| 82 | /* This sets the TS flag, as we saw used in run_guest(). */ | 85 | /* This sets the TS flag, as we saw used in run_guest(). */ |
| 83 | lg->ts = args->arg1; | 86 | cpu->ts = args->arg1; |
| 84 | break; | 87 | break; |
| 85 | case LHCALL_HALT: | 88 | case LHCALL_HALT: |
| 86 | /* Similarly, this sets the halted flag for run_guest(). */ | 89 | /* Similarly, this sets the halted flag for run_guest(). */ |
| 87 | lg->halted = 1; | 90 | cpu->halted = 1; |
| 88 | break; | 91 | break; |
| 89 | case LHCALL_NOTIFY: | 92 | case LHCALL_NOTIFY: |
| 90 | lg->pending_notify = args->arg1; | 93 | cpu->pending_notify = args->arg1; |
| 91 | break; | 94 | break; |
| 92 | default: | 95 | default: |
| 93 | /* It should be an architecture-specific hypercall. */ | 96 | /* It should be an architecture-specific hypercall. */ |
| 94 | if (lguest_arch_do_hcall(lg, args)) | 97 | if (lguest_arch_do_hcall(cpu, args)) |
| 95 | kill_guest(lg, "Bad hypercall %li\n", args->arg0); | 98 | kill_guest(cpu, "Bad hypercall %li\n", args->arg0); |
| 96 | } | 99 | } |
| 97 | } | 100 | } |
| 98 | /*:*/ | 101 | /*:*/ |
| @@ -104,13 +107,13 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args) | |||
| 104 | * Guest put them in the ring, but we also promise the Guest that they will | 107 | * Guest put them in the ring, but we also promise the Guest that they will |
| 105 | * happen before any normal hypercall (which is why we check this before | 108 | * happen before any normal hypercall (which is why we check this before |
| 106 | * checking for a normal hcall). */ | 109 | * checking for a normal hcall). */ |
| 107 | static void do_async_hcalls(struct lguest *lg) | 110 | static void do_async_hcalls(struct lg_cpu *cpu) |
| 108 | { | 111 | { |
| 109 | unsigned int i; | 112 | unsigned int i; |
| 110 | u8 st[LHCALL_RING_SIZE]; | 113 | u8 st[LHCALL_RING_SIZE]; |
| 111 | 114 | ||
| 112 | /* For simplicity, we copy the entire call status array in at once. */ | 115 | /* For simplicity, we copy the entire call status array in at once. */ |
| 113 | if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) | 116 | if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st))) |
| 114 | return; | 117 | return; |
| 115 | 118 | ||
| 116 | /* We process "struct lguest_data"s hcalls[] ring once. */ | 119 | /* We process "struct lguest_data"s hcalls[] ring once. */ |
| @@ -119,7 +122,7 @@ static void do_async_hcalls(struct lguest *lg) | |||
| 119 | /* We remember where we were up to from last time. This makes | 122 | /* We remember where we were up to from last time. This makes |
| 120 | * sure that the hypercalls are done in the order the Guest | 123 | * sure that the hypercalls are done in the order the Guest |
| 121 | * places them in the ring. */ | 124 | * places them in the ring. */ |
| 122 | unsigned int n = lg->next_hcall; | 125 | unsigned int n = cpu->next_hcall; |
| 123 | 126 | ||
| 124 | /* 0xFF means there's no call here (yet). */ | 127 | /* 0xFF means there's no call here (yet). */ |
| 125 | if (st[n] == 0xFF) | 128 | if (st[n] == 0xFF) |
| @@ -127,65 +130,65 @@ static void do_async_hcalls(struct lguest *lg) | |||
| 127 | 130 | ||
| 128 | /* OK, we have a hypercall. Increment the "next_hcall" cursor, | 131 | /* OK, we have a hypercall. Increment the "next_hcall" cursor, |
| 129 | * and wrap back to 0 if we reach the end. */ | 132 | * and wrap back to 0 if we reach the end. */ |
| 130 | if (++lg->next_hcall == LHCALL_RING_SIZE) | 133 | if (++cpu->next_hcall == LHCALL_RING_SIZE) |
| 131 | lg->next_hcall = 0; | 134 | cpu->next_hcall = 0; |
| 132 | 135 | ||
| 133 | /* Copy the hypercall arguments into a local copy of | 136 | /* Copy the hypercall arguments into a local copy of |
| 134 | * the hcall_args struct. */ | 137 | * the hcall_args struct. */ |
| 135 | if (copy_from_user(&args, &lg->lguest_data->hcalls[n], | 138 | if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], |
| 136 | sizeof(struct hcall_args))) { | 139 | sizeof(struct hcall_args))) { |
| 137 | kill_guest(lg, "Fetching async hypercalls"); | 140 | kill_guest(cpu, "Fetching async hypercalls"); |
| 138 | break; | 141 | break; |
| 139 | } | 142 | } |
| 140 | 143 | ||
| 141 | /* Do the hypercall, same as a normal one. */ | 144 | /* Do the hypercall, same as a normal one. */ |
| 142 | do_hcall(lg, &args); | 145 | do_hcall(cpu, &args); |
| 143 | 146 | ||
| 144 | /* Mark the hypercall done. */ | 147 | /* Mark the hypercall done. */ |
| 145 | if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { | 148 | if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) { |
| 146 | kill_guest(lg, "Writing result for async hypercall"); | 149 | kill_guest(cpu, "Writing result for async hypercall"); |
| 147 | break; | 150 | break; |
| 148 | } | 151 | } |
| 149 | 152 | ||
| 150 | /* Stop doing hypercalls if they want to notify the Launcher: | 153 | /* Stop doing hypercalls if they want to notify the Launcher: |
| 151 | * it needs to service this first. */ | 154 | * it needs to service this first. */ |
| 152 | if (lg->pending_notify) | 155 | if (cpu->pending_notify) |
| 153 | break; | 156 | break; |
| 154 | } | 157 | } |
| 155 | } | 158 | } |
| 156 | 159 | ||
| 157 | /* Last of all, we look at what happens first of all. The very first time the | 160 | /* Last of all, we look at what happens first of all. The very first time the |
| 158 | * Guest makes a hypercall, we end up here to set things up: */ | 161 | * Guest makes a hypercall, we end up here to set things up: */ |
| 159 | static void initialize(struct lguest *lg) | 162 | static void initialize(struct lg_cpu *cpu) |
| 160 | { | 163 | { |
| 161 | /* You can't do anything until you're initialized. The Guest knows the | 164 | /* You can't do anything until you're initialized. The Guest knows the |
| 162 | * rules, so we're unforgiving here. */ | 165 | * rules, so we're unforgiving here. */ |
| 163 | if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) { | 166 | if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { |
| 164 | kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0); | 167 | kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); |
| 165 | return; | 168 | return; |
| 166 | } | 169 | } |
| 167 | 170 | ||
| 168 | if (lguest_arch_init_hypercalls(lg)) | 171 | if (lguest_arch_init_hypercalls(cpu)) |
| 169 | kill_guest(lg, "bad guest page %p", lg->lguest_data); | 172 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); |
| 170 | 173 | ||
| 171 | /* The Guest tells us where we're not to deliver interrupts by putting | 174 | /* The Guest tells us where we're not to deliver interrupts by putting |
| 172 | * the range of addresses into "struct lguest_data". */ | 175 | * the range of addresses into "struct lguest_data". */ |
| 173 | if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) | 176 | if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) |
| 174 | || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) | 177 | || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) |
| 175 | kill_guest(lg, "bad guest page %p", lg->lguest_data); | 178 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); |
| 176 | 179 | ||
| 177 | /* We write the current time into the Guest's data page once so it can | 180 | /* We write the current time into the Guest's data page once so it can |
| 178 | * set its clock. */ | 181 | * set its clock. */ |
| 179 | write_timestamp(lg); | 182 | write_timestamp(cpu); |
| 180 | 183 | ||
| 181 | /* page_tables.c will also do some setup. */ | 184 | /* page_tables.c will also do some setup. */ |
| 182 | page_table_guest_data_init(lg); | 185 | page_table_guest_data_init(cpu); |
| 183 | 186 | ||
| 184 | /* This is the one case where the above accesses might have been the | 187 | /* This is the one case where the above accesses might have been the |
| 185 | * first write to a Guest page. This may have caused a copy-on-write | 188 | * first write to a Guest page. This may have caused a copy-on-write |
| 186 | * fault, but the old page might be (read-only) in the Guest | 189 | * fault, but the old page might be (read-only) in the Guest |
| 187 | * pagetable. */ | 190 | * pagetable. */ |
| 188 | guest_pagetable_clear_all(lg); | 191 | guest_pagetable_clear_all(cpu); |
| 189 | } | 192 | } |
| 190 | 193 | ||
| 191 | /*H:100 | 194 | /*H:100 |
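
do_async_hcalls() above consumes a fixed-size ring whose status bytes use 0xFF as the "empty" sentinel, with a persistent cursor so calls complete in the order the Guest queued them. A self-contained sketch of that consumer pattern (the ring size and names here are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define RING_SIZE 64

    struct ring {
        uint8_t  status[RING_SIZE];   /* 0xFF = empty slot */
        unsigned next;                /* persistent cursor, survives calls */
    };

    /* Process every ready entry once, in queue order, wrapping at the end. */
    static void consume(struct ring *r)
    {
        for (unsigned i = 0; i < RING_SIZE; i++) {
            unsigned n = r->next;
            if (r->status[n] == 0xFF)
                break;                /* no call here (yet) */
            if (++r->next == RING_SIZE)
                r->next = 0;
            printf("doing hypercall in slot %u\n", n);
            r->status[n] = 0xFF;      /* mark the slot done/empty */
        }
    }

    int main(void)
    {
        struct ring r = { .next = 0 };
        for (unsigned i = 0; i < RING_SIZE; i++)
            r.status[i] = 0xFF;
        r.status[0] = 0;
        r.status[1] = 0;              /* two queued calls */
        consume(&r);
        return 0;
    }
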
| @@ -194,27 +197,27 @@ static void initialize(struct lguest *lg) | |||
| 194 | * Remember from the Guest, hypercalls come in two flavors: normal and | 197 | * Remember from the Guest, hypercalls come in two flavors: normal and |
| 195 | * asynchronous. This file handles both types. | 198 | * asynchronous. This file handles both types. |
| 196 | */ | 199 | */ |
| 197 | void do_hypercalls(struct lguest *lg) | 200 | void do_hypercalls(struct lg_cpu *cpu) |
| 198 | { | 201 | { |
| 199 | /* Not initialized yet? This hypercall must do it. */ | 202 | /* Not initialized yet? This hypercall must do it. */ |
| 200 | if (unlikely(!lg->lguest_data)) { | 203 | if (unlikely(!cpu->lg->lguest_data)) { |
| 201 | /* Set up the "struct lguest_data" */ | 204 | /* Set up the "struct lguest_data" */ |
| 202 | initialize(lg); | 205 | initialize(cpu); |
| 203 | /* Hcall is done. */ | 206 | /* Hcall is done. */ |
| 204 | lg->hcall = NULL; | 207 | cpu->hcall = NULL; |
| 205 | return; | 208 | return; |
| 206 | } | 209 | } |
| 207 | 210 | ||
| 208 | /* The Guest has initialized. | 211 | /* The Guest has initialized. |
| 209 | * | 212 | * |
| 210 | * Look in the hypercall ring for the async hypercalls: */ | 213 | * Look in the hypercall ring for the async hypercalls: */ |
| 211 | do_async_hcalls(lg); | 214 | do_async_hcalls(cpu); |
| 212 | 215 | ||
| 213 | /* If we stopped reading the hypercall ring because the Guest did a | 216 | /* If we stopped reading the hypercall ring because the Guest did a |
| 214 | * NOTIFY to the Launcher, we want to return now. Otherwise we do | 217 | * NOTIFY to the Launcher, we want to return now. Otherwise we do |
| 215 | * the hypercall. */ | 218 | * the hypercall. */ |
| 216 | if (!lg->pending_notify) { | 219 | if (!cpu->pending_notify) { |
| 217 | do_hcall(lg, lg->hcall); | 220 | do_hcall(cpu, cpu->hcall); |
| 218 | /* Tricky point: we reset the hcall pointer to mark the | 221 | /* Tricky point: we reset the hcall pointer to mark the |
| 219 | * hypercall as "done". We use the hcall pointer rather than | 222 | * hypercall as "done". We use the hcall pointer rather than |
| 220 | * the trap number to indicate a hypercall is pending. | 223 | * the trap number to indicate a hypercall is pending. |
| @@ -225,16 +228,17 @@ void do_hypercalls(struct lguest *lg) | |||
| 225 | * Launcher, the run_guest() loop will exit without running the | 228 | * Launcher, the run_guest() loop will exit without running the |
| 226 | * Guest. When it comes back it would try to re-run the | 229 | * Guest. When it comes back it would try to re-run the |
| 227 | * hypercall. */ | 230 | * hypercall. */ |
| 228 | lg->hcall = NULL; | 231 | cpu->hcall = NULL; |
| 229 | } | 232 | } |
| 230 | } | 233 | } |
| 231 | 234 | ||
| 232 | /* This routine supplies the Guest with time: it's used for wallclock time at | 235 | /* This routine supplies the Guest with time: it's used for wallclock time at |
| 233 | * initial boot and as a rough time source if the TSC isn't available. */ | 236 | * initial boot and as a rough time source if the TSC isn't available. */ |
| 234 | void write_timestamp(struct lguest *lg) | 237 | void write_timestamp(struct lg_cpu *cpu) |
| 235 | { | 238 | { |
| 236 | struct timespec now; | 239 | struct timespec now; |
| 237 | ktime_get_real_ts(&now); | 240 | ktime_get_real_ts(&now); |
| 238 | if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec))) | 241 | if (copy_to_user(&cpu->lg->lguest_data->time, |
| 239 | kill_guest(lg, "Writing timestamp"); | 242 | &now, sizeof(struct timespec))) |
| 243 | kill_guest(cpu, "Writing timestamp"); | ||
| 240 | } | 244 | } |
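
The "tricky point" comment above deserves a concrete restatement: pending-ness is encoded in whether cpu->hcall is non-NULL, so clearing it only after the call completes means an early exit to the Launcher re-runs the same hypercall instead of losing it. A toy model of that retry pattern (all names illustrative):

    #include <stddef.h>
    #include <stdio.h>

    struct args { long arg0; };
    struct vcpu { struct args *hcall; };    /* non-NULL == hypercall pending */

    static int launcher_interrupted;        /* pretend NOTIFY forced an exit */

    static void service(struct vcpu *cpu)
    {
        if (!cpu->hcall)
            return;                         /* nothing pending */
        if (launcher_interrupted)
            return;                         /* leave hcall set: retry later */
        printf("hypercall %ld done\n", cpu->hcall->arg0);
        cpu->hcall = NULL;                  /* only now is it marked done */
    }

    int main(void)
    {
        struct args a = { 42 };
        struct vcpu cpu = { &a };
        launcher_interrupted = 1;
        service(&cpu);                      /* exits early, call still pending */
        launcher_interrupted = 0;
        service(&cpu);                      /* re-run completes exactly once */
        return 0;
    }
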
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 2b66f79c208b..32e97c1858e5 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c | |||
| @@ -41,11 +41,11 @@ static int idt_present(u32 lo, u32 hi) | |||
| 41 | 41 | ||
| 42 | /* We need a helper to "push" a value onto the Guest's stack, since that's a | 42 | /* We need a helper to "push" a value onto the Guest's stack, since that's a |
| 43 | * big part of what delivering an interrupt does. */ | 43 | * big part of what delivering an interrupt does. */ |
| 44 | static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) | 44 | static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) |
| 45 | { | 45 | { |
| 46 | /* Stack grows upwards: move stack then write value. */ | 46 | /* Stack grows upwards: move stack then write value. */ |
| 47 | *gstack -= 4; | 47 | *gstack -= 4; |
| 48 | lgwrite(lg, *gstack, u32, val); | 48 | lgwrite(cpu, *gstack, u32, val); |
| 49 | } | 49 | } |
| 50 | 50 | ||
| 51 | /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or | 51 | /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or |
| @@ -60,7 +60,7 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) | |||
| 60 | * We set up the stack just like the CPU does for a real interrupt, so it's | 60 | * We set up the stack just like the CPU does for a real interrupt, so it's |
| 61 | * identical for the Guest (and the standard "iret" instruction will undo | 61 | * identical for the Guest (and the standard "iret" instruction will undo |
| 62 | * it). */ | 62 | * it). */ |
| 63 | static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) | 63 | static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err) |
| 64 | { | 64 | { |
| 65 | unsigned long gstack, origstack; | 65 | unsigned long gstack, origstack; |
| 66 | u32 eflags, ss, irq_enable; | 66 | u32 eflags, ss, irq_enable; |
| @@ -69,59 +69,59 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) | |||
| 69 | /* There are two cases for interrupts: one where the Guest is already | 69 | /* There are two cases for interrupts: one where the Guest is already |
| 70 | * in the kernel, and a more complex one where the Guest is in | 70 | * in the kernel, and a more complex one where the Guest is in |
| 71 | * userspace. We check the privilege level to find out. */ | 71 | * userspace. We check the privilege level to find out. */ |
| 72 | if ((lg->regs->ss&0x3) != GUEST_PL) { | 72 | if ((cpu->regs->ss&0x3) != GUEST_PL) { |
| 73 | /* The Guest told us their kernel stack with the SET_STACK | 73 | /* The Guest told us their kernel stack with the SET_STACK |
| 74 | * hypercall: both the virtual address and the segment */ | 74 | * hypercall: both the virtual address and the segment */ |
| 75 | virtstack = lg->esp1; | 75 | virtstack = cpu->esp1; |
| 76 | ss = lg->ss1; | 76 | ss = cpu->ss1; |
| 77 | 77 | ||
| 78 | origstack = gstack = guest_pa(lg, virtstack); | 78 | origstack = gstack = guest_pa(cpu, virtstack); |
| 79 | /* We push the old stack segment and pointer onto the new | 79 | /* We push the old stack segment and pointer onto the new |
| 80 | * stack: when the Guest does an "iret" back from the interrupt | 80 | * stack: when the Guest does an "iret" back from the interrupt |
| 81 | * handler the CPU will notice they're dropping privilege | 81 | * handler the CPU will notice they're dropping privilege |
| 82 | * levels and expect these here. */ | 82 | * levels and expect these here. */ |
| 83 | push_guest_stack(lg, &gstack, lg->regs->ss); | 83 | push_guest_stack(cpu, &gstack, cpu->regs->ss); |
| 84 | push_guest_stack(lg, &gstack, lg->regs->esp); | 84 | push_guest_stack(cpu, &gstack, cpu->regs->esp); |
| 85 | } else { | 85 | } else { |
| 86 | /* We're staying on the same Guest (kernel) stack. */ | 86 | /* We're staying on the same Guest (kernel) stack. */ |
| 87 | virtstack = lg->regs->esp; | 87 | virtstack = cpu->regs->esp; |
| 88 | ss = lg->regs->ss; | 88 | ss = cpu->regs->ss; |
| 89 | 89 | ||
| 90 | origstack = gstack = guest_pa(lg, virtstack); | 90 | origstack = gstack = guest_pa(cpu, virtstack); |
| 91 | } | 91 | } |
| 92 | 92 | ||
| 93 | /* Remember that we never let the Guest actually disable interrupts, so | 93 | /* Remember that we never let the Guest actually disable interrupts, so |
| 94 | * the "Interrupt Flag" bit is always set. We copy that bit from the | 94 | * the "Interrupt Flag" bit is always set. We copy that bit from the |
| 95 | * Guest's "irq_enabled" field into the eflags word: we saw the Guest | 95 | * Guest's "irq_enabled" field into the eflags word: we saw the Guest |
| 96 | * copy it back in "lguest_iret". */ | 96 | * copy it back in "lguest_iret". */ |
| 97 | eflags = lg->regs->eflags; | 97 | eflags = cpu->regs->eflags; |
| 98 | if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 | 98 | if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 |
| 99 | && !(irq_enable & X86_EFLAGS_IF)) | 99 | && !(irq_enable & X86_EFLAGS_IF)) |
| 100 | eflags &= ~X86_EFLAGS_IF; | 100 | eflags &= ~X86_EFLAGS_IF; |
| 101 | 101 | ||
| 102 | /* An interrupt is expected to push three things on the stack: the old | 102 | /* An interrupt is expected to push three things on the stack: the old |
| 103 | * "eflags" word, the old code segment, and the old instruction | 103 | * "eflags" word, the old code segment, and the old instruction |
| 104 | * pointer. */ | 104 | * pointer. */ |
| 105 | push_guest_stack(lg, &gstack, eflags); | 105 | push_guest_stack(cpu, &gstack, eflags); |
| 106 | push_guest_stack(lg, &gstack, lg->regs->cs); | 106 | push_guest_stack(cpu, &gstack, cpu->regs->cs); |
| 107 | push_guest_stack(lg, &gstack, lg->regs->eip); | 107 | push_guest_stack(cpu, &gstack, cpu->regs->eip); |
| 108 | 108 | ||
| 109 | /* For the six traps which supply an error code, we push that, too. */ | 109 | /* For the six traps which supply an error code, we push that, too. */ |
| 110 | if (has_err) | 110 | if (has_err) |
| 111 | push_guest_stack(lg, &gstack, lg->regs->errcode); | 111 | push_guest_stack(cpu, &gstack, cpu->regs->errcode); |
| 112 | 112 | ||
| 113 | /* Now we've pushed all the old state, we change the stack, the code | 113 | /* Now we've pushed all the old state, we change the stack, the code |
| 114 | * segment and the address to execute. */ | 114 | * segment and the address to execute. */ |
| 115 | lg->regs->ss = ss; | 115 | cpu->regs->ss = ss; |
| 116 | lg->regs->esp = virtstack + (gstack - origstack); | 116 | cpu->regs->esp = virtstack + (gstack - origstack); |
| 117 | lg->regs->cs = (__KERNEL_CS|GUEST_PL); | 117 | cpu->regs->cs = (__KERNEL_CS|GUEST_PL); |
| 118 | lg->regs->eip = idt_address(lo, hi); | 118 | cpu->regs->eip = idt_address(lo, hi); |
| 119 | 119 | ||
| 120 | /* There are two kinds of interrupt handlers: 0xE is an "interrupt | 120 | /* There are two kinds of interrupt handlers: 0xE is an "interrupt |
| 121 | * gate" which expects interrupts to be disabled on entry. */ | 121 | * gate" which expects interrupts to be disabled on entry. */ |
| 122 | if (idt_type(lo, hi) == 0xE) | 122 | if (idt_type(lo, hi) == 0xE) |
| 123 | if (put_user(0, &lg->lguest_data->irq_enabled)) | 123 | if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) |
| 124 | kill_guest(lg, "Disabling interrupts"); | 124 | kill_guest(cpu, "Disabling interrupts"); |
| 125 | } | 125 | } |
| 126 | 126 | ||
| 127 | /*H:205 | 127 | /*H:205 |
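
set_guest_interrupt() above mirrors what real x86 hardware pushes on interrupt entry: ss/esp only on a privilege change, then eflags, cs, eip, and an error code for the six traps that supply one. A compact userspace sketch of that frame layout (a word-pointer model; the kernel version works on byte addresses through push_guest_stack()):

    #include <stdint.h>
    #include <stdio.h>

    static void push(uint32_t **sp, uint32_t val)
    {
        *sp -= 1;            /* move the stack, then write the value */
        **sp = val;
    }

    /* Build the frame an x86 CPU would push for an interrupt. */
    static void build_frame(uint32_t **sp, int priv_change, int has_err,
                            uint32_t ss, uint32_t esp, uint32_t eflags,
                            uint32_t cs, uint32_t eip, uint32_t errcode)
    {
        if (priv_change) {   /* old stack only saved when rings change */
            push(sp, ss);
            push(sp, esp);
        }
        push(sp, eflags);
        push(sp, cs);
        push(sp, eip);
        if (has_err)
            push(sp, errcode);
    }

    int main(void)
    {
        uint32_t stack[16], *sp = &stack[16];
        build_frame(&sp, 1, 0, 0x2b, 0xbfff0000, 0x202, 0x23, 0x08048000, 0);
        printf("%td words pushed\n", &stack[16] - sp);  /* 5 here */
        return 0;
    }
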
| @@ -129,23 +129,23 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) | |||
| 129 | * | 129 | * |
| 130 | * maybe_do_interrupt() gets called before every entry to the Guest, to see if | 130 | * maybe_do_interrupt() gets called before every entry to the Guest, to see if |
| 131 | * we should divert the Guest to running an interrupt handler. */ | 131 | * we should divert the Guest to running an interrupt handler. */ |
| 132 | void maybe_do_interrupt(struct lguest *lg) | 132 | void maybe_do_interrupt(struct lg_cpu *cpu) |
| 133 | { | 133 | { |
| 134 | unsigned int irq; | 134 | unsigned int irq; |
| 135 | DECLARE_BITMAP(blk, LGUEST_IRQS); | 135 | DECLARE_BITMAP(blk, LGUEST_IRQS); |
| 136 | struct desc_struct *idt; | 136 | struct desc_struct *idt; |
| 137 | 137 | ||
| 138 | /* If the Guest hasn't even initialized yet, we can do nothing. */ | 138 | /* If the Guest hasn't even initialized yet, we can do nothing. */ |
| 139 | if (!lg->lguest_data) | 139 | if (!cpu->lg->lguest_data) |
| 140 | return; | 140 | return; |
| 141 | 141 | ||
| 142 | /* Take our "irqs_pending" array and remove any interrupts the Guest | 142 | /* Take our "irqs_pending" array and remove any interrupts the Guest |
| 143 | * wants blocked: the result ends up in "blk". */ | 143 | * wants blocked: the result ends up in "blk". */ |
| 144 | if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts, | 144 | if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, |
| 145 | sizeof(blk))) | 145 | sizeof(blk))) |
| 146 | return; | 146 | return; |
| 147 | 147 | ||
| 148 | bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); | 148 | bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); |
| 149 | 149 | ||
| 150 | /* Find the first interrupt. */ | 150 | /* Find the first interrupt. */ |
| 151 | irq = find_first_bit(blk, LGUEST_IRQS); | 151 | irq = find_first_bit(blk, LGUEST_IRQS); |
| @@ -155,19 +155,20 @@ void maybe_do_interrupt(struct lguest *lg) | |||
| 155 | 155 | ||
| 156 | /* They may be in the middle of an iret, where they asked us never to | 156 | /* They may be in the middle of an iret, where they asked us never to |
| 157 | * deliver interrupts. */ | 157 | * deliver interrupts. */ |
| 158 | if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) | 158 | if (cpu->regs->eip >= cpu->lg->noirq_start && |
| 159 | (cpu->regs->eip < cpu->lg->noirq_end)) | ||
| 159 | return; | 160 | return; |
| 160 | 161 | ||
| 161 | /* If they're halted, interrupts restart them. */ | 162 | /* If they're halted, interrupts restart them. */ |
| 162 | if (lg->halted) { | 163 | if (cpu->halted) { |
| 163 | /* Re-enable interrupts. */ | 164 | /* Re-enable interrupts. */ |
| 164 | if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled)) | 165 | if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled)) |
| 165 | kill_guest(lg, "Re-enabling interrupts"); | 166 | kill_guest(cpu, "Re-enabling interrupts"); |
| 166 | lg->halted = 0; | 167 | cpu->halted = 0; |
| 167 | } else { | 168 | } else { |
| 168 | /* Otherwise we check if they have interrupts disabled. */ | 169 | /* Otherwise we check if they have interrupts disabled. */ |
| 169 | u32 irq_enabled; | 170 | u32 irq_enabled; |
| 170 | if (get_user(irq_enabled, &lg->lguest_data->irq_enabled)) | 171 | if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) |
| 171 | irq_enabled = 0; | 172 | irq_enabled = 0; |
| 172 | if (!irq_enabled) | 173 | if (!irq_enabled) |
| 173 | return; | 174 | return; |
| @@ -176,15 +177,15 @@ void maybe_do_interrupt(struct lguest *lg) | |||
| 176 | /* Look at the IDT entry the Guest gave us for this interrupt. The | 177 | /* Look at the IDT entry the Guest gave us for this interrupt. The |
| 177 | * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip | 178 | * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip |
| 178 | * over them. */ | 179 | * over them. */ |
| 179 | idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; | 180 | idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; |
| 180 | /* If they don't have a handler (yet?), we just ignore it */ | 181 | /* If they don't have a handler (yet?), we just ignore it */ |
| 181 | if (idt_present(idt->a, idt->b)) { | 182 | if (idt_present(idt->a, idt->b)) { |
| 182 | /* OK, mark it no longer pending and deliver it. */ | 183 | /* OK, mark it no longer pending and deliver it. */ |
| 183 | clear_bit(irq, lg->irqs_pending); | 184 | clear_bit(irq, cpu->irqs_pending); |
| 184 | /* set_guest_interrupt() takes the interrupt descriptor and a | 185 | /* set_guest_interrupt() takes the interrupt descriptor and a |
| 185 | * flag to say whether this interrupt pushes an error code onto | 186 | * flag to say whether this interrupt pushes an error code onto |
| 186 | * the stack as well: virtual interrupts never do. */ | 187 | * the stack as well: virtual interrupts never do. */ |
| 187 | set_guest_interrupt(lg, idt->a, idt->b, 0); | 188 | set_guest_interrupt(cpu, idt->a, idt->b, 0); |
| 188 | } | 189 | } |
| 189 | 190 | ||
| 190 | /* Every time we deliver an interrupt, we update the timestamp in the | 191 | /* Every time we deliver an interrupt, we update the timestamp in the |
| @@ -192,7 +193,7 @@ void maybe_do_interrupt(struct lguest *lg) | |||
| 192 | * did this more often, but it can actually be quite slow: doing it | 193 | * did this more often, but it can actually be quite slow: doing it |
| 193 | * here is a compromise which means at least it gets updated every | 194 | * here is a compromise which means at least it gets updated every |
| 194 | * timer interrupt. */ | 195 | * timer interrupt. */ |
| 195 | write_timestamp(lg); | 196 | write_timestamp(cpu); |
| 196 | } | 197 | } |
| 197 | /*:*/ | 198 | /*:*/ |
| 198 | 199 | ||
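
maybe_do_interrupt() picks the lowest pending interrupt the Guest has not blocked: pending AND NOT blocked, then find-first-bit. For a word-sized IRQ set this collapses to two integer operations, sketched below (a 32-IRQ toy; the kernel uses bitmap_andnot()/find_first_bit() over arbitrary-length bitmaps):

    #include <stdint.h>
    #include <stdio.h>
    #include <strings.h>     /* ffs() */

    /* Return the lowest deliverable IRQ, or -1 if none. */
    static int pick_irq(uint32_t pending, uint32_t blocked)
    {
        uint32_t deliverable = pending & ~blocked;   /* bitmap_andnot */
        if (!deliverable)
            return -1;
        return ffs(deliverable) - 1;                 /* find_first_bit */
    }

    int main(void)
    {
        uint32_t pending = (1u << 0) | (1u << 5);    /* timer + irq 5 */
        uint32_t blocked = (1u << 0);                /* Guest masked the timer */
        printf("deliver irq %d\n", pick_irq(pending, blocked));  /* 5 */
        return 0;
    }
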
| @@ -245,19 +246,19 @@ static int has_err(unsigned int trap) | |||
| 245 | } | 246 | } |
| 246 | 247 | ||
| 247 | /* deliver_trap() returns true if it could deliver the trap. */ | 248 | /* deliver_trap() returns true if it could deliver the trap. */ |
| 248 | int deliver_trap(struct lguest *lg, unsigned int num) | 249 | int deliver_trap(struct lg_cpu *cpu, unsigned int num) |
| 249 | { | 250 | { |
| 250 | /* Trap numbers are always 8 bit, but we set an impossible trap number | 251 | /* Trap numbers are always 8 bit, but we set an impossible trap number |
| 251 | * for traps inside the Switcher, so check that here. */ | 252 | * for traps inside the Switcher, so check that here. */ |
| 252 | if (num >= ARRAY_SIZE(lg->arch.idt)) | 253 | if (num >= ARRAY_SIZE(cpu->arch.idt)) |
| 253 | return 0; | 254 | return 0; |
| 254 | 255 | ||
| 255 | /* Early on the Guest hasn't set the IDT entries (or maybe it put a | 256 | /* Early on the Guest hasn't set the IDT entries (or maybe it put a |
| 256 | * bogus one in): if we fail here, the Guest will be killed. */ | 257 | * bogus one in): if we fail here, the Guest will be killed. */ |
| 257 | if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b)) | 258 | if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) |
| 258 | return 0; | 259 | return 0; |
| 259 | set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, | 260 | set_guest_interrupt(cpu, cpu->arch.idt[num].a, |
| 260 | has_err(num)); | 261 | cpu->arch.idt[num].b, has_err(num)); |
| 261 | return 1; | 262 | return 1; |
| 262 | } | 263 | } |
| 263 | 264 | ||
| @@ -309,18 +310,18 @@ static int direct_trap(unsigned int num) | |||
| 309 | * the Guest. | 310 | * the Guest. |
| 310 | * | 311 | * |
| 311 | * Which is deeply unfair, because (literally!) it wasn't the Guest's fault. */ | 312 | * Which is deeply unfair, because (literally!) it wasn't the Guest's fault. */ |
| 312 | void pin_stack_pages(struct lguest *lg) | 313 | void pin_stack_pages(struct lg_cpu *cpu) |
| 313 | { | 314 | { |
| 314 | unsigned int i; | 315 | unsigned int i; |
| 315 | 316 | ||
| 316 | /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or | 317 | /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or |
| 317 | * two pages of stack space. */ | 318 | * two pages of stack space. */ |
| 318 | for (i = 0; i < lg->stack_pages; i++) | 319 | for (i = 0; i < cpu->lg->stack_pages; i++) |
| 319 | /* The stack grows *upwards*, so the address we're given is the | 320 | /* The stack grows *upwards*, so the address we're given is the |
| 320 | * start of the page after the kernel stack. Subtract one to | 321 | * start of the page after the kernel stack. Subtract one to |
| 321 | * get back onto the first stack page, and keep subtracting to | 322 | * get back onto the first stack page, and keep subtracting to |
| 322 | * get to the rest of the stack pages. */ | 323 | * get to the rest of the stack pages. */ |
| 323 | pin_page(lg, lg->esp1 - 1 - i * PAGE_SIZE); | 324 | pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); |
| 324 | } | 325 | } |
| 325 | 326 | ||
| 326 | /* Direct traps also mean that we need to know whenever the Guest wants to use | 327 | /* Direct traps also mean that we need to know whenever the Guest wants to use |
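
pin_stack_pages() converts "esp1 points just past the stack" into the set of pages to pin: subtracting one lands on the last byte of the top stack page, and each further PAGE_SIZE step reaches the page below. A sketch of just that address arithmetic (PAGE_SIZE and pin_page() here are stand-ins):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    static void pin_page(unsigned long vaddr)
    {
        printf("pin page containing %#lx (page base %#lx)\n",
               vaddr, vaddr & ~(PAGE_SIZE - 1));
    }

    int main(void)
    {
        unsigned long esp1 = 0xc0400000;   /* first address *past* the stack */
        unsigned int pages = 2;            /* one or two, per CONFIG_4KSTACKS */

        /* esp1 - 1 is the last byte of the topmost stack page; keep
         * subtracting a page at a time to reach the pages below it. */
        for (unsigned int i = 0; i < pages; i++)
            pin_page(esp1 - 1 - i * PAGE_SIZE);
        return 0;
    }
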
| @@ -331,21 +332,21 @@ void pin_stack_pages(struct lguest *lg) | |||
| 331 | * | 332 | * |
| 332 | * In Linux each process has its own kernel stack, so this happens a lot: we | 333 | * In Linux each process has its own kernel stack, so this happens a lot: we |
| 333 | * change stacks on each context switch. */ | 334 | * change stacks on each context switch. */ |
| 334 | void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) | 335 | void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) |
| 335 | { | 336 | { |
| 336 | /* You are not allowed to have a stack segment with privilege level 0: bad | 337 | /* You are not allowed to have a stack segment with privilege level 0: bad |
| 337 | * Guest! */ | 338 | * Guest! */ |
| 338 | if ((seg & 0x3) != GUEST_PL) | 339 | if ((seg & 0x3) != GUEST_PL) |
| 339 | kill_guest(lg, "bad stack segment %i", seg); | 340 | kill_guest(cpu, "bad stack segment %i", seg); |
| 340 | /* We only expect one or two stack pages. */ | 341 | /* We only expect one or two stack pages. */ |
| 341 | if (pages > 2) | 342 | if (pages > 2) |
| 342 | kill_guest(lg, "bad stack pages %u", pages); | 343 | kill_guest(cpu, "bad stack pages %u", pages); |
| 343 | /* Save where the stack is, and how many pages */ | 344 | /* Save where the stack is, and how many pages */ |
| 344 | lg->ss1 = seg; | 345 | cpu->ss1 = seg; |
| 345 | lg->esp1 = esp; | 346 | cpu->esp1 = esp; |
| 346 | lg->stack_pages = pages; | 347 | cpu->lg->stack_pages = pages; |
| 347 | /* Make sure the new stack pages are mapped */ | 348 | /* Make sure the new stack pages are mapped */ |
| 348 | pin_stack_pages(lg); | 349 | pin_stack_pages(cpu); |
| 349 | } | 350 | } |
| 350 | 351 | ||
| 351 | /* All this reference to mapping stacks leads us neatly into the other complex | 352 | /* All this reference to mapping stacks leads us neatly into the other complex |
| @@ -353,7 +354,7 @@ void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) | |||
| 353 | 354 | ||
| 354 | /*H:235 This is the routine which actually checks the Guest's IDT entry and | 355 | /*H:235 This is the routine which actually checks the Guest's IDT entry and |
| 355 | * transfers it into the entry in "struct lguest": */ | 356 | * transfers it into the entry in "struct lguest": */ |
| 356 | static void set_trap(struct lguest *lg, struct desc_struct *trap, | 357 | static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, |
| 357 | unsigned int num, u32 lo, u32 hi) | 358 | unsigned int num, u32 lo, u32 hi) |
| 358 | { | 359 | { |
| 359 | u8 type = idt_type(lo, hi); | 360 | u8 type = idt_type(lo, hi); |
| @@ -366,7 +367,7 @@ static void set_trap(struct lguest *lg, struct desc_struct *trap, | |||
| 366 | 367 | ||
| 367 | /* We only support interrupt and trap gates. */ | 368 | /* We only support interrupt and trap gates. */ |
| 368 | if (type != 0xE && type != 0xF) | 369 | if (type != 0xE && type != 0xF) |
| 369 | kill_guest(lg, "bad IDT type %i", type); | 370 | kill_guest(cpu, "bad IDT type %i", type); |
| 370 | 371 | ||
| 371 | /* We only copy the handler address, present bit, privilege level and | 372 | /* We only copy the handler address, present bit, privilege level and |
| 372 | * type. The privilege level controls where the trap can be triggered | 373 | * type. The privilege level controls where the trap can be triggered |
| @@ -383,7 +384,7 @@ static void set_trap(struct lguest *lg, struct desc_struct *trap, | |||
| 383 | * | 384 | * |
| 384 | * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the | 385 | * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the |
| 385 | * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ | 386 | * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ |
| 386 | void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) | 387 | void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) |
| 387 | { | 388 | { |
| 388 | /* Guest never handles: NMI, doublefault, spurious interrupt or | 389 | /* Guest never handles: NMI, doublefault, spurious interrupt or |
| 389 | * hypercall. We ignore when it tries to set them. */ | 390 | * hypercall. We ignore when it tries to set them. */ |
| @@ -392,13 +393,13 @@ void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) | |||
| 392 | 393 | ||
| 393 | /* Mark the IDT as changed: next time the Guest runs we'll know we have | 394 | /* Mark the IDT as changed: next time the Guest runs we'll know we have |
| 394 | * to copy this again. */ | 395 | * to copy this again. */ |
| 395 | lg->changed |= CHANGED_IDT; | 396 | cpu->changed |= CHANGED_IDT; |
| 396 | 397 | ||
| 397 | /* Check that the Guest doesn't try to step outside the bounds. */ | 398 | /* Check that the Guest doesn't try to step outside the bounds. */ |
| 398 | if (num >= ARRAY_SIZE(lg->arch.idt)) | 399 | if (num >= ARRAY_SIZE(cpu->arch.idt)) |
| 399 | kill_guest(lg, "Setting idt entry %u", num); | 400 | kill_guest(cpu, "Setting idt entry %u", num); |
| 400 | else | 401 | else |
| 401 | set_trap(lg, &lg->arch.idt[num], num, lo, hi); | 402 | set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); |
| 402 | } | 403 | } |
| 403 | 404 | ||
| 404 | /* The default entry for each interrupt points into the Switcher routines which | 405 | /* The default entry for each interrupt points into the Switcher routines which |
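
set_trap() and load_guest_idt_entry() above validate gates by picking fields out of the two 32-bit descriptor words. Assuming the standard x86 gate layout (type in bits 8-11 of the high word, present in bit 15, handler address split across both words), the helpers look roughly like this sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* Gate type: 0xE interrupt gate, 0xF trap gate. */
    static unsigned idt_type(uint32_t lo, uint32_t hi)
    {
        return (hi >> 8) & 0xF;
    }

    /* Present bit: the descriptor is valid only if this is set. */
    static int idt_present(uint32_t lo, uint32_t hi)
    {
        return (hi & 0x8000) != 0;
    }

    /* Handler address: low 16 bits in lo, high 16 bits in hi. */
    static uint32_t idt_address(uint32_t lo, uint32_t hi)
    {
        return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
    }

    int main(void)
    {
        uint32_t lo = 0x00081234, hi = 0xc0108E00;  /* made-up trap gate */
        printf("type %#x present %d addr %#x\n",
               idt_type(lo, hi), idt_present(lo, hi), idt_address(lo, hi));
        return 0;
    }
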
| @@ -434,14 +435,14 @@ void setup_default_idt_entries(struct lguest_ro_state *state, | |||
| 434 | /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead | 435 | /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead |
| 435 | * we copy them into the IDT which we've set up for Guests on this CPU, just | 436 | * we copy them into the IDT which we've set up for Guests on this CPU, just |
| 436 | * before we run the Guest. This routine does that copy. */ | 437 | * before we run the Guest. This routine does that copy. */ |
| 437 | void copy_traps(const struct lguest *lg, struct desc_struct *idt, | 438 | void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, |
| 438 | const unsigned long *def) | 439 | const unsigned long *def) |
| 439 | { | 440 | { |
| 440 | unsigned int i; | 441 | unsigned int i; |
| 441 | 442 | ||
| 442 | /* We can simply copy the direct traps, otherwise we use the default | 443 | /* We can simply copy the direct traps, otherwise we use the default |
| 443 | * ones in the Switcher: they will return to the Host. */ | 444 | * ones in the Switcher: they will return to the Host. */ |
| 444 | for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) { | 445 | for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { |
| 445 | /* If no Guest can ever override this trap, leave it alone. */ | 446 | /* If no Guest can ever override this trap, leave it alone. */ |
| 446 | if (!direct_trap(i)) | 447 | if (!direct_trap(i)) |
| 447 | continue; | 448 | continue; |
| @@ -450,8 +451,8 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt, | |||
| 450 | * Interrupt gates (type 14) disable interrupts as they are | 451 | * Interrupt gates (type 14) disable interrupts as they are |
| 451 | * entered, which we never let the Guest do. Not present | 452 | * entered, which we never let the Guest do. Not present |
| 452 | * entries (type 0x0) also can't go direct, of course. */ | 453 | * entries (type 0x0) also can't go direct, of course. */ |
| 453 | if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF) | 454 | if (idt_type(cpu->arch.idt[i].a, cpu->arch.idt[i].b) == 0xF) |
| 454 | idt[i] = lg->arch.idt[i]; | 455 | idt[i] = cpu->arch.idt[i]; |
| 455 | else | 456 | else |
| 456 | /* Reset it to the default. */ | 457 | /* Reset it to the default. */ |
| 457 | default_idt_entry(&idt[i], i, def[i]); | 458 | default_idt_entry(&idt[i], i, def[i]); |
| @@ -470,13 +471,13 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt, | |||
| 470 | * infrastructure to set a callback at that time. | 471 | * infrastructure to set a callback at that time. |
| 471 | * | 472 | * |
| 472 | * 0 means "turn off the clock". */ | 473 | * 0 means "turn off the clock". */ |
| 473 | void guest_set_clockevent(struct lguest *lg, unsigned long delta) | 474 | void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) |
| 474 | { | 475 | { |
| 475 | ktime_t expires; | 476 | ktime_t expires; |
| 476 | 477 | ||
| 477 | if (unlikely(delta == 0)) { | 478 | if (unlikely(delta == 0)) { |
| 478 | /* Clock event device is shutting down. */ | 479 | /* Clock event device is shutting down. */ |
| 479 | hrtimer_cancel(&lg->hrt); | 480 | hrtimer_cancel(&cpu->hrt); |
| 480 | return; | 481 | return; |
| 481 | } | 482 | } |
| 482 | 483 | ||
| @@ -484,25 +485,25 @@ void guest_set_clockevent(struct lguest *lg, unsigned long delta) | |||
| 484 | * all the time between now and the timer interrupt it asked for. This | 485 | * all the time between now and the timer interrupt it asked for. This |
| 485 | * is almost always the right thing to do. */ | 486 | * is almost always the right thing to do. */ |
| 486 | expires = ktime_add_ns(ktime_get_real(), delta); | 487 | expires = ktime_add_ns(ktime_get_real(), delta); |
| 487 | hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS); | 488 | hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); |
| 488 | } | 489 | } |
| 489 | 490 | ||
| 490 | /* This is the function called when the Guest's timer expires. */ | 491 | /* This is the function called when the Guest's timer expires. */ |
| 491 | static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) | 492 | static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) |
| 492 | { | 493 | { |
| 493 | struct lguest *lg = container_of(timer, struct lguest, hrt); | 494 | struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); |
| 494 | 495 | ||
| 495 | /* Remember the first interrupt is the timer interrupt. */ | 496 | /* Remember the first interrupt is the timer interrupt. */ |
| 496 | set_bit(0, lg->irqs_pending); | 497 | set_bit(0, cpu->irqs_pending); |
| 497 | /* If the Guest is actually stopped, we need to wake it up. */ | 498 | /* If the Guest is actually stopped, we need to wake it up. */ |
| 498 | if (lg->halted) | 499 | if (cpu->halted) |
| 499 | wake_up_process(lg->tsk); | 500 | wake_up_process(cpu->tsk); |
| 500 | return HRTIMER_NORESTART; | 501 | return HRTIMER_NORESTART; |
| 501 | } | 502 | } |
| 502 | 503 | ||
| 503 | /* This sets up the timer for this Guest. */ | 504 | /* This sets up the timer for this Guest. */ |
| 504 | void init_clockdev(struct lguest *lg) | 505 | void init_clockdev(struct lg_cpu *cpu) |
| 505 | { | 506 | { |
| 506 | hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); | 507 | hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); |
| 507 | lg->hrt.function = clockdev_fn; | 508 | cpu->hrt.function = clockdev_fn; |
| 508 | } | 509 | } |
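
clockdev_fn() above recovers the enclosing per-vcpu structure from the embedded hrtimer with container_of(), the standard kernel idiom for callbacks that only receive a pointer to a member. A userspace demonstration of the same idiom (offsetof-based, as in the kernel's definition):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct timer { int armed; };

    struct vcpu {
        int id;
        struct timer hrt;   /* embedded, like cpu->hrt */
    };

    /* A callback that, like clockdev_fn(), is handed only the timer. */
    static void timer_fired(struct timer *t)
    {
        struct vcpu *cpu = container_of(t, struct vcpu, hrt);
        printf("timer fired for vcpu %d\n", cpu->id);
    }

    int main(void)
    {
        struct vcpu cpu = { .id = 0 };
        timer_fired(&cpu.hrt);
        return 0;
    }
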
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 86924891b5eb..2337e1a06f02 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | #include <linux/lguest.h> | 8 | #include <linux/lguest.h> |
| 9 | #include <linux/lguest_launcher.h> | 9 | #include <linux/lguest_launcher.h> |
| 10 | #include <linux/wait.h> | 10 | #include <linux/wait.h> |
| 11 | #include <linux/hrtimer.h> | ||
| 11 | #include <linux/err.h> | 12 | #include <linux/err.h> |
| 12 | #include <asm/semaphore.h> | 13 | #include <asm/semaphore.h> |
| 13 | 14 | ||
| @@ -38,58 +39,72 @@ struct lguest_pages | |||
| 38 | #define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */ | 39 | #define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */ |
| 39 | #define CHANGED_ALL 3 | 40 | #define CHANGED_ALL 3 |
| 40 | 41 | ||
| 41 | /* The private info the thread maintains about the guest. */ | 42 | struct lguest; |
| 42 | struct lguest | 43 | |
| 43 | { | 44 | struct lg_cpu { |
| 44 | /* At end of a page shared mapped over lguest_pages in guest. */ | 45 | unsigned int id; |
| 45 | unsigned long regs_page; | 46 | struct lguest *lg; |
| 46 | struct lguest_regs *regs; | ||
| 47 | struct lguest_data __user *lguest_data; | ||
| 48 | struct task_struct *tsk; | 47 | struct task_struct *tsk; |
| 49 | struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ | 48 | struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ |
| 50 | u32 pfn_limit; | 49 | |
| 51 | /* This provides the offset to the base of guest-physical | ||
| 52 | * memory in the Launcher. */ | ||
| 53 | void __user *mem_base; | ||
| 54 | unsigned long kernel_address; | ||
| 55 | u32 cr2; | 50 | u32 cr2; |
| 56 | int halted; | ||
| 57 | int ts; | 51 | int ts; |
| 58 | u32 next_hcall; | ||
| 59 | u32 esp1; | 52 | u32 esp1; |
| 60 | u8 ss1; | 53 | u8 ss1; |
| 61 | 54 | ||
| 55 | /* Bitmap of what has changed: see CHANGED_* above. */ | ||
| 56 | int changed; | ||
| 57 | |||
| 58 | unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ | ||
| 59 | |||
| 60 | /* At end of a page shared mapped over lguest_pages in guest. */ | ||
| 61 | unsigned long regs_page; | ||
| 62 | struct lguest_regs *regs; | ||
| 63 | |||
| 64 | struct lguest_pages *last_pages; | ||
| 65 | |||
| 66 | int cpu_pgd; /* which pgd this cpu is currently using */ | ||
| 67 | |||
| 62 | /* If a hypercall was asked for, this points to the arguments. */ | 68 | /* If a hypercall was asked for, this points to the arguments. */ |
| 63 | struct hcall_args *hcall; | 69 | struct hcall_args *hcall; |
| 70 | u32 next_hcall; | ||
| 71 | |||
| 72 | /* Virtual clock device */ | ||
| 73 | struct hrtimer hrt; | ||
| 64 | 74 | ||
| 65 | /* Do we need to stop what we're doing and return to userspace? */ | 75 | /* Do we need to stop what we're doing and return to userspace? */ |
| 66 | int break_out; | 76 | int break_out; |
| 67 | wait_queue_head_t break_wq; | 77 | wait_queue_head_t break_wq; |
| 78 | int halted; | ||
| 68 | 79 | ||
| 69 | /* Bitmap of what has changed: see CHANGED_* above. */ | 80 | /* Pending virtual interrupts */ |
| 70 | int changed; | 81 | DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); |
| 71 | struct lguest_pages *last_pages; | 82 | |
| 83 | struct lg_cpu_arch arch; | ||
| 84 | }; | ||
| 85 | |||
| 86 | /* The private info the thread maintains about the guest. */ | ||
| 87 | struct lguest | ||
| 88 | { | ||
| 89 | struct lguest_data __user *lguest_data; | ||
| 90 | struct lg_cpu cpus[NR_CPUS]; | ||
| 91 | unsigned int nr_cpus; | ||
| 92 | |||
| 93 | u32 pfn_limit; | ||
| 94 | /* This provides the offset to the base of guest-physical | ||
| 95 | * memory in the Launcher. */ | ||
| 96 | void __user *mem_base; | ||
| 97 | unsigned long kernel_address; | ||
| 72 | 98 | ||
| 73 | /* We keep a small number of these. */ | ||
| 74 | u32 pgdidx; | ||
| 75 | struct pgdir pgdirs[4]; | 99 | struct pgdir pgdirs[4]; |
| 76 | 100 | ||
| 77 | unsigned long noirq_start, noirq_end; | 101 | unsigned long noirq_start, noirq_end; |
| 78 | unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ | ||
| 79 | 102 | ||
| 80 | unsigned int stack_pages; | 103 | unsigned int stack_pages; |
| 81 | u32 tsc_khz; | 104 | u32 tsc_khz; |
| 82 | 105 | ||
| 83 | /* Dead? */ | 106 | /* Dead? */ |
| 84 | const char *dead; | 107 | const char *dead; |
| 85 | |||
| 86 | struct lguest_arch arch; | ||
| 87 | |||
| 88 | /* Virtual clock device */ | ||
| 89 | struct hrtimer hrt; | ||
| 90 | |||
| 91 | /* Pending virtual interrupts */ | ||
| 92 | DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); | ||
| 93 | }; | 108 | }; |
| 94 | 109 | ||
| 95 | extern struct mutex lguest_lock; | 110 | extern struct mutex lguest_lock; |
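
The reshuffled lg.h above is the heart of this change: state that is inherently per virtual CPU (registers, pending interrupts, the hypercall cursor, the clock device) moves into struct lg_cpu, while truly guest-wide state (memory layout, page directories, the dead marker) stays in struct lguest, and each lg_cpu carries a back-pointer to its owner. A stripped-down model of that ownership split (all names illustrative):

    #include <stdio.h>

    #define NR_CPUS 4

    struct guest;                       /* forward declaration */

    struct vcpu {                       /* per-virtual-CPU state */
        unsigned int id;
        struct guest *g;                /* back-pointer, like cpu->lg */
        unsigned long pending_irqs;
    };

    struct guest {                      /* guest-wide state */
        struct vcpu cpus[NR_CPUS];
        unsigned int nr_cpus;
        const char *dead;               /* shared: killing one kills all */
    };

    static void vcpu_start(struct guest *g, unsigned int id)
    {
        struct vcpu *cpu = &g->cpus[id];
        cpu->id = id;
        cpu->g = g;
        g->nr_cpus++;
    }

    int main(void)
    {
        struct guest g = { .nr_cpus = 0 };
        vcpu_start(&g, 0);
        printf("vcpu %u belongs to a guest with %u cpu(s)\n",
               g.cpus[0].id, g.cpus[0].g->nr_cpus);
        return 0;
    }
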
| @@ -97,26 +112,26 @@ extern struct mutex lguest_lock; | |||
| 97 | /* core.c: */ | 112 | /* core.c: */ |
| 98 | int lguest_address_ok(const struct lguest *lg, | 113 | int lguest_address_ok(const struct lguest *lg, |
| 99 | unsigned long addr, unsigned long len); | 114 | unsigned long addr, unsigned long len); |
| 100 | void __lgread(struct lguest *, void *, unsigned long, unsigned); | 115 | void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); |
| 101 | void __lgwrite(struct lguest *, unsigned long, const void *, unsigned); | 116 | void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); |
| 102 | 117 | ||
| 103 | /*H:035 Using memory-copy operations like that is usually inconvenient, so we | 118 | /*H:035 Using memory-copy operations like that is usually inconvenient, so we |
| 104 | * have the following helper macros which read and write a specific type (often | 119 | * have the following helper macros which read and write a specific type (often |
| 105 | * an unsigned long). | 120 | * an unsigned long). |
| 106 | * | 121 | * |
| 107 | * This reads into a variable of the given type then returns that. */ | 122 | * This reads into a variable of the given type then returns that. */ |
| 108 | #define lgread(lg, addr, type) \ | 123 | #define lgread(cpu, addr, type) \ |
| 109 | ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; }) | 124 | ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) |
| 110 | 125 | ||
| 111 | /* This checks that the variable is of the given type, then writes it out. */ | 126 | /* This checks that the variable is of the given type, then writes it out. */ |
| 112 | #define lgwrite(lg, addr, type, val) \ | 127 | #define lgwrite(cpu, addr, type, val) \ |
| 113 | do { \ | 128 | do { \ |
| 114 | typecheck(type, val); \ | 129 | typecheck(type, val); \ |
| 115 | __lgwrite((lg), (addr), &(val), sizeof(val)); \ | 130 | __lgwrite((cpu), (addr), &(val), sizeof(val)); \ |
| 116 | } while(0) | 131 | } while(0) |
| 117 | /* (end of memory access helper routines) :*/ | 132 | /* (end of memory access helper routines) :*/ |
| 118 | 133 | ||
| 119 | int run_guest(struct lguest *lg, unsigned long __user *user); | 134 | int run_guest(struct lg_cpu *cpu, unsigned long __user *user); |
| 120 | 135 | ||
| 121 | /* Helper macros to obtain the first 12 or the last 20 bits, this is only the | 136 | /* Helper macros to obtain the first 12 or the last 20 bits, this is only the |
| 122 | * first step in the migration to the kernel types. pte_pfn is already defined | 137 | * first step in the migration to the kernel types. pte_pfn is already defined |
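
The lgread()/lgwrite() macros above gain compile-time safety from the kernel's typecheck(): comparing the addresses of two differently-typed dummies makes the compiler warn when the value doesn't match the declared type, at zero runtime cost. A userspace rendering of the pattern (GNU C, as the kernel assumes; the gmem buffer is a stand-in for guest memory):

    #include <stdio.h>
    #include <string.h>

    /* The kernel's typecheck(): warns if x is not of the given type. */
    #define typecheck(type, x) \
        ({ type __dummy; typeof(x) __dummy2; \
           (void)(&__dummy == &__dummy2); 1; })

    /* Toy "guest memory": reads and writes go through one choke point. */
    static unsigned char gmem[4096];

    #define gread(addr, type) \
        ({ type _v; memcpy(&_v, &gmem[addr], sizeof(_v)); _v; })

    #define gwrite(addr, type, val) \
        do { \
            typecheck(type, val); \
            type _v = (val); \
            memcpy(&gmem[addr], &_v, sizeof(_v)); \
        } while (0)

    int main(void)
    {
        gwrite(16, unsigned long, 0xdeadbeefUL);
        printf("%#lx\n", gread(16, unsigned long));
        return 0;
    }
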
| @@ -126,52 +141,53 @@ int run_guest(struct lguest *lg, unsigned long __user *user); | |||
| 126 | #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) | 141 | #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) |
| 127 | 142 | ||
| 128 | /* interrupts_and_traps.c: */ | 143 | /* interrupts_and_traps.c: */ |
| 129 | void maybe_do_interrupt(struct lguest *lg); | 144 | void maybe_do_interrupt(struct lg_cpu *cpu); |
| 130 | int deliver_trap(struct lguest *lg, unsigned int num); | 145 | int deliver_trap(struct lg_cpu *cpu, unsigned int num); |
| 131 | void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi); | 146 | void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, |
| 132 | void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages); | 147 | u32 low, u32 hi); |
| 133 | void pin_stack_pages(struct lguest *lg); | 148 | void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages); |
| 149 | void pin_stack_pages(struct lg_cpu *cpu); | ||
| 134 | void setup_default_idt_entries(struct lguest_ro_state *state, | 150 | void setup_default_idt_entries(struct lguest_ro_state *state, |
| 135 | const unsigned long *def); | 151 | const unsigned long *def); |
| 136 | void copy_traps(const struct lguest *lg, struct desc_struct *idt, | 152 | void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, |
| 137 | const unsigned long *def); | 153 | const unsigned long *def); |
| 138 | void guest_set_clockevent(struct lguest *lg, unsigned long delta); | 154 | void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); |
| 139 | void init_clockdev(struct lguest *lg); | 155 | void init_clockdev(struct lg_cpu *cpu); |
| 140 | bool check_syscall_vector(struct lguest *lg); | 156 | bool check_syscall_vector(struct lguest *lg); |
| 141 | int init_interrupts(void); | 157 | int init_interrupts(void); |
| 142 | void free_interrupts(void); | 158 | void free_interrupts(void); |
| 143 | 159 | ||
| 144 | /* segments.c: */ | 160 | /* segments.c: */ |
| 145 | void setup_default_gdt_entries(struct lguest_ro_state *state); | 161 | void setup_default_gdt_entries(struct lguest_ro_state *state); |
| 146 | void setup_guest_gdt(struct lguest *lg); | 162 | void setup_guest_gdt(struct lg_cpu *cpu); |
| 147 | void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num); | 163 | void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num); |
| 148 | void guest_load_tls(struct lguest *lg, unsigned long tls_array); | 164 | void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array); |
| 149 | void copy_gdt(const struct lguest *lg, struct desc_struct *gdt); | 165 | void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt); |
| 150 | void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt); | 166 | void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); |
| 151 | 167 | ||
| 152 | /* page_tables.c: */ | 168 | /* page_tables.c: */ |
| 153 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); | 169 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); |
| 154 | void free_guest_pagetable(struct lguest *lg); | 170 | void free_guest_pagetable(struct lguest *lg); |
| 155 | void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); | 171 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); |
| 156 | void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); | 172 | void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); |
| 157 | void guest_pagetable_clear_all(struct lguest *lg); | 173 | void guest_pagetable_clear_all(struct lg_cpu *cpu); |
| 158 | void guest_pagetable_flush_user(struct lguest *lg); | 174 | void guest_pagetable_flush_user(struct lg_cpu *cpu); |
| 159 | void guest_set_pte(struct lguest *lg, unsigned long gpgdir, | 175 | void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, |
| 160 | unsigned long vaddr, pte_t val); | 176 | unsigned long vaddr, pte_t val); |
| 161 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); | 177 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages); |
| 162 | int demand_page(struct lguest *info, unsigned long cr2, int errcode); | 178 | int demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode); |
| 163 | void pin_page(struct lguest *lg, unsigned long vaddr); | 179 | void pin_page(struct lg_cpu *cpu, unsigned long vaddr); |
| 164 | unsigned long guest_pa(struct lguest *lg, unsigned long vaddr); | 180 | unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr); |
| 165 | void page_table_guest_data_init(struct lguest *lg); | 181 | void page_table_guest_data_init(struct lg_cpu *cpu); |
| 166 | 182 | ||
| 167 | /* <arch>/core.c: */ | 183 | /* <arch>/core.c: */ |
| 168 | void lguest_arch_host_init(void); | 184 | void lguest_arch_host_init(void); |
| 169 | void lguest_arch_host_fini(void); | 185 | void lguest_arch_host_fini(void); |
| 170 | void lguest_arch_run_guest(struct lguest *lg); | 186 | void lguest_arch_run_guest(struct lg_cpu *cpu); |
| 171 | void lguest_arch_handle_trap(struct lguest *lg); | 187 | void lguest_arch_handle_trap(struct lg_cpu *cpu); |
| 172 | int lguest_arch_init_hypercalls(struct lguest *lg); | 188 | int lguest_arch_init_hypercalls(struct lg_cpu *cpu); |
| 173 | int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args); | 189 | int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args); |
| 174 | void lguest_arch_setup_regs(struct lguest *lg, unsigned long start); | 190 | void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start); |
| 175 | 191 | ||
| 176 | /* <arch>/switcher.S: */ | 192 | /* <arch>/switcher.S: */ |
| 177 | extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; | 193 | extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; |
| @@ -181,8 +197,8 @@ int lguest_device_init(void); | |||
| 181 | void lguest_device_remove(void); | 197 | void lguest_device_remove(void); |
| 182 | 198 | ||
| 183 | /* hypercalls.c: */ | 199 | /* hypercalls.c: */ |
| 184 | void do_hypercalls(struct lguest *lg); | 200 | void do_hypercalls(struct lg_cpu *cpu); |
| 185 | void write_timestamp(struct lguest *lg); | 201 | void write_timestamp(struct lg_cpu *cpu); |
| 186 | 202 | ||
| 187 | /*L:035 | 203 | /*L:035 |
| 188 | * Let's step aside for the moment, to study one important routine that's used | 204 | * Let's step aside for the moment, to study one important routine that's used |
| @@ -208,12 +224,12 @@ void write_timestamp(struct lguest *lg); | |||
| 208 | * Like any macro which uses an "if", it is safely wrapped in a run-once "do { | 224 | * Like any macro which uses an "if", it is safely wrapped in a run-once "do { |
| 209 | * } while(0)". | 225 | * } while(0)". |
| 210 | */ | 226 | */ |
| 211 | #define kill_guest(lg, fmt...) \ | 227 | #define kill_guest(cpu, fmt...) \ |
| 212 | do { \ | 228 | do { \ |
| 213 | if (!(lg)->dead) { \ | 229 | if (!(cpu)->lg->dead) { \ |
| 214 | (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \ | 230 | (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \ |
| 215 | if (!(lg)->dead) \ | 231 | if (!(cpu)->lg->dead) \ |
| 216 | (lg)->dead = ERR_PTR(-ENOMEM); \ | 232 | (cpu)->lg->dead = ERR_PTR(-ENOMEM); \ |
| 217 | } \ | 233 | } \ |
| 218 | } while(0) | 234 | } while(0) |
| 219 | /* (End of aside) :*/ | 235 | /* (End of aside) :*/ |
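
kill_guest() now reaches the shared guest through the vcpu, but its shape is unchanged: record only the first death reason, allocate the message atomically, and fall back to ERR_PTR(-ENOMEM) so "dead" is always non-NULL afterwards. A userspace analogue using vasprintf (the error-pointer encoding here is a deliberate simplification):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdarg.h>

    static const char *OOM_MARKER = "(out of memory)";

    struct guest { char *dead; };       /* NULL while alive */

    /* Record the *first* death reason; later calls are no-ops. */
    static void kill_guest(struct guest *g, const char *fmt, ...)
    {
        va_list ap;

        if (g->dead)
            return;                     /* already dead: keep first reason */
        va_start(ap, fmt);
        if (vasprintf(&g->dead, fmt, ap) < 0)
            g->dead = (char *)OOM_MARKER;  /* like ERR_PTR(-ENOMEM) */
        va_end(ap);
    }

    int main(void)
    {
        struct guest g = { NULL };
        kill_guest(&g, "Bad hypercall %li", 99L);
        kill_guest(&g, "second reason is ignored");
        printf("dead: %s\n", g.dead);
        return 0;
    }
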
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 3b92a61ba8d2..85d42d3d01a9 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/uaccess.h> | 6 | #include <linux/uaccess.h> |
| 7 | #include <linux/miscdevice.h> | 7 | #include <linux/miscdevice.h> |
| 8 | #include <linux/fs.h> | 8 | #include <linux/fs.h> |
| 9 | #include <linux/sched.h> | ||
| 9 | #include "lg.h" | 10 | #include "lg.h" |
| 10 | 11 | ||
| 11 | /*L:055 When something happens, the Waker process needs a way to stop the | 12 | /*L:055 When something happens, the Waker process needs a way to stop the |
| @@ -13,7 +14,7 @@ | |||
| 13 | * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher | 14 | * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher |
| 14 | * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release | 15 | * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release |
| 15 | * the Waker. */ | 16 | * the Waker. */ |
| 16 | static int break_guest_out(struct lguest *lg, const unsigned long __user *input) | 17 | static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) |
| 17 | { | 18 | { |
| 18 | unsigned long on; | 19 | unsigned long on; |
| 19 | 20 | ||
| @@ -22,21 +23,21 @@ static int break_guest_out(struct lguest *lg, const unsigned long __user *input) | |||
| 22 | return -EFAULT; | 23 | return -EFAULT; |
| 23 | 24 | ||
| 24 | if (on) { | 25 | if (on) { |
| 25 | lg->break_out = 1; | 26 | cpu->break_out = 1; |
| 26 | /* Pop it out of the Guest (may be running on different CPU) */ | 27 | /* Pop it out of the Guest (may be running on different CPU) */ |
| 27 | wake_up_process(lg->tsk); | 28 | wake_up_process(cpu->tsk); |
| 28 | /* Wait for them to reset it */ | 29 | /* Wait for them to reset it */ |
| 29 | return wait_event_interruptible(lg->break_wq, !lg->break_out); | 30 | return wait_event_interruptible(cpu->break_wq, !cpu->break_out); |
| 30 | } else { | 31 | } else { |
| 31 | lg->break_out = 0; | 32 | cpu->break_out = 0; |
| 32 | wake_up(&lg->break_wq); | 33 | wake_up(&cpu->break_wq); |
| 33 | return 0; | 34 | return 0; |
| 34 | } | 35 | } |
| 35 | } | 36 | } |
| 36 | 37 | ||
| 37 | /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt | 38 | /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt |
| 38 | * number to /dev/lguest. */ | 39 | * number to /dev/lguest. */ |
| 39 | static int user_send_irq(struct lguest *lg, const unsigned long __user *input) | 40 | static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) |
| 40 | { | 41 | { |
| 41 | unsigned long irq; | 42 | unsigned long irq; |
| 42 | 43 | ||
| @@ -46,7 +47,7 @@ static int user_send_irq(struct lguest *lg, const unsigned long __user *input) | |||
| 46 | return -EINVAL; | 47 | return -EINVAL; |
| 47 | /* Next time the Guest runs, the core code will see if it can deliver | 48 | /* Next time the Guest runs, the core code will see if it can deliver |
| 48 | * this interrupt. */ | 49 | * this interrupt. */ |
| 49 | set_bit(irq, lg->irqs_pending); | 50 | set_bit(irq, cpu->irqs_pending); |
| 50 | return 0; | 51 | return 0; |
| 51 | } | 52 | } |
| 52 | 53 | ||
| @@ -55,13 +56,21 @@ static int user_send_irq(struct lguest *lg, const unsigned long __user *input) | |||
| 55 | static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | 56 | static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) |
| 56 | { | 57 | { |
| 57 | struct lguest *lg = file->private_data; | 58 | struct lguest *lg = file->private_data; |
| 59 | struct lg_cpu *cpu; | ||
| 60 | unsigned int cpu_id = *o; | ||
| 58 | 61 | ||
| 59 | /* You must write LHREQ_INITIALIZE first! */ | 62 | /* You must write LHREQ_INITIALIZE first! */ |
| 60 | if (!lg) | 63 | if (!lg) |
| 61 | return -EINVAL; | 64 | return -EINVAL; |
| 62 | 65 | ||
| 66 | /* Watch out for arbitrary vcpu indexes! */ | ||
| 67 | if (cpu_id >= lg->nr_cpus) | ||
| 68 | return -EINVAL; | ||
| 69 | |||
| 70 | cpu = &lg->cpus[cpu_id]; | ||
| 71 | |||
| 63 | /* If you're not the task which owns the Guest, go away. */ | 72 | /* If you're not the task which owns the Guest, go away. */ |
| 64 | if (current != lg->tsk) | 73 | if (current != cpu->tsk) |
| 65 | return -EPERM; | 74 | return -EPERM; |
| 66 | 75 | ||
| 67 | /* If the guest is already dead, we indicate why */ | 76 | /* If the guest is already dead, we indicate why */ |
| @@ -81,11 +90,53 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | |||
| 81 | 90 | ||
| 82 | /* If we returned from read() last time because the Guest notified, | 91 | /* If we returned from read() last time because the Guest notified, |
| 83 | * clear the flag. */ | 92 | * clear the flag. */ |
| 84 | if (lg->pending_notify) | 93 | if (cpu->pending_notify) |
| 85 | lg->pending_notify = 0; | 94 | cpu->pending_notify = 0; |
| 86 | 95 | ||
| 87 | /* Run the Guest until something interesting happens. */ | 96 | /* Run the Guest until something interesting happens. */ |
| 88 | return run_guest(lg, (unsigned long __user *)user); | 97 | return run_guest(cpu, (unsigned long __user *)user); |
| 98 | } | ||
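[Aside: seen from the Launcher, this read() is the main run loop. A hedged sketch with the notification dispatch elided; the run_vcpu, lguest_fd and cpu_id names are made up:]

    #include <unistd.h>
    #include <err.h>

    static void run_vcpu(int lguest_fd, unsigned int cpu_id)
    {
            unsigned long notify_addr;

            for (;;) {
                    /* pread()'s offset names the vcpu, mirroring the
                     * "cpu_id = *o" in the kernel's read() above. */
                    if (pread(lguest_fd, &notify_addr, sizeof(notify_addr),
                              cpu_id) < 0)
                            err(1, "running guest vcpu %u", cpu_id);
                    /* A successful return means the Guest did a notify;
                     * a real Launcher would service the device here. */
            }
    }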
| 99 | |||
| 100 | static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) | ||
| 101 | { | ||
| 102 | if (id >= NR_CPUS) | ||
| 103 | return -EINVAL; | ||
| 104 | |||
| 105 | cpu->id = id; | ||
| 106 | cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); | ||
| 107 | cpu->lg->nr_cpus++; | ||
| 108 | init_clockdev(cpu); | ||
| 109 | |||
| 110 | /* We need a complete page for the Guest registers: they are accessible | ||
| 111 | * to the Guest and we can only grant it access to whole pages. */ | ||
| 112 | cpu->regs_page = get_zeroed_page(GFP_KERNEL); | ||
| 113 | if (!cpu->regs_page) | ||
| 114 | return -ENOMEM; | ||
| 115 | |||
| 116 | /* We actually put the registers at the bottom of the page. */ | ||
| 117 | cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); | ||
| 118 | |||
| 119 | /* Now we initialize the Guest's registers, handing it the start | ||
| 120 | * address. */ | ||
| 121 | lguest_arch_setup_regs(cpu, start_ip); | ||
| 122 | |||
| 123 | /* Initialize the queue for the waker to wait on */ | ||
| 124 | init_waitqueue_head(&cpu->break_wq); | ||
| 125 | |||
| 126 | /* We keep a pointer to the Launcher task (ie. current task) for when | ||
| 127 | * other Guests want to wake this one (inter-Guest I/O). */ | ||
| 128 | cpu->tsk = current; | ||
| 129 | |||
| 130 | /* We need to keep a pointer to the Launcher's memory map, because if | ||
| 131 | * the Launcher dies we need to clean it up. If we don't keep a | ||
| 132 | * reference, it is destroyed before close() is called. */ | ||
| 133 | cpu->mm = get_task_mm(cpu->tsk); | ||
| 134 | |||
| 135 | /* We remember which CPU's pages this Guest used last, for optimization | ||
| 136 | * when the same Guest runs on the same CPU twice. */ | ||
| 137 | cpu->last_pages = NULL; | ||
| 138 | |||
| 139 | return 0; | ||
| 89 | } | 140 | } |
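[Aside: the one subtle line above recovers the owning "struct lguest" from a vcpu pointer. Since cpu == &lg->cpus[id], stepping back id array elements lands on cpus[0], and container_of() does the rest. A standalone toy with made-up structure layouts that you can compile and run:]

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Illustrative stand-ins for the real structures; only the
     * pointer arithmetic matters here. */
    struct vcpu { unsigned int id; };
    struct guest { struct vcpu cpus[4]; };

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    int main(void)
    {
            struct guest g;
            unsigned int id = 2;
            struct vcpu *cpu = &g.cpus[id];

            /* cpu - id == &g.cpus[0], so container_of() on cpus[0]
             * recovers the enclosing struct guest. */
            assert(container_of(cpu - id, struct guest, cpus[0]) == &g);
            printf("vcpu %u belongs to guest at %p\n", id, (void *)&g);
            return 0;
    }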
| 90 | 141 | ||
| 91 | /*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit) | 142 | /*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit) |
| @@ -134,15 +185,10 @@ static int initialize(struct file *file, const unsigned long __user *input) | |||
| 134 | lg->mem_base = (void __user *)(long)args[0]; | 185 | lg->mem_base = (void __user *)(long)args[0]; |
| 135 | lg->pfn_limit = args[1]; | 186 | lg->pfn_limit = args[1]; |
| 136 | 187 | ||
| 137 | /* We need a complete page for the Guest registers: they are accessible | 188 | /* This is the first cpu */ |
| 138 | * to the Guest and we can only grant it access to whole pages. */ | 189 | err = lg_cpu_start(&lg->cpus[0], 0, args[3]); |
| 139 | lg->regs_page = get_zeroed_page(GFP_KERNEL); | 190 | if (err) |
| 140 | if (!lg->regs_page) { | ||
| 141 | err = -ENOMEM; | ||
| 142 | goto release_guest; | 191 | goto release_guest; |
| 143 | } | ||
| 144 | /* We actually put the registers at the bottom of the page. */ | ||
| 145 | lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs); | ||
| 146 | 192 | ||
| 147 | /* Initialize the Guest's shadow page tables, using the toplevel | 193 | /* Initialize the Guest's shadow page tables, using the toplevel |
| 148 | * address the Launcher gave us. This allocates memory, so can | 194 | * address the Launcher gave us. This allocates memory, so can |
| @@ -151,28 +197,6 @@ static int initialize(struct file *file, const unsigned long __user *input) | |||
| 151 | if (err) | 197 | if (err) |
| 152 | goto free_regs; | 198 | goto free_regs; |
| 153 | 199 | ||
| 154 | /* Now we initialize the Guest's registers, handing it the start | ||
| 155 | * address. */ | ||
| 156 | lguest_arch_setup_regs(lg, args[3]); | ||
| 157 | |||
| 158 | /* The timer for lguest's clock needs initialization. */ | ||
| 159 | init_clockdev(lg); | ||
| 160 | |||
| 161 | /* We keep a pointer to the Launcher task (ie. current task) for when | ||
| 162 | * other Guests want to wake this one (inter-Guest I/O). */ | ||
| 163 | lg->tsk = current; | ||
| 164 | /* We need to keep a pointer to the Launcher's memory map, because if | ||
| 165 | * the Launcher dies we need to clean it up. If we don't keep a | ||
| 166 | * reference, it is destroyed before close() is called. */ | ||
| 167 | lg->mm = get_task_mm(lg->tsk); | ||
| 168 | |||
| 169 | /* Initialize the queue for the waker to wait on */ | ||
| 170 | init_waitqueue_head(&lg->break_wq); | ||
| 171 | |||
| 172 | /* We remember which CPU's pages this Guest used last, for optimization | ||
| 173 | * when the same Guest runs on the same CPU twice. */ | ||
| 174 | lg->last_pages = NULL; | ||
| 175 | |||
| 176 | /* We keep our "struct lguest" in the file's private_data. */ | 200 | /* We keep our "struct lguest" in the file's private_data. */ |
| 177 | file->private_data = lg; | 201 | file->private_data = lg; |
| 178 | 202 | ||
| @@ -182,7 +206,8 @@ static int initialize(struct file *file, const unsigned long __user *input) | |||
| 182 | return sizeof(args); | 206 | return sizeof(args); |
| 183 | 207 | ||
| 184 | free_regs: | 208 | free_regs: |
| 185 | free_page(lg->regs_page); | 209 | /* FIXME: This should be in free_vcpu */ |
| 210 | free_page(lg->cpus[0].regs_page); | ||
| 186 | release_guest: | 211 | release_guest: |
| 187 | kfree(lg); | 212 | kfree(lg); |
| 188 | unlock: | 213 | unlock: |
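[Aside: the Launcher's half of /*L:020, sketched on the assumption that the four words after the request are the guest memory base, the page-frame limit, the initial page-table address and the start address, in the order the code above consumes them; the setup that computes the values, and finer error handling, are elided.]

    #include <unistd.h>
    #include <err.h>
    #include <linux/lguest_launcher.h>   /* for LHREQ_INITIALIZE */

    static void init_guest(int lguest_fd, unsigned long mem_base,
                           unsigned long pfn_limit, unsigned long pgdir,
                           unsigned long start_ip)
    {
            unsigned long args[] = { LHREQ_INITIALIZE,
                                     mem_base, pfn_limit, pgdir, start_ip };

            if (write(lguest_fd, args, sizeof(args)) < 0)
                    err(1, "writing to /dev/lguest");
    }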
| @@ -202,30 +227,37 @@ static ssize_t write(struct file *file, const char __user *in, | |||
| 202 | struct lguest *lg = file->private_data; | 227 | struct lguest *lg = file->private_data; |
| 203 | const unsigned long __user *input = (const unsigned long __user *)in; | 228 | const unsigned long __user *input = (const unsigned long __user *)in; |
| 204 | unsigned long req; | 229 | unsigned long req; |
| 230 | struct lg_cpu *uninitialized_var(cpu); | ||
| 231 | unsigned int cpu_id = *off; | ||
| 205 | 232 | ||
| 206 | if (get_user(req, input) != 0) | 233 | if (get_user(req, input) != 0) |
| 207 | return -EFAULT; | 234 | return -EFAULT; |
| 208 | input++; | 235 | input++; |
| 209 | 236 | ||
| 210 | /* If you haven't initialized, you must do that first. */ | 237 | /* If you haven't initialized, you must do that first. */ |
| 211 | if (req != LHREQ_INITIALIZE && !lg) | 238 | if (req != LHREQ_INITIALIZE) { |
| 212 | return -EINVAL; | 239 | if (!lg || (cpu_id >= lg->nr_cpus)) |
| 240 | return -EINVAL; | ||
| 241 | cpu = &lg->cpus[cpu_id]; | ||
| 242 | if (!cpu) | ||
| 243 | return -EINVAL; | ||
| 244 | } | ||
| 213 | 245 | ||
| 214 | /* Once the Guest is dead, all you can do is read() why it died. */ | 246 | /* Once the Guest is dead, all you can do is read() why it died. */ |
| 215 | if (lg && lg->dead) | 247 | if (lg && lg->dead) |
| 216 | return -ENOENT; | 248 | return -ENOENT; |
| 217 | 249 | ||
| 218 | /* If you're not the task which owns the Guest, you can only break */ | 250 | /* If you're not the task which owns the Guest, you can only break */ |
| 219 | if (lg && current != lg->tsk && req != LHREQ_BREAK) | 251 | if (lg && current != cpu->tsk && req != LHREQ_BREAK) |
| 220 | return -EPERM; | 252 | return -EPERM; |
| 221 | 253 | ||
| 222 | switch (req) { | 254 | switch (req) { |
| 223 | case LHREQ_INITIALIZE: | 255 | case LHREQ_INITIALIZE: |
| 224 | return initialize(file, input); | 256 | return initialize(file, input); |
| 225 | case LHREQ_IRQ: | 257 | case LHREQ_IRQ: |
| 226 | return user_send_irq(lg, input); | 258 | return user_send_irq(cpu, input); |
| 227 | case LHREQ_BREAK: | 259 | case LHREQ_BREAK: |
| 228 | return break_guest_out(lg, input); | 260 | return break_guest_out(cpu, input); |
| 229 | default: | 261 | default: |
| 230 | return -EINVAL; | 262 | return -EINVAL; |
| 231 | } | 263 | } |
| @@ -241,6 +273,7 @@ static ssize_t write(struct file *file, const char __user *in, | |||
| 241 | static int close(struct inode *inode, struct file *file) | 273 | static int close(struct inode *inode, struct file *file) |
| 242 | { | 274 | { |
| 243 | struct lguest *lg = file->private_data; | 275 | struct lguest *lg = file->private_data; |
| 276 | unsigned int i; | ||
| 244 | 277 | ||
| 245 | /* If we never successfully initialized, there's nothing to clean up */ | 278 | /* If we never successfully initialized, there's nothing to clean up */ |
| 246 | if (!lg) | 279 | if (!lg) |
| @@ -249,19 +282,23 @@ static int close(struct inode *inode, struct file *file) | |||
| 249 | /* We need the big lock, to protect from inter-guest I/O and other | 282 | /* We need the big lock, to protect from inter-guest I/O and other |
| 250 | * Launchers initializing guests. */ | 283 | * Launchers initializing guests. */ |
| 251 | mutex_lock(&lguest_lock); | 284 | mutex_lock(&lguest_lock); |
| 252 | /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ | 285 | |
| 253 | hrtimer_cancel(&lg->hrt); | ||
| 254 | /* Free up the shadow page tables for the Guest. */ | 286 | /* Free up the shadow page tables for the Guest. */ |
| 255 | free_guest_pagetable(lg); | 287 | free_guest_pagetable(lg); |
| 256 | /* Now all the memory cleanups are done, it's safe to release the | 288 | |
| 257 | * Launcher's memory management structure. */ | 289 | for (i = 0; i < lg->nr_cpus; i++) { |
| 258 | mmput(lg->mm); | 290 | /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ |
| 291 | hrtimer_cancel(&lg->cpus[i].hrt); | ||
| 292 | /* We can free up the register page we allocated. */ | ||
| 293 | free_page(lg->cpus[i].regs_page); | ||
| 294 | /* Now all the memory cleanups are done, it's safe to release | ||
| 295 | * the Launcher's memory management structure. */ | ||
| 296 | mmput(lg->cpus[i].mm); | ||
| 297 | } | ||
| 259 | /* If lg->dead doesn't contain an error code it will be NULL or a | 298 | /* If lg->dead doesn't contain an error code it will be NULL or a |
| 260 | * kmalloc()ed string, either of which is ok to hand to kfree(). */ | 299 | * kmalloc()ed string, either of which is ok to hand to kfree(). */ |
| 261 | if (!IS_ERR(lg->dead)) | 300 | if (!IS_ERR(lg->dead)) |
| 262 | kfree(lg->dead); | 301 | kfree(lg->dead); |
| 263 | /* We can free up the register page we allocated. */ | ||
| 264 | free_page(lg->regs_page); | ||
| 265 | /* We clear the entire structure, which also marks it as free for the | 302 | /* We clear the entire structure, which also marks it as free for the |
| 266 | * next user. */ | 303 | * next user. */ |
| 267 | memset(lg, 0, sizeof(*lg)); | 304 | memset(lg, 0, sizeof(*lg)); |
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index fffabb327157..74b4cf2a6c41 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
| @@ -68,23 +68,23 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); | |||
| 68 | * page directory entry (PGD) for that address. Since we keep track of several | 68 | * page directory entry (PGD) for that address. Since we keep track of several |
| 69 | * page tables, the "i" argument tells us which one we're interested in (it's | 69 | * page tables, the "i" argument tells us which one we're interested in (it's |
| 70 | * usually the current one). */ | 70 | * usually the current one). */ |
| 71 | static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) | 71 | static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) |
| 72 | { | 72 | { |
| 73 | unsigned int index = pgd_index(vaddr); | 73 | unsigned int index = pgd_index(vaddr); |
| 74 | 74 | ||
| 75 | /* We kill any Guest trying to touch the Switcher addresses. */ | 75 | /* We kill any Guest trying to touch the Switcher addresses. */ |
| 76 | if (index >= SWITCHER_PGD_INDEX) { | 76 | if (index >= SWITCHER_PGD_INDEX) { |
| 77 | kill_guest(lg, "attempt to access switcher pages"); | 77 | kill_guest(cpu, "attempt to access switcher pages"); |
| 78 | index = 0; | 78 | index = 0; |
| 79 | } | 79 | } |
| 80 | /* Return a pointer to the index'th pgd entry for the i'th page table. */ | 80 | /* Return a pointer to the index'th pgd entry for the i'th page table. */ |
| 81 | return &lg->pgdirs[i].pgdir[index]; | 81 | return &cpu->lg->pgdirs[i].pgdir[index]; |
| 82 | } | 82 | } |
| 83 | 83 | ||
| 84 | /* This routine then takes the page directory entry returned above, which | 84 | /* This routine then takes the page directory entry returned above, which |
| 85 | * contains the address of the page table entry (PTE) page. It then returns a | 85 | * contains the address of the page table entry (PTE) page. It then returns a |
| 86 | * pointer to the PTE entry for the given address. */ | 86 | * pointer to the PTE entry for the given address. */ |
| 87 | static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) | 87 | static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) |
| 88 | { | 88 | { |
| 89 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); | 89 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); |
| 90 | /* You should never call this if the PGD entry wasn't valid */ | 90 | /* You should never call this if the PGD entry wasn't valid */ |
| @@ -94,14 +94,13 @@ static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) | |||
| 94 | 94 | ||
| 95 | /* These two functions are just like the above two, except they access the Guest | 95 | /* These two functions are just like the above two, except they access the Guest |
| 96 | * page tables. Hence they return a Guest address. */ | 96 | * page tables. Hence they return a Guest address. */ |
| 97 | static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) | 97 | static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) |
| 98 | { | 98 | { |
| 99 | unsigned int index = vaddr >> (PGDIR_SHIFT); | 99 | unsigned int index = vaddr >> (PGDIR_SHIFT); |
| 100 | return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t); | 100 | return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); |
| 101 | } | 101 | } |
| 102 | 102 | ||
| 103 | static unsigned long gpte_addr(struct lguest *lg, | 103 | static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) |
| 104 | pgd_t gpgd, unsigned long vaddr) | ||
| 105 | { | 104 | { |
| 106 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; | 105 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; |
| 107 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); | 106 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); |
| @@ -138,7 +137,7 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) | |||
| 138 | * entry can be a little tricky. The flags are (almost) the same, but the | 137 | * entry can be a little tricky. The flags are (almost) the same, but the |
| 139 | * Guest PTE contains a virtual page number: the CPU needs the real page | 138 | * Guest PTE contains a virtual page number: the CPU needs the real page |
| 140 | * number. */ | 139 | * number. */ |
| 141 | static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) | 140 | static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) |
| 142 | { | 141 | { |
| 143 | unsigned long pfn, base, flags; | 142 | unsigned long pfn, base, flags; |
| 144 | 143 | ||
| @@ -149,7 +148,7 @@ static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) | |||
| 149 | flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); | 148 | flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); |
| 150 | 149 | ||
| 151 | /* The Guest's pages are offset inside the Launcher. */ | 150 | /* The Guest's pages are offset inside the Launcher. */ |
| 152 | base = (unsigned long)lg->mem_base / PAGE_SIZE; | 151 | base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; |
| 153 | 152 | ||
| 154 | /* We need a temporary "unsigned long" variable to hold the answer from | 153 | /* We need a temporary "unsigned long" variable to hold the answer from |
| 155 | * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't | 154 | * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't |
| @@ -157,7 +156,7 @@ static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) | |||
| 157 | * page, given the virtual number. */ | 156 | * page, given the virtual number. */ |
| 158 | pfn = get_pfn(base + pte_pfn(gpte), write); | 157 | pfn = get_pfn(base + pte_pfn(gpte), write); |
| 159 | if (pfn == -1UL) { | 158 | if (pfn == -1UL) { |
| 160 | kill_guest(lg, "failed to get page %lu", pte_pfn(gpte)); | 159 | kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); |
| 161 | /* When we destroy the Guest, we'll go through the shadow page | 160 | /* When we destroy the Guest, we'll go through the shadow page |
| 162 | * tables and release_pte() them. Make sure we don't think | 161 | * tables and release_pte() them. Make sure we don't think |
| 163 | * this one is valid! */ | 162 | * this one is valid! */ |
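[Aside: the arithmetic here is worth seeing in isolation. A pte is a frame number above the low flag bits, and the Guest's frames are offset by where its memory sits inside the Launcher. A toy model with invented numbers:]

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    int main(void)
    {
            unsigned long mem_base = 0x10000000;             /* invented */
            unsigned long gpte = (5UL << PAGE_SHIFT) | 0x27; /* pfn 5 + flags */
            unsigned long base = mem_base / PAGE_SIZE;
            unsigned long flags = gpte & ~PAGE_MASK;

            /* The frame backing Guest pfn 5 is base + 5, just as
             * gpte_to_spte() computes before calling get_pfn(). */
            printf("guest pfn %lu -> launcher pfn %lu, flags %#lx\n",
                   gpte >> PAGE_SHIFT, base + (gpte >> PAGE_SHIFT), flags);
            return 0;
    }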
| @@ -177,17 +176,18 @@ static void release_pte(pte_t pte) | |||
| 177 | } | 176 | } |
| 178 | /*:*/ | 177 | /*:*/ |
| 179 | 178 | ||
| 180 | static void check_gpte(struct lguest *lg, pte_t gpte) | 179 | static void check_gpte(struct lg_cpu *cpu, pte_t gpte) |
| 181 | { | 180 | { |
| 182 | if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) | 181 | if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) |
| 183 | || pte_pfn(gpte) >= lg->pfn_limit) | 182 | || pte_pfn(gpte) >= cpu->lg->pfn_limit) |
| 184 | kill_guest(lg, "bad page table entry"); | 183 | kill_guest(cpu, "bad page table entry"); |
| 185 | } | 184 | } |
| 186 | 185 | ||
| 187 | static void check_gpgd(struct lguest *lg, pgd_t gpgd) | 186 | static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) |
| 188 | { | 187 | { |
| 189 | if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit) | 188 | if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || |
| 190 | kill_guest(lg, "bad page directory entry"); | 189 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) |
| 190 | kill_guest(cpu, "bad page directory entry"); | ||
| 191 | } | 191 | } |
| 192 | 192 | ||
| 193 | /*H:330 | 193 | /*H:330 |
| @@ -200,7 +200,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd) | |||
| 200 | * | 200 | * |
| 201 | * If we fixed up the fault (ie. we mapped the address), this routine returns | 201 | * If we fixed up the fault (ie. we mapped the address), this routine returns |
| 202 | * true. Otherwise, it was a real fault and we need to tell the Guest. */ | 202 | * true. Otherwise, it was a real fault and we need to tell the Guest. */ |
| 203 | int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | 203 | int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) |
| 204 | { | 204 | { |
| 205 | pgd_t gpgd; | 205 | pgd_t gpgd; |
| 206 | pgd_t *spgd; | 206 | pgd_t *spgd; |
| @@ -209,24 +209,24 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
| 209 | pte_t *spte; | 209 | pte_t *spte; |
| 210 | 210 | ||
| 211 | /* First step: get the top-level Guest page table entry. */ | 211 | /* First step: get the top-level Guest page table entry. */ |
| 212 | gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); | 212 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); |
| 213 | /* Toplevel not present? We can't map it in. */ | 213 | /* Toplevel not present? We can't map it in. */ |
| 214 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) | 214 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) |
| 215 | return 0; | 215 | return 0; |
| 216 | 216 | ||
| 217 | /* Now look at the matching shadow entry. */ | 217 | /* Now look at the matching shadow entry. */ |
| 218 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); | 218 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); |
| 219 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { | 219 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { |
| 220 | /* No shadow entry: allocate a new shadow PTE page. */ | 220 | /* No shadow entry: allocate a new shadow PTE page. */ |
| 221 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | 221 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); |
| 222 | /* This is not really the Guest's fault, but killing it is | 222 | /* This is not really the Guest's fault, but killing it is |
| 223 | * simple for this corner case. */ | 223 | * simple for this corner case. */ |
| 224 | if (!ptepage) { | 224 | if (!ptepage) { |
| 225 | kill_guest(lg, "out of memory allocating pte page"); | 225 | kill_guest(cpu, "out of memory allocating pte page"); |
| 226 | return 0; | 226 | return 0; |
| 227 | } | 227 | } |
| 228 | /* We check that the Guest pgd is OK. */ | 228 | /* We check that the Guest pgd is OK. */ |
| 229 | check_gpgd(lg, gpgd); | 229 | check_gpgd(cpu, gpgd); |
| 230 | /* And we copy the flags to the shadow PGD entry. The page | 230 | /* And we copy the flags to the shadow PGD entry. The page |
| 231 | * number in the shadow PGD is the page we just allocated. */ | 231 | * number in the shadow PGD is the page we just allocated. */ |
| 232 | *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); | 232 | *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); |
| @@ -234,8 +234,8 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
| 234 | 234 | ||
| 235 | /* OK, now we look at the lower level in the Guest page table: keep its | 235 | /* OK, now we look at the lower level in the Guest page table: keep its |
| 236 | * address, because we might update it later. */ | 236 | * address, because we might update it later. */ |
| 237 | gpte_ptr = gpte_addr(lg, gpgd, vaddr); | 237 | gpte_ptr = gpte_addr(gpgd, vaddr); |
| 238 | gpte = lgread(lg, gpte_ptr, pte_t); | 238 | gpte = lgread(cpu, gpte_ptr, pte_t); |
| 239 | 239 | ||
| 240 | /* If this page isn't in the Guest page tables, we can't page it in. */ | 240 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
| 241 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) | 241 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) |
| @@ -252,7 +252,7 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
| 252 | 252 | ||
| 253 | /* Check that the Guest PTE flags are OK, and the page number is below | 253 | /* Check that the Guest PTE flags are OK, and the page number is below |
| 254 | * the pfn_limit (ie. not mapping the Launcher binary). */ | 254 | * the pfn_limit (ie. not mapping the Launcher binary). */ |
| 255 | check_gpte(lg, gpte); | 255 | check_gpte(cpu, gpte); |
| 256 | 256 | ||
| 257 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ | 257 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ |
| 258 | gpte = pte_mkyoung(gpte); | 258 | gpte = pte_mkyoung(gpte); |
| @@ -260,7 +260,7 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
| 260 | gpte = pte_mkdirty(gpte); | 260 | gpte = pte_mkdirty(gpte); |
| 261 | 261 | ||
| 262 | /* Get the pointer to the shadow PTE entry we're going to set. */ | 262 | /* Get the pointer to the shadow PTE entry we're going to set. */ |
| 263 | spte = spte_addr(lg, *spgd, vaddr); | 263 | spte = spte_addr(*spgd, vaddr); |
| 264 | /* If there was a valid shadow PTE entry here before, we release it. | 264 | /* If there was a valid shadow PTE entry here before, we release it. |
| 265 | * This can happen with a write to a previously read-only entry. */ | 265 | * This can happen with a write to a previously read-only entry. */ |
| 266 | release_pte(*spte); | 266 | release_pte(*spte); |
| @@ -268,17 +268,17 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
| 268 | /* If this is a write, we insist that the Guest page is writable (the | 268 | /* If this is a write, we insist that the Guest page is writable (the |
| 269 | * final arg to gpte_to_spte()). */ | 269 | * final arg to gpte_to_spte()). */ |
| 270 | if (pte_dirty(gpte)) | 270 | if (pte_dirty(gpte)) |
| 271 | *spte = gpte_to_spte(lg, gpte, 1); | 271 | *spte = gpte_to_spte(cpu, gpte, 1); |
| 272 | else | 272 | else |
| 273 | /* If this is a read, don't set the "writable" bit in the page | 273 | /* If this is a read, don't set the "writable" bit in the page |
| 274 | * table entry, even if the Guest says it's writable. That way | 274 | * table entry, even if the Guest says it's writable. That way |
| 275 | * we will come back here when a write does actually occur, so | 275 | * we will come back here when a write does actually occur, so |
| 276 | * we can update the Guest's _PAGE_DIRTY flag. */ | 276 | * we can update the Guest's _PAGE_DIRTY flag. */ |
| 277 | *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); | 277 | *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0); |
| 278 | 278 | ||
| 279 | /* Finally, we write the Guest PTE entry back: we've set the | 279 | /* Finally, we write the Guest PTE entry back: we've set the |
| 280 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ | 280 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ |
| 281 | lgwrite(lg, gpte_ptr, pte_t, gpte); | 281 | lgwrite(cpu, gpte_ptr, pte_t, gpte); |
| 282 | 282 | ||
| 283 | /* The fault is fixed, the page table is populated, the mapping | 283 | /* The fault is fixed, the page table is populated, the mapping |
| 284 | * manipulated, the result returned and the code complete. A small | 284 | * manipulated, the result returned and the code complete. A small |
| @@ -297,19 +297,19 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
| 297 | * | 297 | * |
| 298 | * This is a quick version which answers the question: is this virtual address | 298 | * This is a quick version which answers the question: is this virtual address |
| 299 | * mapped by the shadow page tables, and is it writable? */ | 299 | * mapped by the shadow page tables, and is it writable? */ |
| 300 | static int page_writable(struct lguest *lg, unsigned long vaddr) | 300 | static int page_writable(struct lg_cpu *cpu, unsigned long vaddr) |
| 301 | { | 301 | { |
| 302 | pgd_t *spgd; | 302 | pgd_t *spgd; |
| 303 | unsigned long flags; | 303 | unsigned long flags; |
| 304 | 304 | ||
| 305 | /* Look at the current top level entry: is it present? */ | 305 | /* Look at the current top level entry: is it present? */ |
| 306 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); | 306 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); |
| 307 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) | 307 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) |
| 308 | return 0; | 308 | return 0; |
| 309 | 309 | ||
| 310 | /* Check the flags on the pte entry itself: it must be present and | 310 | /* Check the flags on the pte entry itself: it must be present and |
| 311 | * writable. */ | 311 | * writable. */ |
| 312 | flags = pte_flags(*(spte_addr(lg, *spgd, vaddr))); | 312 | flags = pte_flags(*(spte_addr(*spgd, vaddr))); |
| 313 | 313 | ||
| 314 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); | 314 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); |
| 315 | } | 315 | } |
| @@ -317,10 +317,10 @@ static int page_writable(struct lguest *lg, unsigned long vaddr) | |||
| 317 | /* So, when pin_stack_pages() asks us to pin a page, we check if it's already | 317 | /* So, when pin_stack_pages() asks us to pin a page, we check if it's already |
| 318 | * in the page tables, and if not, we call demand_page() with error code 2 | 318 | * in the page tables, and if not, we call demand_page() with error code 2 |
| 319 | * (meaning "write"). */ | 319 | * (meaning "write"). */ |
| 320 | void pin_page(struct lguest *lg, unsigned long vaddr) | 320 | void pin_page(struct lg_cpu *cpu, unsigned long vaddr) |
| 321 | { | 321 | { |
| 322 | if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2)) | 322 | if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) |
| 323 | kill_guest(lg, "bad stack page %#lx", vaddr); | 323 | kill_guest(cpu, "bad stack page %#lx", vaddr); |
| 324 | } | 324 | } |
| 325 | 325 | ||
| 326 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ | 326 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ |
| @@ -358,28 +358,28 @@ static void flush_user_mappings(struct lguest *lg, int idx) | |||
| 358 | * | 358 | * |
| 359 | * The Guest has a hypercall to throw away the page tables: it's used when a | 359 | * The Guest has a hypercall to throw away the page tables: it's used when a |
| 360 | * large number of mappings have been changed. */ | 360 | * large number of mappings have been changed. */ |
| 361 | void guest_pagetable_flush_user(struct lguest *lg) | 361 | void guest_pagetable_flush_user(struct lg_cpu *cpu) |
| 362 | { | 362 | { |
| 363 | /* Drop the userspace part of the current page table. */ | 363 | /* Drop the userspace part of the current page table. */ |
| 364 | flush_user_mappings(lg, lg->pgdidx); | 364 | flush_user_mappings(cpu->lg, cpu->cpu_pgd); |
| 365 | } | 365 | } |
| 366 | /*:*/ | 366 | /*:*/ |
| 367 | 367 | ||
| 368 | /* We walk down the guest page tables to get a guest-physical address */ | 368 | /* We walk down the guest page tables to get a guest-physical address */ |
| 369 | unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) | 369 | unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) |
| 370 | { | 370 | { |
| 371 | pgd_t gpgd; | 371 | pgd_t gpgd; |
| 372 | pte_t gpte; | 372 | pte_t gpte; |
| 373 | 373 | ||
| 374 | /* First step: get the top-level Guest page table entry. */ | 374 | /* First step: get the top-level Guest page table entry. */ |
| 375 | gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); | 375 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); |
| 376 | /* Toplevel not present? We can't map it in. */ | 376 | /* Toplevel not present? We can't map it in. */ |
| 377 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) | 377 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) |
| 378 | kill_guest(lg, "Bad address %#lx", vaddr); | 378 | kill_guest(cpu, "Bad address %#lx", vaddr); |
| 379 | 379 | ||
| 380 | gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t); | 380 | gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t); |
| 381 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) | 381 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) |
| 382 | kill_guest(lg, "Bad address %#lx", vaddr); | 382 | kill_guest(cpu, "Bad address %#lx", vaddr); |
| 383 | 383 | ||
| 384 | return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); | 384 | return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); |
| 385 | } | 385 | } |
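[Aside: the return statement deserves a worked example. With invented values and a 4096-byte page, the low 12 bits of the virtual address survive as the offset within the page:]

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    int main(void)
    {
            unsigned long vaddr = 0xc0101234;   /* invented virtual address */
            unsigned long pfn = 0x1337;         /* as read from the Guest pte */
            unsigned long pa = pfn * PAGE_SIZE | (vaddr & ~PAGE_MASK);

            /* 0x1337 * 0x1000 | 0x234 == 0x1337234 */
            printf("vaddr %#lx -> guest-physical %#lx\n", vaddr, pa);
            return 0;
    }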
| @@ -399,7 +399,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) | |||
| 399 | /*H:435 And this is us, creating the new page directory. If we really do | 399 | /*H:435 And this is us, creating the new page directory. If we really do |
| 400 | * allocate a new one (and so the kernel parts are not there), we set | 400 | * allocate a new one (and so the kernel parts are not there), we set |
| 401 | * blank_pgdir. */ | 401 | * blank_pgdir. */ |
| 402 | static unsigned int new_pgdir(struct lguest *lg, | 402 | static unsigned int new_pgdir(struct lg_cpu *cpu, |
| 403 | unsigned long gpgdir, | 403 | unsigned long gpgdir, |
| 404 | int *blank_pgdir) | 404 | int *blank_pgdir) |
| 405 | { | 405 | { |
| @@ -407,22 +407,23 @@ static unsigned int new_pgdir(struct lguest *lg, | |||
| 407 | 407 | ||
| 408 | /* We pick one entry at random to throw out. Choosing the Least | 408 | /* We pick one entry at random to throw out. Choosing the Least |
| 409 | * Recently Used might be better, but this is easy. */ | 409 | * Recently Used might be better, but this is easy. */ |
| 410 | next = random32() % ARRAY_SIZE(lg->pgdirs); | 410 | next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); |
| 411 | /* If it's never been allocated at all before, try now. */ | 411 | /* If it's never been allocated at all before, try now. */ |
| 412 | if (!lg->pgdirs[next].pgdir) { | 412 | if (!cpu->lg->pgdirs[next].pgdir) { |
| 413 | lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); | 413 | cpu->lg->pgdirs[next].pgdir = |
| 414 | (pgd_t *)get_zeroed_page(GFP_KERNEL); | ||
| 414 | /* If the allocation fails, just keep using the one we have */ | 415 | /* If the allocation fails, just keep using the one we have */ |
| 415 | if (!lg->pgdirs[next].pgdir) | 416 | if (!cpu->lg->pgdirs[next].pgdir) |
| 416 | next = lg->pgdidx; | 417 | next = cpu->cpu_pgd; |
| 417 | else | 418 | else |
| 418 | /* This is a blank page, so there are no kernel | 419 | /* This is a blank page, so there are no kernel |
| 419 | * mappings: caller must map the stack! */ | 420 | * mappings: caller must map the stack! */ |
| 420 | *blank_pgdir = 1; | 421 | *blank_pgdir = 1; |
| 421 | } | 422 | } |
| 422 | /* Record which Guest toplevel this shadows. */ | 423 | /* Record which Guest toplevel this shadows. */ |
| 423 | lg->pgdirs[next].gpgdir = gpgdir; | 424 | cpu->lg->pgdirs[next].gpgdir = gpgdir; |
| 424 | /* Release all the non-kernel mappings. */ | 425 | /* Release all the non-kernel mappings. */ |
| 425 | flush_user_mappings(lg, next); | 426 | flush_user_mappings(cpu->lg, next); |
| 426 | 427 | ||
| 427 | return next; | 428 | return next; |
| 428 | } | 429 | } |
| @@ -432,21 +433,21 @@ static unsigned int new_pgdir(struct lguest *lg, | |||
| 432 | * Now we've seen all the page table setting and manipulation, let's see | 433 | * Now we've seen all the page table setting and manipulation, let's see |
| 433 | * what happens when the Guest changes page tables (ie. changes the top-level | 434 | * what happens when the Guest changes page tables (ie. changes the top-level |
| 434 | * pgdir). This occurs on almost every context switch. */ | 435 | * pgdir). This occurs on almost every context switch. */ |
| 435 | void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) | 436 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) |
| 436 | { | 437 | { |
| 437 | int newpgdir, repin = 0; | 438 | int newpgdir, repin = 0; |
| 438 | 439 | ||
| 439 | /* Look to see if we have this one already. */ | 440 | /* Look to see if we have this one already. */ |
| 440 | newpgdir = find_pgdir(lg, pgtable); | 441 | newpgdir = find_pgdir(cpu->lg, pgtable); |
| 441 | /* If not, we allocate or mug an existing one: if it's a fresh one, | 442 | /* If not, we allocate or mug an existing one: if it's a fresh one, |
| 442 | * repin gets set to 1. */ | 443 | * repin gets set to 1. */ |
| 443 | if (newpgdir == ARRAY_SIZE(lg->pgdirs)) | 444 | if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) |
| 444 | newpgdir = new_pgdir(lg, pgtable, &repin); | 445 | newpgdir = new_pgdir(cpu, pgtable, &repin); |
| 445 | /* Change the current pgd index to the new one. */ | 446 | /* Change the current pgd index to the new one. */ |
| 446 | lg->pgdidx = newpgdir; | 447 | cpu->cpu_pgd = newpgdir; |
| 447 | /* If it was completely blank, we map in the Guest kernel stack */ | 448 | /* If it was completely blank, we map in the Guest kernel stack */ |
| 448 | if (repin) | 449 | if (repin) |
| 449 | pin_stack_pages(lg); | 450 | pin_stack_pages(cpu); |
| 450 | } | 451 | } |
| 451 | 452 | ||
| 452 | /*H:470 Finally, a routine which throws away everything: all PGD entries in all | 453 | /*H:470 Finally, a routine which throws away everything: all PGD entries in all |
| @@ -468,11 +469,11 @@ static void release_all_pagetables(struct lguest *lg) | |||
| 468 | * mapping. Since kernel mappings are in every page table, it's easiest to | 469 | * mapping. Since kernel mappings are in every page table, it's easiest to |
| 469 | * throw them all away. This traps the Guest in amber for a while as | 470 | * throw them all away. This traps the Guest in amber for a while as |
| 470 | * everything faults back in, but it's rare. */ | 471 | * everything faults back in, but it's rare. */ |
| 471 | void guest_pagetable_clear_all(struct lguest *lg) | 472 | void guest_pagetable_clear_all(struct lg_cpu *cpu) |
| 472 | { | 473 | { |
| 473 | release_all_pagetables(lg); | 474 | release_all_pagetables(cpu->lg); |
| 474 | /* We need the Guest kernel stack mapped again. */ | 475 | /* We need the Guest kernel stack mapped again. */ |
| 475 | pin_stack_pages(lg); | 476 | pin_stack_pages(cpu); |
| 476 | } | 477 | } |
| 477 | /*:*/ | 478 | /*:*/ |
| 478 | /*M:009 Since we throw away all mappings when a kernel mapping changes, our | 479 | /*M:009 Since we throw away all mappings when a kernel mapping changes, our |
| @@ -497,24 +498,24 @@ void guest_pagetable_clear_all(struct lguest *lg) | |||
| 497 | * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if | 498 | * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if |
| 498 | * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. | 499 | * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. |
| 499 | */ | 500 | */ |
| 500 | static void do_set_pte(struct lguest *lg, int idx, | 501 | static void do_set_pte(struct lg_cpu *cpu, int idx, |
| 501 | unsigned long vaddr, pte_t gpte) | 502 | unsigned long vaddr, pte_t gpte) |
| 502 | { | 503 | { |
| 503 | /* Look up the matching shadow page directory entry. */ | 504 | /* Look up the matching shadow page directory entry. */ |
| 504 | pgd_t *spgd = spgd_addr(lg, idx, vaddr); | 505 | pgd_t *spgd = spgd_addr(cpu, idx, vaddr); |
| 505 | 506 | ||
| 506 | /* If the top level isn't present, there's no entry to update. */ | 507 | /* If the top level isn't present, there's no entry to update. */ |
| 507 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { | 508 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { |
| 508 | /* Otherwise, we start by releasing the existing entry. */ | 509 | /* Otherwise, we start by releasing the existing entry. */ |
| 509 | pte_t *spte = spte_addr(lg, *spgd, vaddr); | 510 | pte_t *spte = spte_addr(*spgd, vaddr); |
| 510 | release_pte(*spte); | 511 | release_pte(*spte); |
| 511 | 512 | ||
| 512 | /* If they're setting this entry as dirty or accessed, we might | 513 | /* If they're setting this entry as dirty or accessed, we might |
| 513 | * as well put that entry they've given us in now. This shaves | 514 | * as well put that entry they've given us in now. This shaves |
| 514 | * 10% off a copy-on-write micro-benchmark. */ | 515 | * 10% off a copy-on-write micro-benchmark. */ |
| 515 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { | 516 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { |
| 516 | check_gpte(lg, gpte); | 517 | check_gpte(cpu, gpte); |
| 517 | *spte = gpte_to_spte(lg, gpte, | 518 | *spte = gpte_to_spte(cpu, gpte, |
| 518 | pte_flags(gpte) & _PAGE_DIRTY); | 519 | pte_flags(gpte) & _PAGE_DIRTY); |
| 519 | } else | 520 | } else |
| 520 | /* Otherwise kill it and we can demand_page() it in | 521 | /* Otherwise kill it and we can demand_page() it in |
| @@ -533,22 +534,22 @@ static void do_set_pte(struct lguest *lg, int idx, | |||
| 533 | * | 534 | * |
| 534 | * The benefit is that when we have to track a new page table, we can keep | 535 | * The benefit is that when we have to track a new page table, we can keep |
| 535 | * all the kernel mappings. This speeds up context switch immensely. */ | 536 | * all the kernel mappings. This speeds up context switch immensely. */ |
| 536 | void guest_set_pte(struct lguest *lg, | 537 | void guest_set_pte(struct lg_cpu *cpu, |
| 537 | unsigned long gpgdir, unsigned long vaddr, pte_t gpte) | 538 | unsigned long gpgdir, unsigned long vaddr, pte_t gpte) |
| 538 | { | 539 | { |
| 539 | /* Kernel mappings must be changed on all top levels. Slow, but | 540 | /* Kernel mappings must be changed on all top levels. Slow, but |
| 540 | * doesn't happen often. */ | 541 | * doesn't happen often. */ |
| 541 | if (vaddr >= lg->kernel_address) { | 542 | if (vaddr >= cpu->lg->kernel_address) { |
| 542 | unsigned int i; | 543 | unsigned int i; |
| 543 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 544 | for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) |
| 544 | if (lg->pgdirs[i].pgdir) | 545 | if (cpu->lg->pgdirs[i].pgdir) |
| 545 | do_set_pte(lg, i, vaddr, gpte); | 546 | do_set_pte(cpu, i, vaddr, gpte); |
| 546 | } else { | 547 | } else { |
| 547 | /* Is this page table one we have a shadow for? */ | 548 | /* Is this page table one we have a shadow for? */ |
| 548 | int pgdir = find_pgdir(lg, gpgdir); | 549 | int pgdir = find_pgdir(cpu->lg, gpgdir); |
| 549 | if (pgdir != ARRAY_SIZE(lg->pgdirs)) | 550 | if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs)) |
| 550 | /* If so, do the update. */ | 551 | /* If so, do the update. */ |
| 551 | do_set_pte(lg, pgdir, vaddr, gpte); | 552 | do_set_pte(cpu, pgdir, vaddr, gpte); |
| 552 | } | 553 | } |
| 553 | } | 554 | } |
| 554 | 555 | ||
| @@ -590,30 +591,32 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) | |||
| 590 | { | 591 | { |
| 591 | /* We start on the first shadow page table, and give it a blank PGD | 592 | /* We start on the first shadow page table, and give it a blank PGD |
| 592 | * page. */ | 593 | * page. */ |
| 593 | lg->pgdidx = 0; | 594 | lg->pgdirs[0].gpgdir = pgtable; |
| 594 | lg->pgdirs[lg->pgdidx].gpgdir = pgtable; | 595 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
| 595 | lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL); | 596 | if (!lg->pgdirs[0].pgdir) |
| 596 | if (!lg->pgdirs[lg->pgdidx].pgdir) | ||
| 597 | return -ENOMEM; | 597 | return -ENOMEM; |
| 598 | lg->cpus[0].cpu_pgd = 0; | ||
| 598 | return 0; | 599 | return 0; |
| 599 | } | 600 | } |
| 600 | 601 | ||
| 601 | /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ | 602 | /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ |
| 602 | void page_table_guest_data_init(struct lguest *lg) | 603 | void page_table_guest_data_init(struct lg_cpu *cpu) |
| 603 | { | 604 | { |
| 604 | /* We get the kernel address: above this is all kernel memory. */ | 605 | /* We get the kernel address: above this is all kernel memory. */ |
| 605 | if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address) | 606 | if (get_user(cpu->lg->kernel_address, |
| 607 | &cpu->lg->lguest_data->kernel_address) | ||
| 606 | /* We tell the Guest that it can't use the top 4MB of virtual | 608 | /* We tell the Guest that it can't use the top 4MB of virtual |
| 607 | * addresses used by the Switcher. */ | 609 | * addresses used by the Switcher. */ |
| 608 | || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) | 610 | || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem) |
| 609 | || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir)) | 611 | || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) |
| 610 | kill_guest(lg, "bad guest page %p", lg->lguest_data); | 612 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); |
| 611 | 613 | ||
| 612 | /* In flush_user_mappings() we loop from 0 to | 614 | /* In flush_user_mappings() we loop from 0 to |
| 613 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the | 615 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the |
| 614 | * Switcher mappings, so check that now. */ | 616 | * Switcher mappings, so check that now. */ |
| 615 | if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX) | 617 | if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) |
| 616 | kill_guest(lg, "bad kernel address %#lx", lg->kernel_address); | 618 | kill_guest(cpu, "bad kernel address %#lx", |
| 619 | cpu->lg->kernel_address); | ||
| 617 | } | 620 | } |
| 618 | 621 | ||
| 619 | /* When a Guest dies, our cleanup is fairly simple. */ | 622 | /* When a Guest dies, our cleanup is fairly simple. */ |
| @@ -634,17 +637,18 @@ void free_guest_pagetable(struct lguest *lg) | |||
| 634 | * Guest (and not the pages for other CPUs). We have the appropriate PTE pages | 637 | * Guest (and not the pages for other CPUs). We have the appropriate PTE pages |
| 635 | * for each CPU already set up, we just need to hook them in now we know which | 638 | * for each CPU already set up, we just need to hook them in now we know which |
| 636 | * Guest is about to run on this CPU. */ | 639 | * Guest is about to run on this CPU. */ |
| 637 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) | 640 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) |
| 638 | { | 641 | { |
| 639 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); | 642 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); |
| 640 | pgd_t switcher_pgd; | 643 | pgd_t switcher_pgd; |
| 641 | pte_t regs_pte; | 644 | pte_t regs_pte; |
| 645 | unsigned long pfn; | ||
| 642 | 646 | ||
| 643 | /* Make the last PGD entry for this Guest point to the Switcher's PTE | 647 | /* Make the last PGD entry for this Guest point to the Switcher's PTE |
| 644 | * page for this CPU (with appropriate flags). */ | 648 | * page for this CPU (with appropriate flags). */ |
| 645 | switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL); | 649 | switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL); |
| 646 | 650 | ||
| 647 | lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; | 651 | cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; |
| 648 | 652 | ||
| 649 | /* We also change the Switcher PTE page. When we're running the Guest, | 653 | /* We also change the Switcher PTE page. When we're running the Guest, |
| 650 | * we want the Guest's "regs" page to appear where the first Switcher | 654 | * we want the Guest's "regs" page to appear where the first Switcher |
| @@ -653,7 +657,8 @@ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) | |||
| 653 | * CPU's "struct lguest_pages": if we make sure the Guest's register | 657 | * CPU's "struct lguest_pages": if we make sure the Guest's register |
| 654 | * page is already mapped there, we don't have to copy them out | 658 | * page is already mapped there, we don't have to copy them out |
| 655 | * again. */ | 659 | * again. */ |
| 656 | regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL)); | 660 | pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; |
| 661 | regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL)); | ||
| 657 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; | 662 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; |
| 658 | } | 663 | } |
| 659 | /*:*/ | 664 | /*:*/ |
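[Aside: the index expression on the last line of map_switcher_in_guest() is compact enough to misread. Dividing the mapping address by the page size and reducing it modulo the entries per PTE page picks the slot within the Switcher's PTE page; a toy with an invented address:]

    #include <stdio.h>

    #define PAGE_SIZE    4096UL
    #define PTRS_PER_PTE 1024UL

    int main(void)
    {
            /* An invented address for this CPU's "struct lguest_pages". */
            unsigned long pages = 0xffc02000;
            unsigned long idx = pages / PAGE_SIZE % PTRS_PER_PTE;

            printf("lguest_pages at %#lx uses pte slot %lu\n", pages, idx);
            return 0;
    }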
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 9e189cbec7dd..ec6aa3f1c36b 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c | |||
| @@ -58,7 +58,7 @@ static int ignored_gdt(unsigned int num) | |||
| 58 | * Protection Fault in the Switcher when it restores a Guest segment register | 58 | * Protection Fault in the Switcher when it restores a Guest segment register |
| 59 | * which tries to use that entry. Then we kill the Guest for causing such a | 59 | * which tries to use that entry. Then we kill the Guest for causing such a |
| 60 | * mess: the message will be "unhandled trap 256". */ | 60 | * mess: the message will be "unhandled trap 256". */ |
| 61 | static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) | 61 | static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) |
| 62 | { | 62 | { |
| 63 | unsigned int i; | 63 | unsigned int i; |
| 64 | 64 | ||
| @@ -71,14 +71,14 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) | |||
| 71 | /* Segment descriptors contain a privilege level: the Guest is | 71 | /* Segment descriptors contain a privilege level: the Guest is |
| 72 | * sometimes careless and leaves this as 0, even though it's | 72 | * sometimes careless and leaves this as 0, even though it's |
| 73 | * running at privilege level 1. If so, we fix it here. */ | 73 | * running at privilege level 1. If so, we fix it here. */ |
| 74 | if ((lg->arch.gdt[i].b & 0x00006000) == 0) | 74 | if ((cpu->arch.gdt[i].b & 0x00006000) == 0) |
| 75 | lg->arch.gdt[i].b |= (GUEST_PL << 13); | 75 | cpu->arch.gdt[i].b |= (GUEST_PL << 13); |
| 76 | 76 | ||
| 77 | /* Each descriptor has an "accessed" bit. If we don't set it | 77 | /* Each descriptor has an "accessed" bit. If we don't set it |
| 78 | * now, the CPU will try to set it when the Guest first loads | 78 | * now, the CPU will try to set it when the Guest first loads |
| 79 | * that entry into a segment register. But the GDT isn't | 79 | * that entry into a segment register. But the GDT isn't |
| 80 | * writable by the Guest, so bad things can happen. */ | 80 | * writable by the Guest, so bad things can happen. */ |
| 81 | lg->arch.gdt[i].b |= 0x00000100; | 81 | cpu->arch.gdt[i].b |= 0x00000100; |
| 82 | } | 82 | } |
| 83 | } | 83 | } |
| 84 | 84 | ||
| @@ -109,31 +109,31 @@ void setup_default_gdt_entries(struct lguest_ro_state *state) | |||
| 109 | 109 | ||
| 110 | /* This routine sets up the initial Guest GDT for booting. All entries start | 110 | /* This routine sets up the initial Guest GDT for booting. All entries start |
| 111 | * as 0 (unusable). */ | 111 | * as 0 (unusable). */ |
| 112 | void setup_guest_gdt(struct lguest *lg) | 112 | void setup_guest_gdt(struct lg_cpu *cpu) |
| 113 | { | 113 | { |
| 114 | /* Start with full 0-4G segments... */ | 114 | /* Start with full 0-4G segments... */ |
| 115 | lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; | 115 | cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; |
| 116 | lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; | 116 | cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; |
| 117 | /* ...except the Guest is allowed to use them, so set the privilege | 117 | /* ...except the Guest is allowed to use them, so set the privilege |
| 118 | * level appropriately in the flags. */ | 118 | * level appropriately in the flags. */ |
| 119 | lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); | 119 | cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); |
| 120 | lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); | 120 | cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); |
| 121 | } | 121 | } |
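[Aside: the two magic numbers in this file are both fields of the descriptor's high word: the privilege level lives in bits 13-14 (hence GUEST_PL << 13) and bit 8 is the "accessed" bit that fixup_gdt_table() pre-sets. A toy with a plausible kernel code-segment value:]

    #include <stdio.h>

    #define GUEST_PL 1

    int main(void)
    {
            unsigned int b = 0x00c09a00;   /* typical code segment, DPL 0 */

            b |= (GUEST_PL << 13);         /* demote ring 0 -> ring 1 */
            b |= 0x00000100;               /* pre-set the accessed bit */
            printf("high word %#010x: dpl=%u accessed=%u\n",
                   b, (b >> 13) & 3, (b >> 8) & 1);
            return 0;
    }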
| 122 | 122 | ||
| 123 | /*H:650 An optimization of copy_gdt(), for just the three "thread-local storage" | 123 | /*H:650 An optimization of copy_gdt(), for just the three "thread-local storage" |
| 124 | * entries. */ | 124 | * entries. */ |
| 125 | void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) | 125 | void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) |
| 126 | { | 126 | { |
| 127 | unsigned int i; | 127 | unsigned int i; |
| 128 | 128 | ||
| 129 | for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) | 129 | for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) |
| 130 | gdt[i] = lg->arch.gdt[i]; | 130 | gdt[i] = cpu->arch.gdt[i]; |
| 131 | } | 131 | } |
| 132 | 132 | ||
| 133 | /*H:640 When the Guest is run on a different CPU, or the GDT entries have | 133 | /*H:640 When the Guest is run on a different CPU, or the GDT entries have |
| 134 | * changed, copy_gdt() is called to copy the Guest's GDT entries across to this | 134 | * changed, copy_gdt() is called to copy the Guest's GDT entries across to this |
| 135 | * CPU's GDT. */ | 135 | * CPU's GDT. */ |
| 136 | void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) | 136 | void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) |
| 137 | { | 137 | { |
| 138 | unsigned int i; | 138 | unsigned int i; |
| 139 | 139 | ||
| @@ -141,38 +141,38 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) | |||
| 141 | * replaced. See ignored_gdt() above. */ | 141 | * replaced. See ignored_gdt() above. */ |
| 142 | for (i = 0; i < GDT_ENTRIES; i++) | 142 | for (i = 0; i < GDT_ENTRIES; i++) |
| 143 | if (!ignored_gdt(i)) | 143 | if (!ignored_gdt(i)) |
| 144 | gdt[i] = lg->arch.gdt[i]; | 144 | gdt[i] = cpu->arch.gdt[i]; |
| 145 | } | 145 | } |
| 146 | 146 | ||
| 147 | /*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). | 147 | /*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). |
| 148 | * We copy it from the Guest and tweak the entries. */ | 148 | * We copy it from the Guest and tweak the entries. */ |
| 149 | void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) | 149 | void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num) |
| 150 | { | 150 | { |
| 151 | /* We assume the Guest has the same number of GDT entries as the | 151 | /* We assume the Guest has the same number of GDT entries as the |
| 152 | * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ | 152 | * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ |
| 153 | if (num > ARRAY_SIZE(lg->arch.gdt)) | 153 | if (num > ARRAY_SIZE(cpu->arch.gdt)) |
| 154 | kill_guest(lg, "too many gdt entries %i", num); | 154 | kill_guest(cpu, "too many gdt entries %i", num); |
| 155 | 155 | ||
| 156 | /* We read the whole thing in, then fix it up. */ | 156 | /* We read the whole thing in, then fix it up. */ |
| 157 | __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0])); | 157 | __lgread(cpu, cpu->arch.gdt, table, num * sizeof(cpu->arch.gdt[0])); |
| 158 | fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt)); | 158 | fixup_gdt_table(cpu, 0, ARRAY_SIZE(cpu->arch.gdt)); |
| 159 | /* Mark that the GDT changed so the core knows it has to copy it again, | 159 | /* Mark that the GDT changed so the core knows it has to copy it again, |
| 160 | * even if the Guest is run on the same CPU. */ | 160 | * even if the Guest is run on the same CPU. */ |
| 161 | lg->changed |= CHANGED_GDT; | 161 | cpu->changed |= CHANGED_GDT; |
| 162 | } | 162 | } |
| 163 | 163 | ||
| 164 | /* This is the fast-track version for just changing the three TLS entries. | 164 | /* This is the fast-track version for just changing the three TLS entries. |
| 165 | * Remember that this happens on every context switch, so it's worth | 165 | * Remember that this happens on every context switch, so it's worth |
| 166 | * optimizing. But wouldn't it be neater to have a single hypercall to cover | 166 | * optimizing. But wouldn't it be neater to have a single hypercall to cover |
| 167 | * both cases? */ | 167 | * both cases? */ |
| 168 | void guest_load_tls(struct lguest *lg, unsigned long gtls) | 168 | void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) |
| 169 | { | 169 | { |
| 170 | struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN]; | 170 | struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; |
| 171 | 171 | ||
| 172 | __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); | 172 | __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); |
| 173 | fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); | 173 | fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); |
| 174 | /* Note that just the TLS entries have changed. */ | 174 | /* Note that just the TLS entries have changed. */ |
| 175 | lg->changed |= CHANGED_GDT_TLS; | 175 | cpu->changed |= CHANGED_GDT_TLS; |
| 176 | } | 176 | } |
| 177 | /*:*/ | 177 | /*:*/ |
| 178 | 178 | ||
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 44adb00e1490..61f2f8eb8cad 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
| @@ -60,7 +60,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu) | |||
| 60 | (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); | 60 | (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); |
| 61 | } | 61 | } |
| 62 | 62 | ||
| 63 | static DEFINE_PER_CPU(struct lguest *, last_guest); | 63 | static DEFINE_PER_CPU(struct lg_cpu *, last_cpu); |
| 64 | 64 | ||
| 65 | /*S:010 | 65 | /*S:010 |
| 66 | * We approach the Switcher. | 66 | * We approach the Switcher. |
| @@ -73,16 +73,16 @@ static DEFINE_PER_CPU(struct lguest *, last_guest); | |||
| 73 | * since it last ran. We saw this set in interrupts_and_traps.c and | 73 | * since it last ran. We saw this set in interrupts_and_traps.c and |
| 74 | * segments.c. | 74 | * segments.c. |
| 75 | */ | 75 | */ |
| 76 | static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) | 76 | static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) |
| 77 | { | 77 | { |
| 78 | /* Copying all this data can be quite expensive. We usually run the | 78 | /* Copying all this data can be quite expensive. We usually run the |
| 79 | * same Guest we ran last time (and that Guest hasn't run anywhere else | 79 | * same Guest we ran last time (and that Guest hasn't run anywhere else |
| 80 | * meanwhile). If that's not the case, we pretend everything in the | 80 | * meanwhile). If that's not the case, we pretend everything in the |
| 81 | * Guest has changed. */ | 81 | * Guest has changed. */ |
| 82 | if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { | 82 | if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { |
| 83 | __get_cpu_var(last_guest) = lg; | 83 | __get_cpu_var(last_cpu) = cpu; |
| 84 | lg->last_pages = pages; | 84 | cpu->last_pages = pages; |
| 85 | lg->changed = CHANGED_ALL; | 85 | cpu->changed = CHANGED_ALL; |
| 86 | } | 86 | } |
| 87 | 87 | ||
| 88 | /* These copies are pretty cheap, so we do them unconditionally: */ | 88 | /* These copies are pretty cheap, so we do them unconditionally: */ |
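copy_in_guest_info() above is the consumer of those dirty bits: a per-physical-CPU cache remembers which vcpu ran there last, and any mismatch (or a different set of switcher pages) forces CHANGED_ALL so everything is recopied. A compressed sketch of that check, with a plain static variable standing in for the kernel's DEFINE_PER_CPU slot and all types hypothetical:

#include <stddef.h>
#include <stdio.h>

#define CHANGED_ALL (~0u)

struct vcpu_sketch {
	unsigned int changed;
	void *last_pages;
};

/* One slot per physical CPU in the driver; one global here. */
static struct vcpu_sketch *last_cpu_sketch;

static void copy_in_info_sketch(struct vcpu_sketch *cpu, void *pages)
{
	/* Different vcpu, or same vcpu with different switcher pages:
	 * pretend everything changed so it all gets recopied. */
	if (last_cpu_sketch != cpu || cpu->last_pages != pages) {
		last_cpu_sketch = cpu;
		cpu->last_pages = pages;
		cpu->changed = CHANGED_ALL;
	}
	/* ...selective copies driven by the dirty bits go here... */
	cpu->changed = 0;	/* mark clean for the next run */
}

int main(void)
{
	struct vcpu_sketch a = { 0, NULL }, b = { 0, NULL };
	char pages[1];

	copy_in_info_sketch(&a, pages);	/* first run: copies everything */
	copy_in_info_sketch(&a, pages);	/* same vcpu, same pages: cheap */
	copy_in_info_sketch(&b, pages);	/* different vcpu: full recopy  */
	puts("done");
	return 0;
}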
| @@ -90,42 +90,42 @@ static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) | |||
| 90 | pages->state.host_cr3 = __pa(current->mm->pgd); | 90 | pages->state.host_cr3 = __pa(current->mm->pgd); |
| 91 | /* Set up the Guest's page tables to see this CPU's pages (and no | 91 | /* Set up the Guest's page tables to see this CPU's pages (and no |
| 92 | * other CPU's pages). */ | 92 | * other CPU's pages). */ |
| 93 | map_switcher_in_guest(lg, pages); | 93 | map_switcher_in_guest(cpu, pages); |
| 94 | /* Set up the two "TSS" members which tell the CPU what stack to use | 94 | /* Set up the two "TSS" members which tell the CPU what stack to use |
| 95 | * for traps which do directly into the Guest (ie. traps at privilege | 95 | * for traps which do directly into the Guest (ie. traps at privilege |
| 96 | * level 1). */ | 96 | * level 1). */ |
| 97 | pages->state.guest_tss.sp1 = lg->esp1; | 97 | pages->state.guest_tss.esp1 = cpu->esp1; |
| 98 | pages->state.guest_tss.ss1 = lg->ss1; | 98 | pages->state.guest_tss.ss1 = cpu->ss1; |
| 99 | 99 | ||
| 100 | /* Copy direct-to-Guest trap entries. */ | 100 | /* Copy direct-to-Guest trap entries. */ |
| 101 | if (lg->changed & CHANGED_IDT) | 101 | if (cpu->changed & CHANGED_IDT) |
| 102 | copy_traps(lg, pages->state.guest_idt, default_idt_entries); | 102 | copy_traps(cpu, pages->state.guest_idt, default_idt_entries); |
| 103 | 103 | ||
| 104 | /* Copy all GDT entries which the Guest can change. */ | 104 | /* Copy all GDT entries which the Guest can change. */ |
| 105 | if (lg->changed & CHANGED_GDT) | 105 | if (cpu->changed & CHANGED_GDT) |
| 106 | copy_gdt(lg, pages->state.guest_gdt); | 106 | copy_gdt(cpu, pages->state.guest_gdt); |
| 107 | /* If only the TLS entries have changed, copy them. */ | 107 | /* If only the TLS entries have changed, copy them. */ |
| 108 | else if (lg->changed & CHANGED_GDT_TLS) | 108 | else if (cpu->changed & CHANGED_GDT_TLS) |
| 109 | copy_gdt_tls(lg, pages->state.guest_gdt); | 109 | copy_gdt_tls(cpu, pages->state.guest_gdt); |
| 110 | 110 | ||
| 111 | /* Mark the Guest as unchanged for next time. */ | 111 | /* Mark the Guest as unchanged for next time. */ |
| 112 | lg->changed = 0; | 112 | cpu->changed = 0; |
| 113 | } | 113 | } |
| 114 | 114 | ||
| 115 | /* Finally: the code to actually call into the Switcher to run the Guest. */ | 115 | /* Finally: the code to actually call into the Switcher to run the Guest. */ |
| 116 | static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) | 116 | static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) |
| 117 | { | 117 | { |
| 118 | /* This is a dummy value we need for GCC's sake. */ | 118 | /* This is a dummy value we need for GCC's sake. */ |
| 119 | unsigned int clobber; | 119 | unsigned int clobber; |
| 120 | 120 | ||
| 121 | /* Copy the guest-specific information into this CPU's "struct | 121 | /* Copy the guest-specific information into this CPU's "struct |
| 122 | * lguest_pages". */ | 122 | * lguest_pages". */ |
| 123 | copy_in_guest_info(lg, pages); | 123 | copy_in_guest_info(cpu, pages); |
| 124 | 124 | ||
| 125 | /* Set the trap number to 256 (impossible value). If we fault while | 125 | /* Set the trap number to 256 (impossible value). If we fault while |
| 126 | * switching to the Guest (bad segment registers or bug), this will | 126 | * switching to the Guest (bad segment registers or bug), this will |
| 127 | * cause us to abort the Guest. */ | 127 | * cause us to abort the Guest. */ |
| 128 | lg->regs->trapnum = 256; | 128 | cpu->regs->trapnum = 256; |
| 129 | 129 | ||
| 130 | /* Now: we push the "eflags" register on the stack, then do an "lcall". | 130 | /* Now: we push the "eflags" register on the stack, then do an "lcall". |
| 131 | * This is how we change from using the kernel code segment to using | 131 | * This is how we change from using the kernel code segment to using |
| @@ -143,7 +143,7 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) | |||
| 143 | * 0-th argument above, ie "a"). %ebx contains the | 143 | * 0-th argument above, ie "a"). %ebx contains the |
| 144 | * physical address of the Guest's top-level page | 144 | * physical address of the Guest's top-level page |
| 145 | * directory. */ | 145 | * directory. */ |
| 146 | : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) | 146 | : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) |
| 147 | /* We tell gcc that all these registers could change, | 147 | /* We tell gcc that all these registers could change, |
| 148 | * which means we don't have to save and restore them in | 148 | * which means we don't have to save and restore them in |
| 149 | * the Switcher. */ | 149 | * the Switcher. */ |
| @@ -161,12 +161,12 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) | |||
| 161 | 161 | ||
| 162 | /*H:040 This is the i386-specific code to setup and run the Guest. Interrupts | 162 | /*H:040 This is the i386-specific code to setup and run the Guest. Interrupts |
| 163 | * are disabled: we own the CPU. */ | 163 | * are disabled: we own the CPU. */ |
| 164 | void lguest_arch_run_guest(struct lguest *lg) | 164 | void lguest_arch_run_guest(struct lg_cpu *cpu) |
| 165 | { | 165 | { |
| 166 | /* Remember the awfully-named TS bit? If the Guest has asked to set it | 166 | /* Remember the awfully-named TS bit? If the Guest has asked to set it |
| 167 | * we set it now, so we can trap and pass that trap to the Guest if it | 167 | * we set it now, so we can trap and pass that trap to the Guest if it |
| 168 | * uses the FPU. */ | 168 | * uses the FPU. */ |
| 169 | if (lg->ts) | 169 | if (cpu->ts) |
| 170 | lguest_set_ts(); | 170 | lguest_set_ts(); |
| 171 | 171 | ||
| 172 | /* SYSENTER is an optimized way of doing system calls. We can't allow | 172 | /* SYSENTER is an optimized way of doing system calls. We can't allow |
| @@ -180,7 +180,7 @@ void lguest_arch_run_guest(struct lguest *lg) | |||
| 180 | /* Now we actually run the Guest. It will return when something | 180 | /* Now we actually run the Guest. It will return when something |
| 181 | * interesting happens, and we can examine its registers to see what it | 181 | * interesting happens, and we can examine its registers to see what it |
| 182 | * was doing. */ | 182 | * was doing. */ |
| 183 | run_guest_once(lg, lguest_pages(raw_smp_processor_id())); | 183 | run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); |
| 184 | 184 | ||
| 185 | /* Note that the "regs" pointer contains two extra entries which are | 185 | /* Note that the "regs" pointer contains two extra entries which are |
| 186 | * not really registers: a trap number which says what interrupt or | 186 | * not really registers: a trap number which says what interrupt or |
| @@ -191,11 +191,11 @@ void lguest_arch_run_guest(struct lguest *lg) | |||
| 191 | * bad virtual address. We have to grab this now, because once we | 191 | * bad virtual address. We have to grab this now, because once we |
| 192 | * re-enable interrupts an interrupt could fault and thus overwrite | 192 | * re-enable interrupts an interrupt could fault and thus overwrite |
| 193 | * cr2, or we could even move off to a different CPU. */ | 193 | * cr2, or we could even move off to a different CPU. */ |
| 194 | if (lg->regs->trapnum == 14) | 194 | if (cpu->regs->trapnum == 14) |
| 195 | lg->arch.last_pagefault = read_cr2(); | 195 | cpu->arch.last_pagefault = read_cr2(); |
| 196 | /* Similarly, if we took a trap because the Guest used the FPU, | 196 | /* Similarly, if we took a trap because the Guest used the FPU, |
| 197 | * we have to restore the FPU it expects to see. */ | 197 | * we have to restore the FPU it expects to see. */ |
| 198 | else if (lg->regs->trapnum == 7) | 198 | else if (cpu->regs->trapnum == 7) |
| 199 | math_state_restore(); | 199 | math_state_restore(); |
| 200 | 200 | ||
| 201 | /* Restore SYSENTER if it's supposed to be on. */ | 201 | /* Restore SYSENTER if it's supposed to be on. */ |
| @@ -214,22 +214,22 @@ void lguest_arch_run_guest(struct lguest *lg) | |||
| 214 | * When the Guest uses one of these instructions, we get a trap (General | 214 | * When the Guest uses one of these instructions, we get a trap (General |
| 215 | * Protection Fault) and come here. We see if it's one of those troublesome | 215 | * Protection Fault) and come here. We see if it's one of those troublesome |
| 216 | * instructions and skip over it. We return true if we did. */ | 216 | * instructions and skip over it. We return true if we did. */ |
| 217 | static int emulate_insn(struct lguest *lg) | 217 | static int emulate_insn(struct lg_cpu *cpu) |
| 218 | { | 218 | { |
| 219 | u8 insn; | 219 | u8 insn; |
| 220 | unsigned int insnlen = 0, in = 0, shift = 0; | 220 | unsigned int insnlen = 0, in = 0, shift = 0; |
| 221 | /* The eip contains the *virtual* address of the Guest's instruction: | 221 | /* The eip contains the *virtual* address of the Guest's instruction: |
| 222 | * guest_pa just subtracts the Guest's page_offset. */ | 222 | * guest_pa just subtracts the Guest's page_offset. */ |
| 223 | unsigned long physaddr = guest_pa(lg, lg->regs->eip); | 223 | unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); |
| 224 | 224 | ||
| 225 | /* This must be the Guest kernel trying to do something, not userspace! | 225 | /* This must be the Guest kernel trying to do something, not userspace! |
| 226 | * The bottom two bits of the CS segment register are the privilege | 226 | * The bottom two bits of the CS segment register are the privilege |
| 227 | * level. */ | 227 | * level. */ |
| 228 | if ((lg->regs->cs & 3) != GUEST_PL) | 228 | if ((cpu->regs->cs & 3) != GUEST_PL) |
| 229 | return 0; | 229 | return 0; |
| 230 | 230 | ||
| 231 | /* Decoding x86 instructions is icky. */ | 231 | /* Decoding x86 instructions is icky. */ |
| 232 | insn = lgread(lg, physaddr, u8); | 232 | insn = lgread(cpu, physaddr, u8); |
| 233 | 233 | ||
| 234 | /* 0x66 is an "operand prefix". It means it's using the upper 16 bits | 234 | /* 0x66 is an "operand prefix". It means it's using the upper 16 bits |
| 235 | of the eax register. */ | 235 | of the eax register. */ |
| @@ -237,7 +237,7 @@ static int emulate_insn(struct lguest *lg) | |||
| 237 | shift = 16; | 237 | shift = 16; |
| 238 | /* The instruction is 1 byte so far, read the next byte. */ | 238 | /* The instruction is 1 byte so far, read the next byte. */ |
| 239 | insnlen = 1; | 239 | insnlen = 1; |
| 240 | insn = lgread(lg, physaddr + insnlen, u8); | 240 | insn = lgread(cpu, physaddr + insnlen, u8); |
| 241 | } | 241 | } |
| 242 | 242 | ||
| 243 | /* We can ignore the lower bit for the moment and decode the 4 opcodes | 243 | /* We can ignore the lower bit for the moment and decode the 4 opcodes |
| @@ -268,26 +268,26 @@ static int emulate_insn(struct lguest *lg) | |||
| 268 | if (in) { | 268 | if (in) { |
| 269 | /* Lower bit tells us whether it's a 16 or 32 bit access */ | 269 | /* Lower bit tells us whether it's a 16 or 32 bit access */ |
| 270 | if (insn & 0x1) | 270 | if (insn & 0x1) |
| 271 | lg->regs->eax = 0xFFFFFFFF; | 271 | cpu->regs->eax = 0xFFFFFFFF; |
| 272 | else | 272 | else |
| 273 | lg->regs->eax |= (0xFFFF << shift); | 273 | cpu->regs->eax |= (0xFFFF << shift); |
| 274 | } | 274 | } |
| 275 | /* Finally, we've "done" the instruction, so move past it. */ | 275 | /* Finally, we've "done" the instruction, so move past it. */ |
| 276 | lg->regs->eip += insnlen; | 276 | cpu->regs->eip += insnlen; |
| 277 | /* Success! */ | 277 | /* Success! */ |
| 278 | return 1; | 278 | return 1; |
| 279 | } | 279 | } |
| 280 | 280 | ||
| 281 | /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ | 281 | /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ |
| 282 | void lguest_arch_handle_trap(struct lguest *lg) | 282 | void lguest_arch_handle_trap(struct lg_cpu *cpu) |
| 283 | { | 283 | { |
| 284 | switch (lg->regs->trapnum) { | 284 | switch (cpu->regs->trapnum) { |
| 285 | case 13: /* We've intercepted a General Protection Fault. */ | 285 | case 13: /* We've intercepted a General Protection Fault. */ |
| 286 | /* Check if this was one of those annoying IN or OUT | 286 | /* Check if this was one of those annoying IN or OUT |
| 287 | * instructions which we need to emulate. If so, we just go | 287 | * instructions which we need to emulate. If so, we just go |
| 288 | * back into the Guest after we've done it. */ | 288 | * back into the Guest after we've done it. */ |
| 289 | if (lg->regs->errcode == 0) { | 289 | if (cpu->regs->errcode == 0) { |
| 290 | if (emulate_insn(lg)) | 290 | if (emulate_insn(cpu)) |
| 291 | return; | 291 | return; |
| 292 | } | 292 | } |
| 293 | break; | 293 | break; |
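The IN emulation above fakes a read from a nonexistent port by returning all ones: the opcode's low bit selects a full 32-bit fill, otherwise 16 bits are set, shifted up when a 0x66 operand-size prefix was decoded. A standalone sketch of just that masking, mirroring the diff's logic; the decode and register plumbing are omitted and the names are hypothetical:

#include <stdint.h>
#include <stdio.h>

/* insn low bit set: 32-bit fill; clear: 16 bits at "shift", where
 * shift is 16 when the 0x66 prefix was seen and 0 otherwise. */
static uint32_t fake_in(uint32_t eax, uint8_t insn, unsigned int shift)
{
	if (insn & 0x1)
		eax = 0xFFFFFFFF;
	else
		eax |= (uint32_t)0xFFFF << shift;
	return eax;
}

int main(void)
{
	printf("%08x\n", (unsigned)fake_in(0, 0xED, 0));  /* low bit set   */
	printf("%08x\n", (unsigned)fake_in(0, 0xEC, 0));  /* low bit clear */
	printf("%08x\n", (unsigned)fake_in(0, 0xEC, 16)); /* with 0x66     */
	return 0;
}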
| @@ -301,7 +301,8 @@ void lguest_arch_handle_trap(struct lguest *lg) | |||
| 301 | * | 301 | * |
| 302 | * The errcode tells whether this was a read or a write, and | 302 | * The errcode tells whether this was a read or a write, and |
| 303 | * whether kernel or userspace code. */ | 303 | * whether kernel or userspace code. */ |
| 304 | if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode)) | 304 | if (demand_page(cpu, cpu->arch.last_pagefault, |
| 305 | cpu->regs->errcode)) | ||
| 305 | return; | 306 | return; |
| 306 | 307 | ||
| 307 | /* OK, it's really not there (or not OK): the Guest needs to | 308 | /* OK, it's really not there (or not OK): the Guest needs to |
| @@ -311,15 +312,16 @@ void lguest_arch_handle_trap(struct lguest *lg) | |||
| 311 | * Note that if the Guest were really messed up, this could | 312 | * Note that if the Guest were really messed up, this could |
| 312 | * happen before it's done the LHCALL_LGUEST_INIT hypercall, so | 313 | * happen before it's done the LHCALL_LGUEST_INIT hypercall, so |
| 313 | * lg->lguest_data could be NULL */ | 314 | * lg->lguest_data could be NULL */ |
| 314 | if (lg->lguest_data && | 315 | if (cpu->lg->lguest_data && |
| 315 | put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2)) | 316 | put_user(cpu->arch.last_pagefault, |
| 316 | kill_guest(lg, "Writing cr2"); | 317 | &cpu->lg->lguest_data->cr2)) |
| 318 | kill_guest(cpu, "Writing cr2"); | ||
| 317 | break; | 319 | break; |
| 318 | case 7: /* We've intercepted a Device Not Available fault. */ | 320 | case 7: /* We've intercepted a Device Not Available fault. */ |
| 319 | /* If the Guest doesn't want to know, we already restored the | 321 | /* If the Guest doesn't want to know, we already restored the |
| 320 | * Floating Point Unit, so we just continue without telling | 322 | * Floating Point Unit, so we just continue without telling |
| 321 | * it. */ | 323 | * it. */ |
| 322 | if (!lg->ts) | 324 | if (!cpu->ts) |
| 323 | return; | 325 | return; |
| 324 | break; | 326 | break; |
| 325 | case 32 ... 255: | 327 | case 32 ... 255: |
| @@ -332,19 +334,19 @@ void lguest_arch_handle_trap(struct lguest *lg) | |||
| 332 | case LGUEST_TRAP_ENTRY: | 334 | case LGUEST_TRAP_ENTRY: |
| 333 | /* Our 'struct hcall_args' maps directly over our regs: we set | 335 | /* Our 'struct hcall_args' maps directly over our regs: we set |
| 334 | * up the pointer now to indicate a hypercall is pending. */ | 336 | * up the pointer now to indicate a hypercall is pending. */ |
| 335 | lg->hcall = (struct hcall_args *)lg->regs; | 337 | cpu->hcall = (struct hcall_args *)cpu->regs; |
| 336 | return; | 338 | return; |
| 337 | } | 339 | } |
| 338 | 340 | ||
| 339 | /* We didn't handle the trap, so it needs to go to the Guest. */ | 341 | /* We didn't handle the trap, so it needs to go to the Guest. */ |
| 340 | if (!deliver_trap(lg, lg->regs->trapnum)) | 342 | if (!deliver_trap(cpu, cpu->regs->trapnum)) |
| 341 | /* If the Guest doesn't have a handler (either it hasn't | 343 | /* If the Guest doesn't have a handler (either it hasn't |
| 342 | * registered any yet, or it's one of the faults we don't let | 344 | * registered any yet, or it's one of the faults we don't let |
| 343 | * it handle), it dies with a cryptic error message. */ | 345 | * it handle), it dies with a cryptic error message. */ |
| 344 | kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", | 346 | kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", |
| 345 | lg->regs->trapnum, lg->regs->eip, | 347 | cpu->regs->trapnum, cpu->regs->eip, |
| 346 | lg->regs->trapnum == 14 ? lg->arch.last_pagefault | 348 | cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault |
| 347 | : lg->regs->errcode); | 349 | : cpu->regs->errcode); |
| 348 | } | 350 | } |
| 349 | 351 | ||
| 350 | /* Now we can look at each of the routines this calls, in increasing order of | 352 | /* Now we can look at each of the routines this calls, in increasing order of |
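lguest_arch_handle_trap() above is a straight dispatch on the exit reason, with fall-through to deliver_trap() when nothing is handled in the Host. A condensed sketch of its decision table; vectors 13, 14 and 7 are the ones the code tests, while the hypercall vector value and all names here are assumptions:

#include <stdio.h>

/* 13 = General Protection, 14 = Page Fault, 7 = Device Not Available;
 * the LGUEST_TRAP_ENTRY value is assumed, not taken from the diff. */
enum { TRAP_NM = 7, TRAP_GP = 13, TRAP_PF = 14, TRAP_HCALL = 31 };

static const char *handle_trap_sketch(int trapnum, int errcode)
{
	switch (trapnum) {
	case TRAP_GP:
		/* errcode 0: maybe an IN/OUT we can emulate and resume. */
		return errcode ? "reflect to Guest" : "try emulate_insn()";
	case TRAP_PF:
		return "demand_page(), else reflect with cr2";
	case TRAP_NM:
		return "FPU already restored, or reflect if Guest set TS";
	case TRAP_HCALL:
		return "hypercall pending";
	default:
		return trapnum >= 32 ? "real interrupt: nothing to do"
				     : "reflect to Guest";
	}
}

int main(void)
{
	puts(handle_trap_sketch(TRAP_GP, 0));
	return 0;
}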
| @@ -487,17 +489,17 @@ void __exit lguest_arch_host_fini(void) | |||
| 487 | 489 | ||
| 488 | 490 | ||
| 489 | /*H:122 The i386-specific hypercalls simply farm out to the right functions. */ | 491 | /*H:122 The i386-specific hypercalls simply farm out to the right functions. */ |
| 490 | int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args) | 492 | int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args) |
| 491 | { | 493 | { |
| 492 | switch (args->arg0) { | 494 | switch (args->arg0) { |
| 493 | case LHCALL_LOAD_GDT: | 495 | case LHCALL_LOAD_GDT: |
| 494 | load_guest_gdt(lg, args->arg1, args->arg2); | 496 | load_guest_gdt(cpu, args->arg1, args->arg2); |
| 495 | break; | 497 | break; |
| 496 | case LHCALL_LOAD_IDT_ENTRY: | 498 | case LHCALL_LOAD_IDT_ENTRY: |
| 497 | load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3); | 499 | load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3); |
| 498 | break; | 500 | break; |
| 499 | case LHCALL_LOAD_TLS: | 501 | case LHCALL_LOAD_TLS: |
| 500 | guest_load_tls(lg, args->arg1); | 502 | guest_load_tls(cpu, args->arg1); |
| 501 | break; | 503 | break; |
| 502 | default: | 504 | default: |
| 503 | /* Bad Guest. Bad! */ | 505 | /* Bad Guest. Bad! */ |
| @@ -507,13 +509,14 @@ int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args) | |||
| 507 | } | 509 | } |
| 508 | 510 | ||
| 509 | /*H:126 i386-specific hypercall initialization: */ | 511 | /*H:126 i386-specific hypercall initialization: */ |
| 510 | int lguest_arch_init_hypercalls(struct lguest *lg) | 512 | int lguest_arch_init_hypercalls(struct lg_cpu *cpu) |
| 511 | { | 513 | { |
| 512 | u32 tsc_speed; | 514 | u32 tsc_speed; |
| 513 | 515 | ||
| 514 | /* The pointer to the Guest's "struct lguest_data" is the only | 516 | /* The pointer to the Guest's "struct lguest_data" is the only |
| 515 | * argument. We check that address now. */ | 517 | * argument. We check that address now. */ |
| 516 | if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data))) | 518 | if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, |
| 519 | sizeof(*cpu->lg->lguest_data))) | ||
| 517 | return -EFAULT; | 520 | return -EFAULT; |
| 518 | 521 | ||
| 519 | /* Having checked it, we simply set lg->lguest_data to point straight | 522 | /* Having checked it, we simply set lg->lguest_data to point straight |
| @@ -521,7 +524,7 @@ int lguest_arch_init_hypercalls(struct lguest *lg) | |||
| 521 | * copy_to_user/from_user from now on, instead of lgread/write. I put | 524 | * copy_to_user/from_user from now on, instead of lgread/write. I put |
| 522 | * this in to show that I'm not immune to writing stupid | 525 | * this in to show that I'm not immune to writing stupid |
| 523 | * optimizations. */ | 526 | * optimizations. */ |
| 524 | lg->lguest_data = lg->mem_base + lg->hcall->arg1; | 527 | cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; |
| 525 | 528 | ||
| 526 | /* We insist that the Time Stamp Counter exists and doesn't change with | 529 | /* We insist that the Time Stamp Counter exists and doesn't change with |
| 527 | * cpu frequency. Some devious chip manufacturers decided that TSC | 530 | * cpu frequency. Some devious chip manufacturers decided that TSC |
| @@ -534,12 +537,12 @@ int lguest_arch_init_hypercalls(struct lguest *lg) | |||
| 534 | tsc_speed = tsc_khz; | 537 | tsc_speed = tsc_khz; |
| 535 | else | 538 | else |
| 536 | tsc_speed = 0; | 539 | tsc_speed = 0; |
| 537 | if (put_user(tsc_speed, &lg->lguest_data->tsc_khz)) | 540 | if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz)) |
| 538 | return -EFAULT; | 541 | return -EFAULT; |
| 539 | 542 | ||
| 540 | /* The interrupt code might not like the system call vector. */ | 543 | /* The interrupt code might not like the system call vector. */ |
| 541 | if (!check_syscall_vector(lg)) | 544 | if (!check_syscall_vector(cpu->lg)) |
| 542 | kill_guest(lg, "bad syscall vector"); | 545 | kill_guest(cpu, "bad syscall vector"); |
| 543 | 546 | ||
| 544 | return 0; | 547 | return 0; |
| 545 | } | 548 | } |
| @@ -548,9 +551,9 @@ int lguest_arch_init_hypercalls(struct lguest *lg) | |||
| 548 | * | 551 | * |
| 549 | * Most of the Guest's registers are left alone: we used get_zeroed_page() to | 552 | * Most of the Guest's registers are left alone: we used get_zeroed_page() to |
| 550 | * allocate the structure, so they will be 0. */ | 553 | * allocate the structure, so they will be 0. */ |
| 551 | void lguest_arch_setup_regs(struct lguest *lg, unsigned long start) | 554 | void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) |
| 552 | { | 555 | { |
| 553 | struct lguest_regs *regs = lg->regs; | 556 | struct lguest_regs *regs = cpu->regs; |
| 554 | 557 | ||
| 555 | /* There are four "segment" registers which the Guest needs to boot: | 558 | /* There are four "segment" registers which the Guest needs to boot: |
| 556 | * The "code segment" register (cs) refers to the kernel code segment | 559 | * The "code segment" register (cs) refers to the kernel code segment |
| @@ -577,5 +580,5 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start) | |||
| 577 | 580 | ||
| 578 | /* There are a couple of GDT entries the Guest expects when first | 581 | /* There are a couple of GDT entries the Guest expects when first |
| 579 | * booting. */ | 582 | * booting. */ |
| 580 | setup_guest_gdt(lg); | 583 | setup_guest_gdt(cpu); |
| 581 | } | 584 | } |
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index e45f85f7c7ed..0dff05840ee2 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c | |||
| @@ -4224,10 +4224,10 @@ zfcp_fsf_send_fcp_command_task_handler(struct zfcp_fsf_req *fsf_req) | |||
| 4224 | 4224 | ||
| 4225 | ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n", | 4225 | ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n", |
| 4226 | fcp_rsp_iu->fcp_sns_len); | 4226 | fcp_rsp_iu->fcp_sns_len); |
| 4227 | memcpy(&scpnt->sense_buffer, | 4227 | memcpy(scpnt->sense_buffer, |
| 4228 | zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len); | 4228 | zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len); |
| 4229 | ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE, | 4229 | ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE, |
| 4230 | (void *) &scpnt->sense_buffer, sns_len); | 4230 | (void *)scpnt->sense_buffer, sns_len); |
| 4231 | } | 4231 | } |
| 4232 | 4232 | ||
| 4233 | /* check for overrun */ | 4233 | /* check for overrun */ |
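The zfcp hunk (and the matching aic79xx_osm.c and hptiop.c hunks further down) drops a stray & in front of sense_buffer. While sense_buffer is an embedded array, &scpnt->sense_buffer and scpnt->sense_buffer alias the same bytes, so the old code happened to work; if the member is a plain pointer, which the wider sense-buffer rework these cleanups accompany appears to make it, the & form hands memcpy the address of the pointer variable itself. A self-contained illustration with hypothetical struct layouts:

#include <stdio.h>

struct with_array   { char sense_buffer[8]; };
struct with_pointer { char *sense_buffer;   };

int main(void)
{
	struct with_array a;
	char storage[8];
	struct with_pointer p = { storage };

	/* Same address either way: the old &... form was harmless here. */
	printf("%d\n", (void *)&a.sense_buffer == (void *)a.sense_buffer);

	/* Different addresses: memcpy(&p.sense_buffer, ...) would
	 * scribble over the pointer, not the sense data it points at. */
	printf("%d\n", (void *)&p.sense_buffer == (void *)p.sense_buffer);
	return 0;
}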
diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c index 1c244832c6c8..b4912d1cee2a 100644 --- a/drivers/scsi/3w-9xxx.c +++ b/drivers/scsi/3w-9xxx.c | |||
| @@ -1990,7 +1990,6 @@ static struct scsi_host_template driver_template = { | |||
| 1990 | .max_sectors = TW_MAX_SECTORS, | 1990 | .max_sectors = TW_MAX_SECTORS, |
| 1991 | .cmd_per_lun = TW_MAX_CMDS_PER_LUN, | 1991 | .cmd_per_lun = TW_MAX_CMDS_PER_LUN, |
| 1992 | .use_clustering = ENABLE_CLUSTERING, | 1992 | .use_clustering = ENABLE_CLUSTERING, |
| 1993 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1994 | .shost_attrs = twa_host_attrs, | 1993 | .shost_attrs = twa_host_attrs, |
| 1995 | .emulated = 1 | 1994 | .emulated = 1 |
| 1996 | }; | 1995 | }; |
diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index 59716ebeb10c..d09532162217 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c | |||
| @@ -2261,7 +2261,6 @@ static struct scsi_host_template driver_template = { | |||
| 2261 | .max_sectors = TW_MAX_SECTORS, | 2261 | .max_sectors = TW_MAX_SECTORS, |
| 2262 | .cmd_per_lun = TW_MAX_CMDS_PER_LUN, | 2262 | .cmd_per_lun = TW_MAX_CMDS_PER_LUN, |
| 2263 | .use_clustering = ENABLE_CLUSTERING, | 2263 | .use_clustering = ENABLE_CLUSTERING, |
| 2264 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 2265 | .shost_attrs = tw_host_attrs, | 2264 | .shost_attrs = tw_host_attrs, |
| 2266 | .emulated = 1 | 2265 | .emulated = 1 |
| 2267 | }; | 2266 | }; |
diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index ead47c143ce0..4d3ebb1af490 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c | |||
| @@ -3575,7 +3575,6 @@ static struct scsi_host_template Bus_Logic_template = { | |||
| 3575 | .unchecked_isa_dma = 1, | 3575 | .unchecked_isa_dma = 1, |
| 3576 | .max_sectors = 128, | 3576 | .max_sectors = 128, |
| 3577 | .use_clustering = ENABLE_CLUSTERING, | 3577 | .use_clustering = ENABLE_CLUSTERING, |
| 3578 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 3579 | }; | 3578 | }; |
| 3580 | 3579 | ||
| 3581 | /* | 3580 | /* |
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 3e161cd66463..14fc7f39e83e 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig | |||
| @@ -345,7 +345,7 @@ config ISCSI_TCP | |||
| 345 | 345 | ||
| 346 | config SGIWD93_SCSI | 346 | config SGIWD93_SCSI |
| 347 | tristate "SGI WD93C93 SCSI Driver" | 347 | tristate "SGI WD93C93 SCSI Driver" |
| 348 | depends on SGI_IP22 && SCSI | 348 | depends on SGI_HAS_WD93 && SCSI |
| 349 | help | 349 | help |
| 350 | If you have a Western Digital WD93 SCSI controller on | 350 | If you have a Western Digital WD93 SCSI controller on |
| 351 | an SGI MIPS system, say Y. Otherwise, say N. | 351 | an SGI MIPS system, say Y. Otherwise, say N. |
diff --git a/drivers/scsi/NCR53c406a.c b/drivers/scsi/NCR53c406a.c index 137d065db3da..6961f78742ae 100644 --- a/drivers/scsi/NCR53c406a.c +++ b/drivers/scsi/NCR53c406a.c | |||
| @@ -1065,7 +1065,6 @@ static struct scsi_host_template driver_template = | |||
| 1065 | .cmd_per_lun = 1 /* commands per lun */, | 1065 | .cmd_per_lun = 1 /* commands per lun */, |
| 1066 | .unchecked_isa_dma = 1 /* unchecked_isa_dma */, | 1066 | .unchecked_isa_dma = 1 /* unchecked_isa_dma */, |
| 1067 | .use_clustering = ENABLE_CLUSTERING, | 1067 | .use_clustering = ENABLE_CLUSTERING, |
| 1068 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1069 | }; | 1068 | }; |
| 1070 | 1069 | ||
| 1071 | #include "scsi_module.c" | 1070 | #include "scsi_module.c" |
diff --git a/drivers/scsi/a100u2w.c b/drivers/scsi/a100u2w.c index d3a6d15fb77a..f608d4a1d6da 100644 --- a/drivers/scsi/a100u2w.c +++ b/drivers/scsi/a100u2w.c | |||
| @@ -1071,7 +1071,6 @@ static struct scsi_host_template inia100_template = { | |||
| 1071 | .sg_tablesize = SG_ALL, | 1071 | .sg_tablesize = SG_ALL, |
| 1072 | .cmd_per_lun = 1, | 1072 | .cmd_per_lun = 1, |
| 1073 | .use_clustering = ENABLE_CLUSTERING, | 1073 | .use_clustering = ENABLE_CLUSTERING, |
| 1074 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1075 | }; | 1074 | }; |
| 1076 | 1075 | ||
| 1077 | static int __devinit inia100_probe_one(struct pci_dev *pdev, | 1076 | static int __devinit inia100_probe_one(struct pci_dev *pdev, |
diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c index 851a7e599c50..f8afa358b6b6 100644 --- a/drivers/scsi/aacraid/commctrl.c +++ b/drivers/scsi/aacraid/commctrl.c | |||
| @@ -243,7 +243,6 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg) | |||
| 243 | * Search the list of AdapterFibContext addresses on the adapter | 243 | * Search the list of AdapterFibContext addresses on the adapter |
| 244 | * to be sure this is a valid address | 244 | * to be sure this is a valid address |
| 245 | */ | 245 | */ |
| 246 | spin_lock_irqsave(&dev->fib_lock, flags); | ||
| 247 | entry = dev->fib_list.next; | 246 | entry = dev->fib_list.next; |
| 248 | fibctx = NULL; | 247 | fibctx = NULL; |
| 249 | 248 | ||
| @@ -252,25 +251,24 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg) | |||
| 252 | /* | 251 | /* |
| 253 | * Extract the AdapterFibContext from the Input parameters. | 252 | * Extract the AdapterFibContext from the Input parameters. |
| 254 | */ | 253 | */ |
| 255 | if (fibctx->unique == f.fibctx) { /* We found a winner */ | 254 | if (fibctx->unique == f.fibctx) { /* We found a winner */ |
| 256 | break; | 255 | break; |
| 257 | } | 256 | } |
| 258 | entry = entry->next; | 257 | entry = entry->next; |
| 259 | fibctx = NULL; | 258 | fibctx = NULL; |
| 260 | } | 259 | } |
| 261 | if (!fibctx) { | 260 | if (!fibctx) { |
| 262 | spin_unlock_irqrestore(&dev->fib_lock, flags); | ||
| 263 | dprintk ((KERN_INFO "Fib Context not found\n")); | 261 | dprintk ((KERN_INFO "Fib Context not found\n")); |
| 264 | return -EINVAL; | 262 | return -EINVAL; |
| 265 | } | 263 | } |
| 266 | 264 | ||
| 267 | if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) || | 265 | if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) || |
| 268 | (fibctx->size != sizeof(struct aac_fib_context))) { | 266 | (fibctx->size != sizeof(struct aac_fib_context))) { |
| 269 | spin_unlock_irqrestore(&dev->fib_lock, flags); | ||
| 270 | dprintk ((KERN_INFO "Fib Context corrupt?\n")); | 267 | dprintk ((KERN_INFO "Fib Context corrupt?\n")); |
| 271 | return -EINVAL; | 268 | return -EINVAL; |
| 272 | } | 269 | } |
| 273 | status = 0; | 270 | status = 0; |
| 271 | spin_lock_irqsave(&dev->fib_lock, flags); | ||
| 274 | /* | 272 | /* |
| 275 | * If there are no fibs to send back, then either wait or return | 273 | * If there are no fibs to send back, then either wait or return |
| 276 | * -EAGAIN | 274 | * -EAGAIN |
| @@ -328,9 +326,7 @@ return_fib: | |||
| 328 | int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx) | 326 | int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx) |
| 329 | { | 327 | { |
| 330 | struct fib *fib; | 328 | struct fib *fib; |
| 331 | unsigned long flags; | ||
| 332 | 329 | ||
| 333 | spin_lock_irqsave(&dev->fib_lock, flags); | ||
| 334 | /* | 330 | /* |
| 335 | * First free any FIBs that have not been consumed. | 331 | * First free any FIBs that have not been consumed. |
| 336 | */ | 332 | */ |
| @@ -353,7 +349,6 @@ int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx) | |||
| 353 | * Remove the Context from the AdapterFibContext List | 349 | * Remove the Context from the AdapterFibContext List |
| 354 | */ | 350 | */ |
| 355 | list_del(&fibctx->next); | 351 | list_del(&fibctx->next); |
| 356 | spin_unlock_irqrestore(&dev->fib_lock, flags); | ||
| 357 | /* | 352 | /* |
| 358 | * Invalidate context | 353 | * Invalidate context |
| 359 | */ | 354 | */ |
| @@ -419,8 +414,8 @@ static int close_getadapter_fib(struct aac_dev * dev, void __user *arg) | |||
| 419 | * @arg: ioctl arguments | 414 | * @arg: ioctl arguments |
| 420 | * | 415 | * |
| 421 | * This routine returns the driver version. | 416 | * This routine returns the driver version. |
| 422 | * Under Linux, there have been no version incompatibilities, so this is | 417 | * Under Linux, there have been no version incompatibilities, so this is |
| 423 | * simple! | 418 | * simple! |
| 424 | */ | 419 | */ |
| 425 | 420 | ||
| 426 | static int check_revision(struct aac_dev *dev, void __user *arg) | 421 | static int check_revision(struct aac_dev *dev, void __user *arg) |
| @@ -468,7 +463,7 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg) | |||
| 468 | u32 data_dir; | 463 | u32 data_dir; |
| 469 | void __user *sg_user[32]; | 464 | void __user *sg_user[32]; |
| 470 | void *sg_list[32]; | 465 | void *sg_list[32]; |
| 471 | u32 sg_indx = 0; | 466 | u32 sg_indx = 0; |
| 472 | u32 byte_count = 0; | 467 | u32 byte_count = 0; |
| 473 | u32 actual_fibsize64, actual_fibsize = 0; | 468 | u32 actual_fibsize64, actual_fibsize = 0; |
| 474 | int i; | 469 | int i; |
| @@ -522,11 +517,11 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg) | |||
| 522 | // Fix up srb for endian and force some values | 517 | // Fix up srb for endian and force some values |
| 523 | 518 | ||
| 524 | srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi); // Force this | 519 | srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi); // Force this |
| 525 | srbcmd->channel = cpu_to_le32(user_srbcmd->channel); | 520 | srbcmd->channel = cpu_to_le32(user_srbcmd->channel); |
| 526 | srbcmd->id = cpu_to_le32(user_srbcmd->id); | 521 | srbcmd->id = cpu_to_le32(user_srbcmd->id); |
| 527 | srbcmd->lun = cpu_to_le32(user_srbcmd->lun); | 522 | srbcmd->lun = cpu_to_le32(user_srbcmd->lun); |
| 528 | srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout); | 523 | srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout); |
| 529 | srbcmd->flags = cpu_to_le32(flags); | 524 | srbcmd->flags = cpu_to_le32(flags); |
| 530 | srbcmd->retry_limit = 0; // Obsolete parameter | 525 | srbcmd->retry_limit = 0; // Obsolete parameter |
| 531 | srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size); | 526 | srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size); |
| 532 | memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb)); | 527 | memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb)); |
| @@ -791,9 +786,9 @@ static int aac_get_pci_info(struct aac_dev* dev, void __user *arg) | |||
| 791 | pci_info.bus = dev->pdev->bus->number; | 786 | pci_info.bus = dev->pdev->bus->number; |
| 792 | pci_info.slot = PCI_SLOT(dev->pdev->devfn); | 787 | pci_info.slot = PCI_SLOT(dev->pdev->devfn); |
| 793 | 788 | ||
| 794 | if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) { | 789 | if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) { |
| 795 | dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n")); | 790 | dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n")); |
| 796 | return -EFAULT; | 791 | return -EFAULT; |
| 797 | } | 792 | } |
| 798 | return 0; | 793 | return 0; |
| 799 | } | 794 | } |
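The commctrl.c hunks shrink the fib_lock critical sections: the context lookup and its sanity checks now run unlocked, and the lock is taken only once the code actually manipulates the FIB list. A compressed sketch of the resulting shape, using a pthread mutex as a stand-in for the driver's spinlock and entirely hypothetical names; note the trade-off, which the sketch does not capture, that unlocked validation is safe only if contexts cannot vanish concurrently:

#include <pthread.h>

static pthread_mutex_t fib_lock = PTHREAD_MUTEX_INITIALIZER;

struct fibctx_sketch { int valid; int nfibs; };

static int validate(struct fibctx_sketch *ctx) { return ctx->valid; }

/* After the patch: find and validate first, lock only around the
 * part that touches the shared FIB list. */
static int next_fib_sketch(struct fibctx_sketch *ctx)
{
	if (!validate(ctx))
		return -1;		/* -EINVAL in the driver */

	pthread_mutex_lock(&fib_lock);
	int have = ctx->nfibs > 0 ? ctx->nfibs-- : 0;
	pthread_mutex_unlock(&fib_lock);

	return have ? 0 : -2;		/* -EAGAIN when the list is empty */
}

int main(void)
{
	struct fibctx_sketch ctx = { 1, 1 };
	return next_fib_sketch(&ctx);
}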
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 61be22774e99..0e8267c1e915 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c | |||
| @@ -1032,7 +1032,6 @@ static struct scsi_host_template aac_driver_template = { | |||
| 1032 | .cmd_per_lun = AAC_NUM_IO_FIB, | 1032 | .cmd_per_lun = AAC_NUM_IO_FIB, |
| 1033 | #endif | 1033 | #endif |
| 1034 | .use_clustering = ENABLE_CLUSTERING, | 1034 | .use_clustering = ENABLE_CLUSTERING, |
| 1035 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1036 | .emulated = 1, | 1035 | .emulated = 1, |
| 1037 | }; | 1036 | }; |
| 1038 | 1037 | ||
diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c index be58a0b097c7..7c45d88a205b 100644 --- a/drivers/scsi/aha1740.c +++ b/drivers/scsi/aha1740.c | |||
| @@ -563,7 +563,6 @@ static struct scsi_host_template aha1740_template = { | |||
| 563 | .sg_tablesize = AHA1740_SCATTER, | 563 | .sg_tablesize = AHA1740_SCATTER, |
| 564 | .cmd_per_lun = AHA1740_CMDLUN, | 564 | .cmd_per_lun = AHA1740_CMDLUN, |
| 565 | .use_clustering = ENABLE_CLUSTERING, | 565 | .use_clustering = ENABLE_CLUSTERING, |
| 566 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 567 | .eh_abort_handler = aha1740_eh_abort_handler, | 566 | .eh_abort_handler = aha1740_eh_abort_handler, |
| 568 | }; | 567 | }; |
| 569 | 568 | ||
diff --git a/drivers/scsi/aic7xxx/aic79xx.h b/drivers/scsi/aic7xxx/aic79xx.h index ce638aa6005a..2f00467b6b8c 100644 --- a/drivers/scsi/aic7xxx/aic79xx.h +++ b/drivers/scsi/aic7xxx/aic79xx.h | |||
| @@ -1340,8 +1340,10 @@ struct ahd_pci_identity *ahd_find_pci_device(ahd_dev_softc_t); | |||
| 1340 | int ahd_pci_config(struct ahd_softc *, | 1340 | int ahd_pci_config(struct ahd_softc *, |
| 1341 | struct ahd_pci_identity *); | 1341 | struct ahd_pci_identity *); |
| 1342 | int ahd_pci_test_register_access(struct ahd_softc *); | 1342 | int ahd_pci_test_register_access(struct ahd_softc *); |
| 1343 | #ifdef CONFIG_PM | ||
| 1343 | void ahd_pci_suspend(struct ahd_softc *); | 1344 | void ahd_pci_suspend(struct ahd_softc *); |
| 1344 | void ahd_pci_resume(struct ahd_softc *); | 1345 | void ahd_pci_resume(struct ahd_softc *); |
| 1346 | #endif | ||
| 1345 | 1347 | ||
| 1346 | /************************** SCB and SCB queue management **********************/ | 1348 | /************************** SCB and SCB queue management **********************/ |
| 1347 | void ahd_qinfifo_requeue_tail(struct ahd_softc *ahd, | 1349 | void ahd_qinfifo_requeue_tail(struct ahd_softc *ahd, |
| @@ -1352,8 +1354,10 @@ struct ahd_softc *ahd_alloc(void *platform_arg, char *name); | |||
| 1352 | int ahd_softc_init(struct ahd_softc *); | 1354 | int ahd_softc_init(struct ahd_softc *); |
| 1353 | void ahd_controller_info(struct ahd_softc *ahd, char *buf); | 1355 | void ahd_controller_info(struct ahd_softc *ahd, char *buf); |
| 1354 | int ahd_init(struct ahd_softc *ahd); | 1356 | int ahd_init(struct ahd_softc *ahd); |
| 1357 | #ifdef CONFIG_PM | ||
| 1355 | int ahd_suspend(struct ahd_softc *ahd); | 1358 | int ahd_suspend(struct ahd_softc *ahd); |
| 1356 | void ahd_resume(struct ahd_softc *ahd); | 1359 | void ahd_resume(struct ahd_softc *ahd); |
| 1360 | #endif | ||
| 1357 | int ahd_default_config(struct ahd_softc *ahd); | 1361 | int ahd_default_config(struct ahd_softc *ahd); |
| 1358 | int ahd_parse_vpddata(struct ahd_softc *ahd, | 1362 | int ahd_parse_vpddata(struct ahd_softc *ahd, |
| 1359 | struct vpd_config *vpd); | 1363 | struct vpd_config *vpd); |
| @@ -1361,7 +1365,6 @@ int ahd_parse_cfgdata(struct ahd_softc *ahd, | |||
| 1361 | struct seeprom_config *sc); | 1365 | struct seeprom_config *sc); |
| 1362 | void ahd_intr_enable(struct ahd_softc *ahd, int enable); | 1366 | void ahd_intr_enable(struct ahd_softc *ahd, int enable); |
| 1363 | void ahd_pause_and_flushwork(struct ahd_softc *ahd); | 1367 | void ahd_pause_and_flushwork(struct ahd_softc *ahd); |
| 1364 | int ahd_suspend(struct ahd_softc *ahd); | ||
| 1365 | void ahd_set_unit(struct ahd_softc *, int); | 1368 | void ahd_set_unit(struct ahd_softc *, int); |
| 1366 | void ahd_set_name(struct ahd_softc *, char *); | 1369 | void ahd_set_name(struct ahd_softc *, char *); |
| 1367 | struct scb *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx); | 1370 | struct scb *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx); |
diff --git a/drivers/scsi/aic7xxx/aic79xx_core.c b/drivers/scsi/aic7xxx/aic79xx_core.c index a7dd8cdda472..ade0fb8fbdb2 100644 --- a/drivers/scsi/aic7xxx/aic79xx_core.c +++ b/drivers/scsi/aic7xxx/aic79xx_core.c | |||
| @@ -7175,6 +7175,7 @@ ahd_pause_and_flushwork(struct ahd_softc *ahd) | |||
| 7175 | ahd->flags &= ~AHD_ALL_INTERRUPTS; | 7175 | ahd->flags &= ~AHD_ALL_INTERRUPTS; |
| 7176 | } | 7176 | } |
| 7177 | 7177 | ||
| 7178 | #ifdef CONFIG_PM | ||
| 7178 | int | 7179 | int |
| 7179 | ahd_suspend(struct ahd_softc *ahd) | 7180 | ahd_suspend(struct ahd_softc *ahd) |
| 7180 | { | 7181 | { |
| @@ -7197,6 +7198,7 @@ ahd_resume(struct ahd_softc *ahd) | |||
| 7197 | ahd_intr_enable(ahd, TRUE); | 7198 | ahd_intr_enable(ahd, TRUE); |
| 7198 | ahd_restart(ahd); | 7199 | ahd_restart(ahd); |
| 7199 | } | 7200 | } |
| 7201 | #endif | ||
| 7200 | 7202 | ||
| 7201 | /************************** Busy Target Table *********************************/ | 7203 | /************************** Busy Target Table *********************************/ |
| 7202 | /* | 7204 | /* |
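The aic79xx (and, below, aic7xxx) hunks apply one idiom throughout: suspend/resume bodies and their prototypes are wrapped in #ifdef CONFIG_PM, so the PM paths compile out entirely on kernels built without power management, and any stray caller fails at build time rather than linking against dead code. The idiom in miniature, with generic stand-in names; build with -DCONFIG_PM to compile the PM paths in:

struct softc_sketch { int paused; };

#ifdef CONFIG_PM
static int sketch_suspend(struct softc_sketch *sc)
{
	sc->paused = 1;		/* quiesce the controller */
	return 0;
}

static void sketch_resume(struct softc_sketch *sc)
{
	sc->paused = 0;		/* reinitialize and restart */
}
#endif

int main(void)
{
	struct softc_sketch sc = { 0 };
#ifdef CONFIG_PM
	if (sketch_suspend(&sc) == 0)
		sketch_resume(&sc);
#endif
	return sc.paused;
}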
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c index 0e4708fd43c8..014654792901 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm.c | |||
| @@ -766,7 +766,6 @@ struct scsi_host_template aic79xx_driver_template = { | |||
| 766 | .max_sectors = 8192, | 766 | .max_sectors = 8192, |
| 767 | .cmd_per_lun = 2, | 767 | .cmd_per_lun = 2, |
| 768 | .use_clustering = ENABLE_CLUSTERING, | 768 | .use_clustering = ENABLE_CLUSTERING, |
| 769 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 770 | .slave_alloc = ahd_linux_slave_alloc, | 769 | .slave_alloc = ahd_linux_slave_alloc, |
| 771 | .slave_configure = ahd_linux_slave_configure, | 770 | .slave_configure = ahd_linux_slave_configure, |
| 772 | .target_alloc = ahd_linux_target_alloc, | 771 | .target_alloc = ahd_linux_target_alloc, |
| @@ -1922,7 +1921,7 @@ ahd_linux_queue_cmd_complete(struct ahd_softc *ahd, struct scsi_cmnd *cmd) | |||
| 1922 | struct scsi_sense_data *sense; | 1921 | struct scsi_sense_data *sense; |
| 1923 | 1922 | ||
| 1924 | sense = (struct scsi_sense_data *) | 1923 | sense = (struct scsi_sense_data *) |
| 1925 | &cmd->sense_buffer; | 1924 | cmd->sense_buffer; |
| 1926 | if (sense->extra_len >= 5 && | 1925 | if (sense->extra_len >= 5 && |
| 1927 | (sense->add_sense_code == 0x47 | 1926 | (sense->add_sense_code == 0x47 |
| 1928 | || sense->add_sense_code == 0x48)) | 1927 | || sense->add_sense_code == 0x48)) |
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c index 66f0259edb69..4150c8a8fdc2 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c | |||
| @@ -43,17 +43,6 @@ | |||
| 43 | #include "aic79xx_inline.h" | 43 | #include "aic79xx_inline.h" |
| 44 | #include "aic79xx_pci.h" | 44 | #include "aic79xx_pci.h" |
| 45 | 45 | ||
| 46 | static int ahd_linux_pci_dev_probe(struct pci_dev *pdev, | ||
| 47 | const struct pci_device_id *ent); | ||
| 48 | static int ahd_linux_pci_reserve_io_regions(struct ahd_softc *ahd, | ||
| 49 | u_long *base, u_long *base2); | ||
| 50 | static int ahd_linux_pci_reserve_mem_region(struct ahd_softc *ahd, | ||
| 51 | u_long *bus_addr, | ||
| 52 | uint8_t __iomem **maddr); | ||
| 53 | static int ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg); | ||
| 54 | static int ahd_linux_pci_dev_resume(struct pci_dev *pdev); | ||
| 55 | static void ahd_linux_pci_dev_remove(struct pci_dev *pdev); | ||
| 56 | |||
| 57 | /* Define the macro locally since it's different for different class of chips. | 46 | /* Define the macro locally since it's different for different class of chips. |
| 58 | */ | 47 | */ |
| 59 | #define ID(x) \ | 48 | #define ID(x) \ |
| @@ -85,17 +74,7 @@ static struct pci_device_id ahd_linux_pci_id_table[] = { | |||
| 85 | 74 | ||
| 86 | MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table); | 75 | MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table); |
| 87 | 76 | ||
| 88 | static struct pci_driver aic79xx_pci_driver = { | ||
| 89 | .name = "aic79xx", | ||
| 90 | .probe = ahd_linux_pci_dev_probe, | ||
| 91 | #ifdef CONFIG_PM | 77 | #ifdef CONFIG_PM |
| 92 | .suspend = ahd_linux_pci_dev_suspend, | ||
| 93 | .resume = ahd_linux_pci_dev_resume, | ||
| 94 | #endif | ||
| 95 | .remove = ahd_linux_pci_dev_remove, | ||
| 96 | .id_table = ahd_linux_pci_id_table | ||
| 97 | }; | ||
| 98 | |||
| 99 | static int | 78 | static int |
| 100 | ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) | 79 | ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) |
| 101 | { | 80 | { |
| @@ -139,6 +118,7 @@ ahd_linux_pci_dev_resume(struct pci_dev *pdev) | |||
| 139 | 118 | ||
| 140 | return rc; | 119 | return rc; |
| 141 | } | 120 | } |
| 121 | #endif | ||
| 142 | 122 | ||
| 143 | static void | 123 | static void |
| 144 | ahd_linux_pci_dev_remove(struct pci_dev *pdev) | 124 | ahd_linux_pci_dev_remove(struct pci_dev *pdev) |
| @@ -245,6 +225,17 @@ ahd_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) | |||
| 245 | return (0); | 225 | return (0); |
| 246 | } | 226 | } |
| 247 | 227 | ||
| 228 | static struct pci_driver aic79xx_pci_driver = { | ||
| 229 | .name = "aic79xx", | ||
| 230 | .probe = ahd_linux_pci_dev_probe, | ||
| 231 | #ifdef CONFIG_PM | ||
| 232 | .suspend = ahd_linux_pci_dev_suspend, | ||
| 233 | .resume = ahd_linux_pci_dev_resume, | ||
| 234 | #endif | ||
| 235 | .remove = ahd_linux_pci_dev_remove, | ||
| 236 | .id_table = ahd_linux_pci_id_table | ||
| 237 | }; | ||
| 238 | |||
| 248 | int | 239 | int |
| 249 | ahd_linux_pci_init(void) | 240 | ahd_linux_pci_init(void) |
| 250 | { | 241 | { |
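Both *_osm_pci.c files get the same mechanical reordering: the pci_driver table moves below the callbacks it references, which lets the whole block of static forward declarations be deleted and keeps the #ifdef CONFIG_PM in one place. The idiom in miniature; the struct and names here are generic stand-ins, not the real pci_driver layout:

struct driver_ops_sketch {
	int  (*probe)(void);
	void (*remove)(void);
};

static int  sketch_probe(void)  { return 0; }
static void sketch_remove(void) { }

/* Defined after the callbacks, so no forward declarations are needed. */
static struct driver_ops_sketch sketch_driver = {
	.probe  = sketch_probe,
	.remove = sketch_remove,
};

int main(void)
{
	int rc = sketch_driver.probe();
	sketch_driver.remove();
	return rc;
}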
diff --git a/drivers/scsi/aic7xxx/aic79xx_pci.c b/drivers/scsi/aic7xxx/aic79xx_pci.c index 7a203a90601a..df853676e66a 100644 --- a/drivers/scsi/aic7xxx/aic79xx_pci.c +++ b/drivers/scsi/aic7xxx/aic79xx_pci.c | |||
| @@ -389,6 +389,7 @@ ahd_pci_config(struct ahd_softc *ahd, struct ahd_pci_identity *entry) | |||
| 389 | return error; | 389 | return error; |
| 390 | } | 390 | } |
| 391 | 391 | ||
| 392 | #ifdef CONFIG_PM | ||
| 392 | void | 393 | void |
| 393 | ahd_pci_suspend(struct ahd_softc *ahd) | 394 | ahd_pci_suspend(struct ahd_softc *ahd) |
| 394 | { | 395 | { |
| @@ -415,6 +416,7 @@ ahd_pci_resume(struct ahd_softc *ahd) | |||
| 415 | ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME, | 416 | ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME, |
| 416 | ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1); | 417 | ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1); |
| 417 | } | 418 | } |
| 419 | #endif | ||
| 418 | 420 | ||
| 419 | /* | 421 | /* |
| 420 | * Perform some simple tests that should catch situations where | 422 | * Perform some simple tests that should catch situations where |
diff --git a/drivers/scsi/aic7xxx/aic7xxx.h b/drivers/scsi/aic7xxx/aic7xxx.h index 3d4e42d90452..c0344e617651 100644 --- a/drivers/scsi/aic7xxx/aic7xxx.h +++ b/drivers/scsi/aic7xxx/aic7xxx.h | |||
| @@ -1143,7 +1143,9 @@ struct ahc_pci_identity *ahc_find_pci_device(ahc_dev_softc_t); | |||
| 1143 | int ahc_pci_config(struct ahc_softc *, | 1143 | int ahc_pci_config(struct ahc_softc *, |
| 1144 | struct ahc_pci_identity *); | 1144 | struct ahc_pci_identity *); |
| 1145 | int ahc_pci_test_register_access(struct ahc_softc *); | 1145 | int ahc_pci_test_register_access(struct ahc_softc *); |
| 1146 | #ifdef CONFIG_PM | ||
| 1146 | void ahc_pci_resume(struct ahc_softc *ahc); | 1147 | void ahc_pci_resume(struct ahc_softc *ahc); |
| 1148 | #endif | ||
| 1147 | 1149 | ||
| 1148 | /*************************** EISA/VL Front End ********************************/ | 1150 | /*************************** EISA/VL Front End ********************************/ |
| 1149 | struct aic7770_identity *aic7770_find_device(uint32_t); | 1151 | struct aic7770_identity *aic7770_find_device(uint32_t); |
| @@ -1170,8 +1172,10 @@ int ahc_chip_init(struct ahc_softc *ahc); | |||
| 1170 | int ahc_init(struct ahc_softc *ahc); | 1172 | int ahc_init(struct ahc_softc *ahc); |
| 1171 | void ahc_intr_enable(struct ahc_softc *ahc, int enable); | 1173 | void ahc_intr_enable(struct ahc_softc *ahc, int enable); |
| 1172 | void ahc_pause_and_flushwork(struct ahc_softc *ahc); | 1174 | void ahc_pause_and_flushwork(struct ahc_softc *ahc); |
| 1175 | #ifdef CONFIG_PM | ||
| 1173 | int ahc_suspend(struct ahc_softc *ahc); | 1176 | int ahc_suspend(struct ahc_softc *ahc); |
| 1174 | int ahc_resume(struct ahc_softc *ahc); | 1177 | int ahc_resume(struct ahc_softc *ahc); |
| 1178 | #endif | ||
| 1175 | void ahc_set_unit(struct ahc_softc *, int); | 1179 | void ahc_set_unit(struct ahc_softc *, int); |
| 1176 | void ahc_set_name(struct ahc_softc *, char *); | 1180 | void ahc_set_name(struct ahc_softc *, char *); |
| 1177 | void ahc_alloc_scbs(struct ahc_softc *ahc); | 1181 | void ahc_alloc_scbs(struct ahc_softc *ahc); |
diff --git a/drivers/scsi/aic7xxx/aic7xxx_core.c b/drivers/scsi/aic7xxx/aic7xxx_core.c index f350b5e89e76..6d2ae641273c 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_core.c +++ b/drivers/scsi/aic7xxx/aic7xxx_core.c | |||
| @@ -5078,6 +5078,7 @@ ahc_pause_and_flushwork(struct ahc_softc *ahc) | |||
| 5078 | ahc->flags &= ~AHC_ALL_INTERRUPTS; | 5078 | ahc->flags &= ~AHC_ALL_INTERRUPTS; |
| 5079 | } | 5079 | } |
| 5080 | 5080 | ||
| 5081 | #ifdef CONFIG_PM | ||
| 5081 | int | 5082 | int |
| 5082 | ahc_suspend(struct ahc_softc *ahc) | 5083 | ahc_suspend(struct ahc_softc *ahc) |
| 5083 | { | 5084 | { |
| @@ -5113,7 +5114,7 @@ ahc_resume(struct ahc_softc *ahc) | |||
| 5113 | ahc_restart(ahc); | 5114 | ahc_restart(ahc); |
| 5114 | return (0); | 5115 | return (0); |
| 5115 | } | 5116 | } |
| 5116 | 5117 | #endif | |
| 5117 | /************************** Busy Target Table *********************************/ | 5118 | /************************** Busy Target Table *********************************/ |
| 5118 | /* | 5119 | /* |
| 5119 | * Return the untagged transaction id for a given target/channel lun. | 5120 | * Return the untagged transaction id for a given target/channel lun. |
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c index e310e414067f..99a3b33a3233 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_osm.c +++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c | |||
| @@ -747,7 +747,6 @@ struct scsi_host_template aic7xxx_driver_template = { | |||
| 747 | .max_sectors = 8192, | 747 | .max_sectors = 8192, |
| 748 | .cmd_per_lun = 2, | 748 | .cmd_per_lun = 2, |
| 749 | .use_clustering = ENABLE_CLUSTERING, | 749 | .use_clustering = ENABLE_CLUSTERING, |
| 750 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 751 | .slave_alloc = ahc_linux_slave_alloc, | 750 | .slave_alloc = ahc_linux_slave_alloc, |
| 752 | .slave_configure = ahc_linux_slave_configure, | 751 | .slave_configure = ahc_linux_slave_configure, |
| 753 | .target_alloc = ahc_linux_target_alloc, | 752 | .target_alloc = ahc_linux_target_alloc, |
| @@ -1658,9 +1657,12 @@ ahc_done(struct ahc_softc *ahc, struct scb *scb) | |||
| 1658 | untagged_q = &(ahc->untagged_queues[target_offset]); | 1657 | untagged_q = &(ahc->untagged_queues[target_offset]); |
| 1659 | TAILQ_REMOVE(untagged_q, scb, links.tqe); | 1658 | TAILQ_REMOVE(untagged_q, scb, links.tqe); |
| 1660 | BUG_ON(!TAILQ_EMPTY(untagged_q)); | 1659 | BUG_ON(!TAILQ_EMPTY(untagged_q)); |
| 1661 | } | 1660 | } else if ((scb->flags & SCB_ACTIVE) == 0) { |
| 1662 | 1661 | /* | |
| 1663 | if ((scb->flags & SCB_ACTIVE) == 0) { | 1662 | * Transactions aborted from the untagged queue may |
| 1663 | * not have been dispatched to the controller, so | ||
| 1664 | * only check the SCB_ACTIVE flag for tagged transactions. | ||
| 1665 | */ | ||
| 1664 | printf("SCB %d done'd twice\n", scb->hscb->tag); | 1666 | printf("SCB %d done'd twice\n", scb->hscb->tag); |
| 1665 | ahc_dump_card_state(ahc); | 1667 | ahc_dump_card_state(ahc); |
| 1666 | panic("Stopping for safety"); | 1668 | panic("Stopping for safety"); |
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c index 4488946cff2e..dd6e21d6f1dd 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c +++ b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c | |||
| @@ -42,17 +42,6 @@ | |||
| 42 | #include "aic7xxx_osm.h" | 42 | #include "aic7xxx_osm.h" |
| 43 | #include "aic7xxx_pci.h" | 43 | #include "aic7xxx_pci.h" |
| 44 | 44 | ||
| 45 | static int ahc_linux_pci_dev_probe(struct pci_dev *pdev, | ||
| 46 | const struct pci_device_id *ent); | ||
| 47 | static int ahc_linux_pci_reserve_io_region(struct ahc_softc *ahc, | ||
| 48 | u_long *base); | ||
| 49 | static int ahc_linux_pci_reserve_mem_region(struct ahc_softc *ahc, | ||
| 50 | u_long *bus_addr, | ||
| 51 | uint8_t __iomem **maddr); | ||
| 52 | static int ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg); | ||
| 53 | static int ahc_linux_pci_dev_resume(struct pci_dev *pdev); | ||
| 54 | static void ahc_linux_pci_dev_remove(struct pci_dev *pdev); | ||
| 55 | |||
| 56 | /* Define the macro locally since it's different for different class of chips. | 45 | /* Define the macro locally since it's different for different class of chips. |
| 57 | */ | 46 | */ |
| 58 | #define ID(x) ID_C(x, PCI_CLASS_STORAGE_SCSI) | 47 | #define ID(x) ID_C(x, PCI_CLASS_STORAGE_SCSI) |
| @@ -132,17 +121,7 @@ static struct pci_device_id ahc_linux_pci_id_table[] = { | |||
| 132 | 121 | ||
| 133 | MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table); | 122 | MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table); |
| 134 | 123 | ||
| 135 | static struct pci_driver aic7xxx_pci_driver = { | ||
| 136 | .name = "aic7xxx", | ||
| 137 | .probe = ahc_linux_pci_dev_probe, | ||
| 138 | #ifdef CONFIG_PM | 124 | #ifdef CONFIG_PM |
| 139 | .suspend = ahc_linux_pci_dev_suspend, | ||
| 140 | .resume = ahc_linux_pci_dev_resume, | ||
| 141 | #endif | ||
| 142 | .remove = ahc_linux_pci_dev_remove, | ||
| 143 | .id_table = ahc_linux_pci_id_table | ||
| 144 | }; | ||
| 145 | |||
| 146 | static int | 125 | static int |
| 147 | ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) | 126 | ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) |
| 148 | { | 127 | { |
| @@ -182,6 +161,7 @@ ahc_linux_pci_dev_resume(struct pci_dev *pdev) | |||
| 182 | 161 | ||
| 183 | return (ahc_resume(ahc)); | 162 | return (ahc_resume(ahc)); |
| 184 | } | 163 | } |
| 164 | #endif | ||
| 185 | 165 | ||
| 186 | static void | 166 | static void |
| 187 | ahc_linux_pci_dev_remove(struct pci_dev *pdev) | 167 | ahc_linux_pci_dev_remove(struct pci_dev *pdev) |
| @@ -289,6 +269,17 @@ ahc_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) | |||
| 289 | return (0); | 269 | return (0); |
| 290 | } | 270 | } |
| 291 | 271 | ||
| 272 | static struct pci_driver aic7xxx_pci_driver = { | ||
| 273 | .name = "aic7xxx", | ||
| 274 | .probe = ahc_linux_pci_dev_probe, | ||
| 275 | #ifdef CONFIG_PM | ||
| 276 | .suspend = ahc_linux_pci_dev_suspend, | ||
| 277 | .resume = ahc_linux_pci_dev_resume, | ||
| 278 | #endif | ||
| 279 | .remove = ahc_linux_pci_dev_remove, | ||
| 280 | .id_table = ahc_linux_pci_id_table | ||
| 281 | }; | ||
| 282 | |||
| 292 | int | 283 | int |
| 293 | ahc_linux_pci_init(void) | 284 | ahc_linux_pci_init(void) |
| 294 | { | 285 | { |
diff --git a/drivers/scsi/aic7xxx/aic7xxx_pci.c b/drivers/scsi/aic7xxx/aic7xxx_pci.c index ae35937b8055..56848f41e4f9 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_pci.c +++ b/drivers/scsi/aic7xxx/aic7xxx_pci.c | |||
| @@ -2020,6 +2020,7 @@ ahc_pci_chip_init(struct ahc_softc *ahc) | |||
| 2020 | return (ahc_chip_init(ahc)); | 2020 | return (ahc_chip_init(ahc)); |
| 2021 | } | 2021 | } |
| 2022 | 2022 | ||
| 2023 | #ifdef CONFIG_PM | ||
| 2023 | void | 2024 | void |
| 2024 | ahc_pci_resume(struct ahc_softc *ahc) | 2025 | ahc_pci_resume(struct ahc_softc *ahc) |
| 2025 | { | 2026 | { |
| @@ -2051,6 +2052,7 @@ ahc_pci_resume(struct ahc_softc *ahc) | |||
| 2051 | ahc_release_seeprom(&sd); | 2052 | ahc_release_seeprom(&sd); |
| 2052 | } | 2053 | } |
| 2053 | } | 2054 | } |
| 2055 | #endif | ||
| 2054 | 2056 | ||
| 2055 | static int | 2057 | static int |
| 2056 | ahc_aic785X_setup(struct ahc_softc *ahc) | 2058 | ahc_aic785X_setup(struct ahc_softc *ahc) |
diff --git a/drivers/scsi/aic7xxx_old.c b/drivers/scsi/aic7xxx_old.c index bcb0b870320c..3bfd9296bbfa 100644 --- a/drivers/scsi/aic7xxx_old.c +++ b/drivers/scsi/aic7xxx_old.c | |||
| @@ -11141,7 +11141,6 @@ static struct scsi_host_template driver_template = { | |||
| 11141 | .max_sectors = 2048, | 11141 | .max_sectors = 2048, |
| 11142 | .cmd_per_lun = 3, | 11142 | .cmd_per_lun = 3, |
| 11143 | .use_clustering = ENABLE_CLUSTERING, | 11143 | .use_clustering = ENABLE_CLUSTERING, |
| 11144 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 11145 | }; | 11144 | }; |
| 11146 | 11145 | ||
| 11147 | #include "scsi_module.c" | 11146 | #include "scsi_module.c" |
diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index d80dba913a75..f4a202e8df26 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c | |||
| @@ -122,7 +122,6 @@ static struct scsi_host_template arcmsr_scsi_host_template = { | |||
| 122 | .max_sectors = ARCMSR_MAX_XFER_SECTORS, | 122 | .max_sectors = ARCMSR_MAX_XFER_SECTORS, |
| 123 | .cmd_per_lun = ARCMSR_MAX_CMD_PERLUN, | 123 | .cmd_per_lun = ARCMSR_MAX_CMD_PERLUN, |
| 124 | .use_clustering = ENABLE_CLUSTERING, | 124 | .use_clustering = ENABLE_CLUSTERING, |
| 125 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 126 | .shost_attrs = arcmsr_host_attrs, | 125 | .shost_attrs = arcmsr_host_attrs, |
| 127 | }; | 126 | }; |
| 128 | #ifdef CONFIG_SCSI_ARCMSR_AER | 127 | #ifdef CONFIG_SCSI_ARCMSR_AER |
diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c index f93c73c0ba53..22ef3716e786 100644 --- a/drivers/scsi/dc395x.c +++ b/drivers/scsi/dc395x.c | |||
| @@ -4763,7 +4763,6 @@ static struct scsi_host_template dc395x_driver_template = { | |||
| 4763 | .eh_bus_reset_handler = dc395x_eh_bus_reset, | 4763 | .eh_bus_reset_handler = dc395x_eh_bus_reset, |
| 4764 | .unchecked_isa_dma = 0, | 4764 | .unchecked_isa_dma = 0, |
| 4765 | .use_clustering = DISABLE_CLUSTERING, | 4765 | .use_clustering = DISABLE_CLUSTERING, |
| 4766 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 4767 | }; | 4766 | }; |
| 4768 | 4767 | ||
| 4769 | 4768 | ||
diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c index 19cce125124c..c9dd8392aab2 100644 --- a/drivers/scsi/dpt_i2o.c +++ b/drivers/scsi/dpt_i2o.c | |||
| @@ -3340,7 +3340,6 @@ static struct scsi_host_template driver_template = { | |||
| 3340 | .this_id = 7, | 3340 | .this_id = 7, |
| 3341 | .cmd_per_lun = 1, | 3341 | .cmd_per_lun = 1, |
| 3342 | .use_clustering = ENABLE_CLUSTERING, | 3342 | .use_clustering = ENABLE_CLUSTERING, |
| 3343 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 3344 | }; | 3343 | }; |
| 3345 | #include "scsi_module.c" | 3344 | #include "scsi_module.c" |
| 3346 | MODULE_LICENSE("GPL"); | 3345 | MODULE_LICENSE("GPL"); |
diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c index 05163cefec12..8be3d76656fa 100644 --- a/drivers/scsi/eata.c +++ b/drivers/scsi/eata.c | |||
| @@ -524,7 +524,6 @@ static struct scsi_host_template driver_template = { | |||
| 524 | .this_id = 7, | 524 | .this_id = 7, |
| 525 | .unchecked_isa_dma = 1, | 525 | .unchecked_isa_dma = 1, |
| 526 | .use_clustering = ENABLE_CLUSTERING, | 526 | .use_clustering = ENABLE_CLUSTERING, |
| 527 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 528 | }; | 527 | }; |
| 529 | 528 | ||
| 530 | #if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) | 529 | #if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) |
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index 5ea1f986220c..880c78bff0e1 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c | |||
| @@ -342,7 +342,6 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) | |||
| 342 | shost->use_clustering = sht->use_clustering; | 342 | shost->use_clustering = sht->use_clustering; |
| 343 | shost->ordered_tag = sht->ordered_tag; | 343 | shost->ordered_tag = sht->ordered_tag; |
| 344 | shost->active_mode = sht->supported_mode; | 344 | shost->active_mode = sht->supported_mode; |
| 345 | shost->use_sg_chaining = sht->use_sg_chaining; | ||
| 346 | 345 | ||
| 347 | if (sht->supported_mode == MODE_UNKNOWN) | 346 | if (sht->supported_mode == MODE_UNKNOWN) |
| 348 | /* means we didn't set it ... default to INITIATOR */ | 347 | /* means we didn't set it ... default to INITIATOR */ |
diff --git a/drivers/scsi/hptiop.c b/drivers/scsi/hptiop.c index e7b2f3575ce9..ff149ad6bc4e 100644 --- a/drivers/scsi/hptiop.c +++ b/drivers/scsi/hptiop.c | |||
| @@ -573,7 +573,7 @@ static void hptiop_finish_scsi_req(struct hptiop_hba *hba, u32 tag, | |||
| 573 | scsi_set_resid(scp, | 573 | scsi_set_resid(scp, |
| 574 | scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length)); | 574 | scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length)); |
| 575 | scp->result = SAM_STAT_CHECK_CONDITION; | 575 | scp->result = SAM_STAT_CHECK_CONDITION; |
| 576 | memcpy(&scp->sense_buffer, &req->sg_list, | 576 | memcpy(scp->sense_buffer, &req->sg_list, |
| 577 | min_t(size_t, SCSI_SENSE_BUFFERSIZE, | 577 | min_t(size_t, SCSI_SENSE_BUFFERSIZE, |
| 578 | le32_to_cpu(req->dataxfer_length))); | 578 | le32_to_cpu(req->dataxfer_length))); |
| 579 | break; | 579 | break; |
| @@ -906,7 +906,6 @@ static struct scsi_host_template driver_template = { | |||
| 906 | .unchecked_isa_dma = 0, | 906 | .unchecked_isa_dma = 0, |
| 907 | .emulated = 0, | 907 | .emulated = 0, |
| 908 | .use_clustering = ENABLE_CLUSTERING, | 908 | .use_clustering = ENABLE_CLUSTERING, |
| 909 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 910 | .proc_name = driver_name, | 909 | .proc_name = driver_name, |
| 911 | .shost_attrs = hptiop_attrs, | 910 | .shost_attrs = hptiop_attrs, |
| 912 | .this_id = -1, | 911 | .this_id = -1, |
diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c index db004a450732..4d15a62914e9 100644 --- a/drivers/scsi/ibmmca.c +++ b/drivers/scsi/ibmmca.c | |||
| @@ -1501,7 +1501,6 @@ static struct scsi_host_template ibmmca_driver_template = { | |||
| 1501 | .sg_tablesize = 16, | 1501 | .sg_tablesize = 16, |
| 1502 | .cmd_per_lun = 1, | 1502 | .cmd_per_lun = 1, |
| 1503 | .use_clustering = ENABLE_CLUSTERING, | 1503 | .use_clustering = ENABLE_CLUSTERING, |
| 1504 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1505 | }; | 1504 | }; |
| 1506 | 1505 | ||
| 1507 | static int ibmmca_probe(struct device *dev) | 1506 | static int ibmmca_probe(struct device *dev) |
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index 30819012898f..78d46a900bb5 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c | |||
| @@ -1600,7 +1600,6 @@ static struct scsi_host_template driver_template = { | |||
| 1600 | .this_id = -1, | 1600 | .this_id = -1, |
| 1601 | .sg_tablesize = SG_ALL, | 1601 | .sg_tablesize = SG_ALL, |
| 1602 | .use_clustering = ENABLE_CLUSTERING, | 1602 | .use_clustering = ENABLE_CLUSTERING, |
| 1603 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1604 | .shost_attrs = ibmvscsi_attrs, | 1603 | .shost_attrs = ibmvscsi_attrs, |
| 1605 | }; | 1604 | }; |
| 1606 | 1605 | ||
diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c index a10a5c74b48d..0cc8868ea35d 100644 --- a/drivers/scsi/initio.c +++ b/drivers/scsi/initio.c | |||
| @@ -2833,7 +2833,6 @@ static struct scsi_host_template initio_template = { | |||
| 2833 | .sg_tablesize = SG_ALL, | 2833 | .sg_tablesize = SG_ALL, |
| 2834 | .cmd_per_lun = 1, | 2834 | .cmd_per_lun = 1, |
| 2835 | .use_clustering = ENABLE_CLUSTERING, | 2835 | .use_clustering = ENABLE_CLUSTERING, |
| 2836 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 2837 | }; | 2836 | }; |
| 2838 | 2837 | ||
| 2839 | static int initio_probe_one(struct pci_dev *pdev, | 2838 | static int initio_probe_one(struct pci_dev *pdev, |
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index e5be5fd4ef58..b6f99dfbb038 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c | |||
| @@ -1933,7 +1933,6 @@ static struct scsi_host_template iscsi_sht = { | |||
| 1933 | .eh_device_reset_handler= iscsi_eh_device_reset, | 1933 | .eh_device_reset_handler= iscsi_eh_device_reset, |
| 1934 | .eh_host_reset_handler = iscsi_eh_host_reset, | 1934 | .eh_host_reset_handler = iscsi_eh_host_reset, |
| 1935 | .use_clustering = DISABLE_CLUSTERING, | 1935 | .use_clustering = DISABLE_CLUSTERING, |
| 1936 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1937 | .slave_configure = iscsi_tcp_slave_configure, | 1936 | .slave_configure = iscsi_tcp_slave_configure, |
| 1938 | .proc_name = "iscsi_tcp", | 1937 | .proc_name = "iscsi_tcp", |
| 1939 | .this_id = -1, | 1938 | .this_id = -1, |
diff --git a/drivers/scsi/libsrp.c b/drivers/scsi/libsrp.c index 5cff0204227d..6d6a76e65a6c 100644 --- a/drivers/scsi/libsrp.c +++ b/drivers/scsi/libsrp.c | |||
| @@ -426,8 +426,8 @@ int srp_cmd_queue(struct Scsi_Host *shost, struct srp_cmd *cmd, void *info, | |||
| 426 | 426 | ||
| 427 | sc->SCp.ptr = info; | 427 | sc->SCp.ptr = info; |
| 428 | memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE); | 428 | memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE); |
| 429 | sc->request_bufflen = len; | 429 | sc->sdb.length = len; |
| 430 | sc->request_buffer = (void *) (unsigned long) addr; | 430 | sc->sdb.table.sgl = (void *) (unsigned long) addr; |
| 431 | sc->tag = tag; | 431 | sc->tag = tag; |
| 432 | err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun, | 432 | err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun, |
| 433 | cmd->tag); | 433 | cmd->tag); |
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 6483c62730b3..fc5c3a42b05a 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c | |||
| @@ -1459,7 +1459,6 @@ struct scsi_host_template lpfc_template = { | |||
| 1459 | .scan_finished = lpfc_scan_finished, | 1459 | .scan_finished = lpfc_scan_finished, |
| 1460 | .this_id = -1, | 1460 | .this_id = -1, |
| 1461 | .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, | 1461 | .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, |
| 1462 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1463 | .cmd_per_lun = LPFC_CMD_PER_LUN, | 1462 | .cmd_per_lun = LPFC_CMD_PER_LUN, |
| 1464 | .use_clustering = ENABLE_CLUSTERING, | 1463 | .use_clustering = ENABLE_CLUSTERING, |
| 1465 | .shost_attrs = lpfc_hba_attrs, | 1464 | .shost_attrs = lpfc_hba_attrs, |
| @@ -1482,7 +1481,6 @@ struct scsi_host_template lpfc_vport_template = { | |||
| 1482 | .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, | 1481 | .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, |
| 1483 | .cmd_per_lun = LPFC_CMD_PER_LUN, | 1482 | .cmd_per_lun = LPFC_CMD_PER_LUN, |
| 1484 | .use_clustering = ENABLE_CLUSTERING, | 1483 | .use_clustering = ENABLE_CLUSTERING, |
| 1485 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1486 | .shost_attrs = lpfc_vport_attrs, | 1484 | .shost_attrs = lpfc_vport_attrs, |
| 1487 | .max_sectors = 0xFFFF, | 1485 | .max_sectors = 0xFFFF, |
| 1488 | }; | 1486 | }; |
diff --git a/drivers/scsi/mac53c94.c b/drivers/scsi/mac53c94.c index a035001f4438..b12ad7c7c673 100644 --- a/drivers/scsi/mac53c94.c +++ b/drivers/scsi/mac53c94.c | |||
| @@ -402,7 +402,6 @@ static struct scsi_host_template mac53c94_template = { | |||
| 402 | .sg_tablesize = SG_ALL, | 402 | .sg_tablesize = SG_ALL, |
| 403 | .cmd_per_lun = 1, | 403 | .cmd_per_lun = 1, |
| 404 | .use_clustering = DISABLE_CLUSTERING, | 404 | .use_clustering = DISABLE_CLUSTERING, |
| 405 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 406 | }; | 405 | }; |
| 407 | 406 | ||
| 408 | static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match) | 407 | static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match) |
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 765c24d2bc38..4d59ae8491a4 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c | |||
| @@ -4490,7 +4490,6 @@ static struct scsi_host_template megaraid_template = { | |||
| 4490 | .sg_tablesize = MAX_SGLIST, | 4490 | .sg_tablesize = MAX_SGLIST, |
| 4491 | .cmd_per_lun = DEF_CMD_PER_LUN, | 4491 | .cmd_per_lun = DEF_CMD_PER_LUN, |
| 4492 | .use_clustering = ENABLE_CLUSTERING, | 4492 | .use_clustering = ENABLE_CLUSTERING, |
| 4493 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 4494 | .eh_abort_handler = megaraid_abort, | 4493 | .eh_abort_handler = megaraid_abort, |
| 4495 | .eh_device_reset_handler = megaraid_reset, | 4494 | .eh_device_reset_handler = megaraid_reset, |
| 4496 | .eh_bus_reset_handler = megaraid_reset, | 4495 | .eh_bus_reset_handler = megaraid_reset, |
diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c index 24e32e446e76..6db77c00e3ee 100644 --- a/drivers/scsi/megaraid/megaraid_mbox.c +++ b/drivers/scsi/megaraid/megaraid_mbox.c | |||
| @@ -361,7 +361,6 @@ static struct scsi_host_template megaraid_template_g = { | |||
| 361 | .eh_host_reset_handler = megaraid_reset_handler, | 361 | .eh_host_reset_handler = megaraid_reset_handler, |
| 362 | .change_queue_depth = megaraid_change_queue_depth, | 362 | .change_queue_depth = megaraid_change_queue_depth, |
| 363 | .use_clustering = ENABLE_CLUSTERING, | 363 | .use_clustering = ENABLE_CLUSTERING, |
| 364 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 365 | .sdev_attrs = megaraid_sdev_attrs, | 364 | .sdev_attrs = megaraid_sdev_attrs, |
| 366 | .shost_attrs = megaraid_shost_attrs, | 365 | .shost_attrs = megaraid_shost_attrs, |
| 367 | }; | 366 | }; |
diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c index d7ec921865c4..672c759ac24d 100644 --- a/drivers/scsi/megaraid/megaraid_sas.c +++ b/drivers/scsi/megaraid/megaraid_sas.c | |||
| @@ -1192,7 +1192,6 @@ static struct scsi_host_template megasas_template = { | |||
| 1192 | .eh_timed_out = megasas_reset_timer, | 1192 | .eh_timed_out = megasas_reset_timer, |
| 1193 | .bios_param = megasas_bios_param, | 1193 | .bios_param = megasas_bios_param, |
| 1194 | .use_clustering = ENABLE_CLUSTERING, | 1194 | .use_clustering = ENABLE_CLUSTERING, |
| 1195 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1196 | }; | 1195 | }; |
| 1197 | 1196 | ||
| 1198 | /** | 1197 | /** |
diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c index 7470ff39ab22..651d09b08f2a 100644 --- a/drivers/scsi/mesh.c +++ b/drivers/scsi/mesh.c | |||
| @@ -1843,7 +1843,6 @@ static struct scsi_host_template mesh_template = { | |||
| 1843 | .sg_tablesize = SG_ALL, | 1843 | .sg_tablesize = SG_ALL, |
| 1844 | .cmd_per_lun = 2, | 1844 | .cmd_per_lun = 2, |
| 1845 | .use_clustering = DISABLE_CLUSTERING, | 1845 | .use_clustering = DISABLE_CLUSTERING, |
| 1846 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1847 | }; | 1846 | }; |
| 1848 | 1847 | ||
| 1849 | static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match) | 1848 | static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match) |
diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c index c02771aa6c9b..c5ebf018b378 100644 --- a/drivers/scsi/ncr53c8xx.c +++ b/drivers/scsi/ncr53c8xx.c | |||
| @@ -4967,7 +4967,7 @@ void ncr_complete (struct ncb *np, struct ccb *cp) | |||
| 4967 | sizeof(cp->sense_buf))); | 4967 | sizeof(cp->sense_buf))); |
| 4968 | 4968 | ||
| 4969 | if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) { | 4969 | if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) { |
| 4970 | u_char * p = (u_char*) & cmd->sense_buffer; | 4970 | u_char *p = cmd->sense_buffer; |
| 4971 | int i; | 4971 | int i; |
| 4972 | PRINT_ADDR(cmd, "sense data:"); | 4972 | PRINT_ADDR(cmd, "sense data:"); |
| 4973 | for (i=0; i<14; i++) printk (" %x", *p++); | 4973 | for (i=0; i<14; i++) printk (" %x", *p++); |
diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c index 28161dc95e0d..7fed35372150 100644 --- a/drivers/scsi/nsp32.c +++ b/drivers/scsi/nsp32.c | |||
| @@ -281,7 +281,6 @@ static struct scsi_host_template nsp32_template = { | |||
| 281 | .cmd_per_lun = 1, | 281 | .cmd_per_lun = 1, |
| 282 | .this_id = NSP32_HOST_SCSIID, | 282 | .this_id = NSP32_HOST_SCSIID, |
| 283 | .use_clustering = DISABLE_CLUSTERING, | 283 | .use_clustering = DISABLE_CLUSTERING, |
| 284 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 285 | .eh_abort_handler = nsp32_eh_abort, | 284 | .eh_abort_handler = nsp32_eh_abort, |
| 286 | .eh_bus_reset_handler = nsp32_eh_bus_reset, | 285 | .eh_bus_reset_handler = nsp32_eh_bus_reset, |
| 287 | .eh_host_reset_handler = nsp32_eh_host_reset, | 286 | .eh_host_reset_handler = nsp32_eh_host_reset, |
diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c index 969b9387a0c3..3454a5714749 100644 --- a/drivers/scsi/pcmcia/sym53c500_cs.c +++ b/drivers/scsi/pcmcia/sym53c500_cs.c | |||
| @@ -692,7 +692,6 @@ static struct scsi_host_template sym53c500_driver_template = { | |||
| 692 | .sg_tablesize = 32, | 692 | .sg_tablesize = 32, |
| 693 | .cmd_per_lun = 1, | 693 | .cmd_per_lun = 1, |
| 694 | .use_clustering = ENABLE_CLUSTERING, | 694 | .use_clustering = ENABLE_CLUSTERING, |
| 695 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 696 | .shost_attrs = SYM53C500_shost_attrs | 695 | .shost_attrs = SYM53C500_shost_attrs |
| 697 | }; | 696 | }; |
| 698 | 697 | ||
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index c94906abfee3..68c0d09ffe78 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c | |||
| @@ -4204,7 +4204,6 @@ static struct scsi_host_template qla1280_driver_template = { | |||
| 4204 | .sg_tablesize = SG_ALL, | 4204 | .sg_tablesize = SG_ALL, |
| 4205 | .cmd_per_lun = 1, | 4205 | .cmd_per_lun = 1, |
| 4206 | .use_clustering = ENABLE_CLUSTERING, | 4206 | .use_clustering = ENABLE_CLUSTERING, |
| 4207 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 4208 | }; | 4207 | }; |
| 4209 | 4208 | ||
| 4210 | 4209 | ||
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index aba1e6d48066..3954ed2d7b51 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c | |||
| @@ -131,7 +131,6 @@ static struct scsi_host_template qla2x00_driver_template = { | |||
| 131 | .this_id = -1, | 131 | .this_id = -1, |
| 132 | .cmd_per_lun = 3, | 132 | .cmd_per_lun = 3, |
| 133 | .use_clustering = ENABLE_CLUSTERING, | 133 | .use_clustering = ENABLE_CLUSTERING, |
| 134 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 135 | .sg_tablesize = SG_ALL, | 134 | .sg_tablesize = SG_ALL, |
| 136 | 135 | ||
| 137 | /* | 136 | /* |
| @@ -163,7 +162,6 @@ struct scsi_host_template qla24xx_driver_template = { | |||
| 163 | .this_id = -1, | 162 | .this_id = -1, |
| 164 | .cmd_per_lun = 3, | 163 | .cmd_per_lun = 3, |
| 165 | .use_clustering = ENABLE_CLUSTERING, | 164 | .use_clustering = ENABLE_CLUSTERING, |
| 166 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 167 | .sg_tablesize = SG_ALL, | 165 | .sg_tablesize = SG_ALL, |
| 168 | 166 | ||
| 169 | .max_sectors = 0xFFFF, | 167 | .max_sectors = 0xFFFF, |
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index d3f86646cb08..2e2b9fedffcc 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c | |||
| @@ -94,7 +94,6 @@ static struct scsi_host_template qla4xxx_driver_template = { | |||
| 94 | .this_id = -1, | 94 | .this_id = -1, |
| 95 | .cmd_per_lun = 3, | 95 | .cmd_per_lun = 3, |
| 96 | .use_clustering = ENABLE_CLUSTERING, | 96 | .use_clustering = ENABLE_CLUSTERING, |
| 97 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 98 | .sg_tablesize = SG_ALL, | 97 | .sg_tablesize = SG_ALL, |
| 99 | 98 | ||
| 100 | .max_sectors = 0xFFFF, | 99 | .max_sectors = 0xFFFF, |
diff --git a/drivers/scsi/qlogicfas.c b/drivers/scsi/qlogicfas.c index 1769f965eedf..1e874f1fb5c6 100644 --- a/drivers/scsi/qlogicfas.c +++ b/drivers/scsi/qlogicfas.c | |||
| @@ -197,7 +197,6 @@ static struct scsi_host_template qlogicfas_driver_template = { | |||
| 197 | .sg_tablesize = SG_ALL, | 197 | .sg_tablesize = SG_ALL, |
| 198 | .cmd_per_lun = 1, | 198 | .cmd_per_lun = 1, |
| 199 | .use_clustering = DISABLE_CLUSTERING, | 199 | .use_clustering = DISABLE_CLUSTERING, |
| 200 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 201 | }; | 200 | }; |
| 202 | 201 | ||
| 203 | static __init int qlogicfas_init(void) | 202 | static __init int qlogicfas_init(void) |
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 1a9fba6a9f92..b35d19472caa 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c | |||
| @@ -757,7 +757,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd) | |||
| 757 | "Notifying upper driver of completion " | 757 | "Notifying upper driver of completion " |
| 758 | "(result %x)\n", cmd->result)); | 758 | "(result %x)\n", cmd->result)); |
| 759 | 759 | ||
| 760 | good_bytes = cmd->request_bufflen; | 760 | good_bytes = scsi_bufflen(cmd); |
| 761 | if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) { | 761 | if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) { |
| 762 | drv = scsi_cmd_to_driver(cmd); | 762 | drv = scsi_cmd_to_driver(cmd); |
| 763 | if (drv->done) | 763 | if (drv->done) |
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 82c06f0a9d02..1541c174937a 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c | |||
| @@ -280,6 +280,8 @@ static int resp_write(struct scsi_cmnd * SCpnt, unsigned long long lba, | |||
| 280 | unsigned int num, struct sdebug_dev_info * devip); | 280 | unsigned int num, struct sdebug_dev_info * devip); |
| 281 | static int resp_report_luns(struct scsi_cmnd * SCpnt, | 281 | static int resp_report_luns(struct scsi_cmnd * SCpnt, |
| 282 | struct sdebug_dev_info * devip); | 282 | struct sdebug_dev_info * devip); |
| 283 | static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba, | ||
| 284 | unsigned int num, struct sdebug_dev_info *devip); | ||
| 283 | static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, | 285 | static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, |
| 284 | int arr_len); | 286 | int arr_len); |
| 285 | static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, | 287 | static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, |
| @@ -311,12 +313,48 @@ static void sdebug_max_tgts_luns(void); | |||
| 311 | static struct device pseudo_primary; | 313 | static struct device pseudo_primary; |
| 312 | static struct bus_type pseudo_lld_bus; | 314 | static struct bus_type pseudo_lld_bus; |
| 313 | 315 | ||
| 316 | static void get_data_transfer_info(unsigned char *cmd, | ||
| 317 | unsigned long long *lba, unsigned int *num) | ||
| 318 | { | ||
| 319 | int i; | ||
| 320 | |||
| 321 | switch (*cmd) { | ||
| 322 | case WRITE_16: | ||
| 323 | case READ_16: | ||
| 324 | for (*lba = 0, i = 0; i < 8; ++i) { | ||
| 325 | if (i > 0) | ||
| 326 | *lba <<= 8; | ||
| 327 | *lba += cmd[2 + i]; | ||
| 328 | } | ||
| 329 | *num = cmd[13] + (cmd[12] << 8) + | ||
| 330 | (cmd[11] << 16) + (cmd[10] << 24); | ||
| 331 | break; | ||
| 332 | case WRITE_12: | ||
| 333 | case READ_12: | ||
| 334 | *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24); | ||
| 335 | *num = cmd[9] + (cmd[8] << 8) + (cmd[7] << 16) + (cmd[6] << 24); | ||
| 336 | break; | ||
| 337 | case WRITE_10: | ||
| 338 | case READ_10: | ||
| 339 | case XDWRITEREAD_10: | ||
| 340 | *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24); | ||
| 341 | *num = cmd[8] + (cmd[7] << 8); | ||
| 342 | break; | ||
| 343 | case WRITE_6: | ||
| 344 | case READ_6: | ||
| 345 | *lba = cmd[3] + (cmd[2] << 8) + ((cmd[1] & 0x1f) << 16); | ||
| 346 | *num = (0 == cmd[4]) ? 256 : cmd[4]; | ||
| 347 | break; | ||
| 348 | default: | ||
| 349 | break; | ||
| 350 | } | ||
| 351 | } | ||
| 314 | 352 | ||
| 315 | static | 353 | static |
| 316 | int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) | 354 | int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) |
| 317 | { | 355 | { |
| 318 | unsigned char *cmd = (unsigned char *) SCpnt->cmnd; | 356 | unsigned char *cmd = (unsigned char *) SCpnt->cmnd; |
| 319 | int len, k, j; | 357 | int len, k; |
| 320 | unsigned int num; | 358 | unsigned int num; |
| 321 | unsigned long long lba; | 359 | unsigned long long lba; |
| 322 | int errsts = 0; | 360 | int errsts = 0; |
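As a rough illustration of the CDB layouts the new get_data_transfer_info() helper decodes, here is a standalone userspace sketch (a hypothetical harness, not part of this patch): a READ(10) CDB carries a big-endian 32-bit LBA in bytes 2-5 and a big-endian 16-bit transfer length in bytes 7-8, which is exactly the arithmetic the helper performs for the READ_10/WRITE_10/XDWRITEREAD_10 cases.

    #include <stdio.h>

    int main(void)
    {
        /* READ(10): opcode 0x28, LBA 0x00123456, transfer length 8 blocks */
        unsigned char cdb[10] = { 0x28, 0, 0x00, 0x12, 0x34, 0x56, 0, 0x00, 0x08, 0 };
        unsigned long long lba;
        unsigned int num;

        /* same big-endian byte assembly as get_data_transfer_info() */
        lba = cdb[5] + (cdb[4] << 8) + (cdb[3] << 16) +
              ((unsigned long long)cdb[2] << 24);
        num = cdb[8] + (cdb[7] << 8);
        printf("lba=0x%llx num=%u\n", lba, num); /* prints lba=0x123456 num=8 */
        return 0;
    }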
| @@ -452,28 +490,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) | |||
| 452 | break; | 490 | break; |
| 453 | if (scsi_debug_fake_rw) | 491 | if (scsi_debug_fake_rw) |
| 454 | break; | 492 | break; |
| 455 | if ((*cmd) == READ_16) { | 493 | get_data_transfer_info(cmd, &lba, &num); |
| 456 | for (lba = 0, j = 0; j < 8; ++j) { | ||
| 457 | if (j > 0) | ||
| 458 | lba <<= 8; | ||
| 459 | lba += cmd[2 + j]; | ||
| 460 | } | ||
| 461 | num = cmd[13] + (cmd[12] << 8) + | ||
| 462 | (cmd[11] << 16) + (cmd[10] << 24); | ||
| 463 | } else if ((*cmd) == READ_12) { | ||
| 464 | lba = cmd[5] + (cmd[4] << 8) + | ||
| 465 | (cmd[3] << 16) + (cmd[2] << 24); | ||
| 466 | num = cmd[9] + (cmd[8] << 8) + | ||
| 467 | (cmd[7] << 16) + (cmd[6] << 24); | ||
| 468 | } else if ((*cmd) == READ_10) { | ||
| 469 | lba = cmd[5] + (cmd[4] << 8) + | ||
| 470 | (cmd[3] << 16) + (cmd[2] << 24); | ||
| 471 | num = cmd[8] + (cmd[7] << 8); | ||
| 472 | } else { /* READ (6) */ | ||
| 473 | lba = cmd[3] + (cmd[2] << 8) + | ||
| 474 | ((cmd[1] & 0x1f) << 16); | ||
| 475 | num = (0 == cmd[4]) ? 256 : cmd[4]; | ||
| 476 | } | ||
| 477 | errsts = resp_read(SCpnt, lba, num, devip); | 494 | errsts = resp_read(SCpnt, lba, num, devip); |
| 478 | if (inj_recovered && (0 == errsts)) { | 495 | if (inj_recovered && (0 == errsts)) { |
| 479 | mk_sense_buffer(devip, RECOVERED_ERROR, | 496 | mk_sense_buffer(devip, RECOVERED_ERROR, |
| @@ -500,28 +517,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) | |||
| 500 | break; | 517 | break; |
| 501 | if (scsi_debug_fake_rw) | 518 | if (scsi_debug_fake_rw) |
| 502 | break; | 519 | break; |
| 503 | if ((*cmd) == WRITE_16) { | 520 | get_data_transfer_info(cmd, &lba, &num); |
| 504 | for (lba = 0, j = 0; j < 8; ++j) { | ||
| 505 | if (j > 0) | ||
| 506 | lba <<= 8; | ||
| 507 | lba += cmd[2 + j]; | ||
| 508 | } | ||
| 509 | num = cmd[13] + (cmd[12] << 8) + | ||
| 510 | (cmd[11] << 16) + (cmd[10] << 24); | ||
| 511 | } else if ((*cmd) == WRITE_12) { | ||
| 512 | lba = cmd[5] + (cmd[4] << 8) + | ||
| 513 | (cmd[3] << 16) + (cmd[2] << 24); | ||
| 514 | num = cmd[9] + (cmd[8] << 8) + | ||
| 515 | (cmd[7] << 16) + (cmd[6] << 24); | ||
| 516 | } else if ((*cmd) == WRITE_10) { | ||
| 517 | lba = cmd[5] + (cmd[4] << 8) + | ||
| 518 | (cmd[3] << 16) + (cmd[2] << 24); | ||
| 519 | num = cmd[8] + (cmd[7] << 8); | ||
| 520 | } else { /* WRITE (6) */ | ||
| 521 | lba = cmd[3] + (cmd[2] << 8) + | ||
| 522 | ((cmd[1] & 0x1f) << 16); | ||
| 523 | num = (0 == cmd[4]) ? 256 : cmd[4]; | ||
| 524 | } | ||
| 525 | errsts = resp_write(SCpnt, lba, num, devip); | 521 | errsts = resp_write(SCpnt, lba, num, devip); |
| 526 | if (inj_recovered && (0 == errsts)) { | 522 | if (inj_recovered && (0 == errsts)) { |
| 527 | mk_sense_buffer(devip, RECOVERED_ERROR, | 523 | mk_sense_buffer(devip, RECOVERED_ERROR, |
| @@ -549,6 +545,28 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) | |||
| 549 | case WRITE_BUFFER: | 545 | case WRITE_BUFFER: |
| 550 | errsts = check_readiness(SCpnt, 1, devip); | 546 | errsts = check_readiness(SCpnt, 1, devip); |
| 551 | break; | 547 | break; |
| 548 | case XDWRITEREAD_10: | ||
| 549 | if (!scsi_bidi_cmnd(SCpnt)) { | ||
| 550 | mk_sense_buffer(devip, ILLEGAL_REQUEST, | ||
| 551 | INVALID_FIELD_IN_CDB, 0); | ||
| 552 | errsts = check_condition_result; | ||
| 553 | break; | ||
| 554 | } | ||
| 555 | |||
| 556 | errsts = check_readiness(SCpnt, 0, devip); | ||
| 557 | if (errsts) | ||
| 558 | break; | ||
| 559 | if (scsi_debug_fake_rw) | ||
| 560 | break; | ||
| 561 | get_data_transfer_info(cmd, &lba, &num); | ||
| 562 | errsts = resp_read(SCpnt, lba, num, devip); | ||
| 563 | if (errsts) | ||
| 564 | break; | ||
| 565 | errsts = resp_write(SCpnt, lba, num, devip); | ||
| 566 | if (errsts) | ||
| 567 | break; | ||
| 568 | errsts = resp_xdwriteread(SCpnt, lba, num, devip); | ||
| 569 | break; | ||
| 552 | default: | 570 | default: |
| 553 | if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) | 571 | if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) |
| 554 | printk(KERN_INFO "scsi_debug: Opcode: 0x%x not " | 572 | printk(KERN_INFO "scsi_debug: Opcode: 0x%x not " |
| @@ -601,18 +619,18 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, | |||
| 601 | int k, req_len, act_len, len, active; | 619 | int k, req_len, act_len, len, active; |
| 602 | void * kaddr; | 620 | void * kaddr; |
| 603 | void * kaddr_off; | 621 | void * kaddr_off; |
| 604 | struct scatterlist * sg; | 622 | struct scatterlist *sg; |
| 623 | struct scsi_data_buffer *sdb = scsi_in(scp); | ||
| 605 | 624 | ||
| 606 | if (0 == scsi_bufflen(scp)) | 625 | if (!sdb->length) |
| 607 | return 0; | 626 | return 0; |
| 608 | if (NULL == scsi_sglist(scp)) | 627 | if (!sdb->table.sgl) |
| 609 | return (DID_ERROR << 16); | 628 | return (DID_ERROR << 16); |
| 610 | if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) || | 629 | if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE)) |
| 611 | (scp->sc_data_direction == DMA_FROM_DEVICE))) | ||
| 612 | return (DID_ERROR << 16); | 630 | return (DID_ERROR << 16); |
| 613 | active = 1; | 631 | active = 1; |
| 614 | req_len = act_len = 0; | 632 | req_len = act_len = 0; |
| 615 | scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { | 633 | for_each_sg(sdb->table.sgl, sg, sdb->table.nents, k) { |
| 616 | if (active) { | 634 | if (active) { |
| 617 | kaddr = (unsigned char *) | 635 | kaddr = (unsigned char *) |
| 618 | kmap_atomic(sg_page(sg), KM_USER0); | 636 | kmap_atomic(sg_page(sg), KM_USER0); |
| @@ -630,10 +648,10 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, | |||
| 630 | } | 648 | } |
| 631 | req_len += sg->length; | 649 | req_len += sg->length; |
| 632 | } | 650 | } |
| 633 | if (scsi_get_resid(scp)) | 651 | if (sdb->resid) |
| 634 | scsi_set_resid(scp, scsi_get_resid(scp) - act_len); | 652 | sdb->resid -= act_len; |
| 635 | else | 653 | else |
| 636 | scsi_set_resid(scp, req_len - act_len); | 654 | sdb->resid = req_len - act_len; |
| 637 | return 0; | 655 | return 0; |
| 638 | } | 656 | } |
| 639 | 657 | ||
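The converted fill_from_dev_buffer() touches all three fields of the new per-direction descriptor. For reference, a sketch of struct scsi_data_buffer as implied by these uses (an approximation only; the authoritative definition lives in include/scsi/scsi_cmnd.h):

    /* sketch, not the authoritative definition */
    struct scsi_data_buffer {
        struct sg_table table;  /* scatterlist plus nents/orig_nents */
        unsigned length;        /* total number of bytes to transfer */
        int resid;              /* bytes requested but not transferred */
    };

For a bidi command, scsi_in() is expected to return the descriptor hung off request->next_rq->special, while uni-directional commands fall back to the command's own sdb, so this function now serves both cases.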
| @@ -650,8 +668,7 @@ static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, | |||
| 650 | return 0; | 668 | return 0; |
| 651 | if (NULL == scsi_sglist(scp)) | 669 | if (NULL == scsi_sglist(scp)) |
| 652 | return -1; | 670 | return -1; |
| 653 | if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) || | 671 | if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_TO_DEVICE)) |
| 654 | (scp->sc_data_direction == DMA_TO_DEVICE))) | ||
| 655 | return -1; | 672 | return -1; |
| 656 | req_len = fin = 0; | 673 | req_len = fin = 0; |
| 657 | scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { | 674 | scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { |
| @@ -1956,6 +1973,50 @@ static int resp_report_luns(struct scsi_cmnd * scp, | |||
| 1956 | min((int)alloc_len, SDEBUG_RLUN_ARR_SZ)); | 1973 | min((int)alloc_len, SDEBUG_RLUN_ARR_SZ)); |
| 1957 | } | 1974 | } |
| 1958 | 1975 | ||
| 1976 | static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba, | ||
| 1977 | unsigned int num, struct sdebug_dev_info *devip) | ||
| 1978 | { | ||
| 1979 | int i, j, ret = -1; | ||
| 1980 | unsigned char *kaddr, *buf; | ||
| 1981 | unsigned int offset; | ||
| 1982 | struct scatterlist *sg; | ||
| 1983 | struct scsi_data_buffer *sdb = scsi_in(scp); | ||
| 1984 | |||
| 1985 | /* it would be better not to use a temporary buffer here. */ | ||
| 1986 | buf = kmalloc(scsi_bufflen(scp), GFP_ATOMIC); | ||
| 1987 | if (!buf) | ||
| 1988 | return ret; | ||
| 1989 | |||
| 1990 | offset = 0; | ||
| 1991 | scsi_for_each_sg(scp, sg, scsi_sg_count(scp), i) { | ||
| 1992 | kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0); | ||
| 1993 | if (!kaddr) | ||
| 1994 | goto out; | ||
| 1995 | |||
| 1996 | memcpy(buf + offset, kaddr + sg->offset, sg->length); | ||
| 1997 | offset += sg->length; | ||
| 1998 | kunmap_atomic(kaddr, KM_USER0); | ||
| 1999 | } | ||
| 2000 | |||
| 2001 | offset = 0; | ||
| 2002 | for_each_sg(sdb->table.sgl, sg, sdb->table.nents, i) { | ||
| 2003 | kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0); | ||
| 2004 | if (!kaddr) | ||
| 2005 | goto out; | ||
| 2006 | |||
| 2007 | for (j = 0; j < sg->length; j++) | ||
| 2008 | *(kaddr + sg->offset + j) ^= *(buf + offset + j); | ||
| 2009 | |||
| 2010 | offset += sg->length; | ||
| 2011 | kunmap_atomic(kaddr, KM_USER0); | ||
| 2012 | } | ||
| 2013 | ret = 0; | ||
| 2014 | out: | ||
| 2015 | kfree(buf); | ||
| 2016 | |||
| 2017 | return ret; | ||
| 2018 | } | ||
| 2019 | |||
| 1959 | /* When timer goes off this function is called. */ | 2020 | /* When timer goes off this function is called. */ |
| 1960 | static void timer_intr_handler(unsigned long indx) | 2021 | static void timer_intr_handler(unsigned long indx) |
| 1961 | { | 2022 | { |
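resp_xdwriteread() implements the XOR step of XDWRITEREAD(10): the write-side payload is XORed, byte by byte, into the buffer returned on the read side. A minimal userspace sketch of that data flow, with flat buffers standing in for the scatterlists:

    #include <stdio.h>

    int main(void)
    {
        unsigned char out_buf[4] = { 0xff, 0x0f, 0xf0, 0x00 }; /* write data */
        unsigned char in_buf[4]  = { 0xaa, 0xaa, 0xaa, 0xaa }; /* old medium data */
        int j;

        /* same per-byte XOR as the second loop in resp_xdwriteread() */
        for (j = 0; j < 4; j++)
            in_buf[j] ^= out_buf[j];

        for (j = 0; j < 4; j++)
            printf("%02x ", in_buf[j]); /* prints: 55 a5 5a aa */
        printf("\n");
        return 0;
    }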
| @@ -1989,6 +2050,7 @@ static int scsi_debug_slave_alloc(struct scsi_device * sdp) | |||
| 1989 | if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) | 2050 | if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) |
| 1990 | printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n", | 2051 | printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n", |
| 1991 | sdp->host->host_no, sdp->channel, sdp->id, sdp->lun); | 2052 | sdp->host->host_no, sdp->channel, sdp->id, sdp->lun); |
| 2053 | set_bit(QUEUE_FLAG_BIDI, &sdp->request_queue->queue_flags); | ||
| 1992 | return 0; | 2054 | return 0; |
| 1993 | } | 2055 | } |
| 1994 | 2056 | ||
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 547e85aa414f..045a0868fc7b 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c | |||
| @@ -617,29 +617,27 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses, | |||
| 617 | ses->cmd_len = scmd->cmd_len; | 617 | ses->cmd_len = scmd->cmd_len; |
| 618 | memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd)); | 618 | memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd)); |
| 619 | ses->data_direction = scmd->sc_data_direction; | 619 | ses->data_direction = scmd->sc_data_direction; |
| 620 | ses->bufflen = scmd->request_bufflen; | 620 | ses->sdb = scmd->sdb; |
| 621 | ses->buffer = scmd->request_buffer; | 621 | ses->next_rq = scmd->request->next_rq; |
| 622 | ses->use_sg = scmd->use_sg; | ||
| 623 | ses->resid = scmd->resid; | ||
| 624 | ses->result = scmd->result; | 622 | ses->result = scmd->result; |
| 625 | 623 | ||
| 624 | memset(&scmd->sdb, 0, sizeof(scmd->sdb)); | ||
| 625 | scmd->request->next_rq = NULL; | ||
| 626 | |||
| 626 | if (sense_bytes) { | 627 | if (sense_bytes) { |
| 627 | scmd->request_bufflen = min_t(unsigned, | 628 | scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE, |
| 628 | SCSI_SENSE_BUFFERSIZE, sense_bytes); | 629 | sense_bytes); |
| 629 | sg_init_one(&ses->sense_sgl, scmd->sense_buffer, | 630 | sg_init_one(&ses->sense_sgl, scmd->sense_buffer, |
| 630 | scmd->request_bufflen); | 631 | scmd->sdb.length); |
| 631 | scmd->request_buffer = &ses->sense_sgl; | 632 | scmd->sdb.table.sgl = &ses->sense_sgl; |
| 632 | scmd->sc_data_direction = DMA_FROM_DEVICE; | 633 | scmd->sc_data_direction = DMA_FROM_DEVICE; |
| 633 | scmd->use_sg = 1; | 634 | scmd->sdb.table.nents = 1; |
| 634 | memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); | 635 | memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); |
| 635 | scmd->cmnd[0] = REQUEST_SENSE; | 636 | scmd->cmnd[0] = REQUEST_SENSE; |
| 636 | scmd->cmnd[4] = scmd->request_bufflen; | 637 | scmd->cmnd[4] = scmd->sdb.length; |
| 637 | scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); | 638 | scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); |
| 638 | } else { | 639 | } else { |
| 639 | scmd->request_buffer = NULL; | ||
| 640 | scmd->request_bufflen = 0; | ||
| 641 | scmd->sc_data_direction = DMA_NONE; | 640 | scmd->sc_data_direction = DMA_NONE; |
| 642 | scmd->use_sg = 0; | ||
| 643 | if (cmnd) { | 641 | if (cmnd) { |
| 644 | memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); | 642 | memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); |
| 645 | memcpy(scmd->cmnd, cmnd, cmnd_size); | 643 | memcpy(scmd->cmnd, cmnd, cmnd_size); |
| @@ -676,10 +674,8 @@ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses) | |||
| 676 | scmd->cmd_len = ses->cmd_len; | 674 | scmd->cmd_len = ses->cmd_len; |
| 677 | memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd)); | 675 | memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd)); |
| 678 | scmd->sc_data_direction = ses->data_direction; | 676 | scmd->sc_data_direction = ses->data_direction; |
| 679 | scmd->request_bufflen = ses->bufflen; | 677 | scmd->sdb = ses->sdb; |
| 680 | scmd->request_buffer = ses->buffer; | 678 | scmd->request->next_rq = ses->next_rq; |
| 681 | scmd->use_sg = ses->use_sg; | ||
| 682 | scmd->resid = ses->resid; | ||
| 683 | scmd->result = ses->result; | 679 | scmd->result = ses->result; |
| 684 | } | 680 | } |
| 685 | EXPORT_SYMBOL(scsi_eh_restore_cmnd); | 681 | EXPORT_SYMBOL(scsi_eh_restore_cmnd); |
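With the save area now carrying the whole scsi_data_buffer (plus the bidi next_rq pointer) instead of the old request_buffer/request_bufflen/use_sg trio, the intended calling pattern is unchanged. A hedged sketch of how an error-handling path wraps an ad-hoc REQUEST SENSE around an existing command (the synchronous send step is elided; eh_request_sense_sketch is a made-up name):

    static void eh_request_sense_sketch(struct scsi_cmnd *scmd)
    {
        struct scsi_eh_save ses;

        /* repoints scmd at a one-entry sg table over scmd->sense_buffer */
        scsi_eh_prep_cmnd(scmd, &ses, NULL, 0, SCSI_SENSE_BUFFERSIZE);

        /* ... issue scmd synchronously and wait for it to complete ... */

        /* puts the original cdb, data buffer and result back */
        scsi_eh_restore_cmnd(scmd, &ses);
    }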
| @@ -1700,8 +1696,7 @@ scsi_reset_provider(struct scsi_device *dev, int flag) | |||
| 1700 | memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd)); | 1696 | memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd)); |
| 1701 | 1697 | ||
| 1702 | scmd->scsi_done = scsi_reset_provider_done_command; | 1698 | scmd->scsi_done = scsi_reset_provider_done_command; |
| 1703 | scmd->request_buffer = NULL; | 1699 | memset(&scmd->sdb, 0, sizeof(scmd->sdb)); |
| 1704 | scmd->request_bufflen = 0; | ||
| 1705 | 1700 | ||
| 1706 | scmd->cmd_len = 0; | 1701 | scmd->cmd_len = 0; |
| 1707 | 1702 | ||
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 7c4c889c5221..b12fb310e399 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | */ | 8 | */ |
| 9 | 9 | ||
| 10 | #include <linux/bio.h> | 10 | #include <linux/bio.h> |
| 11 | #include <linux/bitops.h> | ||
| 11 | #include <linux/blkdev.h> | 12 | #include <linux/blkdev.h> |
| 12 | #include <linux/completion.h> | 13 | #include <linux/completion.h> |
| 13 | #include <linux/kernel.h> | 14 | #include <linux/kernel.h> |
| @@ -34,13 +35,6 @@ | |||
| 34 | #define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools) | 35 | #define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools) |
| 35 | #define SG_MEMPOOL_SIZE 2 | 36 | #define SG_MEMPOOL_SIZE 2 |
| 36 | 37 | ||
| 37 | /* | ||
| 38 | * The maximum number of SG segments that we will put inside a scatterlist | ||
| 39 | * (unless chaining is used). Should ideally fit inside a single page, to | ||
| 40 | * avoid a higher order allocation. | ||
| 41 | */ | ||
| 42 | #define SCSI_MAX_SG_SEGMENTS 128 | ||
| 43 | |||
| 44 | struct scsi_host_sg_pool { | 38 | struct scsi_host_sg_pool { |
| 45 | size_t size; | 39 | size_t size; |
| 46 | char *name; | 40 | char *name; |
| @@ -48,22 +42,31 @@ struct scsi_host_sg_pool { | |||
| 48 | mempool_t *pool; | 42 | mempool_t *pool; |
| 49 | }; | 43 | }; |
| 50 | 44 | ||
| 51 | #define SP(x) { x, "sgpool-" #x } | 45 | #define SP(x) { x, "sgpool-" __stringify(x) } |
| 46 | #if (SCSI_MAX_SG_SEGMENTS < 32) | ||
| 47 | #error SCSI_MAX_SG_SEGMENTS is too small (must be 32 or greater) | ||
| 48 | #endif | ||
| 52 | static struct scsi_host_sg_pool scsi_sg_pools[] = { | 49 | static struct scsi_host_sg_pool scsi_sg_pools[] = { |
| 53 | SP(8), | 50 | SP(8), |
| 54 | SP(16), | 51 | SP(16), |
| 55 | #if (SCSI_MAX_SG_SEGMENTS > 16) | ||
| 56 | SP(32), | ||
| 57 | #if (SCSI_MAX_SG_SEGMENTS > 32) | 52 | #if (SCSI_MAX_SG_SEGMENTS > 32) |
| 58 | SP(64), | 53 | SP(32), |
| 59 | #if (SCSI_MAX_SG_SEGMENTS > 64) | 54 | #if (SCSI_MAX_SG_SEGMENTS > 64) |
| 55 | SP(64), | ||
| 56 | #if (SCSI_MAX_SG_SEGMENTS > 128) | ||
| 60 | SP(128), | 57 | SP(128), |
| 58 | #if (SCSI_MAX_SG_SEGMENTS > 256) | ||
| 59 | #error SCSI_MAX_SG_SEGMENTS is too large (256 MAX) | ||
| 60 | #endif | ||
| 61 | #endif | 61 | #endif |
| 62 | #endif | 62 | #endif |
| 63 | #endif | 63 | #endif |
| 64 | SP(SCSI_MAX_SG_SEGMENTS) | ||
| 64 | }; | 65 | }; |
| 65 | #undef SP | 66 | #undef SP |
| 66 | 67 | ||
| 68 | static struct kmem_cache *scsi_bidi_sdb_cache; | ||
| 69 | |||
| 67 | static void scsi_run_queue(struct request_queue *q); | 70 | static void scsi_run_queue(struct request_queue *q); |
| 68 | 71 | ||
| 69 | /* | 72 | /* |
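SP(x) switches from plain #x to __stringify(x) so that a macro argument such as SCSI_MAX_SG_SEGMENTS is expanded before being stringified; "sgpool-" #x would yield the literal token name instead. A small userspace demonstration of the two-level expansion trick (the __stringify macros mirror the kernel's, the rest is hypothetical):

    #include <stdio.h>

    #define __stringify_1(x)  #x
    #define __stringify(x)    __stringify_1(x)

    #define SEGS 128
    #define NAME_RAW(x)  "sgpool-" #x              /* no expansion */
    #define NAME_EXP(x)  "sgpool-" __stringify(x)  /* argument expands first */

    int main(void)
    {
        printf("%s\n", NAME_RAW(SEGS)); /* sgpool-SEGS */
        printf("%s\n", NAME_EXP(SEGS)); /* sgpool-128 */
        return 0;
    }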
| @@ -440,7 +443,7 @@ EXPORT_SYMBOL_GPL(scsi_execute_async); | |||
| 440 | static void scsi_init_cmd_errh(struct scsi_cmnd *cmd) | 443 | static void scsi_init_cmd_errh(struct scsi_cmnd *cmd) |
| 441 | { | 444 | { |
| 442 | cmd->serial_number = 0; | 445 | cmd->serial_number = 0; |
| 443 | cmd->resid = 0; | 446 | scsi_set_resid(cmd, 0); |
| 444 | memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); | 447 | memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); |
| 445 | if (cmd->cmd_len == 0) | 448 | if (cmd->cmd_len == 0) |
| 446 | cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]); | 449 | cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]); |
| @@ -690,42 +693,16 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error, | |||
| 690 | return NULL; | 693 | return NULL; |
| 691 | } | 694 | } |
| 692 | 695 | ||
| 693 | /* | ||
| 694 | * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit | ||
| 695 | * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. | ||
| 696 | */ | ||
| 697 | #define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 | ||
| 698 | |||
| 699 | static inline unsigned int scsi_sgtable_index(unsigned short nents) | 696 | static inline unsigned int scsi_sgtable_index(unsigned short nents) |
| 700 | { | 697 | { |
| 701 | unsigned int index; | 698 | unsigned int index; |
| 702 | 699 | ||
| 703 | switch (nents) { | 700 | BUG_ON(nents > SCSI_MAX_SG_SEGMENTS); |
| 704 | case 1 ... 8: | 701 | |
| 702 | if (nents <= 8) | ||
| 705 | index = 0; | 703 | index = 0; |
| 706 | break; | 704 | else |
| 707 | case 9 ... 16: | 705 | index = get_count_order(nents) - 3; |
| 708 | index = 1; | ||
| 709 | break; | ||
| 710 | #if (SCSI_MAX_SG_SEGMENTS > 16) | ||
| 711 | case 17 ... 32: | ||
| 712 | index = 2; | ||
| 713 | break; | ||
| 714 | #if (SCSI_MAX_SG_SEGMENTS > 32) | ||
| 715 | case 33 ... 64: | ||
| 716 | index = 3; | ||
| 717 | break; | ||
| 718 | #if (SCSI_MAX_SG_SEGMENTS > 64) | ||
| 719 | case 65 ... 128: | ||
| 720 | index = 4; | ||
| 721 | break; | ||
| 722 | #endif | ||
| 723 | #endif | ||
| 724 | #endif | ||
| 725 | default: | ||
| 726 | printk(KERN_ERR "scsi: bad segment count=%d\n", nents); | ||
| 727 | BUG(); | ||
| 728 | } | ||
| 729 | 706 | ||
| 730 | return index; | 707 | return index; |
| 731 | } | 708 | } |
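The range switch collapses into arithmetic because the pools are sized in powers of two (8, 16, 32, ...): for nents > 8 the pool index is ceil(log2(nents)) - 3. A userspace sketch with a stand-in for the kernel's get_count_order():

    #include <stdio.h>

    /* stand-in for get_count_order(): order of the smallest power of
     * two that is >= n, i.e. ceil(log2(n)) */
    static int count_order(unsigned int n)
    {
        int order = 0;

        while ((1u << order) < n)
            order++;
        return order;
    }

    int main(void)
    {
        unsigned int tests[] = { 1, 8, 9, 16, 17, 33, 128 };
        int i;

        /* expected indexes: 0 0 1 1 2 3 4 */
        for (i = 0; i < 7; i++) {
            unsigned int nents = tests[i];
            int index = nents <= 8 ? 0 : count_order(nents) - 3;
            printf("nents=%u -> index=%d\n", nents, index);
        }
        return 0;
    }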
| @@ -746,31 +723,27 @@ static struct scatterlist *scsi_sg_alloc(unsigned int nents, gfp_t gfp_mask) | |||
| 746 | return mempool_alloc(sgp->pool, gfp_mask); | 723 | return mempool_alloc(sgp->pool, gfp_mask); |
| 747 | } | 724 | } |
| 748 | 725 | ||
| 749 | int scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask) | 726 | static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb, int nents, |
| 727 | gfp_t gfp_mask) | ||
| 750 | { | 728 | { |
| 751 | int ret; | 729 | int ret; |
| 752 | 730 | ||
| 753 | BUG_ON(!cmd->use_sg); | 731 | BUG_ON(!nents); |
| 754 | 732 | ||
| 755 | ret = __sg_alloc_table(&cmd->sg_table, cmd->use_sg, | 733 | ret = __sg_alloc_table(&sdb->table, nents, SCSI_MAX_SG_SEGMENTS, |
| 756 | SCSI_MAX_SG_SEGMENTS, gfp_mask, scsi_sg_alloc); | 734 | gfp_mask, scsi_sg_alloc); |
| 757 | if (unlikely(ret)) | 735 | if (unlikely(ret)) |
| 758 | __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, | 736 | __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, |
| 759 | scsi_sg_free); | 737 | scsi_sg_free); |
| 760 | 738 | ||
| 761 | cmd->request_buffer = cmd->sg_table.sgl; | ||
| 762 | return ret; | 739 | return ret; |
| 763 | } | 740 | } |
| 764 | 741 | ||
| 765 | EXPORT_SYMBOL(scsi_alloc_sgtable); | 742 | static void scsi_free_sgtable(struct scsi_data_buffer *sdb) |
| 766 | |||
| 767 | void scsi_free_sgtable(struct scsi_cmnd *cmd) | ||
| 768 | { | 743 | { |
| 769 | __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); | 744 | __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); |
| 770 | } | 745 | } |
| 771 | 746 | ||
| 772 | EXPORT_SYMBOL(scsi_free_sgtable); | ||
| 773 | |||
| 774 | /* | 747 | /* |
| 775 | * Function: scsi_release_buffers() | 748 | * Function: scsi_release_buffers() |
| 776 | * | 749 | * |
| @@ -788,17 +761,49 @@ EXPORT_SYMBOL(scsi_free_sgtable); | |||
| 788 | * the scatter-gather table, and potentially any bounce | 761 | * the scatter-gather table, and potentially any bounce |
| 789 | * buffers. | 762 | * buffers. |
| 790 | */ | 763 | */ |
| 791 | static void scsi_release_buffers(struct scsi_cmnd *cmd) | 764 | void scsi_release_buffers(struct scsi_cmnd *cmd) |
| 765 | { | ||
| 766 | if (cmd->sdb.table.nents) | ||
| 767 | scsi_free_sgtable(&cmd->sdb); | ||
| 768 | |||
| 769 | memset(&cmd->sdb, 0, sizeof(cmd->sdb)); | ||
| 770 | |||
| 771 | if (scsi_bidi_cmnd(cmd)) { | ||
| 772 | struct scsi_data_buffer *bidi_sdb = | ||
| 773 | cmd->request->next_rq->special; | ||
| 774 | scsi_free_sgtable(bidi_sdb); | ||
| 775 | kmem_cache_free(scsi_bidi_sdb_cache, bidi_sdb); | ||
| 776 | cmd->request->next_rq->special = NULL; | ||
| 777 | } | ||
| 778 | } | ||
| 779 | EXPORT_SYMBOL(scsi_release_buffers); | ||
| 780 | |||
| 781 | /* | ||
| 782 | * Bidi commands must be completed as a whole, both sides at once. | ||
| 783 | * If only part of the bytes were transferred and the LLD returned | ||
| 784 | * scsi_in()->resid and/or scsi_out()->resid, this information will be left | ||
| 785 | * in req->data_len and req->next_rq->data_len. The upper-layer driver can | ||
| 786 | * decide what to do with this information. | ||
| 787 | */ | ||
| 788 | void scsi_end_bidi_request(struct scsi_cmnd *cmd) | ||
| 792 | { | 789 | { |
| 793 | if (cmd->use_sg) | 790 | struct request *req = cmd->request; |
| 794 | scsi_free_sgtable(cmd); | 791 | unsigned int dlen = req->data_len; |
| 792 | unsigned int next_dlen = req->next_rq->data_len; | ||
| 793 | |||
| 794 | req->data_len = scsi_out(cmd)->resid; | ||
| 795 | req->next_rq->data_len = scsi_in(cmd)->resid; | ||
| 796 | |||
| 797 | /* The req and req->next_rq have not been completed */ | ||
| 798 | BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen)); | ||
| 799 | |||
| 800 | scsi_release_buffers(cmd); | ||
| 795 | 801 | ||
| 796 | /* | 802 | /* |
| 797 | * Zero these out. They now point to freed memory, and it is | 803 | * This will goose the queue request function at the end, so we don't |
| 798 | * dangerous to hang onto the pointers. | 804 | * need to worry about launching another command. |
| 799 | */ | 805 | */ |
| 800 | cmd->request_buffer = NULL; | 806 | scsi_next_command(cmd); |
| 801 | cmd->request_bufflen = 0; | ||
| 802 | } | 807 | } |
| 803 | 808 | ||
| 804 | /* | 809 | /* |
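The comment above scsi_end_bidi_request() spells out the contract: both sides of a bidi request complete together, and the per-direction residuals are handed back through req->data_len and req->next_rq->data_len. A hedged sketch of how a low-level driver would record those residuals before completing a bidi command (scsi_in()/scsi_out() are the accessors this series relies on; the function name and the completion call are illustrative only):

    static void lld_complete_bidi_sketch(struct scsi_cmnd *cmd,
                                         unsigned int out_done,
                                         unsigned int in_done)
    {
        /* bytes requested minus bytes actually moved, per direction */
        scsi_out(cmd)->resid = scsi_out(cmd)->length - out_done;
        scsi_in(cmd)->resid  = scsi_in(cmd)->length  - in_done;
        cmd->result = SAM_STAT_GOOD;
        /* ... cmd->scsi_done(cmd) ... */
    }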
| @@ -832,7 +837,7 @@ static void scsi_release_buffers(struct scsi_cmnd *cmd) | |||
| 832 | void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) | 837 | void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) |
| 833 | { | 838 | { |
| 834 | int result = cmd->result; | 839 | int result = cmd->result; |
| 835 | int this_count = cmd->request_bufflen; | 840 | int this_count = scsi_bufflen(cmd); |
| 836 | struct request_queue *q = cmd->device->request_queue; | 841 | struct request_queue *q = cmd->device->request_queue; |
| 837 | struct request *req = cmd->request; | 842 | struct request *req = cmd->request; |
| 838 | int clear_errors = 1; | 843 | int clear_errors = 1; |
| @@ -840,8 +845,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) | |||
| 840 | int sense_valid = 0; | 845 | int sense_valid = 0; |
| 841 | int sense_deferred = 0; | 846 | int sense_deferred = 0; |
| 842 | 847 | ||
| 843 | scsi_release_buffers(cmd); | ||
| 844 | |||
| 845 | if (result) { | 848 | if (result) { |
| 846 | sense_valid = scsi_command_normalize_sense(cmd, &sshdr); | 849 | sense_valid = scsi_command_normalize_sense(cmd, &sshdr); |
| 847 | if (sense_valid) | 850 | if (sense_valid) |
| @@ -864,9 +867,17 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) | |||
| 864 | req->sense_len = len; | 867 | req->sense_len = len; |
| 865 | } | 868 | } |
| 866 | } | 869 | } |
| 867 | req->data_len = cmd->resid; | 870 | if (scsi_bidi_cmnd(cmd)) { |
| 871 | /* this will also release the buffers */ | ||
| 872 | scsi_end_bidi_request(cmd); | ||
| 873 | return; | ||
| 874 | } | ||
| 875 | req->data_len = scsi_get_resid(cmd); | ||
| 868 | } | 876 | } |
| 869 | 877 | ||
| 878 | BUG_ON(blk_bidi_rq(req)); /* bidi is not supported for !blk_pc_request yet */ | ||
| 879 | scsi_release_buffers(cmd); | ||
| 880 | |||
| 870 | /* | 881 | /* |
| 871 | * Next deal with any sectors which we were able to correctly | 882 | * Next deal with any sectors which we were able to correctly |
| 872 | * handle. | 883 | * handle. |
| @@ -874,7 +885,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) | |||
| 874 | SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, " | 885 | SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, " |
| 875 | "%d bytes done.\n", | 886 | "%d bytes done.\n", |
| 876 | req->nr_sectors, good_bytes)); | 887 | req->nr_sectors, good_bytes)); |
| 877 | SCSI_LOG_HLCOMPLETE(1, printk("use_sg is %d\n", cmd->use_sg)); | ||
| 878 | 888 | ||
| 879 | if (clear_errors) | 889 | if (clear_errors) |
| 880 | req->errors = 0; | 890 | req->errors = 0; |
| @@ -991,52 +1001,80 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) | |||
| 991 | scsi_end_request(cmd, -EIO, this_count, !result); | 1001 | scsi_end_request(cmd, -EIO, this_count, !result); |
| 992 | } | 1002 | } |
| 993 | 1003 | ||
| 994 | /* | 1004 | static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb, |
| 995 | * Function: scsi_init_io() | 1005 | gfp_t gfp_mask) |
| 996 | * | ||
| 997 | * Purpose: SCSI I/O initialize function. | ||
| 998 | * | ||
| 999 | * Arguments: cmd - Command descriptor we wish to initialize | ||
| 1000 | * | ||
| 1001 | * Returns: 0 on success | ||
| 1002 | * BLKPREP_DEFER if the failure is retryable | ||
| 1003 | */ | ||
| 1004 | static int scsi_init_io(struct scsi_cmnd *cmd) | ||
| 1005 | { | 1006 | { |
| 1006 | struct request *req = cmd->request; | 1007 | int count; |
| 1007 | int count; | ||
| 1008 | |||
| 1009 | /* | ||
| 1010 | * We used to not use scatter-gather for single segment request, | ||
| 1011 | * but now we do (it makes highmem I/O easier to support without | ||
| 1012 | * kmapping pages) | ||
| 1013 | */ | ||
| 1014 | cmd->use_sg = req->nr_phys_segments; | ||
| 1015 | 1008 | ||
| 1016 | /* | 1009 | /* |
| 1017 | * If sg table allocation fails, requeue request later. | 1010 | * If sg table allocation fails, requeue request later. |
| 1018 | */ | 1011 | */ |
| 1019 | if (unlikely(scsi_alloc_sgtable(cmd, GFP_ATOMIC))) { | 1012 | if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments, |
| 1020 | scsi_unprep_request(req); | 1013 | gfp_mask))) { |
| 1021 | return BLKPREP_DEFER; | 1014 | return BLKPREP_DEFER; |
| 1022 | } | 1015 | } |
| 1023 | 1016 | ||
| 1024 | req->buffer = NULL; | 1017 | req->buffer = NULL; |
| 1025 | if (blk_pc_request(req)) | 1018 | if (blk_pc_request(req)) |
| 1026 | cmd->request_bufflen = req->data_len; | 1019 | sdb->length = req->data_len; |
| 1027 | else | 1020 | else |
| 1028 | cmd->request_bufflen = req->nr_sectors << 9; | 1021 | sdb->length = req->nr_sectors << 9; |
| 1029 | 1022 | ||
| 1030 | /* | 1023 | /* |
| 1031 | * Next, walk the list, and fill in the addresses and sizes of | 1024 | * Next, walk the list, and fill in the addresses and sizes of |
| 1032 | * each segment. | 1025 | * each segment. |
| 1033 | */ | 1026 | */ |
| 1034 | count = blk_rq_map_sg(req->q, req, cmd->request_buffer); | 1027 | count = blk_rq_map_sg(req->q, req, sdb->table.sgl); |
| 1035 | BUG_ON(count > cmd->use_sg); | 1028 | BUG_ON(count > sdb->table.nents); |
| 1036 | cmd->use_sg = count; | 1029 | sdb->table.nents = count; |
| 1037 | return BLKPREP_OK; | 1030 | return BLKPREP_OK; |
| 1038 | } | 1031 | } |
| 1039 | 1032 | ||
| 1033 | /* | ||
| 1034 | * Function: scsi_init_io() | ||
| 1035 | * | ||
| 1036 | * Purpose: SCSI I/O initialize function. | ||
| 1037 | * | ||
| 1038 | * Arguments: cmd - Command descriptor we wish to initialize | ||
| 1039 | * | ||
| 1040 | * Returns: 0 on success | ||
| 1041 | * BLKPREP_DEFER if the failure is retryable | ||
| 1042 | * BLKPREP_KILL if the failure is fatal | ||
| 1043 | */ | ||
| 1044 | int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask) | ||
| 1045 | { | ||
| 1046 | int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask); | ||
| 1047 | if (error) | ||
| 1048 | goto err_exit; | ||
| 1049 | |||
| 1050 | if (blk_bidi_rq(cmd->request)) { | ||
| 1051 | struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc( | ||
| 1052 | scsi_bidi_sdb_cache, GFP_ATOMIC); | ||
| 1053 | if (!bidi_sdb) { | ||
| 1054 | error = BLKPREP_DEFER; | ||
| 1055 | goto err_exit; | ||
| 1056 | } | ||
| 1057 | |||
| 1058 | cmd->request->next_rq->special = bidi_sdb; | ||
| 1059 | error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb, | ||
| 1060 | GFP_ATOMIC); | ||
| 1061 | if (error) | ||
| 1062 | goto err_exit; | ||
| 1063 | } | ||
| 1064 | |||
| 1065 | return BLKPREP_OK; | ||
| 1066 | |||
| 1067 | err_exit: | ||
| 1068 | scsi_release_buffers(cmd); | ||
| 1069 | if (error == BLKPREP_KILL) | ||
| 1070 | scsi_put_command(cmd); | ||
| 1071 | else /* BLKPREP_DEFER */ | ||
| 1072 | scsi_unprep_request(cmd->request); | ||
| 1073 | |||
| 1074 | return error; | ||
| 1075 | } | ||
| 1076 | EXPORT_SYMBOL(scsi_init_io); | ||
| 1077 | |||
| 1040 | static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev, | 1078 | static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev, |
| 1041 | struct request *req) | 1079 | struct request *req) |
| 1042 | { | 1080 | { |
| @@ -1081,16 +1119,14 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req) | |||
| 1081 | 1119 | ||
| 1082 | BUG_ON(!req->nr_phys_segments); | 1120 | BUG_ON(!req->nr_phys_segments); |
| 1083 | 1121 | ||
| 1084 | ret = scsi_init_io(cmd); | 1122 | ret = scsi_init_io(cmd, GFP_ATOMIC); |
| 1085 | if (unlikely(ret)) | 1123 | if (unlikely(ret)) |
| 1086 | return ret; | 1124 | return ret; |
| 1087 | } else { | 1125 | } else { |
| 1088 | BUG_ON(req->data_len); | 1126 | BUG_ON(req->data_len); |
| 1089 | BUG_ON(req->data); | 1127 | BUG_ON(req->data); |
| 1090 | 1128 | ||
| 1091 | cmd->request_bufflen = 0; | 1129 | memset(&cmd->sdb, 0, sizeof(cmd->sdb)); |
| 1092 | cmd->request_buffer = NULL; | ||
| 1093 | cmd->use_sg = 0; | ||
| 1094 | req->buffer = NULL; | 1130 | req->buffer = NULL; |
| 1095 | } | 1131 | } |
| 1096 | 1132 | ||
| @@ -1132,7 +1168,7 @@ int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req) | |||
| 1132 | if (unlikely(!cmd)) | 1168 | if (unlikely(!cmd)) |
| 1133 | return BLKPREP_DEFER; | 1169 | return BLKPREP_DEFER; |
| 1134 | 1170 | ||
| 1135 | return scsi_init_io(cmd); | 1171 | return scsi_init_io(cmd, GFP_ATOMIC); |
| 1136 | } | 1172 | } |
| 1137 | EXPORT_SYMBOL(scsi_setup_fs_cmnd); | 1173 | EXPORT_SYMBOL(scsi_setup_fs_cmnd); |
| 1138 | 1174 | ||
| @@ -1542,20 +1578,7 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, | |||
| 1542 | * this limit is imposed by hardware restrictions | 1578 | * this limit is imposed by hardware restrictions |
| 1543 | */ | 1579 | */ |
| 1544 | blk_queue_max_hw_segments(q, shost->sg_tablesize); | 1580 | blk_queue_max_hw_segments(q, shost->sg_tablesize); |
| 1545 | 1581 | blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS); | |
| 1546 | /* | ||
| 1547 | * In the future, sg chaining support will be mandatory and this | ||
| 1548 | * ifdef can then go away. Right now we don't have all archs | ||
| 1549 | * converted, so better keep it safe. | ||
| 1550 | */ | ||
| 1551 | #ifdef ARCH_HAS_SG_CHAIN | ||
| 1552 | if (shost->use_sg_chaining) | ||
| 1553 | blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS); | ||
| 1554 | else | ||
| 1555 | blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS); | ||
| 1556 | #else | ||
| 1557 | blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS); | ||
| 1558 | #endif | ||
| 1559 | 1582 | ||
| 1560 | blk_queue_max_sectors(q, shost->max_sectors); | 1583 | blk_queue_max_sectors(q, shost->max_sectors); |
| 1561 | blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); | 1584 | blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); |
| @@ -1654,6 +1677,14 @@ int __init scsi_init_queue(void) | |||
| 1654 | return -ENOMEM; | 1677 | return -ENOMEM; |
| 1655 | } | 1678 | } |
| 1656 | 1679 | ||
| 1680 | scsi_bidi_sdb_cache = kmem_cache_create("scsi_bidi_sdb", | ||
| 1681 | sizeof(struct scsi_data_buffer), | ||
| 1682 | 0, 0, NULL); | ||
| 1683 | if (!scsi_bidi_sdb_cache) { | ||
| 1684 | printk(KERN_ERR "SCSI: can't init scsi bidi sdb cache\n"); | ||
| 1685 | goto cleanup_io_context; | ||
| 1686 | } | ||
| 1687 | |||
| 1657 | for (i = 0; i < SG_MEMPOOL_NR; i++) { | 1688 | for (i = 0; i < SG_MEMPOOL_NR; i++) { |
| 1658 | struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; | 1689 | struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; |
| 1659 | int size = sgp->size * sizeof(struct scatterlist); | 1690 | int size = sgp->size * sizeof(struct scatterlist); |
| @@ -1663,6 +1694,7 @@ int __init scsi_init_queue(void) | |||
| 1663 | if (!sgp->slab) { | 1694 | if (!sgp->slab) { |
| 1664 | printk(KERN_ERR "SCSI: can't init sg slab %s\n", | 1695 | printk(KERN_ERR "SCSI: can't init sg slab %s\n", |
| 1665 | sgp->name); | 1696 | sgp->name); |
| 1697 | goto cleanup_bidi_sdb; | ||
| 1666 | } | 1698 | } |
| 1667 | 1699 | ||
| 1668 | sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, | 1700 | sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, |
| @@ -1670,10 +1702,25 @@ int __init scsi_init_queue(void) | |||
| 1670 | if (!sgp->pool) { | 1702 | if (!sgp->pool) { |
| 1671 | printk(KERN_ERR "SCSI: can't init sg mempool %s\n", | 1703 | printk(KERN_ERR "SCSI: can't init sg mempool %s\n", |
| 1672 | sgp->name); | 1704 | sgp->name); |
| 1705 | goto cleanup_bidi_sdb; | ||
| 1673 | } | 1706 | } |
| 1674 | } | 1707 | } |
| 1675 | 1708 | ||
| 1676 | return 0; | 1709 | return 0; |
| 1710 | |||
| 1711 | cleanup_bidi_sdb: | ||
| 1712 | for (i = 0; i < SG_MEMPOOL_NR; i++) { | ||
| 1713 | struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; | ||
| 1714 | if (sgp->pool) | ||
| 1715 | mempool_destroy(sgp->pool); | ||
| 1716 | if (sgp->slab) | ||
| 1717 | kmem_cache_destroy(sgp->slab); | ||
| 1718 | } | ||
| 1719 | kmem_cache_destroy(scsi_bidi_sdb_cache); | ||
| 1720 | cleanup_io_context: | ||
| 1721 | kmem_cache_destroy(scsi_io_context_cache); | ||
| 1722 | |||
| 1723 | return -ENOMEM; | ||
| 1677 | } | 1724 | } |
| 1678 | 1725 | ||
| 1679 | void scsi_exit_queue(void) | 1726 | void scsi_exit_queue(void) |
| @@ -1681,6 +1728,7 @@ void scsi_exit_queue(void) | |||
| 1681 | int i; | 1728 | int i; |
| 1682 | 1729 | ||
| 1683 | kmem_cache_destroy(scsi_io_context_cache); | 1730 | kmem_cache_destroy(scsi_io_context_cache); |
| 1731 | kmem_cache_destroy(scsi_bidi_sdb_cache); | ||
| 1684 | 1732 | ||
| 1685 | for (i = 0; i < SG_MEMPOOL_NR; i++) { | 1733 | for (i = 0; i < SG_MEMPOOL_NR; i++) { |
| 1686 | struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; | 1734 | struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; |
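The new scsi_bidi_sdb cache slots into scsi_init_queue() with a proper unwind: each allocation that can fail jumps to a label that tears down only what already succeeded, in reverse order. A minimal sketch of that goto-unwind idiom, with illustrative cache names that are not the patch's identifiers:

	static struct kmem_cache *a_cache, *b_cache;

	static int __init init_caches(void)
	{
		a_cache = kmem_cache_create("a_cache", 64, 0, 0, NULL);
		if (!a_cache)
			return -ENOMEM;

		b_cache = kmem_cache_create("b_cache", 128, 0, 0, NULL);
		if (!b_cache)
			goto cleanup_a;		/* undo only what already succeeded */

		return 0;

	cleanup_a:
		kmem_cache_destroy(a_cache);
		return -ENOMEM;
	}

The matching scsi_exit_queue() hunk applies the same discipline, destroying the new cache alongside the old ones.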
diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c index 01e03f3f6ffa..91630baea532 100644 --- a/drivers/scsi/scsi_tgt_lib.c +++ b/drivers/scsi/scsi_tgt_lib.c | |||
| @@ -331,8 +331,7 @@ static void scsi_tgt_cmd_done(struct scsi_cmnd *cmd) | |||
| 331 | 331 | ||
| 332 | scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag); | 332 | scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag); |
| 333 | 333 | ||
| 334 | if (scsi_sglist(cmd)) | 334 | scsi_release_buffers(cmd); |
| 335 | scsi_free_sgtable(cmd); | ||
| 336 | 335 | ||
| 337 | queue_work(scsi_tgtd, &tcmd->work); | 336 | queue_work(scsi_tgtd, &tcmd->work); |
| 338 | } | 337 | } |
| @@ -353,25 +352,6 @@ static int scsi_tgt_transfer_response(struct scsi_cmnd *cmd) | |||
| 353 | return 0; | 352 | return 0; |
| 354 | } | 353 | } |
| 355 | 354 | ||
| 356 | static int scsi_tgt_init_cmd(struct scsi_cmnd *cmd, gfp_t gfp_mask) | ||
| 357 | { | ||
| 358 | struct request *rq = cmd->request; | ||
| 359 | int count; | ||
| 360 | |||
| 361 | cmd->use_sg = rq->nr_phys_segments; | ||
| 362 | if (scsi_alloc_sgtable(cmd, gfp_mask)) | ||
| 363 | return -ENOMEM; | ||
| 364 | |||
| 365 | cmd->request_bufflen = rq->data_len; | ||
| 366 | |||
| 367 | dprintk("cmd %p cnt %d %lu\n", cmd, scsi_sg_count(cmd), | ||
| 368 | rq_data_dir(rq)); | ||
| 369 | count = blk_rq_map_sg(rq->q, rq, scsi_sglist(cmd)); | ||
| 370 | BUG_ON(count > cmd->use_sg); | ||
| 371 | cmd->use_sg = count; | ||
| 372 | return 0; | ||
| 373 | } | ||
| 374 | |||
| 375 | /* TODO: test this crap and replace bio_map_user with new interface maybe */ | 355 | /* TODO: test this crap and replace bio_map_user with new interface maybe */ |
| 376 | static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd, | 356 | static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd, |
| 377 | unsigned long uaddr, unsigned int len, int rw) | 357 | unsigned long uaddr, unsigned int len, int rw) |
| @@ -397,9 +377,11 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd, | |||
| 397 | } | 377 | } |
| 398 | 378 | ||
| 399 | tcmd->bio = rq->bio; | 379 | tcmd->bio = rq->bio; |
| 400 | err = scsi_tgt_init_cmd(cmd, GFP_KERNEL); | 380 | err = scsi_init_io(cmd, GFP_KERNEL); |
| 401 | if (err) | 381 | if (err) { |
| 382 | scsi_release_buffers(cmd); | ||
| 402 | goto unmap_rq; | 383 | goto unmap_rq; |
| 384 | } | ||
| 403 | 385 | ||
| 404 | return 0; | 386 | return 0; |
| 405 | 387 | ||
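With scsi_tgt_init_cmd() gone, the target code funnels through the same scsi_init_io() the initiator path uses, now taking an explicit gfp_t, and must release buffers itself on failure. A hedged sketch of the resulting call shape, assuming the 2.6.25-era prototypes shown in these hunks:

	/* Build the command's scatterlist from its request; on failure,
	 * free whatever scsi_init_io() managed to allocate. */
	static int prep_data(struct scsi_cmnd *cmd)
	{
		int err = scsi_init_io(cmd, GFP_KERNEL);

		if (err)
			scsi_release_buffers(cmd);
		return err;
	}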
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 24eba3118b5a..51a5557f42dd 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c | |||
| @@ -519,7 +519,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq) | |||
| 519 | SCpnt->cmnd[4] = (unsigned char) this_count; | 519 | SCpnt->cmnd[4] = (unsigned char) this_count; |
| 520 | SCpnt->cmnd[5] = 0; | 520 | SCpnt->cmnd[5] = 0; |
| 521 | } | 521 | } |
| 522 | SCpnt->request_bufflen = this_count * sdp->sector_size; | 522 | SCpnt->sdb.length = this_count * sdp->sector_size; |
| 523 | 523 | ||
| 524 | /* | 524 | /* |
| 525 | * We shouldn't disconnect in the middle of a sector, so with a dumb | 525 | * We shouldn't disconnect in the middle of a sector, so with a dumb |
| @@ -926,7 +926,7 @@ static struct block_device_operations sd_fops = { | |||
| 926 | static int sd_done(struct scsi_cmnd *SCpnt) | 926 | static int sd_done(struct scsi_cmnd *SCpnt) |
| 927 | { | 927 | { |
| 928 | int result = SCpnt->result; | 928 | int result = SCpnt->result; |
| 929 | unsigned int xfer_size = SCpnt->request_bufflen; | 929 | unsigned int xfer_size = scsi_bufflen(SCpnt); |
| 930 | unsigned int good_bytes = result ? 0 : xfer_size; | 930 | unsigned int good_bytes = result ? 0 : xfer_size; |
| 931 | u64 start_lba = SCpnt->request->sector; | 931 | u64 start_lba = SCpnt->request->sector; |
| 932 | u64 bad_lba; | 932 | u64 bad_lba; |
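sd.c now reads the transfer length through scsi_bufflen() and writes it through the new sdb.length field instead of touching request_bufflen directly. The point of the accessor layer is that the mid-layer can repack struct scsi_cmnd (here, into a struct scsi_data_buffer) without chasing every driver. A sketch of how such accessors are typically defined against the new layout; the field names follow the hunks above, but treat the exact definitions as illustrative:

	static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd)
	{
		return cmd->sdb.length;
	}

	static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd)
	{
		return cmd->sdb.table.sgl;
	}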
diff --git a/drivers/scsi/sgiwd93.c b/drivers/scsi/sgiwd93.c index d4ebe8c67ba9..26cfc56c7091 100644 --- a/drivers/scsi/sgiwd93.c +++ b/drivers/scsi/sgiwd93.c | |||
| @@ -33,10 +33,9 @@ | |||
| 33 | 33 | ||
| 34 | struct ip22_hostdata { | 34 | struct ip22_hostdata { |
| 35 | struct WD33C93_hostdata wh; | 35 | struct WD33C93_hostdata wh; |
| 36 | struct hpc_data { | 36 | dma_addr_t dma; |
| 37 | dma_addr_t dma; | 37 | void *cpu; |
| 38 | void *cpu; | 38 | struct device *dev; |
| 39 | } hd; | ||
| 40 | }; | 39 | }; |
| 41 | 40 | ||
| 42 | #define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata)) | 41 | #define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata)) |
| @@ -46,6 +45,11 @@ struct hpc_chunk { | |||
| 46 | u32 _padding; /* align to quadword boundary */ | 45 | u32 _padding; /* align to quadword boundary */ |
| 47 | }; | 46 | }; |
| 48 | 47 | ||
| 48 | /* space for hpc dma descriptors */ | ||
| 49 | #define HPC_DMA_SIZE PAGE_SIZE | ||
| 50 | |||
| 51 | #define DMA_DIR(d) ((d == DATA_OUT_DIR) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) | ||
| 52 | |||
| 49 | static irqreturn_t sgiwd93_intr(int irq, void *dev_id) | 53 | static irqreturn_t sgiwd93_intr(int irq, void *dev_id) |
| 50 | { | 54 | { |
| 51 | struct Scsi_Host * host = dev_id; | 55 | struct Scsi_Host * host = dev_id; |
| @@ -59,15 +63,17 @@ static irqreturn_t sgiwd93_intr(int irq, void *dev_id) | |||
| 59 | } | 63 | } |
| 60 | 64 | ||
| 61 | static inline | 65 | static inline |
| 62 | void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp) | 66 | void fill_hpc_entries(struct ip22_hostdata *hd, struct scsi_cmnd *cmd, int din) |
| 63 | { | 67 | { |
| 64 | unsigned long len = cmd->SCp.this_residual; | 68 | unsigned long len = cmd->SCp.this_residual; |
| 65 | void *addr = cmd->SCp.ptr; | 69 | void *addr = cmd->SCp.ptr; |
| 66 | dma_addr_t physaddr; | 70 | dma_addr_t physaddr; |
| 67 | unsigned long count; | 71 | unsigned long count; |
| 72 | struct hpc_chunk *hcp; | ||
| 68 | 73 | ||
| 69 | physaddr = dma_map_single(NULL, addr, len, cmd->sc_data_direction); | 74 | physaddr = dma_map_single(hd->dev, addr, len, DMA_DIR(din)); |
| 70 | cmd->SCp.dma_handle = physaddr; | 75 | cmd->SCp.dma_handle = physaddr; |
| 76 | hcp = hd->cpu; | ||
| 71 | 77 | ||
| 72 | while (len) { | 78 | while (len) { |
| 73 | /* | 79 | /* |
| @@ -89,6 +95,9 @@ void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp) | |||
| 89 | */ | 95 | */ |
| 90 | hcp->desc.pbuf = 0; | 96 | hcp->desc.pbuf = 0; |
| 91 | hcp->desc.cntinfo = HPCDMA_EOX; | 97 | hcp->desc.cntinfo = HPCDMA_EOX; |
| 98 | dma_cache_sync(hd->dev, hd->cpu, | ||
| 99 | (unsigned long)(hcp + 1) - (unsigned long)hd->cpu, | ||
| 100 | DMA_TO_DEVICE); | ||
| 92 | } | 101 | } |
| 93 | 102 | ||
| 94 | static int dma_setup(struct scsi_cmnd *cmd, int datainp) | 103 | static int dma_setup(struct scsi_cmnd *cmd, int datainp) |
| @@ -96,9 +105,8 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp) | |||
| 96 | struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host); | 105 | struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host); |
| 97 | struct hpc3_scsiregs *hregs = | 106 | struct hpc3_scsiregs *hregs = |
| 98 | (struct hpc3_scsiregs *) cmd->device->host->base; | 107 | (struct hpc3_scsiregs *) cmd->device->host->base; |
| 99 | struct hpc_chunk *hcp = (struct hpc_chunk *) hdata->hd.cpu; | ||
| 100 | 108 | ||
| 101 | pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hcp); | 109 | pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hdata->cpu); |
| 102 | 110 | ||
| 103 | hdata->wh.dma_dir = datainp; | 111 | hdata->wh.dma_dir = datainp; |
| 104 | 112 | ||
| @@ -111,12 +119,12 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp) | |||
| 111 | if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0) | 119 | if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0) |
| 112 | return 1; | 120 | return 1; |
| 113 | 121 | ||
| 114 | fill_hpc_entries(hcp, cmd, datainp); | 122 | fill_hpc_entries(hdata, cmd, datainp); |
| 115 | 123 | ||
| 116 | pr_debug(" HPCGO\n"); | 124 | pr_debug(" HPCGO\n"); |
| 117 | 125 | ||
| 118 | /* Start up the HPC. */ | 126 | /* Start up the HPC. */ |
| 119 | hregs->ndptr = hdata->hd.dma; | 127 | hregs->ndptr = hdata->dma; |
| 120 | if (datainp) | 128 | if (datainp) |
| 121 | hregs->ctrl = HPC3_SCTRL_ACTIVE; | 129 | hregs->ctrl = HPC3_SCTRL_ACTIVE; |
| 122 | else | 130 | else |
| @@ -134,6 +142,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt, | |||
| 134 | if (!SCpnt) | 142 | if (!SCpnt) |
| 135 | return; | 143 | return; |
| 136 | 144 | ||
| 145 | if (SCpnt->SCp.ptr == NULL || SCpnt->SCp.this_residual == 0) | ||
| 146 | return; | ||
| 147 | |||
| 137 | hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base; | 148 | hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base; |
| 138 | 149 | ||
| 139 | pr_debug("dma_stop: status<%d> ", status); | 150 | pr_debug("dma_stop: status<%d> ", status); |
| @@ -145,8 +156,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt, | |||
| 145 | barrier(); | 156 | barrier(); |
| 146 | } | 157 | } |
| 147 | hregs->ctrl = 0; | 158 | hregs->ctrl = 0; |
| 148 | dma_unmap_single(NULL, SCpnt->SCp.dma_handle, SCpnt->SCp.this_residual, | 159 | dma_unmap_single(hdata->dev, SCpnt->SCp.dma_handle, |
| 149 | SCpnt->sc_data_direction); | 160 | SCpnt->SCp.this_residual, |
| 161 | DMA_DIR(hdata->wh.dma_dir)); | ||
| 150 | 162 | ||
| 151 | pr_debug("\n"); | 163 | pr_debug("\n"); |
| 152 | } | 164 | } |
| @@ -161,22 +173,23 @@ void sgiwd93_reset(unsigned long base) | |||
| 161 | } | 173 | } |
| 162 | EXPORT_SYMBOL_GPL(sgiwd93_reset); | 174 | EXPORT_SYMBOL_GPL(sgiwd93_reset); |
| 163 | 175 | ||
| 164 | static inline void init_hpc_chain(struct hpc_data *hd) | 176 | static inline void init_hpc_chain(struct ip22_hostdata *hdata) |
| 165 | { | 177 | { |
| 166 | struct hpc_chunk *hcp = (struct hpc_chunk *) hd->cpu; | 178 | struct hpc_chunk *hcp = (struct hpc_chunk *)hdata->cpu; |
| 167 | struct hpc_chunk *dma = (struct hpc_chunk *) hd->dma; | 179 | dma_addr_t dma = hdata->dma; |
| 168 | unsigned long start, end; | 180 | unsigned long start, end; |
| 169 | 181 | ||
| 170 | start = (unsigned long) hcp; | 182 | start = (unsigned long) hcp; |
| 171 | end = start + PAGE_SIZE; | 183 | end = start + HPC_DMA_SIZE; |
| 172 | while (start < end) { | 184 | while (start < end) { |
| 173 | hcp->desc.pnext = (u32) (dma + 1); | 185 | hcp->desc.pnext = (u32) (dma + sizeof(struct hpc_chunk)); |
| 174 | hcp->desc.cntinfo = HPCDMA_EOX; | 186 | hcp->desc.cntinfo = HPCDMA_EOX; |
| 175 | hcp++; dma++; | 187 | hcp++; |
| 188 | dma += sizeof(struct hpc_chunk); | ||
| 176 | start += sizeof(struct hpc_chunk); | 189 | start += sizeof(struct hpc_chunk); |
| 177 | }; | 190 | }; |
| 178 | hcp--; | 191 | hcp--; |
| 179 | hcp->desc.pnext = hd->dma; | 192 | hcp->desc.pnext = hdata->dma; |
| 180 | } | 193 | } |
| 181 | 194 | ||
| 182 | static int sgiwd93_bus_reset(struct scsi_cmnd *cmd) | 195 | static int sgiwd93_bus_reset(struct scsi_cmnd *cmd) |
| @@ -235,16 +248,17 @@ static int __init sgiwd93_probe(struct platform_device *pdev) | |||
| 235 | host->irq = irq; | 248 | host->irq = irq; |
| 236 | 249 | ||
| 237 | hdata = host_to_hostdata(host); | 250 | hdata = host_to_hostdata(host); |
| 238 | hdata->hd.cpu = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, | 251 | hdata->dev = &pdev->dev; |
| 239 | &hdata->hd.dma, GFP_KERNEL); | 252 | hdata->cpu = dma_alloc_noncoherent(&pdev->dev, HPC_DMA_SIZE, |
| 240 | if (!hdata->hd.cpu) { | 253 | &hdata->dma, GFP_KERNEL); |
| 254 | if (!hdata->cpu) { | ||
| 241 | printk(KERN_WARNING "sgiwd93: Could not allocate memory for " | 255 | printk(KERN_WARNING "sgiwd93: Could not allocate memory for " |
| 242 | "host %d buffer.\n", unit); | 256 | "host %d buffer.\n", unit); |
| 243 | err = -ENOMEM; | 257 | err = -ENOMEM; |
| 244 | goto out_put; | 258 | goto out_put; |
| 245 | } | 259 | } |
| 246 | 260 | ||
| 247 | init_hpc_chain(&hdata->hd); | 261 | init_hpc_chain(hdata); |
| 248 | 262 | ||
| 249 | regs.SASR = wdregs + 3; | 263 | regs.SASR = wdregs + 3; |
| 250 | regs.SCMD = wdregs + 7; | 264 | regs.SCMD = wdregs + 7; |
| @@ -274,7 +288,7 @@ static int __init sgiwd93_probe(struct platform_device *pdev) | |||
| 274 | out_irq: | 288 | out_irq: |
| 275 | free_irq(irq, host); | 289 | free_irq(irq, host); |
| 276 | out_free: | 290 | out_free: |
| 277 | dma_free_coherent(NULL, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); | 291 | dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma); |
| 278 | out_put: | 292 | out_put: |
| 279 | scsi_host_put(host); | 293 | scsi_host_put(host); |
| 280 | out: | 294 | out: |
| @@ -290,7 +304,7 @@ static void __exit sgiwd93_remove(struct platform_device *pdev) | |||
| 290 | 304 | ||
| 291 | scsi_remove_host(host); | 305 | scsi_remove_host(host); |
| 292 | free_irq(pd->irq, host); | 306 | free_irq(pd->irq, host); |
| 293 | dma_free_coherent(&pdev->dev, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); | 307 | dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma); |
| 294 | scsi_host_put(host); | 308 | scsi_host_put(host); |
| 295 | } | 309 | } |
| 296 | 310 | ||
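The init_hpc_chain() rewrite fixes a type confusion: the old code cast the bus address to a struct hpc_chunk * and did pointer arithmetic on it, which both assumes dma_addr_t fits in a pointer and invites the classic one-struct-versus-one-byte increment mix-up. The new code keeps the bus address as an integer and advances it by explicit byte counts. A minimal sketch of the corrected pattern (the HPCDMA_EOX flag comes from the driver context; the rest is illustrative):

	struct desc { u32 pnext; u32 cntinfo; };

	static void init_chain(struct desc *cpu, dma_addr_t dma, size_t size)
	{
		struct desc *d = cpu;
		struct desc *end = cpu + size / sizeof(*d);

		for (; d < end; d++) {
			dma += sizeof(*d);	/* bus address of the next descriptor */
			d->pnext = (u32) dma;
			d->cntinfo = HPCDMA_EOX;
		}
		(d - 1)->pnext = (u32) (dma - size);	/* close the ring */
	}

Note the probe path also switches to dma_alloc_noncoherent() with a real struct device, pairing with the dma_cache_sync() call added in fill_hpc_entries().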
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 1fcee16fa36d..50ba49250203 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c | |||
| @@ -231,7 +231,7 @@ out: | |||
| 231 | static int sr_done(struct scsi_cmnd *SCpnt) | 231 | static int sr_done(struct scsi_cmnd *SCpnt) |
| 232 | { | 232 | { |
| 233 | int result = SCpnt->result; | 233 | int result = SCpnt->result; |
| 234 | int this_count = SCpnt->request_bufflen; | 234 | int this_count = scsi_bufflen(SCpnt); |
| 235 | int good_bytes = (result == 0 ? this_count : 0); | 235 | int good_bytes = (result == 0 ? this_count : 0); |
| 236 | int block_sectors = 0; | 236 | int block_sectors = 0; |
| 237 | long error_sector; | 237 | long error_sector; |
| @@ -379,17 +379,18 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) | |||
| 379 | } | 379 | } |
| 380 | 380 | ||
| 381 | { | 381 | { |
| 382 | struct scatterlist *sg = SCpnt->request_buffer; | 382 | struct scatterlist *sg; |
| 383 | int i, size = 0; | 383 | int i, size = 0, sg_count = scsi_sg_count(SCpnt); |
| 384 | for (i = 0; i < SCpnt->use_sg; i++) | ||
| 385 | size += sg[i].length; | ||
| 386 | 384 | ||
| 387 | if (size != SCpnt->request_bufflen && SCpnt->use_sg) { | 385 | scsi_for_each_sg(SCpnt, sg, sg_count, i) |
| 386 | size += sg->length; | ||
| 387 | |||
| 388 | if (size != scsi_bufflen(SCpnt)) { | ||
| 388 | scmd_printk(KERN_ERR, SCpnt, | 389 | scmd_printk(KERN_ERR, SCpnt, |
| 389 | "mismatch count %d, bytes %d\n", | 390 | "mismatch count %d, bytes %d\n", |
| 390 | size, SCpnt->request_bufflen); | 391 | size, scsi_bufflen(SCpnt)); |
| 391 | if (SCpnt->request_bufflen > size) | 392 | if (scsi_bufflen(SCpnt) > size) |
| 392 | SCpnt->request_bufflen = size; | 393 | SCpnt->sdb.length = size; |
| 393 | } | 394 | } |
| 394 | } | 395 | } |
| 395 | 396 | ||
| @@ -397,12 +398,12 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) | |||
| 397 | * request doesn't start on hw block boundary, add scatter pads | 398 | * request doesn't start on hw block boundary, add scatter pads |
| 398 | */ | 399 | */ |
| 399 | if (((unsigned int)rq->sector % (s_size >> 9)) || | 400 | if (((unsigned int)rq->sector % (s_size >> 9)) || |
| 400 | (SCpnt->request_bufflen % s_size)) { | 401 | (scsi_bufflen(SCpnt) % s_size)) { |
| 401 | scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n"); | 402 | scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n"); |
| 402 | goto out; | 403 | goto out; |
| 403 | } | 404 | } |
| 404 | 405 | ||
| 405 | this_count = (SCpnt->request_bufflen >> 9) / (s_size >> 9); | 406 | this_count = (scsi_bufflen(SCpnt) >> 9) / (s_size >> 9); |
| 406 | 407 | ||
| 407 | 408 | ||
| 408 | SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n", | 409 | SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n", |
| @@ -416,7 +417,7 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) | |||
| 416 | 417 | ||
| 417 | if (this_count > 0xffff) { | 418 | if (this_count > 0xffff) { |
| 418 | this_count = 0xffff; | 419 | this_count = 0xffff; |
| 419 | SCpnt->request_bufflen = this_count * s_size; | 420 | SCpnt->sdb.length = this_count * s_size; |
| 420 | } | 421 | } |
| 421 | 422 | ||
| 422 | SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff; | 423 | SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff; |
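sr_prep_fn()'s length check now walks the scatterlist with scsi_for_each_sg() instead of indexing sg[i]. That matters once scatterlists can be chained: the entries are no longer one contiguous array, and sg[i] silently walks past a chain boundary. A minimal sketch of the iterator pattern:

	/* Sum the mapped transfer length; the iterator follows chain
	 * links via sg_next(), which plain array indexing cannot. */
	static unsigned total_len(struct scsi_cmnd *cmd)
	{
		struct scatterlist *sg;
		unsigned len = 0;
		int i;

		scsi_for_each_sg(cmd, sg, scsi_sg_count(cmd), i)
			len += sg->length;
		return len;
	}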
diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c index e3fab3a6aed7..72f6d8015358 100644 --- a/drivers/scsi/stex.c +++ b/drivers/scsi/stex.c | |||
| @@ -1123,7 +1123,6 @@ static struct scsi_host_template driver_template = { | |||
| 1123 | .this_id = -1, | 1123 | .this_id = -1, |
| 1124 | .sg_tablesize = ST_MAX_SG, | 1124 | .sg_tablesize = ST_MAX_SG, |
| 1125 | .cmd_per_lun = ST_CMD_PER_LUN, | 1125 | .cmd_per_lun = ST_CMD_PER_LUN, |
| 1126 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1127 | }; | 1126 | }; |
| 1128 | 1127 | ||
| 1129 | static int stex_set_dma_mask(struct pci_dev * pdev) | 1128 | static int stex_set_dma_mask(struct pci_dev * pdev) |
diff --git a/drivers/scsi/sym53c416.c b/drivers/scsi/sym53c416.c index 1f6fd1680335..6325901e5093 100644 --- a/drivers/scsi/sym53c416.c +++ b/drivers/scsi/sym53c416.c | |||
| @@ -840,6 +840,5 @@ static struct scsi_host_template driver_template = { | |||
| 840 | .cmd_per_lun = 1, | 840 | .cmd_per_lun = 1, |
| 841 | .unchecked_isa_dma = 1, | 841 | .unchecked_isa_dma = 1, |
| 842 | .use_clustering = ENABLE_CLUSTERING, | 842 | .use_clustering = ENABLE_CLUSTERING, |
| 843 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 844 | }; | 843 | }; |
| 845 | #include "scsi_module.c" | 844 | #include "scsi_module.c" |
diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c index 21e926dcdab0..d39107b7669b 100644 --- a/drivers/scsi/sym53c8xx_2/sym_glue.c +++ b/drivers/scsi/sym53c8xx_2/sym_glue.c | |||
| @@ -207,7 +207,7 @@ void sym_set_cam_result_error(struct sym_hcb *np, struct sym_ccb *cp, int resid) | |||
| 207 | /* | 207 | /* |
| 208 | * Bounce back the sense data to user. | 208 | * Bounce back the sense data to user. |
| 209 | */ | 209 | */ |
| 210 | memset(&cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); | 210 | memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); |
| 211 | memcpy(cmd->sense_buffer, cp->sns_bbuf, | 211 | memcpy(cmd->sense_buffer, cp->sns_bbuf, |
| 212 | min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN)); | 212 | min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN)); |
| 213 | #if 0 | 213 | #if 0 |
| @@ -1681,7 +1681,6 @@ static struct scsi_host_template sym2_template = { | |||
| 1681 | .eh_host_reset_handler = sym53c8xx_eh_host_reset_handler, | 1681 | .eh_host_reset_handler = sym53c8xx_eh_host_reset_handler, |
| 1682 | .this_id = 7, | 1682 | .this_id = 7, |
| 1683 | .use_clustering = ENABLE_CLUSTERING, | 1683 | .use_clustering = ENABLE_CLUSTERING, |
| 1684 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1685 | .max_sectors = 0xFFFF, | 1684 | .max_sectors = 0xFFFF, |
| 1686 | #ifdef SYM_LINUX_PROC_INFO_SUPPORT | 1685 | #ifdef SYM_LINUX_PROC_INFO_SUPPORT |
| 1687 | .proc_info = sym53c8xx_proc_info, | 1686 | .proc_info = sym53c8xx_proc_info, |
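The one-character memset() fix in sym_set_cam_result_error() is a bug class worth spelling out: sense_buffer is becoming a pointer rather than an embedded array in this series, so &cmd->sense_buffer now takes the address of the pointer itself and the memset scribbles over neighboring struct fields instead of the sense data. A minimal sketch, with illustrative struct names:

	struct cmd_old { unsigned char buf[96]; };	/* &c->buf aliases the data    */
	struct cmd_new { unsigned char *buf; };		/* &c->buf is the pointer slot */

	static void clear_sense(struct cmd_new *c)
	{
		memset(c->buf, 0, 96);		/* right: zero the pointed-to buffer */
		/* memset(&c->buf, 0, 96);	   wrong: corrupts the containing struct */
	}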
diff --git a/drivers/scsi/u14-34f.c b/drivers/scsi/u14-34f.c index 4bc5407f9695..662c00451be4 100644 --- a/drivers/scsi/u14-34f.c +++ b/drivers/scsi/u14-34f.c | |||
| @@ -451,7 +451,6 @@ static struct scsi_host_template driver_template = { | |||
| 451 | .this_id = 7, | 451 | .this_id = 7, |
| 452 | .unchecked_isa_dma = 1, | 452 | .unchecked_isa_dma = 1, |
| 453 | .use_clustering = ENABLE_CLUSTERING, | 453 | .use_clustering = ENABLE_CLUSTERING, |
| 454 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 455 | }; | 454 | }; |
| 456 | 455 | ||
| 457 | #if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) | 456 | #if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) |
diff --git a/drivers/scsi/ultrastor.c b/drivers/scsi/ultrastor.c index 75eca6b22db5..f385dce8dfbe 100644 --- a/drivers/scsi/ultrastor.c +++ b/drivers/scsi/ultrastor.c | |||
| @@ -1204,6 +1204,5 @@ static struct scsi_host_template driver_template = { | |||
| 1204 | .cmd_per_lun = ULTRASTOR_MAX_CMDS_PER_LUN, | 1204 | .cmd_per_lun = ULTRASTOR_MAX_CMDS_PER_LUN, |
| 1205 | .unchecked_isa_dma = 1, | 1205 | .unchecked_isa_dma = 1, |
| 1206 | .use_clustering = ENABLE_CLUSTERING, | 1206 | .use_clustering = ENABLE_CLUSTERING, |
| 1207 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1208 | }; | 1207 | }; |
| 1209 | #include "scsi_module.c" | 1208 | #include "scsi_module.c" |
diff --git a/drivers/scsi/wd7000.c b/drivers/scsi/wd7000.c index b4304ae78527..c975c01b3a02 100644 --- a/drivers/scsi/wd7000.c +++ b/drivers/scsi/wd7000.c | |||
| @@ -1671,7 +1671,6 @@ static struct scsi_host_template driver_template = { | |||
| 1671 | .cmd_per_lun = 1, | 1671 | .cmd_per_lun = 1, |
| 1672 | .unchecked_isa_dma = 1, | 1672 | .unchecked_isa_dma = 1, |
| 1673 | .use_clustering = ENABLE_CLUSTERING, | 1673 | .use_clustering = ENABLE_CLUSTERING, |
| 1674 | .use_sg_chaining = ENABLE_SG_CHAINING, | ||
| 1675 | }; | 1674 | }; |
| 1676 | 1675 | ||
| 1677 | #include "scsi_module.c" | 1676 | #include "scsi_module.c" |
diff --git a/drivers/usb/storage/isd200.c b/drivers/usb/storage/isd200.c index 178e8c2a8a2f..0db488624ab1 100644 --- a/drivers/usb/storage/isd200.c +++ b/drivers/usb/storage/isd200.c | |||
| @@ -415,14 +415,14 @@ static void isd200_set_srb(struct isd200_info *info, | |||
| 415 | sg_init_one(&info->sg, buff, bufflen); | 415 | sg_init_one(&info->sg, buff, bufflen); |
| 416 | 416 | ||
| 417 | srb->sc_data_direction = dir; | 417 | srb->sc_data_direction = dir; |
| 418 | srb->request_buffer = buff ? &info->sg : NULL; | 418 | srb->sdb.table.sgl = buff ? &info->sg : NULL; |
| 419 | srb->request_bufflen = bufflen; | 419 | srb->sdb.length = bufflen; |
| 420 | srb->use_sg = buff ? 1 : 0; | 420 | srb->sdb.table.nents = buff ? 1 : 0; |
| 421 | } | 421 | } |
| 422 | 422 | ||
| 423 | static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen) | 423 | static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen) |
| 424 | { | 424 | { |
| 425 | srb->request_bufflen = bufflen; | 425 | srb->sdb.length = bufflen; |
| 426 | } | 426 | } |
| 427 | 427 | ||
| 428 | 428 | ||
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 899fc13d0612..afcdc69e37d6 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig | |||
| @@ -609,7 +609,7 @@ config SBC_EPX_C3_WATCHDOG | |||
| 609 | 609 | ||
| 610 | config INDYDOG | 610 | config INDYDOG |
| 611 | tristate "Indy/I2 Hardware Watchdog" | 611 | tristate "Indy/I2 Hardware Watchdog" |
| 612 | depends on SGI_IP22 | 612 | depends on SGI_HAS_INDYDOG |
| 613 | help | 613 | help |
| 614 | Hardware driver for the Indy's/I2's watchdog. This is a | 614 | Hardware driver for the Indy's/I2's watchdog. This is a |
| 615 | watchdog timer that will reboot the machine after a 60 second | 615 | watchdog timer that will reboot the machine after a 60 second |
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 46754553fdcc..ff97ba924333 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c | |||
| @@ -49,7 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len) | |||
| 49 | spin_unlock(&ls->ls_recover_list_lock); | 49 | spin_unlock(&ls->ls_recover_list_lock); |
| 50 | 50 | ||
| 51 | if (!found) | 51 | if (!found) |
| 52 | de = allocate_direntry(ls, len); | 52 | de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL); |
| 53 | return de; | 53 | return de; |
| 54 | } | 54 | } |
| 55 | 55 | ||
| @@ -62,7 +62,7 @@ void dlm_clear_free_entries(struct dlm_ls *ls) | |||
| 62 | de = list_entry(ls->ls_recover_list.next, struct dlm_direntry, | 62 | de = list_entry(ls->ls_recover_list.next, struct dlm_direntry, |
| 63 | list); | 63 | list); |
| 64 | list_del(&de->list); | 64 | list_del(&de->list); |
| 65 | free_direntry(de); | 65 | kfree(de); |
| 66 | } | 66 | } |
| 67 | spin_unlock(&ls->ls_recover_list_lock); | 67 | spin_unlock(&ls->ls_recover_list_lock); |
| 68 | } | 68 | } |
| @@ -171,7 +171,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen | |||
| 171 | } | 171 | } |
| 172 | 172 | ||
| 173 | list_del(&de->list); | 173 | list_del(&de->list); |
| 174 | free_direntry(de); | 174 | kfree(de); |
| 175 | out: | 175 | out: |
| 176 | write_unlock(&ls->ls_dirtbl[bucket].lock); | 176 | write_unlock(&ls->ls_dirtbl[bucket].lock); |
| 177 | } | 177 | } |
| @@ -302,7 +302,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name, | |||
| 302 | 302 | ||
| 303 | write_unlock(&ls->ls_dirtbl[bucket].lock); | 303 | write_unlock(&ls->ls_dirtbl[bucket].lock); |
| 304 | 304 | ||
| 305 | de = allocate_direntry(ls, namelen); | 305 | de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL); |
| 306 | if (!de) | 306 | if (!de) |
| 307 | return -ENOMEM; | 307 | return -ENOMEM; |
| 308 | 308 | ||
| @@ -313,7 +313,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name, | |||
| 313 | write_lock(&ls->ls_dirtbl[bucket].lock); | 313 | write_lock(&ls->ls_dirtbl[bucket].lock); |
| 314 | tmp = search_bucket(ls, name, namelen, bucket); | 314 | tmp = search_bucket(ls, name, namelen, bucket); |
| 315 | if (tmp) { | 315 | if (tmp) { |
| 316 | free_direntry(de); | 316 | kfree(de); |
| 317 | de = tmp; | 317 | de = tmp; |
| 318 | } else { | 318 | } else { |
| 319 | list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); | 319 | list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); |
| @@ -329,49 +329,47 @@ int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen, | |||
| 329 | return get_entry(ls, nodeid, name, namelen, r_nodeid); | 329 | return get_entry(ls, nodeid, name, namelen, r_nodeid); |
| 330 | } | 330 | } |
| 331 | 331 | ||
| 332 | /* Copy the names of master rsb's into the buffer provided. | 332 | static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) |
| 333 | Only select names whose dir node is the given nodeid. */ | 333 | { |
| 334 | struct dlm_rsb *r; | ||
| 335 | |||
| 336 | down_read(&ls->ls_root_sem); | ||
| 337 | list_for_each_entry(r, &ls->ls_root_list, res_root_list) { | ||
| 338 | if (len == r->res_length && !memcmp(name, r->res_name, len)) { | ||
| 339 | up_read(&ls->ls_root_sem); | ||
| 340 | return r; | ||
| 341 | } | ||
| 342 | } | ||
| 343 | up_read(&ls->ls_root_sem); | ||
| 344 | return NULL; | ||
| 345 | } | ||
| 346 | |||
| 347 | /* Find the rsb where we left off (or start again), then send rsb names | ||
| 348 | for rsb's we're master of and whose directory node matches the requesting | ||
| 349 | node. inbuf is the rsb name last sent, inlen is the name's length */ | ||
| 334 | 350 | ||
| 335 | void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, | 351 | void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, |
| 336 | char *outbuf, int outlen, int nodeid) | 352 | char *outbuf, int outlen, int nodeid) |
| 337 | { | 353 | { |
| 338 | struct list_head *list; | 354 | struct list_head *list; |
| 339 | struct dlm_rsb *start_r = NULL, *r = NULL; | 355 | struct dlm_rsb *r; |
| 340 | int offset = 0, start_namelen, error, dir_nodeid; | 356 | int offset = 0, dir_nodeid; |
| 341 | char *start_name; | ||
| 342 | uint16_t be_namelen; | 357 | uint16_t be_namelen; |
| 343 | 358 | ||
| 344 | /* | ||
| 345 | * Find the rsb where we left off (or start again) | ||
| 346 | */ | ||
| 347 | |||
| 348 | start_namelen = inlen; | ||
| 349 | start_name = inbuf; | ||
| 350 | |||
| 351 | if (start_namelen > 1) { | ||
| 352 | /* | ||
| 353 | * We could also use a find_rsb_root() function here that | ||
| 354 | * searched the ls_root_list. | ||
| 355 | */ | ||
| 356 | error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER, | ||
| 357 | &start_r); | ||
| 358 | DLM_ASSERT(!error && start_r, | ||
| 359 | printk("error %d\n", error);); | ||
| 360 | DLM_ASSERT(!list_empty(&start_r->res_root_list), | ||
| 361 | dlm_print_rsb(start_r);); | ||
| 362 | dlm_put_rsb(start_r); | ||
| 363 | } | ||
| 364 | |||
| 365 | /* | ||
| 366 | * Send rsb names for rsb's we're master of and whose directory node | ||
| 367 | * matches the requesting node. | ||
| 368 | */ | ||
| 369 | |||
| 370 | down_read(&ls->ls_root_sem); | 359 | down_read(&ls->ls_root_sem); |
| 371 | if (start_r) | 360 | |
| 372 | list = start_r->res_root_list.next; | 361 | if (inlen > 1) { |
| 373 | else | 362 | r = find_rsb_root(ls, inbuf, inlen); |
| 363 | if (!r) { | ||
| 364 | inbuf[inlen - 1] = '\0'; | ||
| 365 | log_error(ls, "copy_master_names from %d start %d %s", | ||
| 366 | nodeid, inlen, inbuf); | ||
| 367 | goto out; | ||
| 368 | } | ||
| 369 | list = r->res_root_list.next; | ||
| 370 | } else { | ||
| 374 | list = ls->ls_root_list.next; | 371 | list = ls->ls_root_list.next; |
| 372 | } | ||
| 375 | 373 | ||
| 376 | for (offset = 0; list != &ls->ls_root_list; list = list->next) { | 374 | for (offset = 0; list != &ls->ls_root_list; list = list->next) { |
| 377 | r = list_entry(list, struct dlm_rsb, res_root_list); | 375 | r = list_entry(list, struct dlm_rsb, res_root_list); |
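dlm_copy_master_names() now resumes iteration by re-looking up the last name it sent: find_rsb_root() is a plain linear scan of ls_root_list under ls_root_sem, and a missing cursor is logged and aborted rather than asserted on. A hedged sketch of the cursor-by-name resume step; the helper shape follows the hunk, while the wrapper is illustrative:

	/* Return where to continue: the entry after the cursor name,
	 * or NULL if the cursor vanished between batches. */
	static struct list_head *resume_point(struct dlm_ls *ls,
					      char *name, int len)
	{
		struct dlm_rsb *r = find_rsb_root(ls, name, len);

		if (!r)
			return NULL;	/* caller logs and gives up */
		return r->res_root_list.next;
	}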
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index d2fc2384c3be..ec61bbaf25df 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h | |||
| @@ -570,5 +570,21 @@ static inline int dlm_no_directory(struct dlm_ls *ls) | |||
| 570 | return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0; | 570 | return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0; |
| 571 | } | 571 | } |
| 572 | 572 | ||
| 573 | int dlm_netlink_init(void); | ||
| 574 | void dlm_netlink_exit(void); | ||
| 575 | void dlm_timeout_warn(struct dlm_lkb *lkb); | ||
| 576 | |||
| 577 | #ifdef CONFIG_DLM_DEBUG | ||
| 578 | int dlm_register_debugfs(void); | ||
| 579 | void dlm_unregister_debugfs(void); | ||
| 580 | int dlm_create_debug_file(struct dlm_ls *ls); | ||
| 581 | void dlm_delete_debug_file(struct dlm_ls *ls); | ||
| 582 | #else | ||
| 583 | static inline int dlm_register_debugfs(void) { return 0; } | ||
| 584 | static inline void dlm_unregister_debugfs(void) { } | ||
| 585 | static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; } | ||
| 586 | static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } | ||
| 587 | #endif | ||
| 588 | |||
| 573 | #endif /* __DLM_INTERNAL_DOT_H__ */ | 589 | #endif /* __DLM_INTERNAL_DOT_H__ */ |
| 574 | 590 | ||
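The header additions follow the standard config-stub idiom: when CONFIG_DLM_DEBUG is off, the debugfs entry points collapse into static inline no-ops, so call sites stay free of #ifdefs and the compiler drops the calls entirely. The shape, reduced to a minimal sketch:

	#ifdef CONFIG_MYFEATURE
	int myfeature_register(void);
	void myfeature_unregister(void);
	#else
	static inline int myfeature_register(void) { return 0; }
	static inline void myfeature_unregister(void) { }
	#endif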
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 3915b8e14146..ff4a198fa677 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /****************************************************************************** | 1 | /****************************************************************************** |
| 2 | ******************************************************************************* | 2 | ******************************************************************************* |
| 3 | ** | 3 | ** |
| 4 | ** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. | 4 | ** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. |
| 5 | ** | 5 | ** |
| 6 | ** This copyrighted material is made available to anyone wishing to use, | 6 | ** This copyrighted material is made available to anyone wishing to use, |
| 7 | ** modify, copy, or redistribute it subject to the terms and conditions | 7 | ** modify, copy, or redistribute it subject to the terms and conditions |
| @@ -88,7 +88,6 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, | |||
| 88 | static int receive_extralen(struct dlm_message *ms); | 88 | static int receive_extralen(struct dlm_message *ms); |
| 89 | static void do_purge(struct dlm_ls *ls, int nodeid, int pid); | 89 | static void do_purge(struct dlm_ls *ls, int nodeid, int pid); |
| 90 | static void del_timeout(struct dlm_lkb *lkb); | 90 | static void del_timeout(struct dlm_lkb *lkb); |
| 91 | void dlm_timeout_warn(struct dlm_lkb *lkb); | ||
| 92 | 91 | ||
| 93 | /* | 92 | /* |
| 94 | * Lock compatibility matrix - thanks Steve | 93 |

| @@ -335,7 +334,7 @@ static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len) | |||
| 335 | { | 334 | { |
| 336 | struct dlm_rsb *r; | 335 | struct dlm_rsb *r; |
| 337 | 336 | ||
| 338 | r = allocate_rsb(ls, len); | 337 | r = dlm_allocate_rsb(ls, len); |
| 339 | if (!r) | 338 | if (!r) |
| 340 | return NULL; | 339 | return NULL; |
| 341 | 340 | ||
| @@ -478,7 +477,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, | |||
| 478 | error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); | 477 | error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); |
| 479 | if (!error) { | 478 | if (!error) { |
| 480 | write_unlock(&ls->ls_rsbtbl[bucket].lock); | 479 | write_unlock(&ls->ls_rsbtbl[bucket].lock); |
| 481 | free_rsb(r); | 480 | dlm_free_rsb(r); |
| 482 | r = tmp; | 481 | r = tmp; |
| 483 | goto out; | 482 | goto out; |
| 484 | } | 483 | } |
| @@ -490,12 +489,6 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, | |||
| 490 | return error; | 489 | return error; |
| 491 | } | 490 | } |
| 492 | 491 | ||
| 493 | int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen, | ||
| 494 | unsigned int flags, struct dlm_rsb **r_ret) | ||
| 495 | { | ||
| 496 | return find_rsb(ls, name, namelen, flags, r_ret); | ||
| 497 | } | ||
| 498 | |||
| 499 | /* This is only called to add a reference when the code already holds | 492 | /* This is only called to add a reference when the code already holds |
| 500 | a valid reference to the rsb, so there's no need for locking. */ | 493 | a valid reference to the rsb, so there's no need for locking. */ |
| 501 | 494 | ||
| @@ -519,7 +512,7 @@ static void toss_rsb(struct kref *kref) | |||
| 519 | list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); | 512 | list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); |
| 520 | r->res_toss_time = jiffies; | 513 | r->res_toss_time = jiffies; |
| 521 | if (r->res_lvbptr) { | 514 | if (r->res_lvbptr) { |
| 522 | free_lvb(r->res_lvbptr); | 515 | dlm_free_lvb(r->res_lvbptr); |
| 523 | r->res_lvbptr = NULL; | 516 | r->res_lvbptr = NULL; |
| 524 | } | 517 | } |
| 525 | } | 518 | } |
| @@ -589,7 +582,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) | |||
| 589 | uint32_t lkid = 0; | 582 | uint32_t lkid = 0; |
| 590 | uint16_t bucket; | 583 | uint16_t bucket; |
| 591 | 584 | ||
| 592 | lkb = allocate_lkb(ls); | 585 | lkb = dlm_allocate_lkb(ls); |
| 593 | if (!lkb) | 586 | if (!lkb) |
| 594 | return -ENOMEM; | 587 | return -ENOMEM; |
| 595 | 588 | ||
| @@ -683,8 +676,8 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) | |||
| 683 | 676 | ||
| 684 | /* for local/process lkbs, lvbptr points to caller's lksb */ | 677 | /* for local/process lkbs, lvbptr points to caller's lksb */ |
| 685 | if (lkb->lkb_lvbptr && is_master_copy(lkb)) | 678 | if (lkb->lkb_lvbptr && is_master_copy(lkb)) |
| 686 | free_lvb(lkb->lkb_lvbptr); | 679 | dlm_free_lvb(lkb->lkb_lvbptr); |
| 687 | free_lkb(lkb); | 680 | dlm_free_lkb(lkb); |
| 688 | return 1; | 681 | return 1; |
| 689 | } else { | 682 | } else { |
| 690 | write_unlock(&ls->ls_lkbtbl[bucket].lock); | 683 | write_unlock(&ls->ls_lkbtbl[bucket].lock); |
| @@ -988,7 +981,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b) | |||
| 988 | 981 | ||
| 989 | if (is_master(r)) | 982 | if (is_master(r)) |
| 990 | dir_remove(r); | 983 | dir_remove(r); |
| 991 | free_rsb(r); | 984 | dlm_free_rsb(r); |
| 992 | count++; | 985 | count++; |
| 993 | } else { | 986 | } else { |
| 994 | write_unlock(&ls->ls_rsbtbl[b].lock); | 987 | write_unlock(&ls->ls_rsbtbl[b].lock); |
| @@ -1171,7 +1164,7 @@ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) | |||
| 1171 | return; | 1164 | return; |
| 1172 | 1165 | ||
| 1173 | if (!r->res_lvbptr) | 1166 | if (!r->res_lvbptr) |
| 1174 | r->res_lvbptr = allocate_lvb(r->res_ls); | 1167 | r->res_lvbptr = dlm_allocate_lvb(r->res_ls); |
| 1175 | 1168 | ||
| 1176 | if (!r->res_lvbptr) | 1169 | if (!r->res_lvbptr) |
| 1177 | return; | 1170 | return; |
| @@ -1203,7 +1196,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) | |||
| 1203 | return; | 1196 | return; |
| 1204 | 1197 | ||
| 1205 | if (!r->res_lvbptr) | 1198 | if (!r->res_lvbptr) |
| 1206 | r->res_lvbptr = allocate_lvb(r->res_ls); | 1199 | r->res_lvbptr = dlm_allocate_lvb(r->res_ls); |
| 1207 | 1200 | ||
| 1208 | if (!r->res_lvbptr) | 1201 | if (!r->res_lvbptr) |
| 1209 | return; | 1202 | return; |
| @@ -1852,7 +1845,7 @@ static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb) | |||
| 1852 | static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) | 1845 | static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) |
| 1853 | { | 1846 | { |
| 1854 | struct dlm_ls *ls = r->res_ls; | 1847 | struct dlm_ls *ls = r->res_ls; |
| 1855 | int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); | 1848 | int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); |
| 1856 | 1849 | ||
| 1857 | if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { | 1850 | if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { |
| 1858 | rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); | 1851 | rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); |
| @@ -1886,7 +1879,7 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) | |||
| 1886 | return 1; | 1879 | return 1; |
| 1887 | } | 1880 | } |
| 1888 | 1881 | ||
| 1889 | for (;;) { | 1882 | for (i = 0; i < 2; i++) { |
| 1890 | /* It's possible for dlm_scand to remove an old rsb for | 1883 | /* It's possible for dlm_scand to remove an old rsb for |
| 1891 | this same resource from the toss list, us to create | 1884 | this same resource from the toss list, us to create |
| 1892 | a new one, look up the master locally, and find it | 1885 | a new one, look up the master locally, and find it |
| @@ -1900,6 +1893,8 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) | |||
| 1900 | log_debug(ls, "dir_lookup error %d %s", error, r->res_name); | 1893 | log_debug(ls, "dir_lookup error %d %s", error, r->res_name); |
| 1901 | schedule(); | 1894 | schedule(); |
| 1902 | } | 1895 | } |
| 1896 | if (error && error != -EEXIST) | ||
| 1897 | return error; | ||
| 1903 | 1898 | ||
| 1904 | if (ret_nodeid == our_nodeid) { | 1899 | if (ret_nodeid == our_nodeid) { |
| 1905 | r->res_first_lkid = 0; | 1900 | r->res_first_lkid = 0; |
| @@ -1941,8 +1936,11 @@ static void confirm_master(struct dlm_rsb *r, int error) | |||
| 1941 | break; | 1936 | break; |
| 1942 | 1937 | ||
| 1943 | case -EAGAIN: | 1938 | case -EAGAIN: |
| 1944 | /* the remote master didn't queue our NOQUEUE request; | 1939 | case -EBADR: |
| 1945 | make a waiting lkb the first_lkid */ | 1940 | case -ENOTBLK: |
| 1941 | /* the remote request failed and won't be retried (it was | ||
| 1942 | a NOQUEUE, or has been canceled/unlocked); make a waiting | ||
| 1943 | lkb the first_lkid */ | ||
| 1946 | 1944 | ||
| 1947 | r->res_first_lkid = 0; | 1945 | r->res_first_lkid = 0; |
| 1948 | 1946 | ||
| @@ -2108,17 +2106,18 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) | |||
| 2108 | /* an lkb may be waiting for an rsb lookup to complete where the | 2106 | /* an lkb may be waiting for an rsb lookup to complete where the |
| 2109 | lookup was initiated by another lock */ | 2107 | lookup was initiated by another lock */ |
| 2110 | 2108 | ||
| 2111 | if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { | 2109 | if (!list_empty(&lkb->lkb_rsb_lookup)) { |
| 2112 | if (!list_empty(&lkb->lkb_rsb_lookup)) { | 2110 | if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { |
| 2113 | log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id); | 2111 | log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id); |
| 2114 | list_del_init(&lkb->lkb_rsb_lookup); | 2112 | list_del_init(&lkb->lkb_rsb_lookup); |
| 2115 | queue_cast(lkb->lkb_resource, lkb, | 2113 | queue_cast(lkb->lkb_resource, lkb, |
| 2116 | args->flags & DLM_LKF_CANCEL ? | 2114 | args->flags & DLM_LKF_CANCEL ? |
| 2117 | -DLM_ECANCEL : -DLM_EUNLOCK); | 2115 | -DLM_ECANCEL : -DLM_EUNLOCK); |
| 2118 | unhold_lkb(lkb); /* undoes create_lkb() */ | 2116 | unhold_lkb(lkb); /* undoes create_lkb() */ |
| 2119 | rv = -EBUSY; | ||
| 2120 | goto out; | ||
| 2121 | } | 2117 | } |
| 2118 | /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */ | ||
| 2119 | rv = -EBUSY; | ||
| 2120 | goto out; | ||
| 2122 | } | 2121 | } |
| 2123 | 2122 | ||
| 2124 | /* cancel not allowed with another cancel/unlock in progress */ | 2123 | /* cancel not allowed with another cancel/unlock in progress */ |
| @@ -2986,7 +2985,7 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, | |||
| 2986 | 2985 | ||
| 2987 | if (lkb->lkb_exflags & DLM_LKF_VALBLK) { | 2986 | if (lkb->lkb_exflags & DLM_LKF_VALBLK) { |
| 2988 | if (!lkb->lkb_lvbptr) | 2987 | if (!lkb->lkb_lvbptr) |
| 2989 | lkb->lkb_lvbptr = allocate_lvb(ls); | 2988 | lkb->lkb_lvbptr = dlm_allocate_lvb(ls); |
| 2990 | if (!lkb->lkb_lvbptr) | 2989 | if (!lkb->lkb_lvbptr) |
| 2991 | return -ENOMEM; | 2990 | return -ENOMEM; |
| 2992 | len = receive_extralen(ms); | 2991 | len = receive_extralen(ms); |
| @@ -3006,11 +3005,9 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, | |||
| 3006 | lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST); | 3005 | lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST); |
| 3007 | lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP); | 3006 | lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP); |
| 3008 | 3007 | ||
| 3009 | DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb);); | ||
| 3010 | |||
| 3011 | if (lkb->lkb_exflags & DLM_LKF_VALBLK) { | 3008 | if (lkb->lkb_exflags & DLM_LKF_VALBLK) { |
| 3012 | /* lkb was just created so there won't be an lvb yet */ | 3009 | /* lkb was just created so there won't be an lvb yet */ |
| 3013 | lkb->lkb_lvbptr = allocate_lvb(ls); | 3010 | lkb->lkb_lvbptr = dlm_allocate_lvb(ls); |
| 3014 | if (!lkb->lkb_lvbptr) | 3011 | if (!lkb->lkb_lvbptr) |
| 3015 | return -ENOMEM; | 3012 | return -ENOMEM; |
| 3016 | } | 3013 | } |
| @@ -3021,16 +3018,6 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, | |||
| 3021 | static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, | 3018 | static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, |
| 3022 | struct dlm_message *ms) | 3019 | struct dlm_message *ms) |
| 3023 | { | 3020 | { |
| 3024 | if (lkb->lkb_nodeid != ms->m_header.h_nodeid) { | ||
| 3025 | log_error(ls, "convert_args nodeid %d %d lkid %x %x", | ||
| 3026 | lkb->lkb_nodeid, ms->m_header.h_nodeid, | ||
| 3027 | lkb->lkb_id, lkb->lkb_remid); | ||
| 3028 | return -EINVAL; | ||
| 3029 | } | ||
| 3030 | |||
| 3031 | if (!is_master_copy(lkb)) | ||
| 3032 | return -EINVAL; | ||
| 3033 | |||
| 3034 | if (lkb->lkb_status != DLM_LKSTS_GRANTED) | 3021 | if (lkb->lkb_status != DLM_LKSTS_GRANTED) |
| 3035 | return -EBUSY; | 3022 | return -EBUSY; |
| 3036 | 3023 | ||
| @@ -3046,8 +3033,6 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, | |||
| 3046 | static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, | 3033 | static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, |
| 3047 | struct dlm_message *ms) | 3034 | struct dlm_message *ms) |
| 3048 | { | 3035 | { |
| 3049 | if (!is_master_copy(lkb)) | ||
| 3050 | return -EINVAL; | ||
| 3051 | if (receive_lvb(ls, lkb, ms)) | 3036 | if (receive_lvb(ls, lkb, ms)) |
| 3052 | return -ENOMEM; | 3037 | return -ENOMEM; |
| 3053 | return 0; | 3038 | return 0; |
| @@ -3063,6 +3048,50 @@ static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms) | |||
| 3063 | lkb->lkb_remid = ms->m_lkid; | 3048 | lkb->lkb_remid = ms->m_lkid; |
| 3064 | } | 3049 | } |
| 3065 | 3050 | ||
| 3051 | /* This is called after the rsb is locked so that we can safely inspect | ||
| 3052 | fields in the lkb. */ | ||
| 3053 | |||
| 3054 | static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) | ||
| 3055 | { | ||
| 3056 | int from = ms->m_header.h_nodeid; | ||
| 3057 | int error = 0; | ||
| 3058 | |||
| 3059 | switch (ms->m_type) { | ||
| 3060 | case DLM_MSG_CONVERT: | ||
| 3061 | case DLM_MSG_UNLOCK: | ||
| 3062 | case DLM_MSG_CANCEL: | ||
| 3063 | if (!is_master_copy(lkb) || lkb->lkb_nodeid != from) | ||
| 3064 | error = -EINVAL; | ||
| 3065 | break; | ||
| 3066 | |||
| 3067 | case DLM_MSG_CONVERT_REPLY: | ||
| 3068 | case DLM_MSG_UNLOCK_REPLY: | ||
| 3069 | case DLM_MSG_CANCEL_REPLY: | ||
| 3070 | case DLM_MSG_GRANT: | ||
| 3071 | case DLM_MSG_BAST: | ||
| 3072 | if (!is_process_copy(lkb) || lkb->lkb_nodeid != from) | ||
| 3073 | error = -EINVAL; | ||
| 3074 | break; | ||
| 3075 | |||
| 3076 | case DLM_MSG_REQUEST_REPLY: | ||
| 3077 | if (!is_process_copy(lkb)) | ||
| 3078 | error = -EINVAL; | ||
| 3079 | else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from) | ||
| 3080 | error = -EINVAL; | ||
| 3081 | break; | ||
| 3082 | |||
| 3083 | default: | ||
| 3084 | error = -EINVAL; | ||
| 3085 | } | ||
| 3086 | |||
| 3087 | if (error) | ||
| 3088 | log_error(lkb->lkb_resource->res_ls, | ||
| 3089 | "ignore invalid message %d from %d %x %x %x %d", | ||
| 3090 | ms->m_type, from, lkb->lkb_id, lkb->lkb_remid, | ||
| 3091 | lkb->lkb_flags, lkb->lkb_nodeid); | ||
| 3092 | return error; | ||
| 3093 | } | ||
| 3094 | |||
| 3066 | static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) | 3095 | static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) |
| 3067 | { | 3096 | { |
| 3068 | struct dlm_lkb *lkb; | 3097 | struct dlm_lkb *lkb; |
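From here on, every receive path acquires the rsb lock and then runs validate_message() before acting, so a message whose sender or lkb copy type doesn't match is logged and dropped instead of tripping a DLM_ASSERT. A sketch of the common shape the following hunks converge on:

	static void receive_example(struct dlm_rsb *r, struct dlm_lkb *lkb,
				    struct dlm_message *ms)
	{
		hold_rsb(r);
		lock_rsb(r);

		if (validate_message(lkb, ms))	/* mismatched sender: drop */
			goto out;

		/* ... act on the message, reply if required ... */
	out:
		unlock_rsb(r);
		put_rsb(r);
	}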
| @@ -3124,17 +3153,21 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms) | |||
| 3124 | hold_rsb(r); | 3153 | hold_rsb(r); |
| 3125 | lock_rsb(r); | 3154 | lock_rsb(r); |
| 3126 | 3155 | ||
| 3156 | error = validate_message(lkb, ms); | ||
| 3157 | if (error) | ||
| 3158 | goto out; | ||
| 3159 | |||
| 3127 | receive_flags(lkb, ms); | 3160 | receive_flags(lkb, ms); |
| 3128 | error = receive_convert_args(ls, lkb, ms); | 3161 | error = receive_convert_args(ls, lkb, ms); |
| 3129 | if (error) | 3162 | if (error) |
| 3130 | goto out; | 3163 | goto out_reply; |
| 3131 | reply = !down_conversion(lkb); | 3164 | reply = !down_conversion(lkb); |
| 3132 | 3165 | ||
| 3133 | error = do_convert(r, lkb); | 3166 | error = do_convert(r, lkb); |
| 3134 | out: | 3167 | out_reply: |
| 3135 | if (reply) | 3168 | if (reply) |
| 3136 | send_convert_reply(r, lkb, error); | 3169 | send_convert_reply(r, lkb, error); |
| 3137 | 3170 | out: | |
| 3138 | unlock_rsb(r); | 3171 | unlock_rsb(r); |
| 3139 | put_rsb(r); | 3172 | put_rsb(r); |
| 3140 | dlm_put_lkb(lkb); | 3173 | dlm_put_lkb(lkb); |
| @@ -3160,15 +3193,19 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) | |||
| 3160 | hold_rsb(r); | 3193 | hold_rsb(r); |
| 3161 | lock_rsb(r); | 3194 | lock_rsb(r); |
| 3162 | 3195 | ||
| 3196 | error = validate_message(lkb, ms); | ||
| 3197 | if (error) | ||
| 3198 | goto out; | ||
| 3199 | |||
| 3163 | receive_flags(lkb, ms); | 3200 | receive_flags(lkb, ms); |
| 3164 | error = receive_unlock_args(ls, lkb, ms); | 3201 | error = receive_unlock_args(ls, lkb, ms); |
| 3165 | if (error) | 3202 | if (error) |
| 3166 | goto out; | 3203 | goto out_reply; |
| 3167 | 3204 | ||
| 3168 | error = do_unlock(r, lkb); | 3205 | error = do_unlock(r, lkb); |
| 3169 | out: | 3206 | out_reply: |
| 3170 | send_unlock_reply(r, lkb, error); | 3207 | send_unlock_reply(r, lkb, error); |
| 3171 | 3208 | out: | |
| 3172 | unlock_rsb(r); | 3209 | unlock_rsb(r); |
| 3173 | put_rsb(r); | 3210 | put_rsb(r); |
| 3174 | dlm_put_lkb(lkb); | 3211 | dlm_put_lkb(lkb); |
| @@ -3196,9 +3233,13 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) | |||
| 3196 | hold_rsb(r); | 3233 | hold_rsb(r); |
| 3197 | lock_rsb(r); | 3234 | lock_rsb(r); |
| 3198 | 3235 | ||
| 3236 | error = validate_message(lkb, ms); | ||
| 3237 | if (error) | ||
| 3238 | goto out; | ||
| 3239 | |||
| 3199 | error = do_cancel(r, lkb); | 3240 | error = do_cancel(r, lkb); |
| 3200 | send_cancel_reply(r, lkb, error); | 3241 | send_cancel_reply(r, lkb, error); |
| 3201 | 3242 | out: | |
| 3202 | unlock_rsb(r); | 3243 | unlock_rsb(r); |
| 3203 | put_rsb(r); | 3244 | put_rsb(r); |
| 3204 | dlm_put_lkb(lkb); | 3245 | dlm_put_lkb(lkb); |
| @@ -3217,22 +3258,26 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms) | |||
| 3217 | 3258 | ||
| 3218 | error = find_lkb(ls, ms->m_remid, &lkb); | 3259 | error = find_lkb(ls, ms->m_remid, &lkb); |
| 3219 | if (error) { | 3260 | if (error) { |
| 3220 | log_error(ls, "receive_grant no lkb"); | 3261 | log_debug(ls, "receive_grant from %d no lkb %x", |
| 3262 | ms->m_header.h_nodeid, ms->m_remid); | ||
| 3221 | return; | 3263 | return; |
| 3222 | } | 3264 | } |
| 3223 | DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); | ||
| 3224 | 3265 | ||
| 3225 | r = lkb->lkb_resource; | 3266 | r = lkb->lkb_resource; |
| 3226 | 3267 | ||
| 3227 | hold_rsb(r); | 3268 | hold_rsb(r); |
| 3228 | lock_rsb(r); | 3269 | lock_rsb(r); |
| 3229 | 3270 | ||
| 3271 | error = validate_message(lkb, ms); | ||
| 3272 | if (error) | ||
| 3273 | goto out; | ||
| 3274 | |||
| 3230 | receive_flags_reply(lkb, ms); | 3275 | receive_flags_reply(lkb, ms); |
| 3231 | if (is_altmode(lkb)) | 3276 | if (is_altmode(lkb)) |
| 3232 | munge_altmode(lkb, ms); | 3277 | munge_altmode(lkb, ms); |
| 3233 | grant_lock_pc(r, lkb, ms); | 3278 | grant_lock_pc(r, lkb, ms); |
| 3234 | queue_cast(r, lkb, 0); | 3279 | queue_cast(r, lkb, 0); |
| 3235 | 3280 | out: | |
| 3236 | unlock_rsb(r); | 3281 | unlock_rsb(r); |
| 3237 | put_rsb(r); | 3282 | put_rsb(r); |
| 3238 | dlm_put_lkb(lkb); | 3283 | dlm_put_lkb(lkb); |
| @@ -3246,18 +3291,22 @@ static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms) | |||
| 3246 | 3291 | ||
| 3247 | error = find_lkb(ls, ms->m_remid, &lkb); | 3292 | error = find_lkb(ls, ms->m_remid, &lkb); |
| 3248 | if (error) { | 3293 | if (error) { |
| 3249 | log_error(ls, "receive_bast no lkb"); | 3294 | log_debug(ls, "receive_bast from %d no lkb %x", |
| 3295 | ms->m_header.h_nodeid, ms->m_remid); | ||
| 3250 | return; | 3296 | return; |
| 3251 | } | 3297 | } |
| 3252 | DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); | ||
| 3253 | 3298 | ||
| 3254 | r = lkb->lkb_resource; | 3299 | r = lkb->lkb_resource; |
| 3255 | 3300 | ||
| 3256 | hold_rsb(r); | 3301 | hold_rsb(r); |
| 3257 | lock_rsb(r); | 3302 | lock_rsb(r); |
| 3258 | 3303 | ||
| 3259 | queue_bast(r, lkb, ms->m_bastmode); | 3304 | error = validate_message(lkb, ms); |
| 3305 | if (error) | ||
| 3306 | goto out; | ||
| 3260 | 3307 | ||
| 3308 | queue_bast(r, lkb, ms->m_bastmode); | ||
| 3309 | out: | ||
| 3261 | unlock_rsb(r); | 3310 | unlock_rsb(r); |
| 3262 | put_rsb(r); | 3311 | put_rsb(r); |
| 3263 | dlm_put_lkb(lkb); | 3312 | dlm_put_lkb(lkb); |
| @@ -3323,15 +3372,19 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) | |||
| 3323 | 3372 | ||
| 3324 | error = find_lkb(ls, ms->m_remid, &lkb); | 3373 | error = find_lkb(ls, ms->m_remid, &lkb); |
| 3325 | if (error) { | 3374 | if (error) { |
| 3326 | log_error(ls, "receive_request_reply no lkb"); | 3375 | log_debug(ls, "receive_request_reply from %d no lkb %x", |
| 3376 | ms->m_header.h_nodeid, ms->m_remid); | ||
| 3327 | return; | 3377 | return; |
| 3328 | } | 3378 | } |
| 3329 | DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); | ||
| 3330 | 3379 | ||
| 3331 | r = lkb->lkb_resource; | 3380 | r = lkb->lkb_resource; |
| 3332 | hold_rsb(r); | 3381 | hold_rsb(r); |
| 3333 | lock_rsb(r); | 3382 | lock_rsb(r); |
| 3334 | 3383 | ||
| 3384 | error = validate_message(lkb, ms); | ||
| 3385 | if (error) | ||
| 3386 | goto out; | ||
| 3387 | |||
| 3335 | mstype = lkb->lkb_wait_type; | 3388 | mstype = lkb->lkb_wait_type; |
| 3336 | error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); | 3389 | error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); |
| 3337 | if (error) | 3390 | if (error) |
| @@ -3383,6 +3436,7 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) | |||
| 3383 | if (is_overlap(lkb)) { | 3436 | if (is_overlap(lkb)) { |
| 3384 | /* we'll ignore error in cancel/unlock reply */ | 3437 | /* we'll ignore error in cancel/unlock reply */ |
| 3385 | queue_cast_overlap(r, lkb); | 3438 | queue_cast_overlap(r, lkb); |
| 3439 | confirm_master(r, result); | ||
| 3386 | unhold_lkb(lkb); /* undoes create_lkb() */ | 3440 | unhold_lkb(lkb); /* undoes create_lkb() */ |
| 3387 | } else | 3441 | } else |
| 3388 | _request_lock(r, lkb); | 3442 | _request_lock(r, lkb); |
| @@ -3463,6 +3517,10 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms) | |||
| 3463 | hold_rsb(r); | 3517 | hold_rsb(r); |
| 3464 | lock_rsb(r); | 3518 | lock_rsb(r); |
| 3465 | 3519 | ||
| 3520 | error = validate_message(lkb, ms); | ||
| 3521 | if (error) | ||
| 3522 | goto out; | ||
| 3523 | |||
| 3466 | /* stub reply can happen with waiters_mutex held */ | 3524 | /* stub reply can happen with waiters_mutex held */ |
| 3467 | error = remove_from_waiters_ms(lkb, ms); | 3525 | error = remove_from_waiters_ms(lkb, ms); |
| 3468 | if (error) | 3526 | if (error) |
| @@ -3481,10 +3539,10 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) | |||
| 3481 | 3539 | ||
| 3482 | error = find_lkb(ls, ms->m_remid, &lkb); | 3540 | error = find_lkb(ls, ms->m_remid, &lkb); |
| 3483 | if (error) { | 3541 | if (error) { |
| 3484 | log_error(ls, "receive_convert_reply no lkb"); | 3542 | log_debug(ls, "receive_convert_reply from %d no lkb %x", |
| 3543 | ms->m_header.h_nodeid, ms->m_remid); | ||
| 3485 | return; | 3544 | return; |
| 3486 | } | 3545 | } |
| 3487 | DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); | ||
| 3488 | 3546 | ||
| 3489 | _receive_convert_reply(lkb, ms); | 3547 | _receive_convert_reply(lkb, ms); |
| 3490 | dlm_put_lkb(lkb); | 3548 | dlm_put_lkb(lkb); |
| @@ -3498,6 +3556,10 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) | |||
| 3498 | hold_rsb(r); | 3556 | hold_rsb(r); |
| 3499 | lock_rsb(r); | 3557 | lock_rsb(r); |
| 3500 | 3558 | ||
| 3559 | error = validate_message(lkb, ms); | ||
| 3560 | if (error) | ||
| 3561 | goto out; | ||
| 3562 | |||
| 3501 | /* stub reply can happen with waiters_mutex held */ | 3563 | /* stub reply can happen with waiters_mutex held */ |
| 3502 | error = remove_from_waiters_ms(lkb, ms); | 3564 | error = remove_from_waiters_ms(lkb, ms); |
| 3503 | if (error) | 3565 | if (error) |
| @@ -3529,10 +3591,10 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) | |||
| 3529 | 3591 | ||
| 3530 | error = find_lkb(ls, ms->m_remid, &lkb); | 3592 | error = find_lkb(ls, ms->m_remid, &lkb); |
| 3531 | if (error) { | 3593 | if (error) { |
| 3532 | log_error(ls, "receive_unlock_reply no lkb"); | 3594 | log_debug(ls, "receive_unlock_reply from %d no lkb %x", |
| 3595 | ms->m_header.h_nodeid, ms->m_remid); | ||
| 3533 | return; | 3596 | return; |
| 3534 | } | 3597 | } |
| 3535 | DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); | ||
| 3536 | 3598 | ||
| 3537 | _receive_unlock_reply(lkb, ms); | 3599 | _receive_unlock_reply(lkb, ms); |
| 3538 | dlm_put_lkb(lkb); | 3600 | dlm_put_lkb(lkb); |
| @@ -3546,6 +3608,10 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) | |||
| 3546 | hold_rsb(r); | 3608 | hold_rsb(r); |
| 3547 | lock_rsb(r); | 3609 | lock_rsb(r); |
| 3548 | 3610 | ||
| 3611 | error = validate_message(lkb, ms); | ||
| 3612 | if (error) | ||
| 3613 | goto out; | ||
| 3614 | |||
| 3549 | /* stub reply can happen with waiters_mutex held */ | 3615 | /* stub reply can happen with waiters_mutex held */ |
| 3550 | error = remove_from_waiters_ms(lkb, ms); | 3616 | error = remove_from_waiters_ms(lkb, ms); |
| 3551 | if (error) | 3617 | if (error) |
| @@ -3577,10 +3643,10 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) | |||
| 3577 | 3643 | ||
| 3578 | error = find_lkb(ls, ms->m_remid, &lkb); | 3644 | error = find_lkb(ls, ms->m_remid, &lkb); |
| 3579 | if (error) { | 3645 | if (error) { |
| 3580 | log_error(ls, "receive_cancel_reply no lkb"); | 3646 | log_debug(ls, "receive_cancel_reply from %d no lkb %x", |
| 3647 | ms->m_header.h_nodeid, ms->m_remid); | ||
| 3581 | return; | 3648 | return; |
| 3582 | } | 3649 | } |
| 3583 | DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); | ||
| 3584 | 3650 | ||
| 3585 | _receive_cancel_reply(lkb, ms); | 3651 | _receive_cancel_reply(lkb, ms); |
| 3586 | dlm_put_lkb(lkb); | 3652 | dlm_put_lkb(lkb); |
| @@ -3640,6 +3706,13 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) | |||
| 3640 | 3706 | ||
| 3641 | static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) | 3707 | static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) |
| 3642 | { | 3708 | { |
| 3709 | if (!dlm_is_member(ls, ms->m_header.h_nodeid)) { | ||
| 3710 | log_debug(ls, "ignore non-member message %d from %d %x %x %d", | ||
| 3711 | ms->m_type, ms->m_header.h_nodeid, ms->m_lkid, | ||
| 3712 | ms->m_remid, ms->m_result); | ||
| 3713 | return; | ||
| 3714 | } | ||
| 3715 | |||
| 3643 | switch (ms->m_type) { | 3716 | switch (ms->m_type) { |
| 3644 | 3717 | ||
| 3645 | /* messages sent to a master node */ | 3718 | /* messages sent to a master node */ |
| @@ -3778,8 +3851,9 @@ void dlm_receive_buffer(struct dlm_header *hd, int nodeid) | |||
| 3778 | 3851 | ||
| 3779 | ls = dlm_find_lockspace_global(hd->h_lockspace); | 3852 | ls = dlm_find_lockspace_global(hd->h_lockspace); |
| 3780 | if (!ls) { | 3853 | if (!ls) { |
| 3781 | log_print("invalid h_lockspace %x from %d cmd %d type %d", | 3854 | if (dlm_config.ci_log_debug) |
| 3782 | hd->h_lockspace, nodeid, hd->h_cmd, type); | 3855 | log_print("invalid lockspace %x from %d cmd %d type %d", |
| 3856 | hd->h_lockspace, nodeid, hd->h_cmd, type); | ||
| 3783 | 3857 | ||
| 3784 | if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) | 3858 | if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) |
| 3785 | dlm_send_ls_not_ready(nodeid, rc); | 3859 | dlm_send_ls_not_ready(nodeid, rc); |
| @@ -3806,6 +3880,7 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb) | |||
| 3806 | ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; | 3880 | ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; |
| 3807 | ls->ls_stub_ms.m_result = -EINPROGRESS; | 3881 | ls->ls_stub_ms.m_result = -EINPROGRESS; |
| 3808 | ls->ls_stub_ms.m_flags = lkb->lkb_flags; | 3882 | ls->ls_stub_ms.m_flags = lkb->lkb_flags; |
| 3883 | ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; | ||
| 3809 | _receive_convert_reply(lkb, &ls->ls_stub_ms); | 3884 | _receive_convert_reply(lkb, &ls->ls_stub_ms); |
| 3810 | 3885 | ||
| 3811 | /* Same special case as in receive_rcom_lock_args() */ | 3886 | /* Same special case as in receive_rcom_lock_args() */ |
| @@ -3847,6 +3922,7 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb) | |||
| 3847 | void dlm_recover_waiters_pre(struct dlm_ls *ls) | 3922 | void dlm_recover_waiters_pre(struct dlm_ls *ls) |
| 3848 | { | 3923 | { |
| 3849 | struct dlm_lkb *lkb, *safe; | 3924 | struct dlm_lkb *lkb, *safe; |
| 3925 | int wait_type, stub_unlock_result, stub_cancel_result; | ||
| 3850 | 3926 | ||
| 3851 | mutex_lock(&ls->ls_waiters_mutex); | 3927 | mutex_lock(&ls->ls_waiters_mutex); |
| 3852 | 3928 | ||
| @@ -3865,7 +3941,33 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) | |||
| 3865 | if (!waiter_needs_recovery(ls, lkb)) | 3941 | if (!waiter_needs_recovery(ls, lkb)) |
| 3866 | continue; | 3942 | continue; |
| 3867 | 3943 | ||
| 3868 | switch (lkb->lkb_wait_type) { | 3944 | wait_type = lkb->lkb_wait_type; |
| 3945 | stub_unlock_result = -DLM_EUNLOCK; | ||
| 3946 | stub_cancel_result = -DLM_ECANCEL; | ||
| 3947 | |||
| 3948 | /* Main reply may have been received leaving a zero wait_type, | ||
| 3949 | but a reply for the overlapping op may not have been | ||
| 3950 | received. In that case we need to fake the appropriate | ||
| 3951 | reply for the overlap op. */ | ||
| 3952 | |||
| 3953 | if (!wait_type) { | ||
| 3954 | if (is_overlap_cancel(lkb)) { | ||
| 3955 | wait_type = DLM_MSG_CANCEL; | ||
| 3956 | if (lkb->lkb_grmode == DLM_LOCK_IV) | ||
| 3957 | stub_cancel_result = 0; | ||
| 3958 | } | ||
| 3959 | if (is_overlap_unlock(lkb)) { | ||
| 3960 | wait_type = DLM_MSG_UNLOCK; | ||
| 3961 | if (lkb->lkb_grmode == DLM_LOCK_IV) | ||
| 3962 | stub_unlock_result = -ENOENT; | ||
| 3963 | } | ||
| 3964 | |||
| 3965 | log_debug(ls, "rwpre overlap %x %x %d %d %d", | ||
| 3966 | lkb->lkb_id, lkb->lkb_flags, wait_type, | ||
| 3967 | stub_cancel_result, stub_unlock_result); | ||
| 3968 | } | ||
| 3969 | |||
| 3970 | switch (wait_type) { | ||
| 3869 | 3971 | ||
| 3870 | case DLM_MSG_REQUEST: | 3972 | case DLM_MSG_REQUEST: |
| 3871 | lkb->lkb_flags |= DLM_IFL_RESEND; | 3973 | lkb->lkb_flags |= DLM_IFL_RESEND; |
| @@ -3878,8 +3980,9 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) | |||
| 3878 | case DLM_MSG_UNLOCK: | 3980 | case DLM_MSG_UNLOCK: |
| 3879 | hold_lkb(lkb); | 3981 | hold_lkb(lkb); |
| 3880 | ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY; | 3982 | ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY; |
| 3881 | ls->ls_stub_ms.m_result = -DLM_EUNLOCK; | 3983 | ls->ls_stub_ms.m_result = stub_unlock_result; |
| 3882 | ls->ls_stub_ms.m_flags = lkb->lkb_flags; | 3984 | ls->ls_stub_ms.m_flags = lkb->lkb_flags; |
| 3985 | ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; | ||
| 3883 | _receive_unlock_reply(lkb, &ls->ls_stub_ms); | 3986 | _receive_unlock_reply(lkb, &ls->ls_stub_ms); |
| 3884 | dlm_put_lkb(lkb); | 3987 | dlm_put_lkb(lkb); |
| 3885 | break; | 3988 | break; |
| @@ -3887,15 +3990,16 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) | |||
| 3887 | case DLM_MSG_CANCEL: | 3990 | case DLM_MSG_CANCEL: |
| 3888 | hold_lkb(lkb); | 3991 | hold_lkb(lkb); |
| 3889 | ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY; | 3992 | ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY; |
| 3890 | ls->ls_stub_ms.m_result = -DLM_ECANCEL; | 3993 | ls->ls_stub_ms.m_result = stub_cancel_result; |
| 3891 | ls->ls_stub_ms.m_flags = lkb->lkb_flags; | 3994 | ls->ls_stub_ms.m_flags = lkb->lkb_flags; |
| 3995 | ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; | ||
| 3892 | _receive_cancel_reply(lkb, &ls->ls_stub_ms); | 3996 | _receive_cancel_reply(lkb, &ls->ls_stub_ms); |
| 3893 | dlm_put_lkb(lkb); | 3997 | dlm_put_lkb(lkb); |
| 3894 | break; | 3998 | break; |
| 3895 | 3999 | ||
| 3896 | default: | 4000 | default: |
| 3897 | log_error(ls, "invalid lkb wait_type %d", | 4001 | log_error(ls, "invalid lkb wait_type %d %d", |
| 3898 | lkb->lkb_wait_type); | 4002 | lkb->lkb_wait_type, wait_type); |
| 3899 | } | 4003 | } |
| 3900 | schedule(); | 4004 | schedule(); |
| 3901 | } | 4005 | } |
| @@ -4184,7 +4288,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, | |||
| 4184 | lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP); | 4288 | lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP); |
| 4185 | 4289 | ||
| 4186 | if (lkb->lkb_exflags & DLM_LKF_VALBLK) { | 4290 | if (lkb->lkb_exflags & DLM_LKF_VALBLK) { |
| 4187 | lkb->lkb_lvbptr = allocate_lvb(ls); | 4291 | lkb->lkb_lvbptr = dlm_allocate_lvb(ls); |
| 4188 | if (!lkb->lkb_lvbptr) | 4292 | if (!lkb->lkb_lvbptr) |
| 4189 | return -ENOMEM; | 4293 | return -ENOMEM; |
| 4190 | lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - | 4294 | lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - |
| @@ -4259,7 +4363,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) | |||
| 4259 | put_rsb(r); | 4363 | put_rsb(r); |
| 4260 | out: | 4364 | out: |
| 4261 | if (error) | 4365 | if (error) |
| 4262 | log_print("recover_master_copy %d %x", error, rl->rl_lkid); | 4366 | log_debug(ls, "recover_master_copy %d %x", error, rl->rl_lkid); |
| 4263 | rl->rl_result = error; | 4367 | rl->rl_result = error; |
| 4264 | return error; | 4368 | return error; |
| 4265 | } | 4369 | } |
| @@ -4342,7 +4446,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, | |||
| 4342 | } | 4446 | } |
| 4343 | } | 4447 | } |
| 4344 | 4448 | ||
| 4345 | /* After ua is attached to lkb it will be freed by free_lkb(). | 4449 | /* After ua is attached to lkb it will be freed by dlm_free_lkb(). |
| 4346 | When DLM_IFL_USER is set, the dlm knows that this is a userspace | 4450 | When DLM_IFL_USER is set, the dlm knows that this is a userspace |
| 4347 | lock and that lkb_astparam is the dlm_user_args structure. */ | 4451 | lock and that lkb_astparam is the dlm_user_args structure. */ |
| 4348 | 4452 | ||
| @@ -4679,6 +4783,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) | |||
| 4679 | } | 4783 | } |
| 4680 | 4784 | ||
| 4681 | list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { | 4785 | list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { |
| 4786 | lkb->lkb_ast_type = 0; | ||
| 4682 | list_del(&lkb->lkb_astqueue); | 4787 | list_del(&lkb->lkb_astqueue); |
| 4683 | dlm_put_lkb(lkb); | 4788 | dlm_put_lkb(lkb); |
| 4684 | } | 4789 | } |
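
Three hardening changes run through these lock.c hunks: _receive_message() now drops messages from nodes that are no longer lockspace members, each reply path calls validate_message() before touching lkb state (its body lies outside this section), and dlm_recover_waiters_pre() learns to fake unlock/cancel replies for overlap operations whose main reply already arrived (the "rwpre overlap" block). The recovery stub messages gain m_header.h_nodeid assignments for the same reason: a locally faked reply has to carry the sender identity that the new validation presumably checks. A toy sketch of that style of guard, assuming it simply compares the reply's sender against the node recorded in the lkb:

    #include <stdio.h>

    /* Toy stand-ins for the kernel structs; only the fields the
     * check would need. */
    struct msg { int src_nodeid; };
    struct lkb { int expected_nodeid; };

    /* Hypothetical validate_message()-style gate: trust a reply only
     * if it came from the node the request was sent to. */
    static int validate(const struct lkb *l, const struct msg *m)
    {
        if (m->src_nodeid != l->expected_nodeid) {
            fprintf(stderr, "reply from %d, expected %d\n",
                    m->src_nodeid, l->expected_nodeid);
            return -1;
        }
        return 0;
    }

    int main(void)
    {
        struct lkb l = { .expected_nodeid = 3 };
        struct msg stale = { .src_nodeid = 7 };
        return validate(&l, &stale) ? 1 : 0;
    }

The real check may well test more (message type against wait state, for instance); the point of the pattern is a single reject-early gate on every reply path, which is presumably also why the DLM_ASSERT(is_process_copy()) checks could be dropped in favor of a log message.
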
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index ada04680a1e5..27b6ed302911 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h | |||
| @@ -19,8 +19,6 @@ void dlm_print_lkb(struct dlm_lkb *lkb); | |||
| 19 | void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); | 19 | void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); |
| 20 | void dlm_receive_buffer(struct dlm_header *hd, int nodeid); | 20 | void dlm_receive_buffer(struct dlm_header *hd, int nodeid); |
| 21 | int dlm_modes_compat(int mode1, int mode2); | 21 | int dlm_modes_compat(int mode1, int mode2); |
| 22 | int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen, | ||
| 23 | unsigned int flags, struct dlm_rsb **r_ret); | ||
| 24 | void dlm_put_rsb(struct dlm_rsb *r); | 22 | void dlm_put_rsb(struct dlm_rsb *r); |
| 25 | void dlm_hold_rsb(struct dlm_rsb *r); | 23 | void dlm_hold_rsb(struct dlm_rsb *r); |
| 26 | int dlm_put_lkb(struct dlm_lkb *lkb); | 24 | int dlm_put_lkb(struct dlm_lkb *lkb); |
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 5c108c49cb8c..b180fdc51085 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c | |||
| @@ -24,14 +24,6 @@ | |||
| 24 | #include "recover.h" | 24 | #include "recover.h" |
| 25 | #include "requestqueue.h" | 25 | #include "requestqueue.h" |
| 26 | 26 | ||
| 27 | #ifdef CONFIG_DLM_DEBUG | ||
| 28 | int dlm_create_debug_file(struct dlm_ls *ls); | ||
| 29 | void dlm_delete_debug_file(struct dlm_ls *ls); | ||
| 30 | #else | ||
| 31 | static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; } | ||
| 32 | static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } | ||
| 33 | #endif | ||
| 34 | |||
| 35 | static int ls_count; | 27 | static int ls_count; |
| 36 | static struct mutex ls_lock; | 28 | static struct mutex ls_lock; |
| 37 | static struct list_head lslist; | 29 | static struct list_head lslist; |
| @@ -684,9 +676,9 @@ static int release_lockspace(struct dlm_ls *ls, int force) | |||
| 684 | dlm_del_ast(lkb); | 676 | dlm_del_ast(lkb); |
| 685 | 677 | ||
| 686 | if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) | 678 | if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) |
| 687 | free_lvb(lkb->lkb_lvbptr); | 679 | dlm_free_lvb(lkb->lkb_lvbptr); |
| 688 | 680 | ||
| 689 | free_lkb(lkb); | 681 | dlm_free_lkb(lkb); |
| 690 | } | 682 | } |
| 691 | } | 683 | } |
| 692 | dlm_astd_resume(); | 684 | dlm_astd_resume(); |
| @@ -704,7 +696,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) | |||
| 704 | res_hashchain); | 696 | res_hashchain); |
| 705 | 697 | ||
| 706 | list_del(&rsb->res_hashchain); | 698 | list_del(&rsb->res_hashchain); |
| 707 | free_rsb(rsb); | 699 | dlm_free_rsb(rsb); |
| 708 | } | 700 | } |
| 709 | 701 | ||
| 710 | head = &ls->ls_rsbtbl[i].toss; | 702 | head = &ls->ls_rsbtbl[i].toss; |
| @@ -712,7 +704,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) | |||
| 712 | rsb = list_entry(head->next, struct dlm_rsb, | 704 | rsb = list_entry(head->next, struct dlm_rsb, |
| 713 | res_hashchain); | 705 | res_hashchain); |
| 714 | list_del(&rsb->res_hashchain); | 706 | list_del(&rsb->res_hashchain); |
| 715 | free_rsb(rsb); | 707 | dlm_free_rsb(rsb); |
| 716 | } | 708 | } |
| 717 | } | 709 | } |
| 718 | 710 | ||
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index e9923ca9c2d9..7c1e5e5cccd8 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c | |||
| @@ -864,7 +864,7 @@ static void sctp_init_assoc(struct connection *con) | |||
| 864 | static void tcp_connect_to_sock(struct connection *con) | 864 | static void tcp_connect_to_sock(struct connection *con) |
| 865 | { | 865 | { |
| 866 | int result = -EHOSTUNREACH; | 866 | int result = -EHOSTUNREACH; |
| 867 | struct sockaddr_storage saddr; | 867 | struct sockaddr_storage saddr, src_addr; |
| 868 | int addr_len; | 868 | int addr_len; |
| 869 | struct socket *sock; | 869 | struct socket *sock; |
| 870 | 870 | ||
| @@ -898,6 +898,17 @@ static void tcp_connect_to_sock(struct connection *con) | |||
| 898 | con->connect_action = tcp_connect_to_sock; | 898 | con->connect_action = tcp_connect_to_sock; |
| 899 | add_sock(sock, con); | 899 | add_sock(sock, con); |
| 900 | 900 | ||
| 901 | /* Bind to our cluster-known address connecting to avoid | ||
| 902 | routing problems */ | ||
| 903 | memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); | ||
| 904 | make_sockaddr(&src_addr, 0, &addr_len); | ||
| 905 | result = sock->ops->bind(sock, (struct sockaddr *) &src_addr, | ||
| 906 | addr_len); | ||
| 907 | if (result < 0) { | ||
| 908 | log_print("could not bind for connect: %d", result); | ||
| 909 | /* This *may* not indicate a critical error */ | ||
| 910 | } | ||
| 911 | |||
| 901 | make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); | 912 | make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); |
| 902 | 913 | ||
| 903 | log_print("connecting to %d", con->nodeid); | 914 | log_print("connecting to %d", con->nodeid); |
| @@ -1426,6 +1437,8 @@ void dlm_lowcomms_stop(void) | |||
| 1426 | con = __nodeid2con(i, 0); | 1437 | con = __nodeid2con(i, 0); |
| 1427 | if (con) { | 1438 | if (con) { |
| 1428 | close_connection(con, true); | 1439 | close_connection(con, true); |
| 1440 | if (con->othercon) | ||
| 1441 | kmem_cache_free(con_cache, con->othercon); | ||
| 1429 | kmem_cache_free(con_cache, con); | 1442 | kmem_cache_free(con_cache, con); |
| 1430 | } | 1443 | } |
| 1431 | } | 1444 | } |
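
Two lowcomms.c fixes. The bind-before-connect addition pins the TCP source address to the node's cluster-known address: on a multi-homed host, connect() alone lets the routing table choose the source, and the peer may then see a connection arrive from an address that matches nothing in its node list. The bind failure is deliberately non-fatal, since a kernel-chosen source can still be correct. The second change frees con->othercon in dlm_lowcomms_stop(), plugging a leak of the extra connection objects allocated when a node ends up with both an inbound and an outbound socket to the same peer. The bind idiom looks the same in ordinary userspace code; a rough illustration with placeholder addresses and trimmed error handling:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <string.h>
    #include <sys/socket.h>

    /* Illustrative only: pin the source address before connecting. */
    int connect_from(const char *local_ip, const char *peer_ip, int port)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        struct sockaddr_in src = { .sin_family = AF_INET, .sin_port = 0 };
        struct sockaddr_in dst = { .sin_family = AF_INET,
                                   .sin_port = htons(port) };

        inet_pton(AF_INET, local_ip, &src.sin_addr);
        inet_pton(AF_INET, peer_ip, &dst.sin_addr);

        /* Port 0 keeps an ephemeral port; only the address is pinned. */
        bind(fd, (struct sockaddr *)&src, sizeof(src));
        connect(fd, (struct sockaddr *)&dst, sizeof(dst));
        return fd;
    }
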
diff --git a/fs/dlm/main.c b/fs/dlm/main.c index eca2907f2386..58487fb95a4c 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c | |||
| @@ -18,16 +18,6 @@ | |||
| 18 | #include "memory.h" | 18 | #include "memory.h" |
| 19 | #include "config.h" | 19 | #include "config.h" |
| 20 | 20 | ||
| 21 | #ifdef CONFIG_DLM_DEBUG | ||
| 22 | int dlm_register_debugfs(void); | ||
| 23 | void dlm_unregister_debugfs(void); | ||
| 24 | #else | ||
| 25 | static inline int dlm_register_debugfs(void) { return 0; } | ||
| 26 | static inline void dlm_unregister_debugfs(void) { } | ||
| 27 | #endif | ||
| 28 | int dlm_netlink_init(void); | ||
| 29 | void dlm_netlink_exit(void); | ||
| 30 | |||
| 31 | static int __init init_dlm(void) | 21 | static int __init init_dlm(void) |
| 32 | { | 22 | { |
| 33 | int error; | 23 | int error; |
diff --git a/fs/dlm/member.c b/fs/dlm/member.c index e9cdcab306e2..fa17f5a27883 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /****************************************************************************** | 1 | /****************************************************************************** |
| 2 | ******************************************************************************* | 2 | ******************************************************************************* |
| 3 | ** | 3 | ** |
| 4 | ** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. | 4 | ** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. |
| 5 | ** | 5 | ** |
| 6 | ** This copyrighted material is made available to anyone wishing to use, | 6 | ** This copyrighted material is made available to anyone wishing to use, |
| 7 | ** modify, copy, or redistribute it subject to the terms and conditions | 7 | ** modify, copy, or redistribute it subject to the terms and conditions |
| @@ -70,7 +70,7 @@ static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) | |||
| 70 | ls->ls_num_nodes--; | 70 | ls->ls_num_nodes--; |
| 71 | } | 71 | } |
| 72 | 72 | ||
| 73 | static int dlm_is_member(struct dlm_ls *ls, int nodeid) | 73 | int dlm_is_member(struct dlm_ls *ls, int nodeid) |
| 74 | { | 74 | { |
| 75 | struct dlm_member *memb; | 75 | struct dlm_member *memb; |
| 76 | 76 | ||
diff --git a/fs/dlm/member.h b/fs/dlm/member.h index 927c08c19214..7a26fca1e0b5 100644 --- a/fs/dlm/member.h +++ b/fs/dlm/member.h | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /****************************************************************************** | 1 | /****************************************************************************** |
| 2 | ******************************************************************************* | 2 | ******************************************************************************* |
| 3 | ** | 3 | ** |
| 4 | ** Copyright (C) 2005 Red Hat, Inc. All rights reserved. | 4 | ** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. |
| 5 | ** | 5 | ** |
| 6 | ** This copyrighted material is made available to anyone wishing to use, | 6 | ** This copyrighted material is made available to anyone wishing to use, |
| 7 | ** modify, copy, or redistribute it subject to the terms and conditions | 7 | ** modify, copy, or redistribute it subject to the terms and conditions |
| @@ -19,6 +19,7 @@ void dlm_clear_members(struct dlm_ls *ls); | |||
| 19 | void dlm_clear_members_gone(struct dlm_ls *ls); | 19 | void dlm_clear_members_gone(struct dlm_ls *ls); |
| 20 | int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); | 20 | int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); |
| 21 | int dlm_is_removed(struct dlm_ls *ls, int nodeid); | 21 | int dlm_is_removed(struct dlm_ls *ls, int nodeid); |
| 22 | int dlm_is_member(struct dlm_ls *ls, int nodeid); | ||
| 22 | 23 | ||
| 23 | #endif /* __MEMBER_DOT_H__ */ | 24 | #endif /* __MEMBER_DOT_H__ */ |
| 24 | 25 | ||
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index ecf0e5cb2035..f7783867491a 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | ******************************************************************************* | 2 | ******************************************************************************* |
| 3 | ** | 3 | ** |
| 4 | ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 4 | ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 5 | ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. | 5 | ** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. |
| 6 | ** | 6 | ** |
| 7 | ** This copyrighted material is made available to anyone wishing to use, | 7 | ** This copyrighted material is made available to anyone wishing to use, |
| 8 | ** modify, copy, or redistribute it subject to the terms and conditions | 8 | ** modify, copy, or redistribute it subject to the terms and conditions |
| @@ -35,7 +35,7 @@ void dlm_memory_exit(void) | |||
| 35 | kmem_cache_destroy(lkb_cache); | 35 | kmem_cache_destroy(lkb_cache); |
| 36 | } | 36 | } |
| 37 | 37 | ||
| 38 | char *allocate_lvb(struct dlm_ls *ls) | 38 | char *dlm_allocate_lvb(struct dlm_ls *ls) |
| 39 | { | 39 | { |
| 40 | char *p; | 40 | char *p; |
| 41 | 41 | ||
| @@ -43,7 +43,7 @@ char *allocate_lvb(struct dlm_ls *ls) | |||
| 43 | return p; | 43 | return p; |
| 44 | } | 44 | } |
| 45 | 45 | ||
| 46 | void free_lvb(char *p) | 46 | void dlm_free_lvb(char *p) |
| 47 | { | 47 | { |
| 48 | kfree(p); | 48 | kfree(p); |
| 49 | } | 49 | } |
| @@ -51,7 +51,7 @@ void free_lvb(char *p) | |||
| 51 | /* FIXME: have some minimal space built-in to rsb for the name and | 51 | /* FIXME: have some minimal space built-in to rsb for the name and |
| 52 | kmalloc a separate name if needed, like dentries are done */ | 52 | kmalloc a separate name if needed, like dentries are done */ |
| 53 | 53 | ||
| 54 | struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen) | 54 | struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) |
| 55 | { | 55 | { |
| 56 | struct dlm_rsb *r; | 56 | struct dlm_rsb *r; |
| 57 | 57 | ||
| @@ -61,14 +61,14 @@ struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen) | |||
| 61 | return r; | 61 | return r; |
| 62 | } | 62 | } |
| 63 | 63 | ||
| 64 | void free_rsb(struct dlm_rsb *r) | 64 | void dlm_free_rsb(struct dlm_rsb *r) |
| 65 | { | 65 | { |
| 66 | if (r->res_lvbptr) | 66 | if (r->res_lvbptr) |
| 67 | free_lvb(r->res_lvbptr); | 67 | dlm_free_lvb(r->res_lvbptr); |
| 68 | kfree(r); | 68 | kfree(r); |
| 69 | } | 69 | } |
| 70 | 70 | ||
| 71 | struct dlm_lkb *allocate_lkb(struct dlm_ls *ls) | 71 | struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) |
| 72 | { | 72 | { |
| 73 | struct dlm_lkb *lkb; | 73 | struct dlm_lkb *lkb; |
| 74 | 74 | ||
| @@ -76,7 +76,7 @@ struct dlm_lkb *allocate_lkb(struct dlm_ls *ls) | |||
| 76 | return lkb; | 76 | return lkb; |
| 77 | } | 77 | } |
| 78 | 78 | ||
| 79 | void free_lkb(struct dlm_lkb *lkb) | 79 | void dlm_free_lkb(struct dlm_lkb *lkb) |
| 80 | { | 80 | { |
| 81 | if (lkb->lkb_flags & DLM_IFL_USER) { | 81 | if (lkb->lkb_flags & DLM_IFL_USER) { |
| 82 | struct dlm_user_args *ua; | 82 | struct dlm_user_args *ua; |
| @@ -90,19 +90,3 @@ void free_lkb(struct dlm_lkb *lkb) | |||
| 90 | kmem_cache_free(lkb_cache, lkb); | 90 | kmem_cache_free(lkb_cache, lkb); |
| 91 | } | 91 | } |
| 92 | 92 | ||
| 93 | struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen) | ||
| 94 | { | ||
| 95 | struct dlm_direntry *de; | ||
| 96 | |||
| 97 | DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN, | ||
| 98 | printk("namelen = %d\n", namelen);); | ||
| 99 | |||
| 100 | de = kzalloc(sizeof(*de) + namelen, GFP_KERNEL); | ||
| 101 | return de; | ||
| 102 | } | ||
| 103 | |||
| 104 | void free_direntry(struct dlm_direntry *de) | ||
| 105 | { | ||
| 106 | kfree(de); | ||
| 107 | } | ||
| 108 | |||
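
The memory.c changes are namespacing rather than behavior: every non-static symbol in a monolithic kernel shares one flat namespace, so generic names like allocate_lvb() and free_rsb() invite collisions with any other subsystem, and the dlm_ prefix claims them properly. allocate_direntry()/free_direntry() are removed outright instead of renamed; the wrapper was a kzalloc/kfree pair plus a length assert, presumably absorbed directly by the directory code outside this section. The collision the prefix avoids, in miniature:

    /* a.c -- one subsystem picks a generic global name */
    char *allocate_lvb(void) { return 0; }

    /* b.c -- another subsystem independently picks the same name */
    char *allocate_lvb(void) { return 0; }

    /* $ cc a.c b.c
     *   ld: multiple definition of `allocate_lvb' */
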
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h index 6ead158ccc5c..485fb29143bd 100644 --- a/fs/dlm/memory.h +++ b/fs/dlm/memory.h | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | ******************************************************************************* | 2 | ******************************************************************************* |
| 3 | ** | 3 | ** |
| 4 | ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 4 | ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 5 | ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. | 5 | ** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. |
| 6 | ** | 6 | ** |
| 7 | ** This copyrighted material is made available to anyone wishing to use, | 7 | ** This copyrighted material is made available to anyone wishing to use, |
| 8 | ** modify, copy, or redistribute it subject to the terms and conditions | 8 | ** modify, copy, or redistribute it subject to the terms and conditions |
| @@ -16,14 +16,12 @@ | |||
| 16 | 16 | ||
| 17 | int dlm_memory_init(void); | 17 | int dlm_memory_init(void); |
| 18 | void dlm_memory_exit(void); | 18 | void dlm_memory_exit(void); |
| 19 | struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen); | 19 | struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen); |
| 20 | void free_rsb(struct dlm_rsb *r); | 20 | void dlm_free_rsb(struct dlm_rsb *r); |
| 21 | struct dlm_lkb *allocate_lkb(struct dlm_ls *ls); | 21 | struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); |
| 22 | void free_lkb(struct dlm_lkb *l); | 22 | void dlm_free_lkb(struct dlm_lkb *l); |
| 23 | struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen); | 23 | char *dlm_allocate_lvb(struct dlm_ls *ls); |
| 24 | void free_direntry(struct dlm_direntry *de); | 24 | void dlm_free_lvb(char *l); |
| 25 | char *allocate_lvb(struct dlm_ls *ls); | ||
| 26 | void free_lvb(char *l); | ||
| 27 | 25 | ||
| 28 | #endif /* __MEMORY_DOT_H__ */ | 26 | #endif /* __MEMORY_DOT_H__ */ |
| 29 | 27 | ||
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index f8c69dda16a0..e69926e984db 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | ******************************************************************************* | 2 | ******************************************************************************* |
| 3 | ** | 3 | ** |
| 4 | ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 4 | ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 5 | ** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. | 5 | ** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. |
| 6 | ** | 6 | ** |
| 7 | ** This copyrighted material is made available to anyone wishing to use, | 7 | ** This copyrighted material is made available to anyone wishing to use, |
| 8 | ** modify, copy, or redistribute it subject to the terms and conditions | 8 | ** modify, copy, or redistribute it subject to the terms and conditions |
| @@ -58,8 +58,12 @@ static void copy_from_cb(void *dst, const void *base, unsigned offset, | |||
| 58 | int dlm_process_incoming_buffer(int nodeid, const void *base, | 58 | int dlm_process_incoming_buffer(int nodeid, const void *base, |
| 59 | unsigned offset, unsigned len, unsigned limit) | 59 | unsigned offset, unsigned len, unsigned limit) |
| 60 | { | 60 | { |
| 61 | unsigned char __tmp[DLM_INBUF_LEN]; | 61 | union { |
| 62 | struct dlm_header *msg = (struct dlm_header *) __tmp; | 62 | unsigned char __buf[DLM_INBUF_LEN]; |
| 63 | /* this is to force proper alignment on some arches */ | ||
| 64 | struct dlm_header dlm; | ||
| 65 | } __tmp; | ||
| 66 | struct dlm_header *msg = &__tmp.dlm; | ||
| 63 | int ret = 0; | 67 | int ret = 0; |
| 64 | int err = 0; | 68 | int err = 0; |
| 65 | uint16_t msglen; | 69 | uint16_t msglen; |
| @@ -100,8 +104,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, | |||
| 100 | in the buffer on the stack (which should work for most | 104 | in the buffer on the stack (which should work for most |
| 101 | ordinary messages). */ | 105 | ordinary messages). */ |
| 102 | 106 | ||
| 103 | if (msglen > sizeof(__tmp) && | 107 | if (msglen > DLM_INBUF_LEN && msg == &__tmp.dlm) { |
| 104 | msg == (struct dlm_header *) __tmp) { | ||
| 105 | msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); | 108 | msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); |
| 106 | if (msg == NULL) | 109 | if (msg == NULL) |
| 107 | return ret; | 110 | return ret; |
| @@ -119,7 +122,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, | |||
| 119 | dlm_receive_buffer(msg, nodeid); | 122 | dlm_receive_buffer(msg, nodeid); |
| 120 | } | 123 | } |
| 121 | 124 | ||
| 122 | if (msg != (struct dlm_header *) __tmp) | 125 | if (msg != &__tmp.dlm) |
| 123 | kfree(msg); | 126 | kfree(msg); |
| 124 | 127 | ||
| 125 | return err ? err : ret; | 128 | return err ? err : ret; |
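
The old stack buffer in dlm_process_incoming_buffer() was a plain unsigned char array cast to struct dlm_header *. A char array only guarantees byte alignment, and on architectures that fault, or trap and slowly emulate, misaligned word accesses, dereferencing the header's multi-byte fields through that cast is broken. Putting the array in a union with a struct dlm_header gives the storage the header's alignment for free. The idiom in isolation, as a minimal sketch:

    #include <stdio.h>

    struct hdr { unsigned int length; unsigned short type; };

    /* The union is aligned to its strictest member, so raw can always
     * be reinterpreted as a struct hdr without misalignment. */
    union inbuf {
        unsigned char raw[512];
        struct hdr h;
    };

    int main(void)
    {
        union inbuf b;
        struct hdr *h = (struct hdr *)b.raw;   /* safe cast */

        h->length = 0;
        printf("align=%zu size=%zu\n",
               _Alignof(union inbuf), sizeof(union inbuf));
        return 0;
    }

Switching the overflow test to msglen > DLM_INBUF_LEN (instead of sizeof(__tmp)) also ties the comparison to the declared buffer length rather than the union's possibly padded size.
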
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index ae2fd97fa4ad..026824cd3acb 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | ******************************************************************************* | 2 | ******************************************************************************* |
| 3 | ** | 3 | ** |
| 4 | ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 4 | ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 5 | ** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. | 5 | ** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. |
| 6 | ** | 6 | ** |
| 7 | ** This copyrighted material is made available to anyone wishing to use, | 7 | ** This copyrighted material is made available to anyone wishing to use, |
| 8 | ** modify, copy, or redistribute it subject to the terms and conditions | 8 | ** modify, copy, or redistribute it subject to the terms and conditions |
| @@ -197,11 +197,6 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) | |||
| 197 | spin_unlock(&ls->ls_rcom_spin); | 197 | spin_unlock(&ls->ls_rcom_spin); |
| 198 | } | 198 | } |
| 199 | 199 | ||
| 200 | static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) | ||
| 201 | { | ||
| 202 | receive_sync_reply(ls, rc_in); | ||
| 203 | } | ||
| 204 | |||
| 205 | int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) | 200 | int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) |
| 206 | { | 201 | { |
| 207 | struct dlm_rcom *rc; | 202 | struct dlm_rcom *rc; |
| @@ -254,11 +249,6 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) | |||
| 254 | send_rcom(ls, mh, rc); | 249 | send_rcom(ls, mh, rc); |
| 255 | } | 250 | } |
| 256 | 251 | ||
| 257 | static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) | ||
| 258 | { | ||
| 259 | receive_sync_reply(ls, rc_in); | ||
| 260 | } | ||
| 261 | |||
| 262 | int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) | 252 | int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) |
| 263 | { | 253 | { |
| 264 | struct dlm_rcom *rc; | 254 | struct dlm_rcom *rc; |
| @@ -381,11 +371,6 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) | |||
| 381 | send_rcom(ls, mh, rc); | 371 | send_rcom(ls, mh, rc); |
| 382 | } | 372 | } |
| 383 | 373 | ||
| 384 | static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) | ||
| 385 | { | ||
| 386 | dlm_recover_process_copy(ls, rc_in); | ||
| 387 | } | ||
| 388 | |||
| 389 | /* If the lockspace doesn't exist then still send a status message | 374 | /* If the lockspace doesn't exist then still send a status message |
| 390 | back; it's possible that it just doesn't have its global_id yet. */ | 375 | back; it's possible that it just doesn't have its global_id yet. */ |
| 391 | 376 | ||
| @@ -481,11 +466,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) | |||
| 481 | break; | 466 | break; |
| 482 | 467 | ||
| 483 | case DLM_RCOM_STATUS_REPLY: | 468 | case DLM_RCOM_STATUS_REPLY: |
| 484 | receive_rcom_status_reply(ls, rc); | 469 | receive_sync_reply(ls, rc); |
| 485 | break; | 470 | break; |
| 486 | 471 | ||
| 487 | case DLM_RCOM_NAMES_REPLY: | 472 | case DLM_RCOM_NAMES_REPLY: |
| 488 | receive_rcom_names_reply(ls, rc); | 473 | receive_sync_reply(ls, rc); |
| 489 | break; | 474 | break; |
| 490 | 475 | ||
| 491 | case DLM_RCOM_LOOKUP_REPLY: | 476 | case DLM_RCOM_LOOKUP_REPLY: |
| @@ -493,11 +478,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) | |||
| 493 | break; | 478 | break; |
| 494 | 479 | ||
| 495 | case DLM_RCOM_LOCK_REPLY: | 480 | case DLM_RCOM_LOCK_REPLY: |
| 496 | receive_rcom_lock_reply(ls, rc); | 481 | dlm_recover_process_copy(ls, rc); |
| 497 | break; | 482 | break; |
| 498 | 483 | ||
| 499 | default: | 484 | default: |
| 500 | DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type);); | 485 | log_error(ls, "receive_rcom bad type %d", rc->rc_type); |
| 501 | } | 486 | } |
| 502 | out: | 487 | out: |
| 503 | return; | 488 | return; |
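
Two rcom.c cleanups: the receive_rcom_*_reply() wrappers that merely forwarded to another function are folded into the switch, and the default case trades DLM_ASSERT for log_error(). The latter is the substantive one: rc_type arrives off the network, so the old assert let a corrupted or hostile packet panic the node. An unexpected discriminator from outside the machine should only ever be logged and dropped; as a standalone toy:

    #include <stdio.h>

    enum { RCOM_A = 1, RCOM_B = 2 };

    /* type is untrusted: it came off the wire. Log and drop anything
     * unrecognized; never assert or abort on external input. */
    static void dispatch(int type)
    {
        switch (type) {
        case RCOM_A: puts("handle a"); break;
        case RCOM_B: puts("handle b"); break;
        default:
            fprintf(stderr, "receive_rcom bad type %d\n", type);
        }
    }

    int main(void)
    {
        dispatch(RCOM_A);
        dispatch(99);   /* malformed packet: logged, not fatal */
        return 0;
    }
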
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index c2cc7694cd16..df075dc300fa 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c | |||
| @@ -629,7 +629,7 @@ static void recover_lvb(struct dlm_rsb *r) | |||
| 629 | goto out; | 629 | goto out; |
| 630 | 630 | ||
| 631 | if (!r->res_lvbptr) { | 631 | if (!r->res_lvbptr) { |
| 632 | r->res_lvbptr = allocate_lvb(r->res_ls); | 632 | r->res_lvbptr = dlm_allocate_lvb(r->res_ls); |
| 633 | if (!r->res_lvbptr) | 633 | if (!r->res_lvbptr) |
| 634 | goto out; | 634 | goto out; |
| 635 | } | 635 | } |
| @@ -731,6 +731,20 @@ int dlm_create_root_list(struct dlm_ls *ls) | |||
| 731 | list_add(&r->res_root_list, &ls->ls_root_list); | 731 | list_add(&r->res_root_list, &ls->ls_root_list); |
| 732 | dlm_hold_rsb(r); | 732 | dlm_hold_rsb(r); |
| 733 | } | 733 | } |
| 734 | |||
| 735 | /* If we're using a directory, add tossed rsbs to the root | ||
| 736 | list; they'll have entries created in the new directory, | ||
| 737 | but no other recovery steps should do anything with them. */ | ||
| 738 | |||
| 739 | if (dlm_no_directory(ls)) { | ||
| 740 | read_unlock(&ls->ls_rsbtbl[i].lock); | ||
| 741 | continue; | ||
| 742 | } | ||
| 743 | |||
| 744 | list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) { | ||
| 745 | list_add(&r->res_root_list, &ls->ls_root_list); | ||
| 746 | dlm_hold_rsb(r); | ||
| 747 | } | ||
| 734 | read_unlock(&ls->ls_rsbtbl[i].lock); | 748 | read_unlock(&ls->ls_rsbtbl[i].lock); |
| 735 | } | 749 | } |
| 736 | out: | 750 | out: |
| @@ -750,6 +764,11 @@ void dlm_release_root_list(struct dlm_ls *ls) | |||
| 750 | up_write(&ls->ls_root_sem); | 764 | up_write(&ls->ls_root_sem); |
| 751 | } | 765 | } |
| 752 | 766 | ||
| 767 | /* If not using a directory, clear the entire toss list, there's no benefit to | ||
| 768 | caching the master value since it's fixed. If we are using a dir, keep the | ||
| 769 | rsb's we're the master of. Recovery will add them to the root list and from | ||
| 770 | there they'll be entered in the rebuilt directory. */ | ||
| 771 | |||
| 753 | void dlm_clear_toss_list(struct dlm_ls *ls) | 772 | void dlm_clear_toss_list(struct dlm_ls *ls) |
| 754 | { | 773 | { |
| 755 | struct dlm_rsb *r, *safe; | 774 | struct dlm_rsb *r, *safe; |
| @@ -759,8 +778,10 @@ void dlm_clear_toss_list(struct dlm_ls *ls) | |||
| 759 | write_lock(&ls->ls_rsbtbl[i].lock); | 778 | write_lock(&ls->ls_rsbtbl[i].lock); |
| 760 | list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, | 779 | list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, |
| 761 | res_hashchain) { | 780 | res_hashchain) { |
| 762 | list_del(&r->res_hashchain); | 781 | if (dlm_no_directory(ls) || !is_master(r)) { |
| 763 | free_rsb(r); | 782 | list_del(&r->res_hashchain); |
| 783 | dlm_free_rsb(r); | ||
| 784 | } | ||
| 764 | } | 785 | } |
| 765 | write_unlock(&ls->ls_rsbtbl[i].lock); | 786 | write_unlock(&ls->ls_rsbtbl[i].lock); |
| 766 | } | 787 | } |
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 4b89e20eebe7..997f9531d594 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c | |||
| @@ -67,17 +67,18 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) | |||
| 67 | dlm_astd_resume(); | 67 | dlm_astd_resume(); |
| 68 | 68 | ||
| 69 | /* | 69 | /* |
| 70 | * This list of root rsb's will be the basis of most of the recovery | 70 | * Free non-master tossed rsb's. Master rsb's are kept on toss |
| 71 | * routines. | 71 | * list and put on root list to be included in resdir recovery. |
| 72 | */ | 72 | */ |
| 73 | 73 | ||
| 74 | dlm_create_root_list(ls); | 74 | dlm_clear_toss_list(ls); |
| 75 | 75 | ||
| 76 | /* | 76 | /* |
| 77 | * Free all the tossed rsb's so we don't have to recover them. | 77 | * This list of root rsb's will be the basis of most of the recovery |
| 78 | * routines. | ||
| 78 | */ | 79 | */ |
| 79 | 80 | ||
| 80 | dlm_clear_toss_list(ls); | 81 | dlm_create_root_list(ls); |
| 81 | 82 | ||
| 82 | /* | 83 | /* |
| 83 | * Add or remove nodes from the lockspace's ls_nodes list. | 84 | * Add or remove nodes from the lockspace's ls_nodes list. |
diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 4f741546f4bb..7cbc6826239b 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c | |||
| @@ -24,8 +24,7 @@ | |||
| 24 | #include "lvb_table.h" | 24 | #include "lvb_table.h" |
| 25 | #include "user.h" | 25 | #include "user.h" |
| 26 | 26 | ||
| 27 | static const char *name_prefix="dlm"; | 27 | static const char name_prefix[] = "dlm"; |
| 28 | static struct miscdevice ctl_device; | ||
| 29 | static const struct file_operations device_fops; | 28 | static const struct file_operations device_fops; |
| 30 | 29 | ||
| 31 | #ifdef CONFIG_COMPAT | 30 | #ifdef CONFIG_COMPAT |
| @@ -82,7 +81,8 @@ struct dlm_lock_result32 { | |||
| 82 | }; | 81 | }; |
| 83 | 82 | ||
| 84 | static void compat_input(struct dlm_write_request *kb, | 83 | static void compat_input(struct dlm_write_request *kb, |
| 85 | struct dlm_write_request32 *kb32) | 84 | struct dlm_write_request32 *kb32, |
| 85 | int max_namelen) | ||
| 86 | { | 86 | { |
| 87 | kb->version[0] = kb32->version[0]; | 87 | kb->version[0] = kb32->version[0]; |
| 88 | kb->version[1] = kb32->version[1]; | 88 | kb->version[1] = kb32->version[1]; |
| @@ -112,7 +112,11 @@ static void compat_input(struct dlm_write_request *kb, | |||
| 112 | kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr; | 112 | kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr; |
| 113 | kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb; | 113 | kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb; |
| 114 | memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); | 114 | memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); |
| 115 | memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen); | 115 | if (kb->i.lock.namelen <= max_namelen) |
| 116 | memcpy(kb->i.lock.name, kb32->i.lock.name, | ||
| 117 | kb->i.lock.namelen); | ||
| 118 | else | ||
| 119 | kb->i.lock.namelen = max_namelen; | ||
| 116 | } | 120 | } |
| 117 | } | 121 | } |
| 118 | 122 | ||
| @@ -236,12 +240,12 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type) | |||
| 236 | spin_unlock(&proc->asts_spin); | 240 | spin_unlock(&proc->asts_spin); |
| 237 | 241 | ||
| 238 | if (eol) { | 242 | if (eol) { |
| 239 | spin_lock(&ua->proc->locks_spin); | 243 | spin_lock(&proc->locks_spin); |
| 240 | if (!list_empty(&lkb->lkb_ownqueue)) { | 244 | if (!list_empty(&lkb->lkb_ownqueue)) { |
| 241 | list_del_init(&lkb->lkb_ownqueue); | 245 | list_del_init(&lkb->lkb_ownqueue); |
| 242 | dlm_put_lkb(lkb); | 246 | dlm_put_lkb(lkb); |
| 243 | } | 247 | } |
| 244 | spin_unlock(&ua->proc->locks_spin); | 248 | spin_unlock(&proc->locks_spin); |
| 245 | } | 249 | } |
| 246 | out: | 250 | out: |
| 247 | mutex_unlock(&ls->ls_clear_proc_locks); | 251 | mutex_unlock(&ls->ls_clear_proc_locks); |
| @@ -529,7 +533,8 @@ static ssize_t device_write(struct file *file, const char __user *buf, | |||
| 529 | 533 | ||
| 530 | if (proc) | 534 | if (proc) |
| 531 | set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); | 535 | set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); |
| 532 | compat_input(kbuf, k32buf); | 536 | compat_input(kbuf, k32buf, |
| 537 | count - sizeof(struct dlm_write_request32)); | ||
| 533 | kfree(k32buf); | 538 | kfree(k32buf); |
| 534 | } | 539 | } |
| 535 | #endif | 540 | #endif |
| @@ -896,14 +901,16 @@ static const struct file_operations ctl_device_fops = { | |||
| 896 | .owner = THIS_MODULE, | 901 | .owner = THIS_MODULE, |
| 897 | }; | 902 | }; |
| 898 | 903 | ||
| 904 | static struct miscdevice ctl_device = { | ||
| 905 | .name = "dlm-control", | ||
| 906 | .fops = &ctl_device_fops, | ||
| 907 | .minor = MISC_DYNAMIC_MINOR, | ||
| 908 | }; | ||
| 909 | |||
| 899 | int dlm_user_init(void) | 910 | int dlm_user_init(void) |
| 900 | { | 911 | { |
| 901 | int error; | 912 | int error; |
| 902 | 913 | ||
| 903 | ctl_device.name = "dlm-control"; | ||
| 904 | ctl_device.fops = &ctl_device_fops; | ||
| 905 | ctl_device.minor = MISC_DYNAMIC_MINOR; | ||
| 906 | |||
| 907 | error = misc_register(&ctl_device); | 914 | error = misc_register(&ctl_device); |
| 908 | if (error) | 915 | if (error) |
| 909 | log_print("misc_register failed for control device"); | 916 | log_print("misc_register failed for control device"); |
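
Two independent user.c fixes. compat_input() now receives the number of name bytes that device_write() actually copied in, count - sizeof(struct dlm_write_request32), and clamps the client-supplied namelen to it; previously a 32-bit caller could claim a namelen larger than its write and drive the memcpy out of bounds on both source and destination. Reduced to its essentials:

    #include <stdio.h>
    #include <string.h>

    /* Sketch of the compat_input() fix: a length field under
     * userspace control must be clamped to the bytes actually
     * received before it feeds a memcpy. */
    static size_t copy_name(char *dst, const char *src,
                            size_t claimed, size_t available)
    {
        if (claimed > available)
            claimed = available;    /* truncate, never overrun */
        memcpy(dst, src, claimed);
        return claimed;
    }

    int main(void)
    {
        char name[8];
        /* the caller "claims" 100 name bytes but only sent 5 */
        printf("copied %zu bytes\n", copy_name(name, "hello", 100, 5));
        return 0;
    }

The other changes are tidying: ctl_device becomes a static designated initializer instead of being filled in at runtime inside dlm_user_init(), name_prefix becomes an array rather than a pointer to a string literal, and dlm_user_add_ast() locks through the proc pointer it already captured rather than re-dereferencing ua->proc.
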
diff --git a/fs/dlm/util.c b/fs/dlm/util.c index 963889cf6740..4d9c1f4e1bd1 100644 --- a/fs/dlm/util.c +++ b/fs/dlm/util.c | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /****************************************************************************** | 1 | /****************************************************************************** |
| 2 | ******************************************************************************* | 2 | ******************************************************************************* |
| 3 | ** | 3 | ** |
| 4 | ** Copyright (C) 2005 Red Hat, Inc. All rights reserved. | 4 | ** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. |
| 5 | ** | 5 | ** |
| 6 | ** This copyrighted material is made available to anyone wishing to use, | 6 | ** This copyrighted material is made available to anyone wishing to use, |
| 7 | ** modify, copy, or redistribute it subject to the terms and conditions | 7 | ** modify, copy, or redistribute it subject to the terms and conditions |
| @@ -14,6 +14,14 @@ | |||
| 14 | #include "rcom.h" | 14 | #include "rcom.h" |
| 15 | #include "util.h" | 15 | #include "util.h" |
| 16 | 16 | ||
| 17 | #define DLM_ERRNO_EDEADLK 35 | ||
| 18 | #define DLM_ERRNO_EBADR 53 | ||
| 19 | #define DLM_ERRNO_EBADSLT 57 | ||
| 20 | #define DLM_ERRNO_EPROTO 71 | ||
| 21 | #define DLM_ERRNO_EOPNOTSUPP 95 | ||
| 22 | #define DLM_ERRNO_ETIMEDOUT 110 | ||
| 23 | #define DLM_ERRNO_EINPROGRESS 115 | ||
| 24 | |||
| 17 | static void header_out(struct dlm_header *hd) | 25 | static void header_out(struct dlm_header *hd) |
| 18 | { | 26 | { |
| 19 | hd->h_version = cpu_to_le32(hd->h_version); | 27 | hd->h_version = cpu_to_le32(hd->h_version); |
| @@ -30,11 +38,54 @@ static void header_in(struct dlm_header *hd) | |||
| 30 | hd->h_length = le16_to_cpu(hd->h_length); | 38 | hd->h_length = le16_to_cpu(hd->h_length); |
| 31 | } | 39 | } |
| 32 | 40 | ||
| 33 | void dlm_message_out(struct dlm_message *ms) | 41 | /* higher errno values are inconsistent across architectures, so select |
| 42 | one set of values for on the wire */ | ||
| 43 | |||
| 44 | static int to_dlm_errno(int err) | ||
| 45 | { | ||
| 46 | switch (err) { | ||
| 47 | case -EDEADLK: | ||
| 48 | return -DLM_ERRNO_EDEADLK; | ||
| 49 | case -EBADR: | ||
| 50 | return -DLM_ERRNO_EBADR; | ||
| 51 | case -EBADSLT: | ||
| 52 | return -DLM_ERRNO_EBADSLT; | ||
| 53 | case -EPROTO: | ||
| 54 | return -DLM_ERRNO_EPROTO; | ||
| 55 | case -EOPNOTSUPP: | ||
| 56 | return -DLM_ERRNO_EOPNOTSUPP; | ||
| 57 | case -ETIMEDOUT: | ||
| 58 | return -DLM_ERRNO_ETIMEDOUT; | ||
| 59 | case -EINPROGRESS: | ||
| 60 | return -DLM_ERRNO_EINPROGRESS; | ||
| 61 | } | ||
| 62 | return err; | ||
| 63 | } | ||
| 64 | |||
| 65 | static int from_dlm_errno(int err) | ||
| 34 | { | 66 | { |
| 35 | struct dlm_header *hd = (struct dlm_header *) ms; | 67 | switch (err) { |
| 68 | case -DLM_ERRNO_EDEADLK: | ||
| 69 | return -EDEADLK; | ||
| 70 | case -DLM_ERRNO_EBADR: | ||
| 71 | return -EBADR; | ||
| 72 | case -DLM_ERRNO_EBADSLT: | ||
| 73 | return -EBADSLT; | ||
| 74 | case -DLM_ERRNO_EPROTO: | ||
| 75 | return -EPROTO; | ||
| 76 | case -DLM_ERRNO_EOPNOTSUPP: | ||
| 77 | return -EOPNOTSUPP; | ||
| 78 | case -DLM_ERRNO_ETIMEDOUT: | ||
| 79 | return -ETIMEDOUT; | ||
| 80 | case -DLM_ERRNO_EINPROGRESS: | ||
| 81 | return -EINPROGRESS; | ||
| 82 | } | ||
| 83 | return err; | ||
| 84 | } | ||
| 36 | 85 | ||
| 37 | header_out(hd); | 86 | void dlm_message_out(struct dlm_message *ms) |
| 87 | { | ||
| 88 | header_out(&ms->m_header); | ||
| 38 | 89 | ||
| 39 | ms->m_type = cpu_to_le32(ms->m_type); | 90 | ms->m_type = cpu_to_le32(ms->m_type); |
| 40 | ms->m_nodeid = cpu_to_le32(ms->m_nodeid); | 91 | ms->m_nodeid = cpu_to_le32(ms->m_nodeid); |
| @@ -53,14 +104,12 @@ void dlm_message_out(struct dlm_message *ms) | |||
| 53 | ms->m_rqmode = cpu_to_le32(ms->m_rqmode); | 104 | ms->m_rqmode = cpu_to_le32(ms->m_rqmode); |
| 54 | ms->m_bastmode = cpu_to_le32(ms->m_bastmode); | 105 | ms->m_bastmode = cpu_to_le32(ms->m_bastmode); |
| 55 | ms->m_asts = cpu_to_le32(ms->m_asts); | 106 | ms->m_asts = cpu_to_le32(ms->m_asts); |
| 56 | ms->m_result = cpu_to_le32(ms->m_result); | 107 | ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result)); |
| 57 | } | 108 | } |
| 58 | 109 | ||
| 59 | void dlm_message_in(struct dlm_message *ms) | 110 | void dlm_message_in(struct dlm_message *ms) |
| 60 | { | 111 | { |
| 61 | struct dlm_header *hd = (struct dlm_header *) ms; | 112 | header_in(&ms->m_header); |
| 62 | |||
| 63 | header_in(hd); | ||
| 64 | 113 | ||
| 65 | ms->m_type = le32_to_cpu(ms->m_type); | 114 | ms->m_type = le32_to_cpu(ms->m_type); |
| 66 | ms->m_nodeid = le32_to_cpu(ms->m_nodeid); | 115 | ms->m_nodeid = le32_to_cpu(ms->m_nodeid); |
| @@ -79,7 +128,7 @@ void dlm_message_in(struct dlm_message *ms) | |||
| 79 | ms->m_rqmode = le32_to_cpu(ms->m_rqmode); | 128 | ms->m_rqmode = le32_to_cpu(ms->m_rqmode); |
| 80 | ms->m_bastmode = le32_to_cpu(ms->m_bastmode); | 129 | ms->m_bastmode = le32_to_cpu(ms->m_bastmode); |
| 81 | ms->m_asts = le32_to_cpu(ms->m_asts); | 130 | ms->m_asts = le32_to_cpu(ms->m_asts); |
| 82 | ms->m_result = le32_to_cpu(ms->m_result); | 131 | ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result)); |
| 83 | } | 132 | } |
| 84 | 133 | ||
| 85 | static void rcom_lock_out(struct rcom_lock *rl) | 134 | static void rcom_lock_out(struct rcom_lock *rl) |
| @@ -126,10 +175,9 @@ static void rcom_config_in(struct rcom_config *rf) | |||
| 126 | 175 | ||
| 127 | void dlm_rcom_out(struct dlm_rcom *rc) | 176 | void dlm_rcom_out(struct dlm_rcom *rc) |
| 128 | { | 177 | { |
| 129 | struct dlm_header *hd = (struct dlm_header *) rc; | ||
| 130 | int type = rc->rc_type; | 178 | int type = rc->rc_type; |
| 131 | 179 | ||
| 132 | header_out(hd); | 180 | header_out(&rc->rc_header); |
| 133 | 181 | ||
| 134 | rc->rc_type = cpu_to_le32(rc->rc_type); | 182 | rc->rc_type = cpu_to_le32(rc->rc_type); |
| 135 | rc->rc_result = cpu_to_le32(rc->rc_result); | 183 | rc->rc_result = cpu_to_le32(rc->rc_result); |
| @@ -137,7 +185,7 @@ void dlm_rcom_out(struct dlm_rcom *rc) | |||
| 137 | rc->rc_seq = cpu_to_le64(rc->rc_seq); | 185 | rc->rc_seq = cpu_to_le64(rc->rc_seq); |
| 138 | rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); | 186 | rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); |
| 139 | 187 | ||
| 140 | if (type == DLM_RCOM_LOCK) | 188 | if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY)) |
| 141 | rcom_lock_out((struct rcom_lock *) rc->rc_buf); | 189 | rcom_lock_out((struct rcom_lock *) rc->rc_buf); |
| 142 | 190 | ||
| 143 | else if (type == DLM_RCOM_STATUS_REPLY) | 191 | else if (type == DLM_RCOM_STATUS_REPLY) |
| @@ -146,9 +194,9 @@ void dlm_rcom_out(struct dlm_rcom *rc) | |||
| 146 | 194 | ||
| 147 | void dlm_rcom_in(struct dlm_rcom *rc) | 195 | void dlm_rcom_in(struct dlm_rcom *rc) |
| 148 | { | 196 | { |
| 149 | struct dlm_header *hd = (struct dlm_header *) rc; | 197 | int type; |
| 150 | 198 | ||
| 151 | header_in(hd); | 199 | header_in(&rc->rc_header); |
| 152 | 200 | ||
| 153 | rc->rc_type = le32_to_cpu(rc->rc_type); | 201 | rc->rc_type = le32_to_cpu(rc->rc_type); |
| 154 | rc->rc_result = le32_to_cpu(rc->rc_result); | 202 | rc->rc_result = le32_to_cpu(rc->rc_result); |
| @@ -156,10 +204,12 @@ void dlm_rcom_in(struct dlm_rcom *rc) | |||
| 156 | rc->rc_seq = le64_to_cpu(rc->rc_seq); | 204 | rc->rc_seq = le64_to_cpu(rc->rc_seq); |
| 157 | rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); | 205 | rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); |
| 158 | 206 | ||
| 159 | if (rc->rc_type == DLM_RCOM_LOCK) | 207 | type = rc->rc_type; |
| 208 | |||
| 209 | if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY)) | ||
| 160 | rcom_lock_in((struct rcom_lock *) rc->rc_buf); | 210 | rcom_lock_in((struct rcom_lock *) rc->rc_buf); |
| 161 | 211 | ||
| 162 | else if (rc->rc_type == DLM_RCOM_STATUS_REPLY) | 212 | else if (type == DLM_RCOM_STATUS_REPLY) |
| 163 | rcom_config_in((struct rcom_config *) rc->rc_buf); | 213 | rcom_config_in((struct rcom_config *) rc->rc_buf); |
| 164 | } | 214 | } |
| 165 | 215 | ||
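
Three things happen in util.c. The casts of message and rcom pointers to struct dlm_header * give way to taking the embedded header's address directly, which is equivalent (the header is the first member) but type-checked. rcom_lock payloads in DLM_RCOM_LOCK_REPLY packets are now byte-swapped like DLM_RCOM_LOCK ones, which matters for lock recovery between mixed-endian nodes. And m_result is translated to fixed on-wire values: errno numbering above the low common values varies by architecture (EDEADLK is 35 on x86 but 45 on mips, for instance), so a raw errno in a message between mixed-architecture nodes could be misread. The DLM_ERRNO_* constants pin the x86 values for the wire; anything outside the table passes through untouched. A userspace restatement that checks the round-trip property:

    #include <assert.h>
    #include <errno.h>
    #include <stdio.h>

    /* (local errno, on-wire value) pairs from the patch; the wire
     * values are the x86 errno numbers. */
    static const int map[][2] = {
        { EDEADLK, 35 },    { EBADR, 53 },       { EBADSLT, 57 },
        { EPROTO, 71 },     { EOPNOTSUPP, 95 },  { ETIMEDOUT, 110 },
        { EINPROGRESS, 115 },
    };
    #define NMAP (sizeof(map) / sizeof(map[0]))

    static int to_wire(int err)     /* mirrors to_dlm_errno() */
    {
        for (unsigned i = 0; i < NMAP; i++)
            if (err == -map[i][0])
                return -map[i][1];
        return err;
    }

    static int from_wire(int err)   /* mirrors from_dlm_errno() */
    {
        for (unsigned i = 0; i < NMAP; i++)
            if (err == -map[i][1])
                return -map[i][0];
        return err;
    }

    int main(void)
    {
        for (unsigned i = 0; i < NMAP; i++)
            assert(from_wire(to_wire(-map[i][0])) == -map[i][0]);
        assert(to_wire(-ENOMEM) == -ENOMEM);   /* untranslated */
        puts("round-trip ok");
        return 0;
    }

On an x86 build the translation is the identity; on an architecture with different numbering it is not, which is the whole point.
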
diff --git a/include/asm-x86/Kbuild b/include/asm-x86/Kbuild index e6189b229143..3c6f0f80e827 100644 --- a/include/asm-x86/Kbuild +++ b/include/asm-x86/Kbuild | |||
| @@ -3,6 +3,7 @@ include include/asm-generic/Kbuild.asm | |||
| 3 | header-y += boot.h | 3 | header-y += boot.h |
| 4 | header-y += bootparam.h | 4 | header-y += bootparam.h |
| 5 | header-y += debugreg.h | 5 | header-y += debugreg.h |
| 6 | header-y += kvm.h | ||
| 6 | header-y += ldt.h | 7 | header-y += ldt.h |
| 7 | header-y += msr-index.h | 8 | header-y += msr-index.h |
| 8 | header-y += prctl.h | 9 | header-y += prctl.h |
diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h new file mode 100644 index 000000000000..7a71120426a3 --- /dev/null +++ b/include/asm-x86/kvm.h | |||
| @@ -0,0 +1,191 @@ | |||
| 1 | #ifndef __LINUX_KVM_X86_H | ||
| 2 | #define __LINUX_KVM_X86_H | ||
| 3 | |||
| 4 | /* | ||
| 5 | * KVM x86 specific structures and definitions | ||
| 6 | * | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <asm/types.h> | ||
| 10 | #include <linux/ioctl.h> | ||
| 11 | |||
| 12 | /* Architectural interrupt line count. */ | ||
| 13 | #define KVM_NR_INTERRUPTS 256 | ||
| 14 | |||
| 15 | struct kvm_memory_alias { | ||
| 16 | __u32 slot; /* this has a different namespace than memory slots */ | ||
| 17 | __u32 flags; | ||
| 18 | __u64 guest_phys_addr; | ||
| 19 | __u64 memory_size; | ||
| 20 | __u64 target_phys_addr; | ||
| 21 | }; | ||
| 22 | |||
| 23 | /* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ | ||
| 24 | struct kvm_pic_state { | ||
| 25 | __u8 last_irr; /* edge detection */ | ||
| 26 | __u8 irr; /* interrupt request register */ | ||
| 27 | __u8 imr; /* interrupt mask register */ | ||
| 28 | __u8 isr; /* interrupt service register */ | ||
| 29 | __u8 priority_add; /* highest irq priority */ | ||
| 30 | __u8 irq_base; | ||
| 31 | __u8 read_reg_select; | ||
| 32 | __u8 poll; | ||
| 33 | __u8 special_mask; | ||
| 34 | __u8 init_state; | ||
| 35 | __u8 auto_eoi; | ||
| 36 | __u8 rotate_on_auto_eoi; | ||
| 37 | __u8 special_fully_nested_mode; | ||
| 38 | __u8 init4; /* true if 4 byte init */ | ||
| 39 | __u8 elcr; /* PIIX edge/trigger selection */ | ||
| 40 | __u8 elcr_mask; | ||
| 41 | }; | ||
| 42 | |||
| 43 | #define KVM_IOAPIC_NUM_PINS 24 | ||
| 44 | struct kvm_ioapic_state { | ||
| 45 | __u64 base_address; | ||
| 46 | __u32 ioregsel; | ||
| 47 | __u32 id; | ||
| 48 | __u32 irr; | ||
| 49 | __u32 pad; | ||
| 50 | union { | ||
| 51 | __u64 bits; | ||
| 52 | struct { | ||
| 53 | __u8 vector; | ||
| 54 | __u8 delivery_mode:3; | ||
| 55 | __u8 dest_mode:1; | ||
| 56 | __u8 delivery_status:1; | ||
| 57 | __u8 polarity:1; | ||
| 58 | __u8 remote_irr:1; | ||
| 59 | __u8 trig_mode:1; | ||
| 60 | __u8 mask:1; | ||
| 61 | __u8 reserve:7; | ||
| 62 | __u8 reserved[4]; | ||
| 63 | __u8 dest_id; | ||
| 64 | } fields; | ||
| 65 | } redirtbl[KVM_IOAPIC_NUM_PINS]; | ||
| 66 | }; | ||
| 67 | |||
| 68 | #define KVM_IRQCHIP_PIC_MASTER 0 | ||
| 69 | #define KVM_IRQCHIP_PIC_SLAVE 1 | ||
| 70 | #define KVM_IRQCHIP_IOAPIC 2 | ||
| 71 | |||
| 72 | /* for KVM_GET_REGS and KVM_SET_REGS */ | ||
| 73 | struct kvm_regs { | ||
| 74 | /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ | ||
| 75 | __u64 rax, rbx, rcx, rdx; | ||
| 76 | __u64 rsi, rdi, rsp, rbp; | ||
| 77 | __u64 r8, r9, r10, r11; | ||
| 78 | __u64 r12, r13, r14, r15; | ||
| 79 | __u64 rip, rflags; | ||
| 80 | }; | ||
| 81 | |||
| 82 | /* for KVM_GET_LAPIC and KVM_SET_LAPIC */ | ||
| 83 | #define KVM_APIC_REG_SIZE 0x400 | ||
| 84 | struct kvm_lapic_state { | ||
| 85 | char regs[KVM_APIC_REG_SIZE]; | ||
| 86 | }; | ||
| 87 | |||
| 88 | struct kvm_segment { | ||
| 89 | __u64 base; | ||
| 90 | __u32 limit; | ||
| 91 | __u16 selector; | ||
| 92 | __u8 type; | ||
| 93 | __u8 present, dpl, db, s, l, g, avl; | ||
| 94 | __u8 unusable; | ||
| 95 | __u8 padding; | ||
| 96 | }; | ||
| 97 | |||
| 98 | struct kvm_dtable { | ||
| 99 | __u64 base; | ||
| 100 | __u16 limit; | ||
| 101 | __u16 padding[3]; | ||
| 102 | }; | ||
| 103 | |||
| 104 | |||
| 105 | /* for KVM_GET_SREGS and KVM_SET_SREGS */ | ||
| 106 | struct kvm_sregs { | ||
| 107 | /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ | ||
| 108 | struct kvm_segment cs, ds, es, fs, gs, ss; | ||
| 109 | struct kvm_segment tr, ldt; | ||
| 110 | struct kvm_dtable gdt, idt; | ||
| 111 | __u64 cr0, cr2, cr3, cr4, cr8; | ||
| 112 | __u64 efer; | ||
| 113 | __u64 apic_base; | ||
| 114 | __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; | ||
| 115 | }; | ||
| 116 | |||
| 117 | /* for KVM_GET_FPU and KVM_SET_FPU */ | ||
| 118 | struct kvm_fpu { | ||
| 119 | __u8 fpr[8][16]; | ||
| 120 | __u16 fcw; | ||
| 121 | __u16 fsw; | ||
| 122 | __u8 ftwx; /* in fxsave format */ | ||
| 123 | __u8 pad1; | ||
| 124 | __u16 last_opcode; | ||
| 125 | __u64 last_ip; | ||
| 126 | __u64 last_dp; | ||
| 127 | __u8 xmm[16][16]; | ||
| 128 | __u32 mxcsr; | ||
| 129 | __u32 pad2; | ||
| 130 | }; | ||
| 131 | |||
| 132 | struct kvm_msr_entry { | ||
| 133 | __u32 index; | ||
| 134 | __u32 reserved; | ||
| 135 | __u64 data; | ||
| 136 | }; | ||
| 137 | |||
| 138 | /* for KVM_GET_MSRS and KVM_SET_MSRS */ | ||
| 139 | struct kvm_msrs { | ||
| 140 | __u32 nmsrs; /* number of msrs in entries */ | ||
| 141 | __u32 pad; | ||
| 142 | |||
| 143 | struct kvm_msr_entry entries[0]; | ||
| 144 | }; | ||
| 145 | |||
| 146 | /* for KVM_GET_MSR_INDEX_LIST */ | ||
| 147 | struct kvm_msr_list { | ||
| 148 | __u32 nmsrs; /* number of msrs in entries */ | ||
| 149 | __u32 indices[0]; | ||
| 150 | }; | ||
| 151 | |||
| 152 | |||
| 153 | struct kvm_cpuid_entry { | ||
| 154 | __u32 function; | ||
| 155 | __u32 eax; | ||
| 156 | __u32 ebx; | ||
| 157 | __u32 ecx; | ||
| 158 | __u32 edx; | ||
| 159 | __u32 padding; | ||
| 160 | }; | ||
| 161 | |||
| 162 | /* for KVM_SET_CPUID */ | ||
| 163 | struct kvm_cpuid { | ||
| 164 | __u32 nent; | ||
| 165 | __u32 padding; | ||
| 166 | struct kvm_cpuid_entry entries[0]; | ||
| 167 | }; | ||
| 168 | |||
| 169 | struct kvm_cpuid_entry2 { | ||
| 170 | __u32 function; | ||
| 171 | __u32 index; | ||
| 172 | __u32 flags; | ||
| 173 | __u32 eax; | ||
| 174 | __u32 ebx; | ||
| 175 | __u32 ecx; | ||
| 176 | __u32 edx; | ||
| 177 | __u32 padding[3]; | ||
| 178 | }; | ||
| 179 | |||
| 180 | #define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 | ||
| 181 | #define KVM_CPUID_FLAG_STATEFUL_FUNC 2 | ||
| 182 | #define KVM_CPUID_FLAG_STATE_READ_NEXT 4 | ||
| 183 | |||
| 184 | /* for KVM_SET_CPUID2 */ | ||
| 185 | struct kvm_cpuid2 { | ||
| 186 | __u32 nent; | ||
| 187 | __u32 padding; | ||
| 188 | struct kvm_cpuid_entry2 entries[0]; | ||
| 189 | }; | ||
| 190 | |||
| 191 | #endif | ||
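
Everything in this new header is userspace ABI for the ioctls named in the comments (KVM_GET_REGS, KVM_SET_SREGS, KVM_GET_MSRS, and so on), which is why every field is a fixed-width __uXX type and trailing padding is spelled out: the layout has to match between kernel and VMM on both 32- and 64-bit builds, with no compiler-inserted holes. The entries[0] members are the old GNU zero-length-array form of a flexible array, sized by nmsrs/nent at ioctl time, and the SIGNIFCANT spelling is worth noting since names exported to userspace tend to be frozen, typo and all. A rough sketch of how a VMM consumes struct kvm_regs, assuming a Linux box with these headers installed and omitting error handling and the usual API-version and guest-memory setup:

    #include <fcntl.h>
    #include <linux/kvm.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    int main(void)
    {
        int kvm  = open("/dev/kvm", O_RDWR);
        int vm   = ioctl(kvm, KVM_CREATE_VM, 0);
        int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
        struct kvm_regs regs;

        ioctl(vcpu, KVM_GET_REGS, &regs);  /* fills the struct above */
        regs.rip = 0x1000;                 /* point the guest at its code */
        ioctl(vcpu, KVM_SET_REGS, &regs);

        printf("rip=%llx rflags=%llx\n",
               (unsigned long long)regs.rip,
               (unsigned long long)regs.rflags);
        return 0;
    }
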
diff --git a/drivers/kvm/kvm.h b/include/asm-x86/kvm_host.h index 3b0bc4bda5f2..4702b04b979a 100644 --- a/drivers/kvm/kvm.h +++ b/include/asm-x86/kvm_host.h | |||
| @@ -1,23 +1,24 @@ | |||
| 1 | #ifndef __KVM_H | 1 | #/* |
| 2 | #define __KVM_H | 2 | * Kernel-based Virtual Machine driver for Linux |
| 3 | 3 | * | |
| 4 | /* | 4 | * This header defines architecture specific interfaces, x86 version |
| 5 | * | ||
| 5 | * This work is licensed under the terms of the GNU GPL, version 2. See | 6 | * This work is licensed under the terms of the GNU GPL, version 2. See |
| 6 | * the COPYING file in the top-level directory. | 7 | * the COPYING file in the top-level directory. |
| 8 | * | ||
| 7 | */ | 9 | */ |
| 8 | 10 | ||
| 11 | #ifndef ASM_KVM_HOST_H | ||
| 12 | #define ASM_KVM_HOST_H | ||
| 13 | |||
| 9 | #include <linux/types.h> | 14 | #include <linux/types.h> |
| 10 | #include <linux/list.h> | ||
| 11 | #include <linux/mutex.h> | ||
| 12 | #include <linux/spinlock.h> | ||
| 13 | #include <linux/signal.h> | ||
| 14 | #include <linux/sched.h> | ||
| 15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
| 16 | #include <linux/preempt.h> | ||
| 17 | #include <asm/signal.h> | ||
| 18 | 16 | ||
| 19 | #include <linux/kvm.h> | 17 | #include <linux/kvm.h> |
| 20 | #include <linux/kvm_para.h> | 18 | #include <linux/kvm_para.h> |
| 19 | #include <linux/kvm_types.h> | ||
| 20 | |||
| 21 | #include <asm/desc.h> | ||
| 21 | 22 | ||
| 22 | #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) | 23 | #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) |
| 23 | #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) | 24 | #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) |
| @@ -37,15 +38,8 @@ | |||
| 37 | #define INVALID_PAGE (~(hpa_t)0) | 38 | #define INVALID_PAGE (~(hpa_t)0) |
| 38 | #define UNMAPPED_GVA (~(gpa_t)0) | 39 | #define UNMAPPED_GVA (~(gpa_t)0) |
| 39 | 40 | ||
| 40 | #define KVM_MAX_VCPUS 4 | ||
| 41 | #define KVM_ALIAS_SLOTS 4 | ||
| 42 | #define KVM_MEMORY_SLOTS 8 | ||
| 43 | #define KVM_NUM_MMU_PAGES 1024 | ||
| 44 | #define KVM_MIN_FREE_MMU_PAGES 5 | ||
| 45 | #define KVM_REFILL_PAGES 25 | ||
| 46 | #define KVM_MAX_CPUID_ENTRIES 40 | ||
| 47 | |||
| 48 | #define DE_VECTOR 0 | 41 | #define DE_VECTOR 0 |
| 42 | #define UD_VECTOR 6 | ||
| 49 | #define NM_VECTOR 7 | 43 | #define NM_VECTOR 7 |
| 50 | #define DF_VECTOR 8 | 44 | #define DF_VECTOR 8 |
| 51 | #define TS_VECTOR 10 | 45 | #define TS_VECTOR 10 |
| @@ -59,31 +53,66 @@ | |||
| 59 | 53 | ||
| 60 | #define IOPL_SHIFT 12 | 54 | #define IOPL_SHIFT 12 |
| 61 | 55 | ||
| 62 | #define KVM_PIO_PAGE_OFFSET 1 | 56 | #define KVM_ALIAS_SLOTS 4 |
| 63 | 57 | ||
| 64 | /* | 58 | #define KVM_PERMILLE_MMU_PAGES 20 |
| 65 | * vcpu->requests bit members | 59 | #define KVM_MIN_ALLOC_MMU_PAGES 64 |
| 66 | */ | 60 | #define KVM_NUM_MMU_PAGES 1024 |
| 67 | #define KVM_TLB_FLUSH 0 | 61 | #define KVM_MIN_FREE_MMU_PAGES 5 |
| 62 | #define KVM_REFILL_PAGES 25 | ||
| 63 | #define KVM_MAX_CPUID_ENTRIES 40 | ||
| 68 | 64 | ||
| 69 | /* | 65 | extern spinlock_t kvm_lock; |
| 70 | * Address types: | 66 | extern struct list_head vm_list; |
| 71 | * | 67 | |
| 72 | * gva - guest virtual address | 68 | struct kvm_vcpu; |
| 73 | * gpa - guest physical address | 69 | struct kvm; |
| 74 | * gfn - guest frame number | 70 | |
| 75 | * hva - host virtual address | 71 | enum { |
| 76 | * hpa - host physical address | 72 | VCPU_REGS_RAX = 0, |
| 77 | * hfn - host frame number | 73 | VCPU_REGS_RCX = 1, |
| 78 | */ | 74 | VCPU_REGS_RDX = 2, |
| 75 | VCPU_REGS_RBX = 3, | ||
| 76 | VCPU_REGS_RSP = 4, | ||
| 77 | VCPU_REGS_RBP = 5, | ||
| 78 | VCPU_REGS_RSI = 6, | ||
| 79 | VCPU_REGS_RDI = 7, | ||
| 80 | #ifdef CONFIG_X86_64 | ||
| 81 | VCPU_REGS_R8 = 8, | ||
| 82 | VCPU_REGS_R9 = 9, | ||
| 83 | VCPU_REGS_R10 = 10, | ||
| 84 | VCPU_REGS_R11 = 11, | ||
| 85 | VCPU_REGS_R12 = 12, | ||
| 86 | VCPU_REGS_R13 = 13, | ||
| 87 | VCPU_REGS_R14 = 14, | ||
| 88 | VCPU_REGS_R15 = 15, | ||
| 89 | #endif | ||
| 90 | NR_VCPU_REGS | ||
| 91 | }; | ||
| 92 | |||
| 93 | enum { | ||
| 94 | VCPU_SREG_CS, | ||
| 95 | VCPU_SREG_DS, | ||
| 96 | VCPU_SREG_ES, | ||
| 97 | VCPU_SREG_FS, | ||
| 98 | VCPU_SREG_GS, | ||
| 99 | VCPU_SREG_SS, | ||
| 100 | VCPU_SREG_TR, | ||
| 101 | VCPU_SREG_LDTR, | ||
| 102 | }; | ||
| 79 | 103 | ||
| 80 | typedef unsigned long gva_t; | 104 | #include <asm/kvm_x86_emulate.h> |
| 81 | typedef u64 gpa_t; | ||
| 82 | typedef unsigned long gfn_t; | ||
| 83 | 105 | ||
| 84 | typedef unsigned long hva_t; | 106 | #define KVM_NR_MEM_OBJS 40 |
| 85 | typedef u64 hpa_t; | 107 | |
| 86 | typedef unsigned long hfn_t; | 108 | /* |
| 109 | * We don't want allocation failures within the mmu code, so we preallocate | ||
| 110 | * enough memory for a single page fault in a cache. | ||
| 111 | */ | ||
| 112 | struct kvm_mmu_memory_cache { | ||
| 113 | int nobjs; | ||
| 114 | void *objects[KVM_NR_MEM_OBJS]; | ||
| 115 | }; | ||
| 87 | 116 | ||
| 88 | #define NR_PTE_CHAIN_ENTRIES 5 | 117 | #define NR_PTE_CHAIN_ENTRIES 5 |
| 89 | 118 | ||
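The preallocation comment above implies a simple topup/drain discipline: fill the cache with GFP_KERNEL allocations while sleeping is still allowed, then draw from it without any failure path inside the fault handler. A minimal sketch of that pattern, assuming <linux/slab.h>; the in-tree helpers in the mmu code also handle page-sized objects and per-type caches:

static int topup_memory_cache(struct kvm_mmu_memory_cache *cache,
			      size_t objsize, int min)
{
	while (cache->nobjs < min) {
		void *obj = kzalloc(objsize, GFP_KERNEL);

		if (!obj)
			return -ENOMEM;	/* nothing held yet, safe to fail */
		cache->objects[cache->nobjs++] = obj;
	}
	return 0;
}

static void *memory_cache_alloc(struct kvm_mmu_memory_cache *cache)
{
	BUG_ON(!cache->nobjs);	/* topup must have run beforehand */
	return cache->objects[--cache->nobjs];
}

min must not exceed KVM_NR_MEM_OBJS, which this patch raises from 20 to 40.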
| @@ -99,7 +128,7 @@ struct kvm_pte_chain { | |||
| 99 | * bits 4:7 - page table level for this shadow (1-4) | 128 | * bits 4:7 - page table level for this shadow (1-4) |
| 100 | * bits 8:9 - page table quadrant for 2-level guests | 129 | * bits 8:9 - page table quadrant for 2-level guests |
| 101 | * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) | 130 | * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) |
| 102 | * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde | 131 | * bits 17:19 - common access permissions for all ptes in this shadow page |
| 103 | */ | 132 | */ |
| 104 | union kvm_mmu_page_role { | 133 | union kvm_mmu_page_role { |
| 105 | unsigned word; | 134 | unsigned word; |
| @@ -109,7 +138,7 @@ union kvm_mmu_page_role { | |||
| 109 | unsigned quadrant : 2; | 138 | unsigned quadrant : 2; |
| 110 | unsigned pad_for_nice_hex_output : 6; | 139 | unsigned pad_for_nice_hex_output : 6; |
| 111 | unsigned metaphysical : 1; | 140 | unsigned metaphysical : 1; |
| 112 | unsigned hugepage_access : 3; | 141 | unsigned access : 3; |
| 113 | }; | 142 | }; |
| 114 | }; | 143 | }; |
| 115 | 144 | ||
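Because all the role bits share the one 'word' member, shadow-page lookups can key their hash and comparisons on a single integer. Illustrative only:

static inline int mmu_page_role_equal(union kvm_mmu_page_role a,
				      union kvm_mmu_page_role b)
{
	return a.word == b.word;	/* compares every bitfield at once */
}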
| @@ -125,6 +154,8 @@ struct kvm_mmu_page { | |||
| 125 | union kvm_mmu_page_role role; | 154 | union kvm_mmu_page_role role; |
| 126 | 155 | ||
| 127 | u64 *spt; | 156 | u64 *spt; |
| 157 | /* hold the gfn of each spte inside spt */ | ||
| 158 | gfn_t *gfns; | ||
| 128 | unsigned long slot_bitmap; /* One bit set per slot which has memory | 159 | unsigned long slot_bitmap; /* One bit set per slot which has memory |
| 129 | * in this shadow page. | 160 | * in this shadow page. |
| 130 | */ | 161 | */ |
| @@ -136,9 +167,6 @@ struct kvm_mmu_page { | |||
| 136 | }; | 167 | }; |
| 137 | }; | 168 | }; |
| 138 | 169 | ||
| 139 | struct kvm_vcpu; | ||
| 140 | extern struct kmem_cache *kvm_vcpu_cache; | ||
| 141 | |||
| 142 | /* | 170 | /* |
| 143 | * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level | 171 | * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level |
| 144 | * 32-bit). The kvm_mmu structure abstracts the details of the current mmu | 172 | * 32-bit). The kvm_mmu structure abstracts the details of the current mmu |
| @@ -149,6 +177,8 @@ struct kvm_mmu { | |||
| 149 | int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); | 177 | int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); |
| 150 | void (*free)(struct kvm_vcpu *vcpu); | 178 | void (*free)(struct kvm_vcpu *vcpu); |
| 151 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); | 179 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); |
| 180 | void (*prefetch_page)(struct kvm_vcpu *vcpu, | ||
| 181 | struct kvm_mmu_page *page); | ||
| 152 | hpa_t root_hpa; | 182 | hpa_t root_hpa; |
| 153 | int root_level; | 183 | int root_level; |
| 154 | int shadow_root_level; | 184 | int shadow_root_level; |
| @@ -156,159 +186,9 @@ struct kvm_mmu { | |||
| 156 | u64 *pae_root; | 186 | u64 *pae_root; |
| 157 | }; | 187 | }; |
| 158 | 188 | ||
| 159 | #define KVM_NR_MEM_OBJS 20 | 189 | struct kvm_vcpu_arch { |
| 160 | |||
| 161 | struct kvm_mmu_memory_cache { | ||
| 162 | int nobjs; | ||
| 163 | void *objects[KVM_NR_MEM_OBJS]; | ||
| 164 | }; | ||
| 165 | |||
| 166 | /* | ||
| 167 | * We don't want allocation failures within the mmu code, so we preallocate | ||
| 168 | * enough memory for a single page fault in a cache. | ||
| 169 | */ | ||
| 170 | struct kvm_guest_debug { | ||
| 171 | int enabled; | ||
| 172 | unsigned long bp[4]; | ||
| 173 | int singlestep; | ||
| 174 | }; | ||
| 175 | |||
| 176 | enum { | ||
| 177 | VCPU_REGS_RAX = 0, | ||
| 178 | VCPU_REGS_RCX = 1, | ||
| 179 | VCPU_REGS_RDX = 2, | ||
| 180 | VCPU_REGS_RBX = 3, | ||
| 181 | VCPU_REGS_RSP = 4, | ||
| 182 | VCPU_REGS_RBP = 5, | ||
| 183 | VCPU_REGS_RSI = 6, | ||
| 184 | VCPU_REGS_RDI = 7, | ||
| 185 | #ifdef CONFIG_X86_64 | ||
| 186 | VCPU_REGS_R8 = 8, | ||
| 187 | VCPU_REGS_R9 = 9, | ||
| 188 | VCPU_REGS_R10 = 10, | ||
| 189 | VCPU_REGS_R11 = 11, | ||
| 190 | VCPU_REGS_R12 = 12, | ||
| 191 | VCPU_REGS_R13 = 13, | ||
| 192 | VCPU_REGS_R14 = 14, | ||
| 193 | VCPU_REGS_R15 = 15, | ||
| 194 | #endif | ||
| 195 | NR_VCPU_REGS | ||
| 196 | }; | ||
| 197 | |||
| 198 | enum { | ||
| 199 | VCPU_SREG_CS, | ||
| 200 | VCPU_SREG_DS, | ||
| 201 | VCPU_SREG_ES, | ||
| 202 | VCPU_SREG_FS, | ||
| 203 | VCPU_SREG_GS, | ||
| 204 | VCPU_SREG_SS, | ||
| 205 | VCPU_SREG_TR, | ||
| 206 | VCPU_SREG_LDTR, | ||
| 207 | }; | ||
| 208 | |||
| 209 | struct kvm_pio_request { | ||
| 210 | unsigned long count; | ||
| 211 | int cur_count; | ||
| 212 | struct page *guest_pages[2]; | ||
| 213 | unsigned guest_page_offset; | ||
| 214 | int in; | ||
| 215 | int port; | ||
| 216 | int size; | ||
| 217 | int string; | ||
| 218 | int down; | ||
| 219 | int rep; | ||
| 220 | }; | ||
| 221 | |||
| 222 | struct kvm_stat { | ||
| 223 | u32 pf_fixed; | ||
| 224 | u32 pf_guest; | ||
| 225 | u32 tlb_flush; | ||
| 226 | u32 invlpg; | ||
| 227 | |||
| 228 | u32 exits; | ||
| 229 | u32 io_exits; | ||
| 230 | u32 mmio_exits; | ||
| 231 | u32 signal_exits; | ||
| 232 | u32 irq_window_exits; | ||
| 233 | u32 halt_exits; | ||
| 234 | u32 halt_wakeup; | ||
| 235 | u32 request_irq_exits; | ||
| 236 | u32 irq_exits; | ||
| 237 | u32 light_exits; | ||
| 238 | u32 efer_reload; | ||
| 239 | }; | ||
| 240 | |||
| 241 | struct kvm_io_device { | ||
| 242 | void (*read)(struct kvm_io_device *this, | ||
| 243 | gpa_t addr, | ||
| 244 | int len, | ||
| 245 | void *val); | ||
| 246 | void (*write)(struct kvm_io_device *this, | ||
| 247 | gpa_t addr, | ||
| 248 | int len, | ||
| 249 | const void *val); | ||
| 250 | int (*in_range)(struct kvm_io_device *this, gpa_t addr); | ||
| 251 | void (*destructor)(struct kvm_io_device *this); | ||
| 252 | |||
| 253 | void *private; | ||
| 254 | }; | ||
| 255 | |||
| 256 | static inline void kvm_iodevice_read(struct kvm_io_device *dev, | ||
| 257 | gpa_t addr, | ||
| 258 | int len, | ||
| 259 | void *val) | ||
| 260 | { | ||
| 261 | dev->read(dev, addr, len, val); | ||
| 262 | } | ||
| 263 | |||
| 264 | static inline void kvm_iodevice_write(struct kvm_io_device *dev, | ||
| 265 | gpa_t addr, | ||
| 266 | int len, | ||
| 267 | const void *val) | ||
| 268 | { | ||
| 269 | dev->write(dev, addr, len, val); | ||
| 270 | } | ||
| 271 | |||
| 272 | static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) | ||
| 273 | { | ||
| 274 | return dev->in_range(dev, addr); | ||
| 275 | } | ||
| 276 | |||
| 277 | static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) | ||
| 278 | { | ||
| 279 | if (dev->destructor) | ||
| 280 | dev->destructor(dev); | ||
| 281 | } | ||
| 282 | |||
| 283 | /* | ||
| 284 | * It would be nice to use something smarter than a linear search, TBD... | ||
| 285 | * Thankfully we dont expect many devices to register (famous last words :), | ||
| 286 | * so until then it will suffice. At least its abstracted so we can change | ||
| 287 | * in one place. | ||
| 288 | */ | ||
| 289 | struct kvm_io_bus { | ||
| 290 | int dev_count; | ||
| 291 | #define NR_IOBUS_DEVS 6 | ||
| 292 | struct kvm_io_device *devs[NR_IOBUS_DEVS]; | ||
| 293 | }; | ||
| 294 | |||
| 295 | void kvm_io_bus_init(struct kvm_io_bus *bus); | ||
| 296 | void kvm_io_bus_destroy(struct kvm_io_bus *bus); | ||
| 297 | struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); | ||
| 298 | void kvm_io_bus_register_dev(struct kvm_io_bus *bus, | ||
| 299 | struct kvm_io_device *dev); | ||
| 300 | |||
| 301 | struct kvm_vcpu { | ||
| 302 | struct kvm *kvm; | ||
| 303 | struct preempt_notifier preempt_notifier; | ||
| 304 | int vcpu_id; | ||
| 305 | struct mutex mutex; | ||
| 306 | int cpu; | ||
| 307 | u64 host_tsc; | 190 | u64 host_tsc; |
| 308 | struct kvm_run *run; | ||
| 309 | int interrupt_window_open; | 191 | int interrupt_window_open; |
| 310 | int guest_mode; | ||
| 311 | unsigned long requests; | ||
| 312 | unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ | 192 | unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ |
| 313 | DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); | 193 | DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); |
| 314 | unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ | 194 | unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ |
| @@ -317,9 +197,6 @@ struct kvm_vcpu { | |||
| 317 | unsigned long cr0; | 197 | unsigned long cr0; |
| 318 | unsigned long cr2; | 198 | unsigned long cr2; |
| 319 | unsigned long cr3; | 199 | unsigned long cr3; |
| 320 | gpa_t para_state_gpa; | ||
| 321 | struct page *para_state_page; | ||
| 322 | gpa_t hypercall_gpa; | ||
| 323 | unsigned long cr4; | 200 | unsigned long cr4; |
| 324 | unsigned long cr8; | 201 | unsigned long cr8; |
| 325 | u64 pdptrs[4]; /* pae */ | 202 | u64 pdptrs[4]; /* pae */ |
| @@ -334,6 +211,7 @@ struct kvm_vcpu { | |||
| 334 | int mp_state; | 211 | int mp_state; |
| 335 | int sipi_vector; | 212 | int sipi_vector; |
| 336 | u64 ia32_misc_enable_msr; | 213 | u64 ia32_misc_enable_msr; |
| 214 | bool tpr_access_reporting; | ||
| 337 | 215 | ||
| 338 | struct kvm_mmu mmu; | 216 | struct kvm_mmu mmu; |
| 339 | 217 | ||
| @@ -344,29 +222,26 @@ struct kvm_vcpu { | |||
| 344 | 222 | ||
| 345 | gfn_t last_pt_write_gfn; | 223 | gfn_t last_pt_write_gfn; |
| 346 | int last_pt_write_count; | 224 | int last_pt_write_count; |
| 225 | u64 *last_pte_updated; | ||
| 347 | 226 | ||
| 348 | struct kvm_guest_debug guest_debug; | 227 | struct { |
| 228 | gfn_t gfn; /* presumed gfn during guest pte update */ | ||
| 229 | struct page *page; /* page corresponding to that gfn */ | ||
| 230 | } update_pte; | ||
| 349 | 231 | ||
| 350 | struct i387_fxsave_struct host_fx_image; | 232 | struct i387_fxsave_struct host_fx_image; |
| 351 | struct i387_fxsave_struct guest_fx_image; | 233 | struct i387_fxsave_struct guest_fx_image; |
| 352 | int fpu_active; | 234 | |
| 353 | int guest_fpu_loaded; | ||
| 354 | |||
| 355 | int mmio_needed; | ||
| 356 | int mmio_read_completed; | ||
| 357 | int mmio_is_write; | ||
| 358 | int mmio_size; | ||
| 359 | unsigned char mmio_data[8]; | ||
| 360 | gpa_t mmio_phys_addr; | ||
| 361 | gva_t mmio_fault_cr2; | 235 | gva_t mmio_fault_cr2; |
| 362 | struct kvm_pio_request pio; | 236 | struct kvm_pio_request pio; |
| 363 | void *pio_data; | 237 | void *pio_data; |
| 364 | wait_queue_head_t wq; | ||
| 365 | 238 | ||
| 366 | int sigset_active; | 239 | struct kvm_queued_exception { |
| 367 | sigset_t sigset; | 240 | bool pending; |
| 368 | 241 | bool has_error_code; | |
| 369 | struct kvm_stat stat; | 242 | u8 nr; |
| 243 | u32 error_code; | ||
| 244 | } exception; | ||
| 370 | 245 | ||
| 371 | struct { | 246 | struct { |
| 372 | int active; | 247 | int active; |
| @@ -381,7 +256,10 @@ struct kvm_vcpu { | |||
| 381 | int halt_request; /* real mode on Intel only */ | 256 | int halt_request; /* real mode on Intel only */ |
| 382 | 257 | ||
| 383 | int cpuid_nent; | 258 | int cpuid_nent; |
| 384 | struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; | 259 | struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; |
| 260 | /* emulate context */ | ||
| 261 | |||
| 262 | struct x86_emulate_ctxt emulate_ctxt; | ||
| 385 | }; | 263 | }; |
| 386 | 264 | ||
| 387 | struct kvm_mem_alias { | 265 | struct kvm_mem_alias { |
| @@ -390,51 +268,58 @@ struct kvm_mem_alias { | |||
| 390 | gfn_t target_gfn; | 268 | gfn_t target_gfn; |
| 391 | }; | 269 | }; |
| 392 | 270 | ||
| 393 | struct kvm_memory_slot { | 271 | struct kvm_arch { |
| 394 | gfn_t base_gfn; | ||
| 395 | unsigned long npages; | ||
| 396 | unsigned long flags; | ||
| 397 | struct page **phys_mem; | ||
| 398 | unsigned long *dirty_bitmap; | ||
| 399 | }; | ||
| 400 | |||
| 401 | struct kvm { | ||
| 402 | struct mutex lock; /* protects everything except vcpus */ | ||
| 403 | int naliases; | 272 | int naliases; |
| 404 | struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; | 273 | struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; |
| 405 | int nmemslots; | 274 | |
| 406 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; | 275 | unsigned int n_free_mmu_pages; |
| 276 | unsigned int n_requested_mmu_pages; | ||
| 277 | unsigned int n_alloc_mmu_pages; | ||
| 278 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; | ||
| 407 | /* | 279 | /* |
| 408 | * Hash table of struct kvm_mmu_page. | 280 | * Hash table of struct kvm_mmu_page. |
| 409 | */ | 281 | */ |
| 410 | struct list_head active_mmu_pages; | 282 | struct list_head active_mmu_pages; |
| 411 | int n_free_mmu_pages; | ||
| 412 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; | ||
| 413 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; | ||
| 414 | unsigned long rmap_overflow; | ||
| 415 | struct list_head vm_list; | ||
| 416 | struct file *filp; | ||
| 417 | struct kvm_io_bus mmio_bus; | ||
| 418 | struct kvm_io_bus pio_bus; | ||
| 419 | struct kvm_pic *vpic; | 283 | struct kvm_pic *vpic; |
| 420 | struct kvm_ioapic *vioapic; | 284 | struct kvm_ioapic *vioapic; |
| 285 | |||
| 421 | int round_robin_prev_vcpu; | 286 | int round_robin_prev_vcpu; |
| 287 | unsigned int tss_addr; | ||
| 288 | struct page *apic_access_page; | ||
| 422 | }; | 289 | }; |
| 423 | 290 | ||
| 424 | static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) | 291 | struct kvm_vm_stat { |
| 425 | { | 292 | u32 mmu_shadow_zapped; |
| 426 | return kvm->vpic; | 293 | u32 mmu_pte_write; |
| 427 | } | 294 | u32 mmu_pte_updated; |
| 295 | u32 mmu_pde_zapped; | ||
| 296 | u32 mmu_flooded; | ||
| 297 | u32 mmu_recycled; | ||
| 298 | u32 mmu_cache_miss; | ||
| 299 | u32 remote_tlb_flush; | ||
| 300 | }; | ||
| 428 | 301 | ||
| 429 | static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) | 302 | struct kvm_vcpu_stat { |
| 430 | { | 303 | u32 pf_fixed; |
| 431 | return kvm->vioapic; | 304 | u32 pf_guest; |
| 432 | } | 305 | u32 tlb_flush; |
| 306 | u32 invlpg; | ||
| 433 | 307 | ||
| 434 | static inline int irqchip_in_kernel(struct kvm *kvm) | 308 | u32 exits; |
| 435 | { | 309 | u32 io_exits; |
| 436 | return pic_irqchip(kvm) != 0; | 310 | u32 mmio_exits; |
| 437 | } | 311 | u32 signal_exits; |
| 312 | u32 irq_window_exits; | ||
| 313 | u32 halt_exits; | ||
| 314 | u32 halt_wakeup; | ||
| 315 | u32 request_irq_exits; | ||
| 316 | u32 irq_exits; | ||
| 317 | u32 host_state_reload; | ||
| 318 | u32 efer_reload; | ||
| 319 | u32 fpu_reload; | ||
| 320 | u32 insn_emulation; | ||
| 321 | u32 insn_emulation_fail; | ||
| 322 | }; | ||
| 438 | 323 | ||
| 439 | struct descriptor_table { | 324 | struct descriptor_table { |
| 440 | u16 limit; | 325 | u16 limit; |
| @@ -449,11 +334,12 @@ struct kvm_x86_ops { | |||
| 449 | void (*check_processor_compatibility)(void *rtn); | 334 | void (*check_processor_compatibility)(void *rtn); |
| 450 | int (*hardware_setup)(void); /* __init */ | 335 | int (*hardware_setup)(void); /* __init */ |
| 451 | void (*hardware_unsetup)(void); /* __exit */ | 336 | void (*hardware_unsetup)(void); /* __exit */ |
| 337 | bool (*cpu_has_accelerated_tpr)(void); | ||
| 452 | 338 | ||
| 453 | /* Create, but do not attach this VCPU */ | 339 | /* Create, but do not attach this VCPU */ |
| 454 | struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); | 340 | struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); |
| 455 | void (*vcpu_free)(struct kvm_vcpu *vcpu); | 341 | void (*vcpu_free)(struct kvm_vcpu *vcpu); |
| 456 | void (*vcpu_reset)(struct kvm_vcpu *vcpu); | 342 | int (*vcpu_reset)(struct kvm_vcpu *vcpu); |
| 457 | 343 | ||
| 458 | void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); | 344 | void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); |
| 459 | void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); | 345 | void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); |
| @@ -489,10 +375,6 @@ struct kvm_x86_ops { | |||
| 489 | void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); | 375 | void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); |
| 490 | 376 | ||
| 491 | void (*tlb_flush)(struct kvm_vcpu *vcpu); | 377 | void (*tlb_flush)(struct kvm_vcpu *vcpu); |
| 492 | void (*inject_page_fault)(struct kvm_vcpu *vcpu, | ||
| 493 | unsigned long addr, u32 err_code); | ||
| 494 | |||
| 495 | void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code); | ||
| 496 | 378 | ||
| 497 | void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); | 379 | void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); |
| 498 | int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); | 380 | int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); |
| @@ -501,54 +383,31 @@ struct kvm_x86_ops { | |||
| 501 | unsigned char *hypercall_addr); | 383 | unsigned char *hypercall_addr); |
| 502 | int (*get_irq)(struct kvm_vcpu *vcpu); | 384 | int (*get_irq)(struct kvm_vcpu *vcpu); |
| 503 | void (*set_irq)(struct kvm_vcpu *vcpu, int vec); | 385 | void (*set_irq)(struct kvm_vcpu *vcpu, int vec); |
| 386 | void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, | ||
| 387 | bool has_error_code, u32 error_code); | ||
| 388 | bool (*exception_injected)(struct kvm_vcpu *vcpu); | ||
| 504 | void (*inject_pending_irq)(struct kvm_vcpu *vcpu); | 389 | void (*inject_pending_irq)(struct kvm_vcpu *vcpu); |
| 505 | void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, | 390 | void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, |
| 506 | struct kvm_run *run); | 391 | struct kvm_run *run); |
| 392 | |||
| 393 | int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); | ||
| 507 | }; | 394 | }; |
| 508 | 395 | ||
| 509 | extern struct kvm_x86_ops *kvm_x86_ops; | 396 | extern struct kvm_x86_ops *kvm_x86_ops; |
| 510 | 397 | ||
| 511 | /* The guest did something we don't support. */ | ||
| 512 | #define pr_unimpl(vcpu, fmt, ...) \ | ||
| 513 | do { \ | ||
| 514 | if (printk_ratelimit()) \ | ||
| 515 | printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ | ||
| 516 | current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ | ||
| 517 | } while(0) | ||
| 518 | |||
| 519 | #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) | ||
| 520 | #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) | ||
| 521 | |||
| 522 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); | ||
| 523 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); | ||
| 524 | |||
| 525 | int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, | ||
| 526 | struct module *module); | ||
| 527 | void kvm_exit_x86(void); | ||
| 528 | |||
| 529 | int kvm_mmu_module_init(void); | 398 | int kvm_mmu_module_init(void); |
| 530 | void kvm_mmu_module_exit(void); | 399 | void kvm_mmu_module_exit(void); |
| 531 | 400 | ||
| 532 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu); | 401 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu); |
| 533 | int kvm_mmu_create(struct kvm_vcpu *vcpu); | 402 | int kvm_mmu_create(struct kvm_vcpu *vcpu); |
| 534 | int kvm_mmu_setup(struct kvm_vcpu *vcpu); | 403 | int kvm_mmu_setup(struct kvm_vcpu *vcpu); |
| 404 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); | ||
| 535 | 405 | ||
| 536 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); | 406 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); |
| 537 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); | 407 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); |
| 538 | void kvm_mmu_zap_all(struct kvm *kvm); | 408 | void kvm_mmu_zap_all(struct kvm *kvm); |
| 539 | 409 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); | |
| 540 | hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); | 410 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); |
| 541 | #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) | ||
| 542 | #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) | ||
| 543 | static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } | ||
| 544 | hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); | ||
| 545 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); | ||
| 546 | |||
| 547 | extern hpa_t bad_page_address; | ||
| 548 | |||
| 549 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); | ||
| 550 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); | ||
| 551 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn); | ||
| 552 | 411 | ||
| 553 | enum emulation_result { | 412 | enum emulation_result { |
| 554 | EMULATE_DONE, /* no further processing */ | 413 | EMULATE_DONE, /* no further processing */ |
| @@ -556,8 +415,10 @@ enum emulation_result { | |||
| 556 | EMULATE_FAIL, /* can't emulate this instruction */ | 415 | EMULATE_FAIL, /* can't emulate this instruction */ |
| 557 | }; | 416 | }; |
| 558 | 417 | ||
| 418 | #define EMULTYPE_NO_DECODE (1 << 0) | ||
| 419 | #define EMULTYPE_TRAP_UD (1 << 1) | ||
| 559 | int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, | 420 | int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, |
| 560 | unsigned long cr2, u16 error_code); | 421 | unsigned long cr2, u16 error_code, int emulation_type); |
| 561 | void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); | 422 | void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); |
| 562 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | 423 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); |
| 563 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | 424 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); |
| @@ -572,7 +433,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); | |||
| 572 | 433 | ||
| 573 | struct x86_emulate_ctxt; | 434 | struct x86_emulate_ctxt; |
| 574 | 435 | ||
| 575 | int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | 436 | int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, |
| 576 | int size, unsigned port); | 437 | int size, unsigned port); |
| 577 | int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | 438 | int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, |
| 578 | int size, unsigned long count, int down, | 439 | int size, unsigned long count, int down, |
| @@ -581,7 +442,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); | |||
| 581 | int kvm_emulate_halt(struct kvm_vcpu *vcpu); | 442 | int kvm_emulate_halt(struct kvm_vcpu *vcpu); |
| 582 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); | 443 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); |
| 583 | int emulate_clts(struct kvm_vcpu *vcpu); | 444 | int emulate_clts(struct kvm_vcpu *vcpu); |
| 584 | int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, | 445 | int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, |
| 585 | unsigned long *dest); | 446 | unsigned long *dest); |
| 586 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, | 447 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, |
| 587 | unsigned long value); | 448 | unsigned long value); |
| @@ -597,15 +458,15 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); | |||
| 597 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); | 458 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); |
| 598 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); | 459 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); |
| 599 | 460 | ||
| 600 | void fx_init(struct kvm_vcpu *vcpu); | 461 | void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); |
| 462 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); | ||
| 463 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, | ||
| 464 | u32 error_code); | ||
| 601 | 465 | ||
| 602 | void kvm_resched(struct kvm_vcpu *vcpu); | 466 | void fx_init(struct kvm_vcpu *vcpu); |
| 603 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); | ||
| 604 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); | ||
| 605 | void kvm_flush_remote_tlbs(struct kvm *kvm); | ||
| 606 | 467 | ||
| 607 | int emulator_read_std(unsigned long addr, | 468 | int emulator_read_std(unsigned long addr, |
| 608 | void *val, | 469 | void *val, |
| 609 | unsigned int bytes, | 470 | unsigned int bytes, |
| 610 | struct kvm_vcpu *vcpu); | 471 | struct kvm_vcpu *vcpu); |
| 611 | int emulator_write_emulated(unsigned long addr, | 472 | int emulator_write_emulated(unsigned long addr, |
| @@ -615,6 +476,7 @@ int emulator_write_emulated(unsigned long addr, | |||
| 615 | 476 | ||
| 616 | unsigned long segment_base(u16 selector); | 477 | unsigned long segment_base(u16 selector); |
| 617 | 478 | ||
| 479 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); | ||
| 618 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | 480 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, |
| 619 | const u8 *new, int bytes); | 481 | const u8 *new, int bytes); |
| 620 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); | 482 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); |
| @@ -622,66 +484,14 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); | |||
| 622 | int kvm_mmu_load(struct kvm_vcpu *vcpu); | 484 | int kvm_mmu_load(struct kvm_vcpu *vcpu); |
| 623 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); | 485 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); |
| 624 | 486 | ||
| 625 | int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); | 487 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); |
| 626 | 488 | ||
| 627 | static inline void kvm_guest_enter(void) | 489 | int kvm_fix_hypercall(struct kvm_vcpu *vcpu); |
| 628 | { | ||
| 629 | current->flags |= PF_VCPU; | ||
| 630 | } | ||
| 631 | 490 | ||
| 632 | static inline void kvm_guest_exit(void) | 491 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); |
| 633 | { | ||
| 634 | current->flags &= ~PF_VCPU; | ||
| 635 | } | ||
| 636 | 492 | ||
| 637 | static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | 493 | int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); |
| 638 | u32 error_code) | 494 | int complete_pio(struct kvm_vcpu *vcpu); |
| 639 | { | ||
| 640 | return vcpu->mmu.page_fault(vcpu, gva, error_code); | ||
| 641 | } | ||
| 642 | |||
| 643 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
| 644 | { | ||
| 645 | if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) | ||
| 646 | __kvm_mmu_free_some_pages(vcpu); | ||
| 647 | } | ||
| 648 | |||
| 649 | static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) | ||
| 650 | { | ||
| 651 | if (likely(vcpu->mmu.root_hpa != INVALID_PAGE)) | ||
| 652 | return 0; | ||
| 653 | |||
| 654 | return kvm_mmu_load(vcpu); | ||
| 655 | } | ||
| 656 | |||
| 657 | static inline int is_long_mode(struct kvm_vcpu *vcpu) | ||
| 658 | { | ||
| 659 | #ifdef CONFIG_X86_64 | ||
| 660 | return vcpu->shadow_efer & EFER_LME; | ||
| 661 | #else | ||
| 662 | return 0; | ||
| 663 | #endif | ||
| 664 | } | ||
| 665 | |||
| 666 | static inline int is_pae(struct kvm_vcpu *vcpu) | ||
| 667 | { | ||
| 668 | return vcpu->cr4 & X86_CR4_PAE; | ||
| 669 | } | ||
| 670 | |||
| 671 | static inline int is_pse(struct kvm_vcpu *vcpu) | ||
| 672 | { | ||
| 673 | return vcpu->cr4 & X86_CR4_PSE; | ||
| 674 | } | ||
| 675 | |||
| 676 | static inline int is_paging(struct kvm_vcpu *vcpu) | ||
| 677 | { | ||
| 678 | return vcpu->cr0 & X86_CR0_PG; | ||
| 679 | } | ||
| 680 | |||
| 681 | static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) | ||
| 682 | { | ||
| 683 | return slot - kvm->memslots; | ||
| 684 | } | ||
| 685 | 495 | ||
| 686 | static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) | 496 | static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) |
| 687 | { | 497 | { |
| @@ -693,55 +503,55 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) | |||
| 693 | static inline u16 read_fs(void) | 503 | static inline u16 read_fs(void) |
| 694 | { | 504 | { |
| 695 | u16 seg; | 505 | u16 seg; |
| 696 | asm ("mov %%fs, %0" : "=g"(seg)); | 506 | asm("mov %%fs, %0" : "=g"(seg)); |
| 697 | return seg; | 507 | return seg; |
| 698 | } | 508 | } |
| 699 | 509 | ||
| 700 | static inline u16 read_gs(void) | 510 | static inline u16 read_gs(void) |
| 701 | { | 511 | { |
| 702 | u16 seg; | 512 | u16 seg; |
| 703 | asm ("mov %%gs, %0" : "=g"(seg)); | 513 | asm("mov %%gs, %0" : "=g"(seg)); |
| 704 | return seg; | 514 | return seg; |
| 705 | } | 515 | } |
| 706 | 516 | ||
| 707 | static inline u16 read_ldt(void) | 517 | static inline u16 read_ldt(void) |
| 708 | { | 518 | { |
| 709 | u16 ldt; | 519 | u16 ldt; |
| 710 | asm ("sldt %0" : "=g"(ldt)); | 520 | asm("sldt %0" : "=g"(ldt)); |
| 711 | return ldt; | 521 | return ldt; |
| 712 | } | 522 | } |
| 713 | 523 | ||
| 714 | static inline void load_fs(u16 sel) | 524 | static inline void load_fs(u16 sel) |
| 715 | { | 525 | { |
| 716 | asm ("mov %0, %%fs" : : "rm"(sel)); | 526 | asm("mov %0, %%fs" : : "rm"(sel)); |
| 717 | } | 527 | } |
| 718 | 528 | ||
| 719 | static inline void load_gs(u16 sel) | 529 | static inline void load_gs(u16 sel) |
| 720 | { | 530 | { |
| 721 | asm ("mov %0, %%gs" : : "rm"(sel)); | 531 | asm("mov %0, %%gs" : : "rm"(sel)); |
| 722 | } | 532 | } |
| 723 | 533 | ||
| 724 | #ifndef load_ldt | 534 | #ifndef load_ldt |
| 725 | static inline void load_ldt(u16 sel) | 535 | static inline void load_ldt(u16 sel) |
| 726 | { | 536 | { |
| 727 | asm ("lldt %0" : : "rm"(sel)); | 537 | asm("lldt %0" : : "rm"(sel)); |
| 728 | } | 538 | } |
| 729 | #endif | 539 | #endif |
| 730 | 540 | ||
| 731 | static inline void get_idt(struct descriptor_table *table) | 541 | static inline void get_idt(struct descriptor_table *table) |
| 732 | { | 542 | { |
| 733 | asm ("sidt %0" : "=m"(*table)); | 543 | asm("sidt %0" : "=m"(*table)); |
| 734 | } | 544 | } |
| 735 | 545 | ||
| 736 | static inline void get_gdt(struct descriptor_table *table) | 546 | static inline void get_gdt(struct descriptor_table *table) |
| 737 | { | 547 | { |
| 738 | asm ("sgdt %0" : "=m"(*table)); | 548 | asm("sgdt %0" : "=m"(*table)); |
| 739 | } | 549 | } |
| 740 | 550 | ||
| 741 | static inline unsigned long read_tr_base(void) | 551 | static inline unsigned long read_tr_base(void) |
| 742 | { | 552 | { |
| 743 | u16 tr; | 553 | u16 tr; |
| 744 | asm ("str %0" : "=g"(tr)); | 554 | asm("str %0" : "=g"(tr)); |
| 745 | return segment_base(tr); | 555 | return segment_base(tr); |
| 746 | } | 556 | } |
| 747 | 557 | ||
| @@ -757,17 +567,17 @@ static inline unsigned long read_msr(unsigned long msr) | |||
| 757 | 567 | ||
| 758 | static inline void fx_save(struct i387_fxsave_struct *image) | 568 | static inline void fx_save(struct i387_fxsave_struct *image) |
| 759 | { | 569 | { |
| 760 | asm ("fxsave (%0)":: "r" (image)); | 570 | asm("fxsave (%0)":: "r" (image)); |
| 761 | } | 571 | } |
| 762 | 572 | ||
| 763 | static inline void fx_restore(struct i387_fxsave_struct *image) | 573 | static inline void fx_restore(struct i387_fxsave_struct *image) |
| 764 | { | 574 | { |
| 765 | asm ("fxrstor (%0)":: "r" (image)); | 575 | asm("fxrstor (%0)":: "r" (image)); |
| 766 | } | 576 | } |
| 767 | 577 | ||
| 768 | static inline void fpu_init(void) | 578 | static inline void fpu_init(void) |
| 769 | { | 579 | { |
| 770 | asm ("finit"); | 580 | asm("finit"); |
| 771 | } | 581 | } |
| 772 | 582 | ||
| 773 | static inline u32 get_rdx_init_val(void) | 583 | static inline u32 get_rdx_init_val(void) |
| @@ -775,6 +585,11 @@ static inline u32 get_rdx_init_val(void) | |||
| 775 | return 0x600; /* P6 family */ | 585 | return 0x600; /* P6 family */ |
| 776 | } | 586 | } |
| 777 | 587 | ||
| 588 | static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) | ||
| 589 | { | ||
| 590 | kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); | ||
| 591 | } | ||
| 592 | |||
| 778 | #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" | 593 | #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" |
| 779 | #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" | 594 | #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" |
| 780 | #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" | 595 | #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" |
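kvm_inject_gp() above replaces the removed per-vendor inject_gp callback: the fault is recorded in the vcpu's kvm_queued_exception and delivered via the queue_exception op on the next guest entry. A hedged sketch of the intended calling pattern; the function and the reserved-bits mask are made up for illustration, and the arch fields are assumed to hang off vcpu->arch in the new layout:

#define EXAMPLE_CR4_RESERVED_BITS (1UL << 11)	/* placeholder mask */

static void set_cr4_sketch(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	if (cr4 & EXAMPLE_CR4_RESERVED_BITS) {
		kvm_inject_gp(vcpu, 0);	/* queue #GP with error code 0 */
		return;
	}
	vcpu->arch.cr4 = cr4;
}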
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h new file mode 100644 index 000000000000..c6f3fd8d8c53 --- /dev/null +++ b/include/asm-x86/kvm_para.h | |||
| @@ -0,0 +1,105 @@ | |||
| 1 | #ifndef __X86_KVM_PARA_H | ||
| 2 | #define __X86_KVM_PARA_H | ||
| 3 | |||
| 4 | /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It | ||
| 5 | * should be used to determine that a VM is running under KVM. | ||
| 6 | */ | ||
| 7 | #define KVM_CPUID_SIGNATURE 0x40000000 | ||
| 8 | |||
| 9 | /* This CPUID returns a feature bitmap in eax. Before enabling a particular | ||
| 10 | * paravirtualization, the appropriate feature bit should be checked. | ||
| 11 | */ | ||
| 12 | #define KVM_CPUID_FEATURES 0x40000001 | ||
| 13 | |||
| 14 | #ifdef __KERNEL__ | ||
| 15 | #include <asm/processor.h> | ||
| 16 | |||
| 17 | /* This instruction is vmcall. On non-VT architectures, it will generate a | ||
| 18 | * trap that we will then rewrite to the appropriate instruction. | ||
| 19 | */ | ||
| 20 | #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" | ||
| 21 | |||
| 22 | /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall | ||
| 23 | * instruction. The hypervisor may replace it with something else, but only these | ||
| 24 | * two instructions are guaranteed to be supported. | ||
| 25 | * | ||
| 26 | * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. | ||
| 27 | * The hypercall number should be placed in rax and the return value will be | ||
| 28 | * placed in rax. No other registers will be clobbered unless explicitly | ||
| 29 | * noted by the particular hypercall. | ||
| 30 | */ | ||
| 31 | |||
| 32 | static inline long kvm_hypercall0(unsigned int nr) | ||
| 33 | { | ||
| 34 | long ret; | ||
| 35 | asm volatile(KVM_HYPERCALL | ||
| 36 | : "=a"(ret) | ||
| 37 | : "a"(nr)); | ||
| 38 | return ret; | ||
| 39 | } | ||
| 40 | |||
| 41 | static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) | ||
| 42 | { | ||
| 43 | long ret; | ||
| 44 | asm volatile(KVM_HYPERCALL | ||
| 45 | : "=a"(ret) | ||
| 46 | : "a"(nr), "b"(p1)); | ||
| 47 | return ret; | ||
| 48 | } | ||
| 49 | |||
| 50 | static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, | ||
| 51 | unsigned long p2) | ||
| 52 | { | ||
| 53 | long ret; | ||
| 54 | asm volatile(KVM_HYPERCALL | ||
| 55 | : "=a"(ret) | ||
| 56 | : "a"(nr), "b"(p1), "c"(p2)); | ||
| 57 | return ret; | ||
| 58 | } | ||
| 59 | |||
| 60 | static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, | ||
| 61 | unsigned long p2, unsigned long p3) | ||
| 62 | { | ||
| 63 | long ret; | ||
| 64 | asm volatile(KVM_HYPERCALL | ||
| 65 | : "=a"(ret) | ||
| 66 | : "a"(nr), "b"(p1), "c"(p2), "d"(p3)); | ||
| 67 | return ret; | ||
| 68 | } | ||
| 69 | |||
| 70 | static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, | ||
| 71 | unsigned long p2, unsigned long p3, | ||
| 72 | unsigned long p4) | ||
| 73 | { | ||
| 74 | long ret; | ||
| 75 | asm volatile(KVM_HYPERCALL | ||
| 76 | : "=a"(ret) | ||
| 77 | : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)); | ||
| 78 | return ret; | ||
| 79 | } | ||
| 80 | |||
| 81 | static inline int kvm_para_available(void) | ||
| 82 | { | ||
| 83 | unsigned int eax, ebx, ecx, edx; | ||
| 84 | char signature[13]; | ||
| 85 | |||
| 86 | cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); | ||
| 87 | memcpy(signature + 0, &ebx, 4); | ||
| 88 | memcpy(signature + 4, &ecx, 4); | ||
| 89 | memcpy(signature + 8, &edx, 4); | ||
| 90 | signature[12] = 0; | ||
| 91 | |||
| 92 | if (strcmp(signature, "KVMKVMKVM") == 0) | ||
| 93 | return 1; | ||
| 94 | |||
| 95 | return 0; | ||
| 96 | } | ||
| 97 | |||
| 98 | static inline unsigned int kvm_arch_para_features(void) | ||
| 99 | { | ||
| 100 | return cpuid_eax(KVM_CPUID_FEATURES); | ||
| 101 | } | ||
| 102 | |||
| 103 | #endif | ||
| 104 | |||
| 105 | #endif | ||
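A hedged guest-side sketch of how the pieces above combine; the hypercall number (42) and the feature bit are placeholders, since the real assignments live in the generic <linux/kvm_para.h>:

static int example_use_kvm(void)
{
	if (!kvm_para_available())
		return 0;	/* not running under KVM */

	/* Placeholder feature bit; check before depending on it. */
	if (!(kvm_arch_para_features() & (1 << 0)))
		return 0;

	/* Hypothetical hypercall nr 42 taking two arguments. */
	return kvm_hypercall2(42, 0x1000, 16) == 0;
}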
diff --git a/drivers/kvm/x86_emulate.h b/include/asm-x86/kvm_x86_emulate.h index 92c73aa7f9ac..7db91b9bdcd4 100644 --- a/drivers/kvm/x86_emulate.h +++ b/include/asm-x86/kvm_x86_emulate.h | |||
| @@ -63,17 +63,6 @@ struct x86_emulate_ops { | |||
| 63 | unsigned int bytes, struct kvm_vcpu *vcpu); | 63 | unsigned int bytes, struct kvm_vcpu *vcpu); |
| 64 | 64 | ||
| 65 | /* | 65 | /* |
| 66 | * write_std: Write bytes of standard (non-emulated/special) memory. | ||
| 67 | * Used for stack operations, and others. | ||
| 68 | * @addr: [IN ] Linear address to which to write. | ||
| 69 | * @val: [IN ] Value to write to memory (low-order bytes used as | ||
| 70 | * required). | ||
| 71 | * @bytes: [IN ] Number of bytes to write to memory. | ||
| 72 | */ | ||
| 73 | int (*write_std)(unsigned long addr, const void *val, | ||
| 74 | unsigned int bytes, struct kvm_vcpu *vcpu); | ||
| 75 | |||
| 76 | /* | ||
| 77 | * read_emulated: Read bytes from emulated/special memory area. | 66 | * read_emulated: Read bytes from emulated/special memory area. |
| 78 | * @addr: [IN ] Linear address from which to read. | 67 | * @addr: [IN ] Linear address from which to read. |
| 79 | * @val: [OUT] Value read from memory, zero-extended to 'u_long'. | 68 | * @val: [OUT] Value read from memory, zero-extended to 'u_long'. |
| @@ -112,13 +101,50 @@ struct x86_emulate_ops { | |||
| 112 | 101 | ||
| 113 | }; | 102 | }; |
| 114 | 103 | ||
| 104 | /* Type, address-of, and value of an instruction's operand. */ | ||
| 105 | struct operand { | ||
| 106 | enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; | ||
| 107 | unsigned int bytes; | ||
| 108 | unsigned long val, orig_val, *ptr; | ||
| 109 | }; | ||
| 110 | |||
| 111 | struct fetch_cache { | ||
| 112 | u8 data[15]; | ||
| 113 | unsigned long start; | ||
| 114 | unsigned long end; | ||
| 115 | }; | ||
| 116 | |||
| 117 | struct decode_cache { | ||
| 118 | u8 twobyte; | ||
| 119 | u8 b; | ||
| 120 | u8 lock_prefix; | ||
| 121 | u8 rep_prefix; | ||
| 122 | u8 op_bytes; | ||
| 123 | u8 ad_bytes; | ||
| 124 | u8 rex_prefix; | ||
| 125 | struct operand src; | ||
| 126 | struct operand dst; | ||
| 127 | unsigned long *override_base; | ||
| 128 | unsigned int d; | ||
| 129 | unsigned long regs[NR_VCPU_REGS]; | ||
| 130 | unsigned long eip; | ||
| 131 | /* modrm */ | ||
| 132 | u8 modrm; | ||
| 133 | u8 modrm_mod; | ||
| 134 | u8 modrm_reg; | ||
| 135 | u8 modrm_rm; | ||
| 136 | u8 use_modrm_ea; | ||
| 137 | unsigned long modrm_ea; | ||
| 138 | unsigned long modrm_val; | ||
| 139 | struct fetch_cache fetch; | ||
| 140 | }; | ||
| 141 | |||
| 115 | struct x86_emulate_ctxt { | 142 | struct x86_emulate_ctxt { |
| 116 | /* Register state before/after emulation. */ | 143 | /* Register state before/after emulation. */ |
| 117 | struct kvm_vcpu *vcpu; | 144 | struct kvm_vcpu *vcpu; |
| 118 | 145 | ||
| 119 | /* Linear faulting address (if emulating a page-faulting instruction). */ | 146 | /* Linear faulting address (if emulating a page-faulting instruction). */ |
| 120 | unsigned long eflags; | 147 | unsigned long eflags; |
| 121 | unsigned long cr2; | ||
| 122 | 148 | ||
| 123 | /* Emulated execution mode, represented by an X86EMUL_MODE value. */ | 149 | /* Emulated execution mode, represented by an X86EMUL_MODE value. */ |
| 124 | int mode; | 150 | int mode; |
| @@ -129,8 +155,16 @@ struct x86_emulate_ctxt { | |||
| 129 | unsigned long ss_base; | 155 | unsigned long ss_base; |
| 130 | unsigned long gs_base; | 156 | unsigned long gs_base; |
| 131 | unsigned long fs_base; | 157 | unsigned long fs_base; |
| 158 | |||
| 159 | /* decode cache */ | ||
| 160 | |||
| 161 | struct decode_cache decode; | ||
| 132 | }; | 162 | }; |
| 133 | 163 | ||
| 164 | /* Repeat String Operation Prefix */ | ||
| 165 | #define REPE_PREFIX 1 | ||
| 166 | #define REPNE_PREFIX 2 | ||
| 167 | |||
| 134 | /* Execution mode, passed to the emulator. */ | 168 | /* Execution mode, passed to the emulator. */ |
| 135 | #define X86EMUL_MODE_REAL 0 /* Real mode. */ | 169 | #define X86EMUL_MODE_REAL 0 /* Real mode. */ |
| 136 | #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ | 170 | #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ |
| @@ -144,12 +178,9 @@ struct x86_emulate_ctxt { | |||
| 144 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 | 178 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 |
| 145 | #endif | 179 | #endif |
| 146 | 180 | ||
| 147 | /* | 181 | int x86_decode_insn(struct x86_emulate_ctxt *ctxt, |
| 148 | * x86_emulate_memop: Emulate an instruction that faulted attempting to | 182 | struct x86_emulate_ops *ops); |
| 149 | * read/write a 'special' memory area. | 183 | int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, |
| 150 | * Returns -1 on failure, 0 on success. | 184 | struct x86_emulate_ops *ops); |
| 151 | */ | ||
| 152 | int x86_emulate_memop(struct x86_emulate_ctxt *ctxt, | ||
| 153 | struct x86_emulate_ops *ops); | ||
| 154 | 185 | ||
| 155 | #endif /* __X86_EMULATE_H__ */ | 186 | #endif /* __X86_EMULATE_H__ */ |
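The single-shot x86_emulate_memop() entry point gives way to a decode pass that fills ctxt->decode and a separate execute pass, so a caller can replay an already-decoded instruction. A reduced sketch of the calling pattern, mirroring what emulate_instruction() (declared in kvm_host.h above) presumably does, with error handling trimmed:

static int run_emulator(struct x86_emulate_ctxt *ctxt,
			struct x86_emulate_ops *ops, int emulation_type)
{
	int r = 0;

	/* EMULTYPE_NO_DECODE (see kvm_host.h) skips straight to
	 * execution using the cached decode state. */
	if (!(emulation_type & EMULTYPE_NO_DECODE))
		r = x86_decode_insn(ctxt, ops);
	if (r == 0)
		r = x86_emulate_insn(ctxt, ops);
	return r;
}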
diff --git a/include/asm-x86/lguest.h b/include/asm-x86/lguest.h index 1c8367a692f6..4d9367b72976 100644 --- a/include/asm-x86/lguest.h +++ b/include/asm-x86/lguest.h | |||
| @@ -56,7 +56,7 @@ struct lguest_ro_state | |||
| 56 | struct desc_struct guest_gdt[GDT_ENTRIES]; | 56 | struct desc_struct guest_gdt[GDT_ENTRIES]; |
| 57 | }; | 57 | }; |
| 58 | 58 | ||
| 59 | struct lguest_arch | 59 | struct lg_cpu_arch |
| 60 | { | 60 | { |
| 61 | /* The GDT entries copied into lguest_ro_state when running. */ | 61 | /* The GDT entries copied into lguest_ro_state when running. */ |
| 62 | struct desc_struct gdt[GDT_ENTRIES]; | 62 | struct desc_struct gdt[GDT_ENTRIES]; |
diff --git a/include/asm-x86/lguest_hcall.h b/include/asm-x86/lguest_hcall.h index 2091779e91fb..758b9a5d4539 100644 --- a/include/asm-x86/lguest_hcall.h +++ b/include/asm-x86/lguest_hcall.h | |||
| @@ -4,7 +4,7 @@ | |||
| 4 | 4 | ||
| 5 | #define LHCALL_FLUSH_ASYNC 0 | 5 | #define LHCALL_FLUSH_ASYNC 0 |
| 6 | #define LHCALL_LGUEST_INIT 1 | 6 | #define LHCALL_LGUEST_INIT 1 |
| 7 | #define LHCALL_CRASH 2 | 7 | #define LHCALL_SHUTDOWN 2 |
| 8 | #define LHCALL_LOAD_GDT 3 | 8 | #define LHCALL_LOAD_GDT 3 |
| 9 | #define LHCALL_NEW_PGTABLE 4 | 9 | #define LHCALL_NEW_PGTABLE 4 |
| 10 | #define LHCALL_FLUSH_TLB 5 | 10 | #define LHCALL_FLUSH_TLB 5 |
| @@ -20,6 +20,10 @@ | |||
| 20 | 20 | ||
| 21 | #define LGUEST_TRAP_ENTRY 0x1F | 21 | #define LGUEST_TRAP_ENTRY 0x1F |
| 22 | 22 | ||
| 23 | /* Argument number 3 to LHCALL_SHUTDOWN */ | ||
| 24 | #define LGUEST_SHUTDOWN_POWEROFF 1 | ||
| 25 | #define LGUEST_SHUTDOWN_RESTART 2 | ||
| 26 | |||
| 23 | #ifndef __ASSEMBLY__ | 27 | #ifndef __ASSEMBLY__ |
| 24 | #include <asm/hw_irq.h> | 28 | #include <asm/hw_irq.h> |
| 25 | 29 | ||
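LHCALL_CRASH becomes a general LHCALL_SHUTDOWN that carries a reason code, so the host can tell a poweroff from a restart request. A hedged guest-side sketch; hcall() is the guest's int $0x1f trampoline (LGUEST_TRAP_ENTRY above) and is assumed declared elsewhere taking the call number plus three arguments:

static void example_restart(void)
{
	hcall(LHCALL_SHUTDOWN, __pa("restart"), LGUEST_SHUTDOWN_RESTART, 0);
}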
diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 27b9350052b4..85b2482cc736 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild | |||
| @@ -100,7 +100,6 @@ header-y += iso_fs.h | |||
| 100 | header-y += ixjuser.h | 100 | header-y += ixjuser.h |
| 101 | header-y += jffs2.h | 101 | header-y += jffs2.h |
| 102 | header-y += keyctl.h | 102 | header-y += keyctl.h |
| 103 | header-y += kvm.h | ||
| 104 | header-y += limits.h | 103 | header-y += limits.h |
| 105 | header-y += lock_dlm_plock.h | 104 | header-y += lock_dlm_plock.h |
| 106 | header-y += magic.h | 105 | header-y += magic.h |
| @@ -256,6 +255,7 @@ unifdef-y += kd.h | |||
| 256 | unifdef-y += kernelcapi.h | 255 | unifdef-y += kernelcapi.h |
| 257 | unifdef-y += kernel.h | 256 | unifdef-y += kernel.h |
| 258 | unifdef-y += keyboard.h | 257 | unifdef-y += keyboard.h |
| 258 | unifdef-$(CONFIG_HAVE_KVM) += kvm.h | ||
| 259 | unifdef-y += llc.h | 259 | unifdef-y += llc.h |
| 260 | unifdef-y += loop.h | 260 | unifdef-y += loop.h |
| 261 | unifdef-y += lp.h | 261 | unifdef-y += lp.h |
diff --git a/include/linux/audit.h b/include/linux/audit.h index c68781692838..bdd6f5de5fc4 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h | |||
| @@ -115,6 +115,8 @@ | |||
| 115 | #define AUDIT_MAC_IPSEC_ADDSPD 1413 /* Not used */ | 115 | #define AUDIT_MAC_IPSEC_ADDSPD 1413 /* Not used */ |
| 116 | #define AUDIT_MAC_IPSEC_DELSPD 1414 /* Not used */ | 116 | #define AUDIT_MAC_IPSEC_DELSPD 1414 /* Not used */ |
| 117 | #define AUDIT_MAC_IPSEC_EVENT 1415 /* Audit an IPSec event */ | 117 | #define AUDIT_MAC_IPSEC_EVENT 1415 /* Audit an IPSec event */ |
| 118 | #define AUDIT_MAC_UNLBL_STCADD 1416 /* NetLabel: add a static label */ | ||
| 119 | #define AUDIT_MAC_UNLBL_STCDEL 1417 /* NetLabel: del a static label */ | ||
| 118 | 120 | ||
| 119 | #define AUDIT_FIRST_KERN_ANOM_MSG 1700 | 121 | #define AUDIT_FIRST_KERN_ANOM_MSG 1700 |
| 120 | #define AUDIT_LAST_KERN_ANOM_MSG 1799 | 122 | #define AUDIT_LAST_KERN_ANOM_MSG 1799 |
diff --git a/include/linux/device.h b/include/linux/device.h index 1880208964d6..db375be333c7 100644 --- a/include/linux/device.h +++ b/include/linux/device.h | |||
| @@ -84,6 +84,9 @@ int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data, | |||
| 84 | struct device *bus_find_device(struct bus_type *bus, struct device *start, | 84 | struct device *bus_find_device(struct bus_type *bus, struct device *start, |
| 85 | void *data, | 85 | void *data, |
| 86 | int (*match)(struct device *dev, void *data)); | 86 | int (*match)(struct device *dev, void *data)); |
| 87 | struct device *bus_find_device_by_name(struct bus_type *bus, | ||
| 88 | struct device *start, | ||
| 89 | const char *name); | ||
| 87 | 90 | ||
| 88 | int __must_check bus_for_each_drv(struct bus_type *bus, | 91 | int __must_check bus_for_each_drv(struct bus_type *bus, |
| 89 | struct device_driver *start, void *data, | 92 | struct device_driver *start, void *data, |
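The new bus_find_device_by_name() presumably wraps bus_find_device() with a name-comparison match. An illustrative lookup on the platform bus (the device name is arbitrary); bus_find_device() takes a reference on a match, so the same is assumed here:

static void example_lookup(void)
{
	struct device *dev;

	dev = bus_find_device_by_name(&platform_bus_type, NULL, "serial8250");
	if (dev) {
		dev_info(dev, "found by name\n");
		put_device(dev);	/* drop the reference from the find */
	}
}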
diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 057a7f34ee36..4de4fd2d8607 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h | |||
| @@ -9,12 +9,10 @@ | |||
| 9 | 9 | ||
| 10 | #include <asm/types.h> | 10 | #include <asm/types.h> |
| 11 | #include <linux/ioctl.h> | 11 | #include <linux/ioctl.h> |
| 12 | #include <asm/kvm.h> | ||
| 12 | 13 | ||
| 13 | #define KVM_API_VERSION 12 | 14 | #define KVM_API_VERSION 12 |
| 14 | 15 | ||
| 15 | /* Architectural interrupt line count. */ | ||
| 16 | #define KVM_NR_INTERRUPTS 256 | ||
| 17 | |||
| 18 | /* for KVM_CREATE_MEMORY_REGION */ | 16 | /* for KVM_CREATE_MEMORY_REGION */ |
| 19 | struct kvm_memory_region { | 17 | struct kvm_memory_region { |
| 20 | __u32 slot; | 18 | __u32 slot; |
| @@ -23,17 +21,19 @@ struct kvm_memory_region { | |||
| 23 | __u64 memory_size; /* bytes */ | 21 | __u64 memory_size; /* bytes */ |
| 24 | }; | 22 | }; |
| 25 | 23 | ||
| 26 | /* for kvm_memory_region::flags */ | 24 | /* for KVM_SET_USER_MEMORY_REGION */ |
| 27 | #define KVM_MEM_LOG_DIRTY_PAGES 1UL | 25 | struct kvm_userspace_memory_region { |
| 28 | 26 | __u32 slot; | |
| 29 | struct kvm_memory_alias { | ||
| 30 | __u32 slot; /* this has a different namespace than memory slots */ | ||
| 31 | __u32 flags; | 27 | __u32 flags; |
| 32 | __u64 guest_phys_addr; | 28 | __u64 guest_phys_addr; |
| 33 | __u64 memory_size; | 29 | __u64 memory_size; /* bytes */ |
| 34 | __u64 target_phys_addr; | 30 | __u64 userspace_addr; /* start of the userspace allocated memory */ |
| 35 | }; | 31 | }; |
| 36 | 32 | ||
| 33 | /* for kvm_memory_region::flags */ | ||
| 34 | #define KVM_MEM_LOG_DIRTY_PAGES 1UL | ||
| 35 | |||
| 36 | |||
| 37 | /* for KVM_IRQ_LINE */ | 37 | /* for KVM_IRQ_LINE */ |
| 38 | struct kvm_irq_level { | 38 | struct kvm_irq_level { |
| 39 | /* | 39 | /* |
| @@ -45,62 +45,18 @@ struct kvm_irq_level { | |||
| 45 | __u32 level; | 45 | __u32 level; |
| 46 | }; | 46 | }; |
| 47 | 47 | ||
| 48 | /* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ | ||
| 49 | struct kvm_pic_state { | ||
| 50 | __u8 last_irr; /* edge detection */ | ||
| 51 | __u8 irr; /* interrupt request register */ | ||
| 52 | __u8 imr; /* interrupt mask register */ | ||
| 53 | __u8 isr; /* interrupt service register */ | ||
| 54 | __u8 priority_add; /* highest irq priority */ | ||
| 55 | __u8 irq_base; | ||
| 56 | __u8 read_reg_select; | ||
| 57 | __u8 poll; | ||
| 58 | __u8 special_mask; | ||
| 59 | __u8 init_state; | ||
| 60 | __u8 auto_eoi; | ||
| 61 | __u8 rotate_on_auto_eoi; | ||
| 62 | __u8 special_fully_nested_mode; | ||
| 63 | __u8 init4; /* true if 4 byte init */ | ||
| 64 | __u8 elcr; /* PIIX edge/trigger selection */ | ||
| 65 | __u8 elcr_mask; | ||
| 66 | }; | ||
| 67 | |||
| 68 | #define KVM_IOAPIC_NUM_PINS 24 | ||
| 69 | struct kvm_ioapic_state { | ||
| 70 | __u64 base_address; | ||
| 71 | __u32 ioregsel; | ||
| 72 | __u32 id; | ||
| 73 | __u32 irr; | ||
| 74 | __u32 pad; | ||
| 75 | union { | ||
| 76 | __u64 bits; | ||
| 77 | struct { | ||
| 78 | __u8 vector; | ||
| 79 | __u8 delivery_mode:3; | ||
| 80 | __u8 dest_mode:1; | ||
| 81 | __u8 delivery_status:1; | ||
| 82 | __u8 polarity:1; | ||
| 83 | __u8 remote_irr:1; | ||
| 84 | __u8 trig_mode:1; | ||
| 85 | __u8 mask:1; | ||
| 86 | __u8 reserve:7; | ||
| 87 | __u8 reserved[4]; | ||
| 88 | __u8 dest_id; | ||
| 89 | } fields; | ||
| 90 | } redirtbl[KVM_IOAPIC_NUM_PINS]; | ||
| 91 | }; | ||
| 92 | |||
| 93 | #define KVM_IRQCHIP_PIC_MASTER 0 | ||
| 94 | #define KVM_IRQCHIP_PIC_SLAVE 1 | ||
| 95 | #define KVM_IRQCHIP_IOAPIC 2 | ||
| 96 | 48 | ||
| 97 | struct kvm_irqchip { | 49 | struct kvm_irqchip { |
| 98 | __u32 chip_id; | 50 | __u32 chip_id; |
| 99 | __u32 pad; | 51 | __u32 pad; |
| 100 | union { | 52 | union { |
| 101 | char dummy[512]; /* reserving space */ | 53 | char dummy[512]; /* reserving space */ |
| 54 | #ifdef CONFIG_X86 | ||
| 102 | struct kvm_pic_state pic; | 55 | struct kvm_pic_state pic; |
| 56 | #endif | ||
| 57 | #if defined(CONFIG_X86) || defined(CONFIG_IA64) | ||
| 103 | struct kvm_ioapic_state ioapic; | 58 | struct kvm_ioapic_state ioapic; |
| 59 | #endif | ||
| 104 | } chip; | 60 | } chip; |
| 105 | }; | 61 | }; |
| 106 | 62 | ||
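kvm_userspace_memory_region, introduced above together with the KVM_SET_USER_MEMORY_REGION ioctl defined further down, lets userspace supply the backing for guest physical memory instead of having the kernel allocate it. A hedged usage sketch; slot 0 and the anonymous mapping are arbitrary choices, and real callers keep sizes page-aligned:

#include <sys/mman.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_user_memory(int vm_fd, __u64 gpa, __u64 size)
{
	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.guest_phys_addr = gpa,
		.memory_size = size,
	};
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (mem == MAP_FAILED)
		return -1;
	region.userspace_addr = (unsigned long)mem;
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}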
| @@ -116,6 +72,7 @@ struct kvm_irqchip { | |||
| 116 | #define KVM_EXIT_FAIL_ENTRY 9 | 72 | #define KVM_EXIT_FAIL_ENTRY 9 |
| 117 | #define KVM_EXIT_INTR 10 | 73 | #define KVM_EXIT_INTR 10 |
| 118 | #define KVM_EXIT_SET_TPR 11 | 74 | #define KVM_EXIT_SET_TPR 11 |
| 75 | #define KVM_EXIT_TPR_ACCESS 12 | ||
| 119 | 76 | ||
| 120 | /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ | 77 | /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ |
| 121 | struct kvm_run { | 78 | struct kvm_run { |
| @@ -174,90 +131,17 @@ struct kvm_run { | |||
| 174 | __u32 longmode; | 131 | __u32 longmode; |
| 175 | __u32 pad; | 132 | __u32 pad; |
| 176 | } hypercall; | 133 | } hypercall; |
| 134 | /* KVM_EXIT_TPR_ACCESS */ | ||
| 135 | struct { | ||
| 136 | __u64 rip; | ||
| 137 | __u32 is_write; | ||
| 138 | __u32 pad; | ||
| 139 | } tpr_access; | ||
| 177 | /* Fix the size of the union. */ | 140 | /* Fix the size of the union. */ |
| 178 | char padding[256]; | 141 | char padding[256]; |
| 179 | }; | 142 | }; |
| 180 | }; | 143 | }; |
| 181 | 144 | ||
| 182 | /* for KVM_GET_REGS and KVM_SET_REGS */ | ||
| 183 | struct kvm_regs { | ||
| 184 | /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ | ||
| 185 | __u64 rax, rbx, rcx, rdx; | ||
| 186 | __u64 rsi, rdi, rsp, rbp; | ||
| 187 | __u64 r8, r9, r10, r11; | ||
| 188 | __u64 r12, r13, r14, r15; | ||
| 189 | __u64 rip, rflags; | ||
| 190 | }; | ||
| 191 | |||
| 192 | /* for KVM_GET_FPU and KVM_SET_FPU */ | ||
| 193 | struct kvm_fpu { | ||
| 194 | __u8 fpr[8][16]; | ||
| 195 | __u16 fcw; | ||
| 196 | __u16 fsw; | ||
| 197 | __u8 ftwx; /* in fxsave format */ | ||
| 198 | __u8 pad1; | ||
| 199 | __u16 last_opcode; | ||
| 200 | __u64 last_ip; | ||
| 201 | __u64 last_dp; | ||
| 202 | __u8 xmm[16][16]; | ||
| 203 | __u32 mxcsr; | ||
| 204 | __u32 pad2; | ||
| 205 | }; | ||
| 206 | |||
| 207 | /* for KVM_GET_LAPIC and KVM_SET_LAPIC */ | ||
| 208 | #define KVM_APIC_REG_SIZE 0x400 | ||
| 209 | struct kvm_lapic_state { | ||
| 210 | char regs[KVM_APIC_REG_SIZE]; | ||
| 211 | }; | ||
| 212 | |||
| 213 | struct kvm_segment { | ||
| 214 | __u64 base; | ||
| 215 | __u32 limit; | ||
| 216 | __u16 selector; | ||
| 217 | __u8 type; | ||
| 218 | __u8 present, dpl, db, s, l, g, avl; | ||
| 219 | __u8 unusable; | ||
| 220 | __u8 padding; | ||
| 221 | }; | ||
| 222 | |||
| 223 | struct kvm_dtable { | ||
| 224 | __u64 base; | ||
| 225 | __u16 limit; | ||
| 226 | __u16 padding[3]; | ||
| 227 | }; | ||
| 228 | |||
| 229 | /* for KVM_GET_SREGS and KVM_SET_SREGS */ | ||
| 230 | struct kvm_sregs { | ||
| 231 | /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ | ||
| 232 | struct kvm_segment cs, ds, es, fs, gs, ss; | ||
| 233 | struct kvm_segment tr, ldt; | ||
| 234 | struct kvm_dtable gdt, idt; | ||
| 235 | __u64 cr0, cr2, cr3, cr4, cr8; | ||
| 236 | __u64 efer; | ||
| 237 | __u64 apic_base; | ||
| 238 | __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; | ||
| 239 | }; | ||
| 240 | |||
| 241 | struct kvm_msr_entry { | ||
| 242 | __u32 index; | ||
| 243 | __u32 reserved; | ||
| 244 | __u64 data; | ||
| 245 | }; | ||
| 246 | |||
| 247 | /* for KVM_GET_MSRS and KVM_SET_MSRS */ | ||
| 248 | struct kvm_msrs { | ||
| 249 | __u32 nmsrs; /* number of msrs in entries */ | ||
| 250 | __u32 pad; | ||
| 251 | |||
| 252 | struct kvm_msr_entry entries[0]; | ||
| 253 | }; | ||
| 254 | |||
| 255 | /* for KVM_GET_MSR_INDEX_LIST */ | ||
| 256 | struct kvm_msr_list { | ||
| 257 | __u32 nmsrs; /* number of msrs in entries */ | ||
| 258 | __u32 indices[0]; | ||
| 259 | }; | ||
| 260 | |||
| 261 | /* for KVM_TRANSLATE */ | 145 | /* for KVM_TRANSLATE */ |
| 262 | struct kvm_translation { | 146 | struct kvm_translation { |
| 263 | /* in */ | 147 | /* in */ |
| @@ -302,28 +186,24 @@ struct kvm_dirty_log { | |||
| 302 | }; | 186 | }; |
| 303 | }; | 187 | }; |
| 304 | 188 | ||
| 305 | struct kvm_cpuid_entry { | ||
| 306 | __u32 function; | ||
| 307 | __u32 eax; | ||
| 308 | __u32 ebx; | ||
| 309 | __u32 ecx; | ||
| 310 | __u32 edx; | ||
| 311 | __u32 padding; | ||
| 312 | }; | ||
| 313 | |||
| 314 | /* for KVM_SET_CPUID */ | ||
| 315 | struct kvm_cpuid { | ||
| 316 | __u32 nent; | ||
| 317 | __u32 padding; | ||
| 318 | struct kvm_cpuid_entry entries[0]; | ||
| 319 | }; | ||
| 320 | |||
| 321 | /* for KVM_SET_SIGNAL_MASK */ | 189 | /* for KVM_SET_SIGNAL_MASK */ |
| 322 | struct kvm_signal_mask { | 190 | struct kvm_signal_mask { |
| 323 | __u32 len; | 191 | __u32 len; |
| 324 | __u8 sigset[0]; | 192 | __u8 sigset[0]; |
| 325 | }; | 193 | }; |
| 326 | 194 | ||
| 195 | /* for KVM_TPR_ACCESS_REPORTING */ | ||
| 196 | struct kvm_tpr_access_ctl { | ||
| 197 | __u32 enabled; | ||
| 198 | __u32 flags; | ||
| 199 | __u32 reserved[8]; | ||
| 200 | }; | ||
| 201 | |||
| 202 | /* for KVM_SET_VAPIC_ADDR */ | ||
| 203 | struct kvm_vapic_addr { | ||
| 204 | __u64 vapic_addr; | ||
| 205 | }; | ||
| 206 | |||
| 327 | #define KVMIO 0xAE | 207 | #define KVMIO 0xAE |
| 328 | 208 | ||
| 329 | /* | 209 | /* |
| @@ -347,11 +227,21 @@ struct kvm_signal_mask { | |||
| 347 | */ | 227 | */ |
| 348 | #define KVM_CAP_IRQCHIP 0 | 228 | #define KVM_CAP_IRQCHIP 0 |
| 349 | #define KVM_CAP_HLT 1 | 229 | #define KVM_CAP_HLT 1 |
| 230 | #define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2 | ||
| 231 | #define KVM_CAP_USER_MEMORY 3 | ||
| 232 | #define KVM_CAP_SET_TSS_ADDR 4 | ||
| 233 | #define KVM_CAP_EXT_CPUID 5 | ||
| 234 | #define KVM_CAP_VAPIC 6 | ||
| 350 | 235 | ||
| 351 | /* | 236 | /* |
| 352 | * ioctls for VM fds | 237 | * ioctls for VM fds |
| 353 | */ | 238 | */ |
| 354 | #define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) | 239 | #define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) |
| 240 | #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) | ||
| 241 | #define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) | ||
| 242 | #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\ | ||
| 243 | struct kvm_userspace_memory_region) | ||
| 244 | #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) | ||
| 355 | /* | 245 | /* |
| 356 | * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns | 246 | * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns |
| 357 | * a vcpu fd. | 247 | * a vcpu fd. |
| @@ -359,6 +249,7 @@ struct kvm_signal_mask { | |||
| 359 | #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) | 249 | #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) |
| 360 | #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) | 250 | #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) |
| 361 | #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) | 251 | #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) |
| 252 | #define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x48, struct kvm_cpuid2) | ||
| 362 | /* Device model IOC */ | 253 | /* Device model IOC */ |
| 363 | #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) | 254 | #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) |
| 364 | #define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) | 255 | #define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) |
| @@ -384,5 +275,11 @@ struct kvm_signal_mask { | |||
| 384 | #define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) | 275 | #define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) |
| 385 | #define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) | 276 | #define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) |
| 386 | #define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) | 277 | #define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) |
| 278 | #define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2) | ||
| 279 | #define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2) | ||
| 280 | /* Available with KVM_CAP_VAPIC */ | ||
| 281 | #define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl) | ||
| 282 | /* Available with KVM_CAP_VAPIC */ | ||
| 283 | #define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr) | ||
| 387 | 284 | ||
| 388 | #endif | 285 | #endif |
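Taken together, the KVM_CAP_VAPIC capability and the two new vcpu ioctls let userspace enable TPR-access reporting and hand the kernel a virtual-APIC page. A hedged sketch of the expected call sequence; the file descriptors and the guest-physical address are assumptions supplied by the caller, and the capability probe uses the standard KVM_CHECK_EXTENSION ioctl on the /dev/kvm fd:

```c
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: "kvm_fd" is the open /dev/kvm fd, "vcpu_fd" a vcpu
 * fd from KVM_CREATE_VCPU, and "vapic_gpa" a guest-physical page the
 * caller has reserved for the virtual APIC. */
static int enable_vapic(int kvm_fd, int vcpu_fd, __u64 vapic_gpa)
{
	struct kvm_tpr_access_ctl tac;
	struct kvm_vapic_addr va;

	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VAPIC) <= 0)
		return -1;			/* capability not present */

	memset(&tac, 0, sizeof(tac));
	tac.enabled = 1;			/* start reporting TPR accesses */
	if (ioctl(vcpu_fd, KVM_TPR_ACCESS_REPORTING, &tac) < 0)
		return -1;

	va.vapic_addr = vapic_gpa;		/* register the vAPIC page */
	return ioctl(vcpu_fd, KVM_SET_VAPIC_ADDR, &va);
}
```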
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h new file mode 100644 index 000000000000..ea4764b0a2f4 --- /dev/null +++ b/include/linux/kvm_host.h | |||
| @@ -0,0 +1,299 @@ | |||
| 1 | #ifndef __KVM_HOST_H | ||
| 2 | #define __KVM_HOST_H | ||
| 3 | |||
| 4 | /* | ||
| 5 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
| 6 | * the COPYING file in the top-level directory. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/types.h> | ||
| 10 | #include <linux/hardirq.h> | ||
| 11 | #include <linux/list.h> | ||
| 12 | #include <linux/mutex.h> | ||
| 13 | #include <linux/spinlock.h> | ||
| 14 | #include <linux/signal.h> | ||
| 15 | #include <linux/sched.h> | ||
| 16 | #include <linux/mm.h> | ||
| 17 | #include <linux/preempt.h> | ||
| 18 | #include <asm/signal.h> | ||
| 19 | |||
| 20 | #include <linux/kvm.h> | ||
| 21 | #include <linux/kvm_para.h> | ||
| 22 | |||
| 23 | #include <linux/kvm_types.h> | ||
| 24 | |||
| 25 | #include <asm/kvm_host.h> | ||
| 26 | |||
| 27 | #define KVM_MAX_VCPUS 4 | ||
| 28 | #define KVM_MEMORY_SLOTS 8 | ||
| 29 | /* memory slots that are not exposed to userspace */ | ||
| 30 | #define KVM_PRIVATE_MEM_SLOTS 4 | ||
| 31 | |||
| 32 | #define KVM_PIO_PAGE_OFFSET 1 | ||
| 33 | |||
| 34 | /* | ||
| 35 | * vcpu->requests bit members | ||
| 36 | */ | ||
| 37 | #define KVM_REQ_TLB_FLUSH 0 | ||
| 38 | #define KVM_REQ_MIGRATE_TIMER 1 | ||
| 39 | #define KVM_REQ_REPORT_TPR_ACCESS 2 | ||
| 40 | |||
| 41 | struct kvm_vcpu; | ||
| 42 | extern struct kmem_cache *kvm_vcpu_cache; | ||
| 43 | |||
| 44 | struct kvm_guest_debug { | ||
| 45 | int enabled; | ||
| 46 | unsigned long bp[4]; | ||
| 47 | int singlestep; | ||
| 48 | }; | ||
| 49 | |||
| 50 | /* | ||
| 51 | * It would be nice to use something smarter than a linear search, TBD... | ||
| 52 | * Thankfully we don't expect many devices to register (famous last words :), | ||
| 53 | * so until then it will suffice. At least it's abstracted so we can change | ||
| 54 | * in one place. | ||
| 55 | */ | ||
| 56 | struct kvm_io_bus { | ||
| 57 | int dev_count; | ||
| 58 | #define NR_IOBUS_DEVS 6 | ||
| 59 | struct kvm_io_device *devs[NR_IOBUS_DEVS]; | ||
| 60 | }; | ||
| 61 | |||
| 62 | void kvm_io_bus_init(struct kvm_io_bus *bus); | ||
| 63 | void kvm_io_bus_destroy(struct kvm_io_bus *bus); | ||
| 64 | struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); | ||
| 65 | void kvm_io_bus_register_dev(struct kvm_io_bus *bus, | ||
| 66 | struct kvm_io_device *dev); | ||
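The comment above spells out the design: a flat array searched linearly, acceptable while NR_IOBUS_DEVS stays small. A hedged sketch of the intended usage pattern; the device names are illustrative, and struct kvm (with its mmio_bus member) is defined further down in this header:

```c
/* Hypothetical device model hooking itself onto the MMIO bus at setup. */
static void my_device_attach(struct kvm *kvm, struct kvm_io_device *my_dev)
{
	kvm_io_bus_register_dev(&kvm->mmio_bus, my_dev);
}

/* Lookup by guest-physical address is the linear search described above. */
static struct kvm_io_device *my_device_lookup(struct kvm *kvm, gpa_t addr)
{
	return kvm_io_bus_find_dev(&kvm->mmio_bus, addr);
}
```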
| 67 | |||
| 68 | struct kvm_vcpu { | ||
| 69 | struct kvm *kvm; | ||
| 70 | struct preempt_notifier preempt_notifier; | ||
| 71 | int vcpu_id; | ||
| 72 | struct mutex mutex; | ||
| 73 | int cpu; | ||
| 74 | struct kvm_run *run; | ||
| 75 | int guest_mode; | ||
| 76 | unsigned long requests; | ||
| 77 | struct kvm_guest_debug guest_debug; | ||
| 78 | int fpu_active; | ||
| 79 | int guest_fpu_loaded; | ||
| 80 | wait_queue_head_t wq; | ||
| 81 | int sigset_active; | ||
| 82 | sigset_t sigset; | ||
| 83 | struct kvm_vcpu_stat stat; | ||
| 84 | |||
| 85 | #ifdef CONFIG_HAS_IOMEM | ||
| 86 | int mmio_needed; | ||
| 87 | int mmio_read_completed; | ||
| 88 | int mmio_is_write; | ||
| 89 | int mmio_size; | ||
| 90 | unsigned char mmio_data[8]; | ||
| 91 | gpa_t mmio_phys_addr; | ||
| 92 | #endif | ||
| 93 | |||
| 94 | struct kvm_vcpu_arch arch; | ||
| 95 | }; | ||
| 96 | |||
| 97 | struct kvm_memory_slot { | ||
| 98 | gfn_t base_gfn; | ||
| 99 | unsigned long npages; | ||
| 100 | unsigned long flags; | ||
| 101 | unsigned long *rmap; | ||
| 102 | unsigned long *dirty_bitmap; | ||
| 103 | unsigned long userspace_addr; | ||
| 104 | int user_alloc; | ||
| 105 | }; | ||
| 106 | |||
| 107 | struct kvm { | ||
| 108 | struct mutex lock; /* protects the vcpus array and APIC accesses */ | ||
| 109 | spinlock_t mmu_lock; | ||
| 110 | struct mm_struct *mm; /* userspace tied to this vm */ | ||
| 111 | int nmemslots; | ||
| 112 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + | ||
| 113 | KVM_PRIVATE_MEM_SLOTS]; | ||
| 114 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; | ||
| 115 | struct list_head vm_list; | ||
| 116 | struct file *filp; | ||
| 117 | struct kvm_io_bus mmio_bus; | ||
| 118 | struct kvm_io_bus pio_bus; | ||
| 119 | struct kvm_vm_stat stat; | ||
| 120 | struct kvm_arch arch; | ||
| 121 | }; | ||
| 122 | |||
| 123 | /* The guest did something we don't support. */ | ||
| 124 | #define pr_unimpl(vcpu, fmt, ...) \ | ||
| 125 | do { \ | ||
| 126 | if (printk_ratelimit()) \ | ||
| 127 | printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ | ||
| 128 | current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ | ||
| 129 | } while (0) | ||
| 130 | |||
| 131 | #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) | ||
| 132 | #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) | ||
| 133 | |||
| 134 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); | ||
| 135 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); | ||
| 136 | |||
| 137 | void vcpu_load(struct kvm_vcpu *vcpu); | ||
| 138 | void vcpu_put(struct kvm_vcpu *vcpu); | ||
| 139 | |||
| 140 | void decache_vcpus_on_cpu(int cpu); | ||
| 141 | |||
| 142 | |||
| 143 | int kvm_init(void *opaque, unsigned int vcpu_size, | ||
| 144 | struct module *module); | ||
| 145 | void kvm_exit(void); | ||
| 146 | |||
| 147 | #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) | ||
| 148 | #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) | ||
| 149 | static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } | ||
| 150 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); | ||
| 151 | |||
| 152 | extern struct page *bad_page; | ||
| 153 | |||
| 154 | int is_error_page(struct page *page); | ||
| 155 | int kvm_is_error_hva(unsigned long addr); | ||
| 156 | int kvm_set_memory_region(struct kvm *kvm, | ||
| 157 | struct kvm_userspace_memory_region *mem, | ||
| 158 | int user_alloc); | ||
| 159 | int __kvm_set_memory_region(struct kvm *kvm, | ||
| 160 | struct kvm_userspace_memory_region *mem, | ||
| 161 | int user_alloc); | ||
| 162 | int kvm_arch_set_memory_region(struct kvm *kvm, | ||
| 163 | struct kvm_userspace_memory_region *mem, | ||
| 164 | struct kvm_memory_slot old, | ||
| 165 | int user_alloc); | ||
| 166 | gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); | ||
| 167 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); | ||
| 168 | void kvm_release_page_clean(struct page *page); | ||
| 169 | void kvm_release_page_dirty(struct page *page); | ||
| 170 | int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, | ||
| 171 | int len); | ||
| 172 | int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, | ||
| 173 | unsigned long len); | ||
| 174 | int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); | ||
| 175 | int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, | ||
| 176 | int offset, int len); | ||
| 177 | int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, | ||
| 178 | unsigned long len); | ||
| 179 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); | ||
| 180 | int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); | ||
| 181 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); | ||
| 182 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); | ||
| 183 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn); | ||
| 184 | |||
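The accessors above give arch code a uniform way to move data in and out of guest physical memory. A hedged sketch built only on the declared helpers; "gpa" is assumed to be an address the caller already validated, and the single-byte payload is illustrative:

```c
/* Read one guest byte and rewrite it only if it differs; returns 0 or a
 * negative errno propagated from the guest-memory helpers. */
static int patch_guest_byte(struct kvm *kvm, gpa_t gpa, u8 val)
{
	u8 old;
	int r;

	r = kvm_read_guest(kvm, gpa, &old, 1);
	if (r)
		return r;
	if (old == val)
		return 0;
	return kvm_write_guest(kvm, gpa, &val, 1);
}
```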
| 185 | void kvm_vcpu_block(struct kvm_vcpu *vcpu); | ||
| 186 | void kvm_resched(struct kvm_vcpu *vcpu); | ||
| 187 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); | ||
| 188 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); | ||
| 189 | void kvm_flush_remote_tlbs(struct kvm *kvm); | ||
| 190 | |||
| 191 | long kvm_arch_dev_ioctl(struct file *filp, | ||
| 192 | unsigned int ioctl, unsigned long arg); | ||
| 193 | long kvm_arch_vcpu_ioctl(struct file *filp, | ||
| 194 | unsigned int ioctl, unsigned long arg); | ||
| 195 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); | ||
| 196 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); | ||
| 197 | |||
| 198 | int kvm_dev_ioctl_check_extension(long ext); | ||
| 199 | |||
| 200 | int kvm_get_dirty_log(struct kvm *kvm, | ||
| 201 | struct kvm_dirty_log *log, int *is_dirty); | ||
| 202 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | ||
| 203 | struct kvm_dirty_log *log); | ||
| 204 | |||
| 205 | int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, | ||
| 206 | struct | ||
| 207 | kvm_userspace_memory_region *mem, | ||
| 208 | int user_alloc); | ||
| 209 | long kvm_arch_vm_ioctl(struct file *filp, | ||
| 210 | unsigned int ioctl, unsigned long arg); | ||
| 211 | void kvm_arch_destroy_vm(struct kvm *kvm); | ||
| 212 | |||
| 213 | int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); | ||
| 214 | int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); | ||
| 215 | |||
| 216 | int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, | ||
| 217 | struct kvm_translation *tr); | ||
| 218 | |||
| 219 | int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); | ||
| 220 | int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); | ||
| 221 | int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | ||
| 222 | struct kvm_sregs *sregs); | ||
| 223 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | ||
| 224 | struct kvm_sregs *sregs); | ||
| 225 | int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, | ||
| 226 | struct kvm_debug_guest *dbg); | ||
| 227 | int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); | ||
| 228 | |||
| 229 | int kvm_arch_init(void *opaque); | ||
| 230 | void kvm_arch_exit(void); | ||
| 231 | |||
| 232 | int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); | ||
| 233 | void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); | ||
| 234 | |||
| 235 | void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); | ||
| 236 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); | ||
| 237 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); | ||
| 238 | struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); | ||
| 239 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); | ||
| 240 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); | ||
| 241 | |||
| 242 | int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); | ||
| 243 | void kvm_arch_hardware_enable(void *garbage); | ||
| 244 | void kvm_arch_hardware_disable(void *garbage); | ||
| 245 | int kvm_arch_hardware_setup(void); | ||
| 246 | void kvm_arch_hardware_unsetup(void); | ||
| 247 | void kvm_arch_check_processor_compat(void *rtn); | ||
| 248 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); | ||
| 249 | |||
| 250 | void kvm_free_physmem(struct kvm *kvm); | ||
| 251 | |||
| 252 | struct kvm *kvm_arch_create_vm(void); | ||
| 253 | void kvm_arch_destroy_vm(struct kvm *kvm); | ||
| 254 | |||
| 255 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v); | ||
| 256 | int kvm_cpu_has_interrupt(struct kvm_vcpu *v); | ||
| 257 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu); | ||
| 258 | |||
| 259 | static inline void kvm_guest_enter(void) | ||
| 260 | { | ||
| 261 | account_system_vtime(current); | ||
| 262 | current->flags |= PF_VCPU; | ||
| 263 | } | ||
| 264 | |||
| 265 | static inline void kvm_guest_exit(void) | ||
| 266 | { | ||
| 267 | account_system_vtime(current); | ||
| 268 | current->flags &= ~PF_VCPU; | ||
| 269 | } | ||
| 270 | |||
| 271 | static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) | ||
| 272 | { | ||
| 273 | return slot - kvm->memslots; | ||
| 274 | } | ||
| 275 | |||
| 276 | static inline gpa_t gfn_to_gpa(gfn_t gfn) | ||
| 277 | { | ||
| 278 | return (gpa_t)gfn << PAGE_SHIFT; | ||
| 279 | } | ||
| 280 | |||
| 281 | static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) | ||
| 282 | { | ||
| 283 | set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); | ||
| 284 | } | ||
| 285 | |||
| 286 | enum kvm_stat_kind { | ||
| 287 | KVM_STAT_VM, | ||
| 288 | KVM_STAT_VCPU, | ||
| 289 | }; | ||
| 290 | |||
| 291 | struct kvm_stats_debugfs_item { | ||
| 292 | const char *name; | ||
| 293 | int offset; | ||
| 294 | enum kvm_stat_kind kind; | ||
| 295 | struct dentry *dentry; | ||
| 296 | }; | ||
| 297 | extern struct kvm_stats_debugfs_item debugfs_entries[]; | ||
| 298 | |||
| 299 | #endif | ||
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h index 3b292565a693..5497aac0d2f8 100644 --- a/include/linux/kvm_para.h +++ b/include/linux/kvm_para.h | |||
| @@ -2,72 +2,30 @@ | |||
| 2 | #define __LINUX_KVM_PARA_H | 2 | #define __LINUX_KVM_PARA_H |
| 3 | 3 | ||
| 4 | /* | 4 | /* |
| 5 | * Guest OS interface for KVM paravirtualization | 5 | * This header file provides a method for making a hypercall to the host |
| 6 | * | 6 | * Architectures should define: |
| 7 | * Note: this interface is totally experimental, and is certain to change | 7 | * - kvm_hypercall0, kvm_hypercall1... |
| 8 | * as we make progress. | 8 | * - kvm_arch_para_features |
| 9 | * - kvm_para_available | ||
| 9 | */ | 10 | */ |
| 10 | 11 | ||
| 11 | /* | 12 | /* Return values for hypercalls */ |
| 12 | * Per-VCPU descriptor area shared between guest and host. Writable to | 13 | #define KVM_ENOSYS 1000 |
| 13 | * both guest and host. Registered with the host by the guest when | ||
| 14 | * a guest acknowledges paravirtual mode. | ||
| 15 | * | ||
| 16 | * NOTE: all addresses are guest-physical addresses (gpa), to make it | ||
| 17 | * easier for the hypervisor to map between the various addresses. | ||
| 18 | */ | ||
| 19 | struct kvm_vcpu_para_state { | ||
| 20 | /* | ||
| 21 | * API version information for compatibility. If there's any support | ||
| 22 | * mismatch (too old host trying to execute too new guest) then | ||
| 23 | * the host will deny entry into paravirtual mode. Any other | ||
| 24 | * combination (new host + old guest and new host + new guest) | ||
| 25 | * is supposed to work - new host versions will support all old | ||
| 26 | * guest API versions. | ||
| 27 | */ | ||
| 28 | u32 guest_version; | ||
| 29 | u32 host_version; | ||
| 30 | u32 size; | ||
| 31 | u32 ret; | ||
| 32 | |||
| 33 | /* | ||
| 34 | * The address of the vm exit instruction (VMCALL or VMMCALL), | ||
| 35 | * which the host will patch according to the CPU model the | ||
| 36 | * VM runs on: | ||
| 37 | */ | ||
| 38 | u64 hypercall_gpa; | ||
| 39 | |||
| 40 | } __attribute__ ((aligned(PAGE_SIZE))); | ||
| 41 | |||
| 42 | #define KVM_PARA_API_VERSION 1 | ||
| 43 | |||
| 44 | /* | ||
| 45 | * This is used for an RDMSR's ECX parameter to probe for a KVM host. | ||
| 46 | * Hopefully no CPU vendor will use up this number. This is placed well | ||
| 47 | * out of way of the typical space occupied by CPU vendors' MSR indices, | ||
| 48 | * and we think (or at least hope) it won't be occupied in the future | ||
| 49 | * either. | ||
| 50 | */ | ||
| 51 | #define MSR_KVM_API_MAGIC 0x87655678 | ||
| 52 | 14 | ||
| 53 | #define KVM_EINVAL 1 | 15 | #define KVM_HC_VAPIC_POLL_IRQ 1 |
| 54 | 16 | ||
| 55 | /* | 17 | /* |
| 56 | * Hypercall calling convention: | 18 | * hypercalls use architecture specific |
| 57 | * | ||
| 58 | * Each hypercall may have 0-6 parameters. | ||
| 59 | * | ||
| 60 | * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1 | ||
| 61 | * | ||
| 62 | * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention | ||
| 63 | * order: RDI, RSI, RDX, RCX, R8, R9. | ||
| 64 | * | ||
| 65 | * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP. | ||
| 66 | * (the first 3 are according to the gcc regparm calling convention) | ||
| 67 | * | ||
| 68 | * No registers are clobbered by the hypercall, except that the | ||
| 69 | * return value is in RAX. | ||
| 70 | */ | 19 | */ |
| 71 | #define __NR_hypercalls 0 | 20 | #include <asm/kvm_para.h> |
| 21 | |||
| 22 | #ifdef __KERNEL__ | ||
| 23 | static inline int kvm_para_has_feature(unsigned int feature) | ||
| 24 | { | ||
| 25 | if (kvm_arch_para_features() & (1UL << feature)) | ||
| 26 | return 1; | ||
| 27 | return 0; | ||
| 28 | } | ||
| 29 | #endif /* __KERNEL__ */ | ||
| 30 | #endif /* __LINUX_KVM_PARA_H */ | ||
| 72 | 31 | ||
| 73 | #endif | ||
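The slimmed-down header leaves the probing and hypercall primitives to <asm/kvm_para.h> and keeps only the generic feature test. A hedged guest-side sketch; KVM_FEATURE_FOO is an invented feature bit used purely for illustration, while kvm_para_available() and kvm_hypercall0() are the per-arch functions the header comment requires:

```c
#include <linux/kvm_para.h>

#define KVM_FEATURE_FOO 0	/* hypothetical feature bit, illustration only */

static void example_probe(void)
{
	if (!kvm_para_available())
		return;		/* not running on a KVM host */
	if (kvm_para_has_feature(KVM_FEATURE_FOO))
		kvm_hypercall0(KVM_HC_VAPIC_POLL_IRQ);
}
```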
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h new file mode 100644 index 000000000000..1c4e46decb22 --- /dev/null +++ b/include/linux/kvm_types.h | |||
| @@ -0,0 +1,54 @@ | |||
| 1 | /* | ||
| 2 | * This program is free software; you can redistribute it and/or modify | ||
| 3 | * it under the terms of the GNU General Public License as published by | ||
| 4 | * the Free Software Foundation; either version 2 of the License. | ||
| 5 | * | ||
| 6 | * This program is distributed in the hope that it will be useful, | ||
| 7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 9 | * GNU General Public License for more details. | ||
| 10 | * | ||
| 11 | * You should have received a copy of the GNU General Public License | ||
| 12 | * along with this program; if not, write to the Free Software | ||
| 13 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
| 14 | * | ||
| 15 | */ | ||
| 16 | |||
| 17 | #ifndef __KVM_TYPES_H__ | ||
| 18 | #define __KVM_TYPES_H__ | ||
| 19 | |||
| 20 | #include <asm/types.h> | ||
| 21 | |||
| 22 | /* | ||
| 23 | * Address types: | ||
| 24 | * | ||
| 25 | * gva - guest virtual address | ||
| 26 | * gpa - guest physical address | ||
| 27 | * gfn - guest frame number | ||
| 28 | * hva - host virtual address | ||
| 29 | * hpa - host physical address | ||
| 30 | * hfn - host frame number | ||
| 31 | */ | ||
| 32 | |||
| 33 | typedef unsigned long gva_t; | ||
| 34 | typedef u64 gpa_t; | ||
| 35 | typedef unsigned long gfn_t; | ||
| 36 | |||
| 37 | typedef unsigned long hva_t; | ||
| 38 | typedef u64 hpa_t; | ||
| 39 | typedef unsigned long hfn_t; | ||
| 40 | |||
| 41 | struct kvm_pio_request { | ||
| 42 | unsigned long count; | ||
| 43 | int cur_count; | ||
| 44 | struct page *guest_pages[2]; | ||
| 45 | unsigned guest_page_offset; | ||
| 46 | int in; | ||
| 47 | int port; | ||
| 48 | int size; | ||
| 49 | int string; | ||
| 50 | int down; | ||
| 51 | int rep; | ||
| 52 | }; | ||
| 53 | |||
| 54 | #endif /* __KVM_TYPES_H__ */ | ||
diff --git a/include/linux/selinux.h b/include/linux/selinux.h index 6080f73fc85f..8c2cc4c02526 100644 --- a/include/linux/selinux.h +++ b/include/linux/selinux.h | |||
| @@ -120,16 +120,35 @@ void selinux_get_task_sid(struct task_struct *tsk, u32 *sid); | |||
| 120 | int selinux_string_to_sid(char *str, u32 *sid); | 120 | int selinux_string_to_sid(char *str, u32 *sid); |
| 121 | 121 | ||
| 122 | /** | 122 | /** |
| 123 | * selinux_relabel_packet_permission - check permission to relabel a packet | 123 | * selinux_secmark_relabel_packet_permission - secmark permission check |
| 124 | * @sid: ID value to be applied to network packet (via SECMARK, most likely) | 124 | * @sid: SECMARK ID value to be applied to network packet |
| 125 | * | 125 | * |
| 126 | * Returns 0 if the current task is allowed to label packets with the | 126 | * Returns 0 if the current task is allowed to set the SECMARK label of |
| 127 | * supplied security ID. Note that it is implicit that the packet is always | 127 | * packets with the supplied security ID. Note that it is implicit that |
| 128 | * being relabeled from the default unlabled value, and that the access | 128 | * the packet is always being relabeled from the default unlabeled value, |
| 129 | * control decision is made in the AVC. | 129 | * and that the access control decision is made in the AVC. |
| 130 | */ | 130 | */ |
| 131 | int selinux_relabel_packet_permission(u32 sid); | 131 | int selinux_secmark_relabel_packet_permission(u32 sid); |
| 132 | 132 | ||
| 133 | /** | ||
| 134 | * selinux_secmark_refcount_inc - increments the secmark use counter | ||
| 135 | * | ||
| 136 | * SELinux keeps track of the current SECMARK targets in use so it knows | ||
| 137 | * when to apply SECMARK label access checks to network packets. This | ||
| 138 | * function increments this reference count to indicate that a new SECMARK | ||
| 139 | * target has been configured. | ||
| 140 | */ | ||
| 141 | void selinux_secmark_refcount_inc(void); | ||
| 142 | |||
| 143 | /** | ||
| 144 | * selinux_secmark_refcount_dec - decrements the secmark use counter | ||
| 145 | * | ||
| 146 | * SELinux keeps track of the current SECMARK targets in use so it knows | ||
| 147 | * when to apply SECMARK label access checks to network packets. This | ||
| 148 | * function decrements this reference count to indicate that one of the | ||
| 149 | * existing SECMARK targets has been removed/flushed. | ||
| 150 | */ | ||
| 151 | void selinux_secmark_refcount_dec(void); | ||
| 133 | #else | 152 | #else |
| 134 | 153 | ||
| 135 | static inline int selinux_audit_rule_init(u32 field, u32 op, | 154 | static inline int selinux_audit_rule_init(u32 field, u32 op, |
| @@ -184,11 +203,21 @@ static inline int selinux_string_to_sid(const char *str, u32 *sid) | |||
| 184 | return 0; | 203 | return 0; |
| 185 | } | 204 | } |
| 186 | 205 | ||
| 187 | static inline int selinux_relabel_packet_permission(u32 sid) | 206 | static inline int selinux_secmark_relabel_packet_permission(u32 sid) |
| 188 | { | 207 | { |
| 189 | return 0; | 208 | return 0; |
| 190 | } | 209 | } |
| 191 | 210 | ||
| 211 | static inline void selinux_secmark_refcount_inc(void) | ||
| 212 | { | ||
| 213 | return; | ||
| 214 | } | ||
| 215 | |||
| 216 | static inline void selinux_secmark_refcount_dec(void) | ||
| 217 | { | ||
| 218 | return; | ||
| 219 | } | ||
| 220 | |||
| 192 | #endif /* CONFIG_SECURITY_SELINUX */ | 221 | #endif /* CONFIG_SECURITY_SELINUX */ |
| 193 | 222 | ||
| 194 | #endif /* _LINUX_SELINUX_H */ | 223 | #endif /* _LINUX_SELINUX_H */ |
diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 2e5b2f6f9fa0..b3213c7c5309 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h | |||
| @@ -67,7 +67,11 @@ | |||
| 67 | * NetLabel NETLINK protocol | 67 | * NetLabel NETLINK protocol |
| 68 | */ | 68 | */ |
| 69 | 69 | ||
| 70 | #define NETLBL_PROTO_VERSION 1 | 70 | /* NetLabel NETLINK protocol version |
| 71 | * 1: initial version | ||
| 72 | * 2: added static labels for unlabeled connections | ||
| 73 | */ | ||
| 74 | #define NETLBL_PROTO_VERSION 2 | ||
| 71 | 75 | ||
| 72 | /* NetLabel NETLINK types/families */ | 76 | /* NetLabel NETLINK types/families */ |
| 73 | #define NETLBL_NLTYPE_NONE 0 | 77 | #define NETLBL_NLTYPE_NONE 0 |
| @@ -105,17 +109,49 @@ struct netlbl_dom_map; | |||
| 105 | /* Domain mapping operations */ | 109 | /* Domain mapping operations */ |
| 106 | int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); | 110 | int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); |
| 107 | 111 | ||
| 108 | /* LSM security attributes */ | 112 | /* |
| 113 | * LSM security attributes | ||
| 114 | */ | ||
| 115 | |||
| 116 | /** | ||
| 117 | * struct netlbl_lsm_cache - NetLabel LSM security attribute cache | ||
| 118 | * @refcount: atomic reference counter | ||
| 119 | * @free: LSM supplied function to free the cache data | ||
| 120 | * @data: LSM supplied cache data | ||
| 121 | * | ||
| 122 | * Description: | ||
| 123 | * This structure is provided for LSMs which wish to make use of the NetLabel | ||
| 124 | * caching mechanism to store LSM specific data/attributes in the NetLabel | ||
| 125 | * cache. If the LSM has to perform a lot of translation from the NetLabel | ||
| 126 | * security attributes into it's own internal representation then the cache | ||
| 127 | * mechanism can provide a way to eliminate some or all of that translation | ||
| 128 | * overhead on a cache hit. | ||
| 129 | * | ||
| 130 | */ | ||
| 109 | struct netlbl_lsm_cache { | 131 | struct netlbl_lsm_cache { |
| 110 | atomic_t refcount; | 132 | atomic_t refcount; |
| 111 | void (*free) (const void *data); | 133 | void (*free) (const void *data); |
| 112 | void *data; | 134 | void *data; |
| 113 | }; | 135 | }; |
| 114 | /* The catmap bitmap field MUST be a power of two in length and large | 136 | |
| 137 | /** | ||
| 138 | * struct netlbl_lsm_secattr_catmap - NetLabel LSM secattr category bitmap | ||
| 139 | * @startbit: the value of the lowest order bit in the bitmap | ||
| 140 | * @bitmap: the category bitmap | ||
| 141 | * @next: pointer to the next bitmap "node" or NULL | ||
| 142 | * | ||
| 143 | * Description: | ||
| 144 | * This structure is used to represent category bitmaps. Due to the large | ||
| 145 | * number of categories supported by most labeling protocols it is not | ||
| 146 | * practical to transfer a full bitmap internally so NetLabel adopts a sparse | ||
| 147 | * bitmap structure modeled after SELinux's ebitmap structure. | ||
| 148 | * The catmap bitmap field MUST be a power of two in length and large | ||
| 115 | * enough to hold at least 240 bits. Special care (i.e. check the code!) | 149 | * enough to hold at least 240 bits. Special care (i.e. check the code!) |
| 116 | * should be used when changing these values as the LSM implementation | 150 | * should be used when changing these values as the LSM implementation |
| 117 | * probably has functions which rely on the sizes of these types to speed | 151 | * probably has functions which rely on the sizes of these types to speed |
| 118 | * processing. */ | 152 | * processing. |
| 153 | * | ||
| 154 | */ | ||
| 119 | #define NETLBL_CATMAP_MAPTYPE u64 | 155 | #define NETLBL_CATMAP_MAPTYPE u64 |
| 120 | #define NETLBL_CATMAP_MAPCNT 4 | 156 | #define NETLBL_CATMAP_MAPCNT 4 |
| 121 | #define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8) | 157 | #define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8) |
| @@ -127,22 +163,48 @@ struct netlbl_lsm_secattr_catmap { | |||
| 127 | NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT]; | 163 | NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT]; |
| 128 | struct netlbl_lsm_secattr_catmap *next; | 164 | struct netlbl_lsm_secattr_catmap *next; |
| 129 | }; | 165 | }; |
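A hedged sketch of the sparse-bitmap API this structure supports, using the alloc/setbit/walk/free helpers declared elsewhere in this header; the category numbers are illustrative (500 lands in a second 240-bit node, exercising the chained layout):

```c
#include <linux/kernel.h>
#include <linux/errno.h>
#include <net/netlabel.h>

static int example_catmap(void)
{
	struct netlbl_lsm_secattr_catmap *map;
	int cat = -1;
	int ret;

	map = netlbl_secattr_catmap_alloc(GFP_KERNEL);
	if (map == NULL)
		return -ENOMEM;

	ret = netlbl_secattr_catmap_setbit(map, 5, GFP_KERNEL);
	if (ret == 0)
		ret = netlbl_secattr_catmap_setbit(map, 500, GFP_KERNEL);

	/* The walk reports set bits in increasing order: 5, then 500. */
	while (ret == 0 &&
	       (cat = netlbl_secattr_catmap_walk(map, cat + 1)) >= 0)
		pr_debug("category %d set\n", cat);

	netlbl_secattr_catmap_free(map);
	return ret;
}
```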
| 166 | |||
| 167 | /** | ||
| 168 | * struct netlbl_lsm_secattr - NetLabel LSM security attributes | ||
| 169 | * @flags: indicate which attributes are contained in this structure | ||
| 170 | * @type: indicate the NLTYPE of the attributes | ||
| 171 | * @domain: the NetLabel LSM domain | ||
| 172 | * @cache: NetLabel LSM specific cache | ||
| 173 | * @attr.mls: MLS sensitivity label | ||
| 174 | * @attr.mls.cat: MLS category bitmap | ||
| 175 | * @attr.mls.lvl: MLS sensitivity level | ||
| 176 | * @attr.secid: LSM specific secid token | ||
| 177 | * | ||
| 178 | * Description: | ||
| 179 | * This structure is used to pass security attributes between NetLabel and the | ||
| 180 | * LSM modules. The flags field is used to specify which fields within the | ||
| 181 | * struct are valid and valid values can be created by bitwise OR'ing the | ||
| 182 | * NETLBL_SECATTR_* defines. The domain field is typically set by the LSM to | ||
| 183 | * specify domain specific configuration settings and is not usually used by | ||
| 184 | * NetLabel itself when returning security attributes to the LSM. | ||
| 185 | * | ||
| 186 | */ | ||
| 130 | #define NETLBL_SECATTR_NONE 0x00000000 | 187 | #define NETLBL_SECATTR_NONE 0x00000000 |
| 131 | #define NETLBL_SECATTR_DOMAIN 0x00000001 | 188 | #define NETLBL_SECATTR_DOMAIN 0x00000001 |
| 132 | #define NETLBL_SECATTR_CACHE 0x00000002 | 189 | #define NETLBL_SECATTR_CACHE 0x00000002 |
| 133 | #define NETLBL_SECATTR_MLS_LVL 0x00000004 | 190 | #define NETLBL_SECATTR_MLS_LVL 0x00000004 |
| 134 | #define NETLBL_SECATTR_MLS_CAT 0x00000008 | 191 | #define NETLBL_SECATTR_MLS_CAT 0x00000008 |
| 192 | #define NETLBL_SECATTR_SECID 0x00000010 | ||
| 135 | #define NETLBL_SECATTR_CACHEABLE (NETLBL_SECATTR_MLS_LVL | \ | 193 | #define NETLBL_SECATTR_CACHEABLE (NETLBL_SECATTR_MLS_LVL | \ |
| 136 | NETLBL_SECATTR_MLS_CAT) | 194 | NETLBL_SECATTR_MLS_CAT | \ |
| 195 | NETLBL_SECATTR_SECID) | ||
| 137 | struct netlbl_lsm_secattr { | 196 | struct netlbl_lsm_secattr { |
| 138 | u32 flags; | 197 | u32 flags; |
| 139 | 198 | u32 type; | |
| 140 | char *domain; | 199 | char *domain; |
| 141 | |||
| 142 | u32 mls_lvl; | ||
| 143 | struct netlbl_lsm_secattr_catmap *mls_cat; | ||
| 144 | |||
| 145 | struct netlbl_lsm_cache *cache; | 200 | struct netlbl_lsm_cache *cache; |
| 201 | union { | ||
| 202 | struct { | ||
| 203 | struct netlbl_lsm_secattr_catmap *cat; | ||
| 204 | u32 lvl; | ||
| 205 | } mls; | ||
| 206 | u32 secid; | ||
| 207 | } attr; | ||
| 146 | }; | 208 | }; |
| 147 | 209 | ||
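Under the new layout the flags word, not pointer NULL-ness, says which members are live; the reworked netlbl_secattr_destroy() in the hunk below relies on exactly that. A hedged lifecycle sketch with illustrative MLS values:

```c
#include <linux/errno.h>
#include <net/netlabel.h>

static int example_secattr(void)
{
	struct netlbl_lsm_secattr secattr;

	netlbl_secattr_init(&secattr);		/* zeroes the whole struct */

	secattr.attr.mls.lvl = 3;
	secattr.flags |= NETLBL_SECATTR_MLS_LVL;

	secattr.attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_KERNEL);
	if (secattr.attr.mls.cat == NULL)
		return -ENOMEM;
	secattr.flags |= NETLBL_SECATTR_MLS_CAT;

	/* destroy frees only what the flags mark as valid */
	netlbl_secattr_destroy(&secattr);
	return 0;
}
```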
| 148 | /* | 210 | /* |
| @@ -231,10 +293,7 @@ static inline void netlbl_secattr_catmap_free( | |||
| 231 | */ | 293 | */ |
| 232 | static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr) | 294 | static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr) |
| 233 | { | 295 | { |
| 234 | secattr->flags = 0; | 296 | memset(secattr, 0, sizeof(*secattr)); |
| 235 | secattr->domain = NULL; | ||
| 236 | secattr->mls_cat = NULL; | ||
| 237 | secattr->cache = NULL; | ||
| 238 | } | 297 | } |
| 239 | 298 | ||
| 240 | /** | 299 | /** |
| @@ -248,11 +307,11 @@ static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr) | |||
| 248 | */ | 307 | */ |
| 249 | static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr) | 308 | static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr) |
| 250 | { | 309 | { |
| 251 | if (secattr->cache) | ||
| 252 | netlbl_secattr_cache_free(secattr->cache); | ||
| 253 | kfree(secattr->domain); | 310 | kfree(secattr->domain); |
| 254 | if (secattr->mls_cat) | 311 | if (secattr->flags & NETLBL_SECATTR_CACHE) |
| 255 | netlbl_secattr_catmap_free(secattr->mls_cat); | 312 | netlbl_secattr_cache_free(secattr->cache); |
| 313 | if (secattr->flags & NETLBL_SECATTR_MLS_CAT) | ||
| 314 | netlbl_secattr_catmap_free(secattr->attr.mls.cat); | ||
| 256 | } | 315 | } |
| 257 | 316 | ||
| 258 | /** | 317 | /** |
| @@ -300,7 +359,7 @@ int netlbl_secattr_catmap_setrng(struct netlbl_lsm_secattr_catmap *catmap, | |||
| 300 | gfp_t flags); | 359 | gfp_t flags); |
| 301 | 360 | ||
| 302 | /* | 361 | /* |
| 303 | * LSM protocol operations | 362 | * LSM protocol operations (NetLabel LSM/kernel API) |
| 304 | */ | 363 | */ |
| 305 | int netlbl_enabled(void); | 364 | int netlbl_enabled(void); |
| 306 | int netlbl_sock_setattr(struct sock *sk, | 365 | int netlbl_sock_setattr(struct sock *sk, |
| @@ -308,6 +367,7 @@ int netlbl_sock_setattr(struct sock *sk, | |||
| 308 | int netlbl_sock_getattr(struct sock *sk, | 367 | int netlbl_sock_getattr(struct sock *sk, |
| 309 | struct netlbl_lsm_secattr *secattr); | 368 | struct netlbl_lsm_secattr *secattr); |
| 310 | int netlbl_skbuff_getattr(const struct sk_buff *skb, | 369 | int netlbl_skbuff_getattr(const struct sk_buff *skb, |
| 370 | u16 family, | ||
| 311 | struct netlbl_lsm_secattr *secattr); | 371 | struct netlbl_lsm_secattr *secattr); |
| 312 | void netlbl_skbuff_err(struct sk_buff *skb, int error); | 372 | void netlbl_skbuff_err(struct sk_buff *skb, int error); |
| 313 | 373 | ||
| @@ -360,6 +420,7 @@ static inline int netlbl_sock_getattr(struct sock *sk, | |||
| 360 | return -ENOSYS; | 420 | return -ENOSYS; |
| 361 | } | 421 | } |
| 362 | static inline int netlbl_skbuff_getattr(const struct sk_buff *skb, | 422 | static inline int netlbl_skbuff_getattr(const struct sk_buff *skb, |
| 423 | u16 family, | ||
| 363 | struct netlbl_lsm_secattr *secattr) | 424 | struct netlbl_lsm_secattr *secattr) |
| 364 | { | 425 | { |
| 365 | return -ENOSYS; | 426 | return -ENOSYS; |
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h index 702fcfeb37f1..82251575a9b4 100644 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h | |||
| @@ -11,6 +11,25 @@ | |||
| 11 | #include <linux/types.h> | 11 | #include <linux/types.h> |
| 12 | 12 | ||
| 13 | /* | 13 | /* |
| 14 | * The maximum number of SG segments that we will put inside a | ||
| 15 | * scatterlist (unless chaining is used). Should ideally fit inside a | ||
| 16 | * single page, to avoid a higher-order allocation. We could define this | ||
| 17 | * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The | ||
| 18 | * minimum value is 32. | ||
| 19 | */ | ||
| 20 | #define SCSI_MAX_SG_SEGMENTS 128 | ||
| 21 | |||
| 22 | /* | ||
| 23 | * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit | ||
| 24 | * is arbitrary; a setting of 2048 gets you at least 8 MB I/Os (2048 * 4 KiB pages). | ||
| 25 | */ | ||
| 26 | #ifdef ARCH_HAS_SG_CHAIN | ||
| 27 | #define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 | ||
| 28 | #else | ||
| 29 | #define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS | ||
| 30 | #endif | ||
| 31 | |||
| 32 | /* | ||
| 14 | * SCSI command lengths | 33 | * SCSI command lengths |
| 15 | */ | 34 | */ |
| 16 | 35 | ||
| @@ -83,6 +102,7 @@ extern const unsigned char scsi_command_size[8]; | |||
| 83 | #define READ_TOC 0x43 | 102 | #define READ_TOC 0x43 |
| 84 | #define LOG_SELECT 0x4c | 103 | #define LOG_SELECT 0x4c |
| 85 | #define LOG_SENSE 0x4d | 104 | #define LOG_SENSE 0x4d |
| 105 | #define XDWRITEREAD_10 0x53 | ||
| 86 | #define MODE_SELECT_10 0x55 | 106 | #define MODE_SELECT_10 0x55 |
| 87 | #define RESERVE_10 0x56 | 107 | #define RESERVE_10 0x56 |
| 88 | #define RELEASE_10 0x57 | 108 | #define RELEASE_10 0x57 |
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index a457fca66f61..de28aab820b0 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h | |||
| @@ -2,15 +2,20 @@ | |||
| 2 | #define _SCSI_SCSI_CMND_H | 2 | #define _SCSI_SCSI_CMND_H |
| 3 | 3 | ||
| 4 | #include <linux/dma-mapping.h> | 4 | #include <linux/dma-mapping.h> |
| 5 | #include <linux/blkdev.h> | ||
| 5 | #include <linux/list.h> | 6 | #include <linux/list.h> |
| 6 | #include <linux/types.h> | 7 | #include <linux/types.h> |
| 7 | #include <linux/timer.h> | 8 | #include <linux/timer.h> |
| 8 | #include <linux/scatterlist.h> | 9 | #include <linux/scatterlist.h> |
| 9 | 10 | ||
| 10 | struct request; | ||
| 11 | struct Scsi_Host; | 11 | struct Scsi_Host; |
| 12 | struct scsi_device; | 12 | struct scsi_device; |
| 13 | 13 | ||
| 14 | struct scsi_data_buffer { | ||
| 15 | struct sg_table table; | ||
| 16 | unsigned length; | ||
| 17 | int resid; | ||
| 18 | }; | ||
| 14 | 19 | ||
| 15 | /* embedded in scsi_cmnd */ | 20 | /* embedded in scsi_cmnd */ |
| 16 | struct scsi_pointer { | 21 | struct scsi_pointer { |
| @@ -61,15 +66,11 @@ struct scsi_cmnd { | |||
| 61 | /* These elements define the operation we are about to perform */ | 66 | /* These elements define the operation we are about to perform */ |
| 62 | #define MAX_COMMAND_SIZE 16 | 67 | #define MAX_COMMAND_SIZE 16 |
| 63 | unsigned char cmnd[MAX_COMMAND_SIZE]; | 68 | unsigned char cmnd[MAX_COMMAND_SIZE]; |
| 64 | unsigned request_bufflen; /* Actual request size */ | ||
| 65 | 69 | ||
| 66 | struct timer_list eh_timeout; /* Used to time out the command. */ | 70 | struct timer_list eh_timeout; /* Used to time out the command. */ |
| 67 | void *request_buffer; /* Actual requested buffer */ | ||
| 68 | 71 | ||
| 69 | /* These elements define the operation we ultimately want to perform */ | 72 | /* These elements define the operation we ultimately want to perform */ |
| 70 | struct sg_table sg_table; | 73 | struct scsi_data_buffer sdb; |
| 71 | unsigned short use_sg; /* Number of pieces of scatter-gather */ | ||
| 72 | |||
| 73 | unsigned underflow; /* Return error if less than | 74 | unsigned underflow; /* Return error if less than |
| 74 | this amount is transferred */ | 75 | this amount is transferred */ |
| 75 | 76 | ||
| @@ -79,10 +80,6 @@ struct scsi_cmnd { | |||
| 79 | reconnects. Probably == sector | 80 | reconnects. Probably == sector |
| 80 | size */ | 81 | size */ |
| 81 | 82 | ||
| 82 | int resid; /* Number of bytes requested to be | ||
| 83 | transferred less actual number | ||
| 84 | transferred (0 if not supported) */ | ||
| 85 | |||
| 86 | struct request *request; /* The command we are | 83 | struct request *request; /* The command we are |
| 87 | working on */ | 84 | working on */ |
| 88 | 85 | ||
| @@ -127,27 +124,55 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, | |||
| 127 | size_t *offset, size_t *len); | 124 | size_t *offset, size_t *len); |
| 128 | extern void scsi_kunmap_atomic_sg(void *virt); | 125 | extern void scsi_kunmap_atomic_sg(void *virt); |
| 129 | 126 | ||
| 130 | extern int scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t); | 127 | extern int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask); |
| 131 | extern void scsi_free_sgtable(struct scsi_cmnd *); | 128 | extern void scsi_release_buffers(struct scsi_cmnd *cmd); |
| 132 | 129 | ||
| 133 | extern int scsi_dma_map(struct scsi_cmnd *cmd); | 130 | extern int scsi_dma_map(struct scsi_cmnd *cmd); |
| 134 | extern void scsi_dma_unmap(struct scsi_cmnd *cmd); | 131 | extern void scsi_dma_unmap(struct scsi_cmnd *cmd); |
| 135 | 132 | ||
| 136 | #define scsi_sg_count(cmd) ((cmd)->use_sg) | 133 | static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd) |
| 137 | #define scsi_sglist(cmd) ((cmd)->sg_table.sgl) | 134 | { |
| 138 | #define scsi_bufflen(cmd) ((cmd)->request_bufflen) | 135 | return cmd->sdb.table.nents; |
| 136 | } | ||
| 137 | |||
| 138 | static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd) | ||
| 139 | { | ||
| 140 | return cmd->sdb.table.sgl; | ||
| 141 | } | ||
| 142 | |||
| 143 | static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd) | ||
| 144 | { | ||
| 145 | return cmd->sdb.length; | ||
| 146 | } | ||
| 139 | 147 | ||
| 140 | static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid) | 148 | static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid) |
| 141 | { | 149 | { |
| 142 | cmd->resid = resid; | 150 | cmd->sdb.resid = resid; |
| 143 | } | 151 | } |
| 144 | 152 | ||
| 145 | static inline int scsi_get_resid(struct scsi_cmnd *cmd) | 153 | static inline int scsi_get_resid(struct scsi_cmnd *cmd) |
| 146 | { | 154 | { |
| 147 | return cmd->resid; | 155 | return cmd->sdb.resid; |
| 148 | } | 156 | } |
| 149 | 157 | ||
| 150 | #define scsi_for_each_sg(cmd, sg, nseg, __i) \ | 158 | #define scsi_for_each_sg(cmd, sg, nseg, __i) \ |
| 151 | for_each_sg(scsi_sglist(cmd), sg, nseg, __i) | 159 | for_each_sg(scsi_sglist(cmd), sg, nseg, __i) |
| 152 | 160 | ||
| 161 | static inline int scsi_bidi_cmnd(struct scsi_cmnd *cmd) | ||
| 162 | { | ||
| 163 | return blk_bidi_rq(cmd->request) && | ||
| 164 | (cmd->request->next_rq->special != NULL); | ||
| 165 | } | ||
| 166 | |||
| 167 | static inline struct scsi_data_buffer *scsi_in(struct scsi_cmnd *cmd) | ||
| 168 | { | ||
| 169 | return scsi_bidi_cmnd(cmd) ? | ||
| 170 | cmd->request->next_rq->special : &cmd->sdb; | ||
| 171 | } | ||
| 172 | |||
| 173 | static inline struct scsi_data_buffer *scsi_out(struct scsi_cmnd *cmd) | ||
| 174 | { | ||
| 175 | return &cmd->sdb; | ||
| 176 | } | ||
| 177 | |||
| 153 | #endif /* _SCSI_SCSI_CMND_H */ | 178 | #endif /* _SCSI_SCSI_CMND_H */ |
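With the buffer fields folded into struct scsi_data_buffer, low-level drivers are expected to go through the accessors rather than the removed cmd->use_sg/request_buffer members. A hedged sketch of the iteration pattern only; a real driver would map the list for DMA at this point:

```c
#include <scsi/scsi_cmnd.h>

static void example_walk_sglist(struct scsi_cmnd *cmd)
{
	struct scatterlist *sg;
	unsigned int total = 0;
	int i;

	scsi_for_each_sg(cmd, sg, scsi_sg_count(cmd), i)
		total += sg->length;

	/* For a well-formed command the segments cover scsi_bufflen(cmd). */
	WARN_ON(total != scsi_bufflen(cmd));
}
```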
diff --git a/include/scsi/scsi_eh.h b/include/scsi/scsi_eh.h index d21b8913ceb3..25071d5d9bf8 100644 --- a/include/scsi/scsi_eh.h +++ b/include/scsi/scsi_eh.h | |||
| @@ -68,16 +68,15 @@ extern int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len, | |||
| 68 | extern int scsi_reset_provider(struct scsi_device *, int); | 68 | extern int scsi_reset_provider(struct scsi_device *, int); |
| 69 | 69 | ||
| 70 | struct scsi_eh_save { | 70 | struct scsi_eh_save { |
| 71 | /* saved state */ | ||
| 71 | int result; | 72 | int result; |
| 72 | enum dma_data_direction data_direction; | 73 | enum dma_data_direction data_direction; |
| 73 | unsigned char cmd_len; | 74 | unsigned char cmd_len; |
| 74 | unsigned char cmnd[MAX_COMMAND_SIZE]; | 75 | unsigned char cmnd[MAX_COMMAND_SIZE]; |
| 76 | struct scsi_data_buffer sdb; | ||
| 77 | struct request *next_rq; | ||
| 75 | 78 | ||
| 76 | void *buffer; | 79 | /* new command support */ |
| 77 | unsigned bufflen; | ||
| 78 | unsigned short use_sg; | ||
| 79 | int resid; | ||
| 80 | |||
| 81 | struct scatterlist sense_sgl; | 80 | struct scatterlist sense_sgl; |
| 82 | }; | 81 | }; |
| 83 | 82 | ||
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 0fd4746ee39d..5c58d594126a 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h | |||
| @@ -39,9 +39,6 @@ struct blk_queue_tags; | |||
| 39 | #define DISABLE_CLUSTERING 0 | 39 | #define DISABLE_CLUSTERING 0 |
| 40 | #define ENABLE_CLUSTERING 1 | 40 | #define ENABLE_CLUSTERING 1 |
| 41 | 41 | ||
| 42 | #define DISABLE_SG_CHAINING 0 | ||
| 43 | #define ENABLE_SG_CHAINING 1 | ||
| 44 | |||
| 45 | enum scsi_eh_timer_return { | 42 | enum scsi_eh_timer_return { |
| 46 | EH_NOT_HANDLED, | 43 | EH_NOT_HANDLED, |
| 47 | EH_HANDLED, | 44 | EH_HANDLED, |
| @@ -136,9 +133,9 @@ struct scsi_host_template { | |||
| 136 | * the done callback is invoked. | 133 | * the done callback is invoked. |
| 137 | * | 134 | * |
| 138 | * This is called to inform the LLD to transfer | 135 | * This is called to inform the LLD to transfer |
| 139 | * cmd->request_bufflen bytes. The cmd->use_sg specifies the | 136 | scsi_bufflen(cmd) bytes. scsi_sg_count(cmd) specifies the |
| 140 | * number of scatterlist entries in the command and | 137 | number of scatterlist entries in the command and |
| 141 | * cmd->request_buffer contains the scatterlist. | 138 | * scsi_sglist(cmd) returns the scatterlist. |
| 142 | * | 139 | * |
| 143 | * return values: see queuecommand | 140 | * return values: see queuecommand |
| 144 | * | 141 | * |
| @@ -446,15 +443,6 @@ struct scsi_host_template { | |||
| 446 | unsigned ordered_tag:1; | 443 | unsigned ordered_tag:1; |
| 447 | 444 | ||
| 448 | /* | 445 | /* |
| 449 | * true if the low-level driver can support sg chaining. this | ||
| 450 | * will be removed eventually when all the drivers are | ||
| 451 | * converted to support sg chaining. | ||
| 452 | * | ||
| 453 | * Status: OBSOLETE | ||
| 454 | */ | ||
| 455 | unsigned use_sg_chaining:1; | ||
| 456 | |||
| 457 | /* | ||
| 458 | * Countdown for host blocking with no commands outstanding | 446 | * Countdown for host blocking with no commands outstanding |
| 459 | */ | 447 | */ |
| 460 | unsigned int max_host_blocked; | 448 | unsigned int max_host_blocked; |
| @@ -598,7 +586,6 @@ struct Scsi_Host { | |||
| 598 | unsigned unchecked_isa_dma:1; | 586 | unsigned unchecked_isa_dma:1; |
| 599 | unsigned use_clustering:1; | 587 | unsigned use_clustering:1; |
| 600 | unsigned use_blk_tcq:1; | 588 | unsigned use_blk_tcq:1; |
| 601 | unsigned use_sg_chaining:1; | ||
| 602 | 589 | ||
| 603 | /* | 590 | /* |
| 604 | * Host has requested that no further requests come through for the | 591 | * Host has requested that no further requests come through for the |
diff --git a/kernel/fork.c b/kernel/fork.c index 314f5101d2b0..05e0b6f4365b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -393,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm) | |||
| 393 | destroy_context(mm); | 393 | destroy_context(mm); |
| 394 | free_mm(mm); | 394 | free_mm(mm); |
| 395 | } | 395 | } |
| 396 | EXPORT_SYMBOL_GPL(__mmdrop); | ||
| 396 | 397 | ||
| 397 | /* | 398 | /* |
| 398 | * Decrement the use count and release all resources for an mm. | 399 | * Decrement the use count and release all resources for an mm. |
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index d4dc4eb48d95..a2241060113b 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c | |||
| @@ -348,6 +348,7 @@ static int cipso_v4_cache_check(const unsigned char *key, | |||
| 348 | atomic_inc(&entry->lsm_data->refcount); | 348 | atomic_inc(&entry->lsm_data->refcount); |
| 349 | secattr->cache = entry->lsm_data; | 349 | secattr->cache = entry->lsm_data; |
| 350 | secattr->flags |= NETLBL_SECATTR_CACHE; | 350 | secattr->flags |= NETLBL_SECATTR_CACHE; |
| 351 | secattr->type = NETLBL_NLTYPE_CIPSOV4; | ||
| 351 | if (prev_entry == NULL) { | 352 | if (prev_entry == NULL) { |
| 352 | spin_unlock_bh(&cipso_v4_cache[bkt].lock); | 353 | spin_unlock_bh(&cipso_v4_cache[bkt].lock); |
| 353 | return 0; | 354 | return 0; |
| @@ -865,7 +866,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, | |||
| 865 | } | 866 | } |
| 866 | 867 | ||
| 867 | for (;;) { | 868 | for (;;) { |
| 868 | host_spot = netlbl_secattr_catmap_walk(secattr->mls_cat, | 869 | host_spot = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, |
| 869 | host_spot + 1); | 870 | host_spot + 1); |
| 870 | if (host_spot < 0) | 871 | if (host_spot < 0) |
| 871 | break; | 872 | break; |
| @@ -948,7 +949,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, | |||
| 948 | return -EPERM; | 949 | return -EPERM; |
| 949 | break; | 950 | break; |
| 950 | } | 951 | } |
| 951 | ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat, | 952 | ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat, |
| 952 | host_spot, | 953 | host_spot, |
| 953 | GFP_ATOMIC); | 954 | GFP_ATOMIC); |
| 954 | if (ret_val != 0) | 955 | if (ret_val != 0) |
| @@ -1014,7 +1015,8 @@ static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def, | |||
| 1014 | u32 cat_iter = 0; | 1015 | u32 cat_iter = 0; |
| 1015 | 1016 | ||
| 1016 | for (;;) { | 1017 | for (;;) { |
| 1017 | cat = netlbl_secattr_catmap_walk(secattr->mls_cat, cat + 1); | 1018 | cat = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, |
| 1019 | cat + 1); | ||
| 1018 | if (cat < 0) | 1020 | if (cat < 0) |
| 1019 | break; | 1021 | break; |
| 1020 | if ((cat_iter + 2) > net_cat_len) | 1022 | if ((cat_iter + 2) > net_cat_len) |
| @@ -1049,7 +1051,7 @@ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def, | |||
| 1049 | u32 iter; | 1051 | u32 iter; |
| 1050 | 1052 | ||
| 1051 | for (iter = 0; iter < net_cat_len; iter += 2) { | 1053 | for (iter = 0; iter < net_cat_len; iter += 2) { |
| 1052 | ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat, | 1054 | ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat, |
| 1053 | ntohs(get_unaligned((__be16 *)&net_cat[iter])), | 1055 | ntohs(get_unaligned((__be16 *)&net_cat[iter])), |
| 1054 | GFP_ATOMIC); | 1056 | GFP_ATOMIC); |
| 1055 | if (ret_val != 0) | 1057 | if (ret_val != 0) |
| @@ -1130,7 +1132,8 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, | |||
| 1130 | return -ENOSPC; | 1132 | return -ENOSPC; |
| 1131 | 1133 | ||
| 1132 | for (;;) { | 1134 | for (;;) { |
| 1133 | iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1); | 1135 | iter = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, |
| 1136 | iter + 1); | ||
| 1134 | if (iter < 0) | 1137 | if (iter < 0) |
| 1135 | break; | 1138 | break; |
| 1136 | cat_size += (iter == 0 ? 0 : sizeof(u16)); | 1139 | cat_size += (iter == 0 ? 0 : sizeof(u16)); |
| @@ -1138,7 +1141,8 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, | |||
| 1138 | return -ENOSPC; | 1141 | return -ENOSPC; |
| 1139 | array[array_cnt++] = iter; | 1142 | array[array_cnt++] = iter; |
| 1140 | 1143 | ||
| 1141 | iter = netlbl_secattr_catmap_walk_rng(secattr->mls_cat, iter); | 1144 | iter = netlbl_secattr_catmap_walk_rng(secattr->attr.mls.cat, |
| 1145 | iter); | ||
| 1142 | if (iter < 0) | 1146 | if (iter < 0) |
| 1143 | return -EFAULT; | 1147 | return -EFAULT; |
| 1144 | cat_size += sizeof(u16); | 1148 | cat_size += sizeof(u16); |
| @@ -1191,7 +1195,7 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, | |||
| 1191 | else | 1195 | else |
| 1192 | cat_low = 0; | 1196 | cat_low = 0; |
| 1193 | 1197 | ||
| 1194 | ret_val = netlbl_secattr_catmap_setrng(secattr->mls_cat, | 1198 | ret_val = netlbl_secattr_catmap_setrng(secattr->attr.mls.cat, |
| 1195 | cat_low, | 1199 | cat_low, |
| 1196 | cat_high, | 1200 | cat_high, |
| 1197 | GFP_ATOMIC); | 1201 | GFP_ATOMIC); |
| @@ -1251,7 +1255,9 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, | |||
| 1251 | if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) | 1255 | if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) |
| 1252 | return -EPERM; | 1256 | return -EPERM; |
| 1253 | 1257 | ||
| 1254 | ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); | 1258 | ret_val = cipso_v4_map_lvl_hton(doi_def, |
| 1259 | secattr->attr.mls.lvl, | ||
| 1260 | &level); | ||
| 1255 | if (ret_val != 0) | 1261 | if (ret_val != 0) |
| 1256 | return ret_val; | 1262 | return ret_val; |
| 1257 | 1263 | ||
| @@ -1303,12 +1309,13 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, | |||
| 1303 | ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); | 1309 | ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); |
| 1304 | if (ret_val != 0) | 1310 | if (ret_val != 0) |
| 1305 | return ret_val; | 1311 | return ret_val; |
| 1306 | secattr->mls_lvl = level; | 1312 | secattr->attr.mls.lvl = level; |
| 1307 | secattr->flags |= NETLBL_SECATTR_MLS_LVL; | 1313 | secattr->flags |= NETLBL_SECATTR_MLS_LVL; |
| 1308 | 1314 | ||
| 1309 | if (tag_len > 4) { | 1315 | if (tag_len > 4) { |
| 1310 | secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); | 1316 | secattr->attr.mls.cat = |
| 1311 | if (secattr->mls_cat == NULL) | 1317 | netlbl_secattr_catmap_alloc(GFP_ATOMIC); |
| 1318 | if (secattr->attr.mls.cat == NULL) | ||
| 1312 | return -ENOMEM; | 1319 | return -ENOMEM; |
| 1313 | 1320 | ||
| 1314 | ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def, | 1321 | ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def, |
| @@ -1316,7 +1323,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, | |||
| 1316 | tag_len - 4, | 1323 | tag_len - 4, |
| 1317 | secattr); | 1324 | secattr); |
| 1318 | if (ret_val != 0) { | 1325 | if (ret_val != 0) { |
| 1319 | netlbl_secattr_catmap_free(secattr->mls_cat); | 1326 | netlbl_secattr_catmap_free(secattr->attr.mls.cat); |
| 1320 | return ret_val; | 1327 | return ret_val; |
| 1321 | } | 1328 | } |
| 1322 | 1329 | ||
| @@ -1350,7 +1357,9 @@ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def, | |||
| 1350 | if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) | 1357 | if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) |
| 1351 | return -EPERM; | 1358 | return -EPERM; |
| 1352 | 1359 | ||
| 1353 | ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); | 1360 | ret_val = cipso_v4_map_lvl_hton(doi_def, |
| 1361 | secattr->attr.mls.lvl, | ||
| 1362 | &level); | ||
| 1354 | if (ret_val != 0) | 1363 | if (ret_val != 0) |
| 1355 | return ret_val; | 1364 | return ret_val; |
| 1356 | 1365 | ||
| @@ -1396,12 +1405,13 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, | |||
| 1396 | ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); | 1405 | ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); |
| 1397 | if (ret_val != 0) | 1406 | if (ret_val != 0) |
| 1398 | return ret_val; | 1407 | return ret_val; |
| 1399 | secattr->mls_lvl = level; | 1408 | secattr->attr.mls.lvl = level; |
| 1400 | secattr->flags |= NETLBL_SECATTR_MLS_LVL; | 1409 | secattr->flags |= NETLBL_SECATTR_MLS_LVL; |
| 1401 | 1410 | ||
| 1402 | if (tag_len > 4) { | 1411 | if (tag_len > 4) { |
| 1403 | secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); | 1412 | secattr->attr.mls.cat = |
| 1404 | if (secattr->mls_cat == NULL) | 1413 | netlbl_secattr_catmap_alloc(GFP_ATOMIC); |
| 1414 | if (secattr->attr.mls.cat == NULL) | ||
| 1405 | return -ENOMEM; | 1415 | return -ENOMEM; |
| 1406 | 1416 | ||
| 1407 | ret_val = cipso_v4_map_cat_enum_ntoh(doi_def, | 1417 | ret_val = cipso_v4_map_cat_enum_ntoh(doi_def, |
| @@ -1409,7 +1419,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, | |||
| 1409 | tag_len - 4, | 1419 | tag_len - 4, |
| 1410 | secattr); | 1420 | secattr); |
| 1411 | if (ret_val != 0) { | 1421 | if (ret_val != 0) { |
| 1412 | netlbl_secattr_catmap_free(secattr->mls_cat); | 1422 | netlbl_secattr_catmap_free(secattr->attr.mls.cat); |
| 1413 | return ret_val; | 1423 | return ret_val; |
| 1414 | } | 1424 | } |
| 1415 | 1425 | ||
| @@ -1443,7 +1453,9 @@ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def, | |||
| 1443 | if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) | 1453 | if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) |
| 1444 | return -EPERM; | 1454 | return -EPERM; |
| 1445 | 1455 | ||
| 1446 | ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); | 1456 | ret_val = cipso_v4_map_lvl_hton(doi_def, |
| 1457 | secattr->attr.mls.lvl, | ||
| 1458 | &level); | ||
| 1447 | if (ret_val != 0) | 1459 | if (ret_val != 0) |
| 1448 | return ret_val; | 1460 | return ret_val; |
| 1449 | 1461 | ||
| @@ -1488,12 +1500,13 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, | |||
| 1488 | ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); | 1500 | ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); |
| 1489 | if (ret_val != 0) | 1501 | if (ret_val != 0) |
| 1490 | return ret_val; | 1502 | return ret_val; |
| 1491 | secattr->mls_lvl = level; | 1503 | secattr->attr.mls.lvl = level; |
| 1492 | secattr->flags |= NETLBL_SECATTR_MLS_LVL; | 1504 | secattr->flags |= NETLBL_SECATTR_MLS_LVL; |
| 1493 | 1505 | ||
| 1494 | if (tag_len > 4) { | 1506 | if (tag_len > 4) { |
| 1495 | secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); | 1507 | secattr->attr.mls.cat = |
| 1496 | if (secattr->mls_cat == NULL) | 1508 | netlbl_secattr_catmap_alloc(GFP_ATOMIC); |
| 1509 | if (secattr->attr.mls.cat == NULL) | ||
| 1497 | return -ENOMEM; | 1510 | return -ENOMEM; |
| 1498 | 1511 | ||
| 1499 | ret_val = cipso_v4_map_cat_rng_ntoh(doi_def, | 1512 | ret_val = cipso_v4_map_cat_rng_ntoh(doi_def, |
| @@ -1501,7 +1514,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, | |||
| 1501 | tag_len - 4, | 1514 | tag_len - 4, |
| 1502 | secattr); | 1515 | secattr); |
| 1503 | if (ret_val != 0) { | 1516 | if (ret_val != 0) { |
| 1504 | netlbl_secattr_catmap_free(secattr->mls_cat); | 1517 | netlbl_secattr_catmap_free(secattr->attr.mls.cat); |
| 1505 | return ret_val; | 1518 | return ret_val; |
| 1506 | } | 1519 | } |
| 1507 | 1520 | ||
| @@ -1850,6 +1863,8 @@ static int cipso_v4_getattr(const unsigned char *cipso, | |||
| 1850 | ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); | 1863 | ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); |
| 1851 | break; | 1864 | break; |
| 1852 | } | 1865 | } |
| 1866 | if (ret_val == 0) | ||
| 1867 | secattr->type = NETLBL_NLTYPE_CIPSOV4; | ||
| 1853 | 1868 | ||
| 1854 | getattr_return: | 1869 | getattr_return: |
| 1855 | rcu_read_unlock(); | 1870 | rcu_read_unlock(); |
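The hunks above move the MLS fields behind secattr->attr.mls and stamp secattr->type on success, so every consumer must be updated in step. A minimal consumer sketch (use_level() is a hypothetical helper; the other names come from the hunks above):

	struct netlbl_lsm_secattr secattr;

	netlbl_secattr_init(&secattr);
	if (cipso_v4_skbuff_getattr(skb, &secattr) == 0 &&
	    secattr.type == NETLBL_NLTYPE_CIPSOV4 &&
	    (secattr.flags & NETLBL_SECATTR_MLS_LVL))
		use_level(secattr.attr.mls.lvl);	/* was secattr->mls_lvl */
	netlbl_secattr_destroy(&secattr);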
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index b11b3ecbb39d..7708e2084ce2 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c | |||
| @@ -72,12 +72,13 @@ static bool checkentry_selinux(struct xt_secmark_target_info *info) | |||
| 72 | return false; | 72 | return false; |
| 73 | } | 73 | } |
| 74 | 74 | ||
| 75 | err = selinux_relabel_packet_permission(sel->selsid); | 75 | err = selinux_secmark_relabel_packet_permission(sel->selsid); |
| 76 | if (err) { | 76 | if (err) { |
| 77 | printk(KERN_INFO PFX "unable to obtain relabeling permission\n"); | 77 | printk(KERN_INFO PFX "unable to obtain relabeling permission\n"); |
| 78 | return false; | 78 | return false; |
| 79 | } | 79 | } |
| 80 | 80 | ||
| 81 | selinux_secmark_refcount_inc(); | ||
| 81 | return true; | 82 | return true; |
| 82 | } | 83 | } |
| 83 | 84 | ||
| @@ -110,11 +111,20 @@ secmark_tg_check(const char *tablename, const void *entry, | |||
| 110 | return true; | 111 | return true; |
| 111 | } | 112 | } |
| 112 | 113 | ||
| 114 | void secmark_tg_destroy(const struct xt_target *target, void *targinfo) | ||
| 115 | { | ||
| 116 | switch (mode) { | ||
| 117 | case SECMARK_MODE_SEL: | ||
| 118 | selinux_secmark_refcount_dec(); | ||
| 119 | } | ||
| 120 | } | ||
| 121 | |||
| 113 | static struct xt_target secmark_tg_reg[] __read_mostly = { | 122 | static struct xt_target secmark_tg_reg[] __read_mostly = { |
| 114 | { | 123 | { |
| 115 | .name = "SECMARK", | 124 | .name = "SECMARK", |
| 116 | .family = AF_INET, | 125 | .family = AF_INET, |
| 117 | .checkentry = secmark_tg_check, | 126 | .checkentry = secmark_tg_check, |
| 127 | .destroy = secmark_tg_destroy, | ||
| 118 | .target = secmark_tg, | 128 | .target = secmark_tg, |
| 119 | .targetsize = sizeof(struct xt_secmark_target_info), | 129 | .targetsize = sizeof(struct xt_secmark_target_info), |
| 120 | .table = "mangle", | 130 | .table = "mangle", |
| @@ -124,6 +134,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { | |||
| 124 | .name = "SECMARK", | 134 | .name = "SECMARK", |
| 125 | .family = AF_INET6, | 135 | .family = AF_INET6, |
| 126 | .checkentry = secmark_tg_check, | 136 | .checkentry = secmark_tg_check, |
| 137 | .destroy = secmark_tg_destroy, | ||
| 127 | .target = secmark_tg, | 138 | .target = secmark_tg, |
| 128 | .targetsize = sizeof(struct xt_secmark_target_info), | 139 | .targetsize = sizeof(struct xt_secmark_target_info), |
| 129 | .table = "mangle", | 140 | .table = "mangle", |
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index ba0ca8d3f77d..becf91a952ae 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c | |||
| @@ -38,6 +38,7 @@ | |||
| 38 | #include <net/genetlink.h> | 38 | #include <net/genetlink.h> |
| 39 | #include <net/netlabel.h> | 39 | #include <net/netlabel.h> |
| 40 | #include <net/cipso_ipv4.h> | 40 | #include <net/cipso_ipv4.h> |
| 41 | #include <asm/atomic.h> | ||
| 41 | 42 | ||
| 42 | #include "netlabel_user.h" | 43 | #include "netlabel_user.h" |
| 43 | #include "netlabel_cipso_v4.h" | 44 | #include "netlabel_cipso_v4.h" |
| @@ -421,7 +422,7 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info) | |||
| 421 | break; | 422 | break; |
| 422 | } | 423 | } |
| 423 | if (ret_val == 0) | 424 | if (ret_val == 0) |
| 424 | netlbl_mgmt_protocount_inc(); | 425 | atomic_inc(&netlabel_mgmt_protocount); |
| 425 | 426 | ||
| 426 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD, | 427 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD, |
| 427 | &audit_info); | 428 | &audit_info); |
| @@ -698,7 +699,7 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) | |||
| 698 | &audit_info, | 699 | &audit_info, |
| 699 | netlbl_cipsov4_doi_free); | 700 | netlbl_cipsov4_doi_free); |
| 700 | if (ret_val == 0) | 701 | if (ret_val == 0) |
| 701 | netlbl_mgmt_protocount_dec(); | 702 | atomic_dec(&netlabel_mgmt_protocount); |
| 702 | 703 | ||
| 703 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_DEL, | 704 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_DEL, |
| 704 | &audit_info); | 705 | &audit_info); |
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index b3675bd7db33..9a8ea0195c4f 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c | |||
| @@ -54,9 +54,6 @@ struct netlbl_domhsh_tbl { | |||
| 54 | * hash table should be okay */ | 54 | * hash table should be okay */ |
| 55 | static DEFINE_SPINLOCK(netlbl_domhsh_lock); | 55 | static DEFINE_SPINLOCK(netlbl_domhsh_lock); |
| 56 | static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; | 56 | static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; |
| 57 | |||
| 58 | /* Default domain mapping */ | ||
| 59 | static DEFINE_SPINLOCK(netlbl_domhsh_def_lock); | ||
| 60 | static struct netlbl_dom_map *netlbl_domhsh_def = NULL; | 57 | static struct netlbl_dom_map *netlbl_domhsh_def = NULL; |
| 61 | 58 | ||
| 62 | /* | 59 | /* |
| @@ -109,17 +106,14 @@ static u32 netlbl_domhsh_hash(const char *key) | |||
| 109 | /** | 106 | /** |
| 110 | * netlbl_domhsh_search - Search for a domain entry | 107 | * netlbl_domhsh_search - Search for a domain entry |
| 111 | * @domain: the domain | 108 | * @domain: the domain |
| 112 | * @def: return default if no match is found | ||
| 113 | * | 109 | * |
| 114 | * Description: | 110 | * Description: |
| 115 | * Searches the domain hash table and returns a pointer to the hash table | 111 | * Searches the domain hash table and returns a pointer to the hash table |
| 116 | * entry if found, otherwise NULL is returned. If @def is non-zero and a | 112 | * entry if found, otherwise NULL is returned. The caller is responsible for
| 117 | * match is not found in the domain hash table the default mapping is returned | 113 | * the rcu hash table locks (i.e. the caller must call rcu_read_[un]lock()).
| 118 | * if it exists. The caller is responsibile for the rcu hash table locks | ||
| 119 | * (i.e. the caller much call rcu_read_[un]lock()). | ||
| 120 | * | 114 | * |
| 121 | */ | 115 | */ |
| 122 | static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u32 def) | 116 | static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) |
| 123 | { | 117 | { |
| 124 | u32 bkt; | 118 | u32 bkt; |
| 125 | struct netlbl_dom_map *iter; | 119 | struct netlbl_dom_map *iter; |
| @@ -133,10 +127,31 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u32 def) | |||
| 133 | return iter; | 127 | return iter; |
| 134 | } | 128 | } |
| 135 | 129 | ||
| 136 | if (def != 0) { | 130 | return NULL; |
| 137 | iter = rcu_dereference(netlbl_domhsh_def); | 131 | } |
| 138 | if (iter != NULL && iter->valid) | 132 | |
| 139 | return iter; | 133 | /** |
| 134 | * netlbl_domhsh_search_def - Search for a domain entry | ||
| 135 | * @domain: the domain | ||
| 136 | * @def: return default if no match is found | ||
| 137 | * | ||
| 138 | * Description: | ||
| 139 | * Searches the domain hash table and returns a pointer to the hash table | ||
| 140 | * entry if an exact match is found; if an exact match is not present in the | ||
| 141 | * hash table then the default entry is returned, if valid, otherwise NULL is | ||
| 142 | * returned. The caller is responsible for the rcu hash table locks | ||
| 143 | * (i.e. the caller must call rcu_read_[un]lock()). | ||
| 144 | * | ||
| 145 | */ | ||
| 146 | static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain) | ||
| 147 | { | ||
| 148 | struct netlbl_dom_map *entry; | ||
| 149 | |||
| 150 | entry = netlbl_domhsh_search(domain); | ||
| 151 | if (entry == NULL) { | ||
| 152 | entry = rcu_dereference(netlbl_domhsh_def); | ||
| 153 | if (entry != NULL && entry->valid) | ||
| 154 | return entry; | ||
| 140 | } | 155 | } |
| 141 | 156 | ||
| 142 | return NULL; | 157 | return NULL; |
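Splitting the lookup keeps each caller's intent explicit; an illustrative summary of the resulting call patterns (both functions assume the caller holds rcu_read_lock()):

	/* exact match only, e.g. the duplicate check in netlbl_domhsh_add(): */
	entry = netlbl_domhsh_search(domain);
	/* exact match with fallback to the default mapping,
	 * e.g. netlbl_domhsh_getentry(): */
	entry = netlbl_domhsh_search_def(domain);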
| @@ -221,24 +236,22 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, | |||
| 221 | INIT_RCU_HEAD(&entry->rcu); | 236 | INIT_RCU_HEAD(&entry->rcu); |
| 222 | 237 | ||
| 223 | rcu_read_lock(); | 238 | rcu_read_lock(); |
| 239 | spin_lock(&netlbl_domhsh_lock); | ||
| 224 | if (entry->domain != NULL) { | 240 | if (entry->domain != NULL) { |
| 225 | bkt = netlbl_domhsh_hash(entry->domain); | 241 | bkt = netlbl_domhsh_hash(entry->domain); |
| 226 | spin_lock(&netlbl_domhsh_lock); | 242 | if (netlbl_domhsh_search(entry->domain) == NULL) |
| 227 | if (netlbl_domhsh_search(entry->domain, 0) == NULL) | ||
| 228 | list_add_tail_rcu(&entry->list, | 243 | list_add_tail_rcu(&entry->list, |
| 229 | &rcu_dereference(netlbl_domhsh)->tbl[bkt]); | 244 | &rcu_dereference(netlbl_domhsh)->tbl[bkt]); |
| 230 | else | 245 | else |
| 231 | ret_val = -EEXIST; | 246 | ret_val = -EEXIST; |
| 232 | spin_unlock(&netlbl_domhsh_lock); | ||
| 233 | } else { | 247 | } else { |
| 234 | INIT_LIST_HEAD(&entry->list); | 248 | INIT_LIST_HEAD(&entry->list); |
| 235 | spin_lock(&netlbl_domhsh_def_lock); | ||
| 236 | if (rcu_dereference(netlbl_domhsh_def) == NULL) | 249 | if (rcu_dereference(netlbl_domhsh_def) == NULL) |
| 237 | rcu_assign_pointer(netlbl_domhsh_def, entry); | 250 | rcu_assign_pointer(netlbl_domhsh_def, entry); |
| 238 | else | 251 | else |
| 239 | ret_val = -EEXIST; | 252 | ret_val = -EEXIST; |
| 240 | spin_unlock(&netlbl_domhsh_def_lock); | ||
| 241 | } | 253 | } |
| 254 | spin_unlock(&netlbl_domhsh_lock); | ||
| 242 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); | 255 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); |
| 243 | if (audit_buf != NULL) { | 256 | if (audit_buf != NULL) { |
| 244 | audit_log_format(audit_buf, | 257 | audit_log_format(audit_buf, |
| @@ -307,7 +320,10 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) | |||
| 307 | struct audit_buffer *audit_buf; | 320 | struct audit_buffer *audit_buf; |
| 308 | 321 | ||
| 309 | rcu_read_lock(); | 322 | rcu_read_lock(); |
| 310 | entry = netlbl_domhsh_search(domain, (domain != NULL ? 0 : 1)); | 323 | if (domain) |
| 324 | entry = netlbl_domhsh_search(domain); | ||
| 325 | else | ||
| 326 | entry = netlbl_domhsh_search_def(domain); | ||
| 311 | if (entry == NULL) | 327 | if (entry == NULL) |
| 312 | goto remove_return; | 328 | goto remove_return; |
| 313 | switch (entry->type) { | 329 | switch (entry->type) { |
| @@ -316,23 +332,16 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) | |||
| 316 | entry->domain); | 332 | entry->domain); |
| 317 | break; | 333 | break; |
| 318 | } | 334 | } |
| 319 | if (entry != rcu_dereference(netlbl_domhsh_def)) { | 335 | spin_lock(&netlbl_domhsh_lock); |
| 320 | spin_lock(&netlbl_domhsh_lock); | 336 | if (entry->valid) { |
| 321 | if (entry->valid) { | 337 | entry->valid = 0; |
| 322 | entry->valid = 0; | 338 | if (entry != rcu_dereference(netlbl_domhsh_def)) |
| 323 | list_del_rcu(&entry->list); | 339 | list_del_rcu(&entry->list); |
| 324 | ret_val = 0; | 340 | else |
| 325 | } | ||
| 326 | spin_unlock(&netlbl_domhsh_lock); | ||
| 327 | } else { | ||
| 328 | spin_lock(&netlbl_domhsh_def_lock); | ||
| 329 | if (entry->valid) { | ||
| 330 | entry->valid = 0; | ||
| 331 | rcu_assign_pointer(netlbl_domhsh_def, NULL); | 341 | rcu_assign_pointer(netlbl_domhsh_def, NULL); |
| 332 | ret_val = 0; | 342 | ret_val = 0; |
| 333 | } | ||
| 334 | spin_unlock(&netlbl_domhsh_def_lock); | ||
| 335 | } | 343 | } |
| 344 | spin_unlock(&netlbl_domhsh_lock); | ||
| 336 | 345 | ||
| 337 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); | 346 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); |
| 338 | if (audit_buf != NULL) { | 347 | if (audit_buf != NULL) { |
| @@ -377,7 +386,7 @@ int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info) | |||
| 377 | */ | 386 | */ |
| 378 | struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) | 387 | struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) |
| 379 | { | 388 | { |
| 380 | return netlbl_domhsh_search(domain, 1); | 389 | return netlbl_domhsh_search_def(domain); |
| 381 | } | 390 | } |
| 382 | 391 | ||
| 383 | /** | 392 | /** |
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 4f50949722a9..c69e3e1f05c3 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c | |||
| @@ -34,6 +34,7 @@ | |||
| 34 | #include <net/netlabel.h> | 34 | #include <net/netlabel.h> |
| 35 | #include <net/cipso_ipv4.h> | 35 | #include <net/cipso_ipv4.h> |
| 36 | #include <asm/bug.h> | 36 | #include <asm/bug.h> |
| 37 | #include <asm/atomic.h> | ||
| 37 | 38 | ||
| 38 | #include "netlabel_domainhash.h" | 39 | #include "netlabel_domainhash.h" |
| 39 | #include "netlabel_unlabeled.h" | 40 | #include "netlabel_unlabeled.h" |
| @@ -262,7 +263,7 @@ int netlbl_enabled(void) | |||
| 262 | /* At some point we probably want to expose this mechanism to the user | 263 | /* At some point we probably want to expose this mechanism to the user |
| 263 | * as well so that admins can toggle NetLabel regardless of the | 264 | * as well so that admins can toggle NetLabel regardless of the |
| 264 | * configuration */ | 265 | * configuration */ |
| 265 | return (netlbl_mgmt_protocount_value() > 0 ? 1 : 0); | 266 | return (atomic_read(&netlabel_mgmt_protocount) > 0); |
| 266 | } | 267 | } |
| 267 | 268 | ||
| 268 | /** | 269 | /** |
| @@ -311,7 +312,7 @@ socket_setattr_return: | |||
| 311 | * @secattr: the security attributes | 312 | * @secattr: the security attributes |
| 312 | * | 313 | * |
| 313 | * Description: | 314 | * Description: |
| 314 | * Examines the given sock to see any NetLabel style labeling has been | 315 | * Examines the given sock to see if any NetLabel style labeling has been |
| 315 | * applied to the sock, if so it parses the socket label and returns the | 316 | * applied to the sock, if so it parses the socket label and returns the |
| 316 | * security attributes in @secattr. Returns zero on success, negative values | 317 | * security attributes in @secattr. Returns zero on success, negative values |
| 317 | * on failure. | 318 | * on failure. |
| @@ -319,18 +320,13 @@ socket_setattr_return: | |||
| 319 | */ | 320 | */ |
| 320 | int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) | 321 | int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) |
| 321 | { | 322 | { |
| 322 | int ret_val; | 323 | return cipso_v4_sock_getattr(sk, secattr); |
| 323 | |||
| 324 | ret_val = cipso_v4_sock_getattr(sk, secattr); | ||
| 325 | if (ret_val == 0) | ||
| 326 | return 0; | ||
| 327 | |||
| 328 | return netlbl_unlabel_getattr(secattr); | ||
| 329 | } | 324 | } |
| 330 | 325 | ||
| 331 | /** | 326 | /** |
| 332 | * netlbl_skbuff_getattr - Determine the security attributes of a packet | 327 | * netlbl_skbuff_getattr - Determine the security attributes of a packet |
| 333 | * @skb: the packet | 328 | * @skb: the packet |
| 329 | * @family: protocol family | ||
| 334 | * @secattr: the security attributes | 330 | * @secattr: the security attributes |
| 335 | * | 331 | * |
| 336 | * Description: | 332 | * Description: |
| @@ -341,13 +337,14 @@ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) | |||
| 341 | * | 337 | * |
| 342 | */ | 338 | */ |
| 343 | int netlbl_skbuff_getattr(const struct sk_buff *skb, | 339 | int netlbl_skbuff_getattr(const struct sk_buff *skb, |
| 340 | u16 family, | ||
| 344 | struct netlbl_lsm_secattr *secattr) | 341 | struct netlbl_lsm_secattr *secattr) |
| 345 | { | 342 | { |
| 346 | if (CIPSO_V4_OPTEXIST(skb) && | 343 | if (CIPSO_V4_OPTEXIST(skb) && |
| 347 | cipso_v4_skbuff_getattr(skb, secattr) == 0) | 344 | cipso_v4_skbuff_getattr(skb, secattr) == 0) |
| 348 | return 0; | 345 | return 0; |
| 349 | 346 | ||
| 350 | return netlbl_unlabel_getattr(secattr); | 347 | return netlbl_unlabel_getattr(skb, family, secattr); |
| 351 | } | 348 | } |
| 352 | 349 | ||
| 353 | /** | 350 | /** |
| @@ -431,6 +428,10 @@ static int __init netlbl_init(void) | |||
| 431 | if (ret_val != 0) | 428 | if (ret_val != 0) |
| 432 | goto init_failure; | 429 | goto init_failure; |
| 433 | 430 | ||
| 431 | ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE); | ||
| 432 | if (ret_val != 0) | ||
| 433 | goto init_failure; | ||
| 434 | |||
| 434 | ret_val = netlbl_netlink_init(); | 435 | ret_val = netlbl_netlink_init(); |
| 435 | if (ret_val != 0) | 436 | if (ret_val != 0) |
| 436 | goto init_failure; | 437 | goto init_failure; |
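A sketch of an LSM-side caller under the new netlbl_skbuff_getattr() signature (the NETLBL_NLTYPE_UNLABELED check is an assumption about how the type field set elsewhere in this series would be consumed):

	struct netlbl_lsm_secattr secattr;
	int rc;

	netlbl_secattr_init(&secattr);
	rc = netlbl_skbuff_getattr(skb, PF_INET, &secattr);	/* family is new */
	if (rc == 0 && secattr.type == NETLBL_NLTYPE_UNLABELED)
		;	/* packet matched a static/default unlabeled entry */
	netlbl_secattr_destroy(&secattr);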
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index 9c41464d58d1..e2258dc3c845 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c | |||
| @@ -37,14 +37,14 @@ | |||
| 37 | #include <net/genetlink.h> | 37 | #include <net/genetlink.h> |
| 38 | #include <net/netlabel.h> | 38 | #include <net/netlabel.h> |
| 39 | #include <net/cipso_ipv4.h> | 39 | #include <net/cipso_ipv4.h> |
| 40 | #include <asm/atomic.h> | ||
| 40 | 41 | ||
| 41 | #include "netlabel_domainhash.h" | 42 | #include "netlabel_domainhash.h" |
| 42 | #include "netlabel_user.h" | 43 | #include "netlabel_user.h" |
| 43 | #include "netlabel_mgmt.h" | 44 | #include "netlabel_mgmt.h" |
| 44 | 45 | ||
| 45 | /* NetLabel configured protocol count */ | 46 | /* NetLabel configured protocol counter */ |
| 46 | static DEFINE_SPINLOCK(netlabel_mgmt_protocount_lock); | 47 | atomic_t netlabel_mgmt_protocount = ATOMIC_INIT(0); |
| 47 | static u32 netlabel_mgmt_protocount = 0; | ||
| 48 | 48 | ||
| 49 | /* Argument struct for netlbl_domhsh_walk() */ | 49 | /* Argument struct for netlbl_domhsh_walk() */ |
| 50 | struct netlbl_domhsh_walk_arg { | 50 | struct netlbl_domhsh_walk_arg { |
| @@ -71,63 +71,6 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { | |||
| 71 | }; | 71 | }; |
| 72 | 72 | ||
| 73 | /* | 73 | /* |
| 74 | * NetLabel Misc Management Functions | ||
| 75 | */ | ||
| 76 | |||
| 77 | /** | ||
| 78 | * netlbl_mgmt_protocount_inc - Increment the configured labeled protocol count | ||
| 79 | * | ||
| 80 | * Description: | ||
| 81 | * Increment the number of labeled protocol configurations in the current | ||
| 82 | * NetLabel configuration. Keep track of this for use in determining if | ||
| 83 | * NetLabel label enforcement should be active/enabled or not in the LSM. | ||
| 84 | * | ||
| 85 | */ | ||
| 86 | void netlbl_mgmt_protocount_inc(void) | ||
| 87 | { | ||
| 88 | spin_lock(&netlabel_mgmt_protocount_lock); | ||
| 89 | netlabel_mgmt_protocount++; | ||
| 90 | spin_unlock(&netlabel_mgmt_protocount_lock); | ||
| 91 | } | ||
| 92 | |||
| 93 | /** | ||
| 94 | * netlbl_mgmt_protocount_dec - Decrement the configured labeled protocol count | ||
| 95 | * | ||
| 96 | * Description: | ||
| 97 | * Decrement the number of labeled protocol configurations in the current | ||
| 98 | * NetLabel configuration. Keep track of this for use in determining if | ||
| 99 | * NetLabel label enforcement should be active/enabled or not in the LSM. | ||
| 100 | * | ||
| 101 | */ | ||
| 102 | void netlbl_mgmt_protocount_dec(void) | ||
| 103 | { | ||
| 104 | spin_lock(&netlabel_mgmt_protocount_lock); | ||
| 105 | if (netlabel_mgmt_protocount > 0) | ||
| 106 | netlabel_mgmt_protocount--; | ||
| 107 | spin_unlock(&netlabel_mgmt_protocount_lock); | ||
| 108 | } | ||
| 109 | |||
| 110 | /** | ||
| 111 | * netlbl_mgmt_protocount_value - Return the number of configured protocols | ||
| 112 | * | ||
| 113 | * Description: | ||
| 114 | * Return the number of labeled protocols in the current NetLabel | ||
| 115 | * configuration. This value is useful in determining if NetLabel label | ||
| 116 | * enforcement should be active/enabled or not in the LSM. | ||
| 117 | * | ||
| 118 | */ | ||
| 119 | u32 netlbl_mgmt_protocount_value(void) | ||
| 120 | { | ||
| 121 | u32 val; | ||
| 122 | |||
| 123 | rcu_read_lock(); | ||
| 124 | val = netlabel_mgmt_protocount; | ||
| 125 | rcu_read_unlock(); | ||
| 126 | |||
| 127 | return val; | ||
| 128 | } | ||
| 129 | |||
| 130 | /* | ||
| 131 | * NetLabel Command Handlers | 74 | * NetLabel Command Handlers |
| 132 | */ | 75 | */ |
| 133 | 76 | ||
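With the spinlock-protected counter and its helpers gone, callers touch the atomic_t directly; a minimal sketch of the replacement pattern:

	atomic_inc(&netlabel_mgmt_protocount);	/* was netlbl_mgmt_protocount_inc() */
	atomic_dec(&netlabel_mgmt_protocount);	/* was netlbl_mgmt_protocount_dec() */
	if (atomic_read(&netlabel_mgmt_protocount) > 0)	/* was ..._value() > 0 */
		;	/* at least one labeled protocol is configured */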
diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h index ccb2b3923591..a43bff169d6b 100644 --- a/net/netlabel/netlabel_mgmt.h +++ b/net/netlabel/netlabel_mgmt.h | |||
| @@ -32,6 +32,7 @@ | |||
| 32 | #define _NETLABEL_MGMT_H | 32 | #define _NETLABEL_MGMT_H |
| 33 | 33 | ||
| 34 | #include <net/netlabel.h> | 34 | #include <net/netlabel.h> |
| 35 | #include <asm/atomic.h> | ||
| 35 | 36 | ||
| 36 | /* | 37 | /* |
| 37 | * The following NetLabel payloads are supported by the management interface. | 38 | * The following NetLabel payloads are supported by the management interface. |
| @@ -168,9 +169,7 @@ enum { | |||
| 168 | /* NetLabel protocol functions */ | 169 | /* NetLabel protocol functions */ |
| 169 | int netlbl_mgmt_genl_init(void); | 170 | int netlbl_mgmt_genl_init(void); |
| 170 | 171 | ||
| 171 | /* NetLabel misc management functions */ | 172 | /* NetLabel configured protocol reference counter */ |
| 172 | void netlbl_mgmt_protocount_inc(void); | 173 | extern atomic_t netlabel_mgmt_protocount; |
| 173 | void netlbl_mgmt_protocount_dec(void); | ||
| 174 | u32 netlbl_mgmt_protocount_value(void); | ||
| 175 | 174 | ||
| 176 | #endif | 175 | #endif |
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 348292450deb..42e81fd8cc49 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c | |||
| @@ -10,7 +10,7 @@ | |||
| 10 | */ | 10 | */ |
| 11 | 11 | ||
| 12 | /* | 12 | /* |
| 13 | * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 | 13 | * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2007 |
| 14 | * | 14 | * |
| 15 | * This program is free software; you can redistribute it and/or modify | 15 | * This program is free software; you can redistribute it and/or modify |
| 16 | * it under the terms of the GNU General Public License as published by | 16 | * it under the terms of the GNU General Public License as published by |
| @@ -36,22 +36,92 @@ | |||
| 36 | #include <linux/string.h> | 36 | #include <linux/string.h> |
| 37 | #include <linux/skbuff.h> | 37 | #include <linux/skbuff.h> |
| 38 | #include <linux/audit.h> | 38 | #include <linux/audit.h> |
| 39 | #include <linux/in.h> | ||
| 40 | #include <linux/in6.h> | ||
| 41 | #include <linux/ip.h> | ||
| 42 | #include <linux/ipv6.h> | ||
| 43 | #include <linux/notifier.h> | ||
| 44 | #include <linux/netdevice.h> | ||
| 45 | #include <linux/security.h> | ||
| 39 | #include <net/sock.h> | 46 | #include <net/sock.h> |
| 40 | #include <net/netlink.h> | 47 | #include <net/netlink.h> |
| 41 | #include <net/genetlink.h> | 48 | #include <net/genetlink.h> |
| 42 | 49 | #include <net/ip.h> | |
| 50 | #include <net/ipv6.h> | ||
| 51 | #include <net/net_namespace.h> | ||
| 43 | #include <net/netlabel.h> | 52 | #include <net/netlabel.h> |
| 44 | #include <asm/bug.h> | 53 | #include <asm/bug.h> |
| 54 | #include <asm/atomic.h> | ||
| 45 | 55 | ||
| 46 | #include "netlabel_user.h" | 56 | #include "netlabel_user.h" |
| 47 | #include "netlabel_domainhash.h" | 57 | #include "netlabel_domainhash.h" |
| 48 | #include "netlabel_unlabeled.h" | 58 | #include "netlabel_unlabeled.h" |
| 59 | #include "netlabel_mgmt.h" | ||
| 60 | |||
| 61 | /* NOTE: at present we always use init's network namespace since we don't | ||
| 62 | * presently support different namespaces even though the majority of | ||
| 63 | * the functions in this file are "namespace safe" */ | ||
| 64 | |||
| 65 | /* The unlabeled connection hash table which we use to map network interfaces | ||
| 66 | * and addresses of unlabeled packets to a user specified secid value for the | ||
| 67 | * LSM. The hash table is used to lookup the network interface entry | ||
| 68 | * (struct netlbl_unlhsh_iface) and then the interface entry is used to | ||
| 69 | * lookup an IP address match from an ordered list. If a network interface | ||
| 70 | * match cannot be found in the hash table then the default entry | ||
| 71 | * (netlbl_unlhsh_def) is used. The IP address entry list | ||
| 72 | * (struct netlbl_unlhsh_addr) is ordered such that the entries with a | ||
| 73 | * larger netmask come first. | ||
| 74 | */ | ||
| 75 | struct netlbl_unlhsh_tbl { | ||
| 76 | struct list_head *tbl; | ||
| 77 | u32 size; | ||
| 78 | }; | ||
| 79 | struct netlbl_unlhsh_addr4 { | ||
| 80 | __be32 addr; | ||
| 81 | __be32 mask; | ||
| 82 | u32 secid; | ||
| 83 | |||
| 84 | u32 valid; | ||
| 85 | struct list_head list; | ||
| 86 | struct rcu_head rcu; | ||
| 87 | }; | ||
| 88 | struct netlbl_unlhsh_addr6 { | ||
| 89 | struct in6_addr addr; | ||
| 90 | struct in6_addr mask; | ||
| 91 | u32 secid; | ||
| 92 | |||
| 93 | u32 valid; | ||
| 94 | struct list_head list; | ||
| 95 | struct rcu_head rcu; | ||
| 96 | }; | ||
| 97 | struct netlbl_unlhsh_iface { | ||
| 98 | int ifindex; | ||
| 99 | struct list_head addr4_list; | ||
| 100 | struct list_head addr6_list; | ||
| 101 | |||
| 102 | u32 valid; | ||
| 103 | struct list_head list; | ||
| 104 | struct rcu_head rcu; | ||
| 105 | }; | ||
| 106 | |||
| 107 | /* Argument struct for netlbl_unlhsh_walk() */ | ||
| 108 | struct netlbl_unlhsh_walk_arg { | ||
| 109 | struct netlink_callback *nl_cb; | ||
| 110 | struct sk_buff *skb; | ||
| 111 | u32 seq; | ||
| 112 | }; | ||
| 113 | |||
| 114 | /* Unlabeled connection hash table */ | ||
| 115 | /* updates should be so rare that having one spinlock for the entire | ||
| 116 | * hash table should be okay */ | ||
| 117 | static DEFINE_SPINLOCK(netlbl_unlhsh_lock); | ||
| 118 | static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL; | ||
| 119 | static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL; | ||
| 49 | 120 | ||
| 50 | /* Accept unlabeled packets flag */ | 121 | /* Accept unlabeled packets flag */ |
| 51 | static DEFINE_SPINLOCK(netlabel_unlabel_acceptflg_lock); | ||
| 52 | static u8 netlabel_unlabel_acceptflg = 0; | 122 | static u8 netlabel_unlabel_acceptflg = 0; |
| 53 | 123 | ||
| 54 | /* NetLabel Generic NETLINK CIPSOv4 family */ | 124 | /* NetLabel Generic NETLINK unlabeled family */ |
| 55 | static struct genl_family netlbl_unlabel_gnl_family = { | 125 | static struct genl_family netlbl_unlabel_gnl_family = { |
| 56 | .id = GENL_ID_GENERATE, | 126 | .id = GENL_ID_GENERATE, |
| 57 | .hdrsize = 0, | 127 | .hdrsize = 0, |
| @@ -63,11 +133,841 @@ static struct genl_family netlbl_unlabel_gnl_family = { | |||
| 63 | /* NetLabel Netlink attribute policy */ | 133 | /* NetLabel Netlink attribute policy */ |
| 64 | static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { | 134 | static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { |
| 65 | [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, | 135 | [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, |
| 136 | [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY, | ||
| 137 | .len = sizeof(struct in6_addr) }, | ||
| 138 | [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY, | ||
| 139 | .len = sizeof(struct in6_addr) }, | ||
| 140 | [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY, | ||
| 141 | .len = sizeof(struct in_addr) }, | ||
| 142 | [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY, | ||
| 143 | .len = sizeof(struct in_addr) }, | ||
| 144 | [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING, | ||
| 145 | .len = IFNAMSIZ - 1 }, | ||
| 146 | [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY } | ||
| 66 | }; | 147 | }; |
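A hedged sketch of how a command handler would pull a static-label request out of these attributes (the handler shape follows the existing generic netlink handlers in this file; the secid mapping step is elided):

	if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR] &&
	    info->attrs[NLBL_UNLABEL_A_IPV4MASK]) {
		struct in_addr *addr =
			nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
		struct in_addr *mask =
			nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]);
		/* ... resolve NLBL_UNLABEL_A_SECCTX to a secid, then call
		 * netlbl_unlhsh_add(net, dev_name, addr, mask,
		 *                   sizeof(struct in_addr), secid, &audit_info); */
	}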
| 67 | 148 | ||
| 68 | /* | 149 | /* |
| 69 | * Helper Functions | 150 | * Audit Helper Functions |
| 151 | */ | ||
| 152 | |||
| 153 | /** | ||
| 154 | * netlbl_unlabel_audit_addr4 - Audit an IPv4 address | ||
| 155 | * @audit_buf: audit buffer | ||
| 156 | * @dev: network interface | ||
| 157 | * @addr: IP address | ||
| 158 | * @mask: IP address mask | ||
| 159 | * | ||
| 160 | * Description: | ||
| 161 | * Write the IPv4 address and address mask, if necessary, to @audit_buf. | ||
| 162 | * | ||
| 163 | */ | ||
| 164 | static void netlbl_unlabel_audit_addr4(struct audit_buffer *audit_buf, | ||
| 165 | const char *dev, | ||
| 166 | __be32 addr, __be32 mask) | ||
| 167 | { | ||
| 168 | u32 mask_val = ntohl(mask); | ||
| 169 | |||
| 170 | if (dev != NULL) | ||
| 171 | audit_log_format(audit_buf, " netif=%s", dev); | ||
| 172 | audit_log_format(audit_buf, " src=" NIPQUAD_FMT, NIPQUAD(addr)); | ||
| 173 | if (mask_val != 0xffffffff) { | ||
| 174 | u32 mask_len = 0; | ||
| 175 | while (mask_val > 0) { | ||
| 176 | mask_val <<= 1; | ||
| 177 | mask_len++; | ||
| 178 | } | ||
| 179 | audit_log_format(audit_buf, " src_prefixlen=%d", mask_len); | ||
| 180 | } | ||
| 181 | } | ||
| 182 | |||
| 183 | /** | ||
| 184 | * netlbl_unlabel_audit_addr6 - Audit an IPv6 address | ||
| 185 | * @audit_buf: audit buffer | ||
| 186 | * @dev: network interface | ||
| 187 | * @addr: IP address | ||
| 188 | * @mask: IP address mask | ||
| 189 | * | ||
| 190 | * Description: | ||
| 191 | * Write the IPv6 address and address mask, if necessary, to @audit_buf. | ||
| 192 | * | ||
| 193 | */ | ||
| 194 | static void netlbl_unlabel_audit_addr6(struct audit_buffer *audit_buf, | ||
| 195 | const char *dev, | ||
| 196 | const struct in6_addr *addr, | ||
| 197 | const struct in6_addr *mask) | ||
| 198 | { | ||
| 199 | if (dev != NULL) | ||
| 200 | audit_log_format(audit_buf, " netif=%s", dev); | ||
| 201 | audit_log_format(audit_buf, " src=" NIP6_FMT, NIP6(*addr)); | ||
| 202 | if (ntohl(mask->s6_addr32[3]) != 0xffffffff) { | ||
| 203 | u32 mask_len = 0; | ||
| 204 | u32 mask_val; | ||
| 205 | int iter = -1; | ||
| 206 | while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff) | ||
| 207 | mask_len += 32; | ||
| 208 | mask_val = ntohl(mask->s6_addr32[iter]); | ||
| 209 | while (mask_val > 0) { | ||
| 210 | mask_val <<= 1; | ||
| 211 | mask_len++; | ||
| 212 | } | ||
| 213 | audit_log_format(audit_buf, " src_prefixlen=%d", mask_len); | ||
| 214 | } | ||
| 215 | } | ||
| 216 | |||
| 217 | /* | ||
| 218 | * Unlabeled Connection Hash Table Functions | ||
| 219 | */ | ||
| 220 | |||
| 221 | /** | ||
| 222 | * netlbl_unlhsh_free_addr4 - Frees an IPv4 address entry from the hash table | ||
| 223 | * @entry: the entry's RCU field | ||
| 224 | * | ||
| 225 | * Description: | ||
| 226 | * This function is designed to be used as a callback to the call_rcu() | ||
| 227 | * function so that memory allocated to a hash table address entry can be | ||
| 228 | * released safely. | ||
| 229 | * | ||
| 230 | */ | ||
| 231 | static void netlbl_unlhsh_free_addr4(struct rcu_head *entry) | ||
| 232 | { | ||
| 233 | struct netlbl_unlhsh_addr4 *ptr; | ||
| 234 | |||
| 235 | ptr = container_of(entry, struct netlbl_unlhsh_addr4, rcu); | ||
| 236 | kfree(ptr); | ||
| 237 | } | ||
| 238 | |||
| 239 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
| 240 | /** | ||
| 241 | * netlbl_unlhsh_free_addr6 - Frees an IPv6 address entry from the hash table | ||
| 242 | * @entry: the entry's RCU field | ||
| 243 | * | ||
| 244 | * Description: | ||
| 245 | * This function is designed to be used as a callback to the call_rcu() | ||
| 246 | * function so that memory allocated to a hash table address entry can be | ||
| 247 | * released safely. | ||
| 248 | * | ||
| 249 | */ | ||
| 250 | static void netlbl_unlhsh_free_addr6(struct rcu_head *entry) | ||
| 251 | { | ||
| 252 | struct netlbl_unlhsh_addr6 *ptr; | ||
| 253 | |||
| 254 | ptr = container_of(entry, struct netlbl_unlhsh_addr6, rcu); | ||
| 255 | kfree(ptr); | ||
| 256 | } | ||
| 257 | #endif /* IPv6 */ | ||
| 258 | |||
| 259 | /** | ||
| 260 | * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table | ||
| 261 | * @entry: the entry's RCU field | ||
| 262 | * | ||
| 263 | * Description: | ||
| 264 | * This function is designed to be used as a callback to the call_rcu() | ||
| 265 | * function so that memory allocated to a hash table interface entry can be | ||
| 266 | * released safely. Note that any IPv4 and IPv6 address entries still on the | ||
| 267 | * interface's address lists are freed here directly, without a grace period | ||
| 268 | * of their own; this is only safe because nothing else can reference the | ||
| 269 | * interface entry or its address lists once this RCU callback runs. | ||
| 270 | * | ||
| 271 | */ | ||
| 272 | static void netlbl_unlhsh_free_iface(struct rcu_head *entry) | ||
| 273 | { | ||
| 274 | struct netlbl_unlhsh_iface *iface; | ||
| 275 | struct netlbl_unlhsh_addr4 *iter4; | ||
| 276 | struct netlbl_unlhsh_addr4 *tmp4; | ||
| 277 | struct netlbl_unlhsh_addr6 *iter6; | ||
| 278 | struct netlbl_unlhsh_addr6 *tmp6; | ||
| 279 | |||
| 280 | iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); | ||
| 281 | |||
| 282 | /* no need for locks here since we are the only one with access to this | ||
| 283 | * structure */ | ||
| 284 | |||
| 285 | list_for_each_entry_safe(iter4, tmp4, &iface->addr4_list, list) | ||
| 286 | if (iter4->valid) { | ||
| 287 | list_del_rcu(&iter4->list); | ||
| 288 | kfree(iter4); | ||
| 289 | } | ||
| 290 | list_for_each_entry_safe(iter6, tmp6, &iface->addr6_list, list) | ||
| 291 | if (iter6->valid) { | ||
| 292 | list_del_rcu(&iter6->list); | ||
| 293 | kfree(iter6); | ||
| 294 | } | ||
| 295 | kfree(iface); | ||
| 296 | } | ||
| 297 | |||
| 298 | /** | ||
| 299 | * netlbl_unlhsh_hash - Hashing function for the hash table | ||
| 300 | * @ifindex: the network interface/device to hash | ||
| 301 | * | ||
| 302 | * Description: | ||
| 303 | * This is the hashing function for the unlabeled hash table; it returns the | ||
| 304 | * bucket number for the given device/interface. The caller is responsible for | ||
| 305 | * calling the rcu_read_[un]lock() functions. | ||
| 306 | * | ||
| 70 | */ | 307 | */ |
| 308 | static u32 netlbl_unlhsh_hash(int ifindex) | ||
| 309 | { | ||
| 310 | /* this is taken _almost_ directly from | ||
| 311 | * security/selinux/netif.c:sel_netif_hasfn() as they do pretty much | ||
| 312 | * the same thing */ | ||
| 313 | return ifindex & (rcu_dereference(netlbl_unlhsh)->size - 1); | ||
| 314 | } | ||
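Since the bucket count is derived from a bit size (see the netlbl_unlabel_init() call added in netlabel_kapi.c), the table size is a power of two and the bucket is just the low bits of the ifindex:

	u32 size = 32;			/* example table size, a power of two */
	int ifindex = 70;
	u32 bkt = ifindex & (size - 1);	/* 70 & 31 == 6 -> bucket 6 */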
| 315 | |||
| 316 | /** | ||
| 317 | * netlbl_unlhsh_search_addr4 - Search for a matching IPv4 address entry | ||
| 318 | * @addr: IPv4 address | ||
| 319 | * @iface: the network interface entry | ||
| 320 | * | ||
| 321 | * Description: | ||
| 322 | * Searches the IPv4 address list of the network interface specified by @iface. | ||
| 323 | * If a matching address entry is found it is returned, otherwise NULL is | ||
| 324 | * returned. The caller is responsible for calling the rcu_read_[un]lock() | ||
| 325 | * functions. | ||
| 326 | * | ||
| 327 | */ | ||
| 328 | static struct netlbl_unlhsh_addr4 *netlbl_unlhsh_search_addr4( | ||
| 329 | __be32 addr, | ||
| 330 | const struct netlbl_unlhsh_iface *iface) | ||
| 331 | { | ||
| 332 | struct netlbl_unlhsh_addr4 *iter; | ||
| 333 | |||
| 334 | list_for_each_entry_rcu(iter, &iface->addr4_list, list) | ||
| 335 | if (iter->valid && (addr & iter->mask) == iter->addr) | ||
| 336 | return iter; | ||
| 337 | |||
| 338 | return NULL; | ||
| 339 | } | ||
| 340 | |||
| 341 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
| 342 | /** | ||
| 343 | * netlbl_unlhsh_search_addr6 - Search for a matching IPv6 address entry | ||
| 344 | * @addr: IPv6 address | ||
| 345 | * @iface: the network interface entry | ||
| 346 | * | ||
| 347 | * Description: | ||
| 348 | * Searches the IPv6 address list of the network interface specified by @iface. | ||
| 349 | * If a matching address entry is found it is returned, otherwise NULL is | ||
| 350 | * returned. The caller is responsible for calling the rcu_read_[un]lock() | ||
| 351 | * functions. | ||
| 352 | * | ||
| 353 | */ | ||
| 354 | static struct netlbl_unlhsh_addr6 *netlbl_unlhsh_search_addr6( | ||
| 355 | const struct in6_addr *addr, | ||
| 356 | const struct netlbl_unlhsh_iface *iface) | ||
| 357 | { | ||
| 358 | struct netlbl_unlhsh_addr6 *iter; | ||
| 359 | |||
| 360 | list_for_each_entry_rcu(iter, &iface->addr6_list, list) | ||
| 361 | if (iter->valid && | ||
| 362 | ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0) | ||
| 363 | return iter; | ||
| 364 | |||
| 365 | return NULL; | ||
| 366 | } | ||
| 367 | #endif /* IPv6 */ | ||
| 368 | |||
| 369 | /** | ||
| 370 | * netlbl_unlhsh_search_iface - Search for a matching interface entry | ||
| 371 | * @ifindex: the network interface | ||
| 372 | * | ||
| 373 | * Description: | ||
| 374 | * Searches the unlabeled connection hash table and returns a pointer to the | ||
| 375 | * interface entry which matches @ifindex, otherwise NULL is returned. The | ||
| 376 | * caller is responsible for calling the rcu_read_[un]lock() functions. | ||
| 377 | * | ||
| 378 | */ | ||
| 379 | static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) | ||
| 380 | { | ||
| 381 | u32 bkt; | ||
| 382 | struct netlbl_unlhsh_iface *iter; | ||
| 383 | |||
| 384 | bkt = netlbl_unlhsh_hash(ifindex); | ||
| 385 | list_for_each_entry_rcu(iter, | ||
| 386 | &rcu_dereference(netlbl_unlhsh)->tbl[bkt], | ||
| 387 | list) | ||
| 388 | if (iter->valid && iter->ifindex == ifindex) | ||
| 389 | return iter; | ||
| 390 | |||
| 391 | return NULL; | ||
| 392 | } | ||
| 393 | |||
| 394 | /** | ||
| 395 | * netlbl_unlhsh_search_iface_def - Search for a matching interface entry | ||
| 396 | * @ifindex: the network interface | ||
| 397 | * | ||
| 398 | * Description: | ||
| 399 | * Searches the unlabeled connection hash table and returns a pointer to the | ||
| 400 | * interface entry which matches @ifindex. If an exact match cannot be found | ||
| 401 | * and there is a valid default entry, the default entry is returned, otherwise | ||
| 402 | * NULL is returned. The caller is responsible for calling the | ||
| 403 | * rcu_read_[un]lock() functions. | ||
| 404 | * | ||
| 405 | */ | ||
| 406 | static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface_def(int ifindex) | ||
| 407 | { | ||
| 408 | struct netlbl_unlhsh_iface *entry; | ||
| 409 | |||
| 410 | entry = netlbl_unlhsh_search_iface(ifindex); | ||
| 411 | if (entry != NULL) | ||
| 412 | return entry; | ||
| 413 | |||
| 414 | entry = rcu_dereference(netlbl_unlhsh_def); | ||
| 415 | if (entry != NULL && entry->valid) | ||
| 416 | return entry; | ||
| 417 | |||
| 418 | return NULL; | ||
| 419 | } | ||
| 420 | |||
| 421 | /** | ||
| 422 | * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table | ||
| 423 | * @iface: the associated interface entry | ||
| 424 | * @addr: IPv4 address in network byte order | ||
| 425 | * @mask: IPv4 address mask in network byte order | ||
| 426 | * @secid: LSM secid value for entry | ||
| 427 | * | ||
| 428 | * Description: | ||
| 429 | * Add a new address entry into the unlabeled connection hash table using the | ||
| 430 | * interface entry specified by @iface. On success zero is returned, otherwise | ||
| 431 | * a negative value is returned. The caller is responsible for calling the | ||
| 432 | * rcu_read_[un]lock() functions. | ||
| 433 | * | ||
| 434 | */ | ||
| 435 | static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, | ||
| 436 | const struct in_addr *addr, | ||
| 437 | const struct in_addr *mask, | ||
| 438 | u32 secid) | ||
| 439 | { | ||
| 440 | struct netlbl_unlhsh_addr4 *entry; | ||
| 441 | struct netlbl_unlhsh_addr4 *iter; | ||
| 442 | |||
| 443 | entry = kzalloc(sizeof(*entry), GFP_ATOMIC); | ||
| 444 | if (entry == NULL) | ||
| 445 | return -ENOMEM; | ||
| 446 | |||
| 447 | entry->addr = addr->s_addr & mask->s_addr; | ||
| 448 | entry->mask = mask->s_addr; | ||
| 449 | entry->secid = secid; | ||
| 450 | entry->valid = 1; | ||
| 451 | INIT_RCU_HEAD(&entry->rcu); | ||
| 452 | |||
| 453 | spin_lock(&netlbl_unlhsh_lock); | ||
| 454 | iter = netlbl_unlhsh_search_addr4(entry->addr, iface); | ||
| 455 | if (iter != NULL && | ||
| 456 | iter->addr == addr->s_addr && iter->mask == mask->s_addr) { | ||
| 457 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 458 | kfree(entry); | ||
| 459 | return -EEXIST; | ||
| 460 | } | ||
| 461 | /* in order to speed up address searches through the list (the common | ||
| 462 | * case) we need to keep the list in order based on the size of the | ||
| 463 | * address mask such that the entry with the most specific mask (largest | ||
| 464 | * numerical value) appears first in the list */ | ||
| 465 | list_for_each_entry_rcu(iter, &iface->addr4_list, list) | ||
| 466 | if (iter->valid && | ||
| 467 | ntohl(entry->mask) > ntohl(iter->mask)) { | ||
| 468 | __list_add_rcu(&entry->list, | ||
| 469 | iter->list.prev, | ||
| 470 | &iter->list); | ||
| 471 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 472 | return 0; | ||
| 473 | } | ||
| 474 | list_add_tail_rcu(&entry->list, &iface->addr4_list); | ||
| 475 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 476 | return 0; | ||
| 477 | } | ||
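Keeping the list sorted most-specific-first is what lets the first hit in netlbl_unlhsh_search_addr4() act as a longest-prefix match; a worked example with two hypothetical entries:

	/*
	 * addr4_list: [ 192.168.1.0/24 ] -> [ 192.168.0.0/16 ]
	 *
	 * 192.168.1.5 & 255.255.255.0 == 192.168.1.0  -> /24 entry (first hit)
	 * 192.168.2.5 & 255.255.255.0 != 192.168.1.0  -> keep walking
	 * 192.168.2.5 & 255.255.0.0   == 192.168.0.0  -> /16 entry
	 */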
| 478 | |||
| 479 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
| 480 | /** | ||
| 481 | * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table | ||
| 482 | * @iface: the associated interface entry | ||
| 483 | * @addr: IPv6 address in network byte order | ||
| 484 | * @mask: IPv6 address mask in network byte order | ||
| 485 | * @secid: LSM secid value for entry | ||
| 486 | * | ||
| 487 | * Description: | ||
| 488 | * Add a new address entry into the unlabeled connection hash table using the | ||
| 489 | * interface entry specified by @iface. On success zero is returned, otherwise | ||
| 490 | * a negative value is returned. The caller is responsible for calling the | ||
| 491 | * rcu_read_[un]lock() functions. | ||
| 492 | * | ||
| 493 | */ | ||
| 494 | static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, | ||
| 495 | const struct in6_addr *addr, | ||
| 496 | const struct in6_addr *mask, | ||
| 497 | u32 secid) | ||
| 498 | { | ||
| 499 | struct netlbl_unlhsh_addr6 *entry; | ||
| 500 | struct netlbl_unlhsh_addr6 *iter; | ||
| 501 | |||
| 502 | entry = kzalloc(sizeof(*entry), GFP_ATOMIC); | ||
| 503 | if (entry == NULL) | ||
| 504 | return -ENOMEM; | ||
| 505 | |||
| 506 | ipv6_addr_copy(&entry->addr, addr); | ||
| 507 | entry->addr.s6_addr32[0] &= mask->s6_addr32[0]; | ||
| 508 | entry->addr.s6_addr32[1] &= mask->s6_addr32[1]; | ||
| 509 | entry->addr.s6_addr32[2] &= mask->s6_addr32[2]; | ||
| 510 | entry->addr.s6_addr32[3] &= mask->s6_addr32[3]; | ||
| 511 | ipv6_addr_copy(&entry->mask, mask); | ||
| 512 | entry->secid = secid; | ||
| 513 | entry->valid = 1; | ||
| 514 | INIT_RCU_HEAD(&entry->rcu); | ||
| 515 | |||
| 516 | spin_lock(&netlbl_unlhsh_lock); | ||
| 517 | iter = netlbl_unlhsh_search_addr6(&entry->addr, iface); | ||
| 518 | if (iter != NULL && | ||
| 519 | (ipv6_addr_equal(&iter->addr, addr) && | ||
| 520 | ipv6_addr_equal(&iter->mask, mask))) { | ||
| 521 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 522 | kfree(entry); | ||
| 523 | return -EEXIST; | ||
| 524 | } | ||
| 525 | /* in order to speed up address searches through the list (the common | ||
| 526 | * case) we need to keep the list in order based on the size of the | ||
| 527 | * address mask such that the entry with the most specific mask (largest | ||
| 528 | * numerical value) appears first in the list */ | ||
| 529 | list_for_each_entry_rcu(iter, &iface->addr6_list, list) | ||
| 530 | if (iter->valid && | ||
| 531 | ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) { | ||
| 532 | __list_add_rcu(&entry->list, | ||
| 533 | iter->list.prev, | ||
| 534 | &iter->list); | ||
| 535 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 536 | return 0; | ||
| 537 | } | ||
| 538 | list_add_tail_rcu(&entry->list, &iface->addr6_list); | ||
| 539 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 540 | return 0; | ||
| 541 | } | ||
| 542 | #endif /* IPv6 */ | ||
| 543 | |||
| 544 | /** | ||
| 545 | * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table | ||
| 546 | * @ifindex: network interface | ||
| 547 | * | ||
| 548 | * Description: | ||
| 549 | * Add a new, empty, interface entry into the unlabeled connection hash table. | ||
| 550 | * On success a pointer to the new interface entry is returned, on failure NULL | ||
| 551 | * is returned. The caller is responsible for calling the rcu_read_[un]lock() | ||
| 552 | * functions. | ||
| 553 | * | ||
| 554 | */ | ||
| 555 | static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex) | ||
| 556 | { | ||
| 557 | u32 bkt; | ||
| 558 | struct netlbl_unlhsh_iface *iface; | ||
| 559 | |||
| 560 | iface = kzalloc(sizeof(*iface), GFP_ATOMIC); | ||
| 561 | if (iface == NULL) | ||
| 562 | return NULL; | ||
| 563 | |||
| 564 | iface->ifindex = ifindex; | ||
| 565 | INIT_LIST_HEAD(&iface->addr4_list); | ||
| 566 | INIT_LIST_HEAD(&iface->addr6_list); | ||
| 567 | iface->valid = 1; | ||
| 568 | INIT_RCU_HEAD(&iface->rcu); | ||
| 569 | |||
| 570 | spin_lock(&netlbl_unlhsh_lock); | ||
| 571 | if (ifindex > 0) { | ||
| 572 | bkt = netlbl_unlhsh_hash(ifindex); | ||
| 573 | if (netlbl_unlhsh_search_iface(ifindex) != NULL) | ||
| 574 | goto add_iface_failure; | ||
| 575 | list_add_tail_rcu(&iface->list, | ||
| 576 | &rcu_dereference(netlbl_unlhsh)->tbl[bkt]); | ||
| 577 | } else { | ||
| 578 | INIT_LIST_HEAD(&iface->list); | ||
| 579 | if (rcu_dereference(netlbl_unlhsh_def) != NULL) | ||
| 580 | goto add_iface_failure; | ||
| 581 | rcu_assign_pointer(netlbl_unlhsh_def, iface); | ||
| 582 | } | ||
| 583 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 584 | |||
| 585 | return iface; | ||
| 586 | |||
| 587 | add_iface_failure: | ||
| 588 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 589 | kfree(iface); | ||
| 590 | return NULL; | ||
| 591 | } | ||
| 592 | |||
| 593 | /** | ||
| 594 | * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table | ||
| 595 | * @net: network namespace | ||
| 596 | * @dev_name: interface name | ||
| 597 | * @addr: IP address in network byte order | ||
| 598 | * @mask: address mask in network byte order | ||
| 599 | * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) | ||
| 600 | * @secid: LSM secid value for the entry | ||
| 601 | * @audit_info: NetLabel audit information | ||
| 602 | * | ||
| 603 | * Description: | ||
| 604 | * Adds a new entry to the unlabeled connection hash table. Returns zero on | ||
| 605 | * success, negative values on failure. | ||
| 606 | * | ||
| 607 | */ | ||
| 608 | static int netlbl_unlhsh_add(struct net *net, | ||
| 609 | const char *dev_name, | ||
| 610 | const void *addr, | ||
| 611 | const void *mask, | ||
| 612 | u32 addr_len, | ||
| 613 | u32 secid, | ||
| 614 | struct netlbl_audit *audit_info) | ||
| 615 | { | ||
| 616 | int ret_val; | ||
| 617 | int ifindex; | ||
| 618 | struct net_device *dev; | ||
| 619 | struct netlbl_unlhsh_iface *iface; | ||
| 620 | struct in_addr *addr4, *mask4; | ||
| 621 | struct in6_addr *addr6, *mask6; | ||
| 622 | struct audit_buffer *audit_buf = NULL; | ||
| 623 | char *secctx = NULL; | ||
| 624 | u32 secctx_len; | ||
| 625 | |||
| 626 | if (addr_len != sizeof(struct in_addr) && | ||
| 627 | addr_len != sizeof(struct in6_addr)) | ||
| 628 | return -EINVAL; | ||
| 629 | |||
| 630 | rcu_read_lock(); | ||
| 631 | if (dev_name != NULL) { | ||
| 632 | dev = dev_get_by_name(net, dev_name); | ||
| 633 | if (dev == NULL) { | ||
| 634 | ret_val = -ENODEV; | ||
| 635 | goto unlhsh_add_return; | ||
| 636 | } | ||
| 637 | ifindex = dev->ifindex; | ||
| 638 | dev_put(dev); | ||
| 639 | iface = netlbl_unlhsh_search_iface(ifindex); | ||
| 640 | } else { | ||
| 641 | ifindex = 0; | ||
| 642 | iface = rcu_dereference(netlbl_unlhsh_def); | ||
| 643 | } | ||
| 644 | if (iface == NULL) { | ||
| 645 | iface = netlbl_unlhsh_add_iface(ifindex); | ||
| 646 | if (iface == NULL) { | ||
| 647 | ret_val = -ENOMEM; | ||
| 648 | goto unlhsh_add_return; | ||
| 649 | } | ||
| 650 | } | ||
| 651 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, | ||
| 652 | audit_info); | ||
| 653 | switch (addr_len) { | ||
| 654 | case sizeof(struct in_addr): | ||
| 655 | addr4 = (struct in_addr *)addr; | ||
| 656 | mask4 = (struct in_addr *)mask; | ||
| 657 | ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); | ||
| 658 | if (audit_buf != NULL) | ||
| 659 | netlbl_unlabel_audit_addr4(audit_buf, | ||
| 660 | dev_name, | ||
| 661 | addr4->s_addr, | ||
| 662 | mask4->s_addr); | ||
| 663 | break; | ||
| 664 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
| 665 | case sizeof(struct in6_addr): | ||
| 666 | addr6 = (struct in6_addr *)addr; | ||
| 667 | mask6 = (struct in6_addr *)mask; | ||
| 668 | ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); | ||
| 669 | if (audit_buf != NULL) | ||
| 670 | netlbl_unlabel_audit_addr6(audit_buf, | ||
| 671 | dev_name, | ||
| 672 | addr6, mask6); | ||
| 673 | break; | ||
| 674 | #endif /* IPv6 */ | ||
| 675 | default: | ||
| 676 | ret_val = -EINVAL; | ||
| 677 | } | ||
| 678 | if (ret_val == 0) | ||
| 679 | atomic_inc(&netlabel_mgmt_protocount); | ||
| 680 | |||
| 681 | unlhsh_add_return: | ||
| 682 | rcu_read_unlock(); | ||
| 683 | if (audit_buf != NULL) { | ||
| 684 | if (security_secid_to_secctx(secid, | ||
| 685 | &secctx, | ||
| 686 | &secctx_len) == 0) { | ||
| 687 | audit_log_format(audit_buf, " sec_obj=%s", secctx); | ||
| 688 | security_release_secctx(secctx, secctx_len); | ||
| 689 | } | ||
| 690 | audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); | ||
| 691 | audit_log_end(audit_buf); | ||
| 692 | } | ||
| 693 | return ret_val; | ||
| 694 | } | ||
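Putting the audit pieces together, a successful IPv4 add emits one record built from netlbl_unlabel_audit_addr4() plus the sec_obj/res suffix above; an illustrative (not captured) example:

	/* type=MAC_UNLBL_STCADD ... netif=eth0 src=10.0.0.0 src_prefixlen=8
	 *   sec_obj=system_u:object_r:unlabeled_t:s0 res=1 */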
| 695 | |||
| 696 | /** | ||
| 697 | * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry | ||
| 698 | * @net: network namespace | ||
| 699 | * @iface: interface entry | ||
| 700 | * @addr: IP address | ||
| 701 | * @mask: IP address mask | ||
| 702 | * @audit_info: NetLabel audit information | ||
| 703 | * | ||
| 704 | * Description: | ||
| 705 | * Remove an IP address entry from the unlabeled connection hash table. | ||
| 706 | * Returns zero on success, negative values on failure. The caller is | ||
| 707 | * responsible for calling the rcu_read_[un]lock() functions. | ||
| 708 | * | ||
| 709 | */ | ||
| 710 | static int netlbl_unlhsh_remove_addr4(struct net *net, | ||
| 711 | struct netlbl_unlhsh_iface *iface, | ||
| 712 | const struct in_addr *addr, | ||
| 713 | const struct in_addr *mask, | ||
| 714 | struct netlbl_audit *audit_info) | ||
| 715 | { | ||
| 716 | int ret_val = -ENOENT; | ||
| 717 | struct netlbl_unlhsh_addr4 *entry; | ||
| 718 | struct audit_buffer *audit_buf = NULL; | ||
| 719 | struct net_device *dev; | ||
| 720 | char *secctx = NULL; | ||
| 721 | u32 secctx_len; | ||
| 722 | |||
| 723 | spin_lock(&netlbl_unlhsh_lock); | ||
| 724 | entry = netlbl_unlhsh_search_addr4(addr->s_addr, iface); | ||
| 725 | if (entry != NULL && | ||
| 726 | entry->addr == addr->s_addr && entry->mask == mask->s_addr) { | ||
| 727 | entry->valid = 0; | ||
| 728 | list_del_rcu(&entry->list); | ||
| 729 | ret_val = 0; | ||
| 730 | } | ||
| 731 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 732 | |||
| 733 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, | ||
| 734 | audit_info); | ||
| 735 | if (audit_buf != NULL) { | ||
| 736 | dev = dev_get_by_index(net, iface->ifindex); | ||
| 737 | netlbl_unlabel_audit_addr4(audit_buf, | ||
| 738 | (dev != NULL ? dev->name : NULL), | ||
| 739 | addr->s_addr, mask->s_addr); | ||
| 740 | if (dev != NULL) | ||
| 741 | dev_put(dev); | ||
| 742 | if (entry != NULL && security_secid_to_secctx(entry->secid, | ||
| 743 | &secctx, | ||
| 744 | &secctx_len) == 0) { | ||
| 745 | audit_log_format(audit_buf, " sec_obj=%s", secctx); | ||
| 746 | security_release_secctx(secctx, secctx_len); | ||
| 747 | } | ||
| 748 | audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); | ||
| 749 | audit_log_end(audit_buf); | ||
| 750 | } | ||
| 751 | |||
| 752 | if (ret_val == 0) | ||
| 753 | call_rcu(&entry->rcu, netlbl_unlhsh_free_addr4); | ||
| 754 | return ret_val; | ||
| 755 | } | ||
| 756 | |||
| 757 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
| 758 | /** | ||
| 759 | * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry | ||
| 760 | * @net: network namespace | ||
| 761 | * @iface: interface entry | ||
| 762 | * @addr: IP address | ||
| 763 | * @mask: IP address mask | ||
| 764 | * @audit_info: NetLabel audit information | ||
| 765 | * | ||
| 766 | * Description: | ||
| 767 | * Remove an IP address entry from the unlabeled connection hash table. | ||
| 768 | * Returns zero on success, negative values on failure. The caller is | ||
| 769 | * responsible for calling the rcu_read_[un]lock() functions. | ||
| 770 | * | ||
| 771 | */ | ||
| 772 | static int netlbl_unlhsh_remove_addr6(struct net *net, | ||
| 773 | struct netlbl_unlhsh_iface *iface, | ||
| 774 | const struct in6_addr *addr, | ||
| 775 | const struct in6_addr *mask, | ||
| 776 | struct netlbl_audit *audit_info) | ||
| 777 | { | ||
| 778 | int ret_val = -ENOENT; | ||
| 779 | struct netlbl_unlhsh_addr6 *entry; | ||
| 780 | struct audit_buffer *audit_buf = NULL; | ||
| 781 | struct net_device *dev; | ||
| 782 | char *secctx = NULL; | ||
| 783 | u32 secctx_len; | ||
| 784 | |||
| 785 | spin_lock(&netlbl_unlhsh_lock); | ||
| 786 | entry = netlbl_unlhsh_search_addr6(addr, iface); | ||
| 787 | if (entry != NULL && | ||
| 788 | (ipv6_addr_equal(&entry->addr, addr) && | ||
| 789 | ipv6_addr_equal(&entry->mask, mask))) { | ||
| 790 | entry->valid = 0; | ||
| 791 | list_del_rcu(&entry->list); | ||
| 792 | ret_val = 0; | ||
| 793 | } | ||
| 794 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 795 | |||
| 796 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, | ||
| 797 | audit_info); | ||
| 798 | if (audit_buf != NULL) { | ||
| 799 | dev = dev_get_by_index(net, iface->ifindex); | ||
| 800 | netlbl_unlabel_audit_addr6(audit_buf, | ||
| 801 | (dev != NULL ? dev->name : NULL), | ||
| 802 | addr, mask); | ||
| 803 | if (dev != NULL) | ||
| 804 | dev_put(dev); | ||
| 805 | if (entry != NULL && security_secid_to_secctx(entry->secid, | ||
| 806 | &secctx, | ||
| 807 | &secctx_len) == 0) { | ||
| 808 | audit_log_format(audit_buf, " sec_obj=%s", secctx); | ||
| 809 | security_release_secctx(secctx, secctx_len); | ||
| 810 | } | ||
| 811 | audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); | ||
| 812 | audit_log_end(audit_buf); | ||
| 813 | } | ||
| 814 | |||
| 815 | if (ret_val == 0) | ||
| 816 | call_rcu(&entry->rcu, netlbl_unlhsh_free_addr6); | ||
| 817 | return ret_val; | ||
| 818 | } | ||
| 819 | #endif /* IPv6 */ | ||
| 820 | |||
| 821 | /** | ||
| 822 | * netlbl_unlhsh_condremove_iface - Remove an interface entry | ||
| 823 | * @iface: the interface entry | ||
| 824 | * | ||
| 825 | * Description: | ||
| 826 | * Remove an interface entry from the unlabeled connection hash table if it is | ||
| 827 | * empty. An interface entry is considered to be empty if there are no | ||
| 828 | * address entries assigned to it. | ||
| 829 | * | ||
| 830 | */ | ||
| 831 | static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface) | ||
| 832 | { | ||
| 833 | struct netlbl_unlhsh_addr4 *iter4; | ||
| 834 | struct netlbl_unlhsh_addr6 *iter6; | ||
| 835 | |||
| 836 | spin_lock(&netlbl_unlhsh_lock); | ||
| 837 | list_for_each_entry_rcu(iter4, &iface->addr4_list, list) | ||
| 838 | if (iter4->valid) | ||
| 839 | goto unlhsh_condremove_failure; | ||
| 840 | list_for_each_entry_rcu(iter6, &iface->addr6_list, list) | ||
| 841 | if (iter6->valid) | ||
| 842 | goto unlhsh_condremove_failure; | ||
| 843 | iface->valid = 0; | ||
| 844 | if (iface->ifindex > 0) | ||
| 845 | list_del_rcu(&iface->list); | ||
| 846 | else | ||
| 847 | rcu_assign_pointer(netlbl_unlhsh_def, NULL); | ||
| 848 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 849 | |||
| 850 | call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); | ||
| 851 | return; | ||
| 852 | |||
| 853 | unlhsh_condremove_failure: | ||
| 854 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 855 | return; | ||
| 856 | } | ||
| 857 | |||
| 858 | /** | ||
| 859 | * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table | ||
| 860 | * @net: network namespace | ||
| 861 | * @dev_name: interface name | ||
| 862 | * @addr: IP address in network byte order | ||
| 863 | * @mask: address mask in network byte order | ||
| 864 | * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) | ||
| 865 | * @audit_info: NetLabel audit information | ||
| 866 | * | ||
| 867 | * Description: | ||
| 868 | * Removes an existing entry from the unlabeled connection hash table. | ||
| 869 | * Returns zero on success, negative values on failure. | ||
| 870 | * | ||
| 871 | */ | ||
| 872 | static int netlbl_unlhsh_remove(struct net *net, | ||
| 873 | const char *dev_name, | ||
| 874 | const void *addr, | ||
| 875 | const void *mask, | ||
| 876 | u32 addr_len, | ||
| 877 | struct netlbl_audit *audit_info) | ||
| 878 | { | ||
| 879 | int ret_val; | ||
| 880 | struct net_device *dev; | ||
| 881 | struct netlbl_unlhsh_iface *iface; | ||
| 882 | |||
| 883 | if (addr_len != sizeof(struct in_addr) && | ||
| 884 | addr_len != sizeof(struct in6_addr)) | ||
| 885 | return -EINVAL; | ||
| 886 | |||
| 887 | rcu_read_lock(); | ||
| 888 | if (dev_name != NULL) { | ||
| 889 | dev = dev_get_by_name(net, dev_name); | ||
| 890 | if (dev == NULL) { | ||
| 891 | ret_val = -ENODEV; | ||
| 892 | goto unlhsh_remove_return; | ||
| 893 | } | ||
| 894 | iface = netlbl_unlhsh_search_iface(dev->ifindex); | ||
| 895 | dev_put(dev); | ||
| 896 | } else | ||
| 897 | iface = rcu_dereference(netlbl_unlhsh_def); | ||
| 898 | if (iface == NULL) { | ||
| 899 | ret_val = -ENOENT; | ||
| 900 | goto unlhsh_remove_return; | ||
| 901 | } | ||
| 902 | switch (addr_len) { | ||
| 903 | case sizeof(struct in_addr): | ||
| 904 | ret_val = netlbl_unlhsh_remove_addr4(net, | ||
| 905 | iface, addr, mask, | ||
| 906 | audit_info); | ||
| 907 | break; | ||
| 908 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
| 909 | case sizeof(struct in6_addr): | ||
| 910 | ret_val = netlbl_unlhsh_remove_addr6(net, | ||
| 911 | iface, addr, mask, | ||
| 912 | audit_info); | ||
| 913 | break; | ||
| 914 | #endif /* IPv6 */ | ||
| 915 | default: | ||
| 916 | ret_val = -EINVAL; | ||
| 917 | } | ||
| 918 | if (ret_val == 0) { | ||
| 919 | netlbl_unlhsh_condremove_iface(iface); | ||
| 920 | atomic_dec(&netlabel_mgmt_protocount); | ||
| 921 | } | ||
| 922 | |||
| 923 | unlhsh_remove_return: | ||
| 924 | rcu_read_unlock(); | ||
| 925 | return ret_val; | ||
| 926 | } | ||
| 927 | |||
| 928 | /* | ||
| 929 | * General Helper Functions | ||
| 930 | */ | ||
| 931 | |||
| 932 | /** | ||
| 933 | * netlbl_unlhsh_netdev_handler - Network device notification handler | ||
| 934 | * @this: notifier block | ||
| 935 | * @event: the event | ||
| 936 | * @ptr: the network device (cast to void) | ||
| 937 | * | ||
| 938 | * Description: | ||
| 939 | * Handle network device events, although at present all we care about is a | ||
| 940 | * network device going away. In the case of a device going away we clear any | ||
| 941 | * related entries from the unlabeled connection hash table. | ||
| 942 | * | ||
| 943 | */ | ||
| 944 | static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, | ||
| 945 | unsigned long event, | ||
| 946 | void *ptr) | ||
| 947 | { | ||
| 948 | struct net_device *dev = ptr; | ||
| 949 | struct netlbl_unlhsh_iface *iface = NULL; | ||
| 950 | |||
| 951 | if (dev->nd_net != &init_net) | ||
| 952 | return NOTIFY_DONE; | ||
| 953 | |||
| 954 | /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */ | ||
| 955 | if (event == NETDEV_DOWN) { | ||
| 956 | spin_lock(&netlbl_unlhsh_lock); | ||
| 957 | iface = netlbl_unlhsh_search_iface(dev->ifindex); | ||
| 958 | if (iface != NULL && iface->valid) { | ||
| 959 | iface->valid = 0; | ||
| 960 | list_del_rcu(&iface->list); | ||
| 961 | } else | ||
| 962 | iface = NULL; | ||
| 963 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 964 | } | ||
| 965 | |||
| 966 | if (iface != NULL) | ||
| 967 | call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); | ||
| 968 | |||
| 969 | return NOTIFY_DONE; | ||
| 970 | } | ||
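For readers unfamiliar with netdevice notifiers: the handler above is registered later in this patch via register_netdevice_notifier() (see netlbl_unlabel_init()). A hedged, self-contained sketch of the mechanism as a stand-alone module, using the same convention as this era of the kernel where @ptr is the net_device itself:

    #include <linux/module.h>
    #include <linux/netdevice.h>

    static int demo_netdev_event(struct notifier_block *nb,
                                 unsigned long event, void *ptr)
    {
            struct net_device *dev = ptr;   /* calling convention of this era */

            if (event == NETDEV_DOWN || event == NETDEV_UNREGISTER)
                    printk(KERN_INFO "demo: %s is going away\n", dev->name);
            return NOTIFY_DONE;
    }

    static struct notifier_block demo_notifier = {
            .notifier_call = demo_netdev_event,
    };

    static int __init demo_init(void)
    {
            return register_netdevice_notifier(&demo_notifier);
    }

    static void __exit demo_exit(void)
    {
            unregister_netdevice_notifier(&demo_notifier);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");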
| 71 | 971 | ||
| 72 | /** | 972 | /** |
| 73 | * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag | 973 | * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag |
| @@ -84,11 +984,8 @@ static void netlbl_unlabel_acceptflg_set(u8 value, | |||
| 84 | struct audit_buffer *audit_buf; | 984 | struct audit_buffer *audit_buf; |
| 85 | u8 old_val; | 985 | u8 old_val; |
| 86 | 986 | ||
| 87 | spin_lock(&netlabel_unlabel_acceptflg_lock); | ||
| 88 | old_val = netlabel_unlabel_acceptflg; | 987 | old_val = netlabel_unlabel_acceptflg; |
| 89 | netlabel_unlabel_acceptflg = value; | 988 | netlabel_unlabel_acceptflg = value; |
| 90 | spin_unlock(&netlabel_unlabel_acceptflg_lock); | ||
| 91 | |||
| 92 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, | 989 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, |
| 93 | audit_info); | 990 | audit_info); |
| 94 | if (audit_buf != NULL) { | 991 | if (audit_buf != NULL) { |
| @@ -98,6 +995,48 @@ static void netlbl_unlabel_acceptflg_set(u8 value, | |||
| 98 | } | 995 | } |
| 99 | } | 996 | } |
| 100 | 997 | ||
| 998 | /** | ||
| 999 | * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information | ||
| 1000 | * @info: the Generic NETLINK info block | ||
| 1001 | * @addr: the IP address | ||
| 1002 | * @mask: the IP address mask | ||
| 1003 | * @len: the address length | ||
| 1004 | * | ||
| 1005 | * Description: | ||
| 1006 | * Examine the Generic NETLINK message and extract the IP address information. | ||
| 1007 | * Returns zero on success, negative values on failure. | ||
| 1008 | * | ||
| 1009 | */ | ||
| 1010 | static int netlbl_unlabel_addrinfo_get(struct genl_info *info, | ||
| 1011 | void **addr, | ||
| 1012 | void **mask, | ||
| 1013 | u32 *len) | ||
| 1014 | { | ||
| 1015 | u32 addr_len; | ||
| 1016 | |||
| 1017 | if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) { | ||
| 1018 | addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); | ||
| 1019 | if (addr_len != sizeof(struct in_addr) || | ||
| 1020 | addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) | ||
| 1021 | return -EINVAL; | ||
| 1022 | *len = addr_len; | ||
| 1023 | *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); | ||
| 1024 | *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]); | ||
| 1025 | return 0; | ||
| 1026 | } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) { | ||
| 1027 | addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); | ||
| 1028 | if (addr_len != sizeof(struct in6_addr) || | ||
| 1029 | addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK])) | ||
| 1030 | return -EINVAL; | ||
| 1031 | *len = addr_len; | ||
| 1032 | *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); | ||
| 1033 | *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]); | ||
| 1034 | return 0; | ||
| 1035 | } | ||
| 1036 | |||
| 1037 | return -EINVAL; | ||
| 1038 | } | ||
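The userspace counterpart of this parsing is a Generic Netlink request carrying one address/mask attribute pair. A hedged sketch using modern libnl-3, with error handling elided; the family name "NLBL_UNLBL" and the protocol version are assumptions, and the numeric constants mirror the enums this patch adds to netlabel_unlabeled.h:

    #include <arpa/inet.h>
    #include <string.h>
    #include <netlink/netlink.h>
    #include <netlink/genl/ctrl.h>
    #include <netlink/genl/genl.h>

    #define NLBL_UNLABEL_C_STATICADD 3   /* from this patch's command enum */
    #define NLBL_UNLABEL_A_IPV4ADDR  4   /* from this patch's attribute enum */
    #define NLBL_UNLABEL_A_IPV4MASK  5
    #define NLBL_UNLABEL_A_IFACE     6
    #define NLBL_UNLABEL_A_SECCTX    7

    static int demo_staticadd(const char *ifname, const char *secctx)
    {
            struct nl_sock *sk = nl_socket_alloc();
            struct nl_msg *msg = nlmsg_alloc();
            struct in_addr addr, mask;
            int family;

            genl_connect(sk);
            family = genl_ctrl_resolve(sk, "NLBL_UNLBL"); /* name assumed */

            inet_pton(AF_INET, "192.0.2.0", &addr);
            inet_pton(AF_INET, "255.255.255.0", &mask);

            genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
                        NLBL_UNLABEL_C_STATICADD, 1 /* version assumed */);
            nla_put_string(msg, NLBL_UNLABEL_A_IFACE, ifname);
            nla_put(msg, NLBL_UNLABEL_A_SECCTX, strlen(secctx) + 1, secctx);
            nla_put(msg, NLBL_UNLABEL_A_IPV4ADDR, sizeof(addr), &addr);
            nla_put(msg, NLBL_UNLABEL_A_IPV4MASK, sizeof(mask), &mask);

            nl_send_auto(sk, msg);
            nlmsg_free(msg);
            nl_socket_free(sk);
            return 0;
    }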
| 1039 | |||
| 101 | /* | 1040 | /* |
| 102 | * NetLabel Command Handlers | 1041 | * NetLabel Command Handlers |
| 103 | */ | 1042 | */ |
| @@ -155,11 +1094,9 @@ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) | |||
| 155 | goto list_failure; | 1094 | goto list_failure; |
| 156 | } | 1095 | } |
| 157 | 1096 | ||
| 158 | rcu_read_lock(); | ||
| 159 | ret_val = nla_put_u8(ans_skb, | 1097 | ret_val = nla_put_u8(ans_skb, |
| 160 | NLBL_UNLABEL_A_ACPTFLG, | 1098 | NLBL_UNLABEL_A_ACPTFLG, |
| 161 | netlabel_unlabel_acceptflg); | 1099 | netlabel_unlabel_acceptflg); |
| 162 | rcu_read_unlock(); | ||
| 163 | if (ret_val != 0) | 1100 | if (ret_val != 0) |
| 164 | goto list_failure; | 1101 | goto list_failure; |
| 165 | 1102 | ||
| @@ -175,11 +1112,489 @@ list_failure: | |||
| 175 | return ret_val; | 1112 | return ret_val; |
| 176 | } | 1113 | } |
| 177 | 1114 | ||
| 1115 | /** | ||
| 1116 | * netlbl_unlabel_staticadd - Handle a STATICADD message | ||
| 1117 | * @skb: the NETLINK buffer | ||
| 1118 | * @info: the Generic NETLINK info block | ||
| 1119 | * | ||
| 1120 | * Description: | ||
| 1121 | * Process a user generated STATICADD message and add a new unlabeled | ||
| 1122 | * connection entry to the hash table. Returns zero on success, negative | ||
| 1123 | * values on failure. | ||
| 1124 | * | ||
| 1125 | */ | ||
| 1126 | static int netlbl_unlabel_staticadd(struct sk_buff *skb, | ||
| 1127 | struct genl_info *info) | ||
| 1128 | { | ||
| 1129 | int ret_val; | ||
| 1130 | char *dev_name; | ||
| 1131 | void *addr; | ||
| 1132 | void *mask; | ||
| 1133 | u32 addr_len; | ||
| 1134 | u32 secid; | ||
| 1135 | struct netlbl_audit audit_info; | ||
| 1136 | |||
| 1137 | /* Don't allow users to add both IPv4 and IPv6 addresses for a | ||
| 1138 | * single entry. However, allow users to create two entries, one each | ||
| 1139 | * for IPv4 and IPv6, with the same LSM security context which should | ||
| 1140 | * achieve the same result. */ | ||
| 1141 | if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || | ||
| 1142 | !info->attrs[NLBL_UNLABEL_A_IFACE] || | ||
| 1143 | !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || | ||
| 1144 | !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ | ||
| 1145 | (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || | ||
| 1146 | !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) | ||
| 1147 | return -EINVAL; | ||
| 1148 | |||
| 1149 | netlbl_netlink_auditinfo(skb, &audit_info); | ||
| 1150 | |||
| 1151 | ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); | ||
| 1152 | if (ret_val != 0) | ||
| 1153 | return ret_val; | ||
| 1154 | dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); | ||
| 1155 | ret_val = security_secctx_to_secid( | ||
| 1156 | nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), | ||
| 1157 | nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), | ||
| 1158 | &secid); | ||
| 1159 | if (ret_val != 0) | ||
| 1160 | return ret_val; | ||
| 1161 | |||
| 1162 | return netlbl_unlhsh_add(&init_net, | ||
| 1163 | dev_name, addr, mask, addr_len, secid, | ||
| 1164 | &audit_info); | ||
| 1165 | } | ||
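The multi-line attribute check at the top of the handler is dense. An equivalent, more readable formulation (an illustrative rewrite, not part of the patch) makes the intent explicit: exactly one address family may be fully specified, alongside the mandatory interface and security-context attributes:

    bool ipv4 = info->attrs[NLBL_UNLABEL_A_IPV4ADDR] &&
                info->attrs[NLBL_UNLABEL_A_IPV4MASK];
    bool ipv6 = info->attrs[NLBL_UNLABEL_A_IPV6ADDR] &&
                info->attrs[NLBL_UNLABEL_A_IPV6MASK];

    if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
        !info->attrs[NLBL_UNLABEL_A_IFACE] ||
        !(ipv4 ^ ipv6))
            return -EINVAL;

Note that, like the original, this accepts a request where the other family is only partially specified; netlbl_unlabel_addrinfo_get() then works from whichever address attribute is present.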
| 1166 | |||
| 1167 | /** | ||
| 1168 | * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message | ||
| 1169 | * @skb: the NETLINK buffer | ||
| 1170 | * @info: the Generic NETLINK info block | ||
| 1171 | * | ||
| 1172 | * Description: | ||
| 1173 | * Process a user generated STATICADDDEF message and add a new default | ||
| 1174 | * unlabeled connection entry. Returns zero on success, negative values on | ||
| 1175 | * failure. | ||
| 1176 | * | ||
| 1177 | */ | ||
| 1178 | static int netlbl_unlabel_staticadddef(struct sk_buff *skb, | ||
| 1179 | struct genl_info *info) | ||
| 1180 | { | ||
| 1181 | int ret_val; | ||
| 1182 | void *addr; | ||
| 1183 | void *mask; | ||
| 1184 | u32 addr_len; | ||
| 1185 | u32 secid; | ||
| 1186 | struct netlbl_audit audit_info; | ||
| 1187 | |||
| 1188 | /* Don't allow users to add both IPv4 and IPv6 addresses for a | ||
| 1189 | * single entry. However, allow users to create two entries, one each | ||
| 1190 | * for IPv4 and IPv6, with the same LSM security context which should | ||
| 1191 | * achieve the same result. */ | ||
| 1192 | if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || | ||
| 1193 | !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || | ||
| 1194 | !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ | ||
| 1195 | (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || | ||
| 1196 | !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) | ||
| 1197 | return -EINVAL; | ||
| 1198 | |||
| 1199 | netlbl_netlink_auditinfo(skb, &audit_info); | ||
| 1200 | |||
| 1201 | ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); | ||
| 1202 | if (ret_val != 0) | ||
| 1203 | return ret_val; | ||
| 1204 | ret_val = security_secctx_to_secid( | ||
| 1205 | nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), | ||
| 1206 | nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), | ||
| 1207 | &secid); | ||
| 1208 | if (ret_val != 0) | ||
| 1209 | return ret_val; | ||
| 1210 | |||
| 1211 | return netlbl_unlhsh_add(&init_net, | ||
| 1212 | NULL, addr, mask, addr_len, secid, | ||
| 1213 | &audit_info); | ||
| 1214 | } | ||
| 1215 | |||
| 1216 | /** | ||
| 1217 | * netlbl_unlabel_staticremove - Handle a STATICREMOVE message | ||
| 1218 | * @skb: the NETLINK buffer | ||
| 1219 | * @info: the Generic NETLINK info block | ||
| 1220 | * | ||
| 1221 | * Description: | ||
| 1222 | * Process a user generated STATICREMOVE message and remove the specified | ||
| 1223 | * unlabeled connection entry. Returns zero on success, negative values on | ||
| 1224 | * failure. | ||
| 1225 | * | ||
| 1226 | */ | ||
| 1227 | static int netlbl_unlabel_staticremove(struct sk_buff *skb, | ||
| 1228 | struct genl_info *info) | ||
| 1229 | { | ||
| 1230 | int ret_val; | ||
| 1231 | char *dev_name; | ||
| 1232 | void *addr; | ||
| 1233 | void *mask; | ||
| 1234 | u32 addr_len; | ||
| 1235 | struct netlbl_audit audit_info; | ||
| 1236 | |||
| 1237 | /* See the note in netlbl_unlabel_staticadd() about not allowing both | ||
| 1238 | * IPv4 and IPv6 in the same entry. */ | ||
| 1239 | if (!info->attrs[NLBL_UNLABEL_A_IFACE] || | ||
| 1240 | !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || | ||
| 1241 | !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ | ||
| 1242 | (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || | ||
| 1243 | !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) | ||
| 1244 | return -EINVAL; | ||
| 1245 | |||
| 1246 | netlbl_netlink_auditinfo(skb, &audit_info); | ||
| 1247 | |||
| 1248 | ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); | ||
| 1249 | if (ret_val != 0) | ||
| 1250 | return ret_val; | ||
| 1251 | dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); | ||
| 1252 | |||
| 1253 | return netlbl_unlhsh_remove(&init_net, | ||
| 1254 | dev_name, addr, mask, addr_len, | ||
| 1255 | &audit_info); | ||
| 1256 | } | ||
| 1257 | |||
| 1258 | /** | ||
| 1259 | * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message | ||
| 1260 | * @skb: the NETLINK buffer | ||
| 1261 | * @info: the Generic NETLINK info block | ||
| 1262 | * | ||
| 1263 | * Description: | ||
| 1264 | * Process a user generated STATICREMOVEDEF message and remove the default | ||
| 1265 | * unlabeled connection entry. Returns zero on success, negative values on | ||
| 1266 | * failure. | ||
| 1267 | * | ||
| 1268 | */ | ||
| 1269 | static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, | ||
| 1270 | struct genl_info *info) | ||
| 1271 | { | ||
| 1272 | int ret_val; | ||
| 1273 | void *addr; | ||
| 1274 | void *mask; | ||
| 1275 | u32 addr_len; | ||
| 1276 | struct netlbl_audit audit_info; | ||
| 1277 | |||
| 1278 | /* See the note in netlbl_unlabel_staticadd() about not allowing both | ||
| 1279 | * IPv4 and IPv6 in the same entry. */ | ||
| 1280 | if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || | ||
| 1281 | !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ | ||
| 1282 | (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || | ||
| 1283 | !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) | ||
| 1284 | return -EINVAL; | ||
| 1285 | |||
| 1286 | netlbl_netlink_auditinfo(skb, &audit_info); | ||
| 1287 | |||
| 1288 | ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); | ||
| 1289 | if (ret_val != 0) | ||
| 1290 | return ret_val; | ||
| 1291 | |||
| 1292 | return netlbl_unlhsh_remove(&init_net, | ||
| 1293 | NULL, addr, mask, addr_len, | ||
| 1294 | &audit_info); | ||
| 1295 | } | ||
| 1296 | |||
| 1297 | |||
| 1298 | /** | ||
| 1299 | * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF] | ||
| 1300 | * @cmd: command/message | ||
| 1301 | * @iface: the interface entry | ||
| 1302 | * @addr4: the IPv4 address entry | ||
| 1303 | * @addr6: the IPv6 address entry | ||
| 1304 | * @arg: the netlbl_unlhsh_walk_arg structure | ||
| 1305 | * | ||
| 1306 | * Description: | ||
| 1307 | * This function is designed to be used to generate a response for a | ||
| 1308 | * STATICLIST or STATICLISTDEF message. When called, either @addr4 or @addr6 | ||
| 1309 | * may be specified, but not both; the caller should set the unused entry to | ||
| 1310 | * NULL. Returns the size of the message on success, negative | ||
| 1311 | * values on failure. | ||
| 1312 | * | ||
| 1313 | */ | ||
| 1314 | static int netlbl_unlabel_staticlist_gen(u32 cmd, | ||
| 1315 | const struct netlbl_unlhsh_iface *iface, | ||
| 1316 | const struct netlbl_unlhsh_addr4 *addr4, | ||
| 1317 | const struct netlbl_unlhsh_addr6 *addr6, | ||
| 1318 | void *arg) | ||
| 1319 | { | ||
| 1320 | int ret_val = -ENOMEM; | ||
| 1321 | struct netlbl_unlhsh_walk_arg *cb_arg = arg; | ||
| 1322 | struct net_device *dev; | ||
| 1323 | void *data; | ||
| 1324 | u32 secid; | ||
| 1325 | char *secctx; | ||
| 1326 | u32 secctx_len; | ||
| 1327 | |||
| 1328 | data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid, | ||
| 1329 | cb_arg->seq, &netlbl_unlabel_gnl_family, | ||
| 1330 | NLM_F_MULTI, cmd); | ||
| 1331 | if (data == NULL) | ||
| 1332 | goto list_cb_failure; | ||
| 1333 | |||
| 1334 | if (iface->ifindex > 0) { | ||
| 1335 | dev = dev_get_by_index(&init_net, iface->ifindex); | ||
| 1336 | if (dev == NULL) { | ||
| 1337 | ret_val = -ENODEV; | ||
| 1338 | goto list_cb_failure; | ||
| 1339 | } | ||
| 1340 | ret_val = nla_put_string(cb_arg->skb, | ||
| 1341 | NLBL_UNLABEL_A_IFACE, dev->name); | ||
| 1342 | dev_put(dev); | ||
| 1339 | if (ret_val != 0) | ||
| 1340 | goto list_cb_failure; | ||
| 1341 | } | ||
| 1342 | |||
| 1343 | if (addr4) { | ||
| 1344 | struct in_addr addr_struct; | ||
| 1345 | |||
| 1346 | addr_struct.s_addr = addr4->addr; | ||
| 1347 | ret_val = nla_put(cb_arg->skb, | ||
| 1348 | NLBL_UNLABEL_A_IPV4ADDR, | ||
| 1349 | sizeof(struct in_addr), | ||
| 1350 | &addr_struct); | ||
| 1351 | if (ret_val != 0) | ||
| 1352 | goto list_cb_failure; | ||
| 1353 | |||
| 1354 | addr_struct.s_addr = addr4->mask; | ||
| 1355 | ret_val = nla_put(cb_arg->skb, | ||
| 1356 | NLBL_UNLABEL_A_IPV4MASK, | ||
| 1357 | sizeof(struct in_addr), | ||
| 1358 | &addr_struct); | ||
| 1359 | if (ret_val != 0) | ||
| 1360 | goto list_cb_failure; | ||
| 1361 | |||
| 1362 | secid = addr4->secid; | ||
| 1363 | } else { | ||
| 1364 | ret_val = nla_put(cb_arg->skb, | ||
| 1365 | NLBL_UNLABEL_A_IPV6ADDR, | ||
| 1366 | sizeof(struct in6_addr), | ||
| 1367 | &addr6->addr); | ||
| 1368 | if (ret_val != 0) | ||
| 1369 | goto list_cb_failure; | ||
| 1370 | |||
| 1371 | ret_val = nla_put(cb_arg->skb, | ||
| 1372 | NLBL_UNLABEL_A_IPV6MASK, | ||
| 1373 | sizeof(struct in6_addr), | ||
| 1374 | &addr6->mask); | ||
| 1375 | if (ret_val != 0) | ||
| 1376 | goto list_cb_failure; | ||
| 1377 | |||
| 1378 | secid = addr6->secid; | ||
| 1379 | } | ||
| 1380 | |||
| 1381 | ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len); | ||
| 1382 | if (ret_val != 0) | ||
| 1383 | goto list_cb_failure; | ||
| 1384 | ret_val = nla_put(cb_arg->skb, | ||
| 1385 | NLBL_UNLABEL_A_SECCTX, | ||
| 1386 | secctx_len, | ||
| 1387 | secctx); | ||
| 1388 | security_release_secctx(secctx, secctx_len); | ||
| 1389 | if (ret_val != 0) | ||
| 1390 | goto list_cb_failure; | ||
| 1391 | |||
| 1392 | cb_arg->seq++; | ||
| 1393 | return genlmsg_end(cb_arg->skb, data); | ||
| 1394 | |||
| 1395 | list_cb_failure: | ||
| 1396 | genlmsg_cancel(cb_arg->skb, data); | ||
| 1397 | return ret_val; | ||
| 1398 | } | ||
| 1399 | |||
| 1400 | /** | ||
| 1401 | * netlbl_unlabel_staticlist - Handle a STATICLIST message | ||
| 1402 | * @skb: the NETLINK buffer | ||
| 1403 | * @cb: the NETLINK callback | ||
| 1404 | * | ||
| 1405 | * Description: | ||
| 1406 | * Process a user generated STATICLIST message and dump the unlabeled | ||
| 1407 | * connection hash table in a form suitable for use in a kernel generated | ||
| 1408 | * STATICLIST message. Returns the length of @skb. | ||
| 1409 | * | ||
| 1410 | */ | ||
| 1411 | static int netlbl_unlabel_staticlist(struct sk_buff *skb, | ||
| 1412 | struct netlink_callback *cb) | ||
| 1413 | { | ||
| 1414 | struct netlbl_unlhsh_walk_arg cb_arg; | ||
| 1415 | u32 skip_bkt = cb->args[0]; | ||
| 1416 | u32 skip_chain = cb->args[1]; | ||
| 1417 | u32 skip_addr4 = cb->args[2]; | ||
| 1418 | u32 skip_addr6 = cb->args[3]; | ||
| 1419 | u32 iter_bkt; | ||
| 1420 | u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; | ||
| 1421 | struct netlbl_unlhsh_iface *iface; | ||
| 1422 | struct netlbl_unlhsh_addr4 *addr4; | ||
| 1423 | struct netlbl_unlhsh_addr6 *addr6; | ||
| 1424 | |||
| 1425 | cb_arg.nl_cb = cb; | ||
| 1426 | cb_arg.skb = skb; | ||
| 1427 | cb_arg.seq = cb->nlh->nlmsg_seq; | ||
| 1428 | |||
| 1429 | rcu_read_lock(); | ||
| 1430 | for (iter_bkt = skip_bkt; | ||
| 1431 | iter_bkt < rcu_dereference(netlbl_unlhsh)->size; | ||
| 1432 | iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { | ||
| 1433 | list_for_each_entry_rcu(iface, | ||
| 1434 | &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt], | ||
| 1435 | list) { | ||
| 1436 | if (!iface->valid || | ||
| 1437 | iter_chain++ < skip_chain) | ||
| 1438 | continue; | ||
| 1439 | list_for_each_entry_rcu(addr4, | ||
| 1440 | &iface->addr4_list, | ||
| 1441 | list) { | ||
| 1442 | if (!addr4->valid || iter_addr4++ < skip_addr4) | ||
| 1443 | continue; | ||
| 1444 | if (netlbl_unlabel_staticlist_gen( | ||
| 1445 | NLBL_UNLABEL_C_STATICLIST, | ||
| 1446 | iface, | ||
| 1447 | addr4, | ||
| 1448 | NULL, | ||
| 1449 | &cb_arg) < 0) { | ||
| 1450 | iter_addr4--; | ||
| 1451 | iter_chain--; | ||
| 1452 | goto unlabel_staticlist_return; | ||
| 1453 | } | ||
| 1454 | } | ||
| 1455 | list_for_each_entry_rcu(addr6, | ||
| 1456 | &iface->addr6_list, | ||
| 1457 | list) { | ||
| 1458 | if (!addr6->valid || iter_addr6++ < skip_addr6) | ||
| 1459 | continue; | ||
| 1460 | if (netlbl_unlabel_staticlist_gen( | ||
| 1461 | NLBL_UNLABEL_C_STATICLIST, | ||
| 1462 | iface, | ||
| 1463 | NULL, | ||
| 1464 | addr6, | ||
| 1465 | &cb_arg) < 0) { | ||
| 1466 | iter_addr6--; | ||
| 1467 | iter_chain--; | ||
| 1468 | goto unlabel_staticlist_return; | ||
| 1469 | } | ||
| 1470 | } | ||
| 1471 | } | ||
| 1472 | } | ||
| 1473 | |||
| 1474 | unlabel_staticlist_return: | ||
| 1475 | rcu_read_unlock(); | ||
| 1476 | cb->args[0] = iter_bkt; | ||
| 1477 | cb->args[1] = iter_chain; | ||
| 1478 | cb->args[2] = iter_addr4; | ||
| 1479 | cb->args[3] = iter_addr6; | ||
| 1480 | return skb->len; | ||
| 1481 | } | ||
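The skip/iter counter dance above implements the standard netlink dump-resume contract: the kernel calls the dumpit function repeatedly until it returns 0, and cb->args[] persists between calls to record where the previous pass stopped. A stripped-down sketch of the contract; DEMO_TABLE_SIZE and demo_fill_one() are hypothetical:

    static int demo_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
    {
            u32 idx = cb->args[0];            /* where the last call stopped */

            while (idx < DEMO_TABLE_SIZE) {
                    if (demo_fill_one(skb, cb, idx) < 0)
                            break;            /* skb full; resume here later */
                    idx++;
            }

            cb->args[0] = idx;
            return skb->len;                  /* non-zero: call me again */
    }

This is also why the loops above decrement iter_addr4/iter_chain before bailing out: the entry that failed to fit must be retried on the next pass.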
| 1482 | |||
| 1483 | /** | ||
| 1484 | * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message | ||
| 1485 | * @skb: the NETLINK buffer | ||
| 1486 | * @cb: the NETLINK callback | ||
| 1487 | * | ||
| 1488 | * Description: | ||
| 1489 | * Process a user generated STATICLISTDEF message and dump the default | ||
| 1490 | * unlabeled connection entry in a form suitable for use in a kernel generated | ||
| 1491 | * STATICLISTDEF message. Returns the length of @skb. | ||
| 1492 | * | ||
| 1493 | */ | ||
| 1494 | static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, | ||
| 1495 | struct netlink_callback *cb) | ||
| 1496 | { | ||
| 1497 | struct netlbl_unlhsh_walk_arg cb_arg; | ||
| 1498 | struct netlbl_unlhsh_iface *iface; | ||
| 1499 | u32 skip_addr4 = cb->args[0]; | ||
| 1500 | u32 skip_addr6 = cb->args[1]; | ||
| 1501 | u32 iter_addr4 = 0, iter_addr6 = 0; | ||
| 1502 | struct netlbl_unlhsh_addr4 *addr4; | ||
| 1503 | struct netlbl_unlhsh_addr6 *addr6; | ||
| 1504 | |||
| 1505 | cb_arg.nl_cb = cb; | ||
| 1506 | cb_arg.skb = skb; | ||
| 1507 | cb_arg.seq = cb->nlh->nlmsg_seq; | ||
| 1508 | |||
| 1509 | rcu_read_lock(); | ||
| 1510 | iface = rcu_dereference(netlbl_unlhsh_def); | ||
| 1511 | if (iface == NULL || !iface->valid) | ||
| 1512 | goto unlabel_staticlistdef_return; | ||
| 1513 | |||
| 1514 | list_for_each_entry_rcu(addr4, &iface->addr4_list, list) { | ||
| 1515 | if (!addr4->valid || iter_addr4++ < skip_addr4) | ||
| 1516 | continue; | ||
| 1517 | if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, | ||
| 1518 | iface, | ||
| 1519 | addr4, | ||
| 1520 | NULL, | ||
| 1521 | &cb_arg) < 0) { | ||
| 1522 | iter_addr4--; | ||
| 1523 | goto unlabel_staticlistdef_return; | ||
| 1524 | } | ||
| 1525 | } | ||
| 1526 | list_for_each_entry_rcu(addr6, &iface->addr6_list, list) { | ||
| 1527 | if (!addr6->valid || iter_addr6++ < skip_addr6) | ||
| 1528 | continue; | ||
| 1529 | if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, | ||
| 1530 | iface, | ||
| 1531 | NULL, | ||
| 1532 | addr6, | ||
| 1533 | &cb_arg) < 0) { | ||
| 1534 | iter_addr6--; | ||
| 1535 | goto unlabel_staticlistdef_return; | ||
| 1536 | } | ||
| 1537 | } | ||
| 1538 | |||
| 1539 | unlabel_staticlistdef_return: | ||
| 1540 | rcu_read_unlock(); | ||
| 1541 | cb->args[0] = iter_addr4; | ||
| 1542 | cb->args[1] = iter_addr6; | ||
| 1543 | return skb->len; | ||
| 1544 | } | ||
| 178 | 1545 | ||
| 179 | /* | 1546 | /* |
| 180 | * NetLabel Generic NETLINK Command Definitions | 1547 | * NetLabel Generic NETLINK Command Definitions |
| 181 | */ | 1548 | */ |
| 182 | 1549 | ||
| 1550 | static struct genl_ops netlbl_unlabel_genl_c_staticadd = { | ||
| 1551 | .cmd = NLBL_UNLABEL_C_STATICADD, | ||
| 1552 | .flags = GENL_ADMIN_PERM, | ||
| 1553 | .policy = netlbl_unlabel_genl_policy, | ||
| 1554 | .doit = netlbl_unlabel_staticadd, | ||
| 1555 | .dumpit = NULL, | ||
| 1556 | }; | ||
| 1557 | |||
| 1558 | static struct genl_ops netlbl_unlabel_genl_c_staticremove = { | ||
| 1559 | .cmd = NLBL_UNLABEL_C_STATICREMOVE, | ||
| 1560 | .flags = GENL_ADMIN_PERM, | ||
| 1561 | .policy = netlbl_unlabel_genl_policy, | ||
| 1562 | .doit = netlbl_unlabel_staticremove, | ||
| 1563 | .dumpit = NULL, | ||
| 1564 | }; | ||
| 1565 | |||
| 1566 | static struct genl_ops netlbl_unlabel_genl_c_staticlist = { | ||
| 1567 | .cmd = NLBL_UNLABEL_C_STATICLIST, | ||
| 1568 | .flags = 0, | ||
| 1569 | .policy = netlbl_unlabel_genl_policy, | ||
| 1570 | .doit = NULL, | ||
| 1571 | .dumpit = netlbl_unlabel_staticlist, | ||
| 1572 | }; | ||
| 1573 | |||
| 1574 | static struct genl_ops netlbl_unlabel_genl_c_staticadddef = { | ||
| 1575 | .cmd = NLBL_UNLABEL_C_STATICADDDEF, | ||
| 1576 | .flags = GENL_ADMIN_PERM, | ||
| 1577 | .policy = netlbl_unlabel_genl_policy, | ||
| 1578 | .doit = netlbl_unlabel_staticadddef, | ||
| 1579 | .dumpit = NULL, | ||
| 1580 | }; | ||
| 1581 | |||
| 1582 | static struct genl_ops netlbl_unlabel_genl_c_staticremovedef = { | ||
| 1583 | .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF, | ||
| 1584 | .flags = GENL_ADMIN_PERM, | ||
| 1585 | .policy = netlbl_unlabel_genl_policy, | ||
| 1586 | .doit = netlbl_unlabel_staticremovedef, | ||
| 1587 | .dumpit = NULL, | ||
| 1588 | }; | ||
| 1589 | |||
| 1590 | static struct genl_ops netlbl_unlabel_genl_c_staticlistdef = { | ||
| 1591 | .cmd = NLBL_UNLABEL_C_STATICLISTDEF, | ||
| 1592 | .flags = 0, | ||
| 1593 | .policy = netlbl_unlabel_genl_policy, | ||
| 1594 | .doit = NULL, | ||
| 1595 | .dumpit = netlbl_unlabel_staticlistdef, | ||
| 1596 | }; | ||
| 1597 | |||
| 183 | static struct genl_ops netlbl_unlabel_genl_c_accept = { | 1598 | static struct genl_ops netlbl_unlabel_genl_c_accept = { |
| 184 | .cmd = NLBL_UNLABEL_C_ACCEPT, | 1599 | .cmd = NLBL_UNLABEL_C_ACCEPT, |
| 185 | .flags = GENL_ADMIN_PERM, | 1600 | .flags = GENL_ADMIN_PERM, |
| @@ -196,7 +1611,6 @@ static struct genl_ops netlbl_unlabel_genl_c_list = { | |||
| 196 | .dumpit = NULL, | 1611 | .dumpit = NULL, |
| 197 | }; | 1612 | }; |
| 198 | 1613 | ||
| 199 | |||
| 200 | /* | 1614 | /* |
| 201 | * NetLabel Generic NETLINK Protocol Functions | 1615 | * NetLabel Generic NETLINK Protocol Functions |
| 202 | */ | 1616 | */ |
| @@ -218,6 +1632,36 @@ int netlbl_unlabel_genl_init(void) | |||
| 218 | return ret_val; | 1632 | return ret_val; |
| 219 | 1633 | ||
| 220 | ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, | 1634 | ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, |
| 1635 | &netlbl_unlabel_genl_c_staticadd); | ||
| 1636 | if (ret_val != 0) | ||
| 1637 | return ret_val; | ||
| 1638 | |||
| 1639 | ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, | ||
| 1640 | &netlbl_unlabel_genl_c_staticremove); | ||
| 1641 | if (ret_val != 0) | ||
| 1642 | return ret_val; | ||
| 1643 | |||
| 1644 | ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, | ||
| 1645 | &netlbl_unlabel_genl_c_staticlist); | ||
| 1646 | if (ret_val != 0) | ||
| 1647 | return ret_val; | ||
| 1648 | |||
| 1649 | ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, | ||
| 1650 | &netlbl_unlabel_genl_c_staticadddef); | ||
| 1651 | if (ret_val != 0) | ||
| 1652 | return ret_val; | ||
| 1653 | |||
| 1654 | ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, | ||
| 1655 | &netlbl_unlabel_genl_c_staticremovedef); | ||
| 1656 | if (ret_val != 0) | ||
| 1657 | return ret_val; | ||
| 1658 | |||
| 1659 | ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, | ||
| 1660 | &netlbl_unlabel_genl_c_staticlistdef); | ||
| 1661 | if (ret_val != 0) | ||
| 1662 | return ret_val; | ||
| 1663 | |||
| 1664 | ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, | ||
| 221 | &netlbl_unlabel_genl_c_accept); | 1665 | &netlbl_unlabel_genl_c_accept); |
| 222 | if (ret_val != 0) | 1666 | if (ret_val != 0) |
| 223 | return ret_val; | 1667 | return ret_val; |
| @@ -234,8 +1678,58 @@ int netlbl_unlabel_genl_init(void) | |||
| 234 | * NetLabel KAPI Hooks | 1678 | * NetLabel KAPI Hooks |
| 235 | */ | 1679 | */ |
| 236 | 1680 | ||
| 1681 | static struct notifier_block netlbl_unlhsh_netdev_notifier = { | ||
| 1682 | .notifier_call = netlbl_unlhsh_netdev_handler, | ||
| 1683 | }; | ||
| 1684 | |||
| 1685 | /** | ||
| 1686 | * netlbl_unlabel_init - Initialize the unlabeled connection hash table | ||
| 1687 | * @size: the number of bits to use for the hash buckets | ||
| 1688 | * | ||
| 1689 | * Description: | ||
| 1690 | * Initializes the unlabeled connection hash table and registers a network | ||
| 1691 | * device notification handler. This function should only be called by the | ||
| 1692 | * NetLabel subsystem itself during initialization. Returns zero on success, | ||
| 1693 | * non-zero values on error. | ||
| 1694 | * | ||
| 1695 | */ | ||
| 1696 | int netlbl_unlabel_init(u32 size) | ||
| 1697 | { | ||
| 1698 | u32 iter; | ||
| 1699 | struct netlbl_unlhsh_tbl *hsh_tbl; | ||
| 1700 | |||
| 1701 | if (size == 0) | ||
| 1702 | return -EINVAL; | ||
| 1703 | |||
| 1704 | hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); | ||
| 1705 | if (hsh_tbl == NULL) | ||
| 1706 | return -ENOMEM; | ||
| 1707 | hsh_tbl->size = 1 << size; | ||
| 1708 | hsh_tbl->tbl = kcalloc(hsh_tbl->size, | ||
| 1709 | sizeof(struct list_head), | ||
| 1710 | GFP_KERNEL); | ||
| 1711 | if (hsh_tbl->tbl == NULL) { | ||
| 1712 | kfree(hsh_tbl); | ||
| 1713 | return -ENOMEM; | ||
| 1714 | } | ||
| 1715 | for (iter = 0; iter < hsh_tbl->size; iter++) | ||
| 1716 | INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); | ||
| 1717 | |||
| 1718 | rcu_read_lock(); | ||
| 1719 | spin_lock(&netlbl_unlhsh_lock); | ||
| 1720 | rcu_assign_pointer(netlbl_unlhsh, hsh_tbl); | ||
| 1721 | spin_unlock(&netlbl_unlhsh_lock); | ||
| 1722 | rcu_read_unlock(); | ||
| 1723 | |||
| 1724 | register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier); | ||
| 1725 | |||
| 1726 | return 0; | ||
| 1727 | } | ||
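The table is sized as 1 << size buckets, so the header's NETLBL_UNLHSH_BITSIZE of 7 (see below) yields 128 buckets. A power-of-two size lets the bucket be selected by masking; the real hash function lives earlier in this patch and may differ, but the index computation is of this general shape (illustrative):

    static u32 demo_bucket(int ifindex, u32 bkt_bits)
    {
            return ifindex & ((1u << bkt_bits) - 1);  /* 7 bits -> 0..127 */
    }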
| 1728 | |||
| 237 | /** | 1729 | /** |
| 238 | * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet | 1730 | * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet |
| 1731 | * @skb: the packet | ||
| 1732 | * @family: protocol family | ||
| 239 | * @secattr: the security attributes | 1733 | * @secattr: the security attributes |
| 240 | * | 1734 | * |
| 241 | * Description: | 1735 | * Description: |
| @@ -243,19 +1737,52 @@ int netlbl_unlabel_genl_init(void) | |||
| 243 | * them in @secattr. Returns zero on success and negative values on failure. | 1737 | * them in @secattr. Returns zero on success and negative values on failure. |
| 244 | * | 1738 | * |
| 245 | */ | 1739 | */ |
| 246 | int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr) | 1740 | int netlbl_unlabel_getattr(const struct sk_buff *skb, |
| 1741 | u16 family, | ||
| 1742 | struct netlbl_lsm_secattr *secattr) | ||
| 247 | { | 1743 | { |
| 248 | int ret_val; | 1744 | struct iphdr *hdr4; |
| 1745 | struct ipv6hdr *hdr6; | ||
| 1746 | struct netlbl_unlhsh_addr4 *addr4; | ||
| 1747 | struct netlbl_unlhsh_addr6 *addr6; | ||
| 1748 | struct netlbl_unlhsh_iface *iface; | ||
| 249 | 1749 | ||
| 250 | rcu_read_lock(); | 1750 | rcu_read_lock(); |
| 251 | if (netlabel_unlabel_acceptflg == 1) { | 1751 | iface = netlbl_unlhsh_search_iface_def(skb->iif); |
| 252 | netlbl_secattr_init(secattr); | 1752 | if (iface == NULL) |
| 253 | ret_val = 0; | 1753 | goto unlabel_getattr_nolabel; |
| 254 | } else | 1754 | switch (family) { |
| 255 | ret_val = -ENOMSG; | 1755 | case PF_INET: |
| 1756 | hdr4 = ip_hdr(skb); | ||
| 1757 | addr4 = netlbl_unlhsh_search_addr4(hdr4->saddr, iface); | ||
| 1758 | if (addr4 == NULL) | ||
| 1759 | goto unlabel_getattr_nolabel; | ||
| 1760 | secattr->attr.secid = addr4->secid; | ||
| 1761 | break; | ||
| 1762 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
| 1763 | case PF_INET6: | ||
| 1764 | hdr6 = ipv6_hdr(skb); | ||
| 1765 | addr6 = netlbl_unlhsh_search_addr6(&hdr6->saddr, iface); | ||
| 1766 | if (addr6 == NULL) | ||
| 1767 | goto unlabel_getattr_nolabel; | ||
| 1768 | secattr->attr.secid = addr6->secid; | ||
| 1769 | break; | ||
| 1770 | #endif /* IPv6 */ | ||
| 1771 | default: | ||
| 1772 | goto unlabel_getattr_nolabel; | ||
| 1773 | } | ||
| 256 | rcu_read_unlock(); | 1774 | rcu_read_unlock(); |
| 257 | 1775 | ||
| 258 | return ret_val; | 1776 | secattr->flags |= NETLBL_SECATTR_SECID; |
| 1777 | secattr->type = NETLBL_NLTYPE_UNLABELED; | ||
| 1778 | return 0; | ||
| 1779 | |||
| 1780 | unlabel_getattr_nolabel: | ||
| 1781 | rcu_read_unlock(); | ||
| 1782 | if (netlabel_unlabel_acceptflg == 0) | ||
| 1783 | return -ENOMSG; | ||
| 1784 | secattr->type = NETLBL_NLTYPE_UNLABELED; | ||
| 1785 | return 0; | ||
| 259 | } | 1786 | } |
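A hedged sketch of how an LSM consumes the reworked interface; the skb and peer_sid variables belong to a hypothetical surrounding hook:

    struct netlbl_lsm_secattr secattr;
    u32 peer_sid = SECSID_NULL;

    netlbl_secattr_init(&secattr);
    if (netlbl_unlabel_getattr(skb, PF_INET, &secattr) == 0 &&
        (secattr.flags & NETLBL_SECATTR_SECID))
            peer_sid = secattr.attr.secid;  /* a static label matched */
    netlbl_secattr_destroy(&secattr);

When the accept flag lets an unmatched packet through, the function returns 0 without setting NETLBL_SECATTR_SECID, so such callers still see no peer label.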
| 260 | 1787 | ||
| 261 | /** | 1788 | /** |
diff --git a/net/netlabel/netlabel_unlabeled.h b/net/netlabel/netlabel_unlabeled.h index c2917fbb42cf..06b1301ac072 100644 --- a/net/netlabel/netlabel_unlabeled.h +++ b/net/netlabel/netlabel_unlabeled.h | |||
| @@ -36,6 +36,116 @@ | |||
| 36 | /* | 36 | /* |
| 37 | * The following NetLabel payloads are supported by the Unlabeled subsystem. | 37 | * The following NetLabel payloads are supported by the Unlabeled subsystem. |
| 38 | * | 38 | * |
| 39 | * o STATICADD | ||
| 40 | * This message is sent from an application to add a new static label for | ||
| 41 | * incoming unlabeled connections. | ||
| 42 | * | ||
| 43 | * Required attributes: | ||
| 44 | * | ||
| 45 | * NLBL_UNLABEL_A_IFACE | ||
| 46 | * NLBL_UNLABEL_A_SECCTX | ||
| 47 | * | ||
| 48 | * If IPv4 is specified the following attributes are required: | ||
| 49 | * | ||
| 50 | * NLBL_UNLABEL_A_IPV4ADDR | ||
| 51 | * NLBL_UNLABEL_A_IPV4MASK | ||
| 52 | * | ||
| 53 | * If IPv6 is specified the following attributes are required: | ||
| 54 | * | ||
| 55 | * NLBL_UNLABEL_A_IPV6ADDR | ||
| 56 | * NLBL_UNLABEL_A_IPV6MASK | ||
| 57 | * | ||
| 58 | * o STATICREMOVE | ||
| 59 | * This message is sent from an application to remove an existing static | ||
| 60 | * label for incoming unlabeled connections. | ||
| 61 | * | ||
| 62 | * Required attributes: | ||
| 63 | * | ||
| 64 | * NLBL_UNLABEL_A_IFACE | ||
| 65 | * | ||
| 66 | * If IPv4 is specified the following attributes are required: | ||
| 67 | * | ||
| 68 | * NLBL_UNLABEL_A_IPV4ADDR | ||
| 69 | * NLBL_UNLABEL_A_IPV4MASK | ||
| 70 | * | ||
| 71 | * If IPv6 is specified the following attributes are required: | ||
| 72 | * | ||
| 73 | * NLBL_UNLABEL_A_IPV6ADDR | ||
| 74 | * NLBL_UNLABEL_A_IPV6MASK | ||
| 75 | * | ||
| 76 | * o STATICLIST | ||
| 77 | * This message can be sent either from an application or by the kernel in | ||
| 78 | * response to an application generated STATICLIST message. When sent by an | ||
| 79 | * application there is no payload and the NLM_F_DUMP flag should be set. | ||
| 80 | * The kernel should respond with a series of the following messages. | ||
| 81 | * | ||
| 82 | * Required attributes: | ||
| 83 | * | ||
| 84 | * NLBL_UNLABEL_A_IFACE | ||
| 85 | * NLBL_UNLABEL_A_SECCTX | ||
| 86 | * | ||
| 87 | * If IPv4 is specified the following attributes are required: | ||
| 88 | * | ||
| 89 | * NLBL_UNLABEL_A_IPV4ADDR | ||
| 90 | * NLBL_UNLABEL_A_IPV4MASK | ||
| 91 | * | ||
| 92 | * If IPv6 is specified the following attributes are required: | ||
| 93 | * | ||
| 94 | * NLBL_UNLABEL_A_IPV6ADDR | ||
| 95 | * NLBL_UNLABEL_A_IPV6MASK | ||
| 96 | * | ||
| 97 | * o STATICADDDEF | ||
| 98 | * This message is sent from an application to set the default static | ||
| 99 | * label for incoming unlabeled connections. | ||
| 100 | * | ||
| 101 | * Required attribute: | ||
| 102 | * | ||
| 103 | * NLBL_UNLABEL_A_SECCTX | ||
| 104 | * | ||
| 105 | * If IPv4 is specified the following attributes are required: | ||
| 106 | * | ||
| 107 | * NLBL_UNLABEL_A_IPV4ADDR | ||
| 108 | * NLBL_UNLABEL_A_IPV4MASK | ||
| 109 | * | ||
| 110 | * If IPv6 is specified the following attributes are required: | ||
| 111 | * | ||
| 112 | * NLBL_UNLABEL_A_IPV6ADDR | ||
| 113 | * NLBL_UNLABEL_A_IPV6MASK | ||
| 114 | * | ||
| 115 | * o STATICREMOVEDEF | ||
| 116 | * This message is sent from an application to remove the existing default | ||
| 117 | * static label for incoming unlabeled connections. | ||
| 118 | * | ||
| 119 | * If IPv4 is specified the following attributes are required: | ||
| 120 | * | ||
| 121 | * NLBL_UNLABEL_A_IPV4ADDR | ||
| 122 | * NLBL_UNLABEL_A_IPV4MASK | ||
| 123 | * | ||
| 124 | * If IPv6 is specified the following attributes are required: | ||
| 125 | * | ||
| 126 | * NLBL_UNLABEL_A_IPV6ADDR | ||
| 127 | * NLBL_UNLABEL_A_IPV6MASK | ||
| 128 | * | ||
| 129 | * o STATICLISTDEF | ||
| 130 | * This message can be sent either from an application or by the kernel in | ||
| 131 | * response to an application generated STATICLISTDEF message. When sent by | ||
| 132 | * an application there is no payload and the NLM_F_DUMP flag should be set. | ||
| 133 | * The kernel should respond with the following message. | ||
| 134 | * | ||
| 135 | * Required attribute: | ||
| 136 | * | ||
| 137 | * NLBL_UNLABEL_A_SECCTX | ||
| 138 | * | ||
| 139 | * If IPv4 is specified the following attributes are required: | ||
| 140 | * | ||
| 141 | * NLBL_UNLABEL_A_IPV4ADDR | ||
| 142 | * NLBL_UNLABEL_A_IPV4MASK | ||
| 143 | * | ||
| 144 | * If IPv6 is specified the following attributes are required: | ||
| 145 | * | ||
| 146 | * NLBL_UNLABEL_A_IPV6ADDR | ||
| 147 | * NLBL_UNLABEL_A_IPV6MASK | ||
| 148 | * | ||
| 39 | * o ACCEPT | 149 | * o ACCEPT |
| 40 | * This message is sent from an application to specify if the kernel should | 150 | * This message is sent from an application to specify if the kernel should |
| 41 | * allow unlabeled packets to pass if they do not match any of the static | 151 | * allow unlabeled packets to pass if they do not match any of the static |
| @@ -62,6 +172,12 @@ enum { | |||
| 62 | NLBL_UNLABEL_C_UNSPEC, | 172 | NLBL_UNLABEL_C_UNSPEC, |
| 63 | NLBL_UNLABEL_C_ACCEPT, | 173 | NLBL_UNLABEL_C_ACCEPT, |
| 64 | NLBL_UNLABEL_C_LIST, | 174 | NLBL_UNLABEL_C_LIST, |
| 175 | NLBL_UNLABEL_C_STATICADD, | ||
| 176 | NLBL_UNLABEL_C_STATICREMOVE, | ||
| 177 | NLBL_UNLABEL_C_STATICLIST, | ||
| 178 | NLBL_UNLABEL_C_STATICADDDEF, | ||
| 179 | NLBL_UNLABEL_C_STATICREMOVEDEF, | ||
| 180 | NLBL_UNLABEL_C_STATICLISTDEF, | ||
| 65 | __NLBL_UNLABEL_C_MAX, | 181 | __NLBL_UNLABEL_C_MAX, |
| 66 | }; | 182 | }; |
| 67 | #define NLBL_UNLABEL_C_MAX (__NLBL_UNLABEL_C_MAX - 1) | 183 | #define NLBL_UNLABEL_C_MAX (__NLBL_UNLABEL_C_MAX - 1) |
| @@ -73,6 +189,24 @@ enum { | |||
| 73 | /* (NLA_U8) | 189 | /* (NLA_U8) |
| 74 | * if true then unlabeled packets are allowed to pass, else unlabeled | 190 | * if true then unlabeled packets are allowed to pass, else unlabeled |
| 75 | * packets are rejected */ | 191 | * packets are rejected */ |
| 192 | NLBL_UNLABEL_A_IPV6ADDR, | ||
| 193 | /* (NLA_BINARY, struct in6_addr) | ||
| 194 | * an IPv6 address */ | ||
| 195 | NLBL_UNLABEL_A_IPV6MASK, | ||
| 196 | /* (NLA_BINARY, struct in6_addr) | ||
| 197 | * an IPv6 address mask */ | ||
| 198 | NLBL_UNLABEL_A_IPV4ADDR, | ||
| 199 | /* (NLA_BINARY, struct in_addr) | ||
| 200 | * an IPv4 address */ | ||
| 201 | NLBL_UNLABEL_A_IPV4MASK, | ||
| 202 | /* (NLA_BINARY, struct in_addr) | ||
| 203 | * an IPv4 address mask */ | ||
| 204 | NLBL_UNLABEL_A_IFACE, | ||
| 205 | /* (NLA_NUL_STRING) | ||
| 206 | * network interface */ | ||
| 207 | NLBL_UNLABEL_A_SECCTX, | ||
| 208 | /* (NLA_BINARY) | ||
| 209 | * a LSM specific security context */ | ||
| 76 | __NLBL_UNLABEL_A_MAX, | 210 | __NLBL_UNLABEL_A_MAX, |
| 77 | }; | 211 | }; |
| 78 | #define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1) | 212 | #define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1) |
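The genl_ops blocks in the .c file all point at netlbl_unlabel_genl_policy, whose definition falls outside the hunks shown. A sketch consistent with the attribute comments above; the variable name and exact length constraints are assumptions:

    #include <linux/if.h>        /* IFNAMSIZ */
    #include <linux/in6.h>       /* struct in6_addr */
    #include <net/netlink.h>     /* struct nla_policy, NLA_* types */

    static const struct nla_policy
    demo_unlabel_policy[NLBL_UNLABEL_A_MAX + 1] = {
            [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 },
            [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY,
                                          .len = sizeof(struct in6_addr) },
            [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY,
                                          .len = sizeof(struct in6_addr) },
            [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY,
                                          .len = sizeof(struct in_addr) },
            [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY,
                                          .len = sizeof(struct in_addr) },
            [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING,
                                       .len = IFNAMSIZ - 1 },
            [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY },
    };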
| @@ -80,8 +214,17 @@ enum { | |||
| 80 | /* NetLabel protocol functions */ | 214 | /* NetLabel protocol functions */ |
| 81 | int netlbl_unlabel_genl_init(void); | 215 | int netlbl_unlabel_genl_init(void); |
| 82 | 216 | ||
| 217 | /* Unlabeled connection hash table size */ | ||
| 218 | /* XXX - currently this number is an uneducated guess */ | ||
| 219 | #define NETLBL_UNLHSH_BITSIZE 7 | ||
| 220 | |||
| 221 | /* General Unlabeled init function */ | ||
| 222 | int netlbl_unlabel_init(u32 size); | ||
| 223 | |||
| 83 | /* Process Unlabeled incoming network packets */ | 224 | /* Process Unlabeled incoming network packets */ |
| 84 | int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr); | 225 | int netlbl_unlabel_getattr(const struct sk_buff *skb, |
| 226 | u16 family, | ||
| 227 | struct netlbl_lsm_secattr *secattr); | ||
| 85 | 228 | ||
| 86 | /* Set the default configuration to allow Unlabeled packets */ | 229 | /* Set the default configuration to allow Unlabeled packets */ |
| 87 | int netlbl_unlabel_defconf(void); | 230 | int netlbl_unlabel_defconf(void); |
diff --git a/security/Kconfig b/security/Kconfig index 8086e61058e3..389e151e3b68 100644 --- a/security/Kconfig +++ b/security/Kconfig | |||
| @@ -76,6 +76,7 @@ config SECURITY_NETWORK_XFRM | |||
| 76 | config SECURITY_CAPABILITIES | 76 | config SECURITY_CAPABILITIES |
| 77 | bool "Default Linux Capabilities" | 77 | bool "Default Linux Capabilities" |
| 78 | depends on SECURITY | 78 | depends on SECURITY |
| 79 | default y | ||
| 79 | help | 80 | help |
| 80 | This enables the "default" Linux capabilities functionality. | 81 | This enables the "default" Linux capabilities functionality. |
| 81 | If you are unsure how to answer this question, answer Y. | 82 | If you are unsure how to answer this question, answer Y. |
diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig index b32a459c0683..2b517d618672 100644 --- a/security/selinux/Kconfig +++ b/security/selinux/Kconfig | |||
| @@ -145,7 +145,7 @@ config SECURITY_SELINUX_POLICYDB_VERSION_MAX | |||
| 145 | config SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE | 145 | config SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE |
| 146 | int "NSA SELinux maximum supported policy format version value" | 146 | int "NSA SELinux maximum supported policy format version value" |
| 147 | depends on SECURITY_SELINUX_POLICYDB_VERSION_MAX | 147 | depends on SECURITY_SELINUX_POLICYDB_VERSION_MAX |
| 148 | range 15 21 | 148 | range 15 22 |
| 149 | default 19 | 149 | default 19 |
| 150 | help | 150 | help |
| 151 | This option sets the value for the maximum policy format version | 151 | This option sets the value for the maximum policy format version |
diff --git a/security/selinux/Makefile b/security/selinux/Makefile index dc3502e30b19..00afd85f1edb 100644 --- a/security/selinux/Makefile +++ b/security/selinux/Makefile | |||
| @@ -4,7 +4,14 @@ | |||
| 4 | 4 | ||
| 5 | obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/ | 5 | obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/ |
| 6 | 6 | ||
| 7 | selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o | 7 | selinux-y := avc.o \ |
| 8 | hooks.o \ | ||
| 9 | selinuxfs.o \ | ||
| 10 | netlink.o \ | ||
| 11 | nlmsgtab.o \ | ||
| 12 | netif.o \ | ||
| 13 | netnode.o \ | ||
| 14 | exports.o | ||
| 8 | 15 | ||
| 9 | selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o | 16 | selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o |
| 10 | 17 | ||
diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 81b3dff3cbf0..e8529e2f51e5 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c | |||
| @@ -661,9 +661,18 @@ void avc_audit(u32 ssid, u32 tsid, | |||
| 661 | "daddr", "dest"); | 661 | "daddr", "dest"); |
| 662 | break; | 662 | break; |
| 663 | } | 663 | } |
| 664 | if (a->u.net.netif) | 664 | if (a->u.net.netif > 0) { |
| 665 | audit_log_format(ab, " netif=%s", | 665 | struct net_device *dev; |
| 666 | a->u.net.netif); | 666 | |
| 667 | /* NOTE: we always use init's namespace */ | ||
| 668 | dev = dev_get_by_index(&init_net, | ||
| 669 | a->u.net.netif); | ||
| 670 | if (dev) { | ||
| 671 | audit_log_format(ab, " netif=%s", | ||
| 672 | dev->name); | ||
| 673 | dev_put(dev); | ||
| 674 | } | ||
| 675 | } | ||
| 667 | break; | 676 | break; |
| 668 | } | 677 | } |
| 669 | } | 678 | } |
diff --git a/security/selinux/exports.c b/security/selinux/exports.c index b6f96943be1f..87d2bb3ea355 100644 --- a/security/selinux/exports.c +++ b/security/selinux/exports.c | |||
| @@ -17,10 +17,14 @@ | |||
| 17 | #include <linux/selinux.h> | 17 | #include <linux/selinux.h> |
| 18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
| 19 | #include <linux/ipc.h> | 19 | #include <linux/ipc.h> |
| 20 | #include <asm/atomic.h> | ||
| 20 | 21 | ||
| 21 | #include "security.h" | 22 | #include "security.h" |
| 22 | #include "objsec.h" | 23 | #include "objsec.h" |
| 23 | 24 | ||
| 25 | /* SECMARK reference count */ | ||
| 26 | extern atomic_t selinux_secmark_refcount; | ||
| 27 | |||
| 24 | int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen) | 28 | int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen) |
| 25 | { | 29 | { |
| 26 | if (selinux_enabled) | 30 | if (selinux_enabled) |
| @@ -74,7 +78,7 @@ int selinux_string_to_sid(char *str, u32 *sid) | |||
| 74 | } | 78 | } |
| 75 | EXPORT_SYMBOL_GPL(selinux_string_to_sid); | 79 | EXPORT_SYMBOL_GPL(selinux_string_to_sid); |
| 76 | 80 | ||
| 77 | int selinux_relabel_packet_permission(u32 sid) | 81 | int selinux_secmark_relabel_packet_permission(u32 sid) |
| 78 | { | 82 | { |
| 79 | if (selinux_enabled) { | 83 | if (selinux_enabled) { |
| 80 | struct task_security_struct *tsec = current->security; | 84 | struct task_security_struct *tsec = current->security; |
| @@ -84,4 +88,16 @@ int selinux_relabel_packet_permission(u32 sid) | |||
| 84 | } | 88 | } |
| 85 | return 0; | 89 | return 0; |
| 86 | } | 90 | } |
| 87 | EXPORT_SYMBOL_GPL(selinux_relabel_packet_permission); | 91 | EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission); |
| 92 | |||
| 93 | void selinux_secmark_refcount_inc(void) | ||
| 94 | { | ||
| 95 | atomic_inc(&selinux_secmark_refcount); | ||
| 96 | } | ||
| 97 | EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc); | ||
| 98 | |||
| 99 | void selinux_secmark_refcount_dec(void) | ||
| 100 | { | ||
| 101 | atomic_dec(&selinux_secmark_refcount); | ||
| 102 | } | ||
| 103 | EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec); | ||
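These exports exist so the SECMARK iptables target can tell SELinux when rules come and go. An illustrative pairing with a target's lifetime callbacks, assuming the declarations land in <linux/selinux.h>; the function names and return conventions are hypothetical, not copied from xt_SECMARK:

    #include <linux/selinux.h>

    static int demo_secmark_checkentry(void)
    {
            selinux_secmark_refcount_inc();  /* a SECMARK rule now exists */
            return 1;
    }

    static void demo_secmark_destroy(void)
    {
            selinux_secmark_refcount_dec();  /* last SECMARK rule removed */
    }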
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 64d414efb404..be6de0b8734f 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c | |||
| @@ -12,8 +12,8 @@ | |||
| 12 | * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> | 12 | * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> |
| 13 | * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. | 13 | * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. |
| 14 | * <dgoeddel@trustedcs.com> | 14 | * <dgoeddel@trustedcs.com> |
| 15 | * Copyright (C) 2006 Hewlett-Packard Development Company, L.P. | 15 | * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. |
| 16 | * Paul Moore, <paul.moore@hp.com> | 16 | * Paul Moore <paul.moore@hp.com> |
| 17 | * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd. | 17 | * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd. |
| 18 | * Yuichi Nakamura <ynakam@hitachisoft.jp> | 18 | * Yuichi Nakamura <ynakam@hitachisoft.jp> |
| 19 | * | 19 | * |
| @@ -50,8 +50,11 @@ | |||
| 50 | #include <net/icmp.h> | 50 | #include <net/icmp.h> |
| 51 | #include <net/ip.h> /* for local_port_range[] */ | 51 | #include <net/ip.h> /* for local_port_range[] */ |
| 52 | #include <net/tcp.h> /* struct or_callable used in sock_rcv_skb */ | 52 | #include <net/tcp.h> /* struct or_callable used in sock_rcv_skb */ |
| 53 | #include <net/net_namespace.h> | ||
| 54 | #include <net/netlabel.h> | ||
| 53 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
| 54 | #include <asm/ioctls.h> | 56 | #include <asm/ioctls.h> |
| 57 | #include <asm/atomic.h> | ||
| 55 | #include <linux/bitops.h> | 58 | #include <linux/bitops.h> |
| 56 | #include <linux/interrupt.h> | 59 | #include <linux/interrupt.h> |
| 57 | #include <linux/netdevice.h> /* for network interface checks */ | 60 | #include <linux/netdevice.h> /* for network interface checks */ |
| @@ -76,6 +79,7 @@ | |||
| 76 | #include "avc.h" | 79 | #include "avc.h" |
| 77 | #include "objsec.h" | 80 | #include "objsec.h" |
| 78 | #include "netif.h" | 81 | #include "netif.h" |
| 82 | #include "netnode.h" | ||
| 79 | #include "xfrm.h" | 83 | #include "xfrm.h" |
| 80 | #include "netlabel.h" | 84 | #include "netlabel.h" |
| 81 | 85 | ||
| @@ -89,6 +93,9 @@ extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm); | |||
| 89 | extern int selinux_compat_net; | 93 | extern int selinux_compat_net; |
| 90 | extern struct security_operations *security_ops; | 94 | extern struct security_operations *security_ops; |
| 91 | 95 | ||
| 96 | /* SECMARK reference count */ | ||
| 97 | atomic_t selinux_secmark_refcount = ATOMIC_INIT(0); | ||
| 98 | |||
| 92 | #ifdef CONFIG_SECURITY_SELINUX_DEVELOP | 99 | #ifdef CONFIG_SECURITY_SELINUX_DEVELOP |
| 93 | int selinux_enforcing = 0; | 100 | int selinux_enforcing = 0; |
| 94 | 101 | ||
| @@ -155,6 +162,21 @@ getsecurity_exit: | |||
| 155 | return len; | 162 | return len; |
| 156 | } | 163 | } |
| 157 | 164 | ||
| 165 | /** | ||
| 166 | * selinux_secmark_enabled - Check to see if SECMARK is currently enabled | ||
| 167 | * | ||
| 168 | * Description: | ||
| 169 | * This function checks the SECMARK reference counter to see if any SECMARK | ||
| 170 | * targets are currently configured; if the reference counter is greater than | ||
| 171 | * zero, SECMARK is considered to be enabled. Returns true (1) if SECMARK is | ||
| 172 | * enabled, false (0) if SECMARK is disabled. | ||
| 173 | * | ||
| 174 | */ | ||
| 175 | static int selinux_secmark_enabled(void) | ||
| 176 | { | ||
| 177 | return (atomic_read(&selinux_secmark_refcount) > 0); | ||
| 178 | } | ||
| 179 | |||
| 158 | /* Allocate and free functions for each kind of security blob. */ | 180 | /* Allocate and free functions for each kind of security blob. */ |
| 159 | 181 | ||
| 160 | static int task_alloc_security(struct task_struct *task) | 182 | static int task_alloc_security(struct task_struct *task) |
| @@ -561,8 +583,8 @@ static int bad_option(struct superblock_security_struct *sbsec, char flag, | |||
| 561 | * Allow filesystems with binary mount data to explicitly set mount point | 583 | * Allow filesystems with binary mount data to explicitly set mount point |
| 562 | * labeling information. | 584 | * labeling information. |
| 563 | */ | 585 | */ |
| 564 | int selinux_set_mnt_opts(struct super_block *sb, char **mount_options, | 586 | static int selinux_set_mnt_opts(struct super_block *sb, char **mount_options, |
| 565 | int *flags, int num_opts) | 587 | int *flags, int num_opts) |
| 566 | { | 588 | { |
| 567 | int rc = 0, i; | 589 | int rc = 0, i; |
| 568 | struct task_security_struct *tsec = current->security; | 590 | struct task_security_struct *tsec = current->security; |
| @@ -3395,7 +3417,7 @@ out: | |||
| 3395 | #endif /* IPV6 */ | 3417 | #endif /* IPV6 */ |
| 3396 | 3418 | ||
| 3397 | static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, | 3419 | static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, |
| 3398 | char **addrp, int *len, int src, u8 *proto) | 3420 | char **addrp, int src, u8 *proto) |
| 3399 | { | 3421 | { |
| 3400 | int ret = 0; | 3422 | int ret = 0; |
| 3401 | 3423 | ||
| @@ -3404,7 +3426,6 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, | |||
| 3404 | ret = selinux_parse_skb_ipv4(skb, ad, proto); | 3426 | ret = selinux_parse_skb_ipv4(skb, ad, proto); |
| 3405 | if (ret || !addrp) | 3427 | if (ret || !addrp) |
| 3406 | break; | 3428 | break; |
| 3407 | *len = 4; | ||
| 3408 | *addrp = (char *)(src ? &ad->u.net.v4info.saddr : | 3429 | *addrp = (char *)(src ? &ad->u.net.v4info.saddr : |
| 3409 | &ad->u.net.v4info.daddr); | 3430 | &ad->u.net.v4info.daddr); |
| 3410 | break; | 3431 | break; |
| @@ -3414,7 +3435,6 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, | |||
| 3414 | ret = selinux_parse_skb_ipv6(skb, ad, proto); | 3435 | ret = selinux_parse_skb_ipv6(skb, ad, proto); |
| 3415 | if (ret || !addrp) | 3436 | if (ret || !addrp) |
| 3416 | break; | 3437 | break; |
| 3417 | *len = 16; | ||
| 3418 | *addrp = (char *)(src ? &ad->u.net.v6info.saddr : | 3438 | *addrp = (char *)(src ? &ad->u.net.v6info.saddr : |
| 3419 | &ad->u.net.v6info.daddr); | 3439 | &ad->u.net.v6info.daddr); |
| 3420 | break; | 3440 | break; |
| @@ -3423,36 +3443,48 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, | |||
| 3423 | break; | 3443 | break; |
| 3424 | } | 3444 | } |
| 3425 | 3445 | ||
| 3446 | if (unlikely(ret)) | ||
| 3447 | printk(KERN_WARNING | ||
| 3448 | "SELinux: failure in selinux_parse_skb()," | ||
| 3449 | " unable to parse packet\n"); | ||
| 3450 | |||
| 3426 | return ret; | 3451 | return ret; |
| 3427 | } | 3452 | } |
| 3428 | 3453 | ||
| 3429 | /** | 3454 | /** |
| 3430 | * selinux_skb_extlbl_sid - Determine the external label of a packet | 3455 | * selinux_skb_peerlbl_sid - Determine the peer label of a packet |
| 3431 | * @skb: the packet | 3456 | * @skb: the packet |
| 3432 | * @sid: the packet's SID | 3457 | * @family: protocol family |
| 3458 | * @sid: the packet's peer label SID | ||
| 3433 | * | 3459 | * |
| 3434 | * Description: | 3460 | * Description: |
| 3435 | * Check the various different forms of external packet labeling and determine | 3461 | * Check the various different forms of network peer labeling and determine |
| 3436 | * the external SID for the packet. If only one form of external labeling is | 3462 | * the peer label/SID for the packet; most of the magic actually occurs in |
| 3437 | * present then it is used, if both labeled IPsec and NetLabel labels are | 3463 | * the security server function security_net_peersid_cmp(). The function |
| 3438 | * present then the SELinux type information is taken from the labeled IPsec | 3464 | * returns zero if the value in @sid is valid (although it may be SECSID_NULL) |
| 3439 | * SA and the MLS sensitivity label information is taken from the NetLabel | 3465 | * or -EACCES if @sid is invalid due to inconsistencies with the different |
| 3440 | * security attributes. This bit of "magic" is done in the call to | 3466 | * peer labels. |
| 3441 | * selinux_netlbl_skbuff_getsid(). | ||
| 3442 | * | 3467 | * |
| 3443 | */ | 3468 | */ |
| 3444 | static void selinux_skb_extlbl_sid(struct sk_buff *skb, u32 *sid) | 3469 | static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid) |
| 3445 | { | 3470 | { |
| 3471 | int err; | ||
| 3446 | u32 xfrm_sid; | 3472 | u32 xfrm_sid; |
| 3447 | u32 nlbl_sid; | 3473 | u32 nlbl_sid; |
| 3474 | u32 nlbl_type; | ||
| 3448 | 3475 | ||
| 3449 | selinux_skb_xfrm_sid(skb, &xfrm_sid); | 3476 | selinux_skb_xfrm_sid(skb, &xfrm_sid); |
| 3450 | if (selinux_netlbl_skbuff_getsid(skb, | 3477 | selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid); |
| 3451 | (xfrm_sid == SECSID_NULL ? | 3478 | |
| 3452 | SECINITSID_NETMSG : xfrm_sid), | 3479 | err = security_net_peersid_resolve(nlbl_sid, nlbl_type, xfrm_sid, sid); |
| 3453 | &nlbl_sid) != 0) | 3480 | if (unlikely(err)) { |
| 3454 | nlbl_sid = SECSID_NULL; | 3481 | printk(KERN_WARNING |
| 3455 | *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid); | 3482 | "SELinux: failure in selinux_skb_peerlbl_sid()," |
| 3483 | " unable to determine packet's peer label\n"); | ||
| 3484 | return -EACCES; | ||
| 3485 | } | ||
| 3486 | |||
| 3487 | return 0; | ||
| 3456 | } | 3488 | } |
| 3457 | 3489 | ||
| 3458 | /* socket security operations */ | 3490 | /* socket security operations */ |
| @@ -3518,6 +3550,7 @@ static int selinux_socket_post_create(struct socket *sock, int family, | |||
| 3518 | if (sock->sk) { | 3550 | if (sock->sk) { |
| 3519 | sksec = sock->sk->sk_security; | 3551 | sksec = sock->sk->sk_security; |
| 3520 | sksec->sid = isec->sid; | 3552 | sksec->sid = isec->sid; |
| 3553 | sksec->sclass = isec->sclass; | ||
| 3521 | err = selinux_netlbl_socket_post_create(sock); | 3554 | err = selinux_netlbl_socket_post_create(sock); |
| 3522 | } | 3555 | } |
| 3523 | 3556 | ||
| @@ -3610,7 +3643,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in | |||
| 3610 | break; | 3643 | break; |
| 3611 | } | 3644 | } |
| 3612 | 3645 | ||
| 3613 | err = security_node_sid(family, addrp, addrlen, &sid); | 3646 | err = sel_netnode_sid(addrp, family, &sid); |
| 3614 | if (err) | 3647 | if (err) |
| 3615 | goto out; | 3648 | goto out; |
| 3616 | 3649 | ||
| @@ -3821,131 +3854,182 @@ static int selinux_socket_unix_may_send(struct socket *sock, | |||
| 3821 | return 0; | 3854 | return 0; |
| 3822 | } | 3855 | } |
| 3823 | 3856 | ||
| 3824 | static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, | 3857 | static int selinux_inet_sys_rcv_skb(int ifindex, char *addrp, u16 family, |
| 3825 | struct avc_audit_data *ad, u16 family, char *addrp, int len) | 3858 | u32 peer_sid, |
| 3859 | struct avc_audit_data *ad) | ||
| 3826 | { | 3860 | { |
| 3827 | int err = 0; | 3861 | int err; |
| 3828 | u32 netif_perm, node_perm, node_sid, if_sid, recv_perm = 0; | 3862 | u32 if_sid; |
| 3829 | struct socket *sock; | 3863 | u32 node_sid; |
| 3830 | u16 sock_class = 0; | ||
| 3831 | u32 sock_sid = 0; | ||
| 3832 | |||
| 3833 | read_lock_bh(&sk->sk_callback_lock); | ||
| 3834 | sock = sk->sk_socket; | ||
| 3835 | if (sock) { | ||
| 3836 | struct inode *inode; | ||
| 3837 | inode = SOCK_INODE(sock); | ||
| 3838 | if (inode) { | ||
| 3839 | struct inode_security_struct *isec; | ||
| 3840 | isec = inode->i_security; | ||
| 3841 | sock_sid = isec->sid; | ||
| 3842 | sock_class = isec->sclass; | ||
| 3843 | } | ||
| 3844 | } | ||
| 3845 | read_unlock_bh(&sk->sk_callback_lock); | ||
| 3846 | if (!sock_sid) | ||
| 3847 | goto out; | ||
| 3848 | 3864 | ||
| 3849 | if (!skb->dev) | 3865 | err = sel_netif_sid(ifindex, &if_sid); |
| 3850 | goto out; | 3866 | if (err) |
| 3867 | return err; | ||
| 3868 | err = avc_has_perm(peer_sid, if_sid, | ||
| 3869 | SECCLASS_NETIF, NETIF__INGRESS, ad); | ||
| 3870 | if (err) | ||
| 3871 | return err; | ||
| 3851 | 3872 | ||
| 3852 | err = sel_netif_sids(skb->dev, &if_sid, NULL); | 3873 | err = sel_netnode_sid(addrp, family, &node_sid); |
| 3853 | if (err) | 3874 | if (err) |
| 3854 | goto out; | 3875 | return err; |
| 3876 | return avc_has_perm(peer_sid, node_sid, | ||
| 3877 | SECCLASS_NODE, NODE__RECVFROM, ad); | ||
| 3878 | } | ||
| 3879 | |||
| 3880 | static int selinux_sock_rcv_skb_iptables_compat(struct sock *sk, | ||
| 3881 | struct sk_buff *skb, | ||
| 3882 | struct avc_audit_data *ad, | ||
| 3883 | u16 family, | ||
| 3884 | char *addrp) | ||
| 3885 | { | ||
| 3886 | int err; | ||
| 3887 | struct sk_security_struct *sksec = sk->sk_security; | ||
| 3888 | u16 sk_class; | ||
| 3889 | u32 netif_perm, node_perm, recv_perm; | ||
| 3890 | u32 port_sid, node_sid, if_sid, sk_sid; | ||
| 3855 | 3891 | ||
| 3856 | switch (sock_class) { | 3892 | sk_sid = sksec->sid; |
| 3893 | sk_class = sksec->sclass; | ||
| 3894 | |||
| 3895 | switch (sk_class) { | ||
| 3857 | case SECCLASS_UDP_SOCKET: | 3896 | case SECCLASS_UDP_SOCKET: |
| 3858 | netif_perm = NETIF__UDP_RECV; | 3897 | netif_perm = NETIF__UDP_RECV; |
| 3859 | node_perm = NODE__UDP_RECV; | 3898 | node_perm = NODE__UDP_RECV; |
| 3860 | recv_perm = UDP_SOCKET__RECV_MSG; | 3899 | recv_perm = UDP_SOCKET__RECV_MSG; |
| 3861 | break; | 3900 | break; |
| 3862 | |||
| 3863 | case SECCLASS_TCP_SOCKET: | 3901 | case SECCLASS_TCP_SOCKET: |
| 3864 | netif_perm = NETIF__TCP_RECV; | 3902 | netif_perm = NETIF__TCP_RECV; |
| 3865 | node_perm = NODE__TCP_RECV; | 3903 | node_perm = NODE__TCP_RECV; |
| 3866 | recv_perm = TCP_SOCKET__RECV_MSG; | 3904 | recv_perm = TCP_SOCKET__RECV_MSG; |
| 3867 | break; | 3905 | break; |
| 3868 | |||
| 3869 | case SECCLASS_DCCP_SOCKET: | 3906 | case SECCLASS_DCCP_SOCKET: |
| 3870 | netif_perm = NETIF__DCCP_RECV; | 3907 | netif_perm = NETIF__DCCP_RECV; |
| 3871 | node_perm = NODE__DCCP_RECV; | 3908 | node_perm = NODE__DCCP_RECV; |
| 3872 | recv_perm = DCCP_SOCKET__RECV_MSG; | 3909 | recv_perm = DCCP_SOCKET__RECV_MSG; |
| 3873 | break; | 3910 | break; |
| 3874 | |||
| 3875 | default: | 3911 | default: |
| 3876 | netif_perm = NETIF__RAWIP_RECV; | 3912 | netif_perm = NETIF__RAWIP_RECV; |
| 3877 | node_perm = NODE__RAWIP_RECV; | 3913 | node_perm = NODE__RAWIP_RECV; |
| 3914 | recv_perm = 0; | ||
| 3878 | break; | 3915 | break; |
| 3879 | } | 3916 | } |
| 3880 | 3917 | ||
| 3881 | err = avc_has_perm(sock_sid, if_sid, SECCLASS_NETIF, netif_perm, ad); | 3918 | err = sel_netif_sid(skb->iif, &if_sid); |
| 3882 | if (err) | 3919 | if (err) |
| 3883 | goto out; | 3920 | return err; |
| 3884 | 3921 | err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad); | |
| 3885 | err = security_node_sid(family, addrp, len, &node_sid); | ||
| 3886 | if (err) | 3922 | if (err) |
| 3887 | goto out; | 3923 | return err; |
| 3888 | 3924 | ||
| 3889 | err = avc_has_perm(sock_sid, node_sid, SECCLASS_NODE, node_perm, ad); | 3925 | err = sel_netnode_sid(addrp, family, &node_sid); |
| 3890 | if (err) | 3926 | if (err) |
| 3891 | goto out; | 3927 | return err; |
| 3928 | err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad); | ||
| 3929 | if (err) | ||
| 3930 | return err; | ||
| 3892 | 3931 | ||
| 3893 | if (recv_perm) { | 3932 | if (!recv_perm) |
| 3894 | u32 port_sid; | 3933 | return 0; |
| 3934 | err = security_port_sid(sk->sk_family, sk->sk_type, | ||
| 3935 | sk->sk_protocol, ntohs(ad->u.net.sport), | ||
| 3936 | &port_sid); | ||
| 3937 | if (unlikely(err)) { | ||
| 3938 | printk(KERN_WARNING | ||
| 3939 | "SELinux: failure in" | ||
| 3940 | " selinux_sock_rcv_skb_iptables_compat()," | ||
| 3941 | " network port label not found\n"); | ||
| 3942 | return err; | ||
| 3943 | } | ||
| 3944 | return avc_has_perm(sk_sid, port_sid, sk_class, recv_perm, ad); | ||
| 3945 | } | ||
| 3895 | 3946 | ||
| 3896 | err = security_port_sid(sk->sk_family, sk->sk_type, | 3947 | static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, |
| 3897 | sk->sk_protocol, ntohs(ad->u.net.sport), | 3948 | struct avc_audit_data *ad, |
| 3898 | &port_sid); | 3949 | u16 family, char *addrp) |
| 3899 | if (err) | 3950 | { |
| 3900 | goto out; | 3951 | int err; |
| 3952 | struct sk_security_struct *sksec = sk->sk_security; | ||
| 3953 | u32 peer_sid; | ||
| 3954 | u32 sk_sid = sksec->sid; | ||
| 3901 | 3955 | ||
| 3902 | err = avc_has_perm(sock_sid, port_sid, | 3956 | if (selinux_compat_net) |
| 3903 | sock_class, recv_perm, ad); | 3957 | err = selinux_sock_rcv_skb_iptables_compat(sk, skb, ad, |
| 3958 | family, addrp); | ||
| 3959 | else | ||
| 3960 | err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, | ||
| 3961 | PACKET__RECV, ad); | ||
| 3962 | if (err) | ||
| 3963 | return err; | ||
| 3964 | |||
| 3965 | if (selinux_policycap_netpeer) { | ||
| 3966 | err = selinux_skb_peerlbl_sid(skb, family, &peer_sid); | ||
| 3967 | if (err) | ||
| 3968 | return err; | ||
| 3969 | err = avc_has_perm(sk_sid, peer_sid, | ||
| 3970 | SECCLASS_PEER, PEER__RECV, ad); | ||
| 3971 | } else { | ||
| 3972 | err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, ad); | ||
| 3973 | if (err) | ||
| 3974 | return err; | ||
| 3975 | err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, ad); | ||
| 3904 | } | 3976 | } |
| 3905 | 3977 | ||
| 3906 | out: | ||
| 3907 | return err; | 3978 | return err; |
| 3908 | } | 3979 | } |
| 3909 | 3980 | ||
| 3910 | static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) | 3981 | static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) |
| 3911 | { | 3982 | { |
| 3912 | u16 family; | 3983 | int err; |
| 3913 | char *addrp; | ||
| 3914 | int len, err = 0; | ||
| 3915 | struct avc_audit_data ad; | ||
| 3916 | struct sk_security_struct *sksec = sk->sk_security; | 3984 | struct sk_security_struct *sksec = sk->sk_security; |
| 3985 | u16 family = sk->sk_family; | ||
| 3986 | u32 sk_sid = sksec->sid; | ||
| 3987 | struct avc_audit_data ad; | ||
| 3988 | char *addrp; | ||
| 3917 | 3989 | ||
| 3918 | family = sk->sk_family; | ||
| 3919 | if (family != PF_INET && family != PF_INET6) | 3990 | if (family != PF_INET && family != PF_INET6) |
| 3920 | goto out; | 3991 | return 0; |
| 3921 | 3992 | ||
| 3922 | /* Handle mapped IPv4 packets arriving via IPv6 sockets */ | 3993 | /* Handle mapped IPv4 packets arriving via IPv6 sockets */ |
| 3923 | if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) | 3994 | if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) |
| 3924 | family = PF_INET; | 3995 | family = PF_INET; |
| 3925 | 3996 | ||
| 3926 | AVC_AUDIT_DATA_INIT(&ad, NET); | 3997 | AVC_AUDIT_DATA_INIT(&ad, NET); |
| 3927 | ad.u.net.netif = skb->dev ? skb->dev->name : "[unknown]"; | 3998 | ad.u.net.netif = skb->iif; |
| 3928 | ad.u.net.family = family; | 3999 | ad.u.net.family = family; |
| 3929 | 4000 | err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); | |
| 3930 | err = selinux_parse_skb(skb, &ad, &addrp, &len, 1, NULL); | ||
| 3931 | if (err) | 4001 | if (err) |
| 3932 | goto out; | 4002 | return err; |
| 3933 | 4003 | ||
| 3934 | if (selinux_compat_net) | 4004 | /* If any sort of compatibility mode is enabled then hand off processing |
| 3935 | err = selinux_sock_rcv_skb_compat(sk, skb, &ad, family, | 4005 | * to the selinux_sock_rcv_skb_compat() function to deal with the |
| 3936 | addrp, len); | 4006 | * special handling. We do this in an attempt to keep this function |
| 3937 | else | 4007 | * as fast and as clean as possible. */ |
| 3938 | err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET, | 4008 | if (selinux_compat_net || !selinux_policycap_netpeer) |
| 3939 | PACKET__RECV, &ad); | 4009 | return selinux_sock_rcv_skb_compat(sk, skb, &ad, |
| 3940 | if (err) | 4010 | family, addrp); |
| 3941 | goto out; | ||
| 3942 | 4011 | ||
| 3943 | err = selinux_netlbl_sock_rcv_skb(sksec, skb, &ad); | 4012 | if (netlbl_enabled() || selinux_xfrm_enabled()) { |
| 3944 | if (err) | 4013 | u32 peer_sid; |
| 3945 | goto out; | 4014 | |
| 4015 | err = selinux_skb_peerlbl_sid(skb, family, &peer_sid); | ||
| 4016 | if (err) | ||
| 4017 | return err; | ||
| 4018 | err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family, | ||
| 4019 | peer_sid, &ad); | ||
| 4020 | if (err) | ||
| 4021 | return err; | ||
| 4022 | err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER, | ||
| 4023 | PEER__RECV, &ad); | ||
| 4024 | } | ||
| 4025 | |||
| 4026 | if (selinux_secmark_enabled()) { | ||
| 4027 | err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, | ||
| 4028 | PACKET__RECV, &ad); | ||
| 4029 | if (err) | ||
| 4030 | return err; | ||
| 4031 | } | ||
| 3946 | 4032 | ||
| 3947 | err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad); | ||
| 3948 | out: | ||
| 3949 | return err; | 4033 | return err; |
| 3950 | } | 4034 | } |
| 3951 | 4035 | ||
| @@ -3996,18 +4080,25 @@ out: | |||
| 3996 | static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) | 4080 | static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) |
| 3997 | { | 4081 | { |
| 3998 | u32 peer_secid = SECSID_NULL; | 4082 | u32 peer_secid = SECSID_NULL; |
| 3999 | int err = 0; | 4083 | u16 family; |
| 4084 | |||
| 4085 | if (sock) | ||
| 4086 | family = sock->sk->sk_family; | ||
| 4087 | else if (skb && skb->sk) | ||
| 4088 | family = skb->sk->sk_family; | ||
| 4089 | else | ||
| 4090 | goto out; | ||
| 4000 | 4091 | ||
| 4001 | if (sock && sock->sk->sk_family == PF_UNIX) | 4092 | if (sock && family == PF_UNIX) |
| 4002 | selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid); | 4093 | selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid); |
| 4003 | else if (skb) | 4094 | else if (skb) |
| 4004 | selinux_skb_extlbl_sid(skb, &peer_secid); | 4095 | selinux_skb_peerlbl_sid(skb, family, &peer_secid); |
| 4005 | 4096 | ||
| 4006 | if (peer_secid == SECSID_NULL) | 4097 | out: |
| 4007 | err = -EINVAL; | ||
| 4008 | *secid = peer_secid; | 4098 | *secid = peer_secid; |
| 4009 | 4099 | if (peer_secid == SECSID_NULL) | |
| 4010 | return err; | 4100 | return -EINVAL; |
| 4101 | return 0; | ||
| 4011 | } | 4102 | } |
| 4012 | 4103 | ||
| 4013 | static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority) | 4104 | static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority) |
| @@ -4027,6 +4118,7 @@ static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk) | |||
| 4027 | 4118 | ||
| 4028 | newssec->sid = ssec->sid; | 4119 | newssec->sid = ssec->sid; |
| 4029 | newssec->peer_sid = ssec->peer_sid; | 4120 | newssec->peer_sid = ssec->peer_sid; |
| 4121 | newssec->sclass = ssec->sclass; | ||
| 4030 | 4122 | ||
| 4031 | selinux_netlbl_sk_security_clone(ssec, newssec); | 4123 | selinux_netlbl_sk_security_clone(ssec, newssec); |
| 4032 | } | 4124 | } |
| @@ -4050,6 +4142,7 @@ static void selinux_sock_graft(struct sock* sk, struct socket *parent) | |||
| 4050 | if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 || | 4142 | if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 || |
| 4051 | sk->sk_family == PF_UNIX) | 4143 | sk->sk_family == PF_UNIX) |
| 4052 | isec->sid = sksec->sid; | 4144 | isec->sid = sksec->sid; |
| 4145 | sksec->sclass = isec->sclass; | ||
| 4053 | 4146 | ||
| 4054 | selinux_netlbl_sock_graft(sk, parent); | 4147 | selinux_netlbl_sock_graft(sk, parent); |
| 4055 | } | 4148 | } |
| @@ -4062,7 +4155,9 @@ static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb, | |||
| 4062 | u32 newsid; | 4155 | u32 newsid; |
| 4063 | u32 peersid; | 4156 | u32 peersid; |
| 4064 | 4157 | ||
| 4065 | selinux_skb_extlbl_sid(skb, &peersid); | 4158 | err = selinux_skb_peerlbl_sid(skb, sk->sk_family, &peersid); |
| 4159 | if (err) | ||
| 4160 | return err; | ||
| 4066 | if (peersid == SECSID_NULL) { | 4161 | if (peersid == SECSID_NULL) { |
| 4067 | req->secid = sksec->sid; | 4162 | req->secid = sksec->sid; |
| 4068 | req->peer_secid = SECSID_NULL; | 4163 | req->peer_secid = SECSID_NULL; |
| @@ -4100,7 +4195,7 @@ static void selinux_inet_conn_established(struct sock *sk, | |||
| 4100 | { | 4195 | { |
| 4101 | struct sk_security_struct *sksec = sk->sk_security; | 4196 | struct sk_security_struct *sksec = sk->sk_security; |
| 4102 | 4197 | ||
| 4103 | selinux_skb_extlbl_sid(skb, &sksec->peer_sid); | 4198 | selinux_skb_peerlbl_sid(skb, sk->sk_family, &sksec->peer_sid); |
| 4104 | } | 4199 | } |
| 4105 | 4200 | ||
| 4106 | static void selinux_req_classify_flow(const struct request_sock *req, | 4201 | static void selinux_req_classify_flow(const struct request_sock *req, |
| @@ -4147,149 +4242,260 @@ out: | |||
| 4147 | 4242 | ||
| 4148 | #ifdef CONFIG_NETFILTER | 4243 | #ifdef CONFIG_NETFILTER |
| 4149 | 4244 | ||
| 4150 | static int selinux_ip_postroute_last_compat(struct sock *sk, struct net_device *dev, | 4245 | static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex, |
| 4151 | struct avc_audit_data *ad, | 4246 | u16 family) |
| 4152 | u16 family, char *addrp, int len) | ||
| 4153 | { | 4247 | { |
| 4154 | int err = 0; | 4248 | char *addrp; |
| 4155 | u32 netif_perm, node_perm, node_sid, if_sid, send_perm = 0; | 4249 | u32 peer_sid; |
| 4156 | struct socket *sock; | 4250 | struct avc_audit_data ad; |
| 4157 | struct inode *inode; | 4251 | u8 secmark_active; |
| 4158 | struct inode_security_struct *isec; | 4252 | u8 peerlbl_active; |
| 4159 | 4253 | ||
| 4160 | sock = sk->sk_socket; | 4254 | if (!selinux_policycap_netpeer) |
| 4161 | if (!sock) | 4255 | return NF_ACCEPT; |
| 4162 | goto out; | ||
| 4163 | 4256 | ||
| 4164 | inode = SOCK_INODE(sock); | 4257 | secmark_active = selinux_secmark_enabled(); |
| 4165 | if (!inode) | 4258 | peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled(); |
| 4166 | goto out; | 4259 | if (!secmark_active && !peerlbl_active) |
| 4260 | return NF_ACCEPT; | ||
| 4167 | 4261 | ||
| 4168 | isec = inode->i_security; | 4262 | AVC_AUDIT_DATA_INIT(&ad, NET); |
| 4169 | 4263 | ad.u.net.netif = ifindex; | |
| 4170 | err = sel_netif_sids(dev, &if_sid, NULL); | 4264 | ad.u.net.family = family; |
| 4171 | if (err) | 4265 | if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0) |
| 4172 | goto out; | 4266 | return NF_DROP; |
| 4267 | |||
| 4268 | if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0) | ||
| 4269 | return NF_DROP; | ||
| 4270 | |||
| 4271 | if (peerlbl_active) | ||
| 4272 | if (selinux_inet_sys_rcv_skb(ifindex, addrp, family, | ||
| 4273 | peer_sid, &ad) != 0) | ||
| 4274 | return NF_DROP; | ||
| 4275 | |||
| 4276 | if (secmark_active) | ||
| 4277 | if (avc_has_perm(peer_sid, skb->secmark, | ||
| 4278 | SECCLASS_PACKET, PACKET__FORWARD_IN, &ad)) | ||
| 4279 | return NF_DROP; | ||
| 4280 | |||
| 4281 | return NF_ACCEPT; | ||
| 4282 | } | ||
| 4283 | |||
| 4284 | static unsigned int selinux_ipv4_forward(unsigned int hooknum, | ||
| 4285 | struct sk_buff *skb, | ||
| 4286 | const struct net_device *in, | ||
| 4287 | const struct net_device *out, | ||
| 4288 | int (*okfn)(struct sk_buff *)) | ||
| 4289 | { | ||
| 4290 | return selinux_ip_forward(skb, in->ifindex, PF_INET); | ||
| 4291 | } | ||
| 4173 | 4292 | ||
| 4174 | switch (isec->sclass) { | 4293 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 4294 | static unsigned int selinux_ipv6_forward(unsigned int hooknum, | ||
| 4295 | struct sk_buff *skb, | ||
| 4296 | const struct net_device *in, | ||
| 4297 | const struct net_device *out, | ||
| 4298 | int (*okfn)(struct sk_buff *)) | ||
| 4299 | { | ||
| 4300 | return selinux_ip_forward(skb, in->ifindex, PF_INET6); | ||
| 4301 | } | ||
| 4302 | #endif /* IPV6 */ | ||
| 4303 | |||
| 4304 | static int selinux_ip_postroute_iptables_compat(struct sock *sk, | ||
| 4305 | int ifindex, | ||
| 4306 | struct avc_audit_data *ad, | ||
| 4307 | u16 family, char *addrp) | ||
| 4308 | { | ||
| 4309 | int err; | ||
| 4310 | struct sk_security_struct *sksec = sk->sk_security; | ||
| 4311 | u16 sk_class; | ||
| 4312 | u32 netif_perm, node_perm, send_perm; | ||
| 4313 | u32 port_sid, node_sid, if_sid, sk_sid; | ||
| 4314 | |||
| 4315 | sk_sid = sksec->sid; | ||
| 4316 | sk_class = sksec->sclass; | ||
| 4317 | |||
| 4318 | switch (sk_class) { | ||
| 4175 | case SECCLASS_UDP_SOCKET: | 4319 | case SECCLASS_UDP_SOCKET: |
| 4176 | netif_perm = NETIF__UDP_SEND; | 4320 | netif_perm = NETIF__UDP_SEND; |
| 4177 | node_perm = NODE__UDP_SEND; | 4321 | node_perm = NODE__UDP_SEND; |
| 4178 | send_perm = UDP_SOCKET__SEND_MSG; | 4322 | send_perm = UDP_SOCKET__SEND_MSG; |
| 4179 | break; | 4323 | break; |
| 4180 | |||
| 4181 | case SECCLASS_TCP_SOCKET: | 4324 | case SECCLASS_TCP_SOCKET: |
| 4182 | netif_perm = NETIF__TCP_SEND; | 4325 | netif_perm = NETIF__TCP_SEND; |
| 4183 | node_perm = NODE__TCP_SEND; | 4326 | node_perm = NODE__TCP_SEND; |
| 4184 | send_perm = TCP_SOCKET__SEND_MSG; | 4327 | send_perm = TCP_SOCKET__SEND_MSG; |
| 4185 | break; | 4328 | break; |
| 4186 | |||
| 4187 | case SECCLASS_DCCP_SOCKET: | 4329 | case SECCLASS_DCCP_SOCKET: |
| 4188 | netif_perm = NETIF__DCCP_SEND; | 4330 | netif_perm = NETIF__DCCP_SEND; |
| 4189 | node_perm = NODE__DCCP_SEND; | 4331 | node_perm = NODE__DCCP_SEND; |
| 4190 | send_perm = DCCP_SOCKET__SEND_MSG; | 4332 | send_perm = DCCP_SOCKET__SEND_MSG; |
| 4191 | break; | 4333 | break; |
| 4192 | |||
| 4193 | default: | 4334 | default: |
| 4194 | netif_perm = NETIF__RAWIP_SEND; | 4335 | netif_perm = NETIF__RAWIP_SEND; |
| 4195 | node_perm = NODE__RAWIP_SEND; | 4336 | node_perm = NODE__RAWIP_SEND; |
| 4337 | send_perm = 0; | ||
| 4196 | break; | 4338 | break; |
| 4197 | } | 4339 | } |
| 4198 | 4340 | ||
| 4199 | err = avc_has_perm(isec->sid, if_sid, SECCLASS_NETIF, netif_perm, ad); | 4341 | err = sel_netif_sid(ifindex, &if_sid); |
| 4200 | if (err) | 4342 | if (err) |
| 4201 | goto out; | 4343 | return err; |
| 4344 | err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad); | ||
| 4345 | return err; | ||
| 4202 | 4346 | ||
| 4203 | err = security_node_sid(family, addrp, len, &node_sid); | 4347 | err = sel_netnode_sid(addrp, family, &node_sid); |
| 4204 | if (err) | 4348 | if (err) |
| 4205 | goto out; | 4349 | return err; |
| 4206 | 4350 | err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad); | |
| 4207 | err = avc_has_perm(isec->sid, node_sid, SECCLASS_NODE, node_perm, ad); | ||
| 4208 | if (err) | 4351 | if (err) |
| 4209 | goto out; | 4352 | return err; |
| 4210 | 4353 | ||
| 4211 | if (send_perm) { | 4354 | if (!send_perm) |
| 4212 | u32 port_sid; | 4355 | return 0; |
| 4213 | |||
| 4214 | err = security_port_sid(sk->sk_family, | ||
| 4215 | sk->sk_type, | ||
| 4216 | sk->sk_protocol, | ||
| 4217 | ntohs(ad->u.net.dport), | ||
| 4218 | &port_sid); | ||
| 4219 | if (err) | ||
| 4220 | goto out; | ||
| 4221 | 4356 | ||
| 4222 | err = avc_has_perm(isec->sid, port_sid, isec->sclass, | 4357 | err = security_port_sid(sk->sk_family, sk->sk_type, |
| 4223 | send_perm, ad); | 4358 | sk->sk_protocol, ntohs(ad->u.net.dport), |
| 4359 | &port_sid); | ||
| 4360 | if (unlikely(err)) { | ||
| 4361 | printk(KERN_WARNING | ||
| 4362 | "SELinux: failure in" | ||
| 4363 | " selinux_ip_postroute_iptables_compat()," | ||
| 4364 | " network port label not found\n"); | ||
| 4365 | return err; | ||
| 4224 | } | 4366 | } |
| 4225 | out: | 4367 | return avc_has_perm(sk_sid, port_sid, sk_class, send_perm, ad); |
| 4226 | return err; | ||
| 4227 | } | 4368 | } |
| 4228 | 4369 | ||
| 4229 | static unsigned int selinux_ip_postroute_last(unsigned int hooknum, | 4370 | static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, |
| 4230 | struct sk_buff *skb, | 4371 | int ifindex, |
| 4231 | const struct net_device *in, | 4372 | struct avc_audit_data *ad, |
| 4232 | const struct net_device *out, | 4373 | u16 family, |
| 4233 | int (*okfn)(struct sk_buff *), | 4374 | char *addrp, |
| 4234 | u16 family) | 4375 | u8 proto) |
| 4235 | { | 4376 | { |
| 4236 | char *addrp; | 4377 | struct sock *sk = skb->sk; |
| 4237 | int len, err = 0; | ||
| 4238 | struct sock *sk; | ||
| 4239 | struct avc_audit_data ad; | ||
| 4240 | struct net_device *dev = (struct net_device *)out; | ||
| 4241 | struct sk_security_struct *sksec; | 4378 | struct sk_security_struct *sksec; |
| 4242 | u8 proto; | ||
| 4243 | |||
| 4244 | sk = skb->sk; | ||
| 4245 | if (!sk) | ||
| 4246 | goto out; | ||
| 4247 | 4379 | ||
| 4380 | if (sk == NULL) | ||
| 4381 | return NF_ACCEPT; | ||
| 4248 | sksec = sk->sk_security; | 4382 | sksec = sk->sk_security; |
| 4249 | 4383 | ||
| 4250 | AVC_AUDIT_DATA_INIT(&ad, NET); | 4384 | if (selinux_compat_net) { |
| 4251 | ad.u.net.netif = dev->name; | 4385 | if (selinux_ip_postroute_iptables_compat(skb->sk, ifindex, |
| 4252 | ad.u.net.family = family; | 4386 | ad, family, addrp)) |
| 4387 | return NF_DROP; | ||
| 4388 | } else { | ||
| 4389 | if (avc_has_perm(sksec->sid, skb->secmark, | ||
| 4390 | SECCLASS_PACKET, PACKET__SEND, ad)) | ||
| 4391 | return NF_DROP; | ||
| 4392 | } | ||
| 4253 | 4393 | ||
| 4254 | err = selinux_parse_skb(skb, &ad, &addrp, &len, 0, &proto); | 4394 | if (selinux_policycap_netpeer) |
| 4255 | if (err) | 4395 | if (selinux_xfrm_postroute_last(sksec->sid, skb, ad, proto)) |
| 4256 | goto out; | 4396 | return NF_DROP; |
| 4257 | 4397 | ||
| 4258 | if (selinux_compat_net) | 4398 | return NF_ACCEPT; |
| 4259 | err = selinux_ip_postroute_last_compat(sk, dev, &ad, | 4399 | } |
| 4260 | family, addrp, len); | ||
| 4261 | else | ||
| 4262 | err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET, | ||
| 4263 | PACKET__SEND, &ad); | ||
| 4264 | 4400 | ||
| 4265 | if (err) | 4401 | static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex, |
| 4266 | goto out; | 4402 | u16 family) |
| 4403 | { | ||
| 4404 | u32 secmark_perm; | ||
| 4405 | u32 peer_sid; | ||
| 4406 | struct sock *sk; | ||
| 4407 | struct avc_audit_data ad; | ||
| 4408 | char *addrp; | ||
| 4409 | u8 proto; | ||
| 4410 | u8 secmark_active; | ||
| 4411 | u8 peerlbl_active; | ||
| 4267 | 4412 | ||
| 4268 | err = selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto); | 4413 | AVC_AUDIT_DATA_INIT(&ad, NET); |
| 4269 | out: | 4414 | ad.u.net.netif = ifindex; |
| 4270 | return err ? NF_DROP : NF_ACCEPT; | 4415 | ad.u.net.family = family; |
| 4416 | if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto)) | ||
| 4417 | return NF_DROP; | ||
| 4418 | |||
| 4419 | /* If any sort of compatibility mode is enabled then hand off processing | ||
| 4420 | * to the selinux_ip_postroute_compat() function to deal with the | ||
| 4421 | * special handling. We do this in an attempt to keep this function | ||
| 4422 | * as fast and as clean as possible. */ | ||
| 4423 | if (selinux_compat_net || !selinux_policycap_netpeer) | ||
| 4424 | return selinux_ip_postroute_compat(skb, ifindex, &ad, | ||
| 4425 | family, addrp, proto); | ||
| 4426 | |||
| 4427 | /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec | ||
| 4428 | * packet transformation so allow the packet to pass without any checks | ||
| 4429 | * since we'll have another chance to perform access control checks | ||
| 4430 | * when the packet is on its final way out. | ||
| 4431 | * NOTE: there appear to be some IPv6 multicast cases where skb->dst | ||
| 4432 | * is NULL; in this case go ahead and apply access control. | ||
| 4433 | if (skb->dst != NULL && skb->dst->xfrm != NULL) | ||
| 4434 | return NF_ACCEPT; | ||
| 4435 | |||
| 4436 | secmark_active = selinux_secmark_enabled(); | ||
| 4437 | peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled(); | ||
| 4438 | if (!secmark_active && !peerlbl_active) | ||
| 4439 | return NF_ACCEPT; | ||
| 4440 | |||
| 4441 | /* if the packet is locally generated (skb->sk != NULL) then use the | ||
| 4442 | * socket's label as the peer label; otherwise the packet is being | ||
| 4443 | * forwarded through this system and we need to fetch the peer label | ||
| 4444 | * directly from the packet */ | ||
| 4445 | sk = skb->sk; | ||
| 4446 | if (sk) { | ||
| 4447 | struct sk_security_struct *sksec = sk->sk_security; | ||
| 4448 | peer_sid = sksec->sid; | ||
| 4449 | secmark_perm = PACKET__SEND; | ||
| 4450 | } else { | ||
| 4451 | if (selinux_skb_peerlbl_sid(skb, family, &peer_sid)) | ||
| 4452 | return NF_DROP; | ||
| 4453 | secmark_perm = PACKET__FORWARD_OUT; | ||
| 4454 | } | ||
| 4455 | |||
| 4456 | if (secmark_active) | ||
| 4457 | if (avc_has_perm(peer_sid, skb->secmark, | ||
| 4458 | SECCLASS_PACKET, secmark_perm, &ad)) | ||
| 4459 | return NF_DROP; | ||
| 4460 | |||
| 4461 | if (peerlbl_active) { | ||
| 4462 | u32 if_sid; | ||
| 4463 | u32 node_sid; | ||
| 4464 | |||
| 4465 | if (sel_netif_sid(ifindex, &if_sid)) | ||
| 4466 | return NF_DROP; | ||
| 4467 | if (avc_has_perm(peer_sid, if_sid, | ||
| 4468 | SECCLASS_NETIF, NETIF__EGRESS, &ad)) | ||
| 4469 | return NF_DROP; | ||
| 4470 | |||
| 4471 | if (sel_netnode_sid(addrp, family, &node_sid)) | ||
| 4472 | return NF_DROP; | ||
| 4473 | if (avc_has_perm(peer_sid, node_sid, | ||
| 4474 | SECCLASS_NODE, NODE__SENDTO, &ad)) | ||
| 4475 | return NF_DROP; | ||
| 4476 | } | ||
| 4477 | |||
| 4478 | return NF_ACCEPT; | ||
| 4271 | } | 4479 | } |
| 4272 | 4480 | ||
| 4273 | static unsigned int selinux_ipv4_postroute_last(unsigned int hooknum, | 4481 | static unsigned int selinux_ipv4_postroute(unsigned int hooknum, |
| 4274 | struct sk_buff *skb, | 4482 | struct sk_buff *skb, |
| 4275 | const struct net_device *in, | 4483 | const struct net_device *in, |
| 4276 | const struct net_device *out, | 4484 | const struct net_device *out, |
| 4277 | int (*okfn)(struct sk_buff *)) | 4485 | int (*okfn)(struct sk_buff *)) |
| 4278 | { | 4486 | { |
| 4279 | return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET); | 4487 | return selinux_ip_postroute(skb, out->ifindex, PF_INET); |
| 4280 | } | 4488 | } |
| 4281 | 4489 | ||
| 4282 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 4490 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 4283 | 4491 | static unsigned int selinux_ipv6_postroute(unsigned int hooknum, | |
| 4284 | static unsigned int selinux_ipv6_postroute_last(unsigned int hooknum, | 4492 | struct sk_buff *skb, |
| 4285 | struct sk_buff *skb, | 4493 | const struct net_device *in, |
| 4286 | const struct net_device *in, | 4494 | const struct net_device *out, |
| 4287 | const struct net_device *out, | 4495 | int (*okfn)(struct sk_buff *)) |
| 4288 | int (*okfn)(struct sk_buff *)) | ||
| 4289 | { | 4496 | { |
| 4290 | return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET6); | 4497 | return selinux_ip_postroute(skb, out->ifindex, PF_INET6); |
| 4291 | } | 4498 | } |
| 4292 | |||
| 4293 | #endif /* IPV6 */ | 4499 | #endif /* IPV6 */ |
| 4294 | 4500 | ||
| 4295 | #endif /* CONFIG_NETFILTER */ | 4501 | #endif /* CONFIG_NETFILTER */ |
| @@ -5277,22 +5483,40 @@ security_initcall(selinux_init); | |||
| 5277 | 5483 | ||
| 5278 | #if defined(CONFIG_NETFILTER) | 5484 | #if defined(CONFIG_NETFILTER) |
| 5279 | 5485 | ||
| 5280 | static struct nf_hook_ops selinux_ipv4_op = { | 5486 | static struct nf_hook_ops selinux_ipv4_ops[] = { |
| 5281 | .hook = selinux_ipv4_postroute_last, | 5487 | { |
| 5282 | .owner = THIS_MODULE, | 5488 | .hook = selinux_ipv4_postroute, |
| 5283 | .pf = PF_INET, | 5489 | .owner = THIS_MODULE, |
| 5284 | .hooknum = NF_INET_POST_ROUTING, | 5490 | .pf = PF_INET, |
| 5285 | .priority = NF_IP_PRI_SELINUX_LAST, | 5491 | .hooknum = NF_INET_POST_ROUTING, |
| 5492 | .priority = NF_IP_PRI_SELINUX_LAST, | ||
| 5493 | }, | ||
| 5494 | { | ||
| 5495 | .hook = selinux_ipv4_forward, | ||
| 5496 | .owner = THIS_MODULE, | ||
| 5497 | .pf = PF_INET, | ||
| 5498 | .hooknum = NF_INET_FORWARD, | ||
| 5499 | .priority = NF_IP_PRI_SELINUX_FIRST, | ||
| 5500 | } | ||
| 5286 | }; | 5501 | }; |
| 5287 | 5502 | ||
| 5288 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 5503 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 5289 | 5504 | ||
| 5290 | static struct nf_hook_ops selinux_ipv6_op = { | 5505 | static struct nf_hook_ops selinux_ipv6_ops[] = { |
| 5291 | .hook = selinux_ipv6_postroute_last, | 5506 | { |
| 5292 | .owner = THIS_MODULE, | 5507 | .hook = selinux_ipv6_postroute, |
| 5293 | .pf = PF_INET6, | 5508 | .owner = THIS_MODULE, |
| 5294 | .hooknum = NF_INET_POST_ROUTING, | 5509 | .pf = PF_INET6, |
| 5295 | .priority = NF_IP6_PRI_SELINUX_LAST, | 5510 | .hooknum = NF_INET_POST_ROUTING, |
| 5511 | .priority = NF_IP6_PRI_SELINUX_LAST, | ||
| 5512 | }, | ||
| 5513 | { | ||
| 5514 | .hook = selinux_ipv6_forward, | ||
| 5515 | .owner = THIS_MODULE, | ||
| 5516 | .pf = PF_INET6, | ||
| 5517 | .hooknum = NF_INET_FORWARD, | ||
| 5518 | .priority = NF_IP6_PRI_SELINUX_FIRST, | ||
| 5519 | } | ||
| 5296 | }; | 5520 | }; |
| 5297 | 5521 | ||
| 5298 | #endif /* IPV6 */ | 5522 | #endif /* IPV6 */ |
| @@ -5300,22 +5524,27 @@ static struct nf_hook_ops selinux_ipv6_op = { | |||
| 5300 | static int __init selinux_nf_ip_init(void) | 5524 | static int __init selinux_nf_ip_init(void) |
| 5301 | { | 5525 | { |
| 5302 | int err = 0; | 5526 | int err = 0; |
| 5527 | u32 iter; | ||
| 5303 | 5528 | ||
| 5304 | if (!selinux_enabled) | 5529 | if (!selinux_enabled) |
| 5305 | goto out; | 5530 | goto out; |
| 5306 | 5531 | ||
| 5307 | printk(KERN_DEBUG "SELinux: Registering netfilter hooks\n"); | 5532 | printk(KERN_DEBUG "SELinux: Registering netfilter hooks\n"); |
| 5308 | 5533 | ||
| 5309 | err = nf_register_hook(&selinux_ipv4_op); | 5534 | for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++) { |
| 5310 | if (err) | 5535 | err = nf_register_hook(&selinux_ipv4_ops[iter]); |
| 5311 | panic("SELinux: nf_register_hook for IPv4: error %d\n", err); | 5536 | if (err) |
| 5537 | panic("SELinux: nf_register_hook for IPv4: error %d\n", | ||
| 5538 | err); | ||
| 5539 | } | ||
| 5312 | 5540 | ||
| 5313 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 5541 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 5314 | 5542 | for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++) { | |
| 5315 | err = nf_register_hook(&selinux_ipv6_op); | 5543 | err = nf_register_hook(&selinux_ipv6_ops[iter]); |
| 5316 | if (err) | 5544 | if (err) |
| 5317 | panic("SELinux: nf_register_hook for IPv6: error %d\n", err); | 5545 | panic("SELinux: nf_register_hook for IPv6: error %d\n", |
| 5318 | 5546 | err); | |
| 5547 | } | ||
| 5319 | #endif /* IPV6 */ | 5548 | #endif /* IPV6 */ |
| 5320 | 5549 | ||
| 5321 | out: | 5550 | out: |
| @@ -5327,11 +5556,15 @@ __initcall(selinux_nf_ip_init); | |||
| 5327 | #ifdef CONFIG_SECURITY_SELINUX_DISABLE | 5556 | #ifdef CONFIG_SECURITY_SELINUX_DISABLE |
| 5328 | static void selinux_nf_ip_exit(void) | 5557 | static void selinux_nf_ip_exit(void) |
| 5329 | { | 5558 | { |
| 5559 | u32 iter; | ||
| 5560 | |||
| 5330 | printk(KERN_DEBUG "SELinux: Unregistering netfilter hooks\n"); | 5561 | printk(KERN_DEBUG "SELinux: Unregistering netfilter hooks\n"); |
| 5331 | 5562 | ||
| 5332 | nf_unregister_hook(&selinux_ipv4_op); | 5563 | for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++) |
| 5564 | nf_unregister_hook(&selinux_ipv4_ops[iter]); | ||
| 5333 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 5565 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 5334 | nf_unregister_hook(&selinux_ipv6_op); | 5566 | for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++) |
| 5567 | nf_unregister_hook(&selinux_ipv6_ops[iter]); | ||
| 5335 | #endif /* IPV6 */ | 5568 | #endif /* IPV6 */ |
| 5336 | } | 5569 | } |
| 5337 | #endif | 5570 | #endif |
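The rewritten selinux_ip_postroute() above reduces to a small decision tree: locally generated packets (skb->sk set) are checked with the sending socket's SID and packet:send, while forwarded packets fall back to the peer label recovered from the packet itself and packet:forward_out, followed by the netif:egress and node:sendto checks. The standalone sketch below models just that branch; the struct, the stub allowed() helper, and all SID values are invented for illustration and are not kernel API (real hooks return NF_ACCEPT/NF_DROP and call avc_has_perm()).

    #include <stdio.h>

    enum verdict { VERDICT_ACCEPT, VERDICT_DROP };

    struct pkt {
        int has_sk;         /* locally generated? (skb->sk != NULL) */
        unsigned sk_sid;    /* sending socket's SID, if any */
        unsigned peer_sid;  /* peer label recovered from the packet */
        unsigned secmark;   /* skb->secmark */
    };

    /* Stub for avc_has_perm(): always grants, but logs the query. */
    static int allowed(unsigned ssid, unsigned tsid, const char *perm)
    {
        printf("check %u -> %u : %s\n", ssid, tsid, perm);
        return 1;
    }

    static enum verdict postroute(const struct pkt *p)
    {
        unsigned peer;
        const char *secmark_perm;

        if (p->has_sk) {            /* local traffic: socket is the peer */
            peer = p->sk_sid;
            secmark_perm = "packet:send";
        } else {                    /* forwarded: label taken from packet */
            peer = p->peer_sid;
            secmark_perm = "packet:forward_out";
        }
        if (!allowed(peer, p->secmark, secmark_perm))
            return VERDICT_DROP;
        /* the netif:egress and node:sendto checks would follow here */
        return VERDICT_ACCEPT;
    }

    int main(void)
    {
        struct pkt local = { 1, 7, 0, 42 }, fwd = { 0, 0, 9, 42 };
        postroute(&local);
        postroute(&fwd);
        return 0;
    }

Running it prints one check per packet, mirroring the order of the avc_has_perm() calls in the hook.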
diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h index 049bf69429b6..399f868c5c8f 100644 --- a/security/selinux/include/av_perm_to_string.h +++ b/security/selinux/include/av_perm_to_string.h | |||
| @@ -37,6 +37,8 @@ | |||
| 37 | S_(SECCLASS_NODE, NODE__ENFORCE_DEST, "enforce_dest") | 37 | S_(SECCLASS_NODE, NODE__ENFORCE_DEST, "enforce_dest") |
| 38 | S_(SECCLASS_NODE, NODE__DCCP_RECV, "dccp_recv") | 38 | S_(SECCLASS_NODE, NODE__DCCP_RECV, "dccp_recv") |
| 39 | S_(SECCLASS_NODE, NODE__DCCP_SEND, "dccp_send") | 39 | S_(SECCLASS_NODE, NODE__DCCP_SEND, "dccp_send") |
| 40 | S_(SECCLASS_NODE, NODE__RECVFROM, "recvfrom") | ||
| 41 | S_(SECCLASS_NODE, NODE__SENDTO, "sendto") | ||
| 40 | S_(SECCLASS_NETIF, NETIF__TCP_RECV, "tcp_recv") | 42 | S_(SECCLASS_NETIF, NETIF__TCP_RECV, "tcp_recv") |
| 41 | S_(SECCLASS_NETIF, NETIF__TCP_SEND, "tcp_send") | 43 | S_(SECCLASS_NETIF, NETIF__TCP_SEND, "tcp_send") |
| 42 | S_(SECCLASS_NETIF, NETIF__UDP_RECV, "udp_recv") | 44 | S_(SECCLASS_NETIF, NETIF__UDP_RECV, "udp_recv") |
| @@ -45,6 +47,8 @@ | |||
| 45 | S_(SECCLASS_NETIF, NETIF__RAWIP_SEND, "rawip_send") | 47 | S_(SECCLASS_NETIF, NETIF__RAWIP_SEND, "rawip_send") |
| 46 | S_(SECCLASS_NETIF, NETIF__DCCP_RECV, "dccp_recv") | 48 | S_(SECCLASS_NETIF, NETIF__DCCP_RECV, "dccp_recv") |
| 47 | S_(SECCLASS_NETIF, NETIF__DCCP_SEND, "dccp_send") | 49 | S_(SECCLASS_NETIF, NETIF__DCCP_SEND, "dccp_send") |
| 50 | S_(SECCLASS_NETIF, NETIF__INGRESS, "ingress") | ||
| 51 | S_(SECCLASS_NETIF, NETIF__EGRESS, "egress") | ||
| 48 | S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__CONNECTTO, "connectto") | 52 | S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__CONNECTTO, "connectto") |
| 49 | S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__NEWCONN, "newconn") | 53 | S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__NEWCONN, "newconn") |
| 50 | S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__ACCEPTFROM, "acceptfrom") | 54 | S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__ACCEPTFROM, "acceptfrom") |
| @@ -149,6 +153,10 @@ | |||
| 149 | S_(SECCLASS_PACKET, PACKET__SEND, "send") | 153 | S_(SECCLASS_PACKET, PACKET__SEND, "send") |
| 150 | S_(SECCLASS_PACKET, PACKET__RECV, "recv") | 154 | S_(SECCLASS_PACKET, PACKET__RECV, "recv") |
| 151 | S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto") | 155 | S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto") |
| 156 | S_(SECCLASS_PACKET, PACKET__FLOW_IN, "flow_in") | ||
| 157 | S_(SECCLASS_PACKET, PACKET__FLOW_OUT, "flow_out") | ||
| 158 | S_(SECCLASS_PACKET, PACKET__FORWARD_IN, "forward_in") | ||
| 159 | S_(SECCLASS_PACKET, PACKET__FORWARD_OUT, "forward_out") | ||
| 152 | S_(SECCLASS_KEY, KEY__VIEW, "view") | 160 | S_(SECCLASS_KEY, KEY__VIEW, "view") |
| 153 | S_(SECCLASS_KEY, KEY__READ, "read") | 161 | S_(SECCLASS_KEY, KEY__READ, "read") |
| 154 | S_(SECCLASS_KEY, KEY__WRITE, "write") | 162 | S_(SECCLASS_KEY, KEY__WRITE, "write") |
| @@ -159,3 +167,4 @@ | |||
| 159 | S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind") | 167 | S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind") |
| 160 | S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect") | 168 | S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect") |
| 161 | S_(SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, "mmap_zero") | 169 | S_(SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, "mmap_zero") |
| 170 | S_(SECCLASS_PEER, PEER__RECV, "recv") | ||
diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h index eda89a2ec635..84c9abc80978 100644 --- a/security/selinux/include/av_permissions.h +++ b/security/selinux/include/av_permissions.h | |||
| @@ -292,6 +292,8 @@ | |||
| 292 | #define NODE__ENFORCE_DEST 0x00000040UL | 292 | #define NODE__ENFORCE_DEST 0x00000040UL |
| 293 | #define NODE__DCCP_RECV 0x00000080UL | 293 | #define NODE__DCCP_RECV 0x00000080UL |
| 294 | #define NODE__DCCP_SEND 0x00000100UL | 294 | #define NODE__DCCP_SEND 0x00000100UL |
| 295 | #define NODE__RECVFROM 0x00000200UL | ||
| 296 | #define NODE__SENDTO 0x00000400UL | ||
| 295 | #define NETIF__TCP_RECV 0x00000001UL | 297 | #define NETIF__TCP_RECV 0x00000001UL |
| 296 | #define NETIF__TCP_SEND 0x00000002UL | 298 | #define NETIF__TCP_SEND 0x00000002UL |
| 297 | #define NETIF__UDP_RECV 0x00000004UL | 299 | #define NETIF__UDP_RECV 0x00000004UL |
| @@ -300,6 +302,8 @@ | |||
| 300 | #define NETIF__RAWIP_SEND 0x00000020UL | 302 | #define NETIF__RAWIP_SEND 0x00000020UL |
| 301 | #define NETIF__DCCP_RECV 0x00000040UL | 303 | #define NETIF__DCCP_RECV 0x00000040UL |
| 302 | #define NETIF__DCCP_SEND 0x00000080UL | 304 | #define NETIF__DCCP_SEND 0x00000080UL |
| 305 | #define NETIF__INGRESS 0x00000100UL | ||
| 306 | #define NETIF__EGRESS 0x00000200UL | ||
| 303 | #define NETLINK_SOCKET__IOCTL 0x00000001UL | 307 | #define NETLINK_SOCKET__IOCTL 0x00000001UL |
| 304 | #define NETLINK_SOCKET__READ 0x00000002UL | 308 | #define NETLINK_SOCKET__READ 0x00000002UL |
| 305 | #define NETLINK_SOCKET__WRITE 0x00000004UL | 309 | #define NETLINK_SOCKET__WRITE 0x00000004UL |
| @@ -792,6 +796,10 @@ | |||
| 792 | #define PACKET__SEND 0x00000001UL | 796 | #define PACKET__SEND 0x00000001UL |
| 793 | #define PACKET__RECV 0x00000002UL | 797 | #define PACKET__RECV 0x00000002UL |
| 794 | #define PACKET__RELABELTO 0x00000004UL | 798 | #define PACKET__RELABELTO 0x00000004UL |
| 799 | #define PACKET__FLOW_IN 0x00000008UL | ||
| 800 | #define PACKET__FLOW_OUT 0x00000010UL | ||
| 801 | #define PACKET__FORWARD_IN 0x00000020UL | ||
| 802 | #define PACKET__FORWARD_OUT 0x00000040UL | ||
| 795 | #define KEY__VIEW 0x00000001UL | 803 | #define KEY__VIEW 0x00000001UL |
| 796 | #define KEY__READ 0x00000002UL | 804 | #define KEY__READ 0x00000002UL |
| 797 | #define KEY__WRITE 0x00000004UL | 805 | #define KEY__WRITE 0x00000004UL |
| @@ -824,3 +832,4 @@ | |||
| 824 | #define DCCP_SOCKET__NODE_BIND 0x00400000UL | 832 | #define DCCP_SOCKET__NODE_BIND 0x00400000UL |
| 825 | #define DCCP_SOCKET__NAME_CONNECT 0x00800000UL | 833 | #define DCCP_SOCKET__NAME_CONNECT 0x00800000UL |
| 826 | #define MEMPROTECT__MMAP_ZERO 0x00000001UL | 834 | #define MEMPROTECT__MMAP_ZERO 0x00000001UL |
| 835 | #define PEER__RECV 0x00000001UL | ||
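Each constant above is a single bit in a 32-bit access vector for its class, so a hook can request several permissions in one AVC query by OR-ing them, and the request is satisfied only when every requested bit is present in the allowed vector. A minimal self-contained illustration (only the PACKET__* bit values are taken from the header above; the allowed/requested vectors are made up):

    #include <stdio.h>

    #define PACKET__SEND        0x00000001UL
    #define PACKET__RECV        0x00000002UL
    #define PACKET__FORWARD_IN  0x00000020UL
    #define PACKET__FORWARD_OUT 0x00000040UL

    int main(void)
    {
        /* Pretend policy granted send and forward_out on this pair. */
        unsigned long allowed   = PACKET__SEND | PACKET__FORWARD_OUT;
        unsigned long requested = PACKET__FORWARD_IN | PACKET__FORWARD_OUT;

        /* A request succeeds only if every requested bit is allowed. */
        if ((requested & ~allowed) == 0)
            printf("granted\n");
        else
            printf("denied, missing bits 0x%lx\n", requested & ~allowed);
        return 0;
    }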
diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h index 553607a19e92..80c28fa6621c 100644 --- a/security/selinux/include/avc.h +++ b/security/selinux/include/avc.h | |||
| @@ -51,7 +51,7 @@ struct avc_audit_data { | |||
| 51 | struct inode *inode; | 51 | struct inode *inode; |
| 52 | } fs; | 52 | } fs; |
| 53 | struct { | 53 | struct { |
| 54 | char *netif; | 54 | int netif; |
| 55 | struct sock *sk; | 55 | struct sock *sk; |
| 56 | u16 family; | 56 | u16 family; |
| 57 | __be16 dport; | 57 | __be16 dport; |
diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h index e77de0e62ea0..b1b0d1d8f950 100644 --- a/security/selinux/include/class_to_string.h +++ b/security/selinux/include/class_to_string.h | |||
| @@ -64,3 +64,10 @@ | |||
| 64 | S_(NULL) | 64 | S_(NULL) |
| 65 | S_("dccp_socket") | 65 | S_("dccp_socket") |
| 66 | S_("memprotect") | 66 | S_("memprotect") |
| 67 | S_(NULL) | ||
| 68 | S_(NULL) | ||
| 69 | S_(NULL) | ||
| 70 | S_(NULL) | ||
| 71 | S_(NULL) | ||
| 72 | S_(NULL) | ||
| 73 | S_("peer") | ||
diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h index a9c2b20f14b5..09e9dd23ee1a 100644 --- a/security/selinux/include/flask.h +++ b/security/selinux/include/flask.h | |||
| @@ -50,6 +50,7 @@ | |||
| 50 | #define SECCLASS_KEY 58 | 50 | #define SECCLASS_KEY 58 |
| 51 | #define SECCLASS_DCCP_SOCKET 60 | 51 | #define SECCLASS_DCCP_SOCKET 60 |
| 52 | #define SECCLASS_MEMPROTECT 61 | 52 | #define SECCLASS_MEMPROTECT 61 |
| 53 | #define SECCLASS_PEER 68 | ||
| 53 | 54 | ||
| 54 | /* | 55 | /* |
| 55 | * Security identifier indices for initial entities | 56 | * Security identifier indices for initial entities |
diff --git a/security/selinux/include/netif.h b/security/selinux/include/netif.h index 8bd6f9992d2b..ce23edd128b3 100644 --- a/security/selinux/include/netif.h +++ b/security/selinux/include/netif.h | |||
| @@ -7,6 +7,8 @@ | |||
| 7 | * Author: James Morris <jmorris@redhat.com> | 7 | * Author: James Morris <jmorris@redhat.com> |
| 8 | * | 8 | * |
| 9 | * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> | 9 | * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> |
| 10 | * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. | ||
| 11 | * Paul Moore, <paul.moore@hp.com> | ||
| 10 | * | 12 | * |
| 11 | * This program is free software; you can redistribute it and/or modify | 13 | * This program is free software; you can redistribute it and/or modify |
| 12 | * it under the terms of the GNU General Public License version 2, | 14 | * it under the terms of the GNU General Public License version 2, |
| @@ -15,7 +17,7 @@ | |||
| 15 | #ifndef _SELINUX_NETIF_H_ | 17 | #ifndef _SELINUX_NETIF_H_ |
| 16 | #define _SELINUX_NETIF_H_ | 18 | #define _SELINUX_NETIF_H_ |
| 17 | 19 | ||
| 18 | int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid); | 20 | int sel_netif_sid(int ifindex, u32 *sid); |
| 19 | 21 | ||
| 20 | #endif /* _SELINUX_NETIF_H_ */ | 22 | #endif /* _SELINUX_NETIF_H_ */ |
| 21 | 23 | ||
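With the move to ifindex keys, the interface cache can bucket entries with a simple mask because SEL_NETIF_HASH_SIZE is a power of two (see the sel_netif_hashfn() hunk at the end of this patch). A standalone check of that equivalence (the loop bound is arbitrary):

    #include <assert.h>
    #include <stdio.h>

    #define SEL_NETIF_HASH_SIZE 64   /* must be a power of two */

    static unsigned hashfn(int ifindex)
    {
        return ifindex & (SEL_NETIF_HASH_SIZE - 1);
    }

    int main(void)
    {
        /* The mask form agrees with modulo for non-negative indexes. */
        for (int i = 0; i < 1024; i++)
            assert(hashfn(i) == (unsigned)(i % SEL_NETIF_HASH_SIZE));
        printf("ifindex 70 -> bucket %u\n", hashfn(70));   /* 6 */
        return 0;
    }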
diff --git a/security/selinux/include/netlabel.h b/security/selinux/include/netlabel.h index 218e3f77c350..00a2809c8506 100644 --- a/security/selinux/include/netlabel.h +++ b/security/selinux/include/netlabel.h | |||
| @@ -46,13 +46,17 @@ void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec, | |||
| 46 | void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, | 46 | void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, |
| 47 | struct sk_security_struct *newssec); | 47 | struct sk_security_struct *newssec); |
| 48 | 48 | ||
| 49 | int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid); | 49 | int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, |
| 50 | u16 family, | ||
| 51 | u32 *type, | ||
| 52 | u32 *sid); | ||
| 50 | 53 | ||
| 51 | void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock); | 54 | void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock); |
| 52 | int selinux_netlbl_socket_post_create(struct socket *sock); | 55 | int selinux_netlbl_socket_post_create(struct socket *sock); |
| 53 | int selinux_netlbl_inode_permission(struct inode *inode, int mask); | 56 | int selinux_netlbl_inode_permission(struct inode *inode, int mask); |
| 54 | int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, | 57 | int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, |
| 55 | struct sk_buff *skb, | 58 | struct sk_buff *skb, |
| 59 | u16 family, | ||
| 56 | struct avc_audit_data *ad); | 60 | struct avc_audit_data *ad); |
| 57 | int selinux_netlbl_socket_setsockopt(struct socket *sock, | 61 | int selinux_netlbl_socket_setsockopt(struct socket *sock, |
| 58 | int level, | 62 | int level, |
| @@ -83,9 +87,11 @@ static inline void selinux_netlbl_sk_security_clone( | |||
| 83 | } | 87 | } |
| 84 | 88 | ||
| 85 | static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, | 89 | static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, |
| 86 | u32 base_sid, | 90 | u16 family, |
| 91 | u32 *type, | ||
| 87 | u32 *sid) | 92 | u32 *sid) |
| 88 | { | 93 | { |
| 94 | *type = NETLBL_NLTYPE_NONE; | ||
| 89 | *sid = SECSID_NULL; | 95 | *sid = SECSID_NULL; |
| 90 | return 0; | 96 | return 0; |
| 91 | } | 97 | } |
| @@ -106,6 +112,7 @@ static inline int selinux_netlbl_inode_permission(struct inode *inode, | |||
| 106 | } | 112 | } |
| 107 | static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, | 113 | static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, |
| 108 | struct sk_buff *skb, | 114 | struct sk_buff *skb, |
| 115 | u16 family, | ||
| 109 | struct avc_audit_data *ad) | 116 | struct avc_audit_data *ad) |
| 110 | { | 117 | { |
| 111 | return 0; | 118 | return 0; |
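The new selinux_netlbl_skbuff_getsid() signature reports a NetLabel message type alongside the SID, and the CONFIG_NETLABEL=n stub above returns NETLBL_NLTYPE_NONE/SECSID_NULL so that resolution degenerates to the IPsec label. The toy resolver below models only the precedence described in the selinux_skb_peerlbl_sid() comment earlier in this patch; the real security_net_peersid_resolve() also reconciles MLS components, which is elided here, and the NLTYPE_NONE value shown is illustrative:

    #include <stdio.h>

    #define SECSID_NULL 0u
    #define NLTYPE_NONE 0u   /* stand-in for NETLBL_NLTYPE_NONE */

    /* Toy resolver: if only one subsystem supplied a label it wins; if
     * both did, they must agree (the real security server also checks
     * that the MLS components are compatible, omitted here). */
    static int peersid_resolve(unsigned nlbl_sid, unsigned nlbl_type,
                               unsigned xfrm_sid, unsigned *peer_sid)
    {
        (void)nlbl_type;
        if (nlbl_sid == SECSID_NULL) {
            *peer_sid = xfrm_sid;
            return 0;
        }
        if (xfrm_sid == SECSID_NULL) {
            *peer_sid = nlbl_sid;
            return 0;
        }
        if (nlbl_sid != xfrm_sid)
            return -1;       /* inconsistent peer labels: -EACCES */
        *peer_sid = nlbl_sid;
        return 0;
    }

    int main(void)
    {
        unsigned sid;
        if (peersid_resolve(SECSID_NULL, NLTYPE_NONE, 7, &sid) == 0)
            printf("peer sid = %u\n", sid);  /* 7: the xfrm label wins */
        return 0;
    }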
diff --git a/security/selinux/include/netnode.h b/security/selinux/include/netnode.h new file mode 100644 index 000000000000..1b94450d11d2 --- /dev/null +++ b/security/selinux/include/netnode.h | |||
| @@ -0,0 +1,32 @@ | |||
| 1 | /* | ||
| 2 | * Network node table | ||
| 3 | * | ||
| 4 | * SELinux must keep a mapping of network nodes to labels/SIDs. This | ||
| 5 | * mapping is maintained as part of the normal policy but a fast cache is | ||
| 6 | * needed to reduce the lookup overhead since most of these queries happen on | ||
| 7 | * a per-packet basis. | ||
| 8 | * | ||
| 9 | * Author: Paul Moore <paul.moore@hp.com> | ||
| 10 | * | ||
| 11 | */ | ||
| 12 | |||
| 13 | /* | ||
| 14 | * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 | ||
| 15 | * | ||
| 16 | * This program is free software: you can redistribute it and/or modify | ||
| 17 | * it under the terms of version 2 of the GNU General Public License as | ||
| 18 | * published by the Free Software Foundation. | ||
| 19 | * | ||
| 20 | * This program is distributed in the hope that it will be useful, | ||
| 21 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 22 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 23 | * GNU General Public License for more details. | ||
| 24 | * | ||
| 25 | */ | ||
| 26 | |||
| 27 | #ifndef _SELINUX_NETNODE_H | ||
| 28 | #define _SELINUX_NETNODE_H | ||
| 29 | |||
| 30 | int sel_netnode_sid(void *addr, u16 family, u32 *sid); | ||
| 31 | |||
| 32 | #endif | ||
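sel_netnode_sid() takes an opaque address pointer plus an address-family selector, so one entry point serves both IPv4 and IPv6 callers. A user-space model of that dispatch pattern (the SID derivation is a stub; a real cache would hash the address into a bucket and fall back to security_node_sid() on a miss):

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>

    static int netnode_sid(const void *addr, unsigned short family,
                           unsigned *sid)
    {
        if (family == AF_INET) {
            struct in_addr a;
            memcpy(&a, addr, sizeof(a));
            *sid = ntohl(a.s_addr) & 0xff;   /* fake SID for the demo */
            return 0;
        }
        if (family == AF_INET6) {
            struct in6_addr a;
            memcpy(&a, addr, sizeof(a));
            *sid = a.s6_addr[15];            /* fake SID for the demo */
            return 0;
        }
        return -1;
    }

    int main(void)
    {
        struct in_addr v4;
        unsigned sid;

        inet_pton(AF_INET, "192.0.2.1", &v4);
        if (netnode_sid(&v4, AF_INET, &sid) == 0)
            printf("v4 node sid = %u\n", sid);   /* 1 */
        return 0;
    }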
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 4138a80f8e27..c6c2bb4ebacc 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h | |||
| @@ -96,17 +96,25 @@ struct bprm_security_struct { | |||
| 96 | }; | 96 | }; |
| 97 | 97 | ||
| 98 | struct netif_security_struct { | 98 | struct netif_security_struct { |
| 99 | struct net_device *dev; /* back pointer */ | 99 | int ifindex; /* device index */ |
| 100 | u32 if_sid; /* SID for this interface */ | 100 | u32 sid; /* SID for this interface */ |
| 101 | u32 msg_sid; /* default SID for messages received on this interface */ | 101 | }; |
| 102 | |||
| 103 | struct netnode_security_struct { | ||
| 104 | union { | ||
| 105 | __be32 ipv4; /* IPv4 node address */ | ||
| 106 | struct in6_addr ipv6; /* IPv6 node address */ | ||
| 107 | } addr; | ||
| 108 | u32 sid; /* SID for this node */ | ||
| 109 | u16 family; /* address family */ | ||
| 102 | }; | 110 | }; |
| 103 | 111 | ||
| 104 | struct sk_security_struct { | 112 | struct sk_security_struct { |
| 105 | struct sock *sk; /* back pointer to sk object */ | 113 | struct sock *sk; /* back pointer to sk object */ |
| 106 | u32 sid; /* SID of this object */ | 114 | u32 sid; /* SID of this object */ |
| 107 | u32 peer_sid; /* SID of peer */ | 115 | u32 peer_sid; /* SID of peer */ |
| 108 | #ifdef CONFIG_NETLABEL | ||
| 109 | u16 sclass; /* sock security class */ | 116 | u16 sclass; /* sock security class */ |
| 117 | #ifdef CONFIG_NETLABEL | ||
| 110 | enum { /* NetLabel state */ | 118 | enum { /* NetLabel state */ |
| 111 | NLBL_UNSET = 0, | 119 | NLBL_UNSET = 0, |
| 112 | NLBL_REQUIRE, | 120 | NLBL_REQUIRE, |
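The new netnode_security_struct stores either address family in one union discriminated by the family tag, so a single cache table can hold both IPv4 and IPv6 nodes at the cost of sizing every entry for the 16-byte IPv6 address. A standalone mirror of the record (user-space types substituted for the kernel's __be32/in6_addr pairing):

    #include <netinet/in.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/socket.h>

    struct netnode_security_struct {
        union {
            uint32_t ipv4;          /* IPv4 address, network byte order */
            struct in6_addr ipv6;   /* IPv6 address */
        } addr;
        uint32_t sid;
        uint16_t family;
    };

    int main(void)
    {
        struct netnode_security_struct n = { .family = AF_INET };

        n.addr.ipv4 = htonl(0xc0000201);   /* 192.0.2.1 */
        n.sid = 42;

        /* The union is sized for the larger member, so an IPv4 entry
         * carries 12 unused bytes but both families share one table. */
        printf("union size = %zu\n", sizeof(n.addr));   /* 16 */
        return 0;
    }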
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 39337afffec2..23137c17f917 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h | |||
| @@ -25,13 +25,14 @@ | |||
| 25 | #define POLICYDB_VERSION_MLS 19 | 25 | #define POLICYDB_VERSION_MLS 19 |
| 26 | #define POLICYDB_VERSION_AVTAB 20 | 26 | #define POLICYDB_VERSION_AVTAB 20 |
| 27 | #define POLICYDB_VERSION_RANGETRANS 21 | 27 | #define POLICYDB_VERSION_RANGETRANS 21 |
| 28 | #define POLICYDB_VERSION_POLCAP 22 | ||
| 28 | 29 | ||
| 29 | /* Range of policy versions we understand*/ | 30 | /* Range of policy versions we understand*/ |
| 30 | #define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE | 31 | #define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE |
| 31 | #ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX | 32 | #ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX |
| 32 | #define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE | 33 | #define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE |
| 33 | #else | 34 | #else |
| 34 | #define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS | 35 | #define POLICYDB_VERSION_MAX POLICYDB_VERSION_POLCAP |
| 35 | #endif | 36 | #endif |
| 36 | 37 | ||
| 37 | struct netlbl_lsm_secattr; | 38 | struct netlbl_lsm_secattr; |
| @@ -39,8 +40,19 @@ struct netlbl_lsm_secattr; | |||
| 39 | extern int selinux_enabled; | 40 | extern int selinux_enabled; |
| 40 | extern int selinux_mls_enabled; | 41 | extern int selinux_mls_enabled; |
| 41 | 42 | ||
| 43 | /* Policy capabilities */ | ||
| 44 | enum { | ||
| 45 | POLICYDB_CAPABILITY_NETPEER, | ||
| 46 | __POLICYDB_CAPABILITY_MAX | ||
| 47 | }; | ||
| 48 | #define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1) | ||
| 49 | |||
| 50 | extern int selinux_policycap_netpeer; | ||
| 51 | |||
| 42 | int security_load_policy(void * data, size_t len); | 52 | int security_load_policy(void * data, size_t len); |
| 43 | 53 | ||
| 54 | int security_policycap_supported(unsigned int req_cap); | ||
| 55 | |||
| 44 | #define SEL_VEC_MAX 32 | 56 | #define SEL_VEC_MAX 32 |
| 45 | struct av_decision { | 57 | struct av_decision { |
| 46 | u32 allowed; | 58 | u32 allowed; |
| @@ -77,8 +89,7 @@ int security_get_user_sids(u32 callsid, char *username, | |||
| 77 | int security_port_sid(u16 domain, u16 type, u8 protocol, u16 port, | 89 | int security_port_sid(u16 domain, u16 type, u8 protocol, u16 port, |
| 78 | u32 *out_sid); | 90 | u32 *out_sid); |
| 79 | 91 | ||
| 80 | int security_netif_sid(char *name, u32 *if_sid, | 92 | int security_netif_sid(char *name, u32 *if_sid); |
| 81 | u32 *msg_sid); | ||
| 82 | 93 | ||
| 83 | int security_node_sid(u16 domain, void *addr, u32 addrlen, | 94 | int security_node_sid(u16 domain, void *addr, u32 addrlen, |
| 84 | u32 *out_sid); | 95 | u32 *out_sid); |
| @@ -88,10 +99,15 @@ int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, | |||
| 88 | 99 | ||
| 89 | int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid); | 100 | int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid); |
| 90 | 101 | ||
| 102 | int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, | ||
| 103 | u32 xfrm_sid, | ||
| 104 | u32 *peer_sid); | ||
| 105 | |||
| 91 | int security_get_classes(char ***classes, int *nclasses); | 106 | int security_get_classes(char ***classes, int *nclasses); |
| 92 | int security_get_permissions(char *class, char ***perms, int *nperms); | 107 | int security_get_permissions(char *class, char ***perms, int *nperms); |
| 93 | int security_get_reject_unknown(void); | 108 | int security_get_reject_unknown(void); |
| 94 | int security_get_allow_unknown(void); | 109 | int security_get_allow_unknown(void); |
| 110 | int security_get_policycaps(int *len, int **values); | ||
| 95 | 111 | ||
| 96 | #define SECURITY_FS_USE_XATTR 1 /* use xattr */ | 112 | #define SECURITY_FS_USE_XATTR 1 /* use xattr */ |
| 97 | #define SECURITY_FS_USE_TRANS 2 /* use transition SIDs, e.g. devpts/tmpfs */ | 113 | #define SECURITY_FS_USE_TRANS 2 /* use transition SIDs, e.g. devpts/tmpfs */ |
| @@ -108,7 +124,6 @@ int security_genfs_sid(const char *fstype, char *name, u16 sclass, | |||
| 108 | 124 | ||
| 109 | #ifdef CONFIG_NETLABEL | 125 | #ifdef CONFIG_NETLABEL |
| 110 | int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, | 126 | int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, |
| 111 | u32 base_sid, | ||
| 112 | u32 *sid); | 127 | u32 *sid); |
| 113 | 128 | ||
| 114 | int security_netlbl_sid_to_secattr(u32 sid, | 129 | int security_netlbl_sid_to_secattr(u32 sid, |
| @@ -116,7 +131,6 @@ int security_netlbl_sid_to_secattr(u32 sid, | |||
| 116 | #else | 131 | #else |
| 117 | static inline int security_netlbl_secattr_to_sid( | 132 | static inline int security_netlbl_secattr_to_sid( |
| 118 | struct netlbl_lsm_secattr *secattr, | 133 | struct netlbl_lsm_secattr *secattr, |
| 119 | u32 base_sid, | ||
| 120 | u32 *sid) | 134 | u32 *sid) |
| 121 | { | 135 | { |
| 122 | return -EIDRM; | 136 | return -EIDRM; |
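Policy capabilities let a loaded policy opt in to new kernel behavior: the policy image carries a capability bitmap, the kernel caches each known bit in a global (selinux_policycap_netpeer above), and the hooks branch on it. A standalone model of the gating, using the enum layout above; the load_policy() helper is invented, since the real code reads an ebitmap from the policy file:

    #include <stdio.h>

    enum {
        POLICYDB_CAPABILITY_NETPEER,
        __POLICYDB_CAPABILITY_MAX
    };

    static int policycap_netpeer;   /* cached when the policy is loaded */

    static void load_policy(unsigned long caps)
    {
        policycap_netpeer = !!(caps & (1UL << POLICYDB_CAPABILITY_NETPEER));
    }

    /* Mirrors the branch in selinux_socket_sock_rcv_skb(): any compat
     * mode, or a policy without netpeer, takes the legacy path. */
    static const char *rcv_skb_path(int compat_net)
    {
        if (compat_net || !policycap_netpeer)
            return "legacy per-protocol checks";
        return "unified peer checks";
    }

    int main(void)
    {
        load_policy(0);
        printf("%s\n", rcv_skb_path(0));   /* legacy */
        load_policy(1UL << POLICYDB_CAPABILITY_NETPEER);
        printf("%s\n", rcv_skb_path(0));   /* unified */
        return 0;
    }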
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 31929e39f5ca..36b0510efa7b 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h | |||
| @@ -32,6 +32,13 @@ static inline struct inode_security_struct *get_sock_isec(struct sock *sk) | |||
| 32 | } | 32 | } |
| 33 | 33 | ||
| 34 | #ifdef CONFIG_SECURITY_NETWORK_XFRM | 34 | #ifdef CONFIG_SECURITY_NETWORK_XFRM |
| 35 | extern atomic_t selinux_xfrm_refcount; | ||
| 36 | |||
| 37 | static inline int selinux_xfrm_enabled(void) | ||
| 38 | { | ||
| 39 | return (atomic_read(&selinux_xfrm_refcount) > 0); | ||
| 40 | } | ||
| 41 | |||
| 35 | int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb, | 42 | int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb, |
| 36 | struct avc_audit_data *ad); | 43 | struct avc_audit_data *ad); |
| 37 | int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, | 44 | int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, |
| @@ -43,6 +50,11 @@ static inline void selinux_xfrm_notify_policyload(void) | |||
| 43 | atomic_inc(&flow_cache_genid); | 50 | atomic_inc(&flow_cache_genid); |
| 44 | } | 51 | } |
| 45 | #else | 52 | #else |
| 53 | static inline int selinux_xfrm_enabled(void) | ||
| 54 | { | ||
| 55 | return 0; | ||
| 56 | } | ||
| 57 | |||
| 46 | static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, | 58 | static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, |
| 47 | struct avc_audit_data *ad) | 59 | struct avc_audit_data *ad) |
| 48 | { | 60 | { |
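[Editor's note] The selinux_xfrm_enabled() helper added here gates labeled-IPsec checks on a refcount of loaded XFRM policies/states, with the CONFIG-off stub folding to a constant zero so the compiler can discard the guarded paths. A minimal userspace sketch of the same compile-time stub pattern (FEATURE_XFRM and every other name below are illustrative, not kernel symbols):

    #include <stdio.h>

    #ifdef FEATURE_XFRM
    static int xfrm_refcount;   /* stands in for atomic_t selinux_xfrm_refcount */
    static int xfrm_enabled(void) { return xfrm_refcount > 0; }
    #else
    static int xfrm_enabled(void) { return 0; }    /* feature compiled out */
    #endif

    int main(void)
    {
        if (xfrm_enabled())
            printf("labeled IPsec checks active\n");
        else
            printf("checks skipped\n");
        return 0;
    }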
diff --git a/security/selinux/netif.c b/security/selinux/netif.c index e87ab948104c..013d3117a86b 100644 --- a/security/selinux/netif.c +++ b/security/selinux/netif.c | |||
| @@ -7,6 +7,8 @@ | |||
| 7 | * Author: James Morris <jmorris@redhat.com> | 7 | * Author: James Morris <jmorris@redhat.com> |
| 8 | * | 8 | * |
| 9 | * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> | 9 | * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> |
| 10 | * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. | ||
| 11 | * Paul Moore <paul.moore@hp.com> | ||
| 10 | * | 12 | * |
| 11 | * This program is free software; you can redistribute it and/or modify | 13 | * This program is free software; you can redistribute it and/or modify |
| 12 | * it under the terms of the GNU General Public License version 2, | 14 | * it under the terms of the GNU General Public License version 2, |
| @@ -29,14 +31,6 @@ | |||
| 29 | #define SEL_NETIF_HASH_SIZE 64 | 31 | #define SEL_NETIF_HASH_SIZE 64 |
| 30 | #define SEL_NETIF_HASH_MAX 1024 | 32 | #define SEL_NETIF_HASH_MAX 1024 |
| 31 | 33 | ||
| 32 | #undef DEBUG | ||
| 33 | |||
| 34 | #ifdef DEBUG | ||
| 35 | #define DEBUGP printk | ||
| 36 | #else | ||
| 37 | #define DEBUGP(format, args...) | ||
| 38 | #endif | ||
| 39 | |||
| 40 | struct sel_netif | 34 | struct sel_netif |
| 41 | { | 35 | { |
| 42 | struct list_head list; | 36 | struct list_head list; |
| @@ -49,174 +43,226 @@ static LIST_HEAD(sel_netif_list); | |||
| 49 | static DEFINE_SPINLOCK(sel_netif_lock); | 43 | static DEFINE_SPINLOCK(sel_netif_lock); |
| 50 | static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE]; | 44 | static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE]; |
| 51 | 45 | ||
| 52 | static inline u32 sel_netif_hasfn(struct net_device *dev) | 46 | /** |
| 47 | * sel_netif_hashfn - Hashing function for the interface table | ||
| 48 | * @ifindex: the network interface | ||
| 49 | * | ||
| 50 | * Description: | ||
| 51 | * This is the hashing function for the network interface table; it returns the | ||
| 52 | * bucket number for the given interface. | ||
| 53 | * | ||
| 54 | */ | ||
| 55 | static inline u32 sel_netif_hashfn(int ifindex) | ||
| 53 | { | 56 | { |
| 54 | return (dev->ifindex & (SEL_NETIF_HASH_SIZE - 1)); | 57 | return (ifindex & (SEL_NETIF_HASH_SIZE - 1)); |
| 55 | } | 58 | } |
| 56 | 59 | ||
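[Editor's note] The bucket index is simply the low bits of the ifindex, which is why SEL_NETIF_HASH_SIZE has to stay a power of two. A standalone sketch of the masking trick (names are local to the example):

    #include <stdio.h>

    #define HASH_SIZE 64    /* mirrors SEL_NETIF_HASH_SIZE; power of two */

    static unsigned int hashfn(int ifindex)
    {
        /* same result as ifindex % HASH_SIZE, but a single AND */
        return ifindex & (HASH_SIZE - 1);
    }

    int main(void)
    {
        /* 2, 66 and 130 differ only above bit 5, so all map to bucket 2 */
        printf("%u %u %u\n", hashfn(2), hashfn(66), hashfn(130));
        return 0;
    }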
| 57 | /* | 60 | /** |
| 58 | * All of the devices should normally fit in the hash, so we optimize | 61 | * sel_netif_find - Search for an interface record |
| 59 | * for that case. | 62 | * @ifindex: the network interface |
| 63 | * | ||
| 64 | * Description: | ||
| 65 | * Search the network interface table and return the record matching @ifindex. | ||
| 66 | * If an entry can not be found in the table return NULL. | ||
| 67 | * | ||
| 60 | */ | 68 | */ |
| 61 | static inline struct sel_netif *sel_netif_find(struct net_device *dev) | 69 | static inline struct sel_netif *sel_netif_find(int ifindex) |
| 62 | { | 70 | { |
| 63 | struct list_head *pos; | 71 | int idx = sel_netif_hashfn(ifindex); |
| 64 | int idx = sel_netif_hasfn(dev); | 72 | struct sel_netif *netif; |
| 65 | 73 | ||
| 66 | __list_for_each_rcu(pos, &sel_netif_hash[idx]) { | 74 | list_for_each_entry_rcu(netif, &sel_netif_hash[idx], list) |
| 67 | struct sel_netif *netif = list_entry(pos, | 75 | /* all of the devices should normally fit in the hash, so we |
| 68 | struct sel_netif, list); | 76 | * optimize for that case */ |
| 69 | if (likely(netif->nsec.dev == dev)) | 77 | if (likely(netif->nsec.ifindex == ifindex)) |
| 70 | return netif; | 78 | return netif; |
| 71 | } | 79 | |
| 72 | return NULL; | 80 | return NULL; |
| 73 | } | 81 | } |
| 74 | 82 | ||
| 83 | /** | ||
| 84 | * sel_netif_insert - Insert a new interface into the table | ||
| 85 | * @netif: the new interface record | ||
| 86 | * | ||
| 87 | * Description: | ||
| 88 | * Add a new interface record to the network interface hash table. Returns | ||
| 89 | * zero on success, negative values on failure. | ||
| 90 | * | ||
| 91 | */ | ||
| 75 | static int sel_netif_insert(struct sel_netif *netif) | 92 | static int sel_netif_insert(struct sel_netif *netif) |
| 76 | { | 93 | { |
| 77 | int idx, ret = 0; | 94 | int idx; |
| 78 | 95 | ||
| 79 | if (sel_netif_total >= SEL_NETIF_HASH_MAX) { | 96 | if (sel_netif_total >= SEL_NETIF_HASH_MAX) |
| 80 | ret = -ENOSPC; | 97 | return -ENOSPC; |
| 81 | goto out; | ||
| 82 | } | ||
| 83 | 98 | ||
| 84 | idx = sel_netif_hasfn(netif->nsec.dev); | 99 | idx = sel_netif_hashfn(netif->nsec.ifindex); |
| 85 | list_add_rcu(&netif->list, &sel_netif_hash[idx]); | 100 | list_add_rcu(&netif->list, &sel_netif_hash[idx]); |
| 86 | sel_netif_total++; | 101 | sel_netif_total++; |
| 87 | out: | 102 | |
| 88 | return ret; | 103 | return 0; |
| 89 | } | 104 | } |
| 90 | 105 | ||
| 106 | /** | ||
| 107 | * sel_netif_free - Frees an interface entry | ||
| 108 | * @p: the entry's RCU field | ||
| 109 | * | ||
| 110 | * Description: | ||
| 111 | * This function is designed to be used as a callback to the call_rcu() | ||
| 112 | * function so that memory allocated to a hash table interface entry can be | ||
| 113 | * released safely. | ||
| 114 | * | ||
| 115 | */ | ||
| 91 | static void sel_netif_free(struct rcu_head *p) | 116 | static void sel_netif_free(struct rcu_head *p) |
| 92 | { | 117 | { |
| 93 | struct sel_netif *netif = container_of(p, struct sel_netif, rcu_head); | 118 | struct sel_netif *netif = container_of(p, struct sel_netif, rcu_head); |
| 94 | |||
| 95 | DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name); | ||
| 96 | kfree(netif); | 119 | kfree(netif); |
| 97 | } | 120 | } |
| 98 | 121 | ||
| 122 | /** | ||
| 123 | * sel_netif_destroy - Remove an interface record from the table | ||
| 124 | * @netif: the existing interface record | ||
| 125 | * | ||
| 126 | * Description: | ||
| 127 | * Remove an existing interface record from the network interface table. | ||
| 128 | * | ||
| 129 | */ | ||
| 99 | static void sel_netif_destroy(struct sel_netif *netif) | 130 | static void sel_netif_destroy(struct sel_netif *netif) |
| 100 | { | 131 | { |
| 101 | DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name); | ||
| 102 | |||
| 103 | list_del_rcu(&netif->list); | 132 | list_del_rcu(&netif->list); |
| 104 | sel_netif_total--; | 133 | sel_netif_total--; |
| 105 | call_rcu(&netif->rcu_head, sel_netif_free); | 134 | call_rcu(&netif->rcu_head, sel_netif_free); |
| 106 | } | 135 | } |
| 107 | 136 | ||
| 108 | static struct sel_netif *sel_netif_lookup(struct net_device *dev) | 137 | /** |
| 138 | * sel_netif_sid_slow - Lookup the SID of a network interface using the policy | ||
| 139 | * @ifindex: the network interface | ||
| 140 | * @sid: interface SID | ||
| 141 | * | ||
| 142 | * Description: | ||
| 143 | * This function determines the SID of a network interface by querying the | ||
| 144 | * security policy. The result is added to the network interface table to | ||
| 145 | * speed up future queries. Returns zero on success, negative values on | ||
| 146 | * failure. | ||
| 147 | * | ||
| 148 | */ | ||
| 149 | static int sel_netif_sid_slow(int ifindex, u32 *sid) | ||
| 109 | { | 150 | { |
| 110 | int ret; | 151 | int ret; |
| 111 | struct sel_netif *netif, *new; | 152 | struct sel_netif *netif; |
| 112 | struct netif_security_struct *nsec; | 153 | struct sel_netif *new = NULL; |
| 113 | 154 | struct net_device *dev; | |
| 114 | netif = sel_netif_find(dev); | 155 | |
| 115 | if (likely(netif != NULL)) | 156 | /* NOTE: we always use init's network namespace since we don't |
| 116 | goto out; | 157 | * currently support containers */ |
| 117 | 158 | ||
| 118 | new = kzalloc(sizeof(*new), GFP_ATOMIC); | 159 | dev = dev_get_by_index(&init_net, ifindex); |
| 119 | if (!new) { | 160 | if (unlikely(dev == NULL)) { |
| 120 | netif = ERR_PTR(-ENOMEM); | 161 | printk(KERN_WARNING |
| 121 | goto out; | 162 | "SELinux: failure in sel_netif_sid_slow()," |
| 163 | " invalid network interface (%d)\n", ifindex); | ||
| 164 | return -ENOENT; | ||
| 122 | } | 165 | } |
| 123 | |||
| 124 | nsec = &new->nsec; | ||
| 125 | 166 | ||
| 126 | ret = security_netif_sid(dev->name, &nsec->if_sid, &nsec->msg_sid); | 167 | spin_lock_bh(&sel_netif_lock); |
| 127 | if (ret < 0) { | 168 | netif = sel_netif_find(ifindex); |
| 128 | kfree(new); | 169 | if (netif != NULL) { |
| 129 | netif = ERR_PTR(ret); | 170 | *sid = netif->nsec.sid; |
| 171 | ret = 0; | ||
| 130 | goto out; | 172 | goto out; |
| 131 | } | 173 | } |
| 132 | 174 | new = kzalloc(sizeof(*new), GFP_ATOMIC); | |
| 133 | nsec->dev = dev; | 175 | if (new == NULL) { |
| 134 | 176 | ret = -ENOMEM; | |
| 135 | spin_lock_bh(&sel_netif_lock); | ||
| 136 | |||
| 137 | netif = sel_netif_find(dev); | ||
| 138 | if (netif) { | ||
| 139 | spin_unlock_bh(&sel_netif_lock); | ||
| 140 | kfree(new); | ||
| 141 | goto out; | 177 | goto out; |
| 142 | } | 178 | } |
| 143 | 179 | ret = security_netif_sid(dev->name, &new->nsec.sid); | |
| 180 | if (ret != 0) | ||
| 181 | goto out; | ||
| 182 | new->nsec.ifindex = ifindex; | ||
| 144 | ret = sel_netif_insert(new); | 183 | ret = sel_netif_insert(new); |
| 145 | spin_unlock_bh(&sel_netif_lock); | 184 | if (ret != 0) |
| 146 | |||
| 147 | if (ret) { | ||
| 148 | kfree(new); | ||
| 149 | netif = ERR_PTR(ret); | ||
| 150 | goto out; | 185 | goto out; |
| 151 | } | 186 | *sid = new->nsec.sid; |
| 152 | 187 | ||
| 153 | netif = new; | ||
| 154 | |||
| 155 | DEBUGP("new: ifindex=%u name=%s if_sid=%u msg_sid=%u\n", dev->ifindex, dev->name, | ||
| 156 | nsec->if_sid, nsec->msg_sid); | ||
| 157 | out: | 188 | out: |
| 158 | return netif; | 189 | spin_unlock_bh(&sel_netif_lock); |
| 159 | } | 190 | dev_put(dev); |
| 160 | 191 | if (unlikely(ret)) { | |
| 161 | static void sel_netif_assign_sids(u32 if_sid_in, u32 msg_sid_in, u32 *if_sid_out, u32 *msg_sid_out) | 192 | printk(KERN_WARNING |
| 162 | { | 193 | "SELinux: failure in sel_netif_sid_slow()," |
| 163 | if (if_sid_out) | 194 | " unable to determine network interface label (%d)\n", |
| 164 | *if_sid_out = if_sid_in; | 195 | ifindex); |
| 165 | if (msg_sid_out) | 196 | kfree(new); |
| 166 | *msg_sid_out = msg_sid_in; | 197 | } |
| 167 | } | ||
| 168 | |||
| 169 | static int sel_netif_sids_slow(struct net_device *dev, u32 *if_sid, u32 *msg_sid) | ||
| 170 | { | ||
| 171 | int ret = 0; | ||
| 172 | u32 tmp_if_sid, tmp_msg_sid; | ||
| 173 | |||
| 174 | ret = security_netif_sid(dev->name, &tmp_if_sid, &tmp_msg_sid); | ||
| 175 | if (!ret) | ||
| 176 | sel_netif_assign_sids(tmp_if_sid, tmp_msg_sid, if_sid, msg_sid); | ||
| 177 | return ret; | 198 | return ret; |
| 178 | } | 199 | } |
| 179 | 200 | ||
| 180 | int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid) | 201 | /** |
| 202 | * sel_netif_sid - Lookup the SID of a network interface | ||
| 203 | * @ifindex: the network interface | ||
| 204 | * @sid: interface SID | ||
| 205 | * | ||
| 206 | * Description: | ||
| 207 | * This function determines the SID of a network interface using the fastest | ||
| 208 | * method possible. First the interface table is queried, but if an entry | ||
| 209 | * can't be found then the policy is queried and the result is added to the | ||
| 210 | * table to speed up future queries. Returns zero on success, negative values | ||
| 211 | * on failure. | ||
| 212 | * | ||
| 213 | */ | ||
| 214 | int sel_netif_sid(int ifindex, u32 *sid) | ||
| 181 | { | 215 | { |
| 182 | int ret = 0; | ||
| 183 | struct sel_netif *netif; | 216 | struct sel_netif *netif; |
| 184 | 217 | ||
| 185 | rcu_read_lock(); | 218 | rcu_read_lock(); |
| 186 | netif = sel_netif_lookup(dev); | 219 | netif = sel_netif_find(ifindex); |
| 187 | if (IS_ERR(netif)) { | 220 | if (likely(netif != NULL)) { |
| 221 | *sid = netif->nsec.sid; | ||
| 188 | rcu_read_unlock(); | 222 | rcu_read_unlock(); |
| 189 | ret = sel_netif_sids_slow(dev, if_sid, msg_sid); | 223 | return 0; |
| 190 | goto out; | ||
| 191 | } | 224 | } |
| 192 | sel_netif_assign_sids(netif->nsec.if_sid, netif->nsec.msg_sid, if_sid, msg_sid); | ||
| 193 | rcu_read_unlock(); | 225 | rcu_read_unlock(); |
| 194 | out: | 226 | |
| 195 | return ret; | 227 | return sel_netif_sid_slow(ifindex, sid); |
| 196 | } | 228 | } |
| 197 | 229 | ||
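[Editor's note] Together, sel_netif_sid() and sel_netif_sid_slow() form a classic read-mostly cache: a lock-free fast path under rcu_read_lock(), then a locked slow path that re-checks the table before inserting so two racing misses cannot add duplicate entries. A userspace sketch of that double-checked shape, with a pthread mutex standing in for sel_netif_lock and no real RCU (the unlocked read is illustrative only, not actually safe):

    #include <pthread.h>
    #include <stdlib.h>

    struct entry { int ifindex; unsigned int sid; struct entry *next; };

    static struct entry *table[64];
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static struct entry *find(int ifindex)
    {
        struct entry *e;
        for (e = table[ifindex & 63]; e; e = e->next)
            if (e->ifindex == ifindex)
                return e;
        return NULL;
    }

    int netif_sid(int ifindex, unsigned int *sid)
    {
        struct entry *e = find(ifindex);    /* fast path, no lock */
        if (e) {
            *sid = e->sid;
            return 0;
        }
        pthread_mutex_lock(&lock);          /* slow path */
        e = find(ifindex);                  /* re-check under the lock */
        if (!e) {
            e = calloc(1, sizeof(*e));
            if (!e) {
                pthread_mutex_unlock(&lock);
                return -1;
            }
            e->ifindex = ifindex;
            e->sid = 42;                    /* stand-in for the policy query */
            e->next = table[ifindex & 63];
            table[ifindex & 63] = e;        /* publish at the bucket head */
        }
        *sid = e->sid;
        pthread_mutex_unlock(&lock);
        return 0;
    }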
| 198 | static void sel_netif_kill(struct net_device *dev) | 230 | /** |
| 231 | * sel_netif_kill - Remove an entry from the network interface table | ||
| 232 | * @ifindex: the network interface | ||
| 233 | * | ||
| 234 | * Description: | ||
| 235 | * This function removes the entry matching @ifindex from the network interface | ||
| 236 | * table if it exists. | ||
| 237 | * | ||
| 238 | */ | ||
| 239 | static void sel_netif_kill(int ifindex) | ||
| 199 | { | 240 | { |
| 200 | struct sel_netif *netif; | 241 | struct sel_netif *netif; |
| 201 | 242 | ||
| 202 | spin_lock_bh(&sel_netif_lock); | 243 | spin_lock_bh(&sel_netif_lock); |
| 203 | netif = sel_netif_find(dev); | 244 | netif = sel_netif_find(ifindex); |
| 204 | if (netif) | 245 | if (netif) |
| 205 | sel_netif_destroy(netif); | 246 | sel_netif_destroy(netif); |
| 206 | spin_unlock_bh(&sel_netif_lock); | 247 | spin_unlock_bh(&sel_netif_lock); |
| 207 | } | 248 | } |
| 208 | 249 | ||
| 250 | /** | ||
| 251 | * sel_netif_flush - Flush the entire network interface table | ||
| 252 | * | ||
| 253 | * Description: | ||
| 254 | * Remove all entries from the network interface table. | ||
| 255 | * | ||
| 256 | */ | ||
| 209 | static void sel_netif_flush(void) | 257 | static void sel_netif_flush(void) |
| 210 | { | 258 | { |
| 211 | int idx; | 259 | int idx; |
| 260 | struct sel_netif *netif; | ||
| 212 | 261 | ||
| 213 | spin_lock_bh(&sel_netif_lock); | 262 | spin_lock_bh(&sel_netif_lock); |
| 214 | for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) { | 263 | for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) |
| 215 | struct sel_netif *netif; | ||
| 216 | |||
| 217 | list_for_each_entry(netif, &sel_netif_hash[idx], list) | 264 | list_for_each_entry(netif, &sel_netif_hash[idx], list) |
| 218 | sel_netif_destroy(netif); | 265 | sel_netif_destroy(netif); |
| 219 | } | ||
| 220 | spin_unlock_bh(&sel_netif_lock); | 266 | spin_unlock_bh(&sel_netif_lock); |
| 221 | } | 267 | } |
| 222 | 268 | ||
| @@ -239,7 +285,7 @@ static int sel_netif_netdev_notifier_handler(struct notifier_block *this, | |||
| 239 | return NOTIFY_DONE; | 285 | return NOTIFY_DONE; |
| 240 | 286 | ||
| 241 | if (event == NETDEV_DOWN) | 287 | if (event == NETDEV_DOWN) |
| 242 | sel_netif_kill(dev); | 288 | sel_netif_kill(dev->ifindex); |
| 243 | 289 | ||
| 244 | return NOTIFY_DONE; | 290 | return NOTIFY_DONE; |
| 245 | } | 291 | } |
| @@ -250,10 +296,10 @@ static struct notifier_block sel_netif_netdev_notifier = { | |||
| 250 | 296 | ||
| 251 | static __init int sel_netif_init(void) | 297 | static __init int sel_netif_init(void) |
| 252 | { | 298 | { |
| 253 | int i, err = 0; | 299 | int i, err; |
| 254 | 300 | ||
| 255 | if (!selinux_enabled) | 301 | if (!selinux_enabled) |
| 256 | goto out; | 302 | return 0; |
| 257 | 303 | ||
| 258 | for (i = 0; i < SEL_NETIF_HASH_SIZE; i++) | 304 | for (i = 0; i < SEL_NETIF_HASH_SIZE; i++) |
| 259 | INIT_LIST_HEAD(&sel_netif_hash[i]); | 305 | INIT_LIST_HEAD(&sel_netif_hash[i]); |
| @@ -265,7 +311,6 @@ static __init int sel_netif_init(void) | |||
| 265 | if (err) | 311 | if (err) |
| 266 | panic("avc_add_callback() failed, error %d\n", err); | 312 | panic("avc_add_callback() failed, error %d\n", err); |
| 267 | 313 | ||
| 268 | out: | ||
| 269 | return err; | 314 | return err; |
| 270 | } | 315 | } |
| 271 | 316 | ||
diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c index 66e013d6f6f6..0fa2be4149e8 100644 --- a/security/selinux/netlabel.c +++ b/security/selinux/netlabel.c | |||
| @@ -36,6 +36,33 @@ | |||
| 36 | #include "security.h" | 36 | #include "security.h" |
| 37 | 37 | ||
| 38 | /** | 38 | /** |
| 39 | * selinux_netlbl_sidlookup_cached - Cache a SID lookup | ||
| 40 | * @skb: the packet | ||
| 41 | * @secattr: the NetLabel security attributes | ||
| 42 | * @sid: the SID | ||
| 43 | * | ||
| 44 | * Description: | ||
| 45 | * Query the SELinux security server to lookup the correct SID for the given | ||
| 46 | * security attributes. If the query is successful, cache the result to speed | ||
| 47 | * up future lookups. Returns zero on success, negative values on failure. | ||
| 48 | * | ||
| 49 | */ | ||
| 50 | static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb, | ||
| 51 | struct netlbl_lsm_secattr *secattr, | ||
| 52 | u32 *sid) | ||
| 53 | { | ||
| 54 | int rc; | ||
| 55 | |||
| 56 | rc = security_netlbl_secattr_to_sid(secattr, sid); | ||
| 57 | if (rc == 0 && | ||
| 58 | (secattr->flags & NETLBL_SECATTR_CACHEABLE) && | ||
| 59 | (secattr->flags & NETLBL_SECATTR_CACHE)) | ||
| 60 | netlbl_cache_add(skb, secattr); | ||
| 61 | |||
| 62 | return rc; | ||
| 63 | } | ||
| 64 | |||
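[Editor's note] The helper caches the lookup result only when the attributes say the label type supports caching (NETLBL_SECATTR_CACHEABLE) and the caller supplied cache storage (NETLBL_SECATTR_CACHE). The bit test in isolation, with made-up flag values:

    #include <stdio.h>

    /* hypothetical stand-ins for the NetLabel flag bits */
    #define SECATTR_CACHE     0x01  /* cache storage is present */
    #define SECATTR_CACHEABLE 0x02  /* label type supports caching */

    static int should_cache(unsigned int flags)
    {
        return (flags & SECATTR_CACHEABLE) && (flags & SECATTR_CACHE);
    }

    int main(void)
    {
        printf("%d\n", should_cache(SECATTR_CACHE | SECATTR_CACHEABLE)); /* 1 */
        printf("%d\n", should_cache(SECATTR_CACHEABLE));                 /* 0 */
        return 0;
    }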
| 65 | /** | ||
| 39 | * selinux_netlbl_sock_setsid - Label a socket using the NetLabel mechanism | 66 | * selinux_netlbl_sock_setsid - Label a socket using the NetLabel mechanism |
| 40 | * @sk: the socket to label | 67 | * @sk: the socket to label |
| 41 | * @sid: the SID to use | 68 | * @sid: the SID to use |
| @@ -137,14 +164,14 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, | |||
| 137 | * lock as other threads could have access to ssec */ | 164 | * lock as other threads could have access to ssec */ |
| 138 | rcu_read_lock(); | 165 | rcu_read_lock(); |
| 139 | selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family); | 166 | selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family); |
| 140 | newssec->sclass = ssec->sclass; | ||
| 141 | rcu_read_unlock(); | 167 | rcu_read_unlock(); |
| 142 | } | 168 | } |
| 143 | 169 | ||
| 144 | /** | 170 | /** |
| 145 | * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel | 171 | * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel |
| 146 | * @skb: the packet | 172 | * @skb: the packet |
| 147 | * @base_sid: the SELinux SID to use as a context for MLS only attributes | 173 | * @family: protocol family |
| 174 | * @type: NetLabel labeling protocol type | ||
| 148 | * @sid: the SID | 175 | * @sid: the SID |
| 149 | * | 176 | * |
| 150 | * Description: | 177 | * Description: |
| @@ -153,7 +180,10 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, | |||
| 153 | * assign to the packet. Returns zero on success, negative values on failure. | 180 | * assign to the packet. Returns zero on success, negative values on failure. |
| 154 | * | 181 | * |
| 155 | */ | 182 | */ |
| 156 | int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) | 183 | int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, |
| 184 | u16 family, | ||
| 185 | u32 *type, | ||
| 186 | u32 *sid) | ||
| 157 | { | 187 | { |
| 158 | int rc; | 188 | int rc; |
| 159 | struct netlbl_lsm_secattr secattr; | 189 | struct netlbl_lsm_secattr secattr; |
| @@ -164,15 +194,12 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) | |||
| 164 | } | 194 | } |
| 165 | 195 | ||
| 166 | netlbl_secattr_init(&secattr); | 196 | netlbl_secattr_init(&secattr); |
| 167 | rc = netlbl_skbuff_getattr(skb, &secattr); | 197 | rc = netlbl_skbuff_getattr(skb, family, &secattr); |
| 168 | if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) { | 198 | if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) |
| 169 | rc = security_netlbl_secattr_to_sid(&secattr, base_sid, sid); | 199 | rc = selinux_netlbl_sidlookup_cached(skb, &secattr, sid); |
| 170 | if (rc == 0 && | 200 | else |
| 171 | (secattr.flags & NETLBL_SECATTR_CACHEABLE) && | ||
| 172 | (secattr.flags & NETLBL_SECATTR_CACHE)) | ||
| 173 | netlbl_cache_add(skb, &secattr); | ||
| 174 | } else | ||
| 175 | *sid = SECSID_NULL; | 201 | *sid = SECSID_NULL; |
| 202 | *type = secattr.type; | ||
| 176 | netlbl_secattr_destroy(&secattr); | 203 | netlbl_secattr_destroy(&secattr); |
| 177 | 204 | ||
| 178 | return rc; | 205 | return rc; |
| @@ -190,13 +217,10 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) | |||
| 190 | */ | 217 | */ |
| 191 | void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) | 218 | void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) |
| 192 | { | 219 | { |
| 193 | struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; | ||
| 194 | struct sk_security_struct *sksec = sk->sk_security; | 220 | struct sk_security_struct *sksec = sk->sk_security; |
| 195 | struct netlbl_lsm_secattr secattr; | 221 | struct netlbl_lsm_secattr secattr; |
| 196 | u32 nlbl_peer_sid; | 222 | u32 nlbl_peer_sid; |
| 197 | 223 | ||
| 198 | sksec->sclass = isec->sclass; | ||
| 199 | |||
| 200 | rcu_read_lock(); | 224 | rcu_read_lock(); |
| 201 | 225 | ||
| 202 | if (sksec->nlbl_state != NLBL_REQUIRE) { | 226 | if (sksec->nlbl_state != NLBL_REQUIRE) { |
| @@ -207,9 +231,7 @@ void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) | |||
| 207 | netlbl_secattr_init(&secattr); | 231 | netlbl_secattr_init(&secattr); |
| 208 | if (netlbl_sock_getattr(sk, &secattr) == 0 && | 232 | if (netlbl_sock_getattr(sk, &secattr) == 0 && |
| 209 | secattr.flags != NETLBL_SECATTR_NONE && | 233 | secattr.flags != NETLBL_SECATTR_NONE && |
| 210 | security_netlbl_secattr_to_sid(&secattr, | 234 | security_netlbl_secattr_to_sid(&secattr, &nlbl_peer_sid) == 0) |
| 211 | SECINITSID_NETMSG, | ||
| 212 | &nlbl_peer_sid) == 0) | ||
| 213 | sksec->peer_sid = nlbl_peer_sid; | 235 | sksec->peer_sid = nlbl_peer_sid; |
| 214 | netlbl_secattr_destroy(&secattr); | 236 | netlbl_secattr_destroy(&secattr); |
| 215 | 237 | ||
| @@ -234,11 +256,8 @@ int selinux_netlbl_socket_post_create(struct socket *sock) | |||
| 234 | { | 256 | { |
| 235 | int rc = 0; | 257 | int rc = 0; |
| 236 | struct sock *sk = sock->sk; | 258 | struct sock *sk = sock->sk; |
| 237 | struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; | ||
| 238 | struct sk_security_struct *sksec = sk->sk_security; | 259 | struct sk_security_struct *sksec = sk->sk_security; |
| 239 | 260 | ||
| 240 | sksec->sclass = isec->sclass; | ||
| 241 | |||
| 242 | rcu_read_lock(); | 261 | rcu_read_lock(); |
| 243 | if (sksec->nlbl_state == NLBL_REQUIRE) | 262 | if (sksec->nlbl_state == NLBL_REQUIRE) |
| 244 | rc = selinux_netlbl_sock_setsid(sk, sksec->sid); | 263 | rc = selinux_netlbl_sock_setsid(sk, sksec->sid); |
| @@ -292,6 +311,7 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask) | |||
| 292 | * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel | 311 | * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel |
| 293 | * @sksec: the sock's sk_security_struct | 312 | * @sksec: the sock's sk_security_struct |
| 294 | * @skb: the packet | 313 | * @skb: the packet |
| 314 | * @family: protocol family | ||
| 295 | * @ad: the audit data | 315 | * @ad: the audit data |
| 296 | * | 316 | * |
| 297 | * Description: | 317 | * Description: |
| @@ -302,6 +322,7 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask) | |||
| 302 | */ | 322 | */ |
| 303 | int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, | 323 | int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, |
| 304 | struct sk_buff *skb, | 324 | struct sk_buff *skb, |
| 325 | u16 family, | ||
| 305 | struct avc_audit_data *ad) | 326 | struct avc_audit_data *ad) |
| 306 | { | 327 | { |
| 307 | int rc; | 328 | int rc; |
| @@ -313,16 +334,10 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, | |||
| 313 | return 0; | 334 | return 0; |
| 314 | 335 | ||
| 315 | netlbl_secattr_init(&secattr); | 336 | netlbl_secattr_init(&secattr); |
| 316 | rc = netlbl_skbuff_getattr(skb, &secattr); | 337 | rc = netlbl_skbuff_getattr(skb, family, &secattr); |
| 317 | if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) { | 338 | if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) |
| 318 | rc = security_netlbl_secattr_to_sid(&secattr, | 339 | rc = selinux_netlbl_sidlookup_cached(skb, &secattr, &nlbl_sid); |
| 319 | SECINITSID_NETMSG, | 340 | else |
| 320 | &nlbl_sid); | ||
| 321 | if (rc == 0 && | ||
| 322 | (secattr.flags & NETLBL_SECATTR_CACHEABLE) && | ||
| 323 | (secattr.flags & NETLBL_SECATTR_CACHE)) | ||
| 324 | netlbl_cache_add(skb, &secattr); | ||
| 325 | } else | ||
| 326 | nlbl_sid = SECINITSID_UNLABELED; | 341 | nlbl_sid = SECINITSID_UNLABELED; |
| 327 | netlbl_secattr_destroy(&secattr); | 342 | netlbl_secattr_destroy(&secattr); |
| 328 | if (rc != 0) | 343 | if (rc != 0) |
diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c new file mode 100644 index 000000000000..f3c526f2cacb --- /dev/null +++ b/security/selinux/netnode.c | |||
| @@ -0,0 +1,354 @@ | |||
| 1 | /* | ||
| 2 | * Network node table | ||
| 3 | * | ||
| 4 | * SELinux must keep a mapping of network nodes to labels/SIDs. This | ||
| 5 | * mapping is maintained as part of the normal policy but a fast cache is | ||
| 6 | * needed to reduce the lookup overhead since most of these queries happen on | ||
| 7 | * a per-packet basis. | ||
| 8 | * | ||
| 9 | * Author: Paul Moore <paul.moore@hp.com> | ||
| 10 | * | ||
| 11 | * This code is heavily based on the "netif" concept originally developed by | ||
| 12 | * James Morris <jmorris@redhat.com> | ||
| 13 | * (see security/selinux/netif.c for more information) | ||
| 14 | * | ||
| 15 | */ | ||
| 16 | |||
| 17 | /* | ||
| 18 | * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 | ||
| 19 | * | ||
| 20 | * This program is free software: you can redistribute it and/or modify | ||
| 21 | * it under the terms of version 2 of the GNU General Public License as | ||
| 22 | * published by the Free Software Foundation. | ||
| 23 | * | ||
| 24 | * This program is distributed in the hope that it will be useful, | ||
| 25 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 26 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 27 | * GNU General Public License for more details. | ||
| 28 | * | ||
| 29 | */ | ||
| 30 | |||
| 31 | #include <linux/types.h> | ||
| 32 | #include <linux/rcupdate.h> | ||
| 33 | #include <linux/list.h> | ||
| 34 | #include <linux/spinlock.h> | ||
| 35 | #include <linux/in.h> | ||
| 36 | #include <linux/in6.h> | ||
| 37 | #include <linux/ip.h> | ||
| 38 | #include <linux/ipv6.h> | ||
| 39 | #include <net/ip.h> | ||
| 40 | #include <net/ipv6.h> | ||
| 41 | #include <asm/bug.h> | ||
| 42 | |||
| 43 | #include "objsec.h" | ||
| 44 | |||
| 45 | #define SEL_NETNODE_HASH_SIZE 256 | ||
| 46 | #define SEL_NETNODE_HASH_BKT_LIMIT 16 | ||
| 47 | |||
| 48 | struct sel_netnode { | ||
| 49 | struct netnode_security_struct nsec; | ||
| 50 | |||
| 51 | struct list_head list; | ||
| 52 | struct rcu_head rcu; | ||
| 53 | }; | ||
| 54 | |||
| 55 | /* NOTE: we are using a combined hash table for both IPv4 and IPv6, the reason | ||
| 56 | * for this is that I suspect most users will not make heavy use of both | ||
| 57 | * address families at the same time so one table will usually end up wasted, | ||
| 58 | * if this becomes a problem we can always add a hash table for each address | ||
| 59 | * family later */ | ||
| 60 | |||
| 61 | static LIST_HEAD(sel_netnode_list); | ||
| 62 | static DEFINE_SPINLOCK(sel_netnode_lock); | ||
| 63 | static struct list_head sel_netnode_hash[SEL_NETNODE_HASH_SIZE]; | ||
| 64 | |||
| 65 | /** | ||
| 66 | * sel_netnode_free - Frees a node entry | ||
| 67 | * @p: the entry's RCU field | ||
| 68 | * | ||
| 69 | * Description: | ||
| 70 | * This function is designed to be used as a callback to the call_rcu() | ||
| 71 | * function so that memory allocated to a hash table node entry can be | ||
| 72 | * released safely. | ||
| 73 | * | ||
| 74 | */ | ||
| 75 | static void sel_netnode_free(struct rcu_head *p) | ||
| 76 | { | ||
| 77 | struct sel_netnode *node = container_of(p, struct sel_netnode, rcu); | ||
| 78 | kfree(node); | ||
| 79 | } | ||
| 80 | |||
| 81 | /** | ||
| 82 | * sel_netnode_hashfn_ipv4 - IPv4 hashing function for the node table | ||
| 83 | * @addr: IPv4 address | ||
| 84 | * | ||
| 85 | * Description: | ||
| 86 | * This is the IPv4 hashing function for the network node table; it returns | ||
| 87 | * the bucket number for the given IP address. | ||
| 88 | * | ||
| 89 | */ | ||
| 90 | static u32 sel_netnode_hashfn_ipv4(__be32 addr) | ||
| 91 | { | ||
| 92 | /* at some point we should determine if the mismatch in byte order | ||
| 93 | * affects the hash function dramatically */ | ||
| 94 | return (addr & (SEL_NETNODE_HASH_SIZE - 1)); | ||
| 95 | } | ||
| 96 | |||
| 97 | /** | ||
| 98 | * sel_netnode_hashfn_ipv6 - IPv6 hashing function for the node table | ||
| 99 | * @addr: IPv6 address | ||
| 100 | * | ||
| 101 | * Description: | ||
| 102 | * This is the IPv6 hashing function for the network node table; it returns | ||
| 103 | * the bucket number for the given IP address. | ||
| 104 | * | ||
| 105 | */ | ||
| 106 | static u32 sel_netnode_hashfn_ipv6(const struct in6_addr *addr) | ||
| 107 | { | ||
| 108 | /* just hash the least significant 32 bits to keep things fast (they | ||
| 109 | * are the most likely to be different anyway), we can revisit this | ||
| 110 | * later if needed */ | ||
| 111 | return (addr->s6_addr32[3] & (SEL_NETNODE_HASH_SIZE - 1)); | ||
| 112 | } | ||
| 113 | |||
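[Editor's note] Both functions reduce an address to its low bits; for IPv6 only the last 32-bit word is hashed, trading distribution for speed (the byte-order caveat in the IPv4 comment applies here too). A sketch of the two reductions (the kernel operates on __be32 values; the constants below are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    #define HASH_SIZE 256   /* mirrors SEL_NETNODE_HASH_SIZE */

    static uint32_t hash_ipv4(uint32_t addr)
    {
        return addr & (HASH_SIZE - 1);
    }

    static uint32_t hash_ipv6(const uint32_t addr[4])
    {
        /* only the last word: cheap, and usually the most variable */
        return addr[3] & (HASH_SIZE - 1);
    }

    int main(void)
    {
        uint32_t v6[4] = { 0x20010db8u, 0, 0, 1 };
        printf("v4 bucket %u, v6 bucket %u\n",
               hash_ipv4(0x0a000001u), hash_ipv6(v6));
        return 0;
    }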
| 114 | /** | ||
| 115 | * sel_netnode_find - Search for a node record | ||
| 116 | * @addr: IP address | ||
| 117 | * @family: address family | ||
| 118 | * | ||
| 119 | * Description: | ||
| 120 | * Search the network node table and return the record matching @addr. If an | ||
| 121 | * entry can not be found in the table return NULL. | ||
| 122 | * | ||
| 123 | */ | ||
| 124 | static struct sel_netnode *sel_netnode_find(const void *addr, u16 family) | ||
| 125 | { | ||
| 126 | u32 idx; | ||
| 127 | struct sel_netnode *node; | ||
| 128 | |||
| 129 | switch (family) { | ||
| 130 | case PF_INET: | ||
| 131 | idx = sel_netnode_hashfn_ipv4(*(__be32 *)addr); | ||
| 132 | break; | ||
| 133 | case PF_INET6: | ||
| 134 | idx = sel_netnode_hashfn_ipv6(addr); | ||
| 135 | break; | ||
| 136 | default: | ||
| 137 | BUG(); | ||
| 138 | } | ||
| 139 | |||
| 140 | list_for_each_entry_rcu(node, &sel_netnode_hash[idx], list) | ||
| 141 | if (node->nsec.family == family) | ||
| 142 | switch (family) { | ||
| 143 | case PF_INET: | ||
| 144 | if (node->nsec.addr.ipv4 == *(__be32 *)addr) | ||
| 145 | return node; | ||
| 146 | break; | ||
| 147 | case PF_INET6: | ||
| 148 | if (ipv6_addr_equal(&node->nsec.addr.ipv6, | ||
| 149 | addr)) | ||
| 150 | return node; | ||
| 151 | break; | ||
| 152 | } | ||
| 153 | |||
| 154 | return NULL; | ||
| 155 | } | ||
| 156 | |||
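[Editor's note] The find routine dispatches on address family twice: once to pick the bucket, once to compare the stored address against the caller's. The comparison half of that, as a userspace sketch (struct and helper names are local to the example):

    #include <stdio.h>
    #include <string.h>
    #include <netinet/in.h>
    #include <sys/socket.h>

    struct node {
        unsigned short family;              /* PF_INET or PF_INET6 */
        union {
            struct in_addr  ipv4;
            struct in6_addr ipv6;
        } addr;
    };

    static int node_matches(const struct node *n, const void *addr,
                            unsigned short family)
    {
        if (n->family != family)
            return 0;
        switch (family) {
        case PF_INET:
            return n->addr.ipv4.s_addr ==
                   ((const struct in_addr *)addr)->s_addr;
        case PF_INET6:
            return memcmp(&n->addr.ipv6, addr,
                          sizeof(struct in6_addr)) == 0;
        }
        return 0;
    }

    int main(void)
    {
        struct node n = { .family = PF_INET };
        struct in_addr a = { .s_addr = 0x0100007f };    /* 127.0.0.1 (LE) */
        n.addr.ipv4 = a;
        printf("%d\n", node_matches(&n, &a, PF_INET));  /* prints 1 */
        return 0;
    }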
| 157 | /** | ||
| 158 | * sel_netnode_insert - Insert a new node into the table | ||
| 159 | * @node: the new node record | ||
| 160 | * | ||
| 161 | * Description: | ||
| 162 | * Add a new node record to the network address hash table. Returns zero on | ||
| 163 | * success, negative values on failure. | ||
| 164 | * | ||
| 165 | */ | ||
| 166 | static int sel_netnode_insert(struct sel_netnode *node) | ||
| 167 | { | ||
| 168 | u32 idx; | ||
| 169 | u32 count = 0; | ||
| 170 | struct sel_netnode *iter; | ||
| 171 | |||
| 172 | switch (node->nsec.family) { | ||
| 173 | case PF_INET: | ||
| 174 | idx = sel_netnode_hashfn_ipv4(node->nsec.addr.ipv4); | ||
| 175 | break; | ||
| 176 | case PF_INET6: | ||
| 177 | idx = sel_netnode_hashfn_ipv6(&node->nsec.addr.ipv6); | ||
| 178 | break; | ||
| 179 | default: | ||
| 180 | BUG(); | ||
| 181 | } | ||
| 182 | list_add_rcu(&node->list, &sel_netnode_hash[idx]); | ||
| 183 | |||
| 184 | /* we need to impose a limit on the growth of the hash table so check | ||
| 185 | * this bucket to make sure it is within the specified bounds */ | ||
| 186 | list_for_each_entry(iter, &sel_netnode_hash[idx], list) | ||
| 187 | if (++count > SEL_NETNODE_HASH_BKT_LIMIT) { | ||
| 188 | list_del_rcu(&iter->list); | ||
| 189 | call_rcu(&iter->rcu, sel_netnode_free); | ||
| 190 | break; | ||
| 191 | } | ||
| 192 | |||
| 193 | return 0; | ||
| 194 | } | ||
| 195 | |||
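[Editor's note] Unlike the interface table, which refuses inserts past a global cap, the node table inserts at the bucket head and then evicts whatever falls past SEL_NETNODE_HASH_BKT_LIMIT, so each bucket behaves like a small bounded list ordered by insertion. A userspace sketch of that logic (a tiny limit makes the eviction visible; the kernel frees via call_rcu() rather than immediately):

    #include <stdio.h>
    #include <stdlib.h>

    #define BKT_LIMIT 3     /* kernel value is 16 */

    struct node { int key; struct node *next; };

    static void insert_bounded(struct node **bucket, struct node *n)
    {
        struct node **pp = bucket;
        unsigned int count = 0;

        n->next = *bucket;          /* new entry goes to the head */
        *bucket = n;
        while (*pp) {
            if (++count > BKT_LIMIT) {
                struct node *victim = *pp;  /* deepest = oldest entry */
                *pp = victim->next;
                free(victim);
                break;
            }
            pp = &(*pp)->next;
        }
    }

    int main(void)
    {
        struct node *bucket = NULL, *n;
        int i;

        for (i = 0; i < 5; i++) {
            n = calloc(1, sizeof(*n));
            n->key = i;
            insert_bounded(&bucket, n);
        }
        for (n = bucket; n; n = n->next)
            printf("%d ", n->key);  /* prints: 4 3 2 */
        printf("\n");
        return 0;
    }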
| 196 | /** | ||
| 197 | * sel_netnode_destroy - Remove a node record from the table | ||
| 198 | * @node: the existing node record | ||
| 199 | * | ||
| 200 | * Description: | ||
| 201 | * Remove an existing node record from the network address table. | ||
| 202 | * | ||
| 203 | */ | ||
| 204 | static void sel_netnode_destroy(struct sel_netnode *node) | ||
| 205 | { | ||
| 206 | list_del_rcu(&node->list); | ||
| 207 | call_rcu(&node->rcu, sel_netnode_free); | ||
| 208 | } | ||
| 209 | |||
| 210 | /** | ||
| 211 | * sel_netnode_sid_slow - Lookup the SID of a network address using the policy | ||
| 212 | * @addr: the IP address | ||
| 213 | * @family: the address family | ||
| 214 | * @sid: node SID | ||
| 215 | * | ||
| 216 | * Description: | ||
| 217 | * This function determines the SID of a network address by querying the | ||
| 218 | * security policy. The result is added to the network address table to | ||
| 219 | * speed up future queries. Returns zero on success, negative values on | ||
| 220 | * failure. | ||
| 221 | * | ||
| 222 | */ | ||
| 223 | static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid) | ||
| 224 | { | ||
| 225 | int ret; | ||
| 226 | struct sel_netnode *node; | ||
| 227 | struct sel_netnode *new = NULL; | ||
| 228 | |||
| 229 | spin_lock_bh(&sel_netnode_lock); | ||
| 230 | node = sel_netnode_find(addr, family); | ||
| 231 | if (node != NULL) { | ||
| 232 | *sid = node->nsec.sid; | ||
| 233 | ret = 0; | ||
| 234 | goto out; | ||
| 235 | } | ||
| 236 | new = kzalloc(sizeof(*new), GFP_ATOMIC); | ||
| 237 | if (new == NULL) { | ||
| 238 | ret = -ENOMEM; | ||
| 239 | goto out; | ||
| 240 | } | ||
| 241 | switch (family) { | ||
| 242 | case PF_INET: | ||
| 243 | ret = security_node_sid(PF_INET, | ||
| 244 | addr, sizeof(struct in_addr), | ||
| 245 | &new->nsec.sid); | ||
| 246 | new->nsec.addr.ipv4 = *(__be32 *)addr; | ||
| 247 | break; | ||
| 248 | case PF_INET6: | ||
| 249 | ret = security_node_sid(PF_INET6, | ||
| 250 | addr, sizeof(struct in6_addr), | ||
| 251 | &new->nsec.sid); | ||
| 252 | ipv6_addr_copy(&new->nsec.addr.ipv6, addr); | ||
| 253 | break; | ||
| 254 | default: | ||
| 255 | BUG(); | ||
| 256 | } | ||
| 257 | if (ret != 0) | ||
| 258 | goto out; | ||
| 259 | new->nsec.family = family; | ||
| 260 | ret = sel_netnode_insert(new); | ||
| 261 | if (ret != 0) | ||
| 262 | goto out; | ||
| 263 | *sid = new->nsec.sid; | ||
| 264 | |||
| 265 | out: | ||
| 266 | spin_unlock_bh(&sel_netnode_lock); | ||
| 267 | if (unlikely(ret)) { | ||
| 268 | printk(KERN_WARNING | ||
| 269 | "SELinux: failure in sel_netnode_sid_slow()," | ||
| 270 | " unable to determine network node label\n"); | ||
| 271 | kfree(new); | ||
| 272 | } | ||
| 273 | return ret; | ||
| 274 | } | ||
| 275 | |||
| 276 | /** | ||
| 277 | * sel_netnode_sid - Lookup the SID of a network address | ||
| 278 | * @addr: the IP address | ||
| 279 | * @family: the address family | ||
| 280 | * @sid: node SID | ||
| 281 | * | ||
| 282 | * Description: | ||
| 283 | * This function determines the SID of a network address using the fastest | ||
| 284 | * method possible. First the address table is queried, but if an entry | ||
| 285 | * can't be found then the policy is queried and the result is added to the | ||
| 286 | * table to speed up future queries. Returns zero on success, negative values | ||
| 287 | * on failure. | ||
| 288 | * | ||
| 289 | */ | ||
| 290 | int sel_netnode_sid(void *addr, u16 family, u32 *sid) | ||
| 291 | { | ||
| 292 | struct sel_netnode *node; | ||
| 293 | |||
| 294 | rcu_read_lock(); | ||
| 295 | node = sel_netnode_find(addr, family); | ||
| 296 | if (node != NULL) { | ||
| 297 | *sid = node->nsec.sid; | ||
| 298 | rcu_read_unlock(); | ||
| 299 | return 0; | ||
| 300 | } | ||
| 301 | rcu_read_unlock(); | ||
| 302 | |||
| 303 | return sel_netnode_sid_slow(addr, family, sid); | ||
| 304 | } | ||
| 305 | |||
| 306 | /** | ||
| 307 | * sel_netnode_flush - Flush the entire network address table | ||
| 308 | * | ||
| 309 | * Description: | ||
| 310 | * Remove all entries from the network address table. | ||
| 311 | * | ||
| 312 | */ | ||
| 313 | static void sel_netnode_flush(void) | ||
| 314 | { | ||
| 315 | u32 idx; | ||
| 316 | struct sel_netnode *node; | ||
| 317 | |||
| 318 | spin_lock_bh(&sel_netnode_lock); | ||
| 319 | for (idx = 0; idx < SEL_NETNODE_HASH_SIZE; idx++) | ||
| 320 | list_for_each_entry(node, &sel_netnode_hash[idx], list) | ||
| 321 | sel_netnode_destroy(node); | ||
| 322 | spin_unlock_bh(&sel_netnode_lock); | ||
| 323 | } | ||
| 324 | |||
| 325 | static int sel_netnode_avc_callback(u32 event, u32 ssid, u32 tsid, | ||
| 326 | u16 class, u32 perms, u32 *retained) | ||
| 327 | { | ||
| 328 | if (event == AVC_CALLBACK_RESET) { | ||
| 329 | sel_netnode_flush(); | ||
| 330 | synchronize_net(); | ||
| 331 | } | ||
| 332 | return 0; | ||
| 333 | } | ||
| 334 | |||
| 335 | static __init int sel_netnode_init(void) | ||
| 336 | { | ||
| 337 | int iter; | ||
| 338 | int ret; | ||
| 339 | |||
| 340 | if (!selinux_enabled) | ||
| 341 | return 0; | ||
| 342 | |||
| 343 | for (iter = 0; iter < SEL_NETNODE_HASH_SIZE; iter++) | ||
| 344 | INIT_LIST_HEAD(&sel_netnode_hash[iter]); | ||
| 345 | |||
| 346 | ret = avc_add_callback(sel_netnode_avc_callback, AVC_CALLBACK_RESET, | ||
| 347 | SECSID_NULL, SECSID_NULL, SECCLASS_NULL, 0); | ||
| 348 | if (ret != 0) | ||
| 349 | panic("avc_add_callback() failed, error %d\n", ret); | ||
| 350 | |||
| 351 | return ret; | ||
| 352 | } | ||
| 353 | |||
| 354 | __initcall(sel_netnode_init); | ||
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 397fd4955fe1..a85740530afc 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c | |||
| @@ -2,6 +2,11 @@ | |||
| 2 | * | 2 | * |
| 3 | * Added conditional policy language extensions | 3 | * Added conditional policy language extensions |
| 4 | * | 4 | * |
| 5 | * Updated: Hewlett-Packard <paul.moore@hp.com> | ||
| 6 | * | ||
| 7 | * Added support for the policy capability bitmap | ||
| 8 | * | ||
| 9 | * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. | ||
| 5 | * Copyright (C) 2003 - 2004 Tresys Technology, LLC | 10 | * Copyright (C) 2003 - 2004 Tresys Technology, LLC |
| 6 | * Copyright (C) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> | 11 | * Copyright (C) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> |
| 7 | * This program is free software; you can redistribute it and/or modify | 12 | * This program is free software; you can redistribute it and/or modify |
| @@ -35,6 +40,11 @@ | |||
| 35 | #include "objsec.h" | 40 | #include "objsec.h" |
| 36 | #include "conditional.h" | 41 | #include "conditional.h" |
| 37 | 42 | ||
| 43 | /* Policy capability filenames */ | ||
| 44 | static char *policycap_names[] = { | ||
| 45 | "network_peer_controls" | ||
| 46 | }; | ||
| 47 | |||
| 38 | unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; | 48 | unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; |
| 39 | 49 | ||
| 40 | #ifdef CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT | 50 | #ifdef CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT |
| @@ -72,6 +82,9 @@ static int *bool_pending_values = NULL; | |||
| 72 | static struct dentry *class_dir = NULL; | 82 | static struct dentry *class_dir = NULL; |
| 73 | static unsigned long last_class_ino; | 83 | static unsigned long last_class_ino; |
| 74 | 84 | ||
| 85 | /* global data for policy capabilities */ | ||
| 86 | static struct dentry *policycap_dir = NULL; | ||
| 87 | |||
| 75 | extern void selnl_notify_setenforce(int val); | 88 | extern void selnl_notify_setenforce(int val); |
| 76 | 89 | ||
| 77 | /* Check whether a task is allowed to use a security operation. */ | 90 | /* Check whether a task is allowed to use a security operation. */ |
| @@ -111,10 +124,11 @@ enum sel_inos { | |||
| 111 | 124 | ||
| 112 | static unsigned long sel_last_ino = SEL_INO_NEXT - 1; | 125 | static unsigned long sel_last_ino = SEL_INO_NEXT - 1; |
| 113 | 126 | ||
| 114 | #define SEL_INITCON_INO_OFFSET 0x01000000 | 127 | #define SEL_INITCON_INO_OFFSET 0x01000000 |
| 115 | #define SEL_BOOL_INO_OFFSET 0x02000000 | 128 | #define SEL_BOOL_INO_OFFSET 0x02000000 |
| 116 | #define SEL_CLASS_INO_OFFSET 0x04000000 | 129 | #define SEL_CLASS_INO_OFFSET 0x04000000 |
| 117 | #define SEL_INO_MASK 0x00ffffff | 130 | #define SEL_POLICYCAP_INO_OFFSET 0x08000000 |
| 131 | #define SEL_INO_MASK 0x00ffffff | ||
| 118 | 132 | ||
| 119 | #define TMPBUFLEN 12 | 133 | #define TMPBUFLEN 12 |
| 120 | static ssize_t sel_read_enforce(struct file *filp, char __user *buf, | 134 | static ssize_t sel_read_enforce(struct file *filp, char __user *buf, |
| @@ -263,6 +277,7 @@ static const struct file_operations sel_policyvers_ops = { | |||
| 263 | /* declaration for sel_write_load */ | 277 | /* declaration for sel_write_load */ |
| 264 | static int sel_make_bools(void); | 278 | static int sel_make_bools(void); |
| 265 | static int sel_make_classes(void); | 279 | static int sel_make_classes(void); |
| 280 | static int sel_make_policycap(void); | ||
| 266 | 281 | ||
| 267 | /* declaration for sel_make_class_dirs */ | 282 | /* declaration for sel_make_class_dirs */ |
| 268 | static int sel_make_dir(struct inode *dir, struct dentry *dentry, | 283 | static int sel_make_dir(struct inode *dir, struct dentry *dentry, |
| @@ -323,6 +338,12 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf, | |||
| 323 | } | 338 | } |
| 324 | 339 | ||
| 325 | ret = sel_make_classes(); | 340 | ret = sel_make_classes(); |
| 341 | if (ret) { | ||
| 342 | length = ret; | ||
| 343 | goto out1; | ||
| 344 | } | ||
| 345 | |||
| 346 | ret = sel_make_policycap(); | ||
| 326 | if (ret) | 347 | if (ret) |
| 327 | length = ret; | 348 | length = ret; |
| 328 | else | 349 | else |
| @@ -1399,6 +1420,24 @@ static const struct file_operations sel_perm_ops = { | |||
| 1399 | .read = sel_read_perm, | 1420 | .read = sel_read_perm, |
| 1400 | }; | 1421 | }; |
| 1401 | 1422 | ||
| 1423 | static ssize_t sel_read_policycap(struct file *file, char __user *buf, | ||
| 1424 | size_t count, loff_t *ppos) | ||
| 1425 | { | ||
| 1426 | int value; | ||
| 1427 | char tmpbuf[TMPBUFLEN]; | ||
| 1428 | ssize_t length; | ||
| 1429 | unsigned long i_ino = file->f_path.dentry->d_inode->i_ino; | ||
| 1430 | |||
| 1431 | value = security_policycap_supported(i_ino & SEL_INO_MASK); | ||
| 1432 | length = scnprintf(tmpbuf, TMPBUFLEN, "%d", value); | ||
| 1433 | |||
| 1434 | return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); | ||
| 1435 | } | ||
| 1436 | |||
| 1437 | static const struct file_operations sel_policycap_ops = { | ||
| 1438 | .read = sel_read_policycap, | ||
| 1439 | }; | ||
| 1440 | |||
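[Editor's note] The read handler needs no private data because each capability file borrows the trick already used for booleans and classes: the capability's index is OR-ed into the inode number when the file is created (see sel_make_policycap() below) and recovered here with SEL_INO_MASK. The encode/decode round trip in isolation:

    #include <stdio.h>

    #define SEL_POLICYCAP_INO_OFFSET 0x08000000UL
    #define SEL_INO_MASK             0x00ffffffUL

    int main(void)
    {
        unsigned long iter = 0;     /* index of "network_peer_controls" */
        unsigned long ino = iter | SEL_POLICYCAP_INO_OFFSET;    /* encode */
        unsigned long idx = ino & SEL_INO_MASK;                 /* decode */
        printf("ino=0x%08lx idx=%lu\n", ino, idx);
        return 0;
    }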
| 1402 | static int sel_make_perm_files(char *objclass, int classvalue, | 1441 | static int sel_make_perm_files(char *objclass, int classvalue, |
| 1403 | struct dentry *dir) | 1442 | struct dentry *dir) |
| 1404 | { | 1443 | { |
| @@ -1545,6 +1584,36 @@ out: | |||
| 1545 | return rc; | 1584 | return rc; |
| 1546 | } | 1585 | } |
| 1547 | 1586 | ||
| 1587 | static int sel_make_policycap(void) | ||
| 1588 | { | ||
| 1589 | unsigned int iter; | ||
| 1590 | struct dentry *dentry = NULL; | ||
| 1591 | struct inode *inode = NULL; | ||
| 1592 | |||
| 1593 | sel_remove_entries(policycap_dir); | ||
| 1594 | |||
| 1595 | for (iter = 0; iter <= POLICYDB_CAPABILITY_MAX; iter++) { | ||
| 1596 | if (iter < ARRAY_SIZE(policycap_names)) | ||
| 1597 | dentry = d_alloc_name(policycap_dir, | ||
| 1598 | policycap_names[iter]); | ||
| 1599 | else | ||
| 1600 | dentry = d_alloc_name(policycap_dir, "unknown"); | ||
| 1601 | |||
| 1602 | if (dentry == NULL) | ||
| 1603 | return -ENOMEM; | ||
| 1604 | |||
| 1605 | inode = sel_make_inode(policycap_dir->d_sb, S_IFREG | S_IRUGO); | ||
| 1606 | if (inode == NULL) | ||
| 1607 | return -ENOMEM; | ||
| 1608 | |||
| 1609 | inode->i_fop = &sel_policycap_ops; | ||
| 1610 | inode->i_ino = iter | SEL_POLICYCAP_INO_OFFSET; | ||
| 1611 | d_add(dentry, inode); | ||
| 1612 | } | ||
| 1613 | |||
| 1614 | return 0; | ||
| 1615 | } | ||
| 1616 | |||
| 1548 | static int sel_make_dir(struct inode *dir, struct dentry *dentry, | 1617 | static int sel_make_dir(struct inode *dir, struct dentry *dentry, |
| 1549 | unsigned long *ino) | 1618 | unsigned long *ino) |
| 1550 | { | 1619 | { |
| @@ -1673,6 +1742,18 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) | |||
| 1673 | 1742 | ||
| 1674 | class_dir = dentry; | 1743 | class_dir = dentry; |
| 1675 | 1744 | ||
| 1745 | dentry = d_alloc_name(sb->s_root, "policy_capabilities"); | ||
| 1746 | if (!dentry) { | ||
| 1747 | ret = -ENOMEM; | ||
| 1748 | goto err; | ||
| 1749 | } | ||
| 1750 | |||
| 1751 | ret = sel_make_dir(root_inode, dentry, &sel_last_ino); | ||
| 1752 | if (ret) | ||
| 1753 | goto err; | ||
| 1754 | |||
| 1755 | policycap_dir = dentry; | ||
| 1756 | |||
| 1676 | out: | 1757 | out: |
| 1677 | return ret; | 1758 | return ret; |
| 1678 | err: | 1759 | err: |
diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c index 3bbcb5369af9..feaf0a5b828f 100644 --- a/security/selinux/ss/mls.c +++ b/security/selinux/ss/mls.c | |||
| @@ -562,7 +562,7 @@ void mls_export_netlbl_lvl(struct context *context, | |||
| 562 | if (!selinux_mls_enabled) | 562 | if (!selinux_mls_enabled) |
| 563 | return; | 563 | return; |
| 564 | 564 | ||
| 565 | secattr->mls_lvl = context->range.level[0].sens - 1; | 565 | secattr->attr.mls.lvl = context->range.level[0].sens - 1; |
| 566 | secattr->flags |= NETLBL_SECATTR_MLS_LVL; | 566 | secattr->flags |= NETLBL_SECATTR_MLS_LVL; |
| 567 | } | 567 | } |
| 568 | 568 | ||
| @@ -582,7 +582,7 @@ void mls_import_netlbl_lvl(struct context *context, | |||
| 582 | if (!selinux_mls_enabled) | 582 | if (!selinux_mls_enabled) |
| 583 | return; | 583 | return; |
| 584 | 584 | ||
| 585 | context->range.level[0].sens = secattr->mls_lvl + 1; | 585 | context->range.level[0].sens = secattr->attr.mls.lvl + 1; |
| 586 | context->range.level[1].sens = context->range.level[0].sens; | 586 | context->range.level[1].sens = context->range.level[0].sens; |
| 587 | } | 587 | } |
| 588 | 588 | ||
| @@ -605,8 +605,8 @@ int mls_export_netlbl_cat(struct context *context, | |||
| 605 | return 0; | 605 | return 0; |
| 606 | 606 | ||
| 607 | rc = ebitmap_netlbl_export(&context->range.level[0].cat, | 607 | rc = ebitmap_netlbl_export(&context->range.level[0].cat, |
| 608 | &secattr->mls_cat); | 608 | &secattr->attr.mls.cat); |
| 609 | if (rc == 0 && secattr->mls_cat != NULL) | 609 | if (rc == 0 && secattr->attr.mls.cat != NULL) |
| 610 | secattr->flags |= NETLBL_SECATTR_MLS_CAT; | 610 | secattr->flags |= NETLBL_SECATTR_MLS_CAT; |
| 611 | 611 | ||
| 612 | return rc; | 612 | return rc; |
| @@ -633,7 +633,7 @@ int mls_import_netlbl_cat(struct context *context, | |||
| 633 | return 0; | 633 | return 0; |
| 634 | 634 | ||
| 635 | rc = ebitmap_netlbl_import(&context->range.level[0].cat, | 635 | rc = ebitmap_netlbl_import(&context->range.level[0].cat, |
| 636 | secattr->mls_cat); | 636 | secattr->attr.mls.cat); |
| 637 | if (rc != 0) | 637 | if (rc != 0) |
| 638 | goto import_netlbl_cat_failure; | 638 | goto import_netlbl_cat_failure; |
| 639 | 639 | ||
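[Editor's note] The mechanical mls_lvl/mls_cat renames above track a NetLabel struct reshuffle: the MLS fields now live in a nested attr.mls member of netlbl_lsm_secattr. A simplified picture of the new layout, showing only the fields this diff touches with everything else elided:

    struct netlbl_lsm_secattr_sketch {
        unsigned int flags;         /* NETLBL_SECATTR_MLS_LVL, ... */
        struct {
            struct {
                unsigned int lvl;   /* was secattr->mls_lvl */
                void *cat;          /* was secattr->mls_cat */
            } mls;
        } attr;
    };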
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index b582aae3c62c..bd7d6a00342d 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c | |||
| @@ -13,6 +13,11 @@ | |||
| 13 | * | 13 | * |
| 14 | * Added conditional policy language extensions | 14 | * Added conditional policy language extensions |
| 15 | * | 15 | * |
| 16 | * Updated: Hewlett-Packard <paul.moore@hp.com> | ||
| 17 | * | ||
| 18 | * Added support for the policy capability bitmap | ||
| 19 | * | ||
| 20 | * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. | ||
| 16 | * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. | 21 | * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. |
| 17 | * Copyright (C) 2003 - 2004 Tresys Technology, LLC | 22 | * Copyright (C) 2003 - 2004 Tresys Technology, LLC |
| 18 | * This program is free software; you can redistribute it and/or modify | 23 | * This program is free software; you can redistribute it and/or modify |
| @@ -102,6 +107,11 @@ static struct policydb_compat_info policydb_compat[] = { | |||
| 102 | .sym_num = SYM_NUM, | 107 | .sym_num = SYM_NUM, |
| 103 | .ocon_num = OCON_NUM, | 108 | .ocon_num = OCON_NUM, |
| 104 | }, | 109 | }, |
| 110 | { | ||
| 111 | .version = POLICYDB_VERSION_POLCAP, | ||
| 112 | .sym_num = SYM_NUM, | ||
| 113 | .ocon_num = OCON_NUM, | ||
| 114 | } | ||
| 105 | }; | 115 | }; |
| 106 | 116 | ||
| 107 | static struct policydb_compat_info *policydb_lookup_compat(int version) | 117 | static struct policydb_compat_info *policydb_lookup_compat(int version) |
| @@ -183,6 +193,8 @@ static int policydb_init(struct policydb *p) | |||
| 183 | if (rc) | 193 | if (rc) |
| 184 | goto out_free_symtab; | 194 | goto out_free_symtab; |
| 185 | 195 | ||
| 196 | ebitmap_init(&p->policycaps); | ||
| 197 | |||
| 186 | out: | 198 | out: |
| 187 | return rc; | 199 | return rc; |
| 188 | 200 | ||
| @@ -673,8 +685,8 @@ void policydb_destroy(struct policydb *p) | |||
| 673 | ebitmap_destroy(&p->type_attr_map[i]); | 685 | ebitmap_destroy(&p->type_attr_map[i]); |
| 674 | } | 686 | } |
| 675 | kfree(p->type_attr_map); | 687 | kfree(p->type_attr_map); |
| 676 | |||
| 677 | kfree(p->undefined_perms); | 688 | kfree(p->undefined_perms); |
| 689 | ebitmap_destroy(&p->policycaps); | ||
| 678 | 690 | ||
| 679 | return; | 691 | return; |
| 680 | } | 692 | } |
| @@ -1554,6 +1566,10 @@ int policydb_read(struct policydb *p, void *fp) | |||
| 1554 | p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN); | 1566 | p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN); |
| 1555 | p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN); | 1567 | p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN); |
| 1556 | 1568 | ||
| 1569 | if (p->policyvers >= POLICYDB_VERSION_POLCAP && | ||
| 1570 | ebitmap_read(&p->policycaps, fp) != 0) | ||
| 1571 | goto bad; | ||
| 1572 | |||
| 1557 | info = policydb_lookup_compat(p->policyvers); | 1573 | info = policydb_lookup_compat(p->policyvers); |
| 1558 | if (!info) { | 1574 | if (!info) { |
| 1559 | printk(KERN_ERR "security: unable to find policy compat info " | 1575 | printk(KERN_ERR "security: unable to find policy compat info " |
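[Editor's note] The capability bitmap is an optional trailing section of the binary policy image: policydb_read() consumes it only when the on-disk version is at least POLICYDB_VERSION_POLCAP, so older images keep loading unchanged. The general shape of a version-gated read, as a userspace sketch with invented version numbers:

    #include <stdio.h>

    #define VERSION_BASE   21   /* illustrative values only */
    #define VERSION_POLCAP 22

    static int read_policy(FILE *fp, unsigned int vers, unsigned int *caps)
    {
        *caps = 0;
        if (vers >= VERSION_POLCAP &&
            fread(caps, sizeof(*caps), 1, fp) != 1)
            return -1;  /* version promised the section, but it is absent */
        return 0;       /* older images simply skip the optional section */
    }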
diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h index ed6fc687c66f..c4ce996e202c 100644 --- a/security/selinux/ss/policydb.h +++ b/security/selinux/ss/policydb.h | |||
| @@ -241,6 +241,8 @@ struct policydb { | |||
| 241 | /* type -> attribute reverse mapping */ | 241 | /* type -> attribute reverse mapping */ |
| 242 | struct ebitmap *type_attr_map; | 242 | struct ebitmap *type_attr_map; |
| 243 | 243 | ||
| 244 | struct ebitmap policycaps; | ||
| 245 | |||
| 244 | unsigned int policyvers; | 246 | unsigned int policyvers; |
| 245 | 247 | ||
| 246 | unsigned int reject_unknown : 1; | 248 | unsigned int reject_unknown : 1; |
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 4bf715d4cf29..f96dec1f9258 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c | |||
| @@ -16,12 +16,13 @@ | |||
| 16 | * Updated: Hewlett-Packard <paul.moore@hp.com> | 16 | * Updated: Hewlett-Packard <paul.moore@hp.com> |
| 17 | * | 17 | * |
| 18 | * Added support for NetLabel | 18 | * Added support for NetLabel |
| 19 | * Added support for the policy capability bitmap | ||
| 19 | * | 20 | * |
| 20 | * Updated: Chad Sellers <csellers@tresys.com> | 21 | * Updated: Chad Sellers <csellers@tresys.com> |
| 21 | * | 22 | * |
| 22 | * Added validation of kernel classes and permissions | 23 | * Added validation of kernel classes and permissions |
| 23 | * | 24 | * |
| 24 | * Copyright (C) 2006 Hewlett-Packard Development Company, L.P. | 25 | * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. |
| 25 | * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. | 26 | * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. |
| 26 | * Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC | 27 | * Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC |
| 27 | * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> | 28 | * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> |
| @@ -59,6 +60,8 @@ | |||
| 59 | extern void selnl_notify_policyload(u32 seqno); | 60 | extern void selnl_notify_policyload(u32 seqno); |
| 60 | unsigned int policydb_loaded_version; | 61 | unsigned int policydb_loaded_version; |
| 61 | 62 | ||
| 63 | int selinux_policycap_netpeer; | ||
| 64 | |||
| 62 | /* | 65 | /* |
| 63 | * This is declared in avc.c | 66 | * This is declared in avc.c |
| 64 | */ | 67 | */ |
| @@ -1299,6 +1302,12 @@ bad: | |||
| 1299 | goto out; | 1302 | goto out; |
| 1300 | } | 1303 | } |
| 1301 | 1304 | ||
| 1305 | static void security_load_policycaps(void) | ||
| 1306 | { | ||
| 1307 | selinux_policycap_netpeer = ebitmap_get_bit(&policydb.policycaps, | ||
| 1308 | POLICYDB_CAPABILITY_NETPEER); | ||
| 1309 | } | ||
| 1310 | |||
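[Editor's note] security_load_policycaps() snapshots the relevant ebitmap bit into a plain global at policy-load time, so per-packet code can test selinux_policycap_netpeer without walking the bitmap. The same idea with an ordinary bitmask standing in for the ebitmap:

    #include <stdio.h>

    #define CAP_NETPEER 0   /* bit index, mirrors POLICYDB_CAPABILITY_NETPEER */

    static unsigned long policycaps;    /* stand-in for the policy ebitmap */
    static int policycap_netpeer;       /* cached flag, read per packet */

    static void load_policycaps(void)
    {
        policycap_netpeer = !!(policycaps & (1UL << CAP_NETPEER));
    }

    int main(void)
    {
        policycaps = 1UL << CAP_NETPEER;    /* pretend the policy set it */
        load_policycaps();
        printf("netpeer=%d\n", policycap_netpeer);
        return 0;
    }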
| 1302 | extern void selinux_complete_init(void); | 1311 | extern void selinux_complete_init(void); |
| 1303 | static int security_preserve_bools(struct policydb *p); | 1312 | static int security_preserve_bools(struct policydb *p); |
| 1304 | 1313 | ||
| @@ -1346,6 +1355,7 @@ int security_load_policy(void *data, size_t len) | |||
| 1346 | avtab_cache_destroy(); | 1355 | avtab_cache_destroy(); |
| 1347 | return -EINVAL; | 1356 | return -EINVAL; |
| 1348 | } | 1357 | } |
| 1358 | security_load_policycaps(); | ||
| 1349 | policydb_loaded_version = policydb.policyvers; | 1359 | policydb_loaded_version = policydb.policyvers; |
| 1350 | ss_initialized = 1; | 1360 | ss_initialized = 1; |
| 1351 | seqno = ++latest_granting; | 1361 | seqno = ++latest_granting; |
| @@ -1404,6 +1414,7 @@ int security_load_policy(void *data, size_t len) | |||
| 1404 | POLICY_WRLOCK; | 1414 | POLICY_WRLOCK; |
| 1405 | memcpy(&policydb, &newpolicydb, sizeof policydb); | 1415 | memcpy(&policydb, &newpolicydb, sizeof policydb); |
| 1406 | sidtab_set(&sidtab, &newsidtab); | 1416 | sidtab_set(&sidtab, &newsidtab); |
| 1417 | security_load_policycaps(); | ||
| 1407 | seqno = ++latest_granting; | 1418 | seqno = ++latest_granting; |
| 1408 | policydb_loaded_version = policydb.policyvers; | 1419 | policydb_loaded_version = policydb.policyvers; |
| 1409 | POLICY_WRUNLOCK; | 1420 | POLICY_WRUNLOCK; |
| @@ -1478,11 +1489,8 @@ out: | |||
| 1478 | * security_netif_sid - Obtain the SID for a network interface. | 1489 | * security_netif_sid - Obtain the SID for a network interface. |
| 1479 | * @name: interface name | 1490 | * @name: interface name |
| 1480 | * @if_sid: interface SID | 1491 | * @if_sid: interface SID |
| 1481 | * @msg_sid: default SID for received packets | ||
| 1482 | */ | 1492 | */ |
| 1483 | int security_netif_sid(char *name, | 1493 | int security_netif_sid(char *name, u32 *if_sid) |
| 1484 | u32 *if_sid, | ||
| 1485 | u32 *msg_sid) | ||
| 1486 | { | 1494 | { |
| 1487 | int rc = 0; | 1495 | int rc = 0; |
| 1488 | struct ocontext *c; | 1496 | struct ocontext *c; |
| @@ -1510,11 +1518,8 @@ int security_netif_sid(char *name, | |||
| 1510 | goto out; | 1518 | goto out; |
| 1511 | } | 1519 | } |
| 1512 | *if_sid = c->sid[0]; | 1520 | *if_sid = c->sid[0]; |
| 1513 | *msg_sid = c->sid[1]; | 1521 | } else |
| 1514 | } else { | ||
| 1515 | *if_sid = SECINITSID_NETIF; | 1522 | *if_sid = SECINITSID_NETIF; |
| 1516 | *msg_sid = SECINITSID_NETMSG; | ||
| 1517 | } | ||
| 1518 | 1523 | ||
| 1519 | out: | 1524 | out: |
| 1520 | POLICY_RDUNLOCK; | 1525 | POLICY_RDUNLOCK; |
| @@ -2049,6 +2054,91 @@ out: | |||
| 2049 | return rc; | 2054 | return rc; |
| 2050 | } | 2055 | } |
| 2051 | 2056 | ||
| 2057 | /** | ||
| 2058 | * security_net_peersid_resolve - Compare and resolve two network peer SIDs | ||
| 2059 | * @nlbl_sid: NetLabel SID | ||
| 2060 | * @nlbl_type: NetLabel labeling protocol type | ||
| 2061 | * @xfrm_sid: XFRM SID | ||
| 2062 | * @peer_sid: the resolved network peer SID | ||
| 2062 | * | ||
| 2063 | * Description: | ||
| 2064 | * Compare the @nlbl_sid and @xfrm_sid values and, if the two SIDs can be | ||
| 2065 | * resolved into a single SID, it is returned via @peer_sid and the function | ||
| 2066 | * returns zero. Otherwise @peer_sid is set to SECSID_NULL and the function | ||
| 2067 | * returns a negative value. A table summarizing the behavior is below: | ||
| 2068 | * | ||
| 2069 | * | function return | @peer_sid | ||
| 2070 | * ------------------------------+-----------------+----------------- | ||
| 2071 | * no peer labels | 0 | SECSID_NULL | ||
| 2072 | * single peer label | 0 | <peer_label> | ||
| 2073 | * multiple, consistent labels | 0 | <peer_label> | ||
| 2074 | * multiple, inconsistent labels | -<errno> | SECSID_NULL | ||
| 2075 | * | ||
| 2076 | */ | ||
| 2077 | int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, | ||
| 2078 | u32 xfrm_sid, | ||
| 2079 | u32 *peer_sid) | ||
| 2080 | { | ||
| 2081 | int rc; | ||
| 2082 | struct context *nlbl_ctx; | ||
| 2083 | struct context *xfrm_ctx; | ||
| 2084 | |||
| 2085 | /* handle the common (which also happens to be the set of easy) cases | ||
| 2086 | * right away; these two if statements catch everything involving a | ||
| 2087 | * single or absent peer SID/label */ | ||
| 2088 | if (xfrm_sid == SECSID_NULL) { | ||
| 2089 | *peer_sid = nlbl_sid; | ||
| 2090 | return 0; | ||
| 2091 | } | ||
| 2092 | /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label | ||
| 2093 | * and is treated as if nlbl_sid == SECSID_NULL when an XFRM SID/label | ||
| 2094 | * is present */ | ||
| 2095 | if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) { | ||
| 2096 | *peer_sid = xfrm_sid; | ||
| 2097 | return 0; | ||
| 2098 | } | ||
| 2099 | |||
| 2100 | /* we don't need to check ss_initialized here since the only way both | ||
| 2101 | * nlbl_sid and xfrm_sid are not equal to SECSID_NULL would be if the | ||
| 2102 | * security server was initialized and ss_initialized was true */ | ||
| 2103 | if (!selinux_mls_enabled) { | ||
| 2104 | *peer_sid = SECSID_NULL; | ||
| 2105 | return 0; | ||
| 2106 | } | ||
| 2107 | |||
| 2108 | POLICY_RDLOCK; | ||
| 2109 | |||
| 2110 | nlbl_ctx = sidtab_search(&sidtab, nlbl_sid); | ||
| 2111 | if (!nlbl_ctx) { | ||
| 2112 | printk(KERN_ERR | ||
| 2113 | "security_sid_mls_cmp: unrecognized SID %d\n", | ||
| 2114 | nlbl_sid); | ||
| 2115 | rc = -EINVAL; | ||
| 2116 | goto out_slowpath; | ||
| 2117 | } | ||
| 2118 | xfrm_ctx = sidtab_search(&sidtab, xfrm_sid); | ||
| 2119 | if (!xfrm_ctx) { | ||
| 2120 | printk(KERN_ERR | ||
| 2121 | "security_sid_mls_cmp: unrecognized SID %d\n", | ||
| 2122 | xfrm_sid); | ||
| 2123 | rc = -EINVAL; | ||
| 2124 | goto out_slowpath; | ||
| 2125 | } | ||
| 2126 | rc = (mls_context_cmp(nlbl_ctx, xfrm_ctx) ? 0 : -EACCES); | ||
| 2127 | |||
| 2128 | out_slowpath: | ||
| 2129 | POLICY_RDUNLOCK; | ||
| 2130 | if (rc == 0) | ||
| 2131 | /* at present NetLabel SIDs/labels really only carry MLS | ||
| 2132 | * information so if the MLS portion of the NetLabel SID | ||
| 2133 | * matches the MLS portion of the labeled XFRM SID/label | ||
| 2134 | * then pass along the XFRM SID as it is the most | ||
| 2135 | * expressive */ | ||
| 2136 | *peer_sid = xfrm_sid; | ||
| 2137 | else | ||
| 2138 | *peer_sid = SECSID_NULL; | ||
| 2139 | return rc; | ||
| 2140 | } | ||
| 2141 | |||
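A minimal sketch of how an ingress path might consume the resolver above; every example_* helper is an invented placeholder for whatever derives the NetLabel and XFRM SIDs from the packet, not the actual hooks.c call chain.

static int example_skb_peer_sid(struct sk_buff *skb, u16 family, u32 *sid)
{
	u32 nlbl_sid = example_netlbl_sid(skb, family);	/* assumed helper */
	u32 nlbl_type = example_netlbl_type(skb);	/* assumed helper */
	u32 xfrm_sid = example_xfrm_sid(skb);		/* assumed helper */
	int rc;

	rc = security_net_peersid_resolve(nlbl_sid, nlbl_type, xfrm_sid, sid);
	if (rc)
		/* inconsistent labels: *sid is already SECSID_NULL, so the
		 * caller can audit and drop the packet */
		return rc;
	return 0;
}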
| 2052 | static int get_classes_callback(void *k, void *d, void *args) | 2142 | static int get_classes_callback(void *k, void *d, void *args) |
| 2053 | { | 2143 | { |
| 2054 | struct class_datum *datum = d; | 2144 | struct class_datum *datum = d; |
| @@ -2154,6 +2244,60 @@ int security_get_allow_unknown(void) | |||
| 2154 | return policydb.allow_unknown; | 2244 | return policydb.allow_unknown; |
| 2155 | } | 2245 | } |
| 2156 | 2246 | ||
| 2247 | /** | ||
| 2248 | * security_get_policycaps - Query the loaded policy for its capabilities | ||
| 2249 | * @len: the number of capability bits | ||
| 2250 | * @values: the capability bit array | ||
| 2251 | * | ||
| 2252 | * Description: | ||
| 2253 | * Get an array of the policy capabilities in @values where each entry in | ||
| 2254 | * @values is either true (1) or false (0) depending on the policy's support of | ||
| 2255 | * that feature. The policy capabilities are defined by the | ||
| 2256 | * POLICYDB_CAPABILITY_* enums. The size of the array is stored in @len and it | ||
| 2257 | * is up to the caller to free the array in @values. Returns zero on success, | ||
| 2258 | * negative values on failure. | ||
| 2259 | * | ||
| 2260 | */ | ||
| 2261 | int security_get_policycaps(int *len, int **values) | ||
| 2262 | { | ||
| 2263 | int rc = -ENOMEM; | ||
| 2264 | unsigned int iter; | ||
| 2265 | |||
| 2266 | POLICY_RDLOCK; | ||
| 2267 | |||
| 2268 | *values = kcalloc(POLICYDB_CAPABILITY_MAX, sizeof(int), GFP_ATOMIC); | ||
| 2269 | if (*values == NULL) | ||
| 2270 | goto out; | ||
| 2271 | for (iter = 0; iter < POLICYDB_CAPABILITY_MAX; iter++) | ||
| 2272 | (*values)[iter] = ebitmap_get_bit(&policydb.policycaps, iter); | ||
| 2273 | *len = POLICYDB_CAPABILITY_MAX; | ||
| 2274 | rc = 0; | ||
| 2274 | |||
| 2275 | out: | ||
| 2276 | POLICY_RDUNLOCK; | ||
| 2277 | return rc; | ||
| 2278 | } | ||
| 2279 | |||
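A sketch of the calling convention documented above: the caller owns the returned array and must kfree() it (the buffer comes from kcalloc()).

static void example_dump_policycaps(void)
{
	int len, i;
	int *caps;

	if (security_get_policycaps(&len, &caps) != 0)
		return;
	for (i = 0; i < len; i++)
		printk(KERN_INFO "policycap %d = %d\n", i, caps[i]);
	kfree(caps);
}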
| 2280 | /** | ||
| 2281 | * security_policycap_supported - Check for a specific policy capability | ||
| 2282 | * @req_cap: capability | ||
| 2283 | * | ||
| 2284 | * Description: | ||
| 2285 | * This function queries the currently loaded policy to see if it supports the | ||
| 2286 | * capability specified by @req_cap. Returns true (1) if the capability is | ||
| 2287 | * supported, false (0) if it isn't supported. | ||
| 2288 | * | ||
| 2289 | */ | ||
| 2290 | int security_policycap_supported(unsigned int req_cap) | ||
| 2291 | { | ||
| 2292 | int rc; | ||
| 2293 | |||
| 2294 | POLICY_RDLOCK; | ||
| 2295 | rc = ebitmap_get_bit(&policydb.policycaps, req_cap); | ||
| 2296 | POLICY_RDUNLOCK; | ||
| 2297 | |||
| 2298 | return rc; | ||
| 2299 | } | ||
| 2300 | |||
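Illustrative use of the query above: callers that cannot rely on the cached selinux_policycap_netpeer flag can test the bit directly; the fallback branch is an assumption about how a consumer might degrade gracefully.

static int example_peer_labeling_active(void)
{
	if (security_policycap_supported(POLICYDB_CAPABILITY_NETPEER))
		return 1;	/* policy opted in to unified peer labeling */
	return 0;		/* legacy per-interface/per-message SIDs */
}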
| 2157 | struct selinux_audit_rule { | 2301 | struct selinux_audit_rule { |
| 2158 | u32 au_seqno; | 2302 | u32 au_seqno; |
| 2159 | struct context au_ctxt; | 2303 | struct context au_ctxt; |
| @@ -2403,50 +2547,10 @@ void selinux_audit_set_callback(int (*callback)(void)) | |||
| 2403 | } | 2547 | } |
| 2404 | 2548 | ||
| 2405 | #ifdef CONFIG_NETLABEL | 2549 | #ifdef CONFIG_NETLABEL |
| 2406 | /* | ||
| 2407 | * NetLabel cache structure | ||
| 2408 | */ | ||
| 2409 | #define NETLBL_CACHE(x) ((struct selinux_netlbl_cache *)(x)) | ||
| 2410 | #define NETLBL_CACHE_T_NONE 0 | ||
| 2411 | #define NETLBL_CACHE_T_SID 1 | ||
| 2412 | #define NETLBL_CACHE_T_MLS 2 | ||
| 2413 | struct selinux_netlbl_cache { | ||
| 2414 | u32 type; | ||
| 2415 | union { | ||
| 2416 | u32 sid; | ||
| 2417 | struct mls_range mls_label; | ||
| 2418 | } data; | ||
| 2419 | }; | ||
| 2420 | |||
| 2421 | /** | ||
| 2422 | * security_netlbl_cache_free - Free the NetLabel cached data | ||
| 2423 | * @data: the data to free | ||
| 2424 | * | ||
| 2425 | * Description: | ||
| 2426 | * This function is intended to be used as the free() callback inside the | ||
| 2427 | * netlbl_lsm_cache structure. | ||
| 2428 | * | ||
| 2429 | */ | ||
| 2430 | static void security_netlbl_cache_free(const void *data) | ||
| 2431 | { | ||
| 2432 | struct selinux_netlbl_cache *cache; | ||
| 2433 | |||
| 2434 | if (data == NULL) | ||
| 2435 | return; | ||
| 2436 | |||
| 2437 | cache = NETLBL_CACHE(data); | ||
| 2438 | switch (cache->type) { | ||
| 2439 | case NETLBL_CACHE_T_MLS: | ||
| 2440 | ebitmap_destroy(&cache->data.mls_label.level[0].cat); | ||
| 2441 | break; | ||
| 2442 | } | ||
| 2443 | kfree(data); | ||
| 2444 | } | ||
| 2445 | |||
| 2446 | /** | 2550 | /** |
| 2447 | * security_netlbl_cache_add - Add an entry to the NetLabel cache | 2551 | * security_netlbl_cache_add - Add an entry to the NetLabel cache |
| 2448 | * @secattr: the NetLabel packet security attributes | 2552 | * @secattr: the NetLabel packet security attributes |
| 2449 | * @ctx: the SELinux context | 2553 | * @sid: the SELinux SID |
| 2450 | * | 2554 | * |
| 2451 | * Description: | 2555 | * Description: |
| 2452 | * Attempt to cache the context in @ctx, which was derived from the packet in | 2556 | * Attempt to cache the SID in @sid, which was derived from the packet in |
| @@ -2455,60 +2559,46 @@ static void security_netlbl_cache_free(const void *data) | |||
| 2455 | * | 2559 | * |
| 2456 | */ | 2560 | */ |
| 2457 | static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr, | 2561 | static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr, |
| 2458 | struct context *ctx) | 2562 | u32 sid) |
| 2459 | { | 2563 | { |
| 2460 | struct selinux_netlbl_cache *cache = NULL; | 2564 | u32 *sid_cache; |
| 2461 | 2565 | ||
| 2462 | secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); | 2566 | sid_cache = kmalloc(sizeof(*sid_cache), GFP_ATOMIC); |
| 2463 | if (secattr->cache == NULL) | 2567 | if (sid_cache == NULL) |
| 2464 | return; | ||
| 2465 | |||
| 2466 | cache = kzalloc(sizeof(*cache), GFP_ATOMIC); | ||
| 2467 | if (cache == NULL) | ||
| 2468 | return; | 2568 | return; |
| 2469 | 2569 | secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); | |
| 2470 | cache->type = NETLBL_CACHE_T_MLS; | 2570 | if (secattr->cache == NULL) { |
| 2471 | if (ebitmap_cpy(&cache->data.mls_label.level[0].cat, | 2571 | kfree(sid_cache); |
| 2472 | &ctx->range.level[0].cat) != 0) { | ||
| 2473 | kfree(cache); | ||
| 2474 | return; | 2572 | return; |
| 2475 | } | 2573 | } |
| 2476 | cache->data.mls_label.level[1].cat.highbit = | ||
| 2477 | cache->data.mls_label.level[0].cat.highbit; | ||
| 2478 | cache->data.mls_label.level[1].cat.node = | ||
| 2479 | cache->data.mls_label.level[0].cat.node; | ||
| 2480 | cache->data.mls_label.level[0].sens = ctx->range.level[0].sens; | ||
| 2481 | cache->data.mls_label.level[1].sens = ctx->range.level[0].sens; | ||
| 2482 | 2574 | ||
| 2483 | secattr->cache->free = security_netlbl_cache_free; | 2575 | *sid_cache = sid; |
| 2484 | secattr->cache->data = (void *)cache; | 2576 | secattr->cache->free = kfree; |
| 2577 | secattr->cache->data = sid_cache; | ||
| 2485 | secattr->flags |= NETLBL_SECATTR_CACHE; | 2578 | secattr->flags |= NETLBL_SECATTR_CACHE; |
| 2486 | } | 2579 | } |
| 2487 | 2580 | ||
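With the cache reduced to a bare SID, a cache hit collapses to a single dereference (mirrored by the secattr-to-SID hunk below) and teardown is the plain kfree() installed as the ->free callback; a sketch, assuming the caller has already satisfied whatever locking NetLabel requires:

static u32 example_cache_lookup(const struct netlbl_lsm_secattr *secattr)
{
	if (secattr->flags & NETLBL_SECATTR_CACHE)
		return *(u32 *)secattr->cache->data;
	return SECSID_NULL;
}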
| 2488 | /** | 2581 | /** |
| 2489 | * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID | 2582 | * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID |
| 2490 | * @secattr: the NetLabel packet security attributes | 2583 | * @secattr: the NetLabel packet security attributes |
| 2491 | * @base_sid: the SELinux SID to use as a context for MLS only attributes | ||
| 2492 | * @sid: the SELinux SID | 2584 | * @sid: the SELinux SID |
| 2493 | * | 2585 | * |
| 2494 | * Description: | 2586 | * Description: |
| 2495 | * Convert the given NetLabel security attributes in @secattr into a | 2587 | * Convert the given NetLabel security attributes in @secattr into a |
| 2496 | * SELinux SID. If the @secattr field does not contain a full SELinux | 2588 | * SELinux SID. If the @secattr field does not contain a full SELinux |
| 2497 | * SID/context then use the context in @base_sid as the foundation. If | 2589 | * SID/context then use SECINITSID_NETMSG as the foundation. If possible the |
| 2498 | * possibile the 'cache' field of @secattr is set and the CACHE flag is set; | 2590 | * 'cache' field of @secattr is set and the CACHE flag is set; this is to |
| 2499 | * this is to allow the @secattr to be used by NetLabel to cache the secattr to | 2591 | * allow the @secattr to be used by NetLabel to cache the secattr to SID |
| 2500 | * SID conversion for future lookups. Returns zero on success, negative | 2592 | * conversion for future lookups. Returns zero on success, negative values on |
| 2501 | * values on failure. | 2593 | * failure. |
| 2502 | * | 2594 | * |
| 2503 | */ | 2595 | */ |
| 2504 | int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, | 2596 | int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, |
| 2505 | u32 base_sid, | ||
| 2506 | u32 *sid) | 2597 | u32 *sid) |
| 2507 | { | 2598 | { |
| 2508 | int rc = -EIDRM; | 2599 | int rc = -EIDRM; |
| 2509 | struct context *ctx; | 2600 | struct context *ctx; |
| 2510 | struct context ctx_new; | 2601 | struct context ctx_new; |
| 2511 | struct selinux_netlbl_cache *cache; | ||
| 2512 | 2602 | ||
| 2513 | if (!ss_initialized) { | 2603 | if (!ss_initialized) { |
| 2514 | *sid = SECSID_NULL; | 2604 | *sid = SECSID_NULL; |
| @@ -2518,40 +2608,13 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, | |||
| 2518 | POLICY_RDLOCK; | 2608 | POLICY_RDLOCK; |
| 2519 | 2609 | ||
| 2520 | if (secattr->flags & NETLBL_SECATTR_CACHE) { | 2610 | if (secattr->flags & NETLBL_SECATTR_CACHE) { |
| 2521 | cache = NETLBL_CACHE(secattr->cache->data); | 2611 | *sid = *(u32 *)secattr->cache->data; |
| 2522 | switch (cache->type) { | 2612 | rc = 0; |
| 2523 | case NETLBL_CACHE_T_SID: | 2613 | } else if (secattr->flags & NETLBL_SECATTR_SECID) { |
| 2524 | *sid = cache->data.sid; | 2614 | *sid = secattr->attr.secid; |
| 2525 | rc = 0; | 2615 | rc = 0; |
| 2526 | break; | ||
| 2527 | case NETLBL_CACHE_T_MLS: | ||
| 2528 | ctx = sidtab_search(&sidtab, base_sid); | ||
| 2529 | if (ctx == NULL) | ||
| 2530 | goto netlbl_secattr_to_sid_return; | ||
| 2531 | |||
| 2532 | ctx_new.user = ctx->user; | ||
| 2533 | ctx_new.role = ctx->role; | ||
| 2534 | ctx_new.type = ctx->type; | ||
| 2535 | ctx_new.range.level[0].sens = | ||
| 2536 | cache->data.mls_label.level[0].sens; | ||
| 2537 | ctx_new.range.level[0].cat.highbit = | ||
| 2538 | cache->data.mls_label.level[0].cat.highbit; | ||
| 2539 | ctx_new.range.level[0].cat.node = | ||
| 2540 | cache->data.mls_label.level[0].cat.node; | ||
| 2541 | ctx_new.range.level[1].sens = | ||
| 2542 | cache->data.mls_label.level[1].sens; | ||
| 2543 | ctx_new.range.level[1].cat.highbit = | ||
| 2544 | cache->data.mls_label.level[1].cat.highbit; | ||
| 2545 | ctx_new.range.level[1].cat.node = | ||
| 2546 | cache->data.mls_label.level[1].cat.node; | ||
| 2547 | |||
| 2548 | rc = sidtab_context_to_sid(&sidtab, &ctx_new, sid); | ||
| 2549 | break; | ||
| 2550 | default: | ||
| 2551 | goto netlbl_secattr_to_sid_return; | ||
| 2552 | } | ||
| 2553 | } else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) { | 2616 | } else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) { |
| 2554 | ctx = sidtab_search(&sidtab, base_sid); | 2617 | ctx = sidtab_search(&sidtab, SECINITSID_NETMSG); |
| 2555 | if (ctx == NULL) | 2618 | if (ctx == NULL) |
| 2556 | goto netlbl_secattr_to_sid_return; | 2619 | goto netlbl_secattr_to_sid_return; |
| 2557 | 2620 | ||
| @@ -2561,7 +2624,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, | |||
| 2561 | mls_import_netlbl_lvl(&ctx_new, secattr); | 2624 | mls_import_netlbl_lvl(&ctx_new, secattr); |
| 2562 | if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { | 2625 | if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { |
| 2563 | if (ebitmap_netlbl_import(&ctx_new.range.level[0].cat, | 2626 | if (ebitmap_netlbl_import(&ctx_new.range.level[0].cat, |
| 2564 | secattr->mls_cat) != 0) | 2627 | secattr->attr.mls.cat) != 0) |
| 2565 | goto netlbl_secattr_to_sid_return; | 2628 | goto netlbl_secattr_to_sid_return; |
| 2566 | ctx_new.range.level[1].cat.highbit = | 2629 | ctx_new.range.level[1].cat.highbit = |
| 2567 | ctx_new.range.level[0].cat.highbit; | 2630 | ctx_new.range.level[0].cat.highbit; |
| @@ -2578,7 +2641,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, | |||
| 2578 | if (rc != 0) | 2641 | if (rc != 0) |
| 2579 | goto netlbl_secattr_to_sid_return_cleanup; | 2642 | goto netlbl_secattr_to_sid_return_cleanup; |
| 2580 | 2643 | ||
| 2581 | security_netlbl_cache_add(secattr, &ctx_new); | 2644 | security_netlbl_cache_add(secattr, *sid); |
| 2582 | 2645 | ||
| 2583 | ebitmap_destroy(&ctx_new.range.level[0].cat); | 2646 | ebitmap_destroy(&ctx_new.range.level[0].cat); |
| 2584 | } else { | 2647 | } else { |
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index e07603969033..7e158205d081 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c | |||
| @@ -46,11 +46,14 @@ | |||
| 46 | #include <net/checksum.h> | 46 | #include <net/checksum.h> |
| 47 | #include <net/udp.h> | 47 | #include <net/udp.h> |
| 48 | #include <asm/semaphore.h> | 48 | #include <asm/semaphore.h> |
| 49 | #include <asm/atomic.h> | ||
| 49 | 50 | ||
| 50 | #include "avc.h" | 51 | #include "avc.h" |
| 51 | #include "objsec.h" | 52 | #include "objsec.h" |
| 52 | #include "xfrm.h" | 53 | #include "xfrm.h" |
| 53 | 54 | ||
| 55 | /* Labeled XFRM instance counter */ | ||
| 56 | atomic_t selinux_xfrm_refcount = ATOMIC_INIT(0); | ||
| 54 | 57 | ||
| 55 | /* | 58 | /* |
| 56 | * Returns true if an LSM/SELinux context | 59 | * Returns true if an LSM/SELinux context |
| @@ -293,6 +296,9 @@ int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, | |||
| 293 | BUG_ON(!uctx); | 296 | BUG_ON(!uctx); |
| 294 | 297 | ||
| 295 | err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, 0); | 298 | err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, 0); |
| 299 | if (err == 0) | ||
| 300 | atomic_inc(&selinux_xfrm_refcount); | ||
| 301 | |||
| 296 | return err; | 302 | return err; |
| 297 | } | 303 | } |
| 298 | 304 | ||
| @@ -340,10 +346,13 @@ int selinux_xfrm_policy_delete(struct xfrm_policy *xp) | |||
| 340 | struct xfrm_sec_ctx *ctx = xp->security; | 346 | struct xfrm_sec_ctx *ctx = xp->security; |
| 341 | int rc = 0; | 347 | int rc = 0; |
| 342 | 348 | ||
| 343 | if (ctx) | 349 | if (ctx) { |
| 344 | rc = avc_has_perm(tsec->sid, ctx->ctx_sid, | 350 | rc = avc_has_perm(tsec->sid, ctx->ctx_sid, |
| 345 | SECCLASS_ASSOCIATION, | 351 | SECCLASS_ASSOCIATION, |
| 346 | ASSOCIATION__SETCONTEXT, NULL); | 352 | ASSOCIATION__SETCONTEXT, NULL); |
| 353 | if (rc == 0) | ||
| 354 | atomic_dec(&selinux_xfrm_refcount); | ||
| 355 | } | ||
| 347 | 356 | ||
| 348 | return rc; | 357 | return rc; |
| 349 | } | 358 | } |
| @@ -360,6 +369,8 @@ int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uct | |||
| 360 | BUG_ON(!x); | 369 | BUG_ON(!x); |
| 361 | 370 | ||
| 362 | err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, secid); | 371 | err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, secid); |
| 372 | if (err == 0) | ||
| 373 | atomic_inc(&selinux_xfrm_refcount); | ||
| 363 | return err; | 374 | return err; |
| 364 | } | 375 | } |
| 365 | 376 | ||
| @@ -382,10 +393,13 @@ int selinux_xfrm_state_delete(struct xfrm_state *x) | |||
| 382 | struct xfrm_sec_ctx *ctx = x->security; | 393 | struct xfrm_sec_ctx *ctx = x->security; |
| 383 | int rc = 0; | 394 | int rc = 0; |
| 384 | 395 | ||
| 385 | if (ctx) | 396 | if (ctx) { |
| 386 | rc = avc_has_perm(tsec->sid, ctx->ctx_sid, | 397 | rc = avc_has_perm(tsec->sid, ctx->ctx_sid, |
| 387 | SECCLASS_ASSOCIATION, | 398 | SECCLASS_ASSOCIATION, |
| 388 | ASSOCIATION__SETCONTEXT, NULL); | 399 | ASSOCIATION__SETCONTEXT, NULL); |
| 400 | if (rc == 0) | ||
| 401 | atomic_dec(&selinux_xfrm_refcount); | ||
| 402 | } | ||
| 389 | 403 | ||
| 390 | return rc; | 404 | return rc; |
| 391 | } | 405 | } |
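One plausible consumer of the new counter (a sketch, not part of this hunk) is a fast-path guard that lets per-packet code skip labeled-IPsec processing entirely while no labeled state or policy exists:

static inline int selinux_xfrm_enabled(void)
{
	return (atomic_read(&selinux_xfrm_refcount) > 0);
}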
diff --git a/drivers/kvm/ioapic.c b/virt/kvm/ioapic.c index c7992e667fdb..317f8e211cd2 100644 --- a/drivers/kvm/ioapic.c +++ b/virt/kvm/ioapic.c | |||
| @@ -26,7 +26,7 @@ | |||
| 26 | * Based on Xen 3.1 code. | 26 | * Based on Xen 3.1 code. |
| 27 | */ | 27 | */ |
| 28 | 28 | ||
| 29 | #include "kvm.h" | 29 | #include <linux/kvm_host.h> |
| 30 | #include <linux/kvm.h> | 30 | #include <linux/kvm.h> |
| 31 | #include <linux/mm.h> | 31 | #include <linux/mm.h> |
| 32 | #include <linux/highmem.h> | 32 | #include <linux/highmem.h> |
| @@ -34,14 +34,17 @@ | |||
| 34 | #include <linux/hrtimer.h> | 34 | #include <linux/hrtimer.h> |
| 35 | #include <linux/io.h> | 35 | #include <linux/io.h> |
| 36 | #include <asm/processor.h> | 36 | #include <asm/processor.h> |
| 37 | #include <asm/msr.h> | ||
| 38 | #include <asm/page.h> | 37 | #include <asm/page.h> |
| 39 | #include <asm/current.h> | 38 | #include <asm/current.h> |
| 40 | #include <asm/apicdef.h> | 39 | |
| 41 | #include <asm/io_apic.h> | 40 | #include "ioapic.h" |
| 42 | #include "irq.h" | 41 | #include "lapic.h" |
| 43 | /* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ | 42 | |
| 43 | #if 0 | ||
| 44 | #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) | ||
| 45 | #else | ||
| 44 | #define ioapic_debug(fmt, arg...) | 46 | #define ioapic_debug(fmt, arg...) |
| 47 | #endif | ||
| 45 | static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); | 48 | static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); |
| 46 | 49 | ||
| 47 | static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, | 50 | static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, |
| @@ -113,7 +116,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) | |||
| 113 | default: | 116 | default: |
| 114 | index = (ioapic->ioregsel - 0x10) >> 1; | 117 | index = (ioapic->ioregsel - 0x10) >> 1; |
| 115 | 118 | ||
| 116 | ioapic_debug("change redir index %x val %x", index, val); | 119 | ioapic_debug("change redir index %x val %x\n", index, val); |
| 117 | if (index >= IOAPIC_NUM_PINS) | 120 | if (index >= IOAPIC_NUM_PINS) |
| 118 | return; | 121 | return; |
| 119 | if (ioapic->ioregsel & 1) { | 122 | if (ioapic->ioregsel & 1) { |
| @@ -131,16 +134,16 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) | |||
| 131 | } | 134 | } |
| 132 | 135 | ||
| 133 | static void ioapic_inj_irq(struct kvm_ioapic *ioapic, | 136 | static void ioapic_inj_irq(struct kvm_ioapic *ioapic, |
| 134 | struct kvm_lapic *target, | 137 | struct kvm_vcpu *vcpu, |
| 135 | u8 vector, u8 trig_mode, u8 delivery_mode) | 138 | u8 vector, u8 trig_mode, u8 delivery_mode) |
| 136 | { | 139 | { |
| 137 | ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode, | 140 | ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode, |
| 138 | delivery_mode); | 141 | delivery_mode); |
| 139 | 142 | ||
| 140 | ASSERT((delivery_mode == dest_Fixed) || | 143 | ASSERT((delivery_mode == IOAPIC_FIXED) || |
| 141 | (delivery_mode == dest_LowestPrio)); | 144 | (delivery_mode == IOAPIC_LOWEST_PRIORITY)); |
| 142 | 145 | ||
| 143 | kvm_apic_set_irq(target, vector, trig_mode); | 146 | kvm_apic_set_irq(vcpu, vector, trig_mode); |
| 144 | } | 147 | } |
| 145 | 148 | ||
| 146 | static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, | 149 | static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, |
| @@ -151,12 +154,12 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, | |||
| 151 | struct kvm *kvm = ioapic->kvm; | 154 | struct kvm *kvm = ioapic->kvm; |
| 152 | struct kvm_vcpu *vcpu; | 155 | struct kvm_vcpu *vcpu; |
| 153 | 156 | ||
| 154 | ioapic_debug("dest %d dest_mode %d", dest, dest_mode); | 157 | ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode); |
| 155 | 158 | ||
| 156 | if (dest_mode == 0) { /* Physical mode. */ | 159 | if (dest_mode == 0) { /* Physical mode. */ |
| 157 | if (dest == 0xFF) { /* Broadcast. */ | 160 | if (dest == 0xFF) { /* Broadcast. */ |
| 158 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | 161 | for (i = 0; i < KVM_MAX_VCPUS; ++i) |
| 159 | if (kvm->vcpus[i] && kvm->vcpus[i]->apic) | 162 | if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic) |
| 160 | mask |= 1 << i; | 163 | mask |= 1 << i; |
| 161 | return mask; | 164 | return mask; |
| 162 | } | 165 | } |
| @@ -164,8 +167,8 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, | |||
| 164 | vcpu = kvm->vcpus[i]; | 167 | vcpu = kvm->vcpus[i]; |
| 165 | if (!vcpu) | 168 | if (!vcpu) |
| 166 | continue; | 169 | continue; |
| 167 | if (kvm_apic_match_physical_addr(vcpu->apic, dest)) { | 170 | if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) { |
| 168 | if (vcpu->apic) | 171 | if (vcpu->arch.apic) |
| 169 | mask = 1 << i; | 172 | mask = 1 << i; |
| 170 | break; | 173 | break; |
| 171 | } | 174 | } |
| @@ -175,11 +178,11 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, | |||
| 175 | vcpu = kvm->vcpus[i]; | 178 | vcpu = kvm->vcpus[i]; |
| 176 | if (!vcpu) | 179 | if (!vcpu) |
| 177 | continue; | 180 | continue; |
| 178 | if (vcpu->apic && | 181 | if (vcpu->arch.apic && |
| 179 | kvm_apic_match_logical_addr(vcpu->apic, dest)) | 182 | kvm_apic_match_logical_addr(vcpu->arch.apic, dest)) |
| 180 | mask |= 1 << vcpu->vcpu_id; | 183 | mask |= 1 << vcpu->vcpu_id; |
| 181 | } | 184 | } |
| 182 | ioapic_debug("mask %x", mask); | 185 | ioapic_debug("mask %x\n", mask); |
| 183 | return mask; | 186 | return mask; |
| 184 | } | 187 | } |
| 185 | 188 | ||
| @@ -191,41 +194,39 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) | |||
| 191 | u8 vector = ioapic->redirtbl[irq].fields.vector; | 194 | u8 vector = ioapic->redirtbl[irq].fields.vector; |
| 192 | u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; | 195 | u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; |
| 193 | u32 deliver_bitmask; | 196 | u32 deliver_bitmask; |
| 194 | struct kvm_lapic *target; | ||
| 195 | struct kvm_vcpu *vcpu; | 197 | struct kvm_vcpu *vcpu; |
| 196 | int vcpu_id; | 198 | int vcpu_id; |
| 197 | 199 | ||
| 198 | ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " | 200 | ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " |
| 199 | "vector=%x trig_mode=%x", | 201 | "vector=%x trig_mode=%x\n", |
| 200 | dest, dest_mode, delivery_mode, vector, trig_mode); | 202 | dest, dest_mode, delivery_mode, vector, trig_mode); |
| 201 | 203 | ||
| 202 | deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); | 204 | deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); |
| 203 | if (!deliver_bitmask) { | 205 | if (!deliver_bitmask) { |
| 204 | ioapic_debug("no target on destination"); | 206 | ioapic_debug("no target on destination\n"); |
| 205 | return; | 207 | return; |
| 206 | } | 208 | } |
| 207 | 209 | ||
| 208 | switch (delivery_mode) { | 210 | switch (delivery_mode) { |
| 209 | case dest_LowestPrio: | 211 | case IOAPIC_LOWEST_PRIORITY: |
| 210 | target = | 212 | vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, |
| 211 | kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask); | 213 | deliver_bitmask); |
| 212 | if (target != NULL) | 214 | if (vcpu != NULL) |
| 213 | ioapic_inj_irq(ioapic, target, vector, | 215 | ioapic_inj_irq(ioapic, vcpu, vector, |
| 214 | trig_mode, delivery_mode); | 216 | trig_mode, delivery_mode); |
| 215 | else | 217 | else |
| 216 | ioapic_debug("null round robin: " | 218 | ioapic_debug("null lowest prio vcpu: " |
| 217 | "mask=%x vector=%x delivery_mode=%x", | 219 | "mask=%x vector=%x delivery_mode=%x\n", |
| 218 | deliver_bitmask, vector, dest_LowestPrio); | 220 | deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY); |
| 219 | break; | 221 | break; |
| 220 | case dest_Fixed: | 222 | case IOAPIC_FIXED: |
| 221 | for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { | 223 | for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { |
| 222 | if (!(deliver_bitmask & (1 << vcpu_id))) | 224 | if (!(deliver_bitmask & (1 << vcpu_id))) |
| 223 | continue; | 225 | continue; |
| 224 | deliver_bitmask &= ~(1 << vcpu_id); | 226 | deliver_bitmask &= ~(1 << vcpu_id); |
| 225 | vcpu = ioapic->kvm->vcpus[vcpu_id]; | 227 | vcpu = ioapic->kvm->vcpus[vcpu_id]; |
| 226 | if (vcpu) { | 228 | if (vcpu) { |
| 227 | target = vcpu->apic; | 229 | ioapic_inj_irq(ioapic, vcpu, vector, |
| 228 | ioapic_inj_irq(ioapic, target, vector, | ||
| 229 | trig_mode, delivery_mode); | 230 | trig_mode, delivery_mode); |
| 230 | } | 231 | } |
| 231 | } | 232 | } |
| @@ -271,7 +272,7 @@ static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) | |||
| 271 | 272 | ||
| 272 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) | 273 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) |
| 273 | { | 274 | { |
| 274 | struct kvm_ioapic *ioapic = kvm->vioapic; | 275 | struct kvm_ioapic *ioapic = kvm->arch.vioapic; |
| 275 | union ioapic_redir_entry *ent; | 276 | union ioapic_redir_entry *ent; |
| 276 | int gsi; | 277 | int gsi; |
| 277 | 278 | ||
| @@ -304,7 +305,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, | |||
| 304 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | 305 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; |
| 305 | u32 result; | 306 | u32 result; |
| 306 | 307 | ||
| 307 | ioapic_debug("addr %lx", (unsigned long)addr); | 308 | ioapic_debug("addr %lx\n", (unsigned long)addr); |
| 308 | ASSERT(!(addr & 0xf)); /* check alignment */ | 309 | ASSERT(!(addr & 0xf)); /* check alignment */ |
| 309 | 310 | ||
| 310 | addr &= 0xff; | 311 | addr &= 0xff; |
| @@ -341,8 +342,8 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, | |||
| 341 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | 342 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; |
| 342 | u32 data; | 343 | u32 data; |
| 343 | 344 | ||
| 344 | ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n", | 345 | ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", |
| 345 | addr, len, val); | 346 | (void *)addr, len, val); |
| 346 | ASSERT(!(addr & 0xf)); /* check alignment */ | 347 | ASSERT(!(addr & 0xf)); /* check alignment */ |
| 347 | if (len == 4 || len == 8) | 348 | if (len == 4 || len == 8) |
| 348 | data = *(u32 *) val; | 349 | data = *(u32 *) val; |
| @@ -360,24 +361,38 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, | |||
| 360 | case IOAPIC_REG_WINDOW: | 361 | case IOAPIC_REG_WINDOW: |
| 361 | ioapic_write_indirect(ioapic, data); | 362 | ioapic_write_indirect(ioapic, data); |
| 362 | break; | 363 | break; |
| 364 | #ifdef CONFIG_IA64 | ||
| 365 | case IOAPIC_REG_EOI: | ||
| 366 | kvm_ioapic_update_eoi(ioapic->kvm, data); | ||
| 367 | break; | ||
| 368 | #endif | ||
| 363 | 369 | ||
| 364 | default: | 370 | default: |
| 365 | break; | 371 | break; |
| 366 | } | 372 | } |
| 367 | } | 373 | } |
| 368 | 374 | ||
| 375 | void kvm_ioapic_reset(struct kvm_ioapic *ioapic) | ||
| 376 | { | ||
| 377 | int i; | ||
| 378 | |||
| 379 | for (i = 0; i < IOAPIC_NUM_PINS; i++) | ||
| 380 | ioapic->redirtbl[i].fields.mask = 1; | ||
| 381 | ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; | ||
| 382 | ioapic->ioregsel = 0; | ||
| 383 | ioapic->irr = 0; | ||
| 384 | ioapic->id = 0; | ||
| 385 | } | ||
| 386 | |||
| 369 | int kvm_ioapic_init(struct kvm *kvm) | 387 | int kvm_ioapic_init(struct kvm *kvm) |
| 370 | { | 388 | { |
| 371 | struct kvm_ioapic *ioapic; | 389 | struct kvm_ioapic *ioapic; |
| 372 | int i; | ||
| 373 | 390 | ||
| 374 | ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); | 391 | ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); |
| 375 | if (!ioapic) | 392 | if (!ioapic) |
| 376 | return -ENOMEM; | 393 | return -ENOMEM; |
| 377 | kvm->vioapic = ioapic; | 394 | kvm->arch.vioapic = ioapic; |
| 378 | for (i = 0; i < IOAPIC_NUM_PINS; i++) | 395 | kvm_ioapic_reset(ioapic); |
| 379 | ioapic->redirtbl[i].fields.mask = 1; | ||
| 380 | ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; | ||
| 381 | ioapic->dev.read = ioapic_mmio_read; | 396 | ioapic->dev.read = ioapic_mmio_read; |
| 382 | ioapic->dev.write = ioapic_mmio_write; | 397 | ioapic->dev.write = ioapic_mmio_write; |
| 383 | ioapic->dev.in_range = ioapic_in_range; | 398 | ioapic->dev.in_range = ioapic_in_range; |
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h new file mode 100644 index 000000000000..7f16675fe783 --- /dev/null +++ b/virt/kvm/ioapic.h | |||
| @@ -0,0 +1,95 @@ | |||
| 1 | #ifndef __KVM_IO_APIC_H | ||
| 2 | #define __KVM_IO_APIC_H | ||
| 3 | |||
| 4 | #include <linux/kvm_host.h> | ||
| 5 | |||
| 6 | #include "iodev.h" | ||
| 7 | |||
| 8 | struct kvm; | ||
| 9 | struct kvm_vcpu; | ||
| 10 | |||
| 11 | #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS | ||
| 12 | #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ | ||
| 13 | #define IOAPIC_EDGE_TRIG 0 | ||
| 14 | #define IOAPIC_LEVEL_TRIG 1 | ||
| 15 | |||
| 16 | #define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 | ||
| 17 | #define IOAPIC_MEM_LENGTH 0x100 | ||
| 18 | |||
| 19 | /* Direct registers. */ | ||
| 20 | #define IOAPIC_REG_SELECT 0x00 | ||
| 21 | #define IOAPIC_REG_WINDOW 0x10 | ||
| 22 | #define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ | ||
| 23 | |||
| 24 | /* Indirect registers. */ | ||
| 25 | #define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ | ||
| 26 | #define IOAPIC_REG_VERSION 0x01 | ||
| 27 | #define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ | ||
| 28 | |||
| 29 | /* ioapic delivery mode */ | ||
| 30 | #define IOAPIC_FIXED 0x0 | ||
| 31 | #define IOAPIC_LOWEST_PRIORITY 0x1 | ||
| 32 | #define IOAPIC_PMI 0x2 | ||
| 33 | #define IOAPIC_NMI 0x4 | ||
| 34 | #define IOAPIC_INIT 0x5 | ||
| 35 | #define IOAPIC_EXTINT 0x7 | ||
| 36 | |||
| 37 | struct kvm_ioapic { | ||
| 38 | u64 base_address; | ||
| 39 | u32 ioregsel; | ||
| 40 | u32 id; | ||
| 41 | u32 irr; | ||
| 42 | u32 pad; | ||
| 43 | union ioapic_redir_entry { | ||
| 44 | u64 bits; | ||
| 45 | struct { | ||
| 46 | u8 vector; | ||
| 47 | u8 delivery_mode:3; | ||
| 48 | u8 dest_mode:1; | ||
| 49 | u8 delivery_status:1; | ||
| 50 | u8 polarity:1; | ||
| 51 | u8 remote_irr:1; | ||
| 52 | u8 trig_mode:1; | ||
| 53 | u8 mask:1; | ||
| 54 | u8 reserve:7; | ||
| 55 | u8 reserved[4]; | ||
| 56 | u8 dest_id; | ||
| 57 | } fields; | ||
| 58 | } redirtbl[IOAPIC_NUM_PINS]; | ||
| 59 | struct kvm_io_device dev; | ||
| 60 | struct kvm *kvm; | ||
| 61 | }; | ||
| 62 | |||
| 63 | #ifdef DEBUG | ||
| 64 | #define ASSERT(x) \ | ||
| 65 | do { \ | ||
| 66 | if (!(x)) { \ | ||
| 67 | printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ | ||
| 68 | __FILE__, __LINE__, #x); \ | ||
| 69 | BUG(); \ | ||
| 70 | } \ | ||
| 71 | } while (0) | ||
| 72 | #else | ||
| 73 | #define ASSERT(x) do { } while (0) | ||
| 74 | #endif | ||
| 75 | |||
| 76 | static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) | ||
| 77 | { | ||
| 78 | return kvm->arch.vioapic; | ||
| 79 | } | ||
| 80 | |||
| 81 | #ifdef CONFIG_IA64 | ||
| 82 | static inline int irqchip_in_kernel(struct kvm *kvm) | ||
| 83 | { | ||
| 84 | return 1; | ||
| 85 | } | ||
| 86 | #endif | ||
| 87 | |||
| 88 | struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, | ||
| 89 | unsigned long bitmap); | ||
| 90 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); | ||
| 91 | int kvm_ioapic_init(struct kvm *kvm); | ||
| 92 | void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); | ||
| 93 | void kvm_ioapic_reset(struct kvm_ioapic *ioapic); | ||
| 94 | |||
| 95 | #endif | ||
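A sketch of decoding one redirection table entry through the bitfield view of the union above; the dump function and its format string are invented for illustration.

static void example_dump_redir(const struct kvm_ioapic *ioapic, int pin)
{
	const union ioapic_redir_entry *e = &ioapic->redirtbl[pin];

	printk(KERN_DEBUG "pin %d: vector %u deliv %u mask %u trig %s\n",
	       pin, e->fields.vector, e->fields.delivery_mode,
	       e->fields.mask,
	       e->fields.trig_mode == IOAPIC_LEVEL_TRIG ? "level" : "edge");
}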
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h new file mode 100644 index 000000000000..c14e642027b2 --- /dev/null +++ b/virt/kvm/iodev.h | |||
| @@ -0,0 +1,63 @@ | |||
| 1 | /* | ||
| 2 | * This program is free software; you can redistribute it and/or modify | ||
| 3 | * it under the terms of the GNU General Public License as published by | ||
| 4 | * the Free Software Foundation; either version 2 of the License. | ||
| 5 | * | ||
| 6 | * This program is distributed in the hope that it will be useful, | ||
| 7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 9 | * GNU General Public License for more details. | ||
| 10 | * | ||
| 11 | * You should have received a copy of the GNU General Public License | ||
| 12 | * along with this program; if not, write to the Free Software | ||
| 13 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
| 14 | */ | ||
| 15 | |||
| 16 | #ifndef __KVM_IODEV_H__ | ||
| 17 | #define __KVM_IODEV_H__ | ||
| 18 | |||
| 19 | #include <linux/kvm_types.h> | ||
| 20 | |||
| 21 | struct kvm_io_device { | ||
| 22 | void (*read)(struct kvm_io_device *this, | ||
| 23 | gpa_t addr, | ||
| 24 | int len, | ||
| 25 | void *val); | ||
| 26 | void (*write)(struct kvm_io_device *this, | ||
| 27 | gpa_t addr, | ||
| 28 | int len, | ||
| 29 | const void *val); | ||
| 30 | int (*in_range)(struct kvm_io_device *this, gpa_t addr); | ||
| 31 | void (*destructor)(struct kvm_io_device *this); | ||
| 32 | |||
| 33 | void *private; | ||
| 34 | }; | ||
| 35 | |||
| 36 | static inline void kvm_iodevice_read(struct kvm_io_device *dev, | ||
| 37 | gpa_t addr, | ||
| 38 | int len, | ||
| 39 | void *val) | ||
| 40 | { | ||
| 41 | dev->read(dev, addr, len, val); | ||
| 42 | } | ||
| 43 | |||
| 44 | static inline void kvm_iodevice_write(struct kvm_io_device *dev, | ||
| 45 | gpa_t addr, | ||
| 46 | int len, | ||
| 47 | const void *val) | ||
| 48 | { | ||
| 49 | dev->write(dev, addr, len, val); | ||
| 50 | } | ||
| 51 | |||
| 52 | static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) | ||
| 53 | { | ||
| 54 | return dev->in_range(dev, addr); | ||
| 55 | } | ||
| 56 | |||
| 57 | static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) | ||
| 58 | { | ||
| 59 | if (dev->destructor) | ||
| 60 | dev->destructor(dev); | ||
| 61 | } | ||
| 62 | |||
| 63 | #endif /* __KVM_IODEV_H__ */ | ||
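A sketch of a device built on these callbacks, following the pattern the ioapic uses (stash the container in ->private and recover it in each handler); the example_* names and the 0xfee00800 address are invented.

struct example_dev {
	u32 reg;
	struct kvm_io_device dev;
};

static void example_read(struct kvm_io_device *this, gpa_t addr,
			 int len, void *val)
{
	struct example_dev *d = this->private;

	memcpy(val, &d->reg, len < 4 ? len : 4);
}

static void example_write(struct kvm_io_device *this, gpa_t addr,
			  int len, const void *val)
{
	struct example_dev *d = this->private;

	memcpy(&d->reg, val, len < 4 ? len : 4);
}

static int example_in_range(struct kvm_io_device *this, gpa_t addr)
{
	return addr == 0xfee00800;
}

static void example_dev_init(struct example_dev *d)
{
	d->dev.read = example_read;
	d->dev.write = example_write;
	d->dev.in_range = example_in_range;
	d->dev.private = d;	/* handlers recover the device from here */
}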
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c new file mode 100644 index 000000000000..3c4fe26096fc --- /dev/null +++ b/virt/kvm/kvm_main.c | |||
| @@ -0,0 +1,1400 @@ | |||
| 1 | /* | ||
| 2 | * Kernel-based Virtual Machine driver for Linux | ||
| 3 | * | ||
| 4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
| 5 | * machines without emulation or binary translation. | ||
| 6 | * | ||
| 7 | * Copyright (C) 2006 Qumranet, Inc. | ||
| 8 | * | ||
| 9 | * Authors: | ||
| 10 | * Avi Kivity <avi@qumranet.com> | ||
| 11 | * Yaniv Kamay <yaniv@qumranet.com> | ||
| 12 | * | ||
| 13 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
| 14 | * the COPYING file in the top-level directory. | ||
| 15 | * | ||
| 16 | */ | ||
| 17 | |||
| 18 | #include "iodev.h" | ||
| 19 | |||
| 20 | #include <linux/kvm_host.h> | ||
| 21 | #include <linux/kvm.h> | ||
| 22 | #include <linux/module.h> | ||
| 23 | #include <linux/errno.h> | ||
| 24 | #include <linux/percpu.h> | ||
| 25 | #include <linux/gfp.h> | ||
| 26 | #include <linux/mm.h> | ||
| 27 | #include <linux/miscdevice.h> | ||
| 28 | #include <linux/vmalloc.h> | ||
| 29 | #include <linux/reboot.h> | ||
| 30 | #include <linux/debugfs.h> | ||
| 31 | #include <linux/highmem.h> | ||
| 32 | #include <linux/file.h> | ||
| 33 | #include <linux/sysdev.h> | ||
| 34 | #include <linux/cpu.h> | ||
| 35 | #include <linux/sched.h> | ||
| 36 | #include <linux/cpumask.h> | ||
| 37 | #include <linux/smp.h> | ||
| 38 | #include <linux/anon_inodes.h> | ||
| 39 | #include <linux/profile.h> | ||
| 40 | #include <linux/kvm_para.h> | ||
| 41 | #include <linux/pagemap.h> | ||
| 42 | #include <linux/mman.h> | ||
| 43 | |||
| 44 | #include <asm/processor.h> | ||
| 45 | #include <asm/io.h> | ||
| 46 | #include <asm/uaccess.h> | ||
| 47 | #include <asm/pgtable.h> | ||
| 48 | |||
| 49 | MODULE_AUTHOR("Qumranet"); | ||
| 50 | MODULE_LICENSE("GPL"); | ||
| 51 | |||
| 52 | DEFINE_SPINLOCK(kvm_lock); | ||
| 53 | LIST_HEAD(vm_list); | ||
| 54 | |||
| 55 | static cpumask_t cpus_hardware_enabled; | ||
| 56 | |||
| 57 | struct kmem_cache *kvm_vcpu_cache; | ||
| 58 | EXPORT_SYMBOL_GPL(kvm_vcpu_cache); | ||
| 59 | |||
| 60 | static __read_mostly struct preempt_ops kvm_preempt_ops; | ||
| 61 | |||
| 62 | static struct dentry *debugfs_dir; | ||
| 63 | |||
| 64 | static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, | ||
| 65 | unsigned long arg); | ||
| 66 | |||
| 67 | static inline int valid_vcpu(int n) | ||
| 68 | { | ||
| 69 | return likely(n >= 0 && n < KVM_MAX_VCPUS); | ||
| 70 | } | ||
| 71 | |||
| 72 | /* | ||
| 73 | * Switches to specified vcpu, until a matching vcpu_put() | ||
| 74 | */ | ||
| 75 | void vcpu_load(struct kvm_vcpu *vcpu) | ||
| 76 | { | ||
| 77 | int cpu; | ||
| 78 | |||
| 79 | mutex_lock(&vcpu->mutex); | ||
| 80 | cpu = get_cpu(); | ||
| 81 | preempt_notifier_register(&vcpu->preempt_notifier); | ||
| 82 | kvm_arch_vcpu_load(vcpu, cpu); | ||
| 83 | put_cpu(); | ||
| 84 | } | ||
| 85 | |||
| 86 | void vcpu_put(struct kvm_vcpu *vcpu) | ||
| 87 | { | ||
| 88 | preempt_disable(); | ||
| 89 | kvm_arch_vcpu_put(vcpu); | ||
| 90 | preempt_notifier_unregister(&vcpu->preempt_notifier); | ||
| 91 | preempt_enable(); | ||
| 92 | mutex_unlock(&vcpu->mutex); | ||
| 93 | } | ||
| 94 | |||
| 95 | static void ack_flush(void *_completed) | ||
| 96 | { | ||
| 97 | } | ||
| 98 | |||
| 99 | void kvm_flush_remote_tlbs(struct kvm *kvm) | ||
| 100 | { | ||
| 101 | int i, cpu; | ||
| 102 | cpumask_t cpus; | ||
| 103 | struct kvm_vcpu *vcpu; | ||
| 104 | |||
| 105 | cpus_clear(cpus); | ||
| 106 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
| 107 | vcpu = kvm->vcpus[i]; | ||
| 108 | if (!vcpu) | ||
| 109 | continue; | ||
| 110 | if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) | ||
| 111 | continue; | ||
| 112 | cpu = vcpu->cpu; | ||
| 113 | if (cpu != -1 && cpu != raw_smp_processor_id()) | ||
| 114 | cpu_set(cpu, cpus); | ||
| 115 | } | ||
| 116 | if (cpus_empty(cpus)) | ||
| 117 | return; | ||
| 118 | ++kvm->stat.remote_tlb_flush; | ||
| 119 | smp_call_function_mask(cpus, ack_flush, NULL, 1); | ||
| 120 | } | ||
| 121 | |||
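The consumer side of the request bit (a simplified assumption about the arch vcpu loop, using the x86 naming of this era) checks and clears it before reentering the guest:

if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
	kvm_x86_ops->tlb_flush(vcpu);	/* arch-specific flush hook */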
| 122 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) | ||
| 123 | { | ||
| 124 | struct page *page; | ||
| 125 | int r; | ||
| 126 | |||
| 127 | mutex_init(&vcpu->mutex); | ||
| 128 | vcpu->cpu = -1; | ||
| 129 | vcpu->kvm = kvm; | ||
| 130 | vcpu->vcpu_id = id; | ||
| 131 | init_waitqueue_head(&vcpu->wq); | ||
| 132 | |||
| 133 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
| 134 | if (!page) { | ||
| 135 | r = -ENOMEM; | ||
| 136 | goto fail; | ||
| 137 | } | ||
| 138 | vcpu->run = page_address(page); | ||
| 139 | |||
| 140 | r = kvm_arch_vcpu_init(vcpu); | ||
| 141 | if (r < 0) | ||
| 142 | goto fail_free_run; | ||
| 143 | return 0; | ||
| 144 | |||
| 145 | fail_free_run: | ||
| 146 | free_page((unsigned long)vcpu->run); | ||
| 147 | fail: | ||
| 148 | return r; | ||
| 149 | } | ||
| 150 | EXPORT_SYMBOL_GPL(kvm_vcpu_init); | ||
| 151 | |||
| 152 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) | ||
| 153 | { | ||
| 154 | kvm_arch_vcpu_uninit(vcpu); | ||
| 155 | free_page((unsigned long)vcpu->run); | ||
| 156 | } | ||
| 157 | EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); | ||
| 158 | |||
| 159 | static struct kvm *kvm_create_vm(void) | ||
| 160 | { | ||
| 161 | struct kvm *kvm = kvm_arch_create_vm(); | ||
| 162 | |||
| 163 | if (IS_ERR(kvm)) | ||
| 164 | goto out; | ||
| 165 | |||
| 166 | kvm->mm = current->mm; | ||
| 167 | atomic_inc(&kvm->mm->mm_count); | ||
| 168 | spin_lock_init(&kvm->mmu_lock); | ||
| 169 | kvm_io_bus_init(&kvm->pio_bus); | ||
| 170 | mutex_init(&kvm->lock); | ||
| 171 | kvm_io_bus_init(&kvm->mmio_bus); | ||
| 172 | spin_lock(&kvm_lock); | ||
| 173 | list_add(&kvm->vm_list, &vm_list); | ||
| 174 | spin_unlock(&kvm_lock); | ||
| 175 | out: | ||
| 176 | return kvm; | ||
| 177 | } | ||
| 178 | |||
| 179 | /* | ||
| 180 | * Free any memory in @free but not in @dont. | ||
| 181 | */ | ||
| 182 | static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | ||
| 183 | struct kvm_memory_slot *dont) | ||
| 184 | { | ||
| 185 | if (!dont || free->rmap != dont->rmap) | ||
| 186 | vfree(free->rmap); | ||
| 187 | |||
| 188 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) | ||
| 189 | vfree(free->dirty_bitmap); | ||
| 190 | |||
| 191 | free->npages = 0; | ||
| 192 | free->dirty_bitmap = NULL; | ||
| 193 | free->rmap = NULL; | ||
| 194 | } | ||
| 195 | |||
| 196 | void kvm_free_physmem(struct kvm *kvm) | ||
| 197 | { | ||
| 198 | int i; | ||
| 199 | |||
| 200 | for (i = 0; i < kvm->nmemslots; ++i) | ||
| 201 | kvm_free_physmem_slot(&kvm->memslots[i], NULL); | ||
| 202 | } | ||
| 203 | |||
| 204 | static void kvm_destroy_vm(struct kvm *kvm) | ||
| 205 | { | ||
| 206 | struct mm_struct *mm = kvm->mm; | ||
| 207 | |||
| 208 | spin_lock(&kvm_lock); | ||
| 209 | list_del(&kvm->vm_list); | ||
| 210 | spin_unlock(&kvm_lock); | ||
| 211 | kvm_io_bus_destroy(&kvm->pio_bus); | ||
| 212 | kvm_io_bus_destroy(&kvm->mmio_bus); | ||
| 213 | kvm_arch_destroy_vm(kvm); | ||
| 214 | mmdrop(mm); | ||
| 215 | } | ||
| 216 | |||
| 217 | static int kvm_vm_release(struct inode *inode, struct file *filp) | ||
| 218 | { | ||
| 219 | struct kvm *kvm = filp->private_data; | ||
| 220 | |||
| 221 | kvm_destroy_vm(kvm); | ||
| 222 | return 0; | ||
| 223 | } | ||
| 224 | |||
| 225 | /* | ||
| 226 | * Allocate some memory and give it an address in the guest physical address | ||
| 227 | * space. | ||
| 228 | * | ||
| 229 | * Discontiguous memory is allowed, mostly for framebuffers. | ||
| 230 | * | ||
| 231 | * Must be called holding mmap_sem for write. | ||
| 232 | */ | ||
| 233 | int __kvm_set_memory_region(struct kvm *kvm, | ||
| 234 | struct kvm_userspace_memory_region *mem, | ||
| 235 | int user_alloc) | ||
| 236 | { | ||
| 237 | int r; | ||
| 238 | gfn_t base_gfn; | ||
| 239 | unsigned long npages; | ||
| 240 | unsigned long i; | ||
| 241 | struct kvm_memory_slot *memslot; | ||
| 242 | struct kvm_memory_slot old, new; | ||
| 243 | |||
| 244 | r = -EINVAL; | ||
| 245 | /* General sanity checks */ | ||
| 246 | if (mem->memory_size & (PAGE_SIZE - 1)) | ||
| 247 | goto out; | ||
| 248 | if (mem->guest_phys_addr & (PAGE_SIZE - 1)) | ||
| 249 | goto out; | ||
| 250 | if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) | ||
| 251 | goto out; | ||
| 252 | if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) | ||
| 253 | goto out; | ||
| 254 | |||
| 255 | memslot = &kvm->memslots[mem->slot]; | ||
| 256 | base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; | ||
| 257 | npages = mem->memory_size >> PAGE_SHIFT; | ||
| 258 | |||
| 259 | if (!npages) | ||
| 260 | mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; | ||
| 261 | |||
| 262 | new = old = *memslot; | ||
| 263 | |||
| 264 | new.base_gfn = base_gfn; | ||
| 265 | new.npages = npages; | ||
| 266 | new.flags = mem->flags; | ||
| 267 | |||
| 268 | /* Disallow changing a memory slot's size. */ | ||
| 269 | r = -EINVAL; | ||
| 270 | if (npages && old.npages && npages != old.npages) | ||
| 271 | goto out_free; | ||
| 272 | |||
| 273 | /* Check for overlaps */ | ||
| 274 | r = -EEXIST; | ||
| 275 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
| 276 | struct kvm_memory_slot *s = &kvm->memslots[i]; | ||
| 277 | |||
| 278 | if (s == memslot) | ||
| 279 | continue; | ||
| 280 | if (!((base_gfn + npages <= s->base_gfn) || | ||
| 281 | (base_gfn >= s->base_gfn + s->npages))) | ||
| 282 | goto out_free; | ||
| 283 | } | ||
| 284 | |||
| 285 | /* Free page dirty bitmap if unneeded */ | ||
| 286 | if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) | ||
| 287 | new.dirty_bitmap = NULL; | ||
| 288 | |||
| 289 | r = -ENOMEM; | ||
| 290 | |||
| 291 | /* Allocate if a slot is being created */ | ||
| 292 | if (npages && !new.rmap) { | ||
| 293 | new.rmap = vmalloc(npages * sizeof(struct page *)); | ||
| 294 | |||
| 295 | if (!new.rmap) | ||
| 296 | goto out_free; | ||
| 297 | |||
| 298 | memset(new.rmap, 0, npages * sizeof(*new.rmap)); | ||
| 299 | |||
| 300 | new.user_alloc = user_alloc; | ||
| 301 | new.userspace_addr = mem->userspace_addr; | ||
| 302 | } | ||
| 303 | |||
| 304 | /* Allocate page dirty bitmap if needed */ | ||
| 305 | if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { | ||
| 306 | unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; | ||
| 307 | |||
| 308 | new.dirty_bitmap = vmalloc(dirty_bytes); | ||
| 309 | if (!new.dirty_bitmap) | ||
| 310 | goto out_free; | ||
| 311 | memset(new.dirty_bitmap, 0, dirty_bytes); | ||
| 312 | } | ||
| 313 | |||
| 314 | if (mem->slot >= kvm->nmemslots) | ||
| 315 | kvm->nmemslots = mem->slot + 1; | ||
| 316 | |||
| 317 | *memslot = new; | ||
| 318 | |||
| 319 | r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); | ||
| 320 | if (r) { | ||
| 321 | *memslot = old; | ||
| 322 | goto out_free; | ||
| 323 | } | ||
| 324 | |||
| 325 | kvm_free_physmem_slot(&old, &new); | ||
| 326 | return 0; | ||
| 327 | |||
| 328 | out_free: | ||
| 329 | kvm_free_physmem_slot(&new, &old); | ||
| 330 | out: | ||
| 331 | return r; | ||
| 332 | |||
| 333 | } | ||
| 334 | EXPORT_SYMBOL_GPL(__kvm_set_memory_region); | ||
| 335 | |||
| 336 | int kvm_set_memory_region(struct kvm *kvm, | ||
| 337 | struct kvm_userspace_memory_region *mem, | ||
| 338 | int user_alloc) | ||
| 339 | { | ||
| 340 | int r; | ||
| 341 | |||
| 342 | down_write(¤t->mm->mmap_sem); | ||
| 343 | r = __kvm_set_memory_region(kvm, mem, user_alloc); | ||
| 344 | up_write(¤t->mm->mmap_sem); | ||
| 345 | return r; | ||
| 346 | } | ||
| 347 | EXPORT_SYMBOL_GPL(kvm_set_memory_region); | ||
| 348 | |||
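The overlap rejection in __kvm_set_memory_region() is the standard interval test: [a, a+n) and [b, b+m) are disjoint exactly when one ends at or before the other begins. A worked instance:

/* new slot: base_gfn = 100, npages = 50  -> covers [100, 150)
 * existing:  base_gfn = 120, npages = 10 -> covers [120, 130)
 * (100 + 50 <= 120) is false and (100 >= 120 + 10) is false,
 * so !(false || false) is true: the ranges overlap and the
 * function returns -EEXIST. */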
| 349 | int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, | ||
| 350 | struct kvm_userspace_memory_region *mem, | ||
| 352 | int user_alloc) | ||
| 353 | { | ||
| 354 | if (mem->slot >= KVM_MEMORY_SLOTS) | ||
| 355 | return -EINVAL; | ||
| 356 | return kvm_set_memory_region(kvm, mem, user_alloc); | ||
| 357 | } | ||
| 358 | |||
| 359 | int kvm_get_dirty_log(struct kvm *kvm, | ||
| 360 | struct kvm_dirty_log *log, int *is_dirty) | ||
| 361 | { | ||
| 362 | struct kvm_memory_slot *memslot; | ||
| 363 | int r, i; | ||
| 364 | int n; | ||
| 365 | unsigned long any = 0; | ||
| 366 | |||
| 367 | r = -EINVAL; | ||
| 368 | if (log->slot >= KVM_MEMORY_SLOTS) | ||
| 369 | goto out; | ||
| 370 | |||
| 371 | memslot = &kvm->memslots[log->slot]; | ||
| 372 | r = -ENOENT; | ||
| 373 | if (!memslot->dirty_bitmap) | ||
| 374 | goto out; | ||
| 375 | |||
| 376 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; | ||
| 377 | |||
| 378 | for (i = 0; !any && i < n/sizeof(long); ++i) | ||
| 379 | any = memslot->dirty_bitmap[i]; | ||
| 380 | |||
| 381 | r = -EFAULT; | ||
| 382 | if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) | ||
| 383 | goto out; | ||
| 384 | |||
| 385 | if (any) | ||
| 386 | *is_dirty = 1; | ||
| 387 | |||
| 388 | r = 0; | ||
| 389 | out: | ||
| 390 | return r; | ||
| 391 | } | ||
| 392 | |||
| 393 | int is_error_page(struct page *page) | ||
| 394 | { | ||
| 395 | return page == bad_page; | ||
| 396 | } | ||
| 397 | EXPORT_SYMBOL_GPL(is_error_page); | ||
| 398 | |||
| 399 | static inline unsigned long bad_hva(void) | ||
| 400 | { | ||
| 401 | return PAGE_OFFSET; | ||
| 402 | } | ||
| 403 | |||
| 404 | int kvm_is_error_hva(unsigned long addr) | ||
| 405 | { | ||
| 406 | return addr == bad_hva(); | ||
| 407 | } | ||
| 408 | EXPORT_SYMBOL_GPL(kvm_is_error_hva); | ||
| 409 | |||
| 410 | static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | ||
| 411 | { | ||
| 412 | int i; | ||
| 413 | |||
| 414 | for (i = 0; i < kvm->nmemslots; ++i) { | ||
| 415 | struct kvm_memory_slot *memslot = &kvm->memslots[i]; | ||
| 416 | |||
| 417 | if (gfn >= memslot->base_gfn | ||
| 418 | && gfn < memslot->base_gfn + memslot->npages) | ||
| 419 | return memslot; | ||
| 420 | } | ||
| 421 | return NULL; | ||
| 422 | } | ||
| 423 | |||
| 424 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | ||
| 425 | { | ||
| 426 | gfn = unalias_gfn(kvm, gfn); | ||
| 427 | return __gfn_to_memslot(kvm, gfn); | ||
| 428 | } | ||
| 429 | |||
| 430 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) | ||
| 431 | { | ||
| 432 | int i; | ||
| 433 | |||
| 434 | gfn = unalias_gfn(kvm, gfn); | ||
| 435 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
| 436 | struct kvm_memory_slot *memslot = &kvm->memslots[i]; | ||
| 437 | |||
| 438 | if (gfn >= memslot->base_gfn | ||
| 439 | && gfn < memslot->base_gfn + memslot->npages) | ||
| 440 | return 1; | ||
| 441 | } | ||
| 442 | return 0; | ||
| 443 | } | ||
| 444 | EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); | ||
| 445 | |||
| 446 | static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) | ||
| 447 | { | ||
| 448 | struct kvm_memory_slot *slot; | ||
| 449 | |||
| 450 | gfn = unalias_gfn(kvm, gfn); | ||
| 451 | slot = __gfn_to_memslot(kvm, gfn); | ||
| 452 | if (!slot) | ||
| 453 | return bad_hva(); | ||
| 454 | return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); | ||
| 455 | } | ||
| 456 | |||
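A worked instance of the address arithmetic above (values invented for illustration):

/* slot: base_gfn = 0x100, userspace_addr = 0x7f0000000000,
 * PAGE_SIZE = 4096 (0x1000).  For gfn = 0x103:
 *   hva = 0x7f0000000000 + (0x103 - 0x100) * 0x1000
 *       = 0x7f0000003000
 * i.e. page index 3 within the slot's userspace mapping. */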
| 457 | /* | ||
| 458 | * Requires current->mm->mmap_sem to be held | ||
| 459 | */ | ||
| 460 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) | ||
| 461 | { | ||
| 462 | struct page *page[1]; | ||
| 463 | unsigned long addr; | ||
| 464 | int npages; | ||
| 465 | |||
| 466 | might_sleep(); | ||
| 467 | |||
| 468 | addr = gfn_to_hva(kvm, gfn); | ||
| 469 | if (kvm_is_error_hva(addr)) { | ||
| 470 | get_page(bad_page); | ||
| 471 | return bad_page; | ||
| 472 | } | ||
| 473 | |||
| 474 | npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page, | ||
| 475 | NULL); | ||
| 476 | |||
| 477 | if (npages != 1) { | ||
| 478 | get_page(bad_page); | ||
| 479 | return bad_page; | ||
| 480 | } | ||
| 481 | |||
| 482 | return page[0]; | ||
| 483 | } | ||
| 484 | |||
| 485 | EXPORT_SYMBOL_GPL(gfn_to_page); | ||
| 486 | |||
| 487 | void kvm_release_page_clean(struct page *page) | ||
| 488 | { | ||
| 489 | put_page(page); | ||
| 490 | } | ||
| 491 | EXPORT_SYMBOL_GPL(kvm_release_page_clean); | ||
| 492 | |||
| 493 | void kvm_release_page_dirty(struct page *page) | ||
| 494 | { | ||
| 495 | if (!PageReserved(page)) | ||
| 496 | SetPageDirty(page); | ||
| 497 | put_page(page); | ||
| 498 | } | ||
| 499 | EXPORT_SYMBOL_GPL(kvm_release_page_dirty); | ||
| 500 | |||
| 501 | static int next_segment(unsigned long len, int offset) | ||
| 502 | { | ||
| 503 | if (len > PAGE_SIZE - offset) | ||
| 504 | return PAGE_SIZE - offset; | ||
| 505 | else | ||
| 506 | return len; | ||
| 507 | } | ||
| 508 | |||
| 509 | int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, | ||
| 510 | int len) | ||
| 511 | { | ||
| 512 | int r; | ||
| 513 | unsigned long addr; | ||
| 514 | |||
| 515 | addr = gfn_to_hva(kvm, gfn); | ||
| 516 | if (kvm_is_error_hva(addr)) | ||
| 517 | return -EFAULT; | ||
| 518 | r = copy_from_user(data, (void __user *)addr + offset, len); | ||
| 519 | if (r) | ||
| 520 | return -EFAULT; | ||
| 521 | return 0; | ||
| 522 | } | ||
| 523 | EXPORT_SYMBOL_GPL(kvm_read_guest_page); | ||
| 524 | |||
| 525 | int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) | ||
| 526 | { | ||
| 527 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
| 528 | int seg; | ||
| 529 | int offset = offset_in_page(gpa); | ||
| 530 | int ret; | ||
| 531 | |||
| 532 | while ((seg = next_segment(len, offset)) != 0) { | ||
| 533 | ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); | ||
| 534 | if (ret < 0) | ||
| 535 | return ret; | ||
| 536 | offset = 0; | ||
| 537 | len -= seg; | ||
| 538 | data += seg; | ||
| 539 | ++gfn; | ||
| 540 | } | ||
| 541 | return 0; | ||
| 542 | } | ||
| 543 | EXPORT_SYMBOL_GPL(kvm_read_guest); | ||
| 544 | |||
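A worked instance of the segment loop shared by the guest read/write/clear helpers (values invented):

/* gpa = 0x1ff0, len = 0x30, PAGE_SIZE = 0x1000:
 *   gfn = 1, offset = 0xff0, next_segment(0x30, 0xff0) = 0x10,
 *   so the first pass copies 0x10 bytes from the end of gfn 1;
 *   then offset = 0, len = 0x20, and the second pass copies the
 *   remaining 0x20 bytes from the start of gfn 2. */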
| 545 | int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, | ||
| 546 | unsigned long len) | ||
| 547 | { | ||
| 548 | int r; | ||
| 549 | unsigned long addr; | ||
| 550 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
| 551 | int offset = offset_in_page(gpa); | ||
| 552 | |||
| 553 | addr = gfn_to_hva(kvm, gfn); | ||
| 554 | if (kvm_is_error_hva(addr)) | ||
| 555 | return -EFAULT; | ||
| 556 | r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); | ||
| 557 | if (r) | ||
| 558 | return -EFAULT; | ||
| 559 | return 0; | ||
| 560 | } | ||
| 561 | EXPORT_SYMBOL(kvm_read_guest_atomic); | ||
| 562 | |||
| 563 | int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, | ||
| 564 | int offset, int len) | ||
| 565 | { | ||
| 566 | int r; | ||
| 567 | unsigned long addr; | ||
| 568 | |||
| 569 | addr = gfn_to_hva(kvm, gfn); | ||
| 570 | if (kvm_is_error_hva(addr)) | ||
| 571 | return -EFAULT; | ||
| 572 | r = copy_to_user((void __user *)addr + offset, data, len); | ||
| 573 | if (r) | ||
| 574 | return -EFAULT; | ||
| 575 | mark_page_dirty(kvm, gfn); | ||
| 576 | return 0; | ||
| 577 | } | ||
| 578 | EXPORT_SYMBOL_GPL(kvm_write_guest_page); | ||
| 579 | |||
| 580 | int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, | ||
| 581 | unsigned long len) | ||
| 582 | { | ||
| 583 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
| 584 | int seg; | ||
| 585 | int offset = offset_in_page(gpa); | ||
| 586 | int ret; | ||
| 587 | |||
| 588 | while ((seg = next_segment(len, offset)) != 0) { | ||
| 589 | ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); | ||
| 590 | if (ret < 0) | ||
| 591 | return ret; | ||
| 592 | offset = 0; | ||
| 593 | len -= seg; | ||
| 594 | data += seg; | ||
| 595 | ++gfn; | ||
| 596 | } | ||
| 597 | return 0; | ||
| 598 | } | ||
| 599 | |||
| 600 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) | ||
| 601 | { | ||
| 602 | return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); | ||
| 603 | } | ||
| 604 | EXPORT_SYMBOL_GPL(kvm_clear_guest_page); | ||
| 605 | |||
| 606 | int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) | ||
| 607 | { | ||
| 608 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
| 609 | int seg; | ||
| 610 | int offset = offset_in_page(gpa); | ||
| 611 | int ret; | ||
| 612 | |||
| 613 | while ((seg = next_segment(len, offset)) != 0) { | ||
| 614 | ret = kvm_clear_guest_page(kvm, gfn, offset, seg); | ||
| 615 | if (ret < 0) | ||
| 616 | return ret; | ||
| 617 | offset = 0; | ||
| 618 | len -= seg; | ||
| 619 | ++gfn; | ||
| 620 | } | ||
| 621 | return 0; | ||
| 622 | } | ||
| 623 | EXPORT_SYMBOL_GPL(kvm_clear_guest); | ||
| 624 | |||
| 625 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn) | ||
| 626 | { | ||
| 627 | struct kvm_memory_slot *memslot; | ||
| 628 | |||
| 629 | gfn = unalias_gfn(kvm, gfn); | ||
| 630 | memslot = __gfn_to_memslot(kvm, gfn); | ||
| 631 | if (memslot && memslot->dirty_bitmap) { | ||
| 632 | unsigned long rel_gfn = gfn - memslot->base_gfn; | ||
| 633 | |||
| 634 | /* avoid an atomic read-modify-write when the bit is already set */ | ||
| 635 | if (!test_bit(rel_gfn, memslot->dirty_bitmap)) | ||
| 636 | set_bit(rel_gfn, memslot->dirty_bitmap); | ||
| 637 | } | ||
| 638 | } | ||
| 639 | |||
| 640 | /* | ||
| 641 | * The vCPU has executed a HLT instruction with in-kernel mode enabled. | ||
| 642 | */ | ||
| 643 | void kvm_vcpu_block(struct kvm_vcpu *vcpu) | ||
| 644 | { | ||
| 645 | DECLARE_WAITQUEUE(wait, current); | ||
| 646 | |||
| 647 | add_wait_queue(&vcpu->wq, &wait); | ||
| 648 | |||
| 649 | /* | ||
| 650 | * We will block until either an interrupt or a signal wakes us up. | ||
| 651 | */ | ||
| 652 | while (!kvm_cpu_has_interrupt(vcpu) | ||
| 653 | && !signal_pending(current) | ||
| 654 | && !kvm_arch_vcpu_runnable(vcpu)) { | ||
| 655 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 656 | vcpu_put(vcpu); | ||
| 657 | schedule(); | ||
| 658 | vcpu_load(vcpu); | ||
| 659 | } | ||
| 660 | |||
| 661 | __set_current_state(TASK_RUNNING); | ||
| 662 | remove_wait_queue(&vcpu->wq, &wait); | ||
| 663 | } | ||
| 664 | |||
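For comparison, the canonical wait-queue idiom sets the task state before re-testing the condition, which closes the window where a wakeup arriving between the test and schedule() would be missed (generic sketch, not from this patch):

	add_wait_queue(&wq, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (condition)
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&wq, &wait);
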
| 665 | void kvm_resched(struct kvm_vcpu *vcpu) | ||
| 666 | { | ||
| 667 | if (!need_resched()) | ||
| 668 | return; | ||
| 669 | cond_resched(); | ||
| 670 | } | ||
| 671 | EXPORT_SYMBOL_GPL(kvm_resched); | ||
| 672 | |||
| 673 | static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
| 674 | { | ||
| 675 | struct kvm_vcpu *vcpu = vma->vm_file->private_data; | ||
| 676 | struct page *page; | ||
| 677 | |||
| 678 | if (vmf->pgoff == 0) | ||
| 679 | page = virt_to_page(vcpu->run); | ||
| 680 | else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) | ||
| 681 | page = virt_to_page(vcpu->arch.pio_data); | ||
| 682 | else | ||
| 683 | return VM_FAULT_SIGBUS; | ||
| 684 | get_page(page); | ||
| 685 | vmf->page = page; | ||
| 686 | return 0; | ||
| 687 | } | ||
| 688 | |||
| 689 | static struct vm_operations_struct kvm_vcpu_vm_ops = { | ||
| 690 | .fault = kvm_vcpu_fault, | ||
| 691 | }; | ||
| 692 | |||
| 693 | static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) | ||
| 694 | { | ||
| 695 | vma->vm_ops = &kvm_vcpu_vm_ops; | ||
| 696 | return 0; | ||
| 697 | } | ||
| 698 | |||
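Seen from userspace, the fault handler above backs a mapping like this (illustrative sketch; `vcpu_fd` comes from KVM_CREATE_VCPU, `mmap_size` from KVM_GET_VCPU_MMAP_SIZE):

	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED, vcpu_fd, 0);

Page 0 of the mapping resolves to vcpu->run; the page at KVM_PIO_PAGE_OFFSET carries the PIO data buffer.
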
| 699 | static int kvm_vcpu_release(struct inode *inode, struct file *filp) | ||
| 700 | { | ||
| 701 | struct kvm_vcpu *vcpu = filp->private_data; | ||
| 702 | |||
| 703 | fput(vcpu->kvm->filp); | ||
| 704 | return 0; | ||
| 705 | } | ||
| 706 | |||
| 707 | static struct file_operations kvm_vcpu_fops = { | ||
| 708 | .release = kvm_vcpu_release, | ||
| 709 | .unlocked_ioctl = kvm_vcpu_ioctl, | ||
| 710 | .compat_ioctl = kvm_vcpu_ioctl, | ||
| 711 | .mmap = kvm_vcpu_mmap, | ||
| 712 | }; | ||
| 713 | |||
| 714 | /* | ||
| 715 | * Allocates an inode for the vcpu. | ||
| 716 | */ | ||
| 717 | static int create_vcpu_fd(struct kvm_vcpu *vcpu) | ||
| 718 | { | ||
| 719 | int fd, r; | ||
| 720 | struct inode *inode; | ||
| 721 | struct file *file; | ||
| 722 | |||
| 723 | r = anon_inode_getfd(&fd, &inode, &file, | ||
| 724 | "kvm-vcpu", &kvm_vcpu_fops, vcpu); | ||
| 725 | if (r) | ||
| 726 | return r; | ||
| 727 | atomic_inc(&vcpu->kvm->filp->f_count); | ||
| 728 | return fd; | ||
| 729 | } | ||
| 730 | |||
| 731 | /* | ||
| 732 | * Creates some virtual cpus. Good luck creating more than one. | ||
| 733 | */ | ||
| 734 | static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) | ||
| 735 | { | ||
| 736 | int r; | ||
| 737 | struct kvm_vcpu *vcpu; | ||
| 738 | |||
| 739 | if (!valid_vcpu(n)) | ||
| 740 | return -EINVAL; | ||
| 741 | |||
| 742 | vcpu = kvm_arch_vcpu_create(kvm, n); | ||
| 743 | if (IS_ERR(vcpu)) | ||
| 744 | return PTR_ERR(vcpu); | ||
| 745 | |||
| 746 | preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); | ||
| 747 | |||
| 748 | r = kvm_arch_vcpu_setup(vcpu); | ||
| 749 | if (r) | ||
| 750 | goto vcpu_destroy; | ||
| 751 | |||
| 752 | mutex_lock(&kvm->lock); | ||
| 753 | if (kvm->vcpus[n]) { | ||
| 754 | r = -EEXIST; | ||
| 755 | mutex_unlock(&kvm->lock); | ||
| 756 | goto vcpu_destroy; | ||
| 757 | } | ||
| 758 | kvm->vcpus[n] = vcpu; | ||
| 759 | mutex_unlock(&kvm->lock); | ||
| 760 | |||
| 761 | /* Now it's all set up, let userspace reach it */ | ||
| 762 | r = create_vcpu_fd(vcpu); | ||
| 763 | if (r < 0) | ||
| 764 | goto unlink; | ||
| 765 | return r; | ||
| 766 | |||
| 767 | unlink: | ||
| 768 | mutex_lock(&kvm->lock); | ||
| 769 | kvm->vcpus[n] = NULL; | ||
| 770 | mutex_unlock(&kvm->lock); | ||
| 771 | vcpu_destroy: | ||
| 772 | kvm_arch_vcpu_destroy(vcpu); | ||
| 773 | return r; | ||
| 774 | } | ||
| 775 | |||
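From userspace the resulting fd chain looks like this (illustrative sketch, error handling elided):

	int sys_fd  = open("/dev/kvm", O_RDWR);
	int vm_fd   = ioctl(sys_fd, KVM_CREATE_VM, 0);
	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);

Each step hands back a fresh anonymous-inode fd, and the f_count bump in create_vcpu_fd() keeps the VM file alive for as long as any vcpu fd is open.
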
| 776 | static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) | ||
| 777 | { | ||
| 778 | if (sigset) { | ||
| 779 | sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
| 780 | vcpu->sigset_active = 1; | ||
| 781 | vcpu->sigset = *sigset; | ||
| 782 | } else | ||
| 783 | vcpu->sigset_active = 0; | ||
| 784 | return 0; | ||
| 785 | } | ||
| 786 | |||
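A caller might use this to keep a signal blocked everywhere except inside KVM_RUN (illustrative sketch; `usr1_unblocked` is an assumed mask with SIGUSR1 deleted; note the kernel checks against the size of its own sigset_t, which is smaller than glibc's):

	struct kvm_signal_mask *mask;

	mask = malloc(sizeof(*mask) + 8);
	mask->len = 8;			/* kernel sigset_t, i.e. _NSIG / 8 bytes */
	memcpy(mask->sigset, &usr1_unblocked, 8);
	ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, mask);

A SIGUSR1 sent to the vcpu thread then reliably kicks it out of guest execution.
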
| 787 | static long kvm_vcpu_ioctl(struct file *filp, | ||
| 788 | unsigned int ioctl, unsigned long arg) | ||
| 789 | { | ||
| 790 | struct kvm_vcpu *vcpu = filp->private_data; | ||
| 791 | void __user *argp = (void __user *)arg; | ||
| 792 | int r; | ||
| 793 | |||
| 794 | if (vcpu->kvm->mm != current->mm) | ||
| 795 | return -EIO; | ||
| 796 | switch (ioctl) { | ||
| 797 | case KVM_RUN: | ||
| 798 | r = -EINVAL; | ||
| 799 | if (arg) | ||
| 800 | goto out; | ||
| 801 | r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); | ||
| 802 | break; | ||
| 803 | case KVM_GET_REGS: { | ||
| 804 | struct kvm_regs kvm_regs; | ||
| 805 | |||
| 806 | memset(&kvm_regs, 0, sizeof kvm_regs); | ||
| 807 | r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs); | ||
| 808 | if (r) | ||
| 809 | goto out; | ||
| 810 | r = -EFAULT; | ||
| 811 | if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs)) | ||
| 812 | goto out; | ||
| 813 | r = 0; | ||
| 814 | break; | ||
| 815 | } | ||
| 816 | case KVM_SET_REGS: { | ||
| 817 | struct kvm_regs kvm_regs; | ||
| 818 | |||
| 819 | r = -EFAULT; | ||
| 820 | if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs)) | ||
| 821 | goto out; | ||
| 822 | r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs); | ||
| 823 | if (r) | ||
| 824 | goto out; | ||
| 825 | r = 0; | ||
| 826 | break; | ||
| 827 | } | ||
| 828 | case KVM_GET_SREGS: { | ||
| 829 | struct kvm_sregs kvm_sregs; | ||
| 830 | |||
| 831 | memset(&kvm_sregs, 0, sizeof kvm_sregs); | ||
| 832 | r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); | ||
| 833 | if (r) | ||
| 834 | goto out; | ||
| 835 | r = -EFAULT; | ||
| 836 | if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs)) | ||
| 837 | goto out; | ||
| 838 | r = 0; | ||
| 839 | break; | ||
| 840 | } | ||
| 841 | case KVM_SET_SREGS: { | ||
| 842 | struct kvm_sregs kvm_sregs; | ||
| 843 | |||
| 844 | r = -EFAULT; | ||
| 845 | if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) | ||
| 846 | goto out; | ||
| 847 | r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs); | ||
| 848 | if (r) | ||
| 849 | goto out; | ||
| 850 | r = 0; | ||
| 851 | break; | ||
| 852 | } | ||
| 853 | case KVM_TRANSLATE: { | ||
| 854 | struct kvm_translation tr; | ||
| 855 | |||
| 856 | r = -EFAULT; | ||
| 857 | if (copy_from_user(&tr, argp, sizeof tr)) | ||
| 858 | goto out; | ||
| 859 | r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); | ||
| 860 | if (r) | ||
| 861 | goto out; | ||
| 862 | r = -EFAULT; | ||
| 863 | if (copy_to_user(argp, &tr, sizeof tr)) | ||
| 864 | goto out; | ||
| 865 | r = 0; | ||
| 866 | break; | ||
| 867 | } | ||
| 868 | case KVM_DEBUG_GUEST: { | ||
| 869 | struct kvm_debug_guest dbg; | ||
| 870 | |||
| 871 | r = -EFAULT; | ||
| 872 | if (copy_from_user(&dbg, argp, sizeof dbg)) | ||
| 873 | goto out; | ||
| 874 | r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg); | ||
| 875 | if (r) | ||
| 876 | goto out; | ||
| 877 | r = 0; | ||
| 878 | break; | ||
| 879 | } | ||
| 880 | case KVM_SET_SIGNAL_MASK: { | ||
| 881 | struct kvm_signal_mask __user *sigmask_arg = argp; | ||
| 882 | struct kvm_signal_mask kvm_sigmask; | ||
| 883 | sigset_t sigset, *p; | ||
| 884 | |||
| 885 | p = NULL; | ||
| 886 | if (argp) { | ||
| 887 | r = -EFAULT; | ||
| 888 | if (copy_from_user(&kvm_sigmask, argp, | ||
| 889 | sizeof kvm_sigmask)) | ||
| 890 | goto out; | ||
| 891 | r = -EINVAL; | ||
| 892 | if (kvm_sigmask.len != sizeof sigset) | ||
| 893 | goto out; | ||
| 894 | r = -EFAULT; | ||
| 895 | if (copy_from_user(&sigset, sigmask_arg->sigset, | ||
| 896 | sizeof sigset)) | ||
| 897 | goto out; | ||
| 898 | p = &sigset; | ||
| 899 | } | ||
| 900 | r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); /* p, not &sigset: p stays NULL when clearing */ | ||
| 901 | break; | ||
| 902 | } | ||
| 903 | case KVM_GET_FPU: { | ||
| 904 | struct kvm_fpu fpu; | ||
| 905 | |||
| 906 | memset(&fpu, 0, sizeof fpu); | ||
| 907 | r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu); | ||
| 908 | if (r) | ||
| 909 | goto out; | ||
| 910 | r = -EFAULT; | ||
| 911 | if (copy_to_user(argp, &fpu, sizeof fpu)) | ||
| 912 | goto out; | ||
| 913 | r = 0; | ||
| 914 | break; | ||
| 915 | } | ||
| 916 | case KVM_SET_FPU: { | ||
| 917 | struct kvm_fpu fpu; | ||
| 918 | |||
| 919 | r = -EFAULT; | ||
| 920 | if (copy_from_user(&fpu, argp, sizeof fpu)) | ||
| 921 | goto out; | ||
| 922 | r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu); | ||
| 923 | if (r) | ||
| 924 | goto out; | ||
| 925 | r = 0; | ||
| 926 | break; | ||
| 927 | } | ||
| 928 | default: | ||
| 929 | r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); | ||
| 930 | } | ||
| 931 | out: | ||
| 932 | return r; | ||
| 933 | } | ||
| 934 | |||
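Every arm of the switch follows the same copy-in, act, copy-out shape; the matching userspace side of KVM_GET_REGS, for instance, is just (sketch):

	struct kvm_regs regs;

	if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0)
		err(1, "KVM_GET_REGS");
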
| 935 | static long kvm_vm_ioctl(struct file *filp, | ||
| 936 | unsigned int ioctl, unsigned long arg) | ||
| 937 | { | ||
| 938 | struct kvm *kvm = filp->private_data; | ||
| 939 | void __user *argp = (void __user *)arg; | ||
| 940 | int r; | ||
| 941 | |||
| 942 | if (kvm->mm != current->mm) | ||
| 943 | return -EIO; | ||
| 944 | switch (ioctl) { | ||
| 945 | case KVM_CREATE_VCPU: | ||
| 946 | r = kvm_vm_ioctl_create_vcpu(kvm, arg); | ||
| 947 | if (r < 0) | ||
| 948 | goto out; | ||
| 949 | break; | ||
| 950 | case KVM_SET_USER_MEMORY_REGION: { | ||
| 951 | struct kvm_userspace_memory_region kvm_userspace_mem; | ||
| 952 | |||
| 953 | r = -EFAULT; | ||
| 954 | if (copy_from_user(&kvm_userspace_mem, argp, | ||
| 955 | sizeof kvm_userspace_mem)) | ||
| 956 | goto out; | ||
| 957 | |||
| 958 | r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); | ||
| 959 | if (r) | ||
| 960 | goto out; | ||
| 961 | break; | ||
| 962 | } | ||
| 963 | case KVM_GET_DIRTY_LOG: { | ||
| 964 | struct kvm_dirty_log log; | ||
| 965 | |||
| 966 | r = -EFAULT; | ||
| 967 | if (copy_from_user(&log, argp, sizeof log)) | ||
| 968 | goto out; | ||
| 969 | r = kvm_vm_ioctl_get_dirty_log(kvm, &log); | ||
| 970 | if (r) | ||
| 971 | goto out; | ||
| 972 | break; | ||
| 973 | } | ||
| 974 | default: | ||
| 975 | r = kvm_arch_vm_ioctl(filp, ioctl, arg); | ||
| 976 | } | ||
| 977 | out: | ||
| 978 | return r; | ||
| 979 | } | ||
| 980 | |||
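A minimal caller of KVM_SET_USER_MEMORY_REGION (illustrative sketch; `mem` is assumed to be `size` bytes of page-aligned anonymous memory):

	struct kvm_userspace_memory_region region = {
		.slot            = 0,
		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
		.guest_phys_addr = 0,
		.memory_size     = size,
		.userspace_addr  = (unsigned long)mem,
	};

	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);

With KVM_MEM_LOG_DIRTY_PAGES set, KVM_GET_DIRTY_LOG later retrieves the bitmap that mark_page_dirty() above fills in.
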
| 981 | static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
| 982 | { | ||
| 983 | struct kvm *kvm = vma->vm_file->private_data; | ||
| 984 | struct page *page; | ||
| 985 | |||
| 986 | if (!kvm_is_visible_gfn(kvm, vmf->pgoff)) | ||
| 987 | return VM_FAULT_SIGBUS; | ||
| 988 | page = gfn_to_page(kvm, vmf->pgoff); | ||
| 989 | if (is_error_page(page)) { | ||
| 990 | kvm_release_page_clean(page); | ||
| 991 | return VM_FAULT_SIGBUS; | ||
| 992 | } | ||
| 993 | vmf->page = page; | ||
| 994 | return 0; | ||
| 995 | } | ||
| 996 | |||
| 997 | static struct vm_operations_struct kvm_vm_vm_ops = { | ||
| 998 | .fault = kvm_vm_fault, | ||
| 999 | }; | ||
| 1000 | |||
| 1001 | static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) | ||
| 1002 | { | ||
| 1003 | vma->vm_ops = &kvm_vm_vm_ops; | ||
| 1004 | return 0; | ||
| 1005 | } | ||
| 1006 | |||
| 1007 | static struct file_operations kvm_vm_fops = { | ||
| 1008 | .release = kvm_vm_release, | ||
| 1009 | .unlocked_ioctl = kvm_vm_ioctl, | ||
| 1010 | .compat_ioctl = kvm_vm_ioctl, | ||
| 1011 | .mmap = kvm_vm_mmap, | ||
| 1012 | }; | ||
| 1013 | |||
| 1014 | static int kvm_dev_ioctl_create_vm(void) | ||
| 1015 | { | ||
| 1016 | int fd, r; | ||
| 1017 | struct inode *inode; | ||
| 1018 | struct file *file; | ||
| 1019 | struct kvm *kvm; | ||
| 1020 | |||
| 1021 | kvm = kvm_create_vm(); | ||
| 1022 | if (IS_ERR(kvm)) | ||
| 1023 | return PTR_ERR(kvm); | ||
| 1024 | r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm); | ||
| 1025 | if (r) { | ||
| 1026 | kvm_destroy_vm(kvm); | ||
| 1027 | return r; | ||
| 1028 | } | ||
| 1029 | |||
| 1030 | kvm->filp = file; | ||
| 1031 | |||
| 1032 | return fd; | ||
| 1033 | } | ||
| 1034 | |||
| 1035 | static long kvm_dev_ioctl(struct file *filp, | ||
| 1036 | unsigned int ioctl, unsigned long arg) | ||
| 1037 | { | ||
| 1038 | void __user *argp = (void __user *)arg; | ||
| 1039 | long r = -EINVAL; | ||
| 1040 | |||
| 1041 | switch (ioctl) { | ||
| 1042 | case KVM_GET_API_VERSION: | ||
| 1043 | r = -EINVAL; | ||
| 1044 | if (arg) | ||
| 1045 | goto out; | ||
| 1046 | r = KVM_API_VERSION; | ||
| 1047 | break; | ||
| 1048 | case KVM_CREATE_VM: | ||
| 1049 | r = -EINVAL; | ||
| 1050 | if (arg) | ||
| 1051 | goto out; | ||
| 1052 | r = kvm_dev_ioctl_create_vm(); | ||
| 1053 | break; | ||
| 1054 | case KVM_CHECK_EXTENSION: | ||
| 1055 | r = kvm_dev_ioctl_check_extension((long)argp); | ||
| 1056 | break; | ||
| 1057 | case KVM_GET_VCPU_MMAP_SIZE: | ||
| 1058 | r = -EINVAL; | ||
| 1059 | if (arg) | ||
| 1060 | goto out; | ||
| 1061 | r = 2 * PAGE_SIZE; | ||
| 1062 | break; | ||
| 1063 | default: | ||
| 1064 | return kvm_arch_dev_ioctl(filp, ioctl, arg); | ||
| 1065 | } | ||
| 1066 | out: | ||
| 1067 | return r; | ||
| 1068 | } | ||
| 1069 | |||
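Userspace is expected to probe the API before doing anything else (sketch):

	int sys_fd = open("/dev/kvm", O_RDWR);

	if (ioctl(sys_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
		errx(1, "kvm api version mismatch");

The 2 * PAGE_SIZE answer to KVM_GET_VCPU_MMAP_SIZE matches the two pages kvm_vcpu_fault() can supply: the run structure and the PIO page.
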
| 1070 | static struct file_operations kvm_chardev_ops = { | ||
| 1071 | .unlocked_ioctl = kvm_dev_ioctl, | ||
| 1072 | .compat_ioctl = kvm_dev_ioctl, | ||
| 1073 | }; | ||
| 1074 | |||
| 1075 | static struct miscdevice kvm_dev = { | ||
| 1076 | KVM_MINOR, | ||
| 1077 | "kvm", | ||
| 1078 | &kvm_chardev_ops, | ||
| 1079 | }; | ||
| 1080 | |||
| 1081 | static void hardware_enable(void *junk) | ||
| 1082 | { | ||
| 1083 | int cpu = raw_smp_processor_id(); | ||
| 1084 | |||
| 1085 | if (cpu_isset(cpu, cpus_hardware_enabled)) | ||
| 1086 | return; | ||
| 1087 | cpu_set(cpu, cpus_hardware_enabled); | ||
| 1088 | kvm_arch_hardware_enable(NULL); | ||
| 1089 | } | ||
| 1090 | |||
| 1091 | static void hardware_disable(void *junk) | ||
| 1092 | { | ||
| 1093 | int cpu = raw_smp_processor_id(); | ||
| 1094 | |||
| 1095 | if (!cpu_isset(cpu, cpus_hardware_enabled)) | ||
| 1096 | return; | ||
| 1097 | cpu_clear(cpu, cpus_hardware_enabled); | ||
| 1098 | decache_vcpus_on_cpu(cpu); | ||
| 1099 | kvm_arch_hardware_disable(NULL); | ||
| 1100 | } | ||
| 1101 | |||
| 1102 | static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, | ||
| 1103 | void *v) | ||
| 1104 | { | ||
| 1105 | int cpu = (long)v; | ||
| 1106 | |||
| 1107 | val &= ~CPU_TASKS_FROZEN; | ||
| 1108 | switch (val) { | ||
| 1109 | case CPU_DYING: | ||
| 1110 | printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", | ||
| 1111 | cpu); | ||
| 1112 | hardware_disable(NULL); | ||
| 1113 | break; | ||
| 1114 | case CPU_UP_CANCELED: | ||
| 1115 | printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", | ||
| 1116 | cpu); | ||
| 1117 | smp_call_function_single(cpu, hardware_disable, NULL, 0, 1); | ||
| 1118 | break; | ||
| 1119 | case CPU_ONLINE: | ||
| 1120 | printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", | ||
| 1121 | cpu); | ||
| 1122 | smp_call_function_single(cpu, hardware_enable, NULL, 0, 1); | ||
| 1123 | break; | ||
| 1124 | } | ||
| 1125 | return NOTIFY_OK; | ||
| 1126 | } | ||
| 1127 | |||
| 1128 | static int kvm_reboot(struct notifier_block *notifier, unsigned long val, | ||
| 1129 | void *v) | ||
| 1130 | { | ||
| 1131 | if (val == SYS_RESTART) { | ||
| 1132 | /* | ||
| 1133 | * Some (well, at least mine) BIOSes hang on reboot if | ||
| 1134 | * in vmx root mode. | ||
| 1135 | */ | ||
| 1136 | printk(KERN_INFO "kvm: exiting hardware virtualization\n"); | ||
| 1137 | on_each_cpu(hardware_disable, NULL, 0, 1); | ||
| 1138 | } | ||
| 1139 | return NOTIFY_OK; | ||
| 1140 | } | ||
| 1141 | |||
| 1142 | static struct notifier_block kvm_reboot_notifier = { | ||
| 1143 | .notifier_call = kvm_reboot, | ||
| 1144 | .priority = 0, | ||
| 1145 | }; | ||
| 1146 | |||
| 1147 | void kvm_io_bus_init(struct kvm_io_bus *bus) | ||
| 1148 | { | ||
| 1149 | memset(bus, 0, sizeof(*bus)); | ||
| 1150 | } | ||
| 1151 | |||
| 1152 | void kvm_io_bus_destroy(struct kvm_io_bus *bus) | ||
| 1153 | { | ||
| 1154 | int i; | ||
| 1155 | |||
| 1156 | for (i = 0; i < bus->dev_count; i++) { | ||
| 1157 | struct kvm_io_device *pos = bus->devs[i]; | ||
| 1158 | |||
| 1159 | kvm_iodevice_destructor(pos); | ||
| 1160 | } | ||
| 1161 | } | ||
| 1162 | |||
| 1163 | struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr) | ||
| 1164 | { | ||
| 1165 | int i; | ||
| 1166 | |||
| 1167 | for (i = 0; i < bus->dev_count; i++) { | ||
| 1168 | struct kvm_io_device *pos = bus->devs[i]; | ||
| 1169 | |||
| 1170 | if (pos->in_range(pos, addr)) | ||
| 1171 | return pos; | ||
| 1172 | } | ||
| 1173 | |||
| 1174 | return NULL; | ||
| 1175 | } | ||
| 1176 | |||
| 1177 | void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) | ||
| 1178 | { | ||
| 1179 | BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); | ||
| 1180 | |||
| 1181 | bus->devs[bus->dev_count++] = dev; | ||
| 1182 | } | ||
| 1183 | |||
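Registration follows the pattern of the in-kernel devices (illustrative sketch; the my_* handlers are assumptions, `dev` points at an embedded struct kvm_io_device):

	dev->read     = my_mmio_read;
	dev->write    = my_mmio_write;
	dev->in_range = my_mmio_in_range;
	kvm_io_bus_register_dev(&kvm->mmio_bus, dev);

kvm_io_bus_find_dev() then routes any access whose address my_mmio_in_range() claims.
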
| 1184 | static struct notifier_block kvm_cpu_notifier = { | ||
| 1185 | .notifier_call = kvm_cpu_hotplug, | ||
| 1186 | .priority = 20, /* must be > scheduler priority */ | ||
| 1187 | }; | ||
| 1188 | |||
| 1189 | static u64 vm_stat_get(void *_offset) | ||
| 1190 | { | ||
| 1191 | unsigned offset = (long)_offset; | ||
| 1192 | u64 total = 0; | ||
| 1193 | struct kvm *kvm; | ||
| 1194 | |||
| 1195 | spin_lock(&kvm_lock); | ||
| 1196 | list_for_each_entry(kvm, &vm_list, vm_list) | ||
| 1197 | total += *(u32 *)((void *)kvm + offset); | ||
| 1198 | spin_unlock(&kvm_lock); | ||
| 1199 | return total; | ||
| 1200 | } | ||
| 1201 | |||
| 1202 | DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n"); | ||
| 1203 | |||
| 1204 | static u64 vcpu_stat_get(void *_offset) | ||
| 1205 | { | ||
| 1206 | unsigned offset = (long)_offset; | ||
| 1207 | u64 total = 0; | ||
| 1208 | struct kvm *kvm; | ||
| 1209 | struct kvm_vcpu *vcpu; | ||
| 1210 | int i; | ||
| 1211 | |||
| 1212 | spin_lock(&kvm_lock); | ||
| 1213 | list_for_each_entry(kvm, &vm_list, vm_list) | ||
| 1214 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
| 1215 | vcpu = kvm->vcpus[i]; | ||
| 1216 | if (vcpu) | ||
| 1217 | total += *(u32 *)((void *)vcpu + offset); | ||
| 1218 | } | ||
| 1219 | spin_unlock(&kvm_lock); | ||
| 1220 | return total; | ||
| 1221 | } | ||
| 1222 | |||
| 1223 | DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); | ||
| 1224 | |||
| 1225 | static struct file_operations *stat_fops[] = { | ||
| 1226 | [KVM_STAT_VCPU] = &vcpu_stat_fops, | ||
| 1227 | [KVM_STAT_VM] = &vm_stat_fops, | ||
| 1228 | }; | ||
| 1229 | |||
| 1230 | static void kvm_init_debug(void) | ||
| 1231 | { | ||
| 1232 | struct kvm_stats_debugfs_item *p; | ||
| 1233 | |||
| 1234 | debugfs_dir = debugfs_create_dir("kvm", NULL); | ||
| 1235 | for (p = debugfs_entries; p->name; ++p) | ||
| 1236 | p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir, | ||
| 1237 | (void *)(long)p->offset, | ||
| 1238 | stat_fops[p->kind]); | ||
| 1239 | } | ||
| 1240 | |||
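debugfs_entries is a table the arch code fills with (name, offset, kind) tuples; a sketch of its shape (entry names illustrative):

	struct kvm_stats_debugfs_item debugfs_entries[] = {
		{ "exits", offsetof(struct kvm_vcpu, stat.exits), KVM_STAT_VCPU },
		/* ... more counters ... */
		{ NULL }
	};

vm_stat_get() and vcpu_stat_get() above sum the u32 found at that offset in every VM or vcpu on vm_list, so adding a statistic costs one counter field plus one table line.
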
| 1241 | static void kvm_exit_debug(void) | ||
| 1242 | { | ||
| 1243 | struct kvm_stats_debugfs_item *p; | ||
| 1244 | |||
| 1245 | for (p = debugfs_entries; p->name; ++p) | ||
| 1246 | debugfs_remove(p->dentry); | ||
| 1247 | debugfs_remove(debugfs_dir); | ||
| 1248 | } | ||
| 1249 | |||
| 1250 | static int kvm_suspend(struct sys_device *dev, pm_message_t state) | ||
| 1251 | { | ||
| 1252 | hardware_disable(NULL); | ||
| 1253 | return 0; | ||
| 1254 | } | ||
| 1255 | |||
| 1256 | static int kvm_resume(struct sys_device *dev) | ||
| 1257 | { | ||
| 1258 | hardware_enable(NULL); | ||
| 1259 | return 0; | ||
| 1260 | } | ||
| 1261 | |||
| 1262 | static struct sysdev_class kvm_sysdev_class = { | ||
| 1263 | .name = "kvm", | ||
| 1264 | .suspend = kvm_suspend, | ||
| 1265 | .resume = kvm_resume, | ||
| 1266 | }; | ||
| 1267 | |||
| 1268 | static struct sys_device kvm_sysdev = { | ||
| 1269 | .id = 0, | ||
| 1270 | .cls = &kvm_sysdev_class, | ||
| 1271 | }; | ||
| 1272 | |||
| 1273 | struct page *bad_page; | ||
| 1274 | |||
| 1275 | static inline | ||
| 1276 | struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) | ||
| 1277 | { | ||
| 1278 | return container_of(pn, struct kvm_vcpu, preempt_notifier); | ||
| 1279 | } | ||
| 1280 | |||
| 1281 | static void kvm_sched_in(struct preempt_notifier *pn, int cpu) | ||
| 1282 | { | ||
| 1283 | struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); | ||
| 1284 | |||
| 1285 | kvm_arch_vcpu_load(vcpu, cpu); | ||
| 1286 | } | ||
| 1287 | |||
| 1288 | static void kvm_sched_out(struct preempt_notifier *pn, | ||
| 1289 | struct task_struct *next) | ||
| 1290 | { | ||
| 1291 | struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); | ||
| 1292 | |||
| 1293 | kvm_arch_vcpu_put(vcpu); | ||
| 1294 | } | ||
| 1295 | |||
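Together these hooks let a vcpu's hardware context follow its task: kvm_sched_out() saves per-cpu state when the vcpu thread is preempted, and kvm_sched_in() reloads it on whichever cpu the scheduler picks next, with no pinning required of the vcpu code.
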
| 1296 | int kvm_init(void *opaque, unsigned int vcpu_size, | ||
| 1297 | struct module *module) | ||
| 1298 | { | ||
| 1299 | int r; | ||
| 1300 | int cpu; | ||
| 1301 | |||
| 1302 | kvm_init_debug(); | ||
| 1303 | |||
| 1304 | r = kvm_arch_init(opaque); | ||
| 1305 | if (r) | ||
| 1306 | goto out_fail; | ||
| 1307 | |||
| 1308 | bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
| 1309 | |||
| 1310 | if (bad_page == NULL) { | ||
| 1311 | r = -ENOMEM; | ||
| 1312 | goto out; | ||
| 1313 | } | ||
| 1314 | |||
| 1315 | r = kvm_arch_hardware_setup(); | ||
| 1316 | if (r < 0) | ||
| 1317 | goto out_free_0; | ||
| 1318 | |||
| 1319 | for_each_online_cpu(cpu) { | ||
| 1320 | smp_call_function_single(cpu, | ||
| 1321 | kvm_arch_check_processor_compat, | ||
| 1322 | &r, 0, 1); | ||
| 1323 | if (r < 0) | ||
| 1324 | goto out_free_1; | ||
| 1325 | } | ||
| 1326 | |||
| 1327 | on_each_cpu(hardware_enable, NULL, 0, 1); | ||
| 1328 | r = register_cpu_notifier(&kvm_cpu_notifier); | ||
| 1329 | if (r) | ||
| 1330 | goto out_free_2; | ||
| 1331 | register_reboot_notifier(&kvm_reboot_notifier); | ||
| 1332 | |||
| 1333 | r = sysdev_class_register(&kvm_sysdev_class); | ||
| 1334 | if (r) | ||
| 1335 | goto out_free_3; | ||
| 1336 | |||
| 1337 | r = sysdev_register(&kvm_sysdev); | ||
| 1338 | if (r) | ||
| 1339 | goto out_free_4; | ||
| 1340 | |||
| 1341 | /* A kmem cache lets us meet the alignment requirements of fx_save. */ | ||
| 1342 | kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, | ||
| 1343 | __alignof__(struct kvm_vcpu), | ||
| 1344 | 0, NULL); | ||
| 1345 | if (!kvm_vcpu_cache) { | ||
| 1346 | r = -ENOMEM; | ||
| 1347 | goto out_free_5; | ||
| 1348 | } | ||
| 1349 | |||
| 1350 | kvm_chardev_ops.owner = module; | ||
| 1351 | |||
| 1352 | r = misc_register(&kvm_dev); | ||
| 1353 | if (r) { | ||
| 1354 | printk(KERN_ERR "kvm: misc device register failed\n"); | ||
| 1355 | goto out_free; | ||
| 1356 | } | ||
| 1357 | |||
| 1358 | kvm_preempt_ops.sched_in = kvm_sched_in; | ||
| 1359 | kvm_preempt_ops.sched_out = kvm_sched_out; | ||
| 1360 | |||
| 1361 | return 0; | ||
| 1362 | |||
| 1363 | out_free: | ||
| 1364 | kmem_cache_destroy(kvm_vcpu_cache); | ||
| 1365 | out_free_5: | ||
| 1366 | sysdev_unregister(&kvm_sysdev); | ||
| 1367 | out_free_4: | ||
| 1368 | sysdev_class_unregister(&kvm_sysdev_class); | ||
| 1369 | out_free_3: | ||
| 1370 | unregister_reboot_notifier(&kvm_reboot_notifier); | ||
| 1371 | unregister_cpu_notifier(&kvm_cpu_notifier); | ||
| 1372 | out_free_2: | ||
| 1373 | on_each_cpu(hardware_disable, NULL, 0, 1); | ||
| 1374 | out_free_1: | ||
| 1375 | kvm_arch_hardware_unsetup(); | ||
| 1376 | out_free_0: | ||
| 1377 | __free_page(bad_page); | ||
| 1378 | out: | ||
| 1379 | kvm_arch_exit(); | ||
| 1380 | kvm_exit_debug(); | ||
| 1381 | out_fail: | ||
| 1382 | return r; | ||
| 1383 | } | ||
| 1384 | EXPORT_SYMBOL_GPL(kvm_init); | ||
| 1385 | |||
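Note the unwind ladder in the error path: each out_free_N label undoes only the steps that had already succeeded, in reverse order of setup, so a failure at any point leaves no stray notifier, sysdev registration, or enabled cpu behind.
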
| 1386 | void kvm_exit(void) | ||
| 1387 | { | ||
| 1388 | misc_deregister(&kvm_dev); | ||
| 1389 | kmem_cache_destroy(kvm_vcpu_cache); | ||
| 1390 | sysdev_unregister(&kvm_sysdev); | ||
| 1391 | sysdev_class_unregister(&kvm_sysdev_class); | ||
| 1392 | unregister_reboot_notifier(&kvm_reboot_notifier); | ||
| 1393 | unregister_cpu_notifier(&kvm_cpu_notifier); | ||
| 1394 | on_each_cpu(hardware_disable, NULL, 0, 1); | ||
| 1395 | kvm_arch_hardware_unsetup(); | ||
| 1396 | kvm_arch_exit(); | ||
| 1397 | kvm_exit_debug(); | ||
| 1398 | __free_page(bad_page); | ||
| 1399 | } | ||
| 1400 | EXPORT_SYMBOL_GPL(kvm_exit); | ||
