Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux

Pull virtio updates from Rusty Russell: "OK, this has the big virtio 1.0 implementation, as specified by OASIS. On top of that is the major rework of lguest, to use PCI and virtio 1.0, to double-check the implementation. Then comes the inevitable fixes and cleanups from that work" * tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (80 commits) virtio: don't set VIRTIO_CONFIG_S_DRIVER_OK twice. virtio_net: unconditionally define struct virtio_net_hdr_v1. tools/lguest: don't use legacy definitions for net device in example launcher. virtio: Don't expose legacy net features when VIRTIO_NET_NO_LEGACY defined. tools/lguest: use common error macros in the example launcher. tools/lguest: give virtqueues names for better error messages tools/lguest: more documentation and checking of virtio 1.0 compliance. lguest: don't look in console features to find emerg_wr. tools/lguest: don't start devices until DRIVER_OK status set. tools/lguest: handle indirect partway through chain. tools/lguest: insert driver references from the 1.0 spec (4.1 Virtio Over PCI) tools/lguest: insert device references from the 1.0 spec (4.1 Virtio Over PCI) tools/lguest: rename virtio_pci_cfg_cap field to match spec. tools/lguest: fix features_accepted logic in example launcher. tools/lguest: handle device reset correctly in example launcher. virtual: Documentation: simplify and generalize paravirt_ops.txt lguest: remove NOTIFY call and eventfd facility. lguest: remove NOTIFY facility from demonstration launcher. lguest: use the PCI console device's emerg_wr for early boot messages. lguest: always put console in PCI slot #1. ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2015-02-18 12:24:01 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2015-02-18 12:24:01 -0500
commit: 53861af9a17022898619a2ae4ead0dfc601b7c13 (patch)
tree: dc11088d9e86fa1d8d8479974864153a8f976897 /drivers/lguest
parent: 5c2770079fb9b8c5bfb7113d9e76de66e77a0e24 (diff)
parent: 5b40a7daf51812b35cf05d1601a779a7043f8414 (diff)
8 files changed, 252 insertions, 847 deletions
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile
index c4197503900e..16f52ee73994 100644
--- a/drivers/lguest/Makefile
+++ b/drivers/lguest/Makefile
@@ -1,6 +1,3 @@
-# Guest requires the device configuration and probing code.
-obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o
 # Host requires the other files, which can be a module.
 obj-$(CONFIG_LGUEST)    += lg.o
 lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 6590558d1d31..7dc93aa004c8 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -208,6 +208,14 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
 */
 int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 {
+        /* If the launcher asked for a register with LHREQ_GETREG */
+        if (cpu->reg_read) {
+                if (put_user(*cpu->reg_read, user))
+                        return -EFAULT;
+                cpu->reg_read = NULL;
+                return sizeof(*cpu->reg_read);
+        }
        /* We stop running once the Guest is dead. */
        while (!cpu->lg->dead) {
                unsigned int irq;
@@ -217,21 +225,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
                if (cpu->hcall)
                        do_hypercalls(cpu);
-                /*
+                /* Do we have to tell the Launcher about a trap? */
-                 * It's possible the Guest did a NOTIFY hypercall to the
+                if (cpu->pending.trap) {
-                 * Launcher.
+                        if (copy_to_user(user, &cpu->pending,
-                 */
+                                         sizeof(cpu->pending)))
-                if (cpu->pending_notify) {
+                                return -EFAULT;
-                        /*
+                        return sizeof(cpu->pending);
-                         * Does it just needs to write to a registered
-                         * eventfd (ie. the appropriate virtqueue thread)?
-                         */
-                        if (!send_notify_to_eventfd(cpu)) {
-                                /* OK, we tell the main Launcher. */
-                                if (put_user(cpu->pending_notify, user))
-                                        return -EFAULT;
-                                return sizeof(cpu->pending_notify);
-                        }
                }
                /*
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 83511eb0923d..1219af493c0f 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -117,9 +117,6 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
                /* Similarly, this sets the halted flag for run_guest(). */
                cpu->halted = 1;
                break;
-        case LHCALL_NOTIFY:
-                cpu->pending_notify = args->arg1;
-                break;
        default:
                /* It should be an architecture-specific hypercall. */
                if (lguest_arch_do_hcall(cpu, args))
@@ -189,7 +186,7 @@ static void do_async_hcalls(struct lg_cpu *cpu)
                 * Stop doing hypercalls if they want to notify the Launcher:
                 * it needs to service this first.
                 */
-                if (cpu->pending_notify)
+                if (cpu->pending.trap)
                        break;
        }
 }
@@ -280,7 +277,7 @@ void do_hypercalls(struct lg_cpu *cpu)
         * NOTIFY to the Launcher, we want to return now.  Otherwise we do
         * the hypercall.
         */
-        if (!cpu->pending_notify) {
+        if (!cpu->pending.trap) {
                do_hcall(cpu, cpu->hcall);
                /*
                 * Tricky point: we reset the hcall pointer to mark the
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 2eef40be4c04..307e8b39e7d1 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -50,7 +50,10 @@ struct lg_cpu {
        /* Bitmap of what has changed: see CHANGED_* above. */
        int changed;
-        unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
+        /* Pending operation. */
+        struct lguest_pending pending;
+        unsigned long *reg_read; /* register from LHREQ_GETREG */
        /* At end of a page shared mapped over lguest_pages in guest. */
        unsigned long regs_page;
@@ -78,24 +81,18 @@ struct lg_cpu {
        struct lg_cpu_arch arch;
 };
-struct lg_eventfd {
-        unsigned long addr;
-        struct eventfd_ctx *event;
-};
-struct lg_eventfd_map {
-        unsigned int num;
-        struct lg_eventfd map[];
-};
 /* The private info the thread maintains about the guest. */
 struct lguest {
        struct lguest_data __user *lguest_data;
        struct lg_cpu cpus[NR_CPUS];
        unsigned int nr_cpus;
+        /* Valid guest memory pages must be < this. */
        u32 pfn_limit;
+        /* Device memory is >= pfn_limit and < device_limit. */
+        u32 device_limit;
        /*
         * This provides the offset to the base of guest-physical memory in the
         * Launcher.
@@ -110,8 +107,6 @@ struct lguest {
        unsigned int stack_pages;
        u32 tsc_khz;
-        struct lg_eventfd_map *eventfds;
        /* Dead? */
        const char *dead;
 };
@@ -197,8 +192,10 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu);
 void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
                   unsigned long vaddr, pte_t val);
 void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
-bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode);
+bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode,
+                 unsigned long *iomem);
 void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
+bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr);
 unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
 void page_table_guest_data_init(struct lg_cpu *cpu);
@@ -210,6 +207,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu);
 int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
 int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
 void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
+unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any);
 /* <arch>/switcher.S: */
 extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
deleted file mode 100644
index 89088d6538fd..000000000000
--- a/drivers/lguest/lguest_device.c
+++ /dev/null
@@ -1,540 +0,0 @@
-/*P:050
- * Lguest guests use a very simple method to describe devices.  It's a
- * series of device descriptors contained just above the top of normal Guest
- * memory.
- *
- * We use the standard "virtio" device infrastructure, which provides us with a
- * console, a network and a block driver.  Each one expects some configuration
- * information and a "virtqueue" or two to send and receive data.
-:*/
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/lguest_launcher.h>
-#include <linux/virtio.h>
-#include <linux/virtio_config.h>
-#include <linux/interrupt.h>
-#include <linux/virtio_ring.h>
-#include <linux/err.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <asm/io.h>
-#include <asm/paravirt.h>
-#include <asm/lguest_hcall.h>
-/* The pointer to our (page) of device descriptions. */
-static void *lguest_devices;
-/*
- * For Guests, device memory can be used as normal memory, so we cast away the
- * __iomem to quieten sparse.
- */
-static inline void *lguest_map(unsigned long phys_addr, unsigned long pages)
-{
-        return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages);
-}
-static inline void lguest_unmap(void *addr)
-{
-        iounmap((__force void __iomem *)addr);
-}
-/*D:100
- * Each lguest device is just a virtio device plus a pointer to its entry
- * in the lguest_devices page.
- */
-struct lguest_device {
-        struct virtio_device vdev;
-        /* The entry in the lguest_devices page for this device. */
-        struct lguest_device_desc *desc;
-};
-/*
- * Since the virtio infrastructure hands us a pointer to the virtio_device all
- * the time, it helps to have a curt macro to get a pointer to the struct
- * lguest_device it's enclosed in.
- */
-#define to_lgdev(vd) container_of(vd, struct lguest_device, vdev)
-/*D:130
- * Device configurations
- *
- * The configuration information for a device consists of one or more
- * virtqueues, a feature bitmap, and some configuration bytes.  The
- * configuration bytes don't really matter to us: the Launcher sets them up, and
- * the driver will look at them during setup.
- *
- * A convenient routine to return the device's virtqueue config array:
- * immediately after the descriptor.
- */
-static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc)
-{
-        return (void *)(desc + 1);
-}
-/* The features come immediately after the virtqueues. */
-static u8 *lg_features(const struct lguest_device_desc *desc)
-{
-        return (void *)(lg_vq(desc) + desc->num_vq);
-}
-/* The config space comes after the two feature bitmasks. */
-static u8 *lg_config(const struct lguest_device_desc *desc)
-{
-        return lg_features(desc) + desc->feature_len * 2;
-}
-/* The total size of the config page used by this device (incl. desc) */
-static unsigned desc_size(const struct lguest_device_desc *desc)
-{
-        return sizeof(*desc)
-                + desc->num_vq * sizeof(struct lguest_vqconfig)
-                + desc->feature_len * 2
-                + desc->config_len;
-}
-/* This gets the device's feature bits. */
-static u64 lg_get_features(struct virtio_device *vdev)
-{
-        unsigned int i;
-        u32 features = 0;
-        struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-        u8 *in_features = lg_features(desc);
-        /* We do this the slow but generic way. */
-        for (i = 0; i < min(desc->feature_len * 8, 32); i++)
-                if (in_features[i / 8] & (1 << (i % 8)))
-                        features |= (1 << i);
-        return features;
-}
-/*
- * To notify on reset or feature finalization, we (ab)use the NOTIFY
- * hypercall, with the descriptor address of the device.
- */
-static void status_notify(struct virtio_device *vdev)
-{
-        unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices;
-        hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0);
-}
-/*
- * The virtio core takes the features the Host offers, and copies the ones
- * supported by the driver into the vdev->features array.  Once that's all
- * sorted out, this routine is called so we can tell the Host which features we
- * understand and accept.
- */
-static int lg_finalize_features(struct virtio_device *vdev)
-{
-        unsigned int i, bits;
-        struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-        /* Second half of bitmap is features we accept. */
-        u8 *out_features = lg_features(desc) + desc->feature_len;
-        /* Give virtio_ring a chance to accept features. */
-        vring_transport_features(vdev);
-        /* Make sure we don't have any features > 32 bits! */
-        BUG_ON((u32)vdev->features != vdev->features);
-        /*
-         * Since lguest is currently x86-only, we're little-endian.  That
-         * means we could just memcpy.  But it's not time critical, and in
-         * case someone copies this code, we do it the slow, obvious way.
-         */
-        memset(out_features, 0, desc->feature_len);
-        bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
-        for (i = 0; i < bits; i++) {
-                if (__virtio_test_bit(vdev, i))
-                        out_features[i / 8] |= (1 << (i % 8));
-        }
-        /* Tell Host we've finished with this device's feature negotiation */
-        status_notify(vdev);
-        return 0;
-}
-/* Once they've found a field, getting a copy of it is easy. */
-static void lg_get(struct virtio_device *vdev, unsigned int offset,
-                   void *buf, unsigned len)
-{
-        struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-        /* Check they didn't ask for more than the length of the config! */
-        BUG_ON(offset + len > desc->config_len);
-        memcpy(buf, lg_config(desc) + offset, len);
-}
-/* Setting the contents is also trivial. */
-static void lg_set(struct virtio_device *vdev, unsigned int offset,
-                   const void *buf, unsigned len)
-{
-        struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-        /* Check they didn't ask for more than the length of the config! */
-        BUG_ON(offset + len > desc->config_len);
-        memcpy(lg_config(desc) + offset, buf, len);
-}
-/*
- * The operations to get and set the status word just access the status field
- * of the device descriptor.
- */
-static u8 lg_get_status(struct virtio_device *vdev)
-{
-        return to_lgdev(vdev)->desc->status;
-}
-static void lg_set_status(struct virtio_device *vdev, u8 status)
-{
-        BUG_ON(!status);
-        to_lgdev(vdev)->desc->status = status;
-        /* Tell Host immediately if we failed. */
-        if (status & VIRTIO_CONFIG_S_FAILED)
-                status_notify(vdev);
-}
-static void lg_reset(struct virtio_device *vdev)
-{
-        /* 0 status means "reset" */
-        to_lgdev(vdev)->desc->status = 0;
-        status_notify(vdev);
-}
-/*
- * Virtqueues
- *
- * The other piece of infrastructure virtio needs is a "virtqueue": a way of
- * the Guest device registering buffers for the other side to read from or
- * write into (ie. send and receive buffers).  Each device can have multiple
- * virtqueues: for example the console driver uses one queue for sending and
- * another for receiving.
- *
- * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue
- * already exists in virtio_ring.c.  We just need to connect it up.
- *
- * We start with the information we need to keep about each virtqueue.
- */
-/*D:140 This is the information we remember about each virtqueue. */
-struct lguest_vq_info {
-        /* A copy of the information contained in the device config. */
-        struct lguest_vqconfig config;
-        /* The address where we mapped the virtio ring, so we can unmap it. */
-        void *pages;
-};
-/*
- * When the virtio_ring code wants to prod the Host, it calls us here and we
- * make a hypercall.  We hand the physical address of the virtqueue so the Host
- * knows which virtqueue we're talking about.
- */
-static bool lg_notify(struct virtqueue *vq)
-{
-        /*
-         * We store our virtqueue information in the "priv" pointer of the
-         * virtqueue structure.
-         */
-        struct lguest_vq_info *lvq = vq->priv;
-        hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0, 0);
-        return true;
-}
-/* An extern declaration inside a C file is bad form.  Don't do it. */
-extern int lguest_setup_irq(unsigned int irq);
-/*
- * This routine finds the Nth virtqueue described in the configuration of
- * this device and sets it up.
- *
- * This is kind of an ugly duckling.  It'd be nicer to have a standard
- * representation of a virtqueue in the configuration space, but it seems that
- * everyone wants to do it differently.  The KVM coders want the Guest to
- * allocate its own pages and tell the Host where they are, but for lguest it's
- * simpler for the Host to simply tell us where the pages are.
- */
-static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
-                                    unsigned index,
-                                    void (*callback)(struct virtqueue *vq),
-                                    const char *name)
-{
-        struct lguest_device *ldev = to_lgdev(vdev);
-        struct lguest_vq_info *lvq;
-        struct virtqueue *vq;
-        int err;
-        if (!name)
-                return NULL;
-        /* We must have this many virtqueues. */
-        if (index >= ldev->desc->num_vq)
-                return ERR_PTR(-ENOENT);
-        lvq = kmalloc(sizeof(*lvq), GFP_KERNEL);
-        if (!lvq)
-                return ERR_PTR(-ENOMEM);
-        /*
-         * Make a copy of the "struct lguest_vqconfig" entry, which sits after
-         * the descriptor.  We need a copy because the config space might not
-         * be aligned correctly.
-         */
-        memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config));
-        printk("Mapping virtqueue %i addr %lx\n", index,
-               (unsigned long)lvq->config.pfn << PAGE_SHIFT);
-        /* Figure out how many pages the ring will take, and map that memory */
-        lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT,
-                                DIV_ROUND_UP(vring_size(lvq->config.num,
-                                                        LGUEST_VRING_ALIGN),
-                                             PAGE_SIZE));
-        if (!lvq->pages) {
-                err = -ENOMEM;
-                goto free_lvq;
-        }
-        /*
-         * OK, tell virtio_ring.c to set up a virtqueue now we know its size
-         * and we've got a pointer to its pages.  Note that we set weak_barriers
-         * to 'true': the host just a(nother) SMP CPU, so we only need inter-cpu
-         * barriers.
-         */
-        vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev,
-                                 true, lvq->pages, lg_notify, callback, name);
-        if (!vq) {
-                err = -ENOMEM;
-                goto unmap;
-        }
-        /* Make sure the interrupt is allocated. */
-        err = lguest_setup_irq(lvq->config.irq);
-        if (err)
-                goto destroy_vring;
-        /*
-         * Tell the interrupt for this virtqueue to go to the virtio_ring
-         * interrupt handler.
-         *
-         * FIXME: We used to have a flag for the Host to tell us we could use
-         * the interrupt as a source of randomness: it'd be nice to have that
-         * back.
-         */
-        err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
-                          dev_name(&vdev->dev), vq);
-        if (err)
-                goto free_desc;
-        /*
-         * Last of all we hook up our 'struct lguest_vq_info" to the
-         * virtqueue's priv pointer.
-         */
-        vq->priv = lvq;
-        return vq;
-free_desc:
-        irq_free_desc(lvq->config.irq);
-destroy_vring:
-        vring_del_virtqueue(vq);
-unmap:
-        lguest_unmap(lvq->pages);
-free_lvq:
-        kfree(lvq);
-        return ERR_PTR(err);
-}
-/*:*/
-/* Cleaning up a virtqueue is easy */
-static void lg_del_vq(struct virtqueue *vq)
-{
-        struct lguest_vq_info *lvq = vq->priv;
-        /* Release the interrupt */
-        free_irq(lvq->config.irq, vq);
-        /* Tell virtio_ring.c to free the virtqueue. */
-        vring_del_virtqueue(vq);
-        /* Unmap the pages containing the ring. */
-        lguest_unmap(lvq->pages);
-        /* Free our own queue information. */
-        kfree(lvq);
-}
-static void lg_del_vqs(struct virtio_device *vdev)
-{
-        struct virtqueue *vq, *n;
-        list_for_each_entry_safe(vq, n, &vdev->vqs, list)
-                lg_del_vq(vq);
-}
-static int lg_find_vqs(struct virtio_device *vdev, unsigned nvqs,
-                       struct virtqueue *vqs[],
-                       vq_callback_t *callbacks[],
-                       const char *names[])
-{
-        struct lguest_device *ldev = to_lgdev(vdev);
-        int i;
-        /* We must have this many virtqueues. */
-        if (nvqs > ldev->desc->num_vq)
-                return -ENOENT;
-        for (i = 0; i < nvqs; ++i) {
-                vqs[i] = lg_find_vq(vdev, i, callbacks[i], names[i]);
-                if (IS_ERR(vqs[i]))
-                        goto error;
-        }
-        return 0;
-error:
-        lg_del_vqs(vdev);
-        return PTR_ERR(vqs[i]);
-}
-static const char *lg_bus_name(struct virtio_device *vdev)
-{
-        return "";
-}
-/* The ops structure which hooks everything together. */
-static const struct virtio_config_ops lguest_config_ops = {
-        .get_features = lg_get_features,
-        .finalize_features = lg_finalize_features,
-        .get = lg_get,
-        .set = lg_set,
-        .get_status = lg_get_status,
-        .set_status = lg_set_status,
-        .reset = lg_reset,
-        .find_vqs = lg_find_vqs,
-        .del_vqs = lg_del_vqs,
-        .bus_name = lg_bus_name,
-};
-/*
- * The root device for the lguest virtio devices.  This makes them appear as
- * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2.
- */
-static struct device *lguest_root;
-/*D:120
- * This is the core of the lguest bus: actually adding a new device.
- * It's a separate function because it's neater that way, and because an
- * earlier version of the code supported hotplug and unplug.  They were removed
- * early on because they were never used.
- *
- * As Andrew Tridgell says, "Untested code is buggy code".
- *
- * It's worth reading this carefully: we start with a pointer to the new device
- * descriptor in the "lguest_devices" page, and the offset into the device
- * descriptor page so we can uniquely identify it if things go badly wrong.
- */
-static void add_lguest_device(struct lguest_device_desc *d,
-                              unsigned int offset)
-{
-        struct lguest_device *ldev;
-        /* Start with zeroed memory; Linux's device layer counts on it. */
-        ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
-        if (!ldev) {
-                printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n",
-                       offset, d->type);
-                return;
-        }
-        /* This devices' parent is the lguest/ dir. */
-        ldev->vdev.dev.parent = lguest_root;
-        /*
-         * The device type comes straight from the descriptor.  There's also a
-         * device vendor field in the virtio_device struct, which we leave as
-         * 0.
-         */
-        ldev->vdev.id.device = d->type;
-        /*
-         * We have a simple set of routines for querying the device's
-         * configuration information and setting its status.
-         */
-        ldev->vdev.config = &lguest_config_ops;
-        /* And we remember the device's descriptor for lguest_config_ops. */
-        ldev->desc = d;
-        /*
-         * register_virtio_device() sets up the generic fields for the struct
-         * virtio_device and calls device_register().  This makes the bus
-         * infrastructure look for a matching driver.
-         */
-        if (register_virtio_device(&ldev->vdev) != 0) {
-                printk(KERN_ERR "Failed to register lguest dev %u type %u\n",
-                       offset, d->type);
-                kfree(ldev);
-        }
-}
-/*D:110
- * scan_devices() simply iterates through the device page.  The type 0 is
- * reserved to mean "end of devices".
- */
-static void scan_devices(void)
-{
-        unsigned int i;
-        struct lguest_device_desc *d;
-        /* We start at the page beginning, and skip over each entry. */
-        for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
-                d = lguest_devices + i;
-                /* Once we hit a zero, stop. */
-                if (d->type == 0)
-                        break;
-                printk("Device at %i has size %u\n", i, desc_size(d));
-                add_lguest_device(d, i);
-        }
-}
-/*D:105
- * Fairly early in boot, lguest_devices_init() is called to set up the
- * lguest device infrastructure.  We check that we are a Guest by checking
- * pv_info.name: there are other ways of checking, but this seems most
- * obvious to me.
- *
- * So we can access the "struct lguest_device_desc"s easily, we map that memory
- * and store the pointer in the global "lguest_devices".  Then we register a
- * root device from which all our devices will hang (this seems to be the
- * correct sysfs incantation).
- *
- * Finally we call scan_devices() which adds all the devices found in the
- * lguest_devices page.
- */
-static int __init lguest_devices_init(void)
-{
-        if (strcmp(pv_info.name, "lguest") != 0)
-                return 0;
-        lguest_root = root_device_register("lguest");
-        if (IS_ERR(lguest_root))
-                panic("Could not register lguest root");
-        /* Devices are in a single page above top of "normal" mem */
-        lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);
-        scan_devices();
-        return 0;
-}
-/* We do this after core stuff, but before the drivers. */
-postcore_initcall(lguest_devices_init);
-/*D:150
- * At this point in the journey we used to now wade through the lguest
- * devices themselves: net, block and console.  Since they're all now virtio
- * devices rather than lguest-specific, I've decided to ignore them.  Mostly,
- * they're kind of boring.  But this does mean you'll never experience the
- * thrill of reading the forbidden love scene buried deep in the block driver.
- *
- * "make Launcher" beckons, where we answer questions like "Where do Guests
- * come from?", and "What do you do when someone asks for optimization?".
- */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 4263f4cc8c55..c4c6113eb9a6 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -2,175 +2,62 @@
 * launcher controls and communicates with the Guest.  For example,
 * the first write will tell us the Guest's memory layout and entry
 * point.  A read will run the Guest until something happens, such as
- * a signal or the Guest doing a NOTIFY out to the Launcher.  There is
+ * a signal or the Guest accessing a device.
- * also a way for the Launcher to attach eventfds to particular NOTIFY
- * values instead of returning from the read() call.
 :*/
 #include <linux/uaccess.h>
 #include <linux/miscdevice.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
-#include <linux/eventfd.h>
 #include <linux/file.h>
 #include <linux/slab.h>
 #include <linux/export.h>
 #include "lg.h"
-/*L:056
+/*L:052
- * Before we move on, let's jump ahead and look at what the kernel does when
+  The Launcher can get the registers, and also set some of them.
- * it needs to look up the eventfds.  That will complete our picture of how we
+*/
- * use RCU.
+static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input)
- *
- * The notification value is in cpu->pending_notify: we return true if it went
- * to an eventfd.
- */
-bool send_notify_to_eventfd(struct lg_cpu *cpu)
-{
-        unsigned int i;
-        struct lg_eventfd_map *map;
-        /*
-         * This "rcu_read_lock()" helps track when someone is still looking at
-         * the (RCU-using) eventfds array.  It's not actually a lock at all;
-         * indeed it's a noop in many configurations.  (You didn't expect me to
-         * explain all the RCU secrets here, did you?)
-         */
-        rcu_read_lock();
-        /*
-         * rcu_dereference is the counter-side of rcu_assign_pointer(); it
-         * makes sure we don't access the memory pointed to by
-         * cpu->lg->eventfds before cpu->lg->eventfds is set.  Sounds crazy,
-         * but Alpha allows this!  Paul McKenney points out that a really
-         * aggressive compiler could have the same effect:
-         *   http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html
-         *
-         * So play safe, use rcu_dereference to get the rcu-protected pointer:
-         */
-        map = rcu_dereference(cpu->lg->eventfds);
-        /*
-         * Simple array search: even if they add an eventfd while we do this,
-         * we'll continue to use the old array and just won't see the new one.
-         */
-        for (i = 0; i < map->num; i++) {
-                if (map->map[i].addr == cpu->pending_notify) {
-                        eventfd_signal(map->map[i].event, 1);
-                        cpu->pending_notify = 0;
-                        break;
-                }
-        }
-        /* We're done with the rcu-protected variable cpu->lg->eventfds. */
-        rcu_read_unlock();
-        /* If we cleared the notification, it's because we found a match. */
-        return cpu->pending_notify == 0;
-}
-/*L:055
- * One of the more tricksy tricks in the Linux Kernel is a technique called
- * Read Copy Update.  Since one point of lguest is to teach lguest journeyers
- * about kernel coding, I use it here.  (In case you're curious, other purposes
- * include learning about virtualization and instilling a deep appreciation for
- * simplicity and puppies).
- *
- * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we
- * add new eventfds without ever blocking readers from accessing the array.
- * The current Launcher only does this during boot, so that never happens.  But
- * Read Copy Update is cool, and adding a lock risks damaging even more puppies
- * than this code does.
- *
- * We allocate a brand new one-larger array, copy the old one and add our new
- * element.  Then we make the lg eventfd pointer point to the new array.
- * That's the easy part: now we need to free the old one, but we need to make
- * sure no slow CPU somewhere is still looking at it.  That's what
- * synchronize_rcu does for us: waits until every CPU has indicated that it has
- * moved on to know it's no longer using the old one.
- *
- * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update.
- */
-static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
 {
-        struct lg_eventfd_map *new, *old = lg->eventfds;
+        unsigned long which;
-        /*
-         * We don't allow notifications on value 0 anyway (pending_notify of
-         * 0 means "nothing pending").
-         */
-        if (!addr)
-                return -EINVAL;
-        /*
-         * Replace the old array with the new one, carefully: others can
-         * be accessing it at the same time.
-         */
-        new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
-                      GFP_KERNEL);
-        if (!new)
-                return -ENOMEM;
-        /* First make identical copy. */
+        /* We re-use the ptrace structure to specify which register to read. */
-        memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
+        if (get_user(which, input) != 0)
-        new->num = old->num;
+                return -EFAULT;
-        /* Now append new entry. */
-        new->map[new->num].addr = addr;
-        new->map[new->num].event = eventfd_ctx_fdget(fd);
-        if (IS_ERR(new->map[new->num].event)) {
-                int err =  PTR_ERR(new->map[new->num].event);
-                kfree(new);
-                return err;
-        }
-        new->num++;
        /*
-         * Now put new one in place: rcu_assign_pointer() is a fancy way of
+         * We set up the cpu register pointer, and their next read will
-         * doing "lg->eventfds = new", but it uses memory barriers to make
+         * actually get the value (instead of running the guest).
-         * absolutely sure that the contents of "new" written above is nailed
-         * down before we actually do the assignment.
         *
-         * We have to think about these kinds of things when we're operating on
+         * The last argument 'true' says we can access any register.
-         * live data without locks.
         */
-        rcu_assign_pointer(lg->eventfds, new);
+        cpu->reg_read = lguest_arch_regptr(cpu, which, true);
+        if (!cpu->reg_read)
+                return -ENOENT;
-        /*
+        /* And because this is a write() call, we return the length used. */
-         * We're not in a big hurry.  Wait until no one's looking at old
+        return sizeof(unsigned long) * 2;
-         * version, then free it.
-         */
-        synchronize_rcu();
-        kfree(old);
-        return 0;
 }
-/*L:052
+static int setreg(struct lg_cpu *cpu, const unsigned long __user *input)
- * Receiving notifications from the Guest is usually done by attaching a
- * particular LHCALL_NOTIFY value to an event filedescriptor.  The eventfd will
- * become readable when the Guest does an LHCALL_NOTIFY with that value.
- *
- * This is really convenient for processing each virtqueue in a separate
- * thread.
- */
-static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
 {
-        unsigned long addr, fd;
+        unsigned long which, value, *reg;
-        int err;
-        if (get_user(addr, input) != 0)
+        /* We re-use the ptrace structure to specify which register to set. */
+        if (get_user(which, input) != 0)
                return -EFAULT;
        input++;
-        if (get_user(fd, input) != 0)
+        if (get_user(value, input) != 0)
                return -EFAULT;
-        /*
+        /* The last argument 'false' means we can't access all registers. */
-         * Just make sure two callers don't add eventfds at once.  We really
+        reg = lguest_arch_regptr(cpu, which, false);
-         * only need to lock against callers adding to the same Guest, so using
+        if (!reg)
-         * the Big Lguest Lock is overkill.  But this is setup, not a fast path.
+                return -ENOENT;
-         */
-        mutex_lock(&lguest_lock);
-        err = add_eventfd(lg, addr, fd);
-        mutex_unlock(&lguest_lock);
-        return err;
+        *reg = value;
+        /* And because this is a write() call, we return the length used. */
+        return sizeof(unsigned long) * 3;
 }
 /*L:050
@@ -194,6 +81,23 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
        return 0;
 }
+/*L:053
+ * Deliver a trap: this is used by the Launcher if it can't emulate
+ * an instruction.
+ *
+ * @input points at the trap number, a single unsigned long which we
+ * fetch with get_user().  We return -EFAULT if that fetch fails,
+ * -EINVAL if deliver_trap() reports failure, and 0 otherwise.
+ */
+static int trap(struct lg_cpu *cpu, const unsigned long __user *input)
+{
+        unsigned long trapnum;
+        if (get_user(trapnum, input) != 0)
+                return -EFAULT;
+        if (!deliver_trap(cpu, trapnum))
+                return -EINVAL;
+        return 0;
+}
 /*L:040
 * Once our Guest is initialized, the Launcher makes it run by reading
 * from /dev/lguest.
@@ -237,8 +141,8 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
         * If we returned from read() last time because the Guest sent I/O,
         * clear the flag.
         */
-        if (cpu->pending_notify)
+        if (cpu->pending.trap)
-                cpu->pending_notify = 0;
+                cpu->pending.trap = 0;
        /* Run the Guest until something interesting happens. */
        return run_guest(cpu, (unsigned long __user *)user);
@@ -319,7 +223,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
        /* "struct lguest" contains all we (the Host) know about a Guest. */
        struct lguest *lg;
        int err;
-        unsigned long args[3];
+        unsigned long args[4];
        /*
         * We grab the Big Lguest lock, which protects against multiple
@@ -343,21 +247,15 @@ static int initialize(struct file *file, const unsigned long __user *input)
                goto unlock;
        }
-        lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
-        if (!lg->eventfds) {
-                err = -ENOMEM;
-                goto free_lg;
-        }
-        lg->eventfds->num = 0;
        /* Populate the easy fields of our "struct lguest" */
        lg->mem_base = (void __user *)args[0];
        lg->pfn_limit = args[1];
+        lg->device_limit = args[3];
        /* This is the first cpu (cpu 0) and it will start booting at args[2] */
        err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
        if (err)
-                goto free_eventfds;
+                goto free_lg;
        /*
         * Initialize the Guest's shadow page tables.  This allocates
@@ -378,8 +276,6 @@ static int initialize(struct file *file, const unsigned long __user *input)
 free_regs:
        /* FIXME: This should be in free_vcpu */
        free_page(lg->cpus[0].regs_page);
-free_eventfds:
-        kfree(lg->eventfds);
 free_lg:
        kfree(lg);
 unlock:
@@ -432,8 +328,12 @@ static ssize_t write(struct file *file, const char __user *in,
                return initialize(file, input);
        case LHREQ_IRQ:
                return user_send_irq(cpu, input);
-        case LHREQ_EVENTFD:
+        case LHREQ_GETREG:
-                return attach_eventfd(lg, input);
+                return getreg_setup(cpu, input);
+        case LHREQ_SETREG:
+                return setreg(cpu, input);
+        case LHREQ_TRAP:
+                return trap(cpu, input);
        default:
                return -EINVAL;
        }
@@ -478,11 +378,6 @@ static int close(struct inode *inode, struct file *file)
                mmput(lg->cpus[i].mm);
        }
-        /* Release any eventfds they registered. */
-        for (i = 0; i < lg->eventfds->num; i++)
-                eventfd_ctx_put(lg->eventfds->map[i].event);
-        kfree(lg->eventfds);
        /*
         * If lg->dead doesn't contain an error code it will be NULL or a
         * kmalloc()ed string, either of which is ok to hand to kfree().
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index e8b55c3a6170..e3abebc912c0 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -250,6 +250,16 @@ static void release_pte(pte_t pte)
 }
 /*:*/
+static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte)
+{
+        /*
+         * Report whether this Guest PTE's page frame lies in the window
+         * from lg->pfn_limit up to, but not including, lg->device_limit.
+         * We don't handle large pages, so an entry with _PAGE_PSE set is
+         * never reported as being inside that window.
+         */
+        if (pte_flags(gpte) & _PAGE_PSE)
+                return false;
+        return (pte_pfn(gpte) >= cpu->lg->pfn_limit
+                && pte_pfn(gpte) < cpu->lg->device_limit);
+}
 static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
 {
        if ((pte_flags(gpte) & _PAGE_PSE) ||
@@ -374,8 +384,14 @@ static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
 *
 * If we fixed up the fault (ie. we mapped the address), this routine returns
 * true.  Otherwise, it was a real fault and we need to tell the Guest.
+ *
+ * There's a corner case: they're trying to access memory between
+ * pfn_limit and device_limit, which is I/O memory.  In this case, we
+ * return false and set @iomem to the physical address, so the
+ * Launcher can handle the instruction manually.
 */
-bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode,
+                 unsigned long *iomem)
 {
        unsigned long gpte_ptr;
        pte_t gpte;
@@ -383,6 +399,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
        pmd_t gpmd;
        pgd_t gpgd;
+        *iomem = 0;
        /* We never demand page the Switcher, so trying is a mistake. */
        if (vaddr >= switcher_addr)
                return false;
@@ -459,6 +477,12 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
        if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
                return false;
+        /* If they're accessing io memory, we expect a fault. */
+        if (gpte_in_iomem(cpu, gpte)) {
+                *iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+                return false;
+        }
        /*
         * Check that the Guest PTE flags are OK, and the page number is below
         * the pfn_limit (ie. not mapping the Launcher binary).
@@ -553,7 +577,9 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
 */
 void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
 {
-        if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
+        unsigned long iomem;
+        if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem))
                kill_guest(cpu, "bad stack page %#lx", vaddr);
 }
 /*:*/
@@ -647,7 +673,7 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu)
 /*:*/
 /* We walk down the guest page tables to get a guest-physical address */
-unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
+bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr)
 {
        pgd_t gpgd;
        pte_t gpte;
@@ -656,31 +682,47 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 #endif
        /* Still not set up?  Just map 1:1. */
-        if (unlikely(cpu->linear_pages))
+        if (unlikely(cpu->linear_pages)) {
-                return vaddr;
+                *paddr = vaddr;
+                return true;
+        }
        /* First step: get the top-level Guest page table entry. */
        gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
        /* Toplevel not present?  We can't map it in. */
-        if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) {
+        if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-                kill_guest(cpu, "Bad address %#lx", vaddr);
+                goto fail;
-                return -1UL;
-        }
 #ifdef CONFIG_X86_PAE
        gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-        if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) {
+        if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-                kill_guest(cpu, "Bad address %#lx", vaddr);
+                goto fail;
-                return -1UL;
-        }
        gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
 #else
        gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
 #endif
        if (!(pte_flags(gpte) & _PAGE_PRESENT))
-                kill_guest(cpu, "Bad address %#lx", vaddr);
+                goto fail;
+        *paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
+        return true;
+fail:
+        *paddr = -1UL;
+        return false;
+}
-        return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
+/*
+ * This is the version we normally use: kills the Guest if it uses a
+ * bad address.
+ *
+ * We return whatever __guest_pa() stored in paddr, even after calling
+ * kill_guest(): on failure __guest_pa() sets it to -1UL.
+ */
+unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
+{
+        unsigned long paddr;
+        if (!__guest_pa(cpu, vaddr, &paddr))
+                kill_guest(cpu, "Bad address %#lx", vaddr);
+        return paddr;
 }
 /*
@@ -912,7 +954,8 @@ static void __guest_set_pte(struct lg_cpu *cpu, int idx,
                         * now.  This shaves 10% off a copy-on-write
                         * micro-benchmark.
                         */
-                        if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+                        if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED))
+                            && !gpte_in_iomem(cpu, gpte)) {
                                if (!check_gpte(cpu, gpte))
                                        return;
                                set_pte(spte,
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 6adfd7ba4c97..30f2aef69d78 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -182,6 +182,52 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
 }
 /*:*/
+unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any)
+{
+        switch (reg_off) {
+        case offsetof(struct pt_regs, bx):
+                return &cpu->regs->ebx;
+        case offsetof(struct pt_regs, cx):
+                return &cpu->regs->ecx;
+        case offsetof(struct pt_regs, dx):
+                return &cpu->regs->edx;
+        case offsetof(struct pt_regs, si):
+                return &cpu->regs->esi;
+        case offsetof(struct pt_regs, di):
+                return &cpu->regs->edi;
+        case offsetof(struct pt_regs, bp):
+                return &cpu->regs->ebp;
+        case offsetof(struct pt_regs, ax):
+                return &cpu->regs->eax;
+        case offsetof(struct pt_regs, ip):
+                return &cpu->regs->eip;
+        case offsetof(struct pt_regs, sp):
+                return &cpu->regs->esp;
+        }
+        /*
+         * Launcher can read these, but we don't allow any setting:
+         * they are only looked up when the caller passes any == true.
+         * A reg_off matching neither switch gets NULL.
+         */
+        if (any) {
+                switch (reg_off) {
+                case offsetof(struct pt_regs, ds):
+                        return &cpu->regs->ds;
+                case offsetof(struct pt_regs, es):
+                        return &cpu->regs->es;
+                case offsetof(struct pt_regs, fs):
+                        return &cpu->regs->fs;
+                case offsetof(struct pt_regs, gs):
+                        return &cpu->regs->gs;
+                case offsetof(struct pt_regs, cs):
+                        return &cpu->regs->cs;
+                case offsetof(struct pt_regs, flags):
+                        return &cpu->regs->eflags;
+                case offsetof(struct pt_regs, ss):
+                        return &cpu->regs->ss;
+                }
+        }
+        return NULL;
+}
 /*M:002
 * There are hooks in the scheduler which we can register to tell when we
 * get kicked off the CPU (preempt_notifier_register()).  This would allow us
@@ -269,110 +315,73 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
 * usually attached to a PC.
 *
 * When the Guest uses one of these instructions, we get a trap (General
- * Protection Fault) and come here.  We see if it's one of those troublesome
+ * Protection Fault) and come here.  We queue this to be sent out to the
- * instructions and skip over it.  We return true if we did.
+ * Launcher to handle.
 */
-static int emulate_insn(struct lg_cpu *cpu)
-{
-        u8 insn;
-        unsigned int insnlen = 0, in = 0, small_operand = 0;
-        /*
-         * The eip contains the *virtual* address of the Guest's instruction:
-         * walk the Guest's page tables to find the "physical" address.
-         */
-        unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
-        /*
-         * This must be the Guest kernel trying to do something, not userspace!
-         * The bottom two bits of the CS segment register are the privilege
-         * level.
-         */
-        if ((cpu->regs->cs & 3) != GUEST_PL)
-                return 0;
-        /* Decoding x86 instructions is icky. */
-        insn = lgread(cpu, physaddr, u8);
-        /*
+/*
-         * Around 2.6.33, the kernel started using an emulation for the
+ * The eip contains the *virtual* address of the Guest's instruction:
-         * cmpxchg8b instruction in early boot on many configurations.  This
+ * we copy the instruction here so the Launcher doesn't have to walk
-         * code isn't paravirtualized, and it tries to disable interrupts.
+ * the page tables to decode it.  We handle the case (eg. in a kernel
-         * Ignore it, which will Mostly Work.
+ * module) where the instruction is over two pages, and the pages are
-         */
+ * virtually but not physically contiguous.
-        if (insn == 0xfa) {
+ *
-                /* "cli", or Clear Interrupt Enable instruction.  Skip it. */
+ * The longest possible x86 instruction is 15 bytes, but we don't handle
-                cpu->regs->eip++;
+ * anything that strange.
-                return 1;
+ */
+static void copy_from_guest(struct lg_cpu *cpu,
+                            void *dst, unsigned long vaddr, size_t len)
+{
+        size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
+        unsigned long paddr;
+        BUG_ON(len > PAGE_SIZE);
+        /* If it goes over a page, copy in two parts. */
+        if (len > to_page_end) {
+                /* But make sure the next page is mapped! */
+                if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
+                        copy_from_guest(cpu, dst + to_page_end,
+                                        vaddr + to_page_end,
+                                        len - to_page_end);
+                else
+                        /* Otherwise fill with zeroes. */
+                        memset(dst + to_page_end, 0, len - to_page_end);
+                len = to_page_end;
        }
-        /*
+        /* This will kill the guest if it isn't mapped, but that
-         * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
+         * shouldn't happen. */
-         */
+        __lgread(cpu, dst, guest_pa(cpu, vaddr), len);
-        if (insn == 0x66) {
+}
-                small_operand = 1;
-                /* The instruction is 1 byte so far, read the next byte. */
-                insnlen = 1;
-                insn = lgread(cpu, physaddr + insnlen, u8);
-        }
-        /*
-         * We can ignore the lower bit for the moment and decode the 4 opcodes
-         * we need to emulate.
-         */
-        switch (insn & 0xFE) {
-        case 0xE4: /* in     <next byte>,%al */
-                insnlen += 2;
-                in = 1;
-                break;
-        case 0xEC: /* in     (%dx),%al */
-                insnlen += 1;
-                in = 1;
-                break;
-        case 0xE6: /* out    %al,<next byte> */
-                insnlen += 2;
-                break;
-        case 0xEE: /* out    %al,(%dx) */
-                insnlen += 1;
-                break;
-        default:
-                /* OK, we don't know what this is, can't emulate. */
-                return 0;
-        }
-        /*
+static void setup_emulate_insn(struct lg_cpu *cpu)
-         * If it was an "IN" instruction, they expect the result to be read
+{
-         * into %eax, so we change %eax.  We always return all-ones, which
+        cpu->pending.trap = 13;
-         * traditionally means "there's nothing there".
+        copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
-         */
+                        sizeof(cpu->pending.insn));
-        if (in) {
+}
-                /* Lower bit tells means it's a 32/16 bit access */
-                if (insn & 0x1) {
+static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr)
-                        if (small_operand)
+{
-                                cpu->regs->eax |= 0xFFFF;
+        cpu->pending.trap = 14;
-                        else
+        cpu->pending.addr = iomem_addr;
-                                cpu->regs->eax = 0xFFFFFFFF;
+        copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
-                } else
+                        sizeof(cpu->pending.insn));
-                        cpu->regs->eax |= 0xFF;
-        }
-        /* Finally, we've "done" the instruction, so move past it. */
-        cpu->regs->eip += insnlen;
-        /* Success! */
-        return 1;
 }
 /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
 void lguest_arch_handle_trap(struct lg_cpu *cpu)
 {
+        unsigned long iomem_addr;
        switch (cpu->regs->trapnum) {
        case 13: /* We've intercepted a General Protection Fault. */
-                /*
+                /* Hand to Launcher to emulate those pesky IN and OUT insns */
-                 * Check if this was one of those annoying IN or OUT
-                 * instructions which we need to emulate.  If so, we just go
-                 * back into the Guest after we've done it.
-                 */
                if (cpu->regs->errcode == 0) {
-                        if (emulate_insn(cpu))
+                        setup_emulate_insn(cpu);
-                                return;
+                        return;
                }
                break;
        case 14: /* We've intercepted a Page Fault. */
@@ -387,9 +396,16 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
                 * whether kernel or userspace code.
                 */
                if (demand_page(cpu, cpu->arch.last_pagefault,
-                                cpu->regs->errcode))
+                                cpu->regs->errcode, &iomem_addr))
                        return;
+                /* Was this an access to memory mapped IO? */
+                if (iomem_addr) {
+                        /* Tell Launcher, let it handle it. */
+                        setup_iomem_insn(cpu, iomem_addr);
+                        return;
+                }
                /*
                 * OK, it's really not there (or not OK): the Guest needs to
                 * know.  We write out the cr2 value so it knows where the
author	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-18 12:24:01 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-18 12:24:01 -0500
commit	53861af9a17022898619a2ae4ead0dfc601b7c13 (patch)
tree	dc11088d9e86fa1d8d8479974864153a8f976897 /drivers/lguest
parent	5c2770079fb9b8c5bfb7113d9e76de66e77a0e24 (diff)
parent	5b40a7daf51812b35cf05d1601a779a7043f8414 (diff)