-rw-r--r--Documentation/virtual/00-INDEX3
-rw-r--r--Documentation/virtual/virtio-spec.txt3210
-rw-r--r--MAINTAINERS1
-rw-r--r--arch/x86/include/asm/lguest.h17
-rw-r--r--block/blk-integrity.c2
-rw-r--r--block/blk-merge.c2
-rw-r--r--drivers/Makefile2
-rw-r--r--drivers/block/virtio_blk.c148
-rw-r--r--drivers/char/hw_random/virtio-rng.c2
-rw-r--r--drivers/char/virtio_console.c14
-rw-r--r--drivers/lguest/Kconfig5
-rw-r--r--drivers/lguest/core.c67
-rw-r--r--drivers/lguest/lg.h6
-rw-r--r--drivers/lguest/lguest_user.c6
-rw-r--r--drivers/lguest/page_tables.c567
-rw-r--r--drivers/lguest/x86/core.c7
-rw-r--r--drivers/net/caif/Kconfig14
-rw-r--r--drivers/net/caif/Makefile3
-rw-r--r--drivers/net/caif/caif_virtio.c790
-rw-r--r--drivers/net/virtio_net.c77
-rw-r--r--drivers/rpmsg/virtio_rpmsg_bus.c8
-rw-r--r--drivers/scsi/virtio_scsi.c487
-rw-r--r--drivers/vhost/Kconfig8
-rw-r--r--drivers/vhost/Makefile2
-rw-r--r--drivers/vhost/test.c4
-rw-r--r--drivers/vhost/vringh.c1007
-rw-r--r--drivers/virtio/virtio_balloon.c6
-rw-r--r--drivers/virtio/virtio_ring.c297
-rw-r--r--include/linux/scatterlist.h16
-rw-r--r--include/linux/virtio.h20
-rw-r--r--include/linux/virtio_caif.h24
-rw-r--r--include/linux/virtio_ring.h57
-rw-r--r--include/linux/vringh.h225
-rw-r--r--include/uapi/linux/virtio_balloon.h4
-rw-r--r--include/uapi/linux/virtio_ids.h1
-rw-r--r--net/9p/trans_virtio.c48
-rw-r--r--tools/lguest/lguest.txt2
-rw-r--r--tools/virtio/Makefile10
-rw-r--r--tools/virtio/asm/barrier.h14
-rw-r--r--tools/virtio/linux/bug.h10
-rw-r--r--tools/virtio/linux/err.h26
-rw-r--r--tools/virtio/linux/export.h5
-rw-r--r--tools/virtio/linux/irqreturn.h1
-rw-r--r--tools/virtio/linux/kernel.h112
-rw-r--r--tools/virtio/linux/module.h1
-rw-r--r--tools/virtio/linux/printk.h4
-rw-r--r--tools/virtio/linux/ratelimit.h4
-rw-r--r--tools/virtio/linux/scatterlist.h189
-rw-r--r--tools/virtio/linux/types.h28
-rw-r--r--tools/virtio/linux/uaccess.h50
-rw-r--r--tools/virtio/linux/uio.h3
-rw-r--r--tools/virtio/linux/virtio.h171
-rw-r--r--tools/virtio/linux/virtio_config.h6
-rw-r--r--tools/virtio/linux/virtio_ring.h1
-rw-r--r--tools/virtio/linux/vringh.h1
-rw-r--r--tools/virtio/uapi/linux/uio.h1
-rw-r--r--tools/virtio/uapi/linux/virtio_config.h1
-rw-r--r--tools/virtio/uapi/linux/virtio_ring.h4
-rw-r--r--tools/virtio/virtio_test.c13
-rw-r--r--tools/virtio/vringh_test.c741
60 files changed, 4481 insertions, 4074 deletions
diff --git a/Documentation/virtual/00-INDEX b/Documentation/virtual/00-INDEX
index 924bd462675e..e952d30bbf0f 100644
--- a/Documentation/virtual/00-INDEX
+++ b/Documentation/virtual/00-INDEX
@@ -6,6 +6,3 @@ kvm/
6 - Kernel Virtual Machine. See also http://linux-kvm.org
7uml/
8 - User Mode Linux, builds/runs Linux kernel as a userspace program.
9virtio.txt
10 - Text version of draft virtio spec.
11 See http://ozlabs.org/~rusty/virtio-spec
diff --git a/Documentation/virtual/virtio-spec.txt b/Documentation/virtual/virtio-spec.txt
deleted file mode 100644
index eb094039b50d..000000000000
--- a/Documentation/virtual/virtio-spec.txt
+++ /dev/null
@@ -1,3210 +0,0 @@
1[Generated file: see http://ozlabs.org/~rusty/virtio-spec/]
2Virtio PCI Card Specification
3v0.9.5 DRAFT
4-
5
6Rusty Russell <rusty@rustcorp.com.au> IBM Corporation (Editor)
7
82012 May 7.
9
10Purpose and Description
11
12This document describes the specifications of the “virtio” family
13of PCI devices. These devices
14are found in virtual environments,
15yet by design they are not all that different from physical PCI
16devices, and this document treats them as such. This allows the
17guest to use standard PCI drivers and discovery mechanisms.
18
19The purpose of virtio and this specification is that virtual
20environments and guests should have a straightforward, efficient,
21standard and extensible mechanism for virtual devices, rather
22than boutique per-environment or per-OS mechanisms.
23
24 Straightforward: Virtio PCI devices use normal PCI mechanisms
25 of interrupts and DMA which should be familiar to any device
26 driver author. There is no exotic page-flipping or COW
27 mechanism: it's just a PCI device.[footnote:
28This lack of page-sharing implies that the implementation of the
29device (e.g. the hypervisor or host) needs full access to the
30guest memory. Communication with untrusted parties (i.e.
31inter-guest communication) requires copying.
32]
33
34 Efficient: Virtio PCI devices consist of rings of descriptors
35 for input and output, which are neatly separated to avoid cache
36 effects from both guest and device writing to the same cache
37 lines.
38
39 Standard: Virtio PCI makes no assumptions about the environment
40 in which it operates, beyond supporting PCI. In fact the virtio
41 devices specified in the appendices do not require PCI at all:
42 they have been implemented on non-PCI buses.[footnote:
43The Linux implementation further separates the PCI virtio code
44from the specific virtio drivers: these drivers are shared with
45the non-PCI implementations (currently lguest and S/390).
46]
47
48 Extensible: Virtio PCI devices contain feature bits which are
49 acknowledged by the guest operating system during device setup.
50 This allows forwards and backwards compatibility: the device
51 offers all the features it knows about, and the driver
52 acknowledges those it understands and wishes to use.
53
54 Virtqueues
55
56The mechanism for bulk data transport on virtio PCI devices is
57pretentiously called a virtqueue. Each device can have zero or
58more virtqueues: for example, the network device has one for
59transmit and one for receive.
60
61Each virtqueue occupies two or more physically-contiguous pages
62(defined, for the purposes of this specification, as 4096 bytes),
63and consists of three parts:
64
65
66+-------------------+-----------------------------------+-----------+
67| Descriptor Table | Available Ring (padding) | Used Ring |
68+-------------------+-----------------------------------+-----------+
69
70
71When the driver wants to send a buffer to the device, it fills in
72a slot in the descriptor table (or chains several together), and
73writes the descriptor index into the available ring. It then
74notifies the device. When the device has finished a buffer, it
75writes the descriptor into the used ring, and sends an interrupt.
76
77Specification
78
79 PCI Discovery
80
81Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000
82through 0x103F inclusive is a virtio device[footnote:
83The actual value within this range is ignored
84]. The device must also have a Revision ID of 0 to match this
85specification.
86
87The Subsystem Device ID indicates which virtio device is
88supported by the device. The Subsystem Vendor ID should reflect
89the PCI Vendor ID of the environment (it's currently only used
90for informational purposes by the guest).
91
92
93+----------------------+--------------------+---------------+
94| Subsystem Device ID | Virtio Device | Specification |
95+----------------------+--------------------+---------------+
96+----------------------+--------------------+---------------+
97| 1 | network card | Appendix C |
98+----------------------+--------------------+---------------+
99| 2 | block device | Appendix D |
100+----------------------+--------------------+---------------+
101| 3 | console | Appendix E |
102+----------------------+--------------------+---------------+
103| 4 | entropy source | Appendix F |
104+----------------------+--------------------+---------------+
105| 5 | memory ballooning | Appendix G |
106+----------------------+--------------------+---------------+
107| 6 | ioMemory | - |
108+----------------------+--------------------+---------------+
109| 7 | rpmsg | Appendix H |
110+----------------------+--------------------+---------------+
111| 8 | SCSI host | Appendix I |
112+----------------------+--------------------+---------------+
113| 9 | 9P transport | - |
114+----------------------+--------------------+---------------+
115| 10 | mac80211 wlan | - |
116+----------------------+--------------------+---------------+
117
118
119 Device Configuration
120
121To configure the device, we use the first I/O region of the PCI
122device. This contains a virtio header followed by a
123device-specific region.
124
125There may be different widths of accesses to the I/O region; the “
126natural” access method for each field in the virtio header must
127be used (i.e. 32-bit accesses for 32-bit fields, etc), but the
128device-specific region can be accessed using any width accesses,
129and should obtain the same results.
130
131Note that this is possible because while the virtio header is PCI
132(i.e. little) endian, the device-specific region is encoded in
133the native endian of the guest (where such distinction is
134applicable).
135
136 Device Initialization Sequence<sub:Device-Initialization-Sequence>
137
138We start with an overview of device initialization, then expand
139on the details of the device and how each step is performed.
140
141 Reset the device. This is not required on initial start up.
142
143 The ACKNOWLEDGE status bit is set: we have noticed the device.
144
145 The DRIVER status bit is set: we know how to drive the device.
146
147 Device-specific setup, including reading the Device Feature
148 Bits, discovery of virtqueues for the device, optional MSI-X
149 setup, and reading and possibly writing the virtio
150 configuration space.
151
152 The subset of Device Feature Bits understood by the driver is
153 written to the device.
154
155 The DRIVER_OK status bit is set.
156
157 The device can now be used (ie. buffers added to the
158 virtqueues)[footnote:
159Historically, drivers have used the device before steps 5 and 6.
160This is only allowed if the driver does not use any features
161which would alter this early use of the device.
162]
163
164If any of these steps go irrecoverably wrong, the guest should
165set the FAILED status bit to indicate that it has given up on the
166device (it can reset the device later to restart if desired).
167
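As a minimal sketch (not from the specification text itself), the
status handshake above might look like this in C, assuming a
legacy virtio header at I/O port base iobase and a platform outb()
port accessor; the offset and helper names are illustrative:

#include <stdint.h>

/* Device Status at byte offset 18 of the virtio header (assumed). */
#define VIRTIO_PCI_STATUS         18

/* Device Status bits from the "Device Status" section below. */
#define VIRTIO_STATUS_ACKNOWLEDGE 1
#define VIRTIO_STATUS_DRIVER      2
#define VIRTIO_STATUS_DRIVER_OK   4
#define VIRTIO_STATUS_FAILED      128

extern void outb(uint8_t val, uint16_t port);   /* platform port I/O */

static void virtio_start(uint16_t iobase)
{
        /* 1. Reset the device. */
        outb(0, iobase + VIRTIO_PCI_STATUS);
        /* 2. We have noticed the device. */
        outb(VIRTIO_STATUS_ACKNOWLEDGE, iobase + VIRTIO_PCI_STATUS);
        /* 3. We know how to drive the device. */
        outb(VIRTIO_STATUS_ACKNOWLEDGE | VIRTIO_STATUS_DRIVER,
             iobase + VIRTIO_PCI_STATUS);
        /* 4-5. Device-specific setup and feature negotiation go here;
         *      on an unrecoverable error, write VIRTIO_STATUS_FAILED. */
        /* 6. Ready: buffers may now be added to the virtqueues. */
        outb(VIRTIO_STATUS_ACKNOWLEDGE | VIRTIO_STATUS_DRIVER |
             VIRTIO_STATUS_DRIVER_OK, iobase + VIRTIO_PCI_STATUS);
}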
168We now cover the fields required for general setup in detail.
169
170 Virtio Header
171
172The virtio header looks as follows:
173
174
175+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
176| Bits || 32 | 32 | 32 | 16 | 16 | 16 | 8 | 8 |
177+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
178| Read/Write || R | R+W | R+W | R | R+W | R+W | R+W | R |
179+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
180| Purpose || Device | Guest | Queue | Queue | Queue | Queue | Device | ISR |
181| || Features bits 0:31 | Features bits 0:31 | Address | Size | Select | Notify | Status | Status |
182+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
183
184
185If MSI-X is enabled for the device, two additional fields
186immediately follow this header:[footnote:
187ie. once you enable MSI-X on the device, the other fields move.
188If you turn it off again, they move back!
189]
190
191
192+------------++----------------+--------+
193| Bits || 16 | 16 |
194 +----------------+--------+
195+------------++----------------+--------+
196| Read/Write || R+W | R+W |
197+------------++----------------+--------+
198| Purpose || Configuration | Queue |
199| (MSI-X) || Vector | Vector |
200+------------++----------------+--------+
201
202
203Immediately following these general headers, there may be
204device-specific headers:
205
206
207+------------++--------------------+
208| Bits || Device Specific |
209 +--------------------+
210+------------++--------------------+
211| Read/Write || Device Specific |
212+------------++--------------------+
213| Purpose || Device Specific... |
214| || |
215+------------++--------------------+
216
217
218 Device Status
219
220The Device Status field is updated by the guest to indicate its
221progress. This provides a simple low-level diagnostic: it's most
222useful to imagine them hooked up to traffic lights on the console
223indicating the status of each device.
224
225The device can be reset by writing a 0 to this field, otherwise
226at least one bit should be set:
227
228 ACKNOWLEDGE (1) Indicates that the guest OS has found the
229 device and recognized it as a valid virtio device.
230
231 DRIVER (2) Indicates that the guest OS knows how to drive the
232 device. Under Linux, drivers can be loadable modules so there
233 may be a significant (or infinite) delay before setting this
234 bit.
235
236 DRIVER_OK (4) Indicates that the driver is set up and ready to
237 drive the device.
238
239 FAILED (128) Indicates that something went wrong in the guest,
240 and it has given up on the device. This could be an internal
241 error, or the driver didn't like the device for some reason, or
242 even a fatal error during device operation. The device must be
243 reset before attempting to re-initialize.
244
245 Feature Bits<sub:Feature-Bits>
246
247The first configuration field indicates the features that the
248device supports. The bits are allocated as follows:
249
250 0 to 23 Feature bits for the specific device type
251
252 24 to 31 Feature bits reserved for extensions to the queue and
253 feature negotiation mechanisms
254
255For example, feature bit 0 for a network device (i.e. Subsystem
256Device ID 1) indicates that the device supports checksumming of
257packets.
258
259The feature bits are negotiated: the device lists all the
260features it understands in the Device Features field, and the
261guest writes the subset that it understands into the Guest
262Features field. The only way to renegotiate is to reset the
263device.
264
265In particular, new fields in the device configuration header are
266indicated by offering a feature bit, so the guest can check
267before accessing that part of the configuration space.
268
269This allows for forwards and backwards compatibility: if the
270device is enhanced with a new feature bit, older guests will not
271write that feature bit back to the Guest Features field and it
272can go into backwards compatibility mode. Similarly, if a guest
273is enhanced with a feature that the device doesn't support, it
274will not see that feature bit in the Device Features field and
275can go into backwards compatibility mode (or, for poor
276implementations, set the FAILED Device Status bit).
277
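To illustrate the negotiation described above, a driver might read
and acknowledge feature bits roughly as follows (a sketch only;
the register offsets follow the virtio header table, and the
inl()/outl() port helpers are assumed):

#include <stdint.h>

#define VIRTIO_PCI_HOST_FEATURES  0  /* Device Features bits 0:31 (R)  */
#define VIRTIO_PCI_GUEST_FEATURES 4  /* Guest Features bits 0:31 (R+W) */

extern uint32_t inl(uint16_t port);             /* platform port I/O */
extern void outl(uint32_t val, uint16_t port);

/* Write back only the offered bits this driver understands and wants. */
static uint32_t negotiate_features(uint16_t iobase, uint32_t wanted)
{
        uint32_t offered  = inl(iobase + VIRTIO_PCI_HOST_FEATURES);
        uint32_t accepted = offered & wanted;

        outl(accepted, iobase + VIRTIO_PCI_GUEST_FEATURES);
        return accepted;
}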
278 Configuration/Queue Vectors
279
280When MSI-X capability is present and enabled in the device
281(through standard PCI configuration space) 4 bytes at byte offset
28220 are used to map configuration change and queue interrupts to
283MSI-X vectors. In this case, the ISR Status field is unused, and
284device specific configuration starts at byte offset 24 in virtio
285header structure. When MSI-X capability is not enabled, device
286specific configuration starts at byte offset 20 in virtio header.
287
288Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of
289Configuration/Queue Vector registers, maps interrupts triggered
290by the configuration change/selected queue events respectively to
291the corresponding MSI-X vector. To disable interrupts for a
292specific event type, unmap it by writing a special NO_VECTOR
293value:
294
295/* Vector value used to disable MSI for queue */
296
297#define VIRTIO_MSI_NO_VECTOR 0xffff
298
299Reading these registers returns vector mapped to a given event,
300or NO_VECTOR if unmapped. All queue and configuration change
301events are unmapped by default.
302
303Note that mapping an event to vector might require allocating
304internal device resources, and might fail. Devices report such
305failures by returning the NO_VECTOR value when the relevant
306Vector field is read. After mapping an event to vector, the
307driver must verify success by reading the Vector field value: on
308success, the previously written value is returned, and on
309failure, NO_VECTOR is returned. If a mapping failure is detected,
310the driver can retry mapping with fewer vectors, or disable MSI-X.
311
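A sketch of that verification step (the byte offsets match the 4
bytes at offset 20 mentioned above; the inw()/outw() helpers are
assumed):

#include <stdint.h>

#define VIRTIO_MSI_CONFIG_VECTOR 20     /* Configuration Vector          */
#define VIRTIO_MSI_QUEUE_VECTOR  22     /* Queue Vector (selected queue) */
#define VIRTIO_MSI_NO_VECTOR     0xffff

extern void outw(uint16_t val, uint16_t port);  /* platform port I/O */
extern uint16_t inw(uint16_t port);

/* Returns 0 on success, -1 if the device failed to allocate the vector. */
static int map_config_vector(uint16_t iobase, uint16_t msix_entry)
{
        outw(msix_entry, iobase + VIRTIO_MSI_CONFIG_VECTOR);
        if (inw(iobase + VIRTIO_MSI_CONFIG_VECTOR) == VIRTIO_MSI_NO_VECTOR)
                return -1;  /* retry with fewer vectors, or disable MSI-X */
        return 0;
}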
312 Virtqueue Configuration<sec:Virtqueue-Configuration>
313
314As a device can have zero or more virtqueues for bulk data
315transport (for example, the network driver has two), the driver
316needs to configure them as part of the device-specific
317configuration.
318
319This is done as follows, for each virtqueue a device has:
320
321 Write the virtqueue index (first queue is 0) to the Queue
322 Select field.
323
324 Read the virtqueue size from the Queue Size field, which is
325 always a power of 2. This controls how big the virtqueue is
326 (see below). If this field is 0, the virtqueue does not exist.
327
328 Allocate and zero virtqueue in contiguous physical memory, on a
329 4096 byte alignment. Write the physical address, divided by
330 4096 to the Queue Address field.[footnote:
331The 4096 is based on the x86 page size, but it's also large
332enough to ensure that the separate parts of the virtqueue are on
333separate cache lines.
334]
335
336 Optionally, if MSI-X capability is present and enabled on the
337 device, select a vector to use to request interrupts triggered
338 by virtqueue events. Write the MSI-X Table entry number
339 corresponding to this vector in Queue Vector field. Read the
340 Queue Vector field: on success, previously written value is
341 returned; on failure, NO_VECTOR value is returned.
342
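As a sketch of the per-virtqueue steps above (offsets follow the
virtio header table; the allocator and address-translation
helpers, and vring_size() from the formula just below, are assumed
to be available):

#include <stdint.h>
#include <string.h>

#define VIRTIO_PCI_QUEUE_PFN 8    /* Queue Address (32 bits, R+W) */
#define VIRTIO_PCI_QUEUE_NUM 12   /* Queue Size    (16 bits, R)   */
#define VIRTIO_PCI_QUEUE_SEL 14   /* Queue Select  (16 bits, R+W) */

extern void outw(uint16_t val, uint16_t port);
extern uint16_t inw(uint16_t port);
extern void outl(uint32_t val, uint16_t port);
extern unsigned vring_size(unsigned int qsz);         /* formula below */
extern void *alloc_pages_4k_aligned(unsigned bytes);  /* placeholder   */
extern uint64_t virt_to_phys(void *p);                /* placeholder   */

static int setup_virtqueue(uint16_t iobase, uint16_t index)
{
        uint16_t qsz;
        void *queue;

        outw(index, iobase + VIRTIO_PCI_QUEUE_SEL);
        qsz = inw(iobase + VIRTIO_PCI_QUEUE_NUM);
        if (qsz == 0)
                return -1;               /* this virtqueue does not exist */

        queue = alloc_pages_4k_aligned(vring_size(qsz));
        memset(queue, 0, vring_size(qsz));

        /* Physical address divided by 4096, as required above. */
        outl((uint32_t)(virt_to_phys(queue) / 4096),
             iobase + VIRTIO_PCI_QUEUE_PFN);
        return 0;
}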
343The Queue Size field controls the total number of bytes required
344for the virtqueue according to the following formula:
345
346#define ALIGN(x) (((x) + 4095) & ~4095)
347
348static inline unsigned vring_size(unsigned int qsz)
349
350{
351
352 return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(2
353+ qsz))
354
355 + ALIGN(sizeof(struct vring_used_elem)*qsz);
356
357}
358
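As a worked example (for illustration): with a Queue Size of 256,
this gives ALIGN(16*256 + 2*(2+256)) + ALIGN(8*256) = ALIGN(4612)
+ ALIGN(2048) = 8192 + 4096 = 12288 bytes, i.e. three 4096-byte
pages.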
359This currently wastes some space with padding, but also allows
360future extensions. The virtqueue layout structure looks like this
361(qsz is the Queue Size field, which is a variable, so this code
362won't compile):
363
364struct vring {
365
366 /* The actual descriptors (16 bytes each) */
367
368 struct vring_desc desc[qsz];
369
370
371
372 /* A ring of available descriptor heads with free-running
373index. */
374
375 struct vring_avail avail;
376
377
378
379 // Padding to the next 4096 boundary.
380
381 char pad[];
382
383
384
385 // A ring of used descriptor heads with free-running index.
386
387 struct vring_used used;
388
389};
390
391 A Note on Virtqueue Endianness
392
393Note that the endian of these fields and everything else in the
394virtqueue is the native endian of the guest, not little-endian as
395PCI normally is. This makes for simpler guest code, and it is
396assumed that the host already has to be deeply aware of the guest
397endian so such an “endian-aware” device is not a significant
398issue.
399
400 Descriptor Table
401
402The descriptor table refers to the buffers the guest is using for
403the device. The addresses are physical addresses, and the buffers
404can be chained via the next field. Each descriptor describes a
405buffer which is read-only or write-only, but a chain of
406descriptors can contain both read-only and write-only buffers.
407
408No descriptor chain may be more than 2^32 bytes long in total.
struct vring_desc {
409
410 /* Address (guest-physical). */
411
412 u64 addr;
413
414 /* Length. */
415
416 u32 len;
417
418/* This marks a buffer as continuing via the next field. */
419
420#define VRING_DESC_F_NEXT 1
421
422/* This marks a buffer as write-only (otherwise read-only). */
423
424#define VRING_DESC_F_WRITE 2
425
426/* This means the buffer contains a list of buffer descriptors.
427*/
428
429#define VRING_DESC_F_INDIRECT 4
430
431 /* The flags as indicated above. */
432
433 u16 flags;
434
435 /* Next field if flags & NEXT */
436
437 u16 next;
438
439};
440
441The number of descriptors in the table is specified by the Queue
442Size field for this virtqueue.
443
444 <sub:Indirect-Descriptors>Indirect Descriptors
445
446Some devices benefit by concurrently dispatching a large number
447of large requests. The VIRTIO_RING_F_INDIRECT_DESC feature can be
448used to allow this (see [cha:Reserved-Feature-Bits]). To increase
449ring capacity it is possible to store a table of indirect
450descriptors anywhere in memory, and insert a descriptor in main
451virtqueue (with flags&INDIRECT on) that refers to memory buffer
452containing this indirect descriptor table; fields addr and len
453refer to the indirect table address and length in bytes,
454respectively. The indirect table layout structure looks like this
455(len is the length of the descriptor that refers to this table,
456which is a variable, so this code won't compile):
457
458struct indirect_descriptor_table {
459
460 /* The actual descriptors (16 bytes each) */
461
462 struct vring_desc desc[len / 16];
463
464};
465
466The first indirect descriptor is located at start of the indirect
467descriptor table (index 0), additional indirect descriptors are
468chained by next field. An indirect descriptor without next field
469(with flags&NEXT off) signals the end of the indirect descriptor
470table, and transfers control back to the main virtqueue. An
471indirect descriptor can not refer to another indirect descriptor
472table (flags&INDIRECT must be off). A single indirect descriptor
473table can include both read-only and write-only descriptors;
474write-only flag (flags&WRITE) in the descriptor that refers to it
475is ignored.
476
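For illustration, publishing a two-element indirect table might
look like the sketch below; struct vring_desc and the flag values
repeat the Descriptor Table definitions above, and the
allocator/translation helpers are placeholders:

#include <stdint.h>

#define VRING_DESC_F_NEXT     1
#define VRING_DESC_F_WRITE    2
#define VRING_DESC_F_INDIRECT 4

struct vring_desc {            /* as defined in the Descriptor Table */
        uint64_t addr;
        uint32_t len;
        uint16_t flags;
        uint16_t next;
};

extern void *alloc_dma(unsigned bytes);   /* placeholder allocator */
extern uint64_t virt_to_phys(void *p);    /* placeholder           */

/* Publish a read-only request and a write-only response through one
 * main-ring descriptor whose INDIRECT flag points at a 2-entry table. */
static void fill_indirect(struct vring_desc *main_desc,
                          void *req, uint32_t req_len,
                          void *resp, uint32_t resp_len)
{
        struct vring_desc *tbl = alloc_dma(2 * sizeof(*tbl));

        tbl[0].addr  = virt_to_phys(req);
        tbl[0].len   = req_len;
        tbl[0].flags = VRING_DESC_F_NEXT;   /* read-only, chained       */
        tbl[0].next  = 1;

        tbl[1].addr  = virt_to_phys(resp);
        tbl[1].len   = resp_len;
        tbl[1].flags = VRING_DESC_F_WRITE;  /* write-only, ends table   */
        tbl[1].next  = 0;

        main_desc->addr  = virt_to_phys(tbl);
        main_desc->len   = 2 * sizeof(*tbl);   /* table length in bytes */
        main_desc->flags = VRING_DESC_F_INDIRECT;
}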
477 Available Ring
478
479The available ring refers to what descriptors we are offering the
480device: it refers to the head of a descriptor chain. The “flags”
481field is currently 0 or 1: 1 indicating that we do not need an
482interrupt when the device consumes a descriptor from the
483available ring. Alternatively, the guest can ask the device to
484delay interrupts until an entry with an index specified by the “
485used_event” field is written in the used ring (equivalently,
486until the idx field in the used ring will reach the value
487used_event + 1). The method employed by the device is controlled
488by the VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits]
489). This interrupt suppression is merely an optimization; it may
490not suppress interrupts entirely.
491
492The “idx” field indicates where we would put the next descriptor
493entry (modulo the ring size). This starts at 0, and increases.
494
495struct vring_avail {
496
497#define VRING_AVAIL_F_NO_INTERRUPT 1
498
499 u16 flags;
500
501 u16 idx;
502
503 u16 ring[qsz]; /* qsz is the Queue Size field read from device
504*/
505
506 u16 used_event;
507
508};
509
510 Used Ring
511
512The used ring is where the device returns buffers once it is done
513with them. The flags field can be used by the device to hint that
514no notification is necessary when the guest adds to the available
515ring. Alternatively, the “avail_event” field can be used by the
516device to hint that no notification is necessary until an entry
517with an index specified by the “avail_event” is written in the
518available ring (equivalently, until the idx field in the
519available ring will reach the value avail_event + 1). The method
520employed by the device is controlled by the guest through the
521VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits]
522). [footnote:
523These fields are kept here because this is the only part of the
524virtqueue written by the device
525].
526
527Each entry in the ring is a pair: the head entry of the
528descriptor chain describing the buffer (this matches an entry
529placed in the available ring by the guest earlier), and the total
530of bytes written into the buffer. The latter is extremely useful
531for guests using untrusted buffers: if you do not know exactly
532how much has been written by the device, you usually have to zero
533the buffer to ensure no data leakage occurs.
534
535/* u32 is used here for ids for padding reasons. */
536
537struct vring_used_elem {
538
539 /* Index of start of used descriptor chain. */
540
541 u32 id;
542
543 /* Total length of the descriptor chain which was used
544(written to) */
545
546 u32 len;
547
548};
549
550
551
552struct vring_used {
553
554#define VRING_USED_F_NO_NOTIFY 1
555
556 u16 flags;
557
558 u16 idx;
559
560 struct vring_used_elem ring[qsz];
561
562 u16 avail_event;
563
564};
565
566 Helpers for Managing Virtqueues
567
568The Linux Kernel Source code contains the definitions above and
569helper routines in a more usable form, in
570include/linux/virtio_ring.h. This was explicitly licensed by IBM
571and Red Hat under the (3-clause) BSD license so that it can be
572freely used by all other projects, and is reproduced (with slight
573variation to remove Linux assumptions) in Appendix A.
574
575 Device Operation<sec:Device-Operation>
576
577There are two parts to device operation: supplying new buffers to
578the device, and processing used buffers from the device. As an
579example, the virtio network device has two virtqueues: the
580transmit virtqueue and the receive virtqueue. The driver adds
581outgoing (read-only) packets to the transmit virtqueue, and then
582frees them after they are used. Similarly, incoming (write-only)
583buffers are added to the receive virtqueue, and processed after
584they are used.
585
586 Supplying Buffers to The Device
587
588Actual transfer of buffers from the guest OS to the device
589operates as follows:
590
591 Place the buffer(s) into free descriptor(s).
592
593 If there are no free descriptors, the guest may choose to
594 notify the device even if notifications are suppressed (to
595 reduce latency).[footnote:
596The Linux drivers do this only for read-only buffers: for
597write-only buffers, it is assumed that the driver is merely
598trying to keep the receive buffer ring full, and no notification
599of this expected condition is necessary.
600]
601
602 Place the id of the buffer in the next ring entry of the
603 available ring.
604
605 The steps (1) and (2) may be performed repeatedly if batching
606 is possible.
607
608 A memory barrier should be executed to ensure the device sees
609 the updated descriptor table and available ring before the next
610 step.
611
612 The available “idx” field should be increased by the number of
613 entries added to the available ring.
614
615 A memory barrier should be executed to ensure that we update
616 the idx field before checking for notification suppression.
617
618 If notifications are not suppressed, the device should be
619 notified of the new buffers.
620
621Note that the above code does not take precautions against the
622available ring buffer wrapping around: this is not possible since
623the ring buffer is the same size as the descriptor table, so step
624(1) will prevent such a condition.
625
626In addition, the maximum queue size is 32768 (it must be a power
627of 2 which fits in 16 bits), so the 16-bit “idx” value can always
628distinguish between a full and empty buffer.
629
630Here is a description of each stage in more detail.
631
632 Placing Buffers Into The Descriptor Table
633
634A buffer consists of zero or more read-only physically-contiguous
635elements followed by zero or more physically-contiguous
636write-only elements (it must have at least one element). This
637algorithm maps it into the descriptor table:
638
639 for each buffer element, b:
640
641 Get the next free descriptor table entry, d
642
643 Set d.addr to the physical address of the start of b
644
645 Set d.len to the length of b.
646
647 If b is write-only, set d.flags to VRING_DESC_F_WRITE,
648 otherwise 0.
649
650 If there is a buffer element after this:
651
652 Set d.next to the index of the next free descriptor element.
653
654 Set the VRING_DESC_F_NEXT bit in d.flags.
655
656In practice, the d.next fields are usually used to chain free
657descriptors, and a separate count kept to check there are enough
658free descriptors before beginning the mappings.
659
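A sketch of that algorithm, assuming the caller has already
verified that out + in free descriptors are available and that
free descriptors are chained through their next fields (the
element type and names are illustrative):

#include <stdint.h>

#define VRING_DESC_F_NEXT  1
#define VRING_DESC_F_WRITE 2

struct vring_desc {            /* as defined in the Descriptor Table */
        uint64_t addr;
        uint32_t len;
        uint16_t flags;
        uint16_t next;
};

struct buf_elem {              /* one physically-contiguous element */
        uint64_t phys;
        uint32_t len;
};

/* Map 'out' read-only elements followed by 'in' write-only elements,
 * starting at free descriptor 'head'.  Returns the chain head index. */
static uint16_t map_buffer(struct vring_desc *desc, uint16_t head,
                           const struct buf_elem *elem,
                           unsigned out, unsigned in)
{
        unsigned i, total = out + in;
        uint16_t d = head;

        for (i = 0; i < total; i++) {
                desc[d].addr  = elem[i].phys;
                desc[d].len   = elem[i].len;
                desc[d].flags = (i >= out) ? VRING_DESC_F_WRITE : 0;
                if (i + 1 < total) {
                        desc[d].flags |= VRING_DESC_F_NEXT;
                        d = desc[d].next;   /* free list chained via next */
                }
        }
        return head;
}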
660 Updating The Available Ring
661
662The head of the buffer we mapped is the first d in the algorithm
663above. A naive implementation would do the following:
664
665avail->ring[avail->idx % qsz] = head;
666
667However, in general we can add many descriptors before we update
668the “idx” field (at which point they become visible to the
669device), so we keep a counter of how many we've added:
670
671avail->ring[(avail->idx + added++) % qsz] = head;
672
673 Updating The Index Field
674
675Once the idx field of the virtqueue is updated, the device will
676be able to access the descriptor entries we've created and the
677memory they refer to. This is why a memory barrier is generally
678used before the idx update, to ensure it sees the most up-to-date
679copy.
680
681The idx field always increments, and we let it wrap naturally at
68265536:
683
684avail->idx += added;
685
686 <sub:Notifying-The-Device>Notifying The Device
687
688Device notification occurs by writing the 16-bit virtqueue index
689of this virtqueue to the Queue Notify field of the virtio header
690in the first I/O region of the PCI device. This can be expensive,
691however, so the device can suppress such notifications if it
692doesn't need them. We have to be careful to expose the new idx
693value before checking the suppression flag: it's OK to notify
694gratuitously, but not to omit a required notification. So again,
695we use a memory barrier here before reading the flags or the
696avail_event field.
697
698If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated, and if
699the VRING_USED_F_NO_NOTIFY flag is not set, we go ahead and write
700the virtqueue index to the Queue Notify field.
701
702If the VIRTIO_F_RING_EVENT_IDX feature is negotiated, we read the
703avail_event field in the used ring structure. If the
704available index crossed the avail_event field value since the
705last notification, we go ahead and write the virtqueue index to
706the Queue Notify field. The avail_event field wraps naturally at 65536 as well:
707
708(u16)(new_idx - avail_event - 1) < (u16)(new_idx - old_idx)
709
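Putting the two cases together, the notification decision can be
sketched as below; the port helper and parameter plumbing are
assumptions, and used_flags/avail_event are read only after the
memory barrier described above:

#include <stdint.h>

#define VRING_USED_F_NO_NOTIFY  1
#define VIRTIO_PCI_QUEUE_NOTIFY 16          /* Queue Notify offset */

extern void outw(uint16_t val, uint16_t port);  /* platform port I/O */

/* old_idx is avail->idx before the batch was published, new_idx after. */
static void maybe_notify(uint16_t iobase, uint16_t queue_index,
                         int event_idx, uint16_t used_flags,
                         uint16_t avail_event,
                         uint16_t old_idx, uint16_t new_idx)
{
        if (event_idx) {
                if (!((uint16_t)(new_idx - avail_event - 1) <
                      (uint16_t)(new_idx - old_idx)))
                        return;             /* device asked to wait */
        } else if (used_flags & VRING_USED_F_NO_NOTIFY) {
                return;                     /* notifications suppressed */
        }
        outw(queue_index, iobase + VIRTIO_PCI_QUEUE_NOTIFY);
}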
710 <sub:Receiving-Used-Buffers>Receiving Used Buffers From The
711 Device
712
713Once the device has used a buffer (read from or written to it, or
714parts of both, depending on the nature of the virtqueue and the
715device), it sends an interrupt, following an algorithm very
716similar to the algorithm used for the driver to send the device a
717buffer:
718
719 Write the head descriptor number to the next field in the used
720 ring.
721
722 Update the used ring idx.
723
724 Determine whether an interrupt is necessary:
725
726 If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated: check
727 if the VRING_AVAIL_F_NO_INTERRUPT flag is not set in
728 avail->flags
729
730 If the VIRTIO_F_RING_EVENT_IDX feature is negotiated: check
731 whether the used index crossed the used_event field value
732 since the last update. The used_event field wraps naturally
733 at 65536 as well: (u16)(new_idx - used_event - 1) < (u16)(new_idx - old_idx)
734
735 If an interrupt is necessary:
736
737 If MSI-X capability is disabled:
738
739 Set the lower bit of the ISR Status field for the device.
740
741 Send the appropriate PCI interrupt for the device.
742
743 If MSI-X capability is enabled:
744
745 Request the appropriate MSI-X interrupt message for the
746 device, Queue Vector field sets the MSI-X Table entry
747 number.
748
749 If Queue Vector field value is NO_VECTOR, no interrupt
750 message is requested for this event.
751
752The guest interrupt handler should:
753
754 If MSI-X capability is disabled: read the ISR Status field,
755 which will reset it to zero. If the lower bit is zero, the
756 interrupt was not for this device. Otherwise, the guest driver
757 should look through the used rings of each virtqueue for the
758 device, to see if any progress has been made by the device
759 which requires servicing.
760
761 If MSI-X capability is enabled: look through the used rings of
762 each virtqueue mapped to the specific MSI-X vector for the
763 device, to see if any progress has been made by the device
764 which requires servicing.
765
766For each ring, the guest should then disable interrupts by setting
767the VRING_AVAIL_F_NO_INTERRUPT flag in the avail structure, if required.
768It can then process the used ring entries, finally enabling interrupts
769by clearing the VRING_AVAIL_F_NO_INTERRUPT flag or updating the
770used_event field in the available structure. The guest should then
771execute a memory barrier, and then recheck the ring empty
772condition. This is necessary to handle the case where, after the
773last check and before enabling interrupts, an interrupt has been
774suppressed by the device:
775
776vring_disable_interrupts(vq);
777
778for (;;) {
779
780 if (vq->last_seen_used == vring->used.idx) {
781
782 vring_enable_interrupts(vq);
783
784 mb();
785
786 if (vq->last_seen_used == vring->used.idx)
787
788 break;
789
790 }
791
792 struct vring_used_elem *e =
793&vring->used.ring[vq->last_seen_used % vsz];
794
795 process_buffer(e);
796
797 vq->last_seen_used++;
798
799}
800
801 Dealing With Configuration Changes<sub:Dealing-With-Configuration>
802
803Some virtio PCI devices can change the device configuration
804state, as reflected in the virtio header in the PCI configuration
805space. In this case:
806
807 If MSI-X capability is disabled: an interrupt is delivered and
808 the second highest bit is set in the ISR Status field to
809 indicate that the driver should re-examine the configuration
810 space. Note that a single interrupt can indicate both that one
811 or more virtqueue has been used and that the configuration
812 space has changed: even if the config bit is set, virtqueues
813 must be scanned.
814
815 If MSI-X capability is enabled: an interrupt message is
816 requested. The Configuration Vector field sets the MSI-X Table
817 entry number to use. If Configuration Vector field value is
818 NO_VECTOR, no interrupt message is requested for this event.
819
820Creating New Device Types
821
822Various considerations are necessary when creating a new device
823type:
824
825 How Many Virtqueues?
826
827It is possible that a very simple device will operate entirely
828through its configuration space, but most will need at least one
829virtqueue in which it will place requests. A device with both
830input and output (eg. the console and network devices described here)
831needs two queues: one which the driver fills with buffers to
832receive input, and one into which the driver places buffers to
833transmit output.
834
835 What Configuration Space Layout?
836
837Configuration space is generally used for rarely-changing or
838initialization-time parameters. But it is a limited resource, so
839it might be better to use a virtqueue to update configuration
840information (the network device does this for filtering,
841otherwise the table in the config space could potentially be very
842large).
843
844Note that this space is generally the guest's native endian,
845rather than PCI's little-endian.
846
847 What Device Number?
848
849Currently device numbers are assigned quite freely: a simple
850request mail to the author of this document or the Linux
851virtualization mailing list[footnote:
852
853https://lists.linux-foundation.org/mailman/listinfo/virtualization
854] will be sufficient to secure a unique one.
855
856Meanwhile for experimental drivers, use 65535 and work backwards.
857
858 How many MSI-X vectors?
859
860Using the optional MSI-X capability, devices can speed up
861interrupt processing by removing the need for the guest driver to
862read the ISR Status register (which might be an expensive operation),
863reducing interrupt sharing between devices and queues within the
864device, and handling interrupts from multiple CPUs. However, some
865systems impose a limit (which might be as low as 256) on the
866total number of MSI-X vectors that can be allocated to all
867devices. Devices and/or device drivers should take this into
868account, limiting the number of vectors used unless the device is
869expected to cause a high volume of interrupts. Devices can
870control the number of vectors used by limiting the MSI-X Table
871Size or not presenting MSI-X capability in PCI configuration
872space. Drivers can control this by mapping events to as small a
873number of vectors as possible, or by disabling the MSI-X capability
874altogether.
875
876 Message Framing
877
878The descriptors used for a buffer should not affect the semantics
879of the message, except for the total length of the buffer. For
880example, a network buffer consists of a 10 byte header followed
881by the network packet. Whether this is presented in the ring
882descriptor chain as (say) a 10 byte buffer and a 1514 byte
883buffer, or a single 1524 byte buffer, or even three buffers,
884should have no effect.
885
886In particular, no implementation should use the descriptor
887boundaries to determine the size of any header in a request.[footnote:
888The current qemu device implementations mistakenly insist that
889the first descriptor cover the header in these cases exactly, so
890a cautious driver should arrange it so.
891]
892
893 Device Improvements
894
895Any change to configuration space, or new virtqueues, or
896behavioural changes, should be indicated by negotiation of a new
897feature bit. This establishes clarity[footnote:
898Even if it does mean documenting design or implementation
899mistakes!
900] and avoids future expansion problems.
901
902Clusters of functionality which are always implemented together
903can use a single bit, but if one feature makes sense without the
904others they should not be gratuitously grouped together to
905conserve feature bits. We can always extend the spec when the
906first person needs more than 24 feature bits for their device.
907
909
910Appendix A: virtio_ring.h
911
912#ifndef VIRTIO_RING_H
913
914#define VIRTIO_RING_H
915
916/* An interface for efficient virtio implementation.
917
918 *
919
920 * This header is BSD licensed so anyone can use the definitions
921
922 * to implement compatible drivers/servers.
923
924 *
925
926 * Copyright 2007, 2009, IBM Corporation
927
928 * Copyright 2011, Red Hat, Inc
929
930 * All rights reserved.
931
932 *
933
934 * Redistribution and use in source and binary forms, with or
935without
936
937 * modification, are permitted provided that the following
938conditions
939
940 * are met:
941
942 * 1. Redistributions of source code must retain the above
943copyright
944
945 * notice, this list of conditions and the following
946disclaimer.
947
948 * 2. Redistributions in binary form must reproduce the above
949copyright
950
951 * notice, this list of conditions and the following
952disclaimer in the
953
954 * documentation and/or other materials provided with the
955distribution.
956
957 * 3. Neither the name of IBM nor the names of its contributors
958
959 * may be used to endorse or promote products derived from
960this software
961
962 * without specific prior written permission.
963
964 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
965CONTRIBUTORS ``AS IS'' AND
966
967 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
968TO, THE
969
970 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
971PARTICULAR PURPOSE
972
973 * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE
974LIABLE
975
976 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
977CONSEQUENTIAL
978
979 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
980SUBSTITUTE GOODS
981
982 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
983INTERRUPTION)
984
985 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
986CONTRACT, STRICT
987
988 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
989IN ANY WAY
990
991 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
992POSSIBILITY OF
993
994 * SUCH DAMAGE.
995
996 */
997
998
999
1000/* This marks a buffer as continuing via the next field. */
1001
1002#define VRING_DESC_F_NEXT 1
1003
1004/* This marks a buffer as write-only (otherwise read-only). */
1005
1006#define VRING_DESC_F_WRITE 2
1007
1008
1009
1010/* The Host uses this in used->flags to advise the Guest: don't
1011kick me
1012
1013 * when you add a buffer. It's unreliable, so it's simply an
1014
1015 * optimization. Guest will still kick if it's out of buffers.
1016*/
1017
1018#define VRING_USED_F_NO_NOTIFY 1
1019
1020/* The Guest uses this in avail->flags to advise the Host: don't
1021
1022 * interrupt me when you consume a buffer. It's unreliable, so
1023it's
1024
1025 * simply an optimization. */
1026
1027#define VRING_AVAIL_F_NO_INTERRUPT 1
1028
1029
1030
1031/* Virtio ring descriptors: 16 bytes.
1032
1033 * These can chain together via "next". */
1034
1035struct vring_desc {
1036
1037 /* Address (guest-physical). */
1038
1039 uint64_t addr;
1040
1041 /* Length. */
1042
1043 uint32_t len;
1044
1045 /* The flags as indicated above. */
1046
1047 uint16_t flags;
1048
1049 /* We chain unused descriptors via this, too */
1050
1051 uint16_t next;
1052
1053};
1054
1055
1056
1057struct vring_avail {
1058
1059 uint16_t flags;
1060
1061 uint16_t idx;
1062
1063 uint16_t ring[];
1064
1065 uint16_t used_event;
1066
1067};
1068
1069
1070
1071/* u32 is used here for ids for padding reasons. */
1072
1073struct vring_used_elem {
1074
1075 /* Index of start of used descriptor chain. */
1076
1077 uint32_t id;
1078
1079 /* Total length of the descriptor chain which was written
1080to. */
1081
1082 uint32_t len;
1083
1084};
1085
1086
1087
1088struct vring_used {
1089
1090 uint16_t flags;
1091
1092 uint16_t idx;
1093
1094 struct vring_used_elem ring[];
1095
1096 uint16_t avail_event;
1097
1098};
1099
1100
1101
1102struct vring {
1103
1104 unsigned int num;
1105
1106
1107
1108 struct vring_desc *desc;
1109
1110 struct vring_avail *avail;
1111
1112 struct vring_used *used;
1113
1114};
1115
1116
1117
1118/* The standard layout for the ring is a continuous chunk of
1119memory which
1120
1121 * looks like this. We assume num is a power of 2.
1122
1123 *
1124
1125 * struct vring {
1126
1127 * // The actual descriptors (16 bytes each)
1128
1129 * struct vring_desc desc[num];
1130
1131 *
1132
1133 * // A ring of available descriptor heads with free-running
1134index.
1135
1136 * __u16 avail_flags;
1137
1138 * __u16 avail_idx;
1139
1140 * __u16 available[num];
1141
1142 *
1143
1144 * // Padding to the next align boundary.
1145
1146 * char pad[];
1147
1148 *
1149
1150 * // A ring of used descriptor heads with free-running
1151index.
1152
1153 * __u16 used_flags;
1154
1155 * __u16 EVENT_IDX;
1156
1157 * struct vring_used_elem used[num];
1158
1159 * };
1160
1161 * Note: for virtio PCI, align is 4096.
1162
1163 */
1164
1165static inline void vring_init(struct vring *vr, unsigned int num,
1166void *p,
1167
1168 unsigned long align)
1169
1170{
1171
1172 vr->num = num;
1173
1174 vr->desc = p;
1175
1176 vr->avail = p + num*sizeof(struct vring_desc);
1177
1178 vr->used = (void *)(((unsigned long)&vr->avail->ring[num]
1179
1180 + align-1)
1181
1182 & ~(align - 1));
1183
1184}
1185
1186
1187
1188static inline unsigned vring_size(unsigned int num, unsigned long
1189align)
1190
1191{
1192
1193 return ((sizeof(struct vring_desc)*num +
1194sizeof(uint16_t)*(2+num)
1195
1196 + align - 1) & ~(align - 1))
1197
1198 + sizeof(uint16_t)*3 + sizeof(struct
1199vring_used_elem)*num;
1200
1201}
1202
1203
1204
1205static inline int vring_need_event(uint16_t event_idx, uint16_t
1206new_idx, uint16_t old_idx)
1207
1208{
1209
1210 return (uint16_t)(new_idx - event_idx - 1) <
1211(uint16_t)(new_idx - old_idx);
1212
1213}
1214
1215#endif /* VIRTIO_RING_H */
1216
1217<cha:Reserved-Feature-Bits>Appendix B: Reserved Feature Bits
1218
1219Currently there are five device-independent feature bits defined:
1220
1221 VIRTIO_F_NOTIFY_ON_EMPTY (24) Negotiating this feature
1222 indicates that the driver wants an interrupt if the device runs
1223 out of available descriptors on a virtqueue, even though
1224 interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT
1225 flag or the used_event field. An example of this is the
1226 networking driver: it doesn't need to know every time a packet
1227 is transmitted, but it does need to free the transmitted
1228 packets a finite time after they are transmitted. It can avoid
1229 using a timer if the device interrupts it when all the packets
1230 are transmitted.
1231
1232 VIRTIO_F_RING_INDIRECT_DESC (28) Negotiating this feature
1233 indicates that the driver can use descriptors with the
1234 VRING_DESC_F_INDIRECT flag set, as described in [sub:Indirect-Descriptors]
1235 .
1236
1237 VIRTIO_F_RING_EVENT_IDX(29) This feature enables the used_event
1238 and the avail_event fields. If set, it indicates that the
1239 device should ignore the flags field in the available ring
1240 structure. Instead, the used_event field in this structure is
1241 used by guest to suppress device interrupts. Further, the
1242 driver should ignore the flags field in the used ring
1243 structure. Instead, the avail_event field in this structure is
1244 used by the device to suppress notifications. If unset, the
1245 driver should ignore the used_event field; the device should
1246 ignore the avail_event field; the flags field is used.
1247
1248Appendix C: Network Device
1249
1250The virtio network device is a virtual ethernet card, and is the
1251most complex of the devices supported so far by virtio. It has
1252enhanced rapidly and demonstrates clearly how support for new
1253features should be added to an existing device. Empty buffers are
1254placed in one virtqueue for receiving packets, and outgoing
1255packets are enqueued into another for transmission in that order.
1256A third command queue is used to control advanced filtering
1257features.
1258
1259 Configuration
1260
1261 Subsystem Device ID 1
1262
1263 Virtqueues 0:receiveq. 1:transmitq. 2:controlq[footnote:
1264Only if VIRTIO_NET_F_CTRL_VQ set
1265]
1266
1267 Feature bits
1268
1269 VIRTIO_NET_F_CSUM (0) Device handles packets with partial
1270 checksum
1271
1272 VIRTIO_NET_F_GUEST_CSUM (1) Guest handles packets with partial
1273 checksum
1274
1275 VIRTIO_NET_F_MAC (5) Device has given MAC address.
1276
1277 VIRTIO_NET_F_GSO (6) (Deprecated) device handles packets with
1278 any GSO type.[footnote:
1279It was supposed to indicate segmentation offload support, but
1280upon further investigation it became clear that multiple bits
1281were required.
1282]
1283
1284 VIRTIO_NET_F_GUEST_TSO4 (7) Guest can receive TSOv4.
1285
1286 VIRTIO_NET_F_GUEST_TSO6 (8) Guest can receive TSOv6.
1287
1288 VIRTIO_NET_F_GUEST_ECN (9) Guest can receive TSO with ECN.
1289
1290 VIRTIO_NET_F_GUEST_UFO (10) Guest can receive UFO.
1291
1292 VIRTIO_NET_F_HOST_TSO4 (11) Device can receive TSOv4.
1293
1294 VIRTIO_NET_F_HOST_TSO6 (12) Device can receive TSOv6.
1295
1296 VIRTIO_NET_F_HOST_ECN (13) Device can receive TSO with ECN.
1297
1298 VIRTIO_NET_F_HOST_UFO (14) Device can receive UFO.
1299
1300 VIRTIO_NET_F_MRG_RXBUF (15) Guest can merge receive buffers.
1301
1302 VIRTIO_NET_F_STATUS (16) Configuration status field is
1303 available.
1304
1305 VIRTIO_NET_F_CTRL_VQ (17) Control channel is available.
1306
1307 VIRTIO_NET_F_CTRL_RX (18) Control channel RX mode support.
1308
1309 VIRTIO_NET_F_CTRL_VLAN (19) Control channel VLAN filtering.
1310
1311 VIRTIO_NET_F_GUEST_ANNOUNCE(21) Guest can send gratuitous
1312 packets.
1313
1314 Device configuration layout Two configuration fields are
1315 currently defined. The mac address field always exists (though
1316 is only valid if VIRTIO_NET_F_MAC is set), and the status field
1317 only exists if VIRTIO_NET_F_STATUS is set. Two read-only bits
1318 are currently defined for the status field:
1319 VIRTIO_NET_S_LINK_UP and VIRTIO_NET_S_ANNOUNCE.
#define VIRTIO_NET_S_LINK_UP 1
1320
1321#define VIRTIO_NET_S_ANNOUNCE 2
1322
1323
1324
1325struct virtio_net_config {
1326
1327 u8 mac[6];
1328
1329 u16 status;
1330
1331};
1332
1333 Device Initialization
1334
1335 The initialization routine should identify the receive and
1336 transmission virtqueues.
1337
1338 If the VIRTIO_NET_F_MAC feature bit is set, the configuration
1339 space “mac” entry indicates the “physical” address of the
1340 network card, otherwise a private MAC address should be
1341 assigned. All guests are expected to negotiate this feature if
1342 it is set.
1343
1344 If the VIRTIO_NET_F_CTRL_VQ feature bit is negotiated, identify
1345 the control virtqueue.
1346
1347 If the VIRTIO_NET_F_STATUS feature bit is negotiated, the link
1348 status can be read from the bottom bit of the “status” config
1349 field. Otherwise, the link should be assumed active.
1350
1351 The receive virtqueue should be filled with receive buffers.
1352 This is described in detail below in “Setting Up Receive
1353 Buffers”.
1354
1355 A driver can indicate that it will generate checksumless
1356 packets by negotiating the VIRTIO_NET_F_CSUM feature. This “
1357 checksum offload” is a common feature on modern network cards.
1358
1359 If that feature is negotiated[footnote:
1360ie. VIRTIO_NET_F_HOST_TSO* and VIRTIO_NET_F_HOST_UFO are
1361dependent on VIRTIO_NET_F_CSUM; a device which offers the offload
1362features must offer the checksum feature, and a driver which
1363accepts the offload features must accept the checksum feature.
1364Similar logic applies to the VIRTIO_NET_F_GUEST_TSO4 features
1365depending on VIRTIO_NET_F_GUEST_CSUM.
1366], a driver can use TCP or UDP segmentation offload by
1367 negotiating the VIRTIO_NET_F_HOST_TSO4 (IPv4 TCP),
1368 VIRTIO_NET_F_HOST_TSO6 (IPv6 TCP) and VIRTIO_NET_F_HOST_UFO
1369 (UDP fragmentation) features. It should not send TCP packets
1370 requiring segmentation offload which have the Explicit
1371 Congestion Notification bit set, unless the
1372 VIRTIO_NET_F_HOST_ECN feature is negotiated.[footnote:
1373This is a common restriction in real, older network cards.
1374]
1375
1376 The converse features are also available: a driver can save the
1377 virtual device some work by negotiating these features.[footnote:
1378For example, a network packet transported between two guests on
1379the same system may not require checksumming at all, nor
1380segmentation, if both guests are amenable.
1381] The VIRTIO_NET_F_GUEST_CSUM feature indicates that partially
1382 checksummed packets can be received, and if it can do that then
1383 the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
1384 VIRTIO_NET_F_GUEST_UFO and VIRTIO_NET_F_GUEST_ECN are the input
1385 equivalents of the features described above. See “Receiving
1386 Packets” below.
1387
1388 Device Operation
1389
1390Packets are transmitted by placing them in the transmitq, and
1391buffers for incoming packets are placed in the receiveq. In each
1392case, the packet itself is preceded by a header:
1393
1394struct virtio_net_hdr {
1395
1396#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1
1397
1398 u8 flags;
1399
1400#define VIRTIO_NET_HDR_GSO_NONE 0
1401
1402#define VIRTIO_NET_HDR_GSO_TCPV4 1
1403
1404#define VIRTIO_NET_HDR_GSO_UDP 3
1405
1406#define VIRTIO_NET_HDR_GSO_TCPV6 4
1407
1408#define VIRTIO_NET_HDR_GSO_ECN 0x80
1409
1410 u8 gso_type;
1411
1412 u16 hdr_len;
1413
1414 u16 gso_size;
1415
1416 u16 csum_start;
1417
1418 u16 csum_offset;
1419
1420/* Only if VIRTIO_NET_F_MRG_RXBUF: */
1421
1422 u16 num_buffers;
1423
1424};
1425
1426The controlq is used to control device features such as
1427filtering.
1428
1429 Packet Transmission
1430
1431Transmitting a single packet is simple, but varies depending on
1432the different features the driver negotiated.
1433
1434 If the driver negotiated VIRTIO_NET_F_CSUM, and the packet has
1435 not been fully checksummed, then the virtio_net_hdr's fields
1436 are set as follows. Otherwise, the packet must be fully
1437 checksummed, and flags is zero.
1438
1439 flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set,
1440
1441 <ite:csum_start-is-set>csum_start is set to the offset within
1442 the packet to begin checksumming, and
1443
1444 csum_offset indicates how many bytes after the csum_start the
1445 new (16 bit ones' complement) checksum should be placed.[footnote:
1446For example, consider a partially checksummed TCP (IPv4) packet.
1447It will have a 14 byte ethernet header and 20 byte IP header
1448followed by the TCP header (with the TCP checksum field 16 bytes
1449into that header). csum_start will be 14+20 = 34 (the TCP
1450checksum includes the header), and csum_offset will be 16. The
1451value in the TCP checksum field should be initialized to the sum
1452of the TCP pseudo header, so that replacing it by the ones'
1453complement checksum of the TCP header and body will give the
1454correct result.
1455]
1456
1457 <enu:If-the-driver>If the driver negotiated
1458 VIRTIO_NET_F_HOST_TSO4, TSO6 or UFO, and the packet requires
1459 TCP segmentation or UDP fragmentation, then the “gso_type”
1460 field is set to VIRTIO_NET_HDR_GSO_TCPV4, TCPV6 or UDP.
1461 (Otherwise, it is set to VIRTIO_NET_HDR_GSO_NONE). In this
1462 case, packets larger than 1514 bytes can be transmitted: the
1463 metadata indicates how to replicate the packet header to cut it
1464 into smaller packets. The other gso fields are set:
1465
1466 hdr_len is a hint to the device as to how much of the header
1467 needs to be kept to copy into each packet, usually set to the
1468 length of the headers, including the transport header.[footnote:
1469Due to various bugs in implementations, this field is not useful
1470as a guarantee of the transport header size.
1471]
1472
1473 gso_size is the maximum size of each packet beyond that header
1474 (ie. MSS).
1475
1476 If the driver negotiated the VIRTIO_NET_F_HOST_ECN feature, the
1477 VIRTIO_NET_HDR_GSO_ECN bit may be set in “gso_type” as well,
1478 indicating that the TCP packet has the ECN bit set.[footnote:
1479This case is not handled by some older hardware, so is called out
1480specifically in the protocol.
1481]
1482
1483 If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature,
1484 the num_buffers field is set to zero.
1485
1486 The header and packet are added as one output buffer to the
1487 transmitq, and the device is notified of the new entry (see [sub:Notifying-The-Device]
1488 ).[footnote:
1489Note that the header will be two bytes longer for the
1490VIRTIO_NET_F_MRG_RXBUF case.
1491]
1492
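For example (a sketch only, reusing struct virtio_net_hdr and the
constants defined under Device Operation above), a partially
checksummed IPv4 TCP packet sent without segmentation offload
would carry a header prepared roughly like this:

#include <string.h>

static void prepare_tx_header(struct virtio_net_hdr *hdr)
{
        memset(hdr, 0, sizeof(*hdr));
        hdr->flags       = VIRTIO_NET_HDR_F_NEEDS_CSUM;
        hdr->gso_type    = VIRTIO_NET_HDR_GSO_NONE;
        hdr->csum_start  = 14 + 20; /* ethernet + IPv4 headers (see footnote) */
        hdr->csum_offset = 16;      /* TCP checksum offset within TCP header  */
}

The header and packet are then queued as one output buffer on the
transmitq, as described in the last step above.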
1493 Packet Transmission Interrupt
1494
1495Often a driver will suppress transmission interrupts using the
1496VRING_AVAIL_F_NO_INTERRUPT flag (see [sub:Receiving-Used-Buffers]
1497) and check for used packets in the transmit path of following
1498packets. However, it will still receive interrupts if the
1499VIRTIO_F_NOTIFY_ON_EMPTY feature is negotiated, indicating that
1500the transmission queue is completely emptied.
1501
1502The normal behavior in this interrupt handler is to retrieve the
1503new descriptors from the used ring and free the corresponding
1504headers and packets.
1505
1506 Setting Up Receive Buffers
1507
1508It is generally a good idea to keep the receive virtqueue as
1509fully populated as possible: if it runs out, network performance
1510will suffer.
1511
1512If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or
1513VIRTIO_NET_F_GUEST_UFO features are used, the Guest will need to
1514accept packets of up to 65550 bytes long (the maximum size of a
1515TCP or UDP packet, plus the 14 byte ethernet header), otherwise
15161514 bytes. So unless VIRTIO_NET_F_MRG_RXBUF is negotiated, every
1517buffer in the receive queue needs to be at least this length [footnote:
1518Obviously each one can be split across multiple descriptor
1519elements.
1520].
1521
1522If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at
1523least the size of the struct virtio_net_hdr.
1524
1525 Packet Receive Interrupt
1526
1527When a packet is copied into a buffer in the receiveq, the
1528optimal path is to disable further interrupts for the receiveq
1529(see [sub:Receiving-Used-Buffers]) and process packets until no
1530more are found, then re-enable them.
1531
1532Processing an incoming packet involves:
1533
1534 If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature,
1535 then the “num_buffers” field indicates how many descriptors
1536 this packet is spread over (including this one). This allows
1537 receipt of large packets without having to allocate large
1538 buffers. In this case, there will be at least “num_buffers” in
1539 the used ring, and they should be chained together to form a
1540 single packet. The other buffers will not begin with a struct
1541 virtio_net_hdr.
1542
1543 If the VIRTIO_NET_F_MRG_RXBUF feature was not negotiated, or
1544 the “num_buffers” field is one, then the entire packet will be
1545 contained within this buffer, immediately following the struct
1546 virtio_net_hdr.
1547
1548 If the VIRTIO_NET_F_GUEST_CSUM feature was negotiated, the
1549 VIRTIO_NET_HDR_F_NEEDS_CSUM bit in the “flags” field may be
1550 set: if so, the checksum on the packet is incomplete and the “
1551 csum_start” and “csum_offset” fields indicate how to calculate
1552 it (see [ite:csum_start-is-set]).
1553
1554 If the VIRTIO_NET_F_GUEST_TSO4, TSO6 or UFO options were
1555 negotiated, then the “gso_type” may be something other than
1556 VIRTIO_NET_HDR_GSO_NONE, and the “gso_size” field indicates the
1557 desired MSS (see [enu:If-the-driver]).
1558
1559 Control Virtqueue
1560
1561The driver uses the control virtqueue (if VIRTIO_NET_F_CTRL_VQ is
1562negotiated) to send commands to manipulate various features of
1563the device which would not easily map into the configuration
1564space.
1565
1566All commands are of the following form:
1567
1568struct virtio_net_ctrl {
1569
1570 u8 class;
1571
1572 u8 command;
1573
1574 u8 command-specific-data[];
1575
1576 u8 ack;
1577
1578};
1579
1580
1581
1582/* ack values */
1583
1584#define VIRTIO_NET_OK 0
1585
1586#define VIRTIO_NET_ERR 1
1587
1588The class, command and command-specific-data are set by the
1589driver, and the device sets the ack byte. There is little the
1590driver can do except issue a diagnostic if the ack byte is not
1591VIRTIO_NET_OK.
1592
1593 Packet Receive Filtering
1594
1595If the VIRTIO_NET_F_CTRL_RX feature is negotiated, the driver can
1596send control commands for promiscuous mode, multicast receiving,
1597and filtering of MAC addresses.
1598
1599Note that in general, these commands are best-effort: unwanted
1600packets may still arrive.
1601
1602 Setting Promiscuous Mode
1603
1604#define VIRTIO_NET_CTRL_RX 0
1605
1606 #define VIRTIO_NET_CTRL_RX_PROMISC 0
1607
1608 #define VIRTIO_NET_CTRL_RX_ALLMULTI 1
1609
1610The class VIRTIO_NET_CTRL_RX has two commands:
1611VIRTIO_NET_CTRL_RX_PROMISC turns promiscuous mode on and off, and
1612VIRTIO_NET_CTRL_RX_ALLMULTI turns all-multicast receive on and
1613off. The command-specific-data is one byte containing 0 (off) or
16141 (on).
1615
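For example, a driver could lay out a promiscuous-mode command as in the sketch below. The two-byte header struct and the virtqueue_send_ctrl() helper are illustrative stand-ins for the driver's own scatterlist and virtqueue plumbing, not something defined by this specification:

/* Illustrative sketch only. */
struct virtio_net_ctrl_hdr {
        u8 class;
        u8 cmd;
};

static int set_promisc(bool on)
{
        struct virtio_net_ctrl_hdr hdr;
        u8 onoff = on ? 1 : 0;   /* the one-byte command-specific-data */
        u8 ack = VIRTIO_NET_ERR; /* written by the device */

        hdr.class = VIRTIO_NET_CTRL_RX;
        hdr.cmd   = VIRTIO_NET_CTRL_RX_PROMISC;

        /* hdr and onoff go in as read-only buffers, ack as a write-only one. */
        virtqueue_send_ctrl(&hdr, sizeof(hdr), &onoff, sizeof(onoff),
                            &ack, sizeof(ack));
        return ack == VIRTIO_NET_OK ? 0 : -1;
}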
1616 Setting MAC Address Filtering
1617
1618struct virtio_net_ctrl_mac {
1619
1620 u32 entries;
1621
1622 u8 macs[entries][ETH_ALEN];
1623
1624};
1625
1626
1627
1628#define VIRTIO_NET_CTRL_MAC 1
1629
1630 #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0
1631
1632The device can filter incoming packets by any number of
1633destination MAC addresses.[footnote:
1634Since there are no guarantees, it can use a hash filter
1635or silently switch to allmulti or promiscuous mode if it is given
1636too many addresses.
1637] This table is set using the class VIRTIO_NET_CTRL_MAC and the
1638command VIRTIO_NET_CTRL_MAC_TABLE_SET. The command-specific-data
1639is two variable length tables of 6-byte MAC addresses. The first
1640table contains unicast addresses, and the second contains
1641multicast addresses.
1642
1643 VLAN Filtering
1644
1645If the driver negotiates the VIRTIO_NET_F_CTRL_VLAN feature, it
1646can control a VLAN filter table in the device.
1647
1648#define VIRTIO_NET_CTRL_VLAN 2
1649
1650 #define VIRTIO_NET_CTRL_VLAN_ADD 0
1651
1652 #define VIRTIO_NET_CTRL_VLAN_DEL 1
1653
1654Both the VIRTIO_NET_CTRL_VLAN_ADD and VIRTIO_NET_CTRL_VLAN_DEL
1655commands take a 16-bit VLAN id as the command-specific-data.
1656
1657 Gratuitous Packet Sending
1658
1659If the driver negotiates the VIRTIO_NET_F_GUEST_ANNOUNCE feature
1660(which depends on VIRTIO_NET_F_CTRL_VQ), the device can ask the
1661guest to send gratuitous packets; this is usually done after the
1662guest has been physically migrated and needs to announce its
1663presence on the new network links. (As the hypervisor does not
1664know the guest network configuration (eg. tagged vlan), it is
1665simplest to prod the guest in this way.)
1666
1667#define VIRTIO_NET_CTRL_ANNOUNCE 3
1668
1669 #define VIRTIO_NET_CTRL_ANNOUNCE_ACK 0
1670
1671The Guest needs to check the VIRTIO_NET_S_ANNOUNCE bit in the
1672status field when it notices a change of device configuration.
1673The VIRTIO_NET_CTRL_ANNOUNCE_ACK command is used to indicate that
1674the driver has received the notification; the device clears the
1675VIRTIO_NET_S_ANNOUNCE bit in the status field after it receives
1676this command.
1677
1678Processing this notification involves:
1679
1680 Sending the gratuitous packets, or marking that there are
1681 pending gratuitous packets to be sent and letting a deferred
1682 routine send them.
1683
1684 Sending VIRTIO_NET_CTRL_ANNOUNCE_ACK command through control
1685 vq.
1686
1687
1688
1689Appendix D: Block Device
1690
1691The virtio block device is a simple virtual block device (ie.
1692disk). Read and write requests (and other exotic requests) are
1693placed in the queue, and serviced (probably out of order) by the
1694device except where noted.
1695
1696 Configuration
1697
1698 Subsystem Device ID 2
1699
1700 Virtqueues 0:requestq.
1701
1702 Feature bits
1703
1704 VIRTIO_BLK_F_BARRIER (0) Host supports request barriers.
1705
1706 VIRTIO_BLK_F_SIZE_MAX (1) Maximum size of any single segment is
1707 in “size_max”.
1708
1709 VIRTIO_BLK_F_SEG_MAX (2) Maximum number of segments in a
1710 request is in “seg_max”.
1711
1712 VIRTIO_BLK_F_GEOMETRY (4) Disk-style geometry specified in “
1713 geometry”.
1714
1715 VIRTIO_BLK_F_RO (5) Device is read-only.
1716
1717 VIRTIO_BLK_F_BLK_SIZE (6) Block size of disk is in “blk_size”.
1718
1719 VIRTIO_BLK_F_SCSI (7) Device supports scsi packet commands.
1720
1721 VIRTIO_BLK_F_FLUSH (9) Cache flush command support.
1722
1723 Device configuration layout The capacity of the device
1724 (expressed in 512-byte sectors) is always present. The
1725 availability of the others depends on various feature bits
1726 as indicated above. struct virtio_blk_config {
1727
1728 u64 capacity;
1729
1730 u32 size_max;
1731
1732 u32 seg_max;
1733
1734 struct virtio_blk_geometry {
1735
1736 u16 cylinders;
1737
1738 u8 heads;
1739
1740 u8 sectors;
1741
1742 } geometry;
1743
1744 u32 blk_size;
1745
1746
1747
1748};
1749
1750 Device Initialization
1751
1752 The device size should be read from the “capacity”
1753 configuration field. No requests should be submitted which go
1754 beyond this limit.
1755
1756 If the VIRTIO_BLK_F_BLK_SIZE feature is negotiated, the
1757 blk_size field can be read to determine the optimal sector size
1758 for the driver to use. This does not affect the units used in
1759 the protocol (always 512 bytes), but awareness of the correct
1760 value can affect performance.
1761
1762 If the VIRTIO_BLK_F_RO feature is set by the device, any write
1763 requests will fail.
1764
1765 Device Operation
1766
1767The driver queues requests to the virtqueue, and they are used by
1768the device (not necessarily in order). Each request is of the form:
1769
1770struct virtio_blk_req {
1771
1772
1773
1774 u32 type;
1775
1776 u32 ioprio;
1777
1778 u64 sector;
1779
1780 char data[][512];
1781
1782 u8 status;
1783
1784};
1785
1786If the device has the VIRTIO_BLK_F_SCSI feature, it can also support
1787scsi packet command requests; each of these requests is of the form:struct virtio_scsi_pc_req {
1788
1789 u32 type;
1790
1791 u32 ioprio;
1792
1793 u64 sector;
1794
1795 char cmd[];
1796
1797 char data[][512];
1798
1799#define SCSI_SENSE_BUFFERSIZE 96
1800
1801 u8 sense[SCSI_SENSE_BUFFERSIZE];
1802
1803 u32 errors;
1804
1805 u32 data_len;
1806
1807 u32 sense_len;
1808
1809 u32 residual;
1810
1811 u8 status;
1812
1813};
1814
1815The type of the request is either a read (VIRTIO_BLK_T_IN), a
1816write (VIRTIO_BLK_T_OUT), a scsi packet command
1817(VIRTIO_BLK_T_SCSI_CMD or VIRTIO_BLK_T_SCSI_CMD_OUT[footnote:
1818the SCSI_CMD and SCSI_CMD_OUT types are equivalent, the device
1819does not distinguish between them
1820]) or a flush (VIRTIO_BLK_T_FLUSH or VIRTIO_BLK_T_FLUSH_OUT[footnote:
1821the FLUSH and FLUSH_OUT types are equivalent, the device does not
1822distinguish between them
1823]). If the device has the VIRTIO_BLK_F_BARRIER feature, the high bit
1824(VIRTIO_BLK_T_BARRIER) indicates that this request acts as a
1825barrier: all preceding requests must be complete before
1826this one, and all following requests must not be started until
1827this is complete. Note that a barrier does not flush caches in
1828the underlying backend device in the host, and thus does not
1829serve as a data consistency guarantee. The driver must use a
1830FLUSH request to flush the host cache.
1831
1832#define VIRTIO_BLK_T_IN 0
1833
1834#define VIRTIO_BLK_T_OUT 1
1835
1836#define VIRTIO_BLK_T_SCSI_CMD 2
1837
1838#define VIRTIO_BLK_T_SCSI_CMD_OUT 3
1839
1840#define VIRTIO_BLK_T_FLUSH 4
1841
1842#define VIRTIO_BLK_T_FLUSH_OUT 5
1843
1844#define VIRTIO_BLK_T_BARRIER 0x80000000
1845
1846The ioprio field is a hint about the relative priorities of
1847requests to the device: higher numbers indicate more important
1848requests.
1849
1850The sector number indicates the offset (multiplied by 512) where
1851the read or write is to occur. This field is unused and set to 0
1852for scsi packet commands and for flush commands.
1853
1854The cmd field is only present for scsi packet command requests,
1855and indicates the command to perform. This field must reside in a
1856single, separate read-only buffer; command length can be derived
1857from the length of this buffer.
1858
1859Note that these first three (four for scsi packet commands)
1860fields are always read-only: the data field is either read-only
1861or write-only, depending on the request. The size of the read or
1862write can be derived from the total size of the request buffers.
1863
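As an illustration of that buffer layout, a 4096-byte read of sector 8 could be described to the device with three buffers (a sketch; the outhdr struct simply names the first three fields of struct virtio_blk_req, and the u8/u32/u64 types are as used throughout this document):

/* Illustrative sketch only: the request split into its usual descriptors. */
struct virtio_blk_outhdr {     /* the first three, read-only fields */
        u32 type;
        u32 ioprio;
        u64 sector;
};

struct virtio_blk_outhdr hdr = {
        .type   = VIRTIO_BLK_T_IN,  /* read */
        .ioprio = 0,
        .sector = 8,                /* read starts at byte offset 8 * 512 */
};
u8 data[4096];                      /* write-only: filled in by the device */
u8 status;                          /* write-only: VIRTIO_BLK_S_* result */

/* Descriptor chain: hdr (read-only), data (write-only), status (write-only).
 * The read length is implied by the size of the data buffer. */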
1864The sense field is only present for scsi packet command requests,
1865and indicates the buffer for scsi sense data.
1866
1867The data_len field is only present for scsi packet command
1868requests, this field is deprecated, and should be ignored by the
1869driver. Historically, devices copied data length there.
1870
1871The sense_len field is only present for scsi packet command
1872requests and indicates the number of bytes actually written to
1873the sense buffer.
1874
1875The residual field is only present for scsi packet command
1876requests and indicates the residual size, calculated as data
1877length - number of bytes actually transferred.
1878
1879The final status byte is written by the device: either
1880VIRTIO_BLK_S_OK for success, VIRTIO_BLK_S_IOERR for host or guest
1881error, or VIRTIO_BLK_S_UNSUPP for a request unsupported by the host:#define VIRTIO_BLK_S_OK 0
1882
1883#define VIRTIO_BLK_S_IOERR 1
1884
1885#define VIRTIO_BLK_S_UNSUPP 2
1886
1887Historically, devices assumed that the fields type, ioprio and
1888sector reside in a single, separate read-only buffer; the fields
1889errors, data_len, sense_len and residual reside in a single,
1890separate write-only buffer; the sense field resides in a
1891separate write-only buffer of size 96 bytes, by itself;
1892and the status field resides in a separate
1893write-only buffer of size 1
1894byte, by itself.
1895
1896Appendix E: Console Device
1897
1898The virtio console device is a simple device for data input and
1899output. A device may have one or more ports. Each port has a pair
1900of input and output virtqueues. Moreover, a device has a pair of
1901control IO virtqueues. The control virtqueues are used to
1902communicate information between the device and the driver about
1903ports being opened and closed on either side of the connection,
1904indication from the host about whether a particular port is a
1905console port, adding new ports, port hot-plug/unplug, etc., and
1906indication from the guest about whether a port or a device was
1907successfully added, port open/close, etc. For data IO, one or
1908more empty buffers are placed in the receive queue for incoming
1909data and outgoing characters are placed in the transmit queue.
1910
1911 Configuration
1912
1913 Subsystem Device ID 3
1914
1915 Virtqueues 0:receiveq(port0). 1:transmitq(port0), 2:control
1916 receiveq[footnote:
1917Ports 2 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set
1918], 3:control transmitq, 4:receiveq(port1), 5:transmitq(port1),
1919 ...
1920
1921 Feature bits
1922
1923 VIRTIO_CONSOLE_F_SIZE (0) Configuration cols and rows fields
1924 are valid.
1925
1926 VIRTIO_CONSOLE_F_MULTIPORT(1) Device has support for multiple
1927 ports; configuration fields nr_ports and max_nr_ports are
1928 valid and control virtqueues will be used.
1929
1930 Device configuration layout The size of the console is supplied
1931 in the configuration space if the VIRTIO_CONSOLE_F_SIZE feature
1932 is set. Furthermore, if the VIRTIO_CONSOLE_F_MULTIPORT feature
1933 is set, the maximum number of ports supported by the device can
1934 be fetched.struct virtio_console_config {
1935
1936 u16 cols;
1937
1938 u16 rows;
1939
1940
1941
1942 u32 max_nr_ports;
1943
1944};
1945
1946 Device Initialization
1947
1948 If the VIRTIO_CONSOLE_F_SIZE feature is negotiated, the driver
1949 can read the console dimensions from the configuration fields.
1950
1951 If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the
1952 driver can spawn multiple ports, not all of which may be
1953 attached to a console. Some could be generic ports. In this
1954 case, the control virtqueues are enabled and according to the
1955 max_nr_ports configuration-space value, the appropriate number
1956 of virtqueues are created. A control message indicating the
1957 driver is ready is sent to the host. The host can then send
1958 control messages for adding new ports to the device. After
1959 creating and initializing each port, a
1960 VIRTIO_CONSOLE_PORT_READY control message is sent to the host
1961 for that port so the host can let us know of any additional
1962 configuration options set for that port.
1963
1964 The receiveq for each port is populated with one or more
1965 receive buffers.
1966
1967 Device Operation
1968
1969 For output, a buffer containing the characters is placed in the
1970 port's transmitq.[footnote:
1971Because this is high importance and low bandwidth, the current
1972Linux implementation polls for the buffer to be used, rather than
1973waiting for an interrupt, simplifying the implementation
1974significantly. However, for generic serial ports with the
1975O_NONBLOCK flag set, the polling limitation is relaxed and the
1976consumed buffers are freed upon the next write or poll call or
1977when a port is closed or hot-unplugged.
1978]
1979
1980 When a buffer is used in the receiveq (signalled by an
1981 interrupt), its contents are the input to the port associated
1982 with the virtqueue for which the notification was received.
1983
1984 If the driver negotiated the VIRTIO_CONSOLE_F_SIZE feature, a
1985 configuration change interrupt may occur. The updated size can
1986 be read from the configuration fields.
1987
1988 If the driver negotiated the VIRTIO_CONSOLE_F_MULTIPORT
1989 feature, active ports are announced by the host using the
1990 VIRTIO_CONSOLE_PORT_ADD control message. The same message is
1991 used for port hot-plug as well.
1992
1993 If the host specified a port `name', a sysfs attribute is
1994 created with the name filled in, so that udev rules can be
1995 written that can create a symlink from the port's name to the
1996 char device for port discovery by applications in the guest.
1997
1998 Changes to ports' state are effected by control messages.
1999 Appropriate action is taken on the port indicated in the
2000 control message. The layout of the control buffer structure
2001 and the associated events are:struct virtio_console_control {
2002
2003 uint32_t id; /* Port number */
2004
2005 uint16_t event; /* The kind of control event */
2006
2007 uint16_t value; /* Extra information for the event */
2008
2009};
2010
2011
2012
2013/* Some events for the internal messages (control packets) */
2014
2015
2016
2017#define VIRTIO_CONSOLE_DEVICE_READY 0
2018
2019#define VIRTIO_CONSOLE_PORT_ADD 1
2020
2021#define VIRTIO_CONSOLE_PORT_REMOVE 2
2022
2023#define VIRTIO_CONSOLE_PORT_READY 3
2024
2025#define VIRTIO_CONSOLE_CONSOLE_PORT 4
2026
2027#define VIRTIO_CONSOLE_RESIZE 5
2028
2029#define VIRTIO_CONSOLE_PORT_OPEN 6
2030
2031#define VIRTIO_CONSOLE_PORT_NAME 7
2032
2033Appendix F: Entropy Device
2034
2035The virtio entropy device supplies high-quality randomness for
2036guest use.
2037
2038 Configuration
2039
2040 Subsystem Device ID 4
2041
2042 Virtqueues 0:requestq.
2043
2044 Feature bits None currently defined
2045
2046 Device configuration layout None currently defined.
2047
2048 Device Initialization
2049
2050 The virtqueue is initialized
2051
2052 Device Operation
2053
2054When the driver requires random bytes, it places the descriptor
2055of one or more buffers in the queue. The device completely fills
2056each buffer with random data.
2057
2058Appendix G: Memory Balloon Device
2059
2060The virtio memory balloon device is a primitive device for
2061managing guest memory: the device asks for a certain amount of
2062memory, and the guest supplies it (or withdraws it, if the device
2063has more than it asks for). This allows the guest to adapt to
2064changes in allowance of underlying physical memory. If the
2065feature is negotiated, the device can also be used to communicate
2066guest memory statistics to the host.
2067
2068 Configuration
2069
2070 Subsystem Device ID 5
2071
2072 Virtqueues 0:inflateq. 1:deflateq. 2:statsq.[footnote:
2073Only if VIRTIO_BALLOON_F_STATS_VQ is set
2074]
2075
2076 Feature bits
2077
2078 VIRTIO_BALLOON_F_MUST_TELL_HOST (0) Host must be told before
2079 pages from the balloon are used.
2080
2081 VIRTIO_BALLOON_F_STATS_VQ (1) A virtqueue for reporting guest
2082 memory statistics is present.
2083
2084 Device configuration layout Both fields of this configuration
2085 are always available. Note that they are little endian, despite
2086 the convention that device fields are guest endian:struct virtio_balloon_config {
2087
2088 u32 num_pages;
2089
2090 u32 actual;
2091
2092};
2093
2094 Device Initialization
2095
2096 The inflate and deflate virtqueues are identified.
2097
2098 If the VIRTIO_BALLOON_F_STATS_VQ feature bit is negotiated:
2099
2100 Identify the stats virtqueue.
2101
2102 Add one empty buffer to the stats virtqueue and notify the
2103 host.
2104
2105Device operation begins immediately.
2106
2107 Device Operation
2108
2109 Memory Ballooning The device is driven by the receipt of a
2110 configuration change interrupt.
2111
2112 The “num_pages” configuration field is examined. If this is
2113 greater than the “actual” number of pages, memory must be given
2114 to the balloon. If it is less than the “actual” number of
2115 pages, memory may be taken back from the balloon for general
2116 use.
2117
2118 To supply memory to the balloon (aka. inflate):
2119
2120 The driver constructs an array of addresses of unused memory
2121 pages. These addresses are divided by 4096[footnote:
2122This is historical, and independent of the guest page size
2123] and the descriptor describing the resulting 32-bit array is
2124 added to the inflateq (a sketch of this step appears below).
2125
2126 To remove memory from the balloon (aka. deflate):
2127
2128 The driver constructs an array of addresses of memory pages it
2129 has previously given to the balloon, as described above. This
2130 descriptor is added to the deflateq.
2131
2132 If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is set, the
2133 guest may not use these requested pages until that descriptor
2134 in the deflateq has been used by the device.
2135
2136 Otherwise, the guest may begin to re-use pages previously given
2137 to the balloon before the device has acknowledged their
2138 withdrawal. [footnote:
2139In this case, deflation advice is merely a courtesy
2140]
2141
2142 In either case, once the device has completed the inflation or
2143 deflation, the “actual” field of the configuration should be
2144 updated to reflect the new number of pages in the balloon.[footnote:
2145As updates to configuration space are not atomic, this field
2146isn't particularly reliable, but can be used to diagnose buggy
2147guests.
2148]
2149
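A sketch of the inflate step described above, assuming a hypothetical add_buf_to_inflateq() stand-in for the driver's virtqueue add and kick calls:

/* Illustrative sketch only: convert page addresses into the 32-bit,
 * 4096-based frame numbers the device expects and queue them. */
#define BALLOON_PAGE_SHIFT 12            /* always 4096, whatever the guest page size */

static void inflate(const unsigned long *page_addrs, unsigned int nr)
{
        u32 pfns[256];
        unsigned int i;

        for (i = 0; i < nr && i < 256; i++)
                pfns[i] = page_addrs[i] >> BALLOON_PAGE_SHIFT;

        /* One read-only descriptor describing the whole array. */
        add_buf_to_inflateq(pfns, i * sizeof(pfns[0]));
}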
2150 Memory Statistics
2151
2152The stats virtqueue is atypical because communication is driven
2153by the device (not the driver). The channel becomes active at
2154driver initialization time when the driver adds an empty buffer
2155and notifies the device. A request for memory statistics proceeds
2156as follows:
2157
2158 The device pushes the buffer onto the used ring and sends an
2159 interrupt.
2160
2161 The driver pops the used buffer and discards it.
2162
2163 The driver collects memory statistics and writes them into a
2164 new buffer.
2165
2166 The driver adds the buffer to the virtqueue and notifies the
2167 device.
2168
2169 The device pops the buffer (retaining it to initiate a
2170 subsequent request) and consumes the statistics.
2171
2172 Memory Statistics Format Each statistic consists of a 16 bit
2173 tag and a 64 bit value. Both quantities are represented in the
2174 native endian of the guest. All statistics are optional and the
2175 driver may choose which ones to supply. To guarantee backwards
2176 compatibility, unsupported statistics should be omitted.
2177
2178 struct virtio_balloon_stat {
2179
2180#define VIRTIO_BALLOON_S_SWAP_IN 0
2181
2182#define VIRTIO_BALLOON_S_SWAP_OUT 1
2183
2184#define VIRTIO_BALLOON_S_MAJFLT 2
2185
2186#define VIRTIO_BALLOON_S_MINFLT 3
2187
2188#define VIRTIO_BALLOON_S_MEMFREE 4
2189
2190#define VIRTIO_BALLOON_S_MEMTOT 5
2191
2192 u16 tag;
2193
2194 u64 val;
2195
2196} __attribute__((packed));
2197
2198 Tags
2199
2200 VIRTIO_BALLOON_S_SWAP_IN The amount of memory that has been
2201 swapped in (in bytes).
2202
2203 VIRTIO_BALLOON_S_SWAP_OUT The amount of memory that has been
2204 swapped out to disk (in bytes).
2205
2206 VIRTIO_BALLOON_S_MAJFLT The number of major page faults that
2207 have occurred.
2208
2209 VIRTIO_BALLOON_S_MINFLT The number of minor page faults that
2210 have occurred.
2211
2212 VIRTIO_BALLOON_S_MEMFREE The amount of memory not being used
2213 for any purpose (in bytes).
2214
2215 VIRTIO_BALLOON_S_MEMTOT The total amount of memory available
2216 (in bytes).
2217
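For instance, a driver answering a statistics request might fill its buffer as in this sketch (the byte counts are assumed to come from the guest's own accounting; unsupported tags are simply left out):

/* Illustrative sketch only. */
static unsigned int fill_stats(struct virtio_balloon_stat *stats,
                               u64 free_bytes, u64 total_bytes)
{
        unsigned int n = 0;

        stats[n].tag = VIRTIO_BALLOON_S_MEMFREE;
        stats[n].val = free_bytes;       /* already expressed in bytes */
        n++;
        stats[n].tag = VIRTIO_BALLOON_S_MEMTOT;
        stats[n].val = total_bytes;
        n++;
        return n;    /* the buffer placed in the statsq is n * sizeof(*stats) bytes */
}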
2218Appendix H: Rpmsg: Remote Processor Messaging
2219
2220Virtio rpmsg devices represent remote processors on the system
2221which run in an asymmetric multi-processing (AMP) configuration, and
2222which are usually used to offload cpu-intensive tasks from the
2223main application processor (a typical SoC methodology).
2224
2225Virtio is being used to communicate with those remote processors;
2226empty buffers are placed in one virtqueue for receiving messages,
2227and non-empty buffers, containing outbound messages, are enqueued
2228in a second virtqueue for transmission.
2229
2230Numerous communication channels can be multiplexed over those two
2231virtqueues, so different entities, running on the application and
2232remote processor, can directly communicate in a point-to-point
2233fashion.
2234
2235 Configuration
2236
2237 Subsystem Device ID 7
2238
2239 Virtqueues 0:receiveq. 1:transmitq.
2240
2241 Feature bits
2242
2243 VIRTIO_RPMSG_F_NS (0) Device sends (and is capable of receiving)
2244 name service messages announcing the creation (or
2245 destruction) of a channel:/**
2246
2247 * struct rpmsg_ns_msg - dynamic name service announcement
2248message
2249
2250 * @name: name of remote service that is published
2251
2252 * @addr: address of remote service that is published
2253
2254 * @flags: indicates whether service is created or destroyed
2255
2256 *
2257
2258 * This message is sent across to publish a new service (or
2259announce
2260
2261 * about its removal). When we receive these messages, an
2262appropriate
2263
2264 * rpmsg channel (i.e device) is created/destroyed.
2265
2266 */
2267
2268struct rpmsg_ns_msg {
2269
2270 char name[RPMSG_NAME_SIZE];
2271
2272 u32 addr;
2273
2274 u32 flags;
2275
2276} __packed;
2277
2278
2279
2280/**
2281
2282 * enum rpmsg_ns_flags - dynamic name service announcement flags
2283
2284 *
2285
2286 * @RPMSG_NS_CREATE: a new remote service was just created
2287
2288 * @RPMSG_NS_DESTROY: a remote service was just destroyed
2289
2290 */
2291
2292enum rpmsg_ns_flags {
2293
2294 RPMSG_NS_CREATE = 0,
2295
2296 RPMSG_NS_DESTROY = 1,
2297
2298};
2299
2300 Device configuration layout
2301
2302At this point none are currently defined.
2303
2304 Device Initialization
2305
2306 The initialization routine should identify the receive and
2307 transmission virtqueues.
2308
2309 The receive virtqueue should be filled with receive buffers.
2310
2311 Device Operation
2312
2313Messages are transmitted by placing them in the transmitq, and
2314buffers for inbound messages are placed in the receiveq. In any
2315case, messages are always preceded by the following header: /**
2316
2317 * struct rpmsg_hdr - common header for all rpmsg messages
2318
2319 * @src: source address
2320
2321 * @dst: destination address
2322
2323 * @reserved: reserved for future use
2324
2325 * @len: length of payload (in bytes)
2326
2327 * @flags: message flags
2328
2329 * @data: @len bytes of message payload data
2330
2331 *
2332
2333 * Every message sent(/received) on the rpmsg bus begins with
2334this header.
2335
2336 */
2337
2338struct rpmsg_hdr {
2339
2340 u32 src;
2341
2342 u32 dst;
2343
2344 u32 reserved;
2345
2346 u16 len;
2347
2348 u16 flags;
2349
2350 u8 data[0];
2351
2352} __packed;
2353
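To illustrate, a message can be assembled in a transmit buffer as follows (a sketch; the buffer is assumed to be large enough for the header plus the payload, and the function name is not part of the protocol):

/* Illustrative sketch only. */
static unsigned int rpmsg_fill(void *buf, u32 src, u32 dst,
                               const void *payload, u16 len)
{
        struct rpmsg_hdr *hdr = buf;

        hdr->src      = src;
        hdr->dst      = dst;
        hdr->reserved = 0;
        hdr->len      = len;
        hdr->flags    = 0;
        memcpy(hdr->data, payload, len); /* payload immediately follows the header */

        return sizeof(*hdr) + len;       /* total length placed in the transmitq */
}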
2354Appendix I: SCSI Host Device
2355
2356The virtio SCSI host device groups together one or more virtual
2357logical units (such as disks), and allows communicating to them
2358using the SCSI protocol. An instance of the device represents a
2359SCSI host to which many targets and LUNs are attached.
2360
2361The virtio SCSI device services two kinds of requests:
2362
2363 command requests for a logical unit;
2364
2365 task management functions related to a logical unit, target or
2366 command.
2367
2368The device is also able to send out notifications about added and
2369removed logical units. Together, these capabilities provide a
2370SCSI transport protocol that uses virtqueues as the transfer
2371medium. In the transport protocol, the virtio driver acts as the
2372initiator, while the virtio SCSI host provides one or more
2373targets that receive and process the requests.
2374
2375 Configuration
2376
2377 Subsystem Device ID 8
2378
2379 Virtqueues 0:controlq; 1:eventq; 2..n:request queues.
2380
2381 Feature bits
2382
2383 VIRTIO_SCSI_F_INOUT (0) A single request can include both
2384 read-only and write-only data buffers.
2385
2386 VIRTIO_SCSI_F_HOTPLUG (1) The host should enable
2387 hot-plug/hot-unplug of new LUNs and targets on the SCSI bus.
2388
2389 Device configuration layout All fields of this configuration
2390 are always available. sense_size and cdb_size are writable by
2391 the guest.struct virtio_scsi_config {
2392
2393 u32 num_queues;
2394
2395 u32 seg_max;
2396
2397 u32 max_sectors;
2398
2399 u32 cmd_per_lun;
2400
2401 u32 event_info_size;
2402
2403 u32 sense_size;
2404
2405 u32 cdb_size;
2406
2407 u16 max_channel;
2408
2409 u16 max_target;
2410
2411 u32 max_lun;
2412
2413};
2414
2415 num_queues is the total number of request virtqueues exposed by
2416 the device. The driver is free to use only one request queue,
2417 or it can use more to achieve better performance.
2418
2419 seg_max is the maximum number of segments that can be in a
2420 command. A bidirectional command can include seg_max input
2421 segments and seg_max output segments.
2422
2423 max_sectors is a hint to the guest about the maximum transfer
2424 size it should use.
2425
2426 cmd_per_lun is a hint to the guest about the maximum number of
2427 linked commands it should send to one LUN. The actual value
2428 to be used is the minimum of cmd_per_lun and the virtqueue
2429 size.
2430
2431 event_info_size is the maximum size that the device will fill
2432 for buffers that the driver places in the eventq. The driver
2433 should always put buffers at least of this size. It is
2434 written by the device depending on the set of negotated
2435 features.
2436
2437 sense_size is the maximum size of the sense data that the
2438 device will write. The default value is written by the device
2439 and will always be 96, but the driver can modify it. It is
2440 restored to the default when the device is reset.
2441
2442 cdb_size is the maximum size of the CDB that the driver will
2443 write. The default value is written by the device and will
2444 always be 32, but the driver can likewise modify it. It is
2445 restored to the default when the device is reset.
2446
2447 max_channel, max_target and max_lun can be used by the driver
2448 as hints to constrain scanning the logical units on the
2449 host.
2450
2451 Device Initialization
2452
2453The initialization routine should first of all discover the
2454device's virtqueues.
2455
2456If the driver uses the eventq, it should then place at least one
2457buffer in the eventq.
2458
2459The driver can immediately issue requests (for example, INQUIRY
2460or REPORT LUNS) or task management functions (for example, I_T
2461RESET).
2462
2463 Device Operation: request queues
2464
2465The driver queues requests to an arbitrary request queue, and
2466they are used by the device on that same queue. It is the
2467responsibility of the driver to ensure strict request ordering
2468for commands placed on different queues, because they will be
2469consumed with no order constraints.
2470
2471Requests have the following format:
2472
2473struct virtio_scsi_req_cmd {
2474
2475 // Read-only
2476
2477 u8 lun[8];
2478
2479 u64 id;
2480
2481 u8 task_attr;
2482
2483 u8 prio;
2484
2485 u8 crn;
2486
2487 char cdb[cdb_size];
2488
2489 char dataout[];
2490
2491 // Write-only part
2492
2493 u32 sense_len;
2494
2495 u32 residual;
2496
2497 u16 status_qualifier;
2498
2499 u8 status;
2500
2501 u8 response;
2502
2503 u8 sense[sense_size];
2504
2505 char datain[];
2506
2507};
2508
2509
2510
2511/* command-specific response values */
2512
2513#define VIRTIO_SCSI_S_OK 0
2514
2515#define VIRTIO_SCSI_S_OVERRUN 1
2516
2517#define VIRTIO_SCSI_S_ABORTED 2
2518
2519#define VIRTIO_SCSI_S_BAD_TARGET 3
2520
2521#define VIRTIO_SCSI_S_RESET 4
2522
2523#define VIRTIO_SCSI_S_BUSY 5
2524
2525#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6
2526
2527#define VIRTIO_SCSI_S_TARGET_FAILURE 7
2528
2529#define VIRTIO_SCSI_S_NEXUS_FAILURE 8
2530
2531#define VIRTIO_SCSI_S_FAILURE 9
2532
2533
2534
2535/* task_attr */
2536
2537#define VIRTIO_SCSI_S_SIMPLE 0
2538
2539#define VIRTIO_SCSI_S_ORDERED 1
2540
2541#define VIRTIO_SCSI_S_HEAD 2
2542
2543#define VIRTIO_SCSI_S_ACA 3
2544
2545The lun field addresses a target and logical unit in the
2546virtio-scsi device's SCSI domain. The only supported format for
2547the LUN field is: first byte set to 1, second byte set to target,
2548third and fourth byte representing a single level LUN structure,
2549followed by four zero bytes. With this representation, a
2550virtio-scsi device can serve up to 256 targets and 16384 LUNs per
2551target.
2552
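In code, this addressing format can be produced as in the sketch below, using the SAM flat-space encoding for the single level LUN structure (illustrative only):

/* Illustrative sketch only: encode target and LUN into the 8-byte lun field. */
static void virtio_scsi_set_lun(u8 lun[8], u8 target, u16 unit)
{
        lun[0] = 1;                  /* fixed first byte */
        lun[1] = target;             /* up to 256 targets */
        lun[2] = (unit >> 8) | 0x40; /* single level LUN, flat-space addressing */
        lun[3] = unit & 0xff;        /* up to 16384 LUNs per target */
        memset(&lun[4], 0, 4);       /* remaining four bytes are zero */
}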
2553The id field is the command identifier (“tag”).
2554
2555task_attr, prio and crn should be left to zero. task_attr defines
2556the task attribute as in the table above, but all task attributes
2557may be mapped to SIMPLE by the device; crn may also be provided
2558by clients, but is generally expected to be 0. The maximum CRN
2559value defined by the protocol is 255, since CRN is stored in an
25608-bit integer.
2561
2562All of these fields are defined in SAM. They are always
2563read-only, as are the cdb and dataout fields. The cdb_size is
2564taken from the configuration space.
2565
2566sense and subsequent fields are always write-only. The sense_len
2567field indicates the number of bytes actually written to the sense
2568buffer. The residual field indicates the residual size,
2569calculated as “data_length - number_of_transferred_bytes”, for
2570read or write operations. For bidirectional commands, the
2571number_of_transferred_bytes includes both read and written bytes.
2572A residual field that is less than the size of datain means that
2573the dataout field was processed entirely. A residual field that
2574exceeds the size of datain means that the dataout field was
2575processed partially and the datain field was not processed at
2576all.
2577
2578The status byte is written by the device to be the status code as
2579defined in SAM.
2580
2581The response byte is written by the device to be one of the
2582following:
2583
2584 VIRTIO_SCSI_S_OK when the request was completed and the status
2585 byte is filled with a SCSI status code (not necessarily
2586 "GOOD").
2587
2588 VIRTIO_SCSI_S_OVERRUN if the content of the CDB requires
2589 transferring more data than is available in the data buffers.
2590
2591 VIRTIO_SCSI_S_ABORTED if the request was cancelled due to an
2592 ABORT TASK or ABORT TASK SET task management function.
2593
2594 VIRTIO_SCSI_S_BAD_TARGET if the request was never processed
2595 because the target indicated by the lun field does not exist.
2596
2597 VIRTIO_SCSI_S_RESET if the request was cancelled due to a bus
2598 or device reset (including a task management function).
2599
2600 VIRTIO_SCSI_S_TRANSPORT_FAILURE if the request failed due to a
2601 problem in the connection between the host and the target
2602 (severed link).
2603
2604 VIRTIO_SCSI_S_TARGET_FAILURE if the target is suffering a
2605 failure and the guest should not retry on other paths.
2606
2607 VIRTIO_SCSI_S_NEXUS_FAILURE if the nexus is suffering a failure
2608 but retrying on other paths might yield a different result.
2609
2610 VIRTIO_SCSI_S_BUSY if the request failed but retrying on the
2611 same path should work.
2612
2613 VIRTIO_SCSI_S_FAILURE for other host or guest error. In
2614 particular, if neither dataout nor datain is empty, and the
2615 VIRTIO_SCSI_F_INOUT feature has not been negotiated, the
2616 request will be immediately returned with a response equal to
2617 VIRTIO_SCSI_S_FAILURE.
2618
2619 Device Operation: controlq
2620
2621The controlq is used for other SCSI transport operations.
2622Requests have the following format:
2623
2624struct virtio_scsi_ctrl {
2625
2626 u32 type;
2627
2628 ...
2629
2630 u8 response;
2631
2632};
2633
2634
2635
2636/* response values valid for all commands */
2637
2638#define VIRTIO_SCSI_S_OK 0
2639
2640#define VIRTIO_SCSI_S_BAD_TARGET 3
2641
2642#define VIRTIO_SCSI_S_BUSY 5
2643
2644#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6
2645
2646#define VIRTIO_SCSI_S_TARGET_FAILURE 7
2647
2648#define VIRTIO_SCSI_S_NEXUS_FAILURE 8
2649
2650#define VIRTIO_SCSI_S_FAILURE 9
2651
2652#define VIRTIO_SCSI_S_INCORRECT_LUN 12
2653
2654The type identifies the remaining fields.
2655
2656The following commands are defined:
2657
2658 Task management function
2659#define VIRTIO_SCSI_T_TMF 0
2660
2661
2662
2663#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0
2664
2665#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1
2666
2667#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2
2668
2669#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3
2670
2671#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4
2672
2673#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5
2674
2675#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6
2676
2677#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7
2678
2679
2680
2681struct virtio_scsi_ctrl_tmf
2682
2683{
2684
2685 // Read-only part
2686
2687 u32 type;
2688
2689 u32 subtype;
2690
2691 u8 lun[8];
2692
2693 u64 id;
2694
2695 // Write-only part
2696
2697 u8 response;
2698
2699}
2700
2701
2702
2703/* command-specific response values */
2704
2705#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0
2706
2707#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10
2708
2709#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11
2710
2711 The type is VIRTIO_SCSI_T_TMF. All
2712 fields except response are filled by the driver. The subtype
2713 field must always be specified and identifies the requested
2714 task management function.
2715
2716 Other fields may be irrelevant for the requested TMF; if so,
2717 they are ignored but they should still be present. The lun
2718 field is in the same format specified for request queues; the
2719 single level LUN is ignored when the task management function
2720 addresses a whole I_T nexus. When relevant, the value of the id
2721 field is matched against the id values passed on the requestq.
2722
2723 The outcome of the task management function is written by the
2724 device in the response field. The command-specific response
2725 values map 1-to-1 with those defined in SAM.
2726
2727 Asynchronous notification query
2728#define VIRTIO_SCSI_T_AN_QUERY 1
2729
2730
2731
2732struct virtio_scsi_ctrl_an {
2733
2734 // Read-only part
2735
2736 u32 type;
2737
2738 u8 lun[8];
2739
2740 u32 event_requested;
2741
2742 // Write-only part
2743
2744 u32 event_actual;
2745
2746 u8 response;
2747
2748}
2749
2750
2751
2752#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2
2753
2754#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4
2755
2756#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8
2757
2758#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16
2759
2760#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32
2761
2762#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64
2763
2764 By sending this command, the driver asks the device which
2765 events the given LUN can report, as described in paragraphs 6.6
2766 and A.6 of the SCSI MMC specification. The driver writes the
2767 events it is interested in into the event_requested; the device
2768 responds by writing the events that it supports into
2769 event_actual.
2770
2771 The type is VIRTIO_SCSI_T_AN_QUERY. The lun and event_requested
2772 fields are written by the driver. The event_actual and response
2773 fields are written by the device.
2774
2775 No command-specific values are defined for the response byte.
2776
2777 Asynchronous notification subscription
2778#define VIRTIO_SCSI_T_AN_SUBSCRIBE 2
2779
2780
2781
2782struct virtio_scsi_ctrl_an {
2783
2784 // Read-only part
2785
2786 u32 type;
2787
2788 u8 lun[8];
2789
2790 u32 event_requested;
2791
2792 // Write-only part
2793
2794 u32 event_actual;
2795
2796 u8 response;
2797
2798}
2799
2800 By sending this command, the driver asks the specified LUN to
2801 report events for its physical interface, again as described in
2802 the SCSI MMC specification. The driver writes the events it is
2803 interested in into the event_requested; the device responds by
2804 writing the events that it supports into event_actual.
2805
2806 Event types are the same as for the asynchronous notification
2807 query message.
2808
2809 The type is VIRTIO_SCSI_T_AN_SUBSCRIBE. The lun and
2810 event_requested fields are written by the driver. The
2811 event_actual and response fields are written by the device.
2812
2813 No command-specific values are defined for the response byte.
2814
2815 Device Operation: eventq
2816
2817The eventq is used by the device to report information on logical
2818units that are attached to it. The driver should always leave a
2819few buffers ready in the eventq. In general, the device will not
2820queue events to cope with an empty eventq, and will end up
2821dropping events if it finds no buffer ready. However, when
2822reporting events for many LUNs (e.g. when a whole target
2823disappears), the device can throttle events to avoid dropping
2824them. For this reason, placing 10-15 buffers on the event queue
2825should be enough.
2826
2827Buffers are placed in the eventq and filled by the device when
2828interesting events occur. The buffers should be strictly
2829write-only (device-filled) and the size of the buffers should be
2830at least the value given in the device's configuration
2831information.
2832
2833Buffers returned by the device on the eventq will be referred to
2834as "events" in the rest of this section. Events have the
2835following format:
2836
2837#define VIRTIO_SCSI_T_EVENTS_MISSED 0x80000000
2838
2839
2840
2841struct virtio_scsi_event {
2842
2843 // Write-only part
2844
2845 u32 event;
2846
2847 ...
2848
2849}
2850
2851If bit 31 is set in the event field, the device failed to report
2852an event due to missing buffers. In this case, the driver should
2853poll the logical units for unit attention conditions, and/or do
2854whatever form of bus scan is appropriate for the guest operating
2855system.
2856
2857Other data that the device writes to the buffer depends on the
2858contents of the event field. The following events are defined:
2859
2860 No event
2861#define VIRTIO_SCSI_T_NO_EVENT 0
2862
2863 This event is fired in the following cases:
2864
2865 When the device detects in the eventq a buffer that is shorter
2866 than what is indicated in the configuration field, it might
2867 use it immediately and put this dummy value in the event
2868 field. A well-written driver will never observe this
2869 situation.
2870
2871 When events are dropped, the device may signal this event as
2872 soon as the driver makes a buffer available, in order to
2873 request action from the driver. In this case, of course, this
2874 event will be reported with the VIRTIO_SCSI_T_EVENTS_MISSED
2875 flag.
2876
2877 Transport reset
2878#define VIRTIO_SCSI_T_TRANSPORT_RESET 1
2879
2880
2881
2882struct virtio_scsi_event_reset {
2883
2884 // Write-only part
2885
2886 u32 event;
2887
2888 u8 lun[8];
2889
2890 u32 reason;
2891
2892}
2893
2894
2895
2896#define VIRTIO_SCSI_EVT_RESET_HARD 0
2897
2898#define VIRTIO_SCSI_EVT_RESET_RESCAN 1
2899
2900#define VIRTIO_SCSI_EVT_RESET_REMOVED 2
2901
2902 By sending this event, the device signals that a logical unit
2903 on a target has been reset, including the case of a new device
2904 appearing or disappearing on the bus. The device fills in all
2905 fields. The event field is set to
2906 VIRTIO_SCSI_T_TRANSPORT_RESET. The lun field addresses a
2907 logical unit in the SCSI host.
2908
2909 The reason value is one of the three #define values appearing
2910 above:
2911
2912 VIRTIO_SCSI_EVT_RESET_REMOVED (“LUN/target removed”) is used if
2913 the target or logical unit is no longer able to receive
2914 commands.
2915
2916 VIRTIO_SCSI_EVT_RESET_HARD (“LUN hard reset”) is used if the
2917 logical unit has been reset, but is still present.
2918
2919 VIRTIO_SCSI_EVT_RESET_RESCAN (“rescan LUN/target”) is used if a
2920 target or logical unit has just appeared on the device.
2921
2922 The “removed” and “rescan” events, when sent for LUN 0, may
2923 apply to the entire target. After receiving them the driver
2924 should ask the initiator to rescan the target, in order to
2925 detect the case when an entire target has appeared or
2926 disappeared. These two events will never be reported unless the
2927 VIRTIO_SCSI_F_HOTPLUG feature was negotiated between the host
2928 and the guest.
2929
2930 Events will also be reported via sense codes (this obviously
2931 does not apply to newly appeared buses or targets, since the
2932 application has never discovered them):
2933
2934 “LUN/target removed” maps to sense key ILLEGAL REQUEST, asc
2935 0x25, ascq 0x00 (LOGICAL UNIT NOT SUPPORTED)
2936
2937 “LUN hard reset” maps to sense key UNIT ATTENTION, asc 0x29
2938 (POWER ON, RESET OR BUS DEVICE RESET OCCURRED)
2939
2940 “rescan LUN/target” maps to sense key UNIT ATTENTION, asc 0x3f,
2941 ascq 0x0e (REPORTED LUNS DATA HAS CHANGED)
2942
2943 The preferred way to detect transport reset is always to use
2944 events, because sense codes are only seen by the driver when it
2945 sends a SCSI command to the logical unit or target. However, in
2946 case events are dropped, the initiator will still be able to
2947 synchronize with the actual state of the controller if the
2948 driver asks the initiator to rescan the SCSI bus. During the
2949 rescan, the initiator will be able to observe the above sense
2950 codes, and it will process them as if the driver had
2951 received the equivalent event.
2952
2953 Asynchronous notification
2954#define VIRTIO_SCSI_T_ASYNC_NOTIFY 2
2955
2956
2957
2958struct virtio_scsi_event_an {
2959
2960 // Write-only part
2961
2962 u32 event;
2963
2964 u8 lun[8];
2965
2966 u32 reason;
2967
2968}
2969
2970 By sending this event, the device signals that an asynchronous
2971 event was fired from a physical interface.
2972
2973 All fields are written by the device. The event field is set to
2974 VIRTIO_SCSI_T_ASYNC_NOTIFY. The lun field addresses a logical
2975 unit in the SCSI host. The reason field is a subset of the
2976 events that the driver has subscribed to via the "Asynchronous
2977 notification subscription" command.
2978
2979 When dropped events are reported, the driver should poll for
2980 asynchronous events manually using SCSI commands.
2981
2982Appendix X: virtio-mmio
2983
2984Virtual environments without PCI support (a common situation in
2985embedded device models) might use a simple memory mapped device (“
2986virtio-mmio”) instead of the PCI device.
2987
2988The memory mapped virtio device behaviour is based on the PCI
2989device specification. Therefore most operations, such as device
2990initialization, queue configuration and buffer transfers, are
2991nearly identical. Existing differences are described in the
2992following sections.
2993
2994 Device Initialization
2995
2996Instead of using the PCI IO space for the virtio header, the “
2997virtio-mmio” device provides a set of memory mapped control
2998registers, all 32 bits wide, followed by device-specific
2999configuration space. The following list presents their layout:
3000
3001 Offset from the device base address | Direction | Name
3002 Description
3003
3004 0x000 | R | MagicValue
3005 “virt” string.
3006
3007 0x004 | R | Version
3008 Device version number. Currently must be 1.
3009
3010 0x008 | R | DeviceID
3011 Virtio Subsystem Device ID (ie. 1 for network card).
3012
3013 0x00c | R | VendorID
3014 Virtio Subsystem Vendor ID.
3015
3016 0x010 | R | HostFeatures
3017 Flags representing features the device supports.
3018 Reading from this register returns 32 consecutive flag bits,
3019 first bit depending on the last value written to
3020 the HostFeaturesSel register. Access to this register
3021 returns bits HostFeaturesSel*32 to (HostFeaturesSel*32)+31,
3022 eg. feature bits 0 to 31 if HostFeaturesSel is set to 0
3023 and feature bits 32 to 63 if HostFeaturesSel is set
3024 to 1.
3025 Also see [sub:Feature-Bits]
3026
3027 0x014 | W | HostFeaturesSel
3028 Device (Host) features word selection.
3029 Writing to this register selects a set of 32 device feature bits
3030 accessible by reading from HostFeatures register. Device driver
3031 must write a value to the HostFeaturesSel register before
3032 reading from the HostFeatures register.
3033
3034 0x020 | W | GuestFeatures
3035 Flags representing device features understood and activated by
3036 the driver.
3037 Writing to this register sets 32 consecutive flag bits, first
3038 bit depending on the last value written to GuestFeaturesSel
3039 register. Access to this register sets bits
3040 GuestFeaturesSel*32 to (GuestFeaturesSel*32)+31,
3041 eg. feature bits 0 to 31 if GuestFeaturesSel is set to 0
3042 and feature bits 32 to 63 if GuestFeaturesSel is set
3043 to 1.
3044 Also see [sub:Feature-Bits]
3045
3046 0x024 | W | GuestFeaturesSel
3047 Activated (Guest) features word selection.
3048 Writing to this register selects a set of 32 activated feature
3049 bits accessible by writing to the GuestFeatures register.
3050 Device driver must write a value to the GuestFeaturesSel
3051 register before writing to the GuestFeatures register.
3052
3053 0x028 | W | GuestPageSize
3054 Guest page size.
3055 Device driver must write the guest page size in bytes to the
3056 register during initialization, before any queues are used.
3057 This value must be a power of 2 and is used by the Host to
3058 calculate Guest address of the first queue page (see QueuePFN).
3059
3060 0x030 | W | QueueSel
3061 Virtual queue index (first queue is 0).
3062 Writing to this register selects the virtual queue that the
3063 following operations on QueueNum, QueueAlign and QueuePFN apply
3064 to.
3065
3066 0x034 | R | QueueNumMax
3067 Maximum virtual queue size.
3068 Reading from the register returns the maximum size of the queue
3069 the Host is ready to process or zero (0x0) if the queue is not
3070 available. This applies to the queue selected by writing to
3071 QueueSel and is allowed only when QueuePFN is set to zero
3072 (0x0), so when the queue is not actively used.
3073
3074 0x038 | W | QueueNum
3075 Virtual queue size.
3076 Queue size is the number of elements in the queue, which determines
3077 the size of the descriptor table and of both available and used rings.
3078 Writing to this register notifies the Host what size of the
3079 queue the Guest will use. This applies to the queue selected by
3080 writing to QueueSel.
3081
3082 0x03c | W | QueueAlign
3083 Used Ring alignment in the virtual queue.
3084 Writing to this register notifies the Host about alignment
3085 boundary of the Used Ring in bytes. This value must be a power
3086 of 2 and applies to the queue selected by writing to QueueSel.
3087
3088 0x040 | RW | QueuePFN
3089 Guest physical page number of the virtual queue.
3090 Writing to this register notifies the Host about the location of
3091 the virtual queue in the Guest's physical address space. This value
3092 is the index number of a page starting with the queue
3093 Descriptor Table. Value zero (0x0) means physical address zero
3094 (0x00000000) and is illegal. When the Guest stops using the
3095 queue it must write zero (0x0) to this register.
3096 Reading from this register returns the currently used page
3097 number of the queue, therefore a value other than zero (0x0)
3098 means that the queue is in use.
3099 Both read and write accesses apply to the queue selected by
3100 writing to QueueSel.
3101
3102 0x050 | W | QueueNotify
3103 Queue notifier.
3104 Writing a queue index to this register notifies the Host that
3105 there are new buffers to process in the queue.
3106
3107 0x060 | R | InterruptStatus
3108 Interrupt status.
3109 Reading from this register returns a bit mask of interrupts
3110 asserted by the device. An interrupt is asserted if the
3111 corresponding bit is set, ie. equals one (1).
3112
3113 Bit 0 | Used Ring Update
3114This interrupt is asserted when the Host has updated the Used
3115 Ring in at least one of the active virtual queues.
3116
3117 Bit 1 | Configuration change
3118This interrupt is asserted when configuration of the device has
3119 changed.
3120
3121 0x064 | W | InterruptACK
3122 Interrupt acknowledge.
3123 Writing to this register notifies the Host that the Guest
3124 finished handling interrupts. Set bits in the value clear the
3125 corresponding bits of the InterruptStatus register.
3126
3127 0x070 | RW | Status
3128 Device status.
3129 Reading from this register returns the current device status
3130 flags.
3131 Writing non-zero values to this register sets the status flags,
3132 indicating the Guest progress. Writing zero (0x0) to this
3133 register triggers a device reset.
3134 Also see [sub:Device-Initialization-Sequence]
3135
3136 0x100+ | RW | Config
3137 Device-specific configuration space starts at offset 0x100
3138 and is accessed with byte alignment. Its meaning and size
3139 depends on the device and the driver.
3140
3141Virtual queue size is the number of elements in the queue, which
3142determines the size of the descriptor table and of both available
3143and used rings.
3144
3145The endianness of the registers follows the native endianness of
3146the Guest. Writing to registers described as “R” and reading from
3147registers described as “W” is not permitted and can cause
3148undefined behavior.
3149
3150The device initialization is performed as described in [sub:Device-Initialization-Sequence]
3151 with one exception: the Guest must notify the Host about its
3152page size, writing the size in bytes to the GuestPageSize register
3153before the initialization is finished.
3154
3155The memory mapped virtio devices generate a single interrupt only,
3156therefore no special configuration is required.
3157
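To illustrate the feature registers described above, a driver could read the full 64-bit device feature set and write back the features it accepts roughly as follows. This is a sketch only: the register offsets restate the list above, readl()/writel() stand for the platform's 32-bit MMIO accessors, and base is the mapped register block:

#define VIRTIO_MMIO_HOST_FEATURES      0x010
#define VIRTIO_MMIO_HOST_FEATURES_SEL  0x014
#define VIRTIO_MMIO_GUEST_FEATURES     0x020
#define VIRTIO_MMIO_GUEST_FEATURES_SEL 0x024

static u64 read_host_features(void *base)
{
        u64 features;

        writel(0, base + VIRTIO_MMIO_HOST_FEATURES_SEL);   /* select bits 0-31 */
        features = readl(base + VIRTIO_MMIO_HOST_FEATURES);
        writel(1, base + VIRTIO_MMIO_HOST_FEATURES_SEL);   /* select bits 32-63 */
        features |= (u64)readl(base + VIRTIO_MMIO_HOST_FEATURES) << 32;
        return features;
}

static void write_guest_features(void *base, u64 features)
{
        writel(0, base + VIRTIO_MMIO_GUEST_FEATURES_SEL);
        writel((u32)features, base + VIRTIO_MMIO_GUEST_FEATURES);
        writel(1, base + VIRTIO_MMIO_GUEST_FEATURES_SEL);
        writel((u32)(features >> 32), base + VIRTIO_MMIO_GUEST_FEATURES);
}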
3158 Virtqueue Configuration
3159
3160The virtual queue configuration is performed in a similar way to
3161the one described in [sec:Virtqueue-Configuration] with a few
3162additional operations:
3163
3164 Select the queue writing its index (first queue is 0) to the
3165 QueueSel register.
3166
3167 Check if the queue is not already in use: read the QueuePFN
3168 register; the returned value should be zero (0x0).
3169
3170 Read maximum queue size (number of elements) from the
3171 QueueNumMax register. If the returned value is zero (0x0) the
3172 queue is not available.
3173
3174 Allocate and zero the queue pages in contiguous virtual memory,
3175 aligning the Used Ring to an optimal boundary (usually page
3176 size). Size of the allocated queue may be smaller than or equal
3177 to the maximum size returned by the Host.
3178
3179 Notify the Host about the queue size by writing the size to
3180 QueueNum register.
3181
3182 Notify the Host about the used alignment by writing its value
3183 in bytes to QueueAlign register.
3184
3185 Write the physical number of the first page of the queue to the
3186 QueuePFN register.
3187
3188The queue and the device are ready to begin normal operations
3189now.
3190
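The procedure above can be condensed into a sketch like the following (same assumptions as the earlier register sketch; alloc_queue_pages() is a hypothetical allocator returning the physical address of zeroed, contiguous queue pages, and page_size is the value already written to GuestPageSize):

#define VIRTIO_MMIO_QUEUE_SEL     0x030
#define VIRTIO_MMIO_QUEUE_NUM_MAX 0x034
#define VIRTIO_MMIO_QUEUE_NUM     0x038
#define VIRTIO_MMIO_QUEUE_ALIGN   0x03c
#define VIRTIO_MMIO_QUEUE_PFN     0x040

static int setup_queue(void *base, u32 index, u32 page_size)
{
        u32 num;
        unsigned long queue_phys;

        writel(index, base + VIRTIO_MMIO_QUEUE_SEL);
        if (readl(base + VIRTIO_MMIO_QUEUE_PFN) != 0)
                return -1;                           /* queue already in use */

        num = readl(base + VIRTIO_MMIO_QUEUE_NUM_MAX);
        if (num == 0)
                return -1;                           /* queue not available */

        queue_phys = alloc_queue_pages(num, page_size);

        writel(num, base + VIRTIO_MMIO_QUEUE_NUM);
        writel(page_size, base + VIRTIO_MMIO_QUEUE_ALIGN);  /* Used Ring alignment */
        writel(queue_phys / page_size, base + VIRTIO_MMIO_QUEUE_PFN);
        return 0;
}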
3191 Device Operation
3192
3193The memory mapped virtio device behaves in the same way as
3194described in [sec:Device-Operation], with the following
3195exceptions:
3196
3197 The device is notified about new buffers available in a queue
3198 by writing the queue index to the QueueNotify register instead of the
3199 virtio header in PCI I/O space ([sub:Notifying-The-Device]).
3200
3201 The memory mapped virtio device uses a single, dedicated
3202 interrupt signal, which is raised when at least one of the
3203 interrupts described in the InterruptStatus register
3204 description is asserted. After receiving an interrupt, the
3205 driver must read the InterruptStatus register to check what
3206 caused the interrupt (see the register description). After the
3207 interrupt is handled, the driver must acknowledge it by writing
3208 a bit mask corresponding to the serviced interrupt to the
3209 InterruptACK register.
3210
diff --git a/MAINTAINERS b/MAINTAINERS
index ee468fac7dbf..b57e2765a342 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8743,6 +8743,7 @@ F: drivers/virtio/
8743F: drivers/net/virtio_net.c 8743F: drivers/net/virtio_net.c
8744F: drivers/block/virtio_blk.c 8744F: drivers/block/virtio_blk.c
8745F: include/linux/virtio_*.h 8745F: include/linux/virtio_*.h
8746F: include/uapi/linux/virtio_*.h
8746 8747
8747VIRTIO HOST (VHOST) 8748VIRTIO HOST (VHOST)
8748M: "Michael S. Tsirkin" <mst@redhat.com> 8749M: "Michael S. Tsirkin" <mst@redhat.com>
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 0d97deba1e35..e2d4a4afa8c3 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -11,18 +11,11 @@
11 11
12#define GUEST_PL 1 12#define GUEST_PL 1
13 13
14/* Every guest maps the core switcher code. */ 14/* Page for Switcher text itself, then two pages per cpu */
15#define SHARED_SWITCHER_PAGES \ 15#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids)
16 DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) 16
17/* Pages for switcher itself, then two pages per cpu */ 17/* Where we map the Switcher, in both Host and Guest. */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) 18extern unsigned long switcher_addr;
19
20/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */
21#ifdef CONFIG_X86_PAE
22#define SWITCHER_ADDR 0xFFE00000
23#else
24#define SWITCHER_ADDR 0xFFC00000
25#endif
26 19
27/* Found in switcher.S */ 20/* Found in switcher.S */
28extern unsigned long default_idt_entries[]; 21extern unsigned long default_idt_entries[];
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index dabd221857e1..03cf7179e8ef 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -110,7 +110,7 @@ new_segment:
110 if (!sg) 110 if (!sg)
111 sg = sglist; 111 sg = sglist;
112 else { 112 else {
113 sg->page_link &= ~0x02; 113 sg_unmark_end(sg);
114 sg = sg_next(sg); 114 sg = sg_next(sg);
115 } 115 }
116 116
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 936a110de0b9..5f2448253797 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -143,7 +143,7 @@ new_segment:
143 * termination bit to avoid doing a full 143 * termination bit to avoid doing a full
144 * sg_init_table() in drivers for each command. 144 * sg_init_table() in drivers for each command.
145 */ 145 */
146 (*sg)->page_link &= ~0x02; 146 sg_unmark_end(*sg);
147 *sg = sg_next(*sg); 147 *sg = sg_next(*sg);
148 } 148 }
149 149
diff --git a/drivers/Makefile b/drivers/Makefile
index 33360de63650..8e57688ebd95 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -124,7 +124,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/
124obj-$(CONFIG_OF) += of/ 124obj-$(CONFIG_OF) += of/
125obj-$(CONFIG_SSB) += ssb/ 125obj-$(CONFIG_SSB) += ssb/
126obj-$(CONFIG_BCMA) += bcma/ 126obj-$(CONFIG_BCMA) += bcma/
127obj-$(CONFIG_VHOST_NET) += vhost/ 127obj-$(CONFIG_VHOST_RING) += vhost/
128obj-$(CONFIG_VLYNQ) += vlynq/ 128obj-$(CONFIG_VLYNQ) += vlynq/
129obj-$(CONFIG_STAGING) += staging/ 129obj-$(CONFIG_STAGING) += staging/
130obj-y += platform/ 130obj-y += platform/
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 8ad21a25bc0d..64723953e1c9 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -100,96 +100,103 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
100 return vbr; 100 return vbr;
101} 101}
102 102
103static void virtblk_add_buf_wait(struct virtio_blk *vblk, 103static int __virtblk_add_req(struct virtqueue *vq,
104 struct virtblk_req *vbr, 104 struct virtblk_req *vbr,
105 unsigned long out, 105 struct scatterlist *data_sg,
106 unsigned long in) 106 bool have_data)
107{ 107{
108 DEFINE_WAIT(wait); 108 struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
109 unsigned int num_out = 0, num_in = 0;
110 int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT;
109 111
110 for (;;) { 112 sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
111 prepare_to_wait_exclusive(&vblk->queue_wait, &wait, 113 sgs[num_out++] = &hdr;
112 TASK_UNINTERRUPTIBLE);
113 114
114 spin_lock_irq(vblk->disk->queue->queue_lock); 115 /*
115 if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, 116 * If this is a packet command we need a couple of additional headers.
116 GFP_ATOMIC) < 0) { 117 * Behind the normal outhdr we put a segment with the scsi command
117 spin_unlock_irq(vblk->disk->queue->queue_lock); 118 * block, and before the normal inhdr we put the sense data and the
118 io_schedule(); 119 * inhdr with additional status information.
119 } else { 120 */
120 virtqueue_kick(vblk->vq); 121 if (type == VIRTIO_BLK_T_SCSI_CMD) {
121 spin_unlock_irq(vblk->disk->queue->queue_lock); 122 sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
122 break; 123 sgs[num_out++] = &cmd;
123 } 124 }
124 125
126 if (have_data) {
127 if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
128 sgs[num_out++] = data_sg;
129 else
130 sgs[num_out + num_in++] = data_sg;
125 } 131 }
126 132
127 finish_wait(&vblk->queue_wait, &wait); 133 if (type == VIRTIO_BLK_T_SCSI_CMD) {
134 sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
135 sgs[num_out + num_in++] = &sense;
136 sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
137 sgs[num_out + num_in++] = &inhdr;
138 }
139
140 sg_init_one(&status, &vbr->status, sizeof(vbr->status));
141 sgs[num_out + num_in++] = &status;
142
143 return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
128} 144}
129 145
130static inline void virtblk_add_req(struct virtblk_req *vbr, 146static void virtblk_add_req(struct virtblk_req *vbr, bool have_data)
131 unsigned int out, unsigned int in)
132{ 147{
133 struct virtio_blk *vblk = vbr->vblk; 148 struct virtio_blk *vblk = vbr->vblk;
149 DEFINE_WAIT(wait);
150 int ret;
134 151
135 spin_lock_irq(vblk->disk->queue->queue_lock); 152 spin_lock_irq(vblk->disk->queue->queue_lock);
136 if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, 153 while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg,
137 GFP_ATOMIC) < 0)) { 154 have_data)) < 0)) {
155 prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
156 TASK_UNINTERRUPTIBLE);
157
138 spin_unlock_irq(vblk->disk->queue->queue_lock); 158 spin_unlock_irq(vblk->disk->queue->queue_lock);
139 virtblk_add_buf_wait(vblk, vbr, out, in); 159 io_schedule();
140 return; 160 spin_lock_irq(vblk->disk->queue->queue_lock);
161
162 finish_wait(&vblk->queue_wait, &wait);
141 } 163 }
164
142 virtqueue_kick(vblk->vq); 165 virtqueue_kick(vblk->vq);
143 spin_unlock_irq(vblk->disk->queue->queue_lock); 166 spin_unlock_irq(vblk->disk->queue->queue_lock);
144} 167}
145 168
146static int virtblk_bio_send_flush(struct virtblk_req *vbr) 169static void virtblk_bio_send_flush(struct virtblk_req *vbr)
147{ 170{
148 unsigned int out = 0, in = 0;
149
150 vbr->flags |= VBLK_IS_FLUSH; 171 vbr->flags |= VBLK_IS_FLUSH;
151 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; 172 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
152 vbr->out_hdr.sector = 0; 173 vbr->out_hdr.sector = 0;
153 vbr->out_hdr.ioprio = 0; 174 vbr->out_hdr.ioprio = 0;
154 sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
155 sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status));
156
157 virtblk_add_req(vbr, out, in);
158 175
159 return 0; 176 virtblk_add_req(vbr, false);
160} 177}
161 178
162static int virtblk_bio_send_data(struct virtblk_req *vbr) 179static void virtblk_bio_send_data(struct virtblk_req *vbr)
163{ 180{
164 struct virtio_blk *vblk = vbr->vblk; 181 struct virtio_blk *vblk = vbr->vblk;
165 unsigned int num, out = 0, in = 0;
166 struct bio *bio = vbr->bio; 182 struct bio *bio = vbr->bio;
183 bool have_data;
167 184
168 vbr->flags &= ~VBLK_IS_FLUSH; 185 vbr->flags &= ~VBLK_IS_FLUSH;
169 vbr->out_hdr.type = 0; 186 vbr->out_hdr.type = 0;
170 vbr->out_hdr.sector = bio->bi_sector; 187 vbr->out_hdr.sector = bio->bi_sector;
171 vbr->out_hdr.ioprio = bio_prio(bio); 188 vbr->out_hdr.ioprio = bio_prio(bio);
172 189
173 sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); 190 if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) {
174 191 have_data = true;
175 num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out); 192 if (bio->bi_rw & REQ_WRITE)
176
177 sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
178 sizeof(vbr->status));
179
180 if (num) {
181 if (bio->bi_rw & REQ_WRITE) {
182 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; 193 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
183 out += num; 194 else
184 } else {
185 vbr->out_hdr.type |= VIRTIO_BLK_T_IN; 195 vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
186 in += num; 196 } else
187 } 197 have_data = false;
188 }
189 198
190 virtblk_add_req(vbr, out, in); 199 virtblk_add_req(vbr, have_data);
191
192 return 0;
193} 200}
194 201
195static void virtblk_bio_send_data_work(struct work_struct *work) 202static void virtblk_bio_send_data_work(struct work_struct *work)
@@ -298,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq)
298static bool do_req(struct request_queue *q, struct virtio_blk *vblk, 305static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
299 struct request *req) 306 struct request *req)
300{ 307{
301 unsigned long num, out = 0, in = 0; 308 unsigned int num;
302 struct virtblk_req *vbr; 309 struct virtblk_req *vbr;
303 310
304 vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); 311 vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
@@ -335,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
335 } 342 }
336 } 343 }
337 344
338 sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); 345 num = blk_rq_map_sg(q, vbr->req, vblk->sg);
339
340 /*
341 * If this is a packet command we need a couple of additional headers.
342 * Behind the normal outhdr we put a segment with the scsi command
343 * block, and before the normal inhdr we put the sense data and the
344 * inhdr with additional status information before the normal inhdr.
345 */
346 if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC)
347 sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
348
349 num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
350
351 if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
352 sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
353 sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
354 sizeof(vbr->in_hdr));
355 }
356
357 sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
358 sizeof(vbr->status));
359
360 if (num) { 346 if (num) {
361 if (rq_data_dir(vbr->req) == WRITE) { 347 if (rq_data_dir(vbr->req) == WRITE)
362 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; 348 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
363 out += num; 349 else
364 } else {
365 vbr->out_hdr.type |= VIRTIO_BLK_T_IN; 350 vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
366 in += num;
367 }
368 } 351 }
369 352
370 if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, 353 if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) {
371 GFP_ATOMIC) < 0) {
372 mempool_free(vbr, vblk->pool); 354 mempool_free(vbr, vblk->pool);
373 return false; 355 return false;
374 } 356 }
@@ -539,6 +521,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
539 struct virtio_device *vdev = vblk->vdev; 521 struct virtio_device *vdev = vblk->vdev;
540 struct request_queue *q = vblk->disk->queue; 522 struct request_queue *q = vblk->disk->queue;
541 char cap_str_2[10], cap_str_10[10]; 523 char cap_str_2[10], cap_str_10[10];
524 char *envp[] = { "RESIZE=1", NULL };
542 u64 capacity, size; 525 u64 capacity, size;
543 526
544 mutex_lock(&vblk->config_lock); 527 mutex_lock(&vblk->config_lock);
@@ -568,6 +551,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
568 551
569 set_capacity(vblk->disk, capacity); 552 set_capacity(vblk->disk, capacity);
570 revalidate_disk(vblk->disk); 553 revalidate_disk(vblk->disk);
554 kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
571done: 555done:
572 mutex_unlock(&vblk->config_lock); 556 mutex_unlock(&vblk->config_lock);
573} 557}
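The virtio_blk conversion above is built on the new virtqueue_add_sgs() API from this series: instead of one flat scatterlist with out entries followed by in entries, the caller passes an array of scatterlist pointers plus separate out/in counts. A minimal sketch of the calling convention, with a hypothetical header/status pair standing in for a real request:

#include <linux/scatterlist.h>
#include <linux/virtio.h>

/* Queue one readable header and one writable status byte (illustrative). */
static int example_add_request(struct virtqueue *vq, void *hdr, size_t hdr_len,
			       u8 *status, void *token)
{
	struct scatterlist hdr_sg, status_sg, *sgs[2];
	unsigned int num_out = 0, num_in = 0;

	sg_init_one(&hdr_sg, hdr, hdr_len);
	sgs[num_out++] = &hdr_sg;              /* device reads this */

	sg_init_one(&status_sg, status, sizeof(*status));
	sgs[num_out + num_in++] = &status_sg;  /* device writes this */

	/* All out entries must come before all in entries in sgs[]. */
	return virtqueue_add_sgs(vq, sgs, num_out, num_in, token, GFP_ATOMIC);
}

On success, the token is handed back by virtqueue_get_buf() once the device has consumed the buffers.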
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 6bf4d47324eb..ef46a9cfd832 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size)
47 sg_init_one(&sg, buf, size); 47 sg_init_one(&sg, buf, size);
48 48
49 /* There should always be room for one buffer. */ 49 /* There should always be room for one buffer. */
50 if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0) 50 if (virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL) < 0)
51 BUG(); 51 BUG();
52 52
53 virtqueue_kick(vq); 53 virtqueue_kick(vq);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index ce5f3fc25d6d..1b456fe9b87a 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -78,8 +78,8 @@ struct ports_driver_data {
78}; 78};
79static struct ports_driver_data pdrvdata; 79static struct ports_driver_data pdrvdata;
80 80
81DEFINE_SPINLOCK(pdrvdata_lock); 81static DEFINE_SPINLOCK(pdrvdata_lock);
82DECLARE_COMPLETION(early_console_added); 82static DECLARE_COMPLETION(early_console_added);
83 83
84/* This struct holds information that's relevant only for console ports */ 84/* This struct holds information that's relevant only for console ports */
85struct console { 85struct console {
@@ -503,7 +503,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf)
503 503
504 sg_init_one(sg, buf->buf, buf->size); 504 sg_init_one(sg, buf->buf, buf->size);
505 505
506 ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC); 506 ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC);
507 virtqueue_kick(vq); 507 virtqueue_kick(vq);
508 if (!ret) 508 if (!ret)
509 ret = vq->num_free; 509 ret = vq->num_free;
@@ -572,7 +572,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
572 sg_init_one(sg, &cpkt, sizeof(cpkt)); 572 sg_init_one(sg, &cpkt, sizeof(cpkt));
573 573
574 spin_lock(&portdev->c_ovq_lock); 574 spin_lock(&portdev->c_ovq_lock);
575 if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) { 575 if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) {
576 virtqueue_kick(vq); 576 virtqueue_kick(vq);
577 while (!virtqueue_get_buf(vq, &len)) 577 while (!virtqueue_get_buf(vq, &len))
578 cpu_relax(); 578 cpu_relax();
@@ -622,7 +622,7 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
622 622
623 reclaim_consumed_buffers(port); 623 reclaim_consumed_buffers(port);
624 624
625 err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC); 625 err = virtqueue_add_outbuf(out_vq, sg, nents, data, GFP_ATOMIC);
626 626
627 /* Tell Host to go! */ 627 /* Tell Host to go! */
628 virtqueue_kick(out_vq); 628 virtqueue_kick(out_vq);
@@ -1040,7 +1040,7 @@ static int port_fops_open(struct inode *inode, struct file *filp)
1040 spin_lock_irq(&port->inbuf_lock); 1040 spin_lock_irq(&port->inbuf_lock);
1041 if (port->guest_connected) { 1041 if (port->guest_connected) {
1042 spin_unlock_irq(&port->inbuf_lock); 1042 spin_unlock_irq(&port->inbuf_lock);
1043 ret = -EMFILE; 1043 ret = -EBUSY;
1044 goto out; 1044 goto out;
1045 } 1045 }
1046 1046
@@ -1202,7 +1202,7 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
1202 return hvc_instantiate(0, 0, &hv_ops); 1202 return hvc_instantiate(0, 0, &hv_ops);
1203} 1203}
1204 1204
1205int init_port_console(struct port *port) 1205static int init_port_console(struct port *port)
1206{ 1206{
1207 int ret; 1207 int ret;
1208 1208
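For the common single-direction case, the console and RNG hunks above switch to the new virtqueue_add_inbuf()/virtqueue_add_outbuf() helpers rather than open-coding out/in counts. A small sketch of both directions (buffer and queue names are illustrative):

#include <linux/scatterlist.h>
#include <linux/virtio.h>

/* Hand the device a buffer it may write into (e.g. incoming console data). */
static int example_post_inbuf(struct virtqueue *in_vq, void *buf, size_t len)
{
	struct scatterlist sg;

	sg_init_one(&sg, buf, len);
	return virtqueue_add_inbuf(in_vq, &sg, 1, buf, GFP_KERNEL);
}

/* Hand the device a buffer it will only read (e.g. outgoing console data). */
static int example_post_outbuf(struct virtqueue *out_vq, void *buf, size_t len)
{
	struct scatterlist sg;

	sg_init_one(&sg, buf, len);
	return virtqueue_add_outbuf(out_vq, &sg, 1, buf, GFP_KERNEL);
}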
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 89875ea19ade..ee035ec4526b 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -5,10 +5,9 @@ config LGUEST
5 ---help--- 5 ---help---
6 This is a very simple module which allows you to run 6 This is a very simple module which allows you to run
7 multiple instances of the same Linux kernel, using the 7 multiple instances of the same Linux kernel, using the
8 "lguest" command found in the Documentation/virtual/lguest 8 "lguest" command found in the tools/lguest directory.
9 directory.
10 9
11 Note that "lguest" is pronounced to rhyme with "fell quest", 10 Note that "lguest" is pronounced to rhyme with "fell quest",
12 not "rustyvisor". See Documentation/virtual/lguest/lguest.txt. 11 not "rustyvisor". See tools/lguest/lguest.txt.
13 12
14 If unsure, say N. If curious, say M. If masochistic, say Y. 13 If unsure, say N. If curious, say M. If masochistic, say Y.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index a5ebc0083d87..0bf1e4edf04d 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -20,9 +20,9 @@
20#include <asm/asm-offsets.h> 20#include <asm/asm-offsets.h>
21#include "lg.h" 21#include "lg.h"
22 22
23 23unsigned long switcher_addr;
24struct page **lg_switcher_pages;
24static struct vm_struct *switcher_vma; 25static struct vm_struct *switcher_vma;
25static struct page **switcher_page;
26 26
27/* This One Big lock protects all inter-guest data structures. */ 27/* This One Big lock protects all inter-guest data structures. */
28DEFINE_MUTEX(lguest_lock); 28DEFINE_MUTEX(lguest_lock);
@@ -52,13 +52,21 @@ static __init int map_switcher(void)
52 * easy. 52 * easy.
53 */ 53 */
54 54
55 /* We assume Switcher text fits into a single page. */
56 if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
57 printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
58 end_switcher_text - start_switcher_text);
59 return -EINVAL;
60 }
61
55 /* 62 /*
56 * We allocate an array of struct page pointers. map_vm_area() wants 63 * We allocate an array of struct page pointers. map_vm_area() wants
57 * this, rather than just an array of pages. 64 * this, rather than just an array of pages.
58 */ 65 */
59 switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, 66 lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
60 GFP_KERNEL); 67 * TOTAL_SWITCHER_PAGES,
61 if (!switcher_page) { 68 GFP_KERNEL);
69 if (!lg_switcher_pages) {
62 err = -ENOMEM; 70 err = -ENOMEM;
63 goto out; 71 goto out;
64 } 72 }
@@ -68,32 +76,29 @@ static __init int map_switcher(void)
68 * so we make sure they're zeroed. 76 * so we make sure they're zeroed.
69 */ 77 */
70 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { 78 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
71 switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); 79 lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
72 if (!switcher_page[i]) { 80 if (!lg_switcher_pages[i]) {
73 err = -ENOMEM; 81 err = -ENOMEM;
74 goto free_some_pages; 82 goto free_some_pages;
75 } 83 }
76 } 84 }
77 85
78 /* 86 /*
79 * First we check that the Switcher won't overlap the fixmap area at 87 * We place the Switcher underneath the fixmap area, which is the
80 * the top of memory. It's currently nowhere near, but it could have 88 * highest virtual address we can get. This is important, since we
81 * very strange effects if it ever happened. 89 * tell the Guest it can't access this memory, so we want its ceiling
90 * as high as possible.
82 */ 91 */
83 if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ 92 switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
84 err = -ENOMEM;
85 printk("lguest: mapping switcher would thwack fixmap\n");
86 goto free_pages;
87 }
88 93
89 /* 94 /*
90 * Now we reserve the "virtual memory area" we want: 0xFFC00000 95 * Now we reserve the "virtual memory area" we want. We might
91 * (SWITCHER_ADDR). We might not get it in theory, but in practice 96 * not get it in theory, but in practice it's worked so far.
92 * it's worked so far. The end address needs +1 because __get_vm_area 97 * The end address needs +1 because __get_vm_area allocates an
93 * allocates an extra guard page, so we need space for that. 98 * extra guard page, so we need space for that.
94 */ 99 */
95 switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, 100 switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
96 VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR 101 VM_ALLOC, switcher_addr, switcher_addr
97 + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); 102 + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
98 if (!switcher_vma) { 103 if (!switcher_vma) {
99 err = -ENOMEM; 104 err = -ENOMEM;
@@ -103,12 +108,12 @@ static __init int map_switcher(void)
103 108
104 /* 109 /*
105 * This code actually sets up the pages we've allocated to appear at 110 * This code actually sets up the pages we've allocated to appear at
106 * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the 111 * switcher_addr. map_vm_area() takes the vma we allocated above, the
107 * kind of pages we're mapping (kernel pages), and a pointer to our 112 * kind of pages we're mapping (kernel pages), and a pointer to our
108 * array of struct pages. It increments that pointer, but we don't 113 * array of struct pages. It increments that pointer, but we don't
109 * care. 114 * care.
110 */ 115 */
111 pagep = switcher_page; 116 pagep = lg_switcher_pages;
112 err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); 117 err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
113 if (err) { 118 if (err) {
114 printk("lguest: map_vm_area failed: %i\n", err); 119 printk("lguest: map_vm_area failed: %i\n", err);
@@ -133,8 +138,8 @@ free_pages:
133 i = TOTAL_SWITCHER_PAGES; 138 i = TOTAL_SWITCHER_PAGES;
134free_some_pages: 139free_some_pages:
135 for (--i; i >= 0; i--) 140 for (--i; i >= 0; i--)
136 __free_pages(switcher_page[i], 0); 141 __free_pages(lg_switcher_pages[i], 0);
137 kfree(switcher_page); 142 kfree(lg_switcher_pages);
138out: 143out:
139 return err; 144 return err;
140} 145}
@@ -149,8 +154,8 @@ static void unmap_switcher(void)
149 vunmap(switcher_vma->addr); 154 vunmap(switcher_vma->addr);
150 /* Now we just need to free the pages we copied the switcher into */ 155 /* Now we just need to free the pages we copied the switcher into */
151 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) 156 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
152 __free_pages(switcher_page[i], 0); 157 __free_pages(lg_switcher_pages[i], 0);
153 kfree(switcher_page); 158 kfree(lg_switcher_pages);
154} 159}
155 160
156/*H:032 161/*H:032
@@ -323,15 +328,10 @@ static int __init init(void)
323 if (err) 328 if (err)
324 goto out; 329 goto out;
325 330
326 /* Now we set up the pagetable implementation for the Guests. */
327 err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
328 if (err)
329 goto unmap;
330
331 /* We might need to reserve an interrupt vector. */ 331 /* We might need to reserve an interrupt vector. */
332 err = init_interrupts(); 332 err = init_interrupts();
333 if (err) 333 if (err)
334 goto free_pgtables; 334 goto unmap;
335 335
336 /* /dev/lguest needs to be registered. */ 336 /* /dev/lguest needs to be registered. */
337 err = lguest_device_init(); 337 err = lguest_device_init();
@@ -346,8 +346,6 @@ static int __init init(void)
346 346
347free_interrupts: 347free_interrupts:
348 free_interrupts(); 348 free_interrupts();
349free_pgtables:
350 free_pagetables();
351unmap: 349unmap:
352 unmap_switcher(); 350 unmap_switcher();
353out: 351out:
@@ -359,7 +357,6 @@ static void __exit fini(void)
359{ 357{
360 lguest_device_remove(); 358 lguest_device_remove();
361 free_interrupts(); 359 free_interrupts();
362 free_pagetables();
363 unmap_switcher(); 360 unmap_switcher();
364 361
365 lguest_arch_host_fini(); 362 lguest_arch_host_fini();
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 295df06e6590..2eef40be4c04 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -14,11 +14,10 @@
14 14
15#include <asm/lguest.h> 15#include <asm/lguest.h>
16 16
17void free_pagetables(void);
18int init_pagetables(struct page **switcher_page, unsigned int pages);
19
20struct pgdir { 17struct pgdir {
21 unsigned long gpgdir; 18 unsigned long gpgdir;
19 bool switcher_mapped;
20 int last_host_cpu;
22 pgd_t *pgdir; 21 pgd_t *pgdir;
23}; 22};
24 23
@@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg,
124 unsigned long addr, unsigned long len); 123 unsigned long addr, unsigned long len);
125void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); 124void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
126void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); 125void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
126extern struct page **lg_switcher_pages;
127 127
128/*H:035 128/*H:035
129 * Using memory-copy operations like that is usually inconvient, so we 129 * Using memory-copy operations like that is usually inconvient, so we
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index ff4a0bc9904d..4263f4cc8c55 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
250 */ 250 */
251static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) 251static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
252{ 252{
253 /* We have a limited number the number of CPUs in the lguest struct. */ 253 /* We have a limited number of CPUs in the lguest struct. */
254 if (id >= ARRAY_SIZE(cpu->lg->cpus)) 254 if (id >= ARRAY_SIZE(cpu->lg->cpus))
255 return -EINVAL; 255 return -EINVAL;
256 256
257 /* Set up this CPU's id, and pointer back to the lguest struct. */ 257 /* Set up this CPU's id, and pointer back to the lguest struct. */
258 cpu->id = id; 258 cpu->id = id;
259 cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); 259 cpu->lg = container_of(cpu, struct lguest, cpus[id]);
260 cpu->lg->nr_cpus++; 260 cpu->lg->nr_cpus++;
261 261
262 /* Each CPU has a timer it can set. */ 262 /* Each CPU has a timer it can set. */
@@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
270 if (!cpu->regs_page) 270 if (!cpu->regs_page)
271 return -ENOMEM; 271 return -ENOMEM;
272 272
273 /* We actually put the registers at the bottom of the page. */ 273 /* We actually put the registers at the end of the page. */
274 cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); 274 cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
275 275
276 /* 276 /*
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 864baabaee25..699187ab3800 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -7,7 +7,7 @@
7 * converted Guest pages when running the Guest. 7 * converted Guest pages when running the Guest.
8:*/ 8:*/
9 9
10/* Copyright (C) Rusty Russell IBM Corporation 2006. 10/* Copyright (C) Rusty Russell IBM Corporation 2013.
11 * GPL v2 and any later version */ 11 * GPL v2 and any later version */
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/gfp.h> 13#include <linux/gfp.h>
@@ -62,22 +62,11 @@
62 * will need the last pmd entry of the last pmd page. 62 * will need the last pmd entry of the last pmd page.
63 */ 63 */
64#ifdef CONFIG_X86_PAE 64#ifdef CONFIG_X86_PAE
65#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
66#define RESERVE_MEM 2U
67#define CHECK_GPGD_MASK _PAGE_PRESENT 65#define CHECK_GPGD_MASK _PAGE_PRESENT
68#else 66#else
69#define RESERVE_MEM 4U
70#define CHECK_GPGD_MASK _PAGE_TABLE 67#define CHECK_GPGD_MASK _PAGE_TABLE
71#endif 68#endif
72 69
73/*
74 * We actually need a separate PTE page for each CPU. Remember that after the
75 * Switcher code itself comes two pages for each CPU, and we don't want this
76 * CPU's guest to see the pages of any other CPU.
77 */
78static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
79#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
80
81/*H:320 70/*H:320
82 * The page table code is curly enough to need helper functions to keep it 71 * The page table code is curly enough to need helper functions to keep it
83 * clear and clean. The kernel itself provides many of them; one advantage 72 * clear and clean. The kernel itself provides many of them; one advantage
@@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
95{ 84{
96 unsigned int index = pgd_index(vaddr); 85 unsigned int index = pgd_index(vaddr);
97 86
98#ifndef CONFIG_X86_PAE
99 /* We kill any Guest trying to touch the Switcher addresses. */
100 if (index >= SWITCHER_PGD_INDEX) {
101 kill_guest(cpu, "attempt to access switcher pages");
102 index = 0;
103 }
104#endif
105 /* Return a pointer index'th pgd entry for the i'th page table. */ 87 /* Return a pointer index'th pgd entry for the i'th page table. */
106 return &cpu->lg->pgdirs[i].pgdir[index]; 88 return &cpu->lg->pgdirs[i].pgdir[index];
107} 89}
@@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
117 unsigned int index = pmd_index(vaddr); 99 unsigned int index = pmd_index(vaddr);
118 pmd_t *page; 100 pmd_t *page;
119 101
120 /* We kill any Guest trying to touch the Switcher addresses. */
121 if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
122 index >= SWITCHER_PMD_INDEX) {
123 kill_guest(cpu, "attempt to access switcher pages");
124 index = 0;
125 }
126
127 /* You should never call this if the PGD entry wasn't valid */ 102 /* You should never call this if the PGD entry wasn't valid */
128 BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); 103 BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
129 page = __va(pgd_pfn(spgd) << PAGE_SHIFT); 104 page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
@@ -275,122 +250,177 @@ static void release_pte(pte_t pte)
275} 250}
276/*:*/ 251/*:*/
277 252
278static void check_gpte(struct lg_cpu *cpu, pte_t gpte) 253static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
279{ 254{
280 if ((pte_flags(gpte) & _PAGE_PSE) || 255 if ((pte_flags(gpte) & _PAGE_PSE) ||
281 pte_pfn(gpte) >= cpu->lg->pfn_limit) 256 pte_pfn(gpte) >= cpu->lg->pfn_limit) {
282 kill_guest(cpu, "bad page table entry"); 257 kill_guest(cpu, "bad page table entry");
258 return false;
259 }
260 return true;
283} 261}
284 262
285static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) 263static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
286{ 264{
287 if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || 265 if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
288 (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) 266 (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
289 kill_guest(cpu, "bad page directory entry"); 267 kill_guest(cpu, "bad page directory entry");
268 return false;
269 }
270 return true;
290} 271}
291 272
292#ifdef CONFIG_X86_PAE 273#ifdef CONFIG_X86_PAE
293static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) 274static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
294{ 275{
295 if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || 276 if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
296 (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) 277 (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
297 kill_guest(cpu, "bad page middle directory entry"); 278 kill_guest(cpu, "bad page middle directory entry");
279 return false;
280 }
281 return true;
298} 282}
299#endif 283#endif
300 284
301/*H:330 285/*H:331
302 * (i) Looking up a page table entry when the Guest faults. 286 * This is the core routine to walk the shadow page tables and find the page
303 * 287 * table entry for a specific address.
304 * We saw this call in run_guest(): when we see a page fault in the Guest, we
305 * come here. That's because we only set up the shadow page tables lazily as
306 * they're needed, so we get page faults all the time and quietly fix them up
307 * and return to the Guest without it knowing.
308 * 288 *
309 * If we fixed up the fault (ie. we mapped the address), this routine returns 289 * If allocate is set, then we allocate any missing levels, setting the flags
310 * true. Otherwise, it was a real fault and we need to tell the Guest. 290 * on the new page directory and mid-level directories using the arguments
291 * (which are copied from the Guest's page table entries).
311 */ 292 */
312bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) 293static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
294 int pgd_flags, int pmd_flags)
313{ 295{
314 pgd_t gpgd;
315 pgd_t *spgd; 296 pgd_t *spgd;
316 unsigned long gpte_ptr;
317 pte_t gpte;
318 pte_t *spte;
319
320 /* Mid level for PAE. */ 297 /* Mid level for PAE. */
321#ifdef CONFIG_X86_PAE 298#ifdef CONFIG_X86_PAE
322 pmd_t *spmd; 299 pmd_t *spmd;
323 pmd_t gpmd;
324#endif 300#endif
325 301
326 /* First step: get the top-level Guest page table entry. */ 302 /* Get top level entry. */
327 if (unlikely(cpu->linear_pages)) {
328 /* Faking up a linear mapping. */
329 gpgd = __pgd(CHECK_GPGD_MASK);
330 } else {
331 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
332 /* Toplevel not present? We can't map it in. */
333 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
334 return false;
335 }
336
337 /* Now look at the matching shadow entry. */
338 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); 303 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
339 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 304 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
340 /* No shadow entry: allocate a new shadow PTE page. */ 305 /* No shadow entry: allocate a new shadow PTE page. */
341 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 306 unsigned long ptepage;
307
308 /* If they didn't want us to allocate anything, stop. */
309 if (!allocate)
310 return NULL;
311
312 ptepage = get_zeroed_page(GFP_KERNEL);
342 /* 313 /*
343 * This is not really the Guest's fault, but killing it is 314 * This is not really the Guest's fault, but killing it is
344 * simple for this corner case. 315 * simple for this corner case.
345 */ 316 */
346 if (!ptepage) { 317 if (!ptepage) {
347 kill_guest(cpu, "out of memory allocating pte page"); 318 kill_guest(cpu, "out of memory allocating pte page");
348 return false; 319 return NULL;
349 } 320 }
350 /* We check that the Guest pgd is OK. */
351 check_gpgd(cpu, gpgd);
352 /* 321 /*
353 * And we copy the flags to the shadow PGD entry. The page 322 * And we copy the flags to the shadow PGD entry. The page
354 * number in the shadow PGD is the page we just allocated. 323 * number in the shadow PGD is the page we just allocated.
355 */ 324 */
356 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); 325 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
357 } 326 }
358 327
328 /*
329 * Intel's Physical Address Extension actually uses three levels of
330 * page tables, so we need to look in the mid-level.
331 */
359#ifdef CONFIG_X86_PAE 332#ifdef CONFIG_X86_PAE
360 if (unlikely(cpu->linear_pages)) { 333 /* Now look at the mid-level shadow entry. */
361 /* Faking up a linear mapping. */
362 gpmd = __pmd(_PAGE_TABLE);
363 } else {
364 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
365 /* Middle level not present? We can't map it in. */
366 if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
367 return false;
368 }
369
370 /* Now look at the matching shadow entry. */
371 spmd = spmd_addr(cpu, *spgd, vaddr); 334 spmd = spmd_addr(cpu, *spgd, vaddr);
372 335
373 if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { 336 if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
374 /* No shadow entry: allocate a new shadow PTE page. */ 337 /* No shadow entry: allocate a new shadow PTE page. */
375 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 338 unsigned long ptepage;
339
340 /* If they didn't want us to allocate anything, stop. */
341 if (!allocate)
342 return NULL;
343
344 ptepage = get_zeroed_page(GFP_KERNEL);
376 345
377 /* 346 /*
378 * This is not really the Guest's fault, but killing it is 347 * This is not really the Guest's fault, but killing it is
379 * simple for this corner case. 348 * simple for this corner case.
380 */ 349 */
381 if (!ptepage) { 350 if (!ptepage) {
382 kill_guest(cpu, "out of memory allocating pte page"); 351 kill_guest(cpu, "out of memory allocating pmd page");
383 return false; 352 return NULL;
384 } 353 }
385 354
386 /* We check that the Guest pmd is OK. */
387 check_gpmd(cpu, gpmd);
388
389 /* 355 /*
390 * And we copy the flags to the shadow PMD entry. The page 356 * And we copy the flags to the shadow PMD entry. The page
391 * number in the shadow PMD is the page we just allocated. 357 * number in the shadow PMD is the page we just allocated.
392 */ 358 */
393 set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); 359 set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
360 }
361#endif
362
363 /* Get the pointer to the shadow PTE entry we're going to set. */
364 return spte_addr(cpu, *spgd, vaddr);
365}
366
367/*H:330
368 * (i) Looking up a page table entry when the Guest faults.
369 *
370 * We saw this call in run_guest(): when we see a page fault in the Guest, we
371 * come here. That's because we only set up the shadow page tables lazily as
372 * they're needed, so we get page faults all the time and quietly fix them up
373 * and return to the Guest without it knowing.
374 *
375 * If we fixed up the fault (ie. we mapped the address), this routine returns
376 * true. Otherwise, it was a real fault and we need to tell the Guest.
377 */
378bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
379{
380 unsigned long gpte_ptr;
381 pte_t gpte;
382 pte_t *spte;
383 pmd_t gpmd;
384 pgd_t gpgd;
385
386 /* We never demand page the Switcher, so trying is a mistake. */
387 if (vaddr >= switcher_addr)
388 return false;
389
390 /* First step: get the top-level Guest page table entry. */
391 if (unlikely(cpu->linear_pages)) {
392 /* Faking up a linear mapping. */
393 gpgd = __pgd(CHECK_GPGD_MASK);
394 } else {
395 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
396 /* Toplevel not present? We can't map it in. */
397 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
398 return false;
399
400 /*
401 * This kills the Guest if it has weird flags or tries to
402 * refer to a "physical" address outside the bounds.
403 */
404 if (!check_gpgd(cpu, gpgd))
405 return false;
406 }
407
408 /* This "mid-level" entry is only used for non-linear, PAE mode. */
409 gpmd = __pmd(_PAGE_TABLE);
410
411#ifdef CONFIG_X86_PAE
412 if (likely(!cpu->linear_pages)) {
413 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
414 /* Middle level not present? We can't map it in. */
415 if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
416 return false;
417
418 /*
419 * This kills the Guest if it has weird flags or tries to
420 * refer to a "physical" address outside the bounds.
421 */
422 if (!check_gpmd(cpu, gpmd))
423 return false;
394 } 424 }
395 425
396 /* 426 /*
@@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
433 * Check that the Guest PTE flags are OK, and the page number is below 463 * Check that the Guest PTE flags are OK, and the page number is below
434 * the pfn_limit (ie. not mapping the Launcher binary). 464 * the pfn_limit (ie. not mapping the Launcher binary).
435 */ 465 */
436 check_gpte(cpu, gpte); 466 if (!check_gpte(cpu, gpte))
467 return false;
437 468
438 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 469 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
439 gpte = pte_mkyoung(gpte); 470 gpte = pte_mkyoung(gpte);
@@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
441 gpte = pte_mkdirty(gpte); 472 gpte = pte_mkdirty(gpte);
442 473
443 /* Get the pointer to the shadow PTE entry we're going to set. */ 474 /* Get the pointer to the shadow PTE entry we're going to set. */
444 spte = spte_addr(cpu, *spgd, vaddr); 475 spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
476 if (!spte)
477 return false;
445 478
446 /* 479 /*
447 * If there was a valid shadow PTE entry here before, we release it. 480 * If there was a valid shadow PTE entry here before, we release it.
@@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
493 */ 526 */
494static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) 527static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
495{ 528{
496 pgd_t *spgd; 529 pte_t *spte;
497 unsigned long flags; 530 unsigned long flags;
498 531
499#ifdef CONFIG_X86_PAE 532 /* You can't put your stack in the Switcher! */
500 pmd_t *spmd; 533 if (vaddr >= switcher_addr)
501#endif
502 /* Look at the current top level entry: is it present? */
503 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
504 if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
505 return false; 534 return false;
506 535
507#ifdef CONFIG_X86_PAE 536 /* If there's no shadow PTE, it's not writable. */
508 spmd = spmd_addr(cpu, *spgd, vaddr); 537 spte = find_spte(cpu, vaddr, false, 0, 0);
509 if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) 538 if (!spte)
510 return false; 539 return false;
511#endif
512 540
513 /* 541 /*
514 * Check the flags on the pte entry itself: it must be present and 542 * Check the flags on the pte entry itself: it must be present and
515 * writable. 543 * writable.
516 */ 544 */
517 flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); 545 flags = pte_flags(*spte);
518
519 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 546 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
520} 547}
521 548
@@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
678 int *blank_pgdir) 705 int *blank_pgdir)
679{ 706{
680 unsigned int next; 707 unsigned int next;
681#ifdef CONFIG_X86_PAE
682 pmd_t *pmd_table;
683#endif
684 708
685 /* 709 /*
686 * We pick one entry at random to throw out. Choosing the Least 710 * We pick one entry at random to throw out. Choosing the Least
@@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
695 if (!cpu->lg->pgdirs[next].pgdir) 719 if (!cpu->lg->pgdirs[next].pgdir)
696 next = cpu->cpu_pgd; 720 next = cpu->cpu_pgd;
697 else { 721 else {
698#ifdef CONFIG_X86_PAE
699 /* 722 /*
700 * In PAE mode, allocate a pmd page and populate the 723 * This is a blank page, so there are no kernel
701 * last pgd entry. 724 * mappings: caller must map the stack!
702 */ 725 */
703 pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
704 if (!pmd_table) {
705 free_page((long)cpu->lg->pgdirs[next].pgdir);
706 set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
707 next = cpu->cpu_pgd;
708 } else {
709 set_pgd(cpu->lg->pgdirs[next].pgdir +
710 SWITCHER_PGD_INDEX,
711 __pgd(__pa(pmd_table) | _PAGE_PRESENT));
712 /*
713 * This is a blank page, so there are no kernel
714 * mappings: caller must map the stack!
715 */
716 *blank_pgdir = 1;
717 }
718#else
719 *blank_pgdir = 1; 726 *blank_pgdir = 1;
720#endif
721 } 727 }
722 } 728 }
723 /* Record which Guest toplevel this shadows. */ 729 /* Record which Guest toplevel this shadows. */
@@ -725,9 +731,50 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
725 /* Release all the non-kernel mappings. */ 731 /* Release all the non-kernel mappings. */
726 flush_user_mappings(cpu->lg, next); 732 flush_user_mappings(cpu->lg, next);
727 733
734 /* This hasn't run on any CPU at all. */
735 cpu->lg->pgdirs[next].last_host_cpu = -1;
736
728 return next; 737 return next;
729} 738}
730 739
740/*H:501
741 * We do need the Switcher code mapped at all times, so we allocate that
742 * part of the Guest page table here. We map the Switcher code immediately,
743 * but defer mapping of the guest register page and IDT/LDT etc page until
744 * just before we run the guest in map_switcher_in_guest().
745 *
746 * We *could* do this setup in map_switcher_in_guest(), but at that point
 747 * we have interrupts disabled, and allocating pages like that is fraught: we
748 * can't sleep if we need to free up some memory.
749 */
750static bool allocate_switcher_mapping(struct lg_cpu *cpu)
751{
752 int i;
753
754 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
755 pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
756 CHECK_GPGD_MASK, _PAGE_TABLE);
757 if (!pte)
758 return false;
759
760 /*
761 * Map the switcher page if not already there. It might
762 * already be there because we call allocate_switcher_mapping()
763 * in guest_set_pgd() just in case it did discard our Switcher
764 * mapping, but it probably didn't.
765 */
766 if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
767 /* Get a reference to the Switcher page. */
768 get_page(lg_switcher_pages[0]);
 769 /* Create a read-only, executable, kernel-style PTE */
770 set_pte(pte,
771 mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
772 }
773 }
774 cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
775 return true;
776}
777
731/*H:470 778/*H:470
732 * Finally, a routine which throws away everything: all PGD entries in all 779 * Finally, a routine which throws away everything: all PGD entries in all
733 * the shadow page tables, including the Guest's kernel mappings. This is used 780 * the shadow page tables, including the Guest's kernel mappings. This is used
@@ -738,28 +785,16 @@ static void release_all_pagetables(struct lguest *lg)
738 unsigned int i, j; 785 unsigned int i, j;
739 786
740 /* Every shadow pagetable this Guest has */ 787 /* Every shadow pagetable this Guest has */
741 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 788 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
742 if (lg->pgdirs[i].pgdir) { 789 if (!lg->pgdirs[i].pgdir)
743#ifdef CONFIG_X86_PAE 790 continue;
744 pgd_t *spgd; 791
745 pmd_t *pmdpage; 792 /* Every PGD entry. */
746 unsigned int k; 793 for (j = 0; j < PTRS_PER_PGD; j++)
747 794 release_pgd(lg->pgdirs[i].pgdir + j);
748 /* Get the last pmd page. */ 795 lg->pgdirs[i].switcher_mapped = false;
749 spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; 796 lg->pgdirs[i].last_host_cpu = -1;
750 pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 797 }
751
752 /*
753 * And release the pmd entries of that pmd page,
754 * except for the switcher pmd.
755 */
756 for (k = 0; k < SWITCHER_PMD_INDEX; k++)
757 release_pmd(&pmdpage[k]);
758#endif
759 /* Every PGD entry except the Switcher at the top */
760 for (j = 0; j < SWITCHER_PGD_INDEX; j++)
761 release_pgd(lg->pgdirs[i].pgdir + j);
762 }
763} 798}
764 799
765/* 800/*
@@ -773,6 +808,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
773 release_all_pagetables(cpu->lg); 808 release_all_pagetables(cpu->lg);
774 /* We need the Guest kernel stack mapped again. */ 809 /* We need the Guest kernel stack mapped again. */
775 pin_stack_pages(cpu); 810 pin_stack_pages(cpu);
811 /* And we need Switcher allocated. */
812 if (!allocate_switcher_mapping(cpu))
813 kill_guest(cpu, "Cannot populate switcher mapping");
776} 814}
777 815
778/*H:430 816/*H:430
@@ -808,9 +846,17 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
808 newpgdir = new_pgdir(cpu, pgtable, &repin); 846 newpgdir = new_pgdir(cpu, pgtable, &repin);
809 /* Change the current pgd index to the new one. */ 847 /* Change the current pgd index to the new one. */
810 cpu->cpu_pgd = newpgdir; 848 cpu->cpu_pgd = newpgdir;
811 /* If it was completely blank, we map in the Guest kernel stack */ 849 /*
850 * If it was completely blank, we map in the Guest kernel stack and
851 * the Switcher.
852 */
812 if (repin) 853 if (repin)
813 pin_stack_pages(cpu); 854 pin_stack_pages(cpu);
855
856 if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
857 if (!allocate_switcher_mapping(cpu))
858 kill_guest(cpu, "Cannot populate switcher mapping");
859 }
814} 860}
815/*:*/ 861/*:*/
816 862
@@ -865,7 +911,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
865 * micro-benchmark. 911 * micro-benchmark.
866 */ 912 */
867 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 913 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
868 check_gpte(cpu, gpte); 914 if (!check_gpte(cpu, gpte))
915 return;
869 set_pte(spte, 916 set_pte(spte,
870 gpte_to_spte(cpu, gpte, 917 gpte_to_spte(cpu, gpte,
871 pte_flags(gpte) & _PAGE_DIRTY)); 918 pte_flags(gpte) & _PAGE_DIRTY));
@@ -897,6 +944,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
897void guest_set_pte(struct lg_cpu *cpu, 944void guest_set_pte(struct lg_cpu *cpu,
898 unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 945 unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
899{ 946{
947 /* We don't let you remap the Switcher; we need it to get back! */
948 if (vaddr >= switcher_addr) {
949 kill_guest(cpu, "attempt to set pte into Switcher pages");
950 return;
951 }
952
900 /* 953 /*
901 * Kernel mappings must be changed on all top levels. Slow, but doesn't 954 * Kernel mappings must be changed on all top levels. Slow, but doesn't
902 * happen often. 955 * happen often.
@@ -933,14 +986,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
933{ 986{
934 int pgdir; 987 int pgdir;
935 988
936 if (idx >= SWITCHER_PGD_INDEX) 989 if (idx > PTRS_PER_PGD) {
990 kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
991 idx, PTRS_PER_PGD);
937 return; 992 return;
993 }
938 994
939 /* If they're talking about a page table we have a shadow for... */ 995 /* If they're talking about a page table we have a shadow for... */
940 pgdir = find_pgdir(lg, gpgdir); 996 pgdir = find_pgdir(lg, gpgdir);
941 if (pgdir < ARRAY_SIZE(lg->pgdirs)) 997 if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
942 /* ... throw it away. */ 998 /* ... throw it away. */
943 release_pgd(lg->pgdirs[pgdir].pgdir + idx); 999 release_pgd(lg->pgdirs[pgdir].pgdir + idx);
1000 /* That might have been the Switcher mapping, remap it. */
1001 if (!allocate_switcher_mapping(&lg->cpus[0])) {
1002 kill_guest(&lg->cpus[0],
1003 "Cannot populate switcher mapping");
1004 }
1005 }
944} 1006}
945 1007
946#ifdef CONFIG_X86_PAE 1008#ifdef CONFIG_X86_PAE
@@ -958,6 +1020,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
958 * we will populate on future faults. The Guest doesn't have any actual 1020 * we will populate on future faults. The Guest doesn't have any actual
959 * pagetables yet, so we set linear_pages to tell demand_page() to fake it 1021 * pagetables yet, so we set linear_pages to tell demand_page() to fake it
960 * for the moment. 1022 * for the moment.
1023 *
1024 * We do need the Switcher to be mapped at all times, so we allocate that
1025 * part of the Guest page table here.
961 */ 1026 */
962int init_guest_pagetable(struct lguest *lg) 1027int init_guest_pagetable(struct lguest *lg)
963{ 1028{
@@ -971,21 +1036,34 @@ int init_guest_pagetable(struct lguest *lg)
971 1036
972 /* We start with a linear mapping until the initialize. */ 1037 /* We start with a linear mapping until the initialize. */
973 cpu->linear_pages = true; 1038 cpu->linear_pages = true;
1039
1040 /* Allocate the page tables for the Switcher. */
1041 if (!allocate_switcher_mapping(cpu)) {
1042 release_all_pagetables(lg);
1043 return -ENOMEM;
1044 }
1045
974 return 0; 1046 return 0;
975} 1047}
976 1048
977/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 1049/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
978void page_table_guest_data_init(struct lg_cpu *cpu) 1050void page_table_guest_data_init(struct lg_cpu *cpu)
979{ 1051{
1052 /*
1053 * We tell the Guest that it can't use the virtual addresses
1054 * used by the Switcher. This trick is equivalent to 4GB -
1055 * switcher_addr.
1056 */
1057 u32 top = ~switcher_addr + 1;
1058
980 /* We get the kernel address: above this is all kernel memory. */ 1059 /* We get the kernel address: above this is all kernel memory. */
981 if (get_user(cpu->lg->kernel_address, 1060 if (get_user(cpu->lg->kernel_address,
982 &cpu->lg->lguest_data->kernel_address) 1061 &cpu->lg->lguest_data->kernel_address)
983 /* 1062 /*
984 * We tell the Guest that it can't use the top 2 or 4 MB 1063 * We tell the Guest that it can't use the top virtual
985 * of virtual addresses used by the Switcher. 1064 * addresses (used by the Switcher).
986 */ 1065 */
987 || put_user(RESERVE_MEM * 1024 * 1024, 1066 || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
988 &cpu->lg->lguest_data->reserve_mem)) {
989 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 1067 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
990 return; 1068 return;
991 } 1069 }
@@ -995,12 +1073,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
995 * "pgd_index(lg->kernel_address)". This assumes it won't hit the 1073 * "pgd_index(lg->kernel_address)". This assumes it won't hit the
996 * Switcher mappings, so check that now. 1074 * Switcher mappings, so check that now.
997 */ 1075 */
998#ifdef CONFIG_X86_PAE 1076 if (cpu->lg->kernel_address >= switcher_addr)
999 if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
1000 pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
1001#else
1002 if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
1003#endif
1004 kill_guest(cpu, "bad kernel address %#lx", 1077 kill_guest(cpu, "bad kernel address %#lx",
1005 cpu->lg->kernel_address); 1078 cpu->lg->kernel_address);
1006} 1079}
@@ -1017,102 +1090,96 @@ void free_guest_pagetable(struct lguest *lg)
1017 free_page((long)lg->pgdirs[i].pgdir); 1090 free_page((long)lg->pgdirs[i].pgdir);
1018} 1091}
1019 1092
1020/*H:480 1093/*H:481
1021 * (vi) Mapping the Switcher when the Guest is about to run. 1094 * This clears the Switcher mappings for cpu #i.
1022 *
1023 * The Switcher and the two pages for this CPU need to be visible in the
1024 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
1025 * for each CPU already set up, we just need to hook them in now we know which
1026 * Guest is about to run on this CPU.
1027 */ 1095 */
1028void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) 1096static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
1029{ 1097{
1030 pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages); 1098 unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
1031 pte_t regs_pte; 1099 pte_t *pte;
1032 1100
1033#ifdef CONFIG_X86_PAE 1101 /* Clear the mappings for both pages. */
1034 pmd_t switcher_pmd; 1102 pte = find_spte(cpu, base, false, 0, 0);
1035 pmd_t *pmd_table; 1103 release_pte(*pte);
1036 1104 set_pte(pte, __pte(0));
1037 switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT,
1038 PAGE_KERNEL_EXEC);
1039
1040 /* Figure out where the pmd page is, by reading the PGD, and converting
1041 * it to a virtual address. */
1042 pmd_table = __va(pgd_pfn(cpu->lg->
1043 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
1044 << PAGE_SHIFT);
1045 /* Now write it into the shadow page table. */
1046 set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
1047#else
1048 pgd_t switcher_pgd;
1049 1105
1050 /* 1106 pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
1051 * Make the last PGD entry for this Guest point to the Switcher's PTE 1107 release_pte(*pte);
1052 * page for this CPU (with appropriate flags). 1108 set_pte(pte, __pte(0));
1053 */
1054 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
1055
1056 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
1057
1058#endif
1059 /*
1060 * We also change the Switcher PTE page. When we're running the Guest,
1061 * we want the Guest's "regs" page to appear where the first Switcher
1062 * page for this CPU is. This is an optimization: when the Switcher
1063 * saves the Guest registers, it saves them into the first page of this
1064 * CPU's "struct lguest_pages": if we make sure the Guest's register
1065 * page is already mapped there, we don't have to copy them out
1066 * again.
1067 */
1068 regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
1069 set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
1070} 1109}
1071/*:*/
1072 1110
1073static void free_switcher_pte_pages(void) 1111/*H:480
1074{ 1112 * (vi) Mapping the Switcher when the Guest is about to run.
1075 unsigned int i; 1113 *
1076 1114 * The Switcher and the two pages for this CPU need to be visible in the Guest
1077 for_each_possible_cpu(i) 1115 * (and not the pages for other CPUs).
1078 free_page((long)switcher_pte_page(i));
1079}
1080
1081/*H:520
1082 * Setting up the Switcher PTE page for given CPU is fairly easy, given
1083 * the CPU number and the "struct page"s for the Switcher code itself.
1084 * 1116 *
1085 * Currently the Switcher is less than a page long, so "pages" is always 1. 1117 * The pages for the pagetables have all been allocated before: we just need
1118 * to make sure the actual PTEs are up-to-date for the CPU we're about to run
1119 * on.
1086 */ 1120 */
1087static __init void populate_switcher_pte_page(unsigned int cpu, 1121void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
1088 struct page *switcher_page[],
1089 unsigned int pages)
1090{ 1122{
1091 unsigned int i; 1123 unsigned long base;
1092 pte_t *pte = switcher_pte_page(cpu); 1124 struct page *percpu_switcher_page, *regs_page;
1125 pte_t *pte;
1126 struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
1127
1128 /* Switcher page should always be mapped by now! */
1129 BUG_ON(!pgdir->switcher_mapped);
1130
1131 /*
1132 * Remember that we have two pages for each Host CPU, so we can run a
1133 * Guest on each CPU without them interfering. We need to make sure
1134 * those pages are mapped correctly in the Guest, but since we usually
1135 * run on the same CPU, we cache that, and only update the mappings
1136 * when we move.
1137 */
1138 if (pgdir->last_host_cpu == raw_smp_processor_id())
1139 return;
1093 1140
1094 /* The first entries are easy: they map the Switcher code. */ 1141 /* -1 means unknown so we remove everything. */
1095 for (i = 0; i < pages; i++) { 1142 if (pgdir->last_host_cpu == -1) {
1096 set_pte(&pte[i], mk_pte(switcher_page[i], 1143 unsigned int i;
1097 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); 1144 for_each_possible_cpu(i)
1145 remove_switcher_percpu_map(cpu, i);
1146 } else {
1147 /* We know exactly what CPU mapping to remove. */
1148 remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
1098 } 1149 }
1099 1150
1100 /* The only other thing we map is this CPU's pair of pages. */ 1151 /*
1101 i = pages + cpu*2; 1152 * When we're running the Guest, we want the Guest's "regs" page to
1102 1153 * appear where the first Switcher page for this CPU is. This is an
1103 /* First page (Guest registers) is writable from the Guest */ 1154 * optimization: when the Switcher saves the Guest registers, it saves
1104 set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), 1155 * them into the first page of this CPU's "struct lguest_pages": if we
1105 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); 1156 * make sure the Guest's register page is already mapped there, we
1157 * don't have to copy them out again.
1158 */
1159 /* Find the shadow PTE for this regs page. */
1160 base = switcher_addr + PAGE_SIZE
1161 + raw_smp_processor_id() * sizeof(struct lguest_pages);
1162 pte = find_spte(cpu, base, false, 0, 0);
1163 regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
1164 get_page(regs_page);
1165 set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
1106 1166
1107 /* 1167 /*
1108 * The second page contains the "struct lguest_ro_state", and is 1168 * We map the second page of the struct lguest_pages read-only in
1109 * read-only. 1169 * the Guest: the IDT, GDT and other things it's not supposed to
1170 * change.
1110 */ 1171 */
1111 set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), 1172 pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
1112 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); 1173 percpu_switcher_page
1174 = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
1175 get_page(percpu_switcher_page);
1176 set_pte(pte, mk_pte(percpu_switcher_page,
1177 __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
1178
1179 pgdir->last_host_cpu = raw_smp_processor_id();
1113} 1180}
1114 1181
1115/* 1182/*H:490
1116 * We've made it through the page table code. Perhaps our tired brains are 1183 * We've made it through the page table code. Perhaps our tired brains are
1117 * still processing the details, or perhaps we're simply glad it's over. 1184 * still processing the details, or perhaps we're simply glad it's over.
1118 * 1185 *
@@ -1124,29 +1191,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
1124 * 1191 *
1125 * There is just one file remaining in the Host. 1192 * There is just one file remaining in the Host.
1126 */ 1193 */
1127
1128/*H:510
1129 * At boot or module load time, init_pagetables() allocates and populates
1130 * the Switcher PTE page for each CPU.
1131 */
1132__init int init_pagetables(struct page **switcher_page, unsigned int pages)
1133{
1134 unsigned int i;
1135
1136 for_each_possible_cpu(i) {
1137 switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
1138 if (!switcher_pte_page(i)) {
1139 free_switcher_pte_pages();
1140 return -ENOMEM;
1141 }
1142 populate_switcher_pte_page(i, switcher_page, pages);
1143 }
1144 return 0;
1145}
1146/*:*/
1147
1148/* Cleaning up simply involves freeing the PTE page for each CPU. */
1149void free_pagetables(void)
1150{
1151 free_switcher_pte_pages();
1152}
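
The key property of map_switcher_in_guest() above is that the per-pgdir Switcher mapping is rebuilt lazily: the shadow PTEs for the per-CPU pages are only rewritten when the Guest runs on a different Host CPU than last time, and last_host_cpu == -1 means "unknown, scrub every CPU's slot". A rough sketch of that caching pattern, where map_cpu_pages() is a hypothetical stand-in for the two set_pte() calls in the patch:

/* Sketch only: the shape of the last_host_cpu cache used above. */
static void map_switcher_sketch(struct lg_cpu *cpu, struct pgdir *pgdir)
{
	int me = raw_smp_processor_id();

	if (pgdir->last_host_cpu == me)
		return;				/* Fast path: mapping still valid. */

	if (pgdir->last_host_cpu == -1) {
		/* Unknown state: remove every CPU's per-CPU mapping. */
		unsigned int i;

		for_each_possible_cpu(i)
			remove_switcher_percpu_map(cpu, i);
	} else {
		/* We know exactly which CPU's mapping to remove. */
		remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
	}

	/* Hypothetical helper: maps the regs page (RW) and the RO state page. */
	map_cpu_pages(cpu, me);

	pgdir->last_host_cpu = me;
}
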
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 4af12e1844d5..f0a3347b6441 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -59,14 +59,13 @@ static struct {
59/* Offset from where switcher.S was compiled to where we've copied it */ 59/* Offset from where switcher.S was compiled to where we've copied it */
60static unsigned long switcher_offset(void) 60static unsigned long switcher_offset(void)
61{ 61{
62 return SWITCHER_ADDR - (unsigned long)start_switcher_text; 62 return switcher_addr - (unsigned long)start_switcher_text;
63} 63}
64 64
65/* This cpu's struct lguest_pages. */ 65/* This cpu's struct lguest_pages (after the Switcher text page) */
66static struct lguest_pages *lguest_pages(unsigned int cpu) 66static struct lguest_pages *lguest_pages(unsigned int cpu)
67{ 67{
68 return &(((struct lguest_pages *) 68 return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
69 (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
70} 69}
71 70
72static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); 71static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig
index a966128c2a7a..7ffc756131a2 100644
--- a/drivers/net/caif/Kconfig
+++ b/drivers/net/caif/Kconfig
@@ -40,3 +40,17 @@ config CAIF_HSI
40 The caif low level driver for CAIF over HSI. 40 The caif low level driver for CAIF over HSI.
41 Be aware that if you enable this then you also need to 41 Be aware that if you enable this then you also need to
42 enable a low-level HSI driver. 42 enable a low-level HSI driver.
43
44config CAIF_VIRTIO
45 tristate "CAIF virtio transport driver"
46 depends on CAIF
47 select VHOST_RING
48 select VIRTIO
49 select GENERIC_ALLOCATOR
50 default n
51 ---help---
52 The caif driver for CAIF over Virtio.
53
54if CAIF_VIRTIO
55source "drivers/vhost/Kconfig"
56endif
diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile
index 15a9d2fc753d..9bbd45391f6c 100644
--- a/drivers/net/caif/Makefile
+++ b/drivers/net/caif/Makefile
@@ -9,3 +9,6 @@ obj-$(CONFIG_CAIF_SPI_SLAVE) += cfspi_slave.o
9 9
10# HSI interface 10# HSI interface
11obj-$(CONFIG_CAIF_HSI) += caif_hsi.o 11obj-$(CONFIG_CAIF_HSI) += caif_hsi.o
12
13# Virtio interface
14obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o
diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
new file mode 100644
index 000000000000..b9ed1288ce2d
--- /dev/null
+++ b/drivers/net/caif/caif_virtio.c
@@ -0,0 +1,790 @@
1/*
2 * Copyright (C) ST-Ericsson AB 2013
3 * Authors: Vicram Arv
4 * Dmitry Tarnyagin <dmitry.tarnyagin@lockless.no>
5 * Sjur Brendeland
6 * License terms: GNU General Public License (GPL) version 2
7 */
8#include <linux/module.h>
9#include <linux/if_arp.h>
10#include <linux/virtio.h>
11#include <linux/vringh.h>
12#include <linux/debugfs.h>
13#include <linux/spinlock.h>
14#include <linux/genalloc.h>
15#include <linux/interrupt.h>
16#include <linux/netdevice.h>
17#include <linux/rtnetlink.h>
18#include <linux/virtio_ids.h>
19#include <linux/virtio_caif.h>
20#include <linux/virtio_ring.h>
21#include <linux/dma-mapping.h>
22#include <net/caif/caif_dev.h>
23#include <linux/virtio_config.h>
24
25MODULE_LICENSE("GPL v2");
26MODULE_AUTHOR("Vicram Arv");
27MODULE_AUTHOR("Sjur Brendeland");
28MODULE_DESCRIPTION("Virtio CAIF Driver");
29
30/* NAPI schedule quota */
31#define CFV_DEFAULT_QUOTA 32
32
33/* Defaults used if virtio config space is unavailable */
34#define CFV_DEF_MTU_SIZE 4096
35#define CFV_DEF_HEADROOM 32
36#define CFV_DEF_TAILROOM 32
37
38/* Required IP header alignment */
39#define IP_HDR_ALIGN 4
40
41/* struct cfv_napi_context - NAPI context info
42 * @riov: IOV holding data read from the ring. Note that riov may
43 * still hold data when cfv_rx_poll() returns.
44 * @head: Last descriptor ID we received from vringh_getdesc_kern.
45 * We use this to put the descriptor back on the used ring. USHRT_MAX is
46 * used to indicate an invalid head-id.
47 */
48struct cfv_napi_context {
49 struct vringh_kiov riov;
50 unsigned short head;
51};
52
53/* struct cfv_stats - statistics for debugfs
54 * @rx_napi_complete: Number of NAPI completions (RX)
55 * @rx_napi_resched: Number of calls where the full quota was used (RX)
56 * @rx_nomem: Number of SKB alloc failures (RX)
57 * @rx_kicks: Number of RX kicks
58 * @tx_full_ring: Number of times the TX ring was full
59 * @tx_no_mem: Number of times TX went out of memory
60 * @tx_flow_on: Number of flow-on events (TX)
61 * @tx_kicks: Number of TX kicks
62 */
63struct cfv_stats {
64 u32 rx_napi_complete;
65 u32 rx_napi_resched;
66 u32 rx_nomem;
67 u32 rx_kicks;
68 u32 tx_full_ring;
69 u32 tx_no_mem;
70 u32 tx_flow_on;
71 u32 tx_kicks;
72};
73
74/* struct cfv_info - Caif Virtio control structure
75 * @cfdev: caif common header
76 * @vdev: Associated virtio device
77 * @vr_rx: rx/downlink host vring
78 * @vq_tx: tx/uplink virtqueue
79 * @ndev: CAIF link layer device
80 * @watermark_tx: indicates the number of free descriptors needed
81 * to reopen the TX queues after overload.
82 * @tx_lock: protects vq_tx from concurrent use
83 * @tx_release_tasklet: Tasklet for freeing consumed TX buffers
84 * @napi: Napi context used in cfv_rx_poll()
85 * @ctx: Context data used in cfv_rx_poll()
86 * @tx_hr: transmit headroom
87 * @rx_hr: receive headroom
88 * @tx_tr: transmit tail room
89 * @rx_tr: receive tail room
90 * @mtu: transmit max size
91 * @mru: receive max size
92 * @allocsz: size of dma memory reserved for TX buffers
93 * @alloc_addr: virtual address to dma memory for TX buffers
94 * @alloc_dma: dma address to dma memory for TX buffers
95 * @genpool: Gen Pool used for allocating TX buffers
96 * @reserved_mem: Pointer to memory reserve allocated from genpool
97 * @reserved_size: Size of memory reserve allocated from genpool
98 * @stats: Statistics exposed in debugfs
99 * @debugfs: Debugfs dentry for statistic counters
100 */
101struct cfv_info {
102 struct caif_dev_common cfdev;
103 struct virtio_device *vdev;
104 struct vringh *vr_rx;
105 struct virtqueue *vq_tx;
106 struct net_device *ndev;
107 unsigned int watermark_tx;
108 /* Protect access to vq_tx */
109 spinlock_t tx_lock;
110 struct tasklet_struct tx_release_tasklet;
111 struct napi_struct napi;
112 struct cfv_napi_context ctx;
113 u16 tx_hr;
114 u16 rx_hr;
115 u16 tx_tr;
116 u16 rx_tr;
117 u32 mtu;
118 u32 mru;
119 size_t allocsz;
120 void *alloc_addr;
121 dma_addr_t alloc_dma;
122 struct gen_pool *genpool;
123 unsigned long reserved_mem;
124 size_t reserved_size;
125 struct cfv_stats stats;
126 struct dentry *debugfs;
127};
128
129/* struct buf_info - maintains transmit buffer data handle
130 * @size: size of transmit buffer
131 * @dma_handle: handle to allocated dma device memory area
132 * @vaddr: virtual address mapping to allocated memory area
133 */
134struct buf_info {
135 size_t size;
136 u8 *vaddr;
137};
138
139/* Called from virtio device, in IRQ context */
140static void cfv_release_cb(struct virtqueue *vq_tx)
141{
142 struct cfv_info *cfv = vq_tx->vdev->priv;
143
144 ++cfv->stats.tx_kicks;
145 tasklet_schedule(&cfv->tx_release_tasklet);
146}
147
148static void free_buf_info(struct cfv_info *cfv, struct buf_info *buf_info)
149{
150 if (!buf_info)
151 return;
152 gen_pool_free(cfv->genpool, (unsigned long) buf_info->vaddr,
153 buf_info->size);
154 kfree(buf_info);
155}
156
157/* This is invoked whenever the remote processor has completed processing
158 * a TX msg we just sent, and has put the buffer back on the used ring.
159 */
160static void cfv_release_used_buf(struct virtqueue *vq_tx)
161{
162 struct cfv_info *cfv = vq_tx->vdev->priv;
163 unsigned long flags;
164
165 BUG_ON(vq_tx != cfv->vq_tx);
166
167 for (;;) {
168 unsigned int len;
169 struct buf_info *buf_info;
170
171 /* Get used buffer from used ring to recycle used descriptors */
172 spin_lock_irqsave(&cfv->tx_lock, flags);
173 buf_info = virtqueue_get_buf(vq_tx, &len);
174 spin_unlock_irqrestore(&cfv->tx_lock, flags);
175
176 /* Stop looping if there are no more buffers to free */
177 if (!buf_info)
178 break;
179
180 free_buf_info(cfv, buf_info);
181
182 /* watermark_tx indicates if we previously stopped the tx
183		 * queues. If we have enough free slots in the virtio ring,
184 * re-establish memory reserved and open up tx queues.
185 */
186 if (cfv->vq_tx->num_free <= cfv->watermark_tx)
187 continue;
188
189 /* Re-establish memory reserve */
190 if (cfv->reserved_mem == 0 && cfv->genpool)
191 cfv->reserved_mem =
192 gen_pool_alloc(cfv->genpool,
193 cfv->reserved_size);
194
195 /* Open up the tx queues */
196 if (cfv->reserved_mem) {
197 cfv->watermark_tx =
198 virtqueue_get_vring_size(cfv->vq_tx);
199 netif_tx_wake_all_queues(cfv->ndev);
200 /* Buffers are recycled in cfv_netdev_tx, so
201 * disable notifications when queues are opened.
202 */
203 virtqueue_disable_cb(cfv->vq_tx);
204 ++cfv->stats.tx_flow_on;
205 } else {
206 /* if no memory reserve, wait for more free slots */
207 WARN_ON(cfv->watermark_tx >
208 virtqueue_get_vring_size(cfv->vq_tx));
209 cfv->watermark_tx +=
210 virtqueue_get_vring_size(cfv->vq_tx) / 4;
211 }
212 }
213}
214
215/* Allocate a SKB and copy packet data to it */
216static struct sk_buff *cfv_alloc_and_copy_skb(int *err,
217 struct cfv_info *cfv,
218 u8 *frm, u32 frm_len)
219{
220 struct sk_buff *skb;
221 u32 cfpkt_len, pad_len;
222
223 *err = 0;
224	/* Verify that the frame fits the MRU and exceeds the down-link headers */
225 if (frm_len > cfv->mru || frm_len <= cfv->rx_hr + cfv->rx_tr) {
226 netdev_err(cfv->ndev,
227 "Invalid frmlen:%u mtu:%u hr:%d tr:%d\n",
228 frm_len, cfv->mru, cfv->rx_hr,
229 cfv->rx_tr);
230 *err = -EPROTO;
231 return NULL;
232 }
233
234 cfpkt_len = frm_len - (cfv->rx_hr + cfv->rx_tr);
235 pad_len = (unsigned long)(frm + cfv->rx_hr) & (IP_HDR_ALIGN - 1);
236
237 skb = netdev_alloc_skb(cfv->ndev, frm_len + pad_len);
238 if (!skb) {
239 *err = -ENOMEM;
240 return NULL;
241 }
242
243 skb_reserve(skb, cfv->rx_hr + pad_len);
244
245 memcpy(skb_put(skb, cfpkt_len), frm + cfv->rx_hr, cfpkt_len);
246 return skb;
247}
248
249/* Get packets from the host vring */
250static int cfv_rx_poll(struct napi_struct *napi, int quota)
251{
252 struct cfv_info *cfv = container_of(napi, struct cfv_info, napi);
253 int rxcnt = 0;
254 int err = 0;
255 void *buf;
256 struct sk_buff *skb;
257 struct vringh_kiov *riov = &cfv->ctx.riov;
258 unsigned int skb_len;
259
260again:
261 do {
262 skb = NULL;
263
264 /* Put the previous iovec back on the used ring and
265 * fetch a new iovec if we have processed all elements.
266 */
267 if (riov->i == riov->used) {
268 if (cfv->ctx.head != USHRT_MAX) {
269 vringh_complete_kern(cfv->vr_rx,
270 cfv->ctx.head,
271 0);
272 cfv->ctx.head = USHRT_MAX;
273 }
274
275 err = vringh_getdesc_kern(
276 cfv->vr_rx,
277 riov,
278 NULL,
279 &cfv->ctx.head,
280 GFP_ATOMIC);
281
282 if (err <= 0)
283 goto exit;
284 }
285
286 buf = phys_to_virt((unsigned long) riov->iov[riov->i].iov_base);
287 /* TODO: Add check on valid buffer address */
288
289 skb = cfv_alloc_and_copy_skb(&err, cfv, buf,
290 riov->iov[riov->i].iov_len);
291 if (unlikely(err))
292 goto exit;
293
294 /* Push received packet up the stack. */
295 skb_len = skb->len;
296 skb->protocol = htons(ETH_P_CAIF);
297 skb_reset_mac_header(skb);
298 skb->dev = cfv->ndev;
299 err = netif_receive_skb(skb);
300 if (unlikely(err)) {
301 ++cfv->ndev->stats.rx_dropped;
302 } else {
303 ++cfv->ndev->stats.rx_packets;
304 cfv->ndev->stats.rx_bytes += skb_len;
305 }
306
307 ++riov->i;
308 ++rxcnt;
309 } while (rxcnt < quota);
310
311 ++cfv->stats.rx_napi_resched;
312 goto out;
313
314exit:
315 switch (err) {
316 case 0:
317 ++cfv->stats.rx_napi_complete;
318
319		/* Really out of packets? (stolen from virtio_net) */
320 napi_complete(napi);
321 if (unlikely(!vringh_notify_enable_kern(cfv->vr_rx)) &&
322 napi_schedule_prep(napi)) {
323 vringh_notify_disable_kern(cfv->vr_rx);
324 __napi_schedule(napi);
325 goto again;
326 }
327 break;
328
329 case -ENOMEM:
330 ++cfv->stats.rx_nomem;
331 dev_kfree_skb(skb);
332 /* Stop NAPI poll on OOM, we hope to be polled later */
333 napi_complete(napi);
334 vringh_notify_enable_kern(cfv->vr_rx);
335 break;
336
337 default:
338 /* We're doomed, any modem fault is fatal */
339 netdev_warn(cfv->ndev, "Bad ring, disable device\n");
340 cfv->ndev->stats.rx_dropped = riov->used - riov->i;
341 napi_complete(napi);
342 vringh_notify_disable_kern(cfv->vr_rx);
343 netif_carrier_off(cfv->ndev);
344 break;
345 }
346out:
347 if (rxcnt && vringh_need_notify_kern(cfv->vr_rx) > 0)
348 vringh_notify(cfv->vr_rx);
349 return rxcnt;
350}
351
352static void cfv_recv(struct virtio_device *vdev, struct vringh *vr_rx)
353{
354 struct cfv_info *cfv = vdev->priv;
355
356 ++cfv->stats.rx_kicks;
357 vringh_notify_disable_kern(cfv->vr_rx);
358 napi_schedule(&cfv->napi);
359}
360
361static void cfv_destroy_genpool(struct cfv_info *cfv)
362{
363 if (cfv->alloc_addr)
364 dma_free_coherent(cfv->vdev->dev.parent->parent,
365 cfv->allocsz, cfv->alloc_addr,
366 cfv->alloc_dma);
367
368 if (!cfv->genpool)
369 return;
370 gen_pool_free(cfv->genpool, cfv->reserved_mem,
371 cfv->reserved_size);
372 gen_pool_destroy(cfv->genpool);
373 cfv->genpool = NULL;
374}
375
376static int cfv_create_genpool(struct cfv_info *cfv)
377{
378 int err;
379
380 /* dma_alloc can only allocate whole pages, and we need a more
381	 * fine-grained allocation so we use genpool. We ask for space needed
382	 * by IP and a full ring. If the dma allocation fails we retry with a
383 * smaller allocation size.
384 */
385 err = -ENOMEM;
386 cfv->allocsz = (virtqueue_get_vring_size(cfv->vq_tx) *
387 (ETH_DATA_LEN + cfv->tx_hr + cfv->tx_tr) * 11)/10;
388 if (cfv->allocsz <= (num_possible_cpus() + 1) * cfv->ndev->mtu)
389 return -EINVAL;
390
391 for (;;) {
392 if (cfv->allocsz <= num_possible_cpus() * cfv->ndev->mtu) {
393 netdev_info(cfv->ndev, "Not enough device memory\n");
394 return -ENOMEM;
395 }
396
397 cfv->alloc_addr = dma_alloc_coherent(
398 cfv->vdev->dev.parent->parent,
399 cfv->allocsz, &cfv->alloc_dma,
400 GFP_ATOMIC);
401 if (cfv->alloc_addr)
402 break;
403
404 cfv->allocsz = (cfv->allocsz * 3) >> 2;
405 }
406
407 netdev_dbg(cfv->ndev, "Allocated %zd bytes from dma-memory\n",
408 cfv->allocsz);
409
410	/* Allocate on 128-byte boundaries (1 << 7) */
411 cfv->genpool = gen_pool_create(7, -1);
412 if (!cfv->genpool)
413 goto err;
414
415 err = gen_pool_add_virt(cfv->genpool, (unsigned long)cfv->alloc_addr,
416 (phys_addr_t)virt_to_phys(cfv->alloc_addr),
417 cfv->allocsz, -1);
418 if (err)
419 goto err;
420
421 /* Reserve some memory for low memory situations. If we hit the roof
422 * in the memory pool, we stop TX flow and release the reserve.
423 */
424 cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu;
425 cfv->reserved_mem = gen_pool_alloc(cfv->genpool,
426 cfv->reserved_size);
427 if (!cfv->reserved_mem) {
428 err = -ENOMEM;
429 goto err;
430 }
431
432 cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx);
433 return 0;
434err:
435 cfv_destroy_genpool(cfv);
436 return err;
437}
438
439/* Enable the CAIF interface and allocate the memory-pool */
440static int cfv_netdev_open(struct net_device *netdev)
441{
442 struct cfv_info *cfv = netdev_priv(netdev);
443
444 if (cfv_create_genpool(cfv))
445 return -ENOMEM;
446
447 netif_carrier_on(netdev);
448 napi_enable(&cfv->napi);
449
450 /* Schedule NAPI to read any pending packets */
451 napi_schedule(&cfv->napi);
452 return 0;
453}
454
455/* Disable the CAIF interface and free the memory-pool */
456static int cfv_netdev_close(struct net_device *netdev)
457{
458 struct cfv_info *cfv = netdev_priv(netdev);
459 unsigned long flags;
460 struct buf_info *buf_info;
461
462 /* Disable interrupts, queues and NAPI polling */
463 netif_carrier_off(netdev);
464 virtqueue_disable_cb(cfv->vq_tx);
465 vringh_notify_disable_kern(cfv->vr_rx);
466 napi_disable(&cfv->napi);
467
468	/* Release any TX buffers on both used and available rings */
469 cfv_release_used_buf(cfv->vq_tx);
470 spin_lock_irqsave(&cfv->tx_lock, flags);
471 while ((buf_info = virtqueue_detach_unused_buf(cfv->vq_tx)))
472 free_buf_info(cfv, buf_info);
473 spin_unlock_irqrestore(&cfv->tx_lock, flags);
474
475 /* Release all dma allocated memory and destroy the pool */
476 cfv_destroy_genpool(cfv);
477 return 0;
478}
479
480/* Allocate a buffer in dma-memory and copy skb to it */
481static struct buf_info *cfv_alloc_and_copy_to_shm(struct cfv_info *cfv,
482 struct sk_buff *skb,
483 struct scatterlist *sg)
484{
485 struct caif_payload_info *info = (void *)&skb->cb;
486 struct buf_info *buf_info = NULL;
487 u8 pad_len, hdr_ofs;
488
489 if (!cfv->genpool)
490 goto err;
491
492 if (unlikely(cfv->tx_hr + skb->len + cfv->tx_tr > cfv->mtu)) {
493 netdev_warn(cfv->ndev, "Invalid packet len (%d > %d)\n",
494 cfv->tx_hr + skb->len + cfv->tx_tr, cfv->mtu);
495 goto err;
496 }
497
498 buf_info = kmalloc(sizeof(struct buf_info), GFP_ATOMIC);
499 if (unlikely(!buf_info))
500 goto err;
501
502	/* Make the IP header aligned in the buffer */
503 hdr_ofs = cfv->tx_hr + info->hdr_len;
504 pad_len = hdr_ofs & (IP_HDR_ALIGN - 1);
505 buf_info->size = cfv->tx_hr + skb->len + cfv->tx_tr + pad_len;
506
507 /* allocate dma memory buffer */
508 buf_info->vaddr = (void *)gen_pool_alloc(cfv->genpool, buf_info->size);
509 if (unlikely(!buf_info->vaddr))
510 goto err;
511
512 /* copy skbuf contents to send buffer */
513 skb_copy_bits(skb, 0, buf_info->vaddr + cfv->tx_hr + pad_len, skb->len);
514 sg_init_one(sg, buf_info->vaddr + pad_len,
515 skb->len + cfv->tx_hr + cfv->rx_hr);
516
517 return buf_info;
518err:
519 kfree(buf_info);
520 return NULL;
521}
522
523/* Put the CAIF packet on the virtio ring and kick the receiver */
524static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev)
525{
526 struct cfv_info *cfv = netdev_priv(netdev);
527 struct buf_info *buf_info;
528 struct scatterlist sg;
529 unsigned long flags;
530 bool flow_off = false;
531 int ret;
532
533 /* garbage collect released buffers */
534 cfv_release_used_buf(cfv->vq_tx);
535 spin_lock_irqsave(&cfv->tx_lock, flags);
536
537	/* The flow-off check takes the number of CPUs into account to make
538	 * sure the virtqueue cannot be overfilled in any possible SMP condition.
539	 *
540	 * Flow-on is triggered when sufficient buffers are freed.
541 */
542 if (unlikely(cfv->vq_tx->num_free <= num_present_cpus())) {
543 flow_off = true;
544 cfv->stats.tx_full_ring++;
545 }
546
547 /* If we run out of memory, we release the memory reserve and retry
548 * allocation.
549 */
550 buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
551 if (unlikely(!buf_info)) {
552 cfv->stats.tx_no_mem++;
553 flow_off = true;
554
555 if (cfv->reserved_mem && cfv->genpool) {
556 gen_pool_free(cfv->genpool, cfv->reserved_mem,
557 cfv->reserved_size);
558 cfv->reserved_mem = 0;
559 buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
560 }
561 }
562
563 if (unlikely(flow_off)) {
564		/* Turn flow on when 1/4 of the descriptors are released */
565 cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx) / 4;
566 /* Enable notifications of recycled TX buffers */
567 virtqueue_enable_cb(cfv->vq_tx);
568 netif_tx_stop_all_queues(netdev);
569 }
570
571 if (unlikely(!buf_info)) {
572		/* If the memory reserve does its job, this shouldn't happen */
573 netdev_warn(cfv->ndev, "Out of gen_pool memory\n");
574 goto err;
575 }
576
577 ret = virtqueue_add_outbuf(cfv->vq_tx, &sg, 1, buf_info, GFP_ATOMIC);
578 if (unlikely((ret < 0))) {
579 /* If flow control works, this shouldn't happen */
580 netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n",
581 ret);
582 goto err;
583 }
584
585 /* update netdev statistics */
586 cfv->ndev->stats.tx_packets++;
587 cfv->ndev->stats.tx_bytes += skb->len;
588 spin_unlock_irqrestore(&cfv->tx_lock, flags);
589
590 /* tell the remote processor it has a pending message to read */
591 virtqueue_kick(cfv->vq_tx);
592
593 dev_kfree_skb(skb);
594 return NETDEV_TX_OK;
595err:
596 spin_unlock_irqrestore(&cfv->tx_lock, flags);
597 cfv->ndev->stats.tx_dropped++;
598 free_buf_info(cfv, buf_info);
599 dev_kfree_skb(skb);
600 return NETDEV_TX_OK;
601}
602
603static void cfv_tx_release_tasklet(unsigned long drv)
604{
605 struct cfv_info *cfv = (struct cfv_info *)drv;
606 cfv_release_used_buf(cfv->vq_tx);
607}
608
609static const struct net_device_ops cfv_netdev_ops = {
610 .ndo_open = cfv_netdev_open,
611 .ndo_stop = cfv_netdev_close,
612 .ndo_start_xmit = cfv_netdev_tx,
613};
614
615static void cfv_netdev_setup(struct net_device *netdev)
616{
617 netdev->netdev_ops = &cfv_netdev_ops;
618 netdev->type = ARPHRD_CAIF;
619 netdev->tx_queue_len = 100;
620 netdev->flags = IFF_POINTOPOINT | IFF_NOARP;
621 netdev->mtu = CFV_DEF_MTU_SIZE;
622 netdev->destructor = free_netdev;
623}
624
625/* Create debugfs counters for the device */
626static inline void debugfs_init(struct cfv_info *cfv)
627{
628 cfv->debugfs =
629 debugfs_create_dir(netdev_name(cfv->ndev), NULL);
630
631 if (IS_ERR(cfv->debugfs))
632 return;
633
634 debugfs_create_u32("rx-napi-complete", S_IRUSR, cfv->debugfs,
635 &cfv->stats.rx_napi_complete);
636 debugfs_create_u32("rx-napi-resched", S_IRUSR, cfv->debugfs,
637 &cfv->stats.rx_napi_resched);
638 debugfs_create_u32("rx-nomem", S_IRUSR, cfv->debugfs,
639 &cfv->stats.rx_nomem);
640 debugfs_create_u32("rx-kicks", S_IRUSR, cfv->debugfs,
641 &cfv->stats.rx_kicks);
642 debugfs_create_u32("tx-full-ring", S_IRUSR, cfv->debugfs,
643 &cfv->stats.tx_full_ring);
644 debugfs_create_u32("tx-no-mem", S_IRUSR, cfv->debugfs,
645 &cfv->stats.tx_no_mem);
646 debugfs_create_u32("tx-kicks", S_IRUSR, cfv->debugfs,
647 &cfv->stats.tx_kicks);
648 debugfs_create_u32("tx-flow-on", S_IRUSR, cfv->debugfs,
649 &cfv->stats.tx_flow_on);
650}
651
652/* Set up CAIF for a virtio device */
653static int cfv_probe(struct virtio_device *vdev)
654{
655 vq_callback_t *vq_cbs = cfv_release_cb;
656 vrh_callback_t *vrh_cbs = cfv_recv;
657 const char *names = "output";
658 const char *cfv_netdev_name = "cfvrt";
659 struct net_device *netdev;
660 struct cfv_info *cfv;
661 int err = -EINVAL;
662
663 netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name,
664 cfv_netdev_setup);
665 if (!netdev)
666 return -ENOMEM;
667
668 cfv = netdev_priv(netdev);
669 cfv->vdev = vdev;
670 cfv->ndev = netdev;
671
672 spin_lock_init(&cfv->tx_lock);
673
674 /* Get the RX virtio ring. This is a "host side vring". */
675 err = -ENODEV;
676 if (!vdev->vringh_config || !vdev->vringh_config->find_vrhs)
677 goto err;
678
679 err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs);
680 if (err)
681 goto err;
682
683 /* Get the TX virtio ring. This is a "guest side vring". */
684 err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names);
685 if (err)
686 goto err;
687
688 /* Get the CAIF configuration from virtio config space, if available */
689#define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \
690 ((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \
691 &_var, \
692 FIELD_SIZEOF(struct virtio_caif_transf_config, _f)))
693
694 if (vdev->config->get) {
695 GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom);
696 GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom);
697 GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom);
698 GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom);
699 GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu);
700 GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu);
701 } else {
702 cfv->tx_hr = CFV_DEF_HEADROOM;
703 cfv->rx_hr = CFV_DEF_HEADROOM;
704 cfv->tx_tr = CFV_DEF_TAILROOM;
705 cfv->rx_tr = CFV_DEF_TAILROOM;
706 cfv->mtu = CFV_DEF_MTU_SIZE;
707 cfv->mru = CFV_DEF_MTU_SIZE;
708 }
709
710 netdev->needed_headroom = cfv->tx_hr;
711 netdev->needed_tailroom = cfv->tx_tr;
712
713 /* Disable buffer release interrupts unless we have stopped TX queues */
714 virtqueue_disable_cb(cfv->vq_tx);
715
716 netdev->mtu = cfv->mtu - cfv->tx_tr;
717 vdev->priv = cfv;
718
719 /* Initialize NAPI poll context data */
720 vringh_kiov_init(&cfv->ctx.riov, NULL, 0);
721 cfv->ctx.head = USHRT_MAX;
722 netif_napi_add(netdev, &cfv->napi, cfv_rx_poll, CFV_DEFAULT_QUOTA);
723
724 tasklet_init(&cfv->tx_release_tasklet,
725 cfv_tx_release_tasklet,
726 (unsigned long)cfv);
727
728 /* Carrier is off until netdevice is opened */
729 netif_carrier_off(netdev);
730
731 /* register Netdev */
732 err = register_netdev(netdev);
733 if (err) {
734 dev_err(&vdev->dev, "Unable to register netdev (%d)\n", err);
735 goto err;
736 }
737
738 debugfs_init(cfv);
739
740 return 0;
741err:
742 netdev_warn(cfv->ndev, "CAIF Virtio probe failed:%d\n", err);
743
744 if (cfv->vr_rx)
745 vdev->vringh_config->del_vrhs(cfv->vdev);
746 if (cfv->vdev)
747 vdev->config->del_vqs(cfv->vdev);
748 free_netdev(netdev);
749 return err;
750}
751
752static void cfv_remove(struct virtio_device *vdev)
753{
754 struct cfv_info *cfv = vdev->priv;
755
756 rtnl_lock();
757 dev_close(cfv->ndev);
758 rtnl_unlock();
759
760 tasklet_kill(&cfv->tx_release_tasklet);
761 debugfs_remove_recursive(cfv->debugfs);
762
763 vringh_kiov_cleanup(&cfv->ctx.riov);
764 vdev->config->reset(vdev);
765 vdev->vringh_config->del_vrhs(cfv->vdev);
766 cfv->vr_rx = NULL;
767 vdev->config->del_vqs(cfv->vdev);
768 unregister_netdev(cfv->ndev);
769}
770
771static struct virtio_device_id id_table[] = {
772 { VIRTIO_ID_CAIF, VIRTIO_DEV_ANY_ID },
773 { 0 },
774};
775
776static unsigned int features[] = {
777};
778
779static struct virtio_driver caif_virtio_driver = {
780 .feature_table = features,
781 .feature_table_size = ARRAY_SIZE(features),
782 .driver.name = KBUILD_MODNAME,
783 .driver.owner = THIS_MODULE,
784 .id_table = id_table,
785 .probe = cfv_probe,
786 .remove = cfv_remove,
787};
788
789module_virtio_driver(caif_virtio_driver);
790MODULE_DEVICE_TABLE(virtio, id_table);
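
The receive side above is built on the host-side vring ("vringh") API introduced earlier in this series rather than on a virtqueue. Stripped of the NAPI bookkeeping and SKB handling, the consume pattern cfv_rx_poll() follows looks roughly like the sketch below (the helper name is illustrative and error handling is reduced to the bare minimum):

/* Sketch of the vringh consume loop used by cfv_rx_poll() above. */
static void consume_host_vring(struct vringh *vrh, struct vringh_kiov *riov)
{
	u16 head;
	int ret;

	for (;;) {
		/* Pull the next available descriptor chain into riov. */
		ret = vringh_getdesc_kern(vrh, riov, NULL, &head, GFP_ATOMIC);
		if (ret <= 0)
			break;			/* 0: ring empty, <0: bad ring */

		while (riov->i < riov->used) {
			/* riov->iov[riov->i] is one buffer segment; a real
			 * consumer would copy it out here.
			 */
			riov->i++;
		}

		/* Put the chain back on the used ring (nothing written back). */
		vringh_complete_kern(vrh, head, 0);
	}

	/* Notify the other side about consumed buffers if it wants to know. */
	if (vringh_need_notify_kern(vrh) > 0)
		vringh_notify(vrh);
}
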
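
On the transmit side the driver combines a gen_pool reserve with a watermark on free ring descriptors: cfv_netdev_tx() stops the queues when the ring is nearly full or the pool runs dry, and cfv_release_used_buf() reopens them once enough descriptors have been recycled and the reserve has been re-taken. A simplified outline of that policy (cfv_tx_flow_sketch() is a hypothetical helper, not part of the patch):

/* Simplified outline of the TX flow control implemented above. */
static void cfv_tx_flow_sketch(struct cfv_info *cfv, bool alloc_failed)
{
	unsigned int ring_size = virtqueue_get_vring_size(cfv->vq_tx);

	if (cfv->vq_tx->num_free <= num_present_cpus() || alloc_failed) {
		/* Flow off: reopen once 1/4 of the descriptors are free again. */
		cfv->watermark_tx = ring_size / 4;
		virtqueue_enable_cb(cfv->vq_tx);	/* want release callbacks */
		netif_tx_stop_all_queues(cfv->ndev);
	} else if (cfv->vq_tx->num_free > cfv->watermark_tx &&
		   cfv->reserved_mem) {
		/* Flow on: reserve re-established, mute release callbacks. */
		cfv->watermark_tx = ring_size;
		virtqueue_disable_cb(cfv->vq_tx);
		netif_tx_wake_all_queues(cfv->ndev);
	}
}
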
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 50077753a0e5..3c23fdc27bf0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -39,7 +39,6 @@ module_param(gso, bool, 0444);
39#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) 39#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
40#define GOOD_COPY_LEN 128 40#define GOOD_COPY_LEN 128
41 41
42#define VIRTNET_SEND_COMMAND_SG_MAX 2
43#define VIRTNET_DRIVER_VERSION "1.0.0" 42#define VIRTNET_DRIVER_VERSION "1.0.0"
44 43
45struct virtnet_stats { 44struct virtnet_stats {
@@ -444,7 +443,7 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
444 443
445 skb_to_sgvec(skb, rq->sg + 1, 0, skb->len); 444 skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
446 445
447 err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp); 446 err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
448 if (err < 0) 447 if (err < 0)
449 dev_kfree_skb(skb); 448 dev_kfree_skb(skb);
450 449
@@ -489,8 +488,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
489 488
490 /* chain first in list head */ 489 /* chain first in list head */
491 first->private = (unsigned long)list; 490 first->private = (unsigned long)list;
492 err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2, 491 err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
493 first, gfp); 492 first, gfp);
494 if (err < 0) 493 if (err < 0)
495 give_pages(rq, first); 494 give_pages(rq, first);
496 495
@@ -508,7 +507,7 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
508 507
509 sg_init_one(rq->sg, page_address(page), PAGE_SIZE); 508 sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
510 509
511 err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp); 510 err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp);
512 if (err < 0) 511 if (err < 0)
513 give_pages(rq, page); 512 give_pages(rq, page);
514 513
@@ -582,7 +581,7 @@ static void refill_work(struct work_struct *work)
582 bool still_empty; 581 bool still_empty;
583 int i; 582 int i;
584 583
585 for (i = 0; i < vi->max_queue_pairs; i++) { 584 for (i = 0; i < vi->curr_queue_pairs; i++) {
586 struct receive_queue *rq = &vi->rq[i]; 585 struct receive_queue *rq = &vi->rq[i];
587 586
588 napi_disable(&rq->napi); 587 napi_disable(&rq->napi);
@@ -637,7 +636,7 @@ static int virtnet_open(struct net_device *dev)
637 struct virtnet_info *vi = netdev_priv(dev); 636 struct virtnet_info *vi = netdev_priv(dev);
638 int i; 637 int i;
639 638
640 for (i = 0; i < vi->max_queue_pairs; i++) { 639 for (i = 0; i < vi->curr_queue_pairs; i++) {
641 /* Make sure we have some buffers: if oom use wq. */ 640 /* Make sure we have some buffers: if oom use wq. */
642 if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) 641 if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
643 schedule_delayed_work(&vi->refill, 0); 642 schedule_delayed_work(&vi->refill, 0);
@@ -711,8 +710,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
711 sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr); 710 sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);
712 711
713 num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1; 712 num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
714 return virtqueue_add_buf(sq->vq, sq->sg, num_sg, 713 return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
715 0, skb, GFP_ATOMIC);
716} 714}
717 715
718static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) 716static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -767,32 +765,35 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
767 * never fail unless improperly formated. 765 * never fail unless improperly formated.
768 */ 766 */
769static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, 767static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
770 struct scatterlist *data, int out, int in) 768 struct scatterlist *out,
769 struct scatterlist *in)
771{ 770{
772 struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; 771 struct scatterlist *sgs[4], hdr, stat;
773 struct virtio_net_ctrl_hdr ctrl; 772 struct virtio_net_ctrl_hdr ctrl;
774 virtio_net_ctrl_ack status = ~0; 773 virtio_net_ctrl_ack status = ~0;
775 unsigned int tmp; 774 unsigned out_num = 0, in_num = 0, tmp;
776 int i;
777 775
778 /* Caller should know better */ 776 /* Caller should know better */
779 BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) || 777 BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
780 (out + in > VIRTNET_SEND_COMMAND_SG_MAX));
781
782 out++; /* Add header */
783 in++; /* Add return status */
784 778
785 ctrl.class = class; 779 ctrl.class = class;
786 ctrl.cmd = cmd; 780 ctrl.cmd = cmd;
781 /* Add header */
782 sg_init_one(&hdr, &ctrl, sizeof(ctrl));
783 sgs[out_num++] = &hdr;
787 784
788 sg_init_table(sg, out + in); 785 if (out)
786 sgs[out_num++] = out;
787 if (in)
788 sgs[out_num + in_num++] = in;
789 789
790 sg_set_buf(&sg[0], &ctrl, sizeof(ctrl)); 790 /* Add return status. */
791 for_each_sg(data, s, out + in - 2, i) 791 sg_init_one(&stat, &status, sizeof(status));
792 sg_set_buf(&sg[i + 1], sg_virt(s), s->length); 792 sgs[out_num + in_num++] = &stat;
793 sg_set_buf(&sg[out + in - 1], &status, sizeof(status));
794 793
795 BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0); 794 BUG_ON(out_num + in_num > ARRAY_SIZE(sgs));
795 BUG_ON(virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC)
796 < 0);
796 797
797 virtqueue_kick(vi->cvq); 798 virtqueue_kick(vi->cvq);
798 799
@@ -821,7 +822,7 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
821 sg_init_one(&sg, addr->sa_data, dev->addr_len); 822 sg_init_one(&sg, addr->sa_data, dev->addr_len);
822 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, 823 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
823 VIRTIO_NET_CTRL_MAC_ADDR_SET, 824 VIRTIO_NET_CTRL_MAC_ADDR_SET,
824 &sg, 1, 0)) { 825 &sg, NULL)) {
825 dev_warn(&vdev->dev, 826 dev_warn(&vdev->dev,
826 "Failed to set mac address by vq command.\n"); 827 "Failed to set mac address by vq command.\n");
827 return -EINVAL; 828 return -EINVAL;
@@ -889,8 +890,7 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi)
889{ 890{
890 rtnl_lock(); 891 rtnl_lock();
891 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE, 892 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
892 VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, 893 VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, NULL))
893 0, 0))
894 dev_warn(&vi->dev->dev, "Failed to ack link announce.\n"); 894 dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
895 rtnl_unlock(); 895 rtnl_unlock();
896} 896}
@@ -900,6 +900,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
900 struct scatterlist sg; 900 struct scatterlist sg;
901 struct virtio_net_ctrl_mq s; 901 struct virtio_net_ctrl_mq s;
902 struct net_device *dev = vi->dev; 902 struct net_device *dev = vi->dev;
903 int i;
903 904
904 if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) 905 if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
905 return 0; 906 return 0;
@@ -908,12 +909,16 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
908 sg_init_one(&sg, &s, sizeof(s)); 909 sg_init_one(&sg, &s, sizeof(s));
909 910
910 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, 911 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
911 VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)){ 912 VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, NULL)) {
912 dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", 913 dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
913 queue_pairs); 914 queue_pairs);
914 return -EINVAL; 915 return -EINVAL;
915 } else 916 } else {
917 for (i = vi->curr_queue_pairs; i < queue_pairs; i++)
918 if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
919 schedule_delayed_work(&vi->refill, 0);
916 vi->curr_queue_pairs = queue_pairs; 920 vi->curr_queue_pairs = queue_pairs;
921 }
917 922
918 return 0; 923 return 0;
919} 924}
@@ -955,7 +960,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
955 960
956 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, 961 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
957 VIRTIO_NET_CTRL_RX_PROMISC, 962 VIRTIO_NET_CTRL_RX_PROMISC,
958 sg, 1, 0)) 963 sg, NULL))
959 dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", 964 dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
960 promisc ? "en" : "dis"); 965 promisc ? "en" : "dis");
961 966
@@ -963,7 +968,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
963 968
964 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, 969 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
965 VIRTIO_NET_CTRL_RX_ALLMULTI, 970 VIRTIO_NET_CTRL_RX_ALLMULTI,
966 sg, 1, 0)) 971 sg, NULL))
967 dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", 972 dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
968 allmulti ? "en" : "dis"); 973 allmulti ? "en" : "dis");
969 974
@@ -1000,7 +1005,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
1000 1005
1001 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, 1006 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1002 VIRTIO_NET_CTRL_MAC_TABLE_SET, 1007 VIRTIO_NET_CTRL_MAC_TABLE_SET,
1003 sg, 2, 0)) 1008 sg, NULL))
1004 dev_warn(&dev->dev, "Failed to set MAC fitler table.\n"); 1009 dev_warn(&dev->dev, "Failed to set MAC fitler table.\n");
1005 1010
1006 kfree(buf); 1011 kfree(buf);
@@ -1015,7 +1020,7 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev,
1015 sg_init_one(&sg, &vid, sizeof(vid)); 1020 sg_init_one(&sg, &vid, sizeof(vid));
1016 1021
1017 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, 1022 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1018 VIRTIO_NET_CTRL_VLAN_ADD, &sg, 1, 0)) 1023 VIRTIO_NET_CTRL_VLAN_ADD, &sg, NULL))
1019 dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid); 1024 dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
1020 return 0; 1025 return 0;
1021} 1026}
@@ -1029,7 +1034,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
1029 sg_init_one(&sg, &vid, sizeof(vid)); 1034 sg_init_one(&sg, &vid, sizeof(vid));
1030 1035
1031 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, 1036 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1032 VIRTIO_NET_CTRL_VLAN_DEL, &sg, 1, 0)) 1037 VIRTIO_NET_CTRL_VLAN_DEL, &sg, NULL))
1033 dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); 1038 dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
1034 return 0; 1039 return 0;
1035} 1040}
@@ -1570,7 +1575,7 @@ static int virtnet_probe(struct virtio_device *vdev)
1570 } 1575 }
1571 1576
1572 /* Last of all, set up some receive buffers. */ 1577 /* Last of all, set up some receive buffers. */
1573 for (i = 0; i < vi->max_queue_pairs; i++) { 1578 for (i = 0; i < vi->curr_queue_pairs; i++) {
1574 try_fill_recv(&vi->rq[i], GFP_KERNEL); 1579 try_fill_recv(&vi->rq[i], GFP_KERNEL);
1575 1580
1576 /* If we didn't even get one input buffer, we're useless. */ 1581 /* If we didn't even get one input buffer, we're useless. */
@@ -1694,7 +1699,7 @@ static int virtnet_restore(struct virtio_device *vdev)
1694 1699
1695 netif_device_attach(vi->dev); 1700 netif_device_attach(vi->dev);
1696 1701
1697 for (i = 0; i < vi->max_queue_pairs; i++) 1702 for (i = 0; i < vi->curr_queue_pairs; i++)
1698 if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) 1703 if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
1699 schedule_delayed_work(&vi->refill, 0); 1704 schedule_delayed_work(&vi->refill, 0);
1700 1705
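
Most of the churn in this file is mechanical: virtqueue_add_buf(vq, sg, out, in, data, gfp), which took one flat scatterlist with the readable entries first, becomes virtqueue_add_inbuf()/virtqueue_add_outbuf() for single-direction buffers and virtqueue_add_sgs() where a request mixes directions, as in the control queue. A condensed sketch of the new convention, modelled on virtnet_send_command() above (the helper name and parameters are illustrative):

/* Sketch of the virtqueue_add_sgs() calling convention used above. */
static int ctrl_add_sketch(struct virtqueue *cvq,
			   struct virtio_net_ctrl_hdr *ctrl,
			   struct scatterlist *payload,	/* may be NULL */
			   virtio_net_ctrl_ack *status,
			   void *token)
{
	struct scatterlist hdr, stat, *sgs[3];
	unsigned int out_num = 0, in_num = 0;

	/* Device-readable entries come first in sgs[]. */
	sg_init_one(&hdr, ctrl, sizeof(*ctrl));
	sgs[out_num++] = &hdr;
	if (payload)
		sgs[out_num++] = payload;

	/* Device-writable entries follow. */
	sg_init_one(&stat, status, sizeof(*status));
	sgs[out_num + in_num++] = &stat;

	/* out_num readable scatterlists, then in_num writable ones. */
	return virtqueue_add_sgs(cvq, sgs, out_num, in_num, token, GFP_ATOMIC);
}
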
diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c
index 7861f1119b7d..56fceafec9ec 100644
--- a/drivers/rpmsg/virtio_rpmsg_bus.c
+++ b/drivers/rpmsg/virtio_rpmsg_bus.c
@@ -757,14 +757,14 @@ int rpmsg_send_offchannel_raw(struct rpmsg_channel *rpdev, u32 src, u32 dst,
757 mutex_lock(&vrp->tx_lock); 757 mutex_lock(&vrp->tx_lock);
758 758
759 /* add message to the remote processor's virtqueue */ 759 /* add message to the remote processor's virtqueue */
760 err = virtqueue_add_buf(vrp->svq, &sg, 1, 0, msg, GFP_KERNEL); 760 err = virtqueue_add_outbuf(vrp->svq, &sg, 1, msg, GFP_KERNEL);
761 if (err) { 761 if (err) {
762 /* 762 /*
763 * need to reclaim the buffer here, otherwise it's lost 763 * need to reclaim the buffer here, otherwise it's lost
764 * (memory won't leak, but rpmsg won't use it again for TX). 764 * (memory won't leak, but rpmsg won't use it again for TX).
765 * this will wait for a buffer management overhaul. 765 * this will wait for a buffer management overhaul.
766 */ 766 */
767 dev_err(dev, "virtqueue_add_buf failed: %d\n", err); 767 dev_err(dev, "virtqueue_add_outbuf failed: %d\n", err);
768 goto out; 768 goto out;
769 } 769 }
770 770
@@ -839,7 +839,7 @@ static void rpmsg_recv_done(struct virtqueue *rvq)
839 sg_init_one(&sg, msg, RPMSG_BUF_SIZE); 839 sg_init_one(&sg, msg, RPMSG_BUF_SIZE);
840 840
841 /* add the buffer back to the remote processor's virtqueue */ 841 /* add the buffer back to the remote processor's virtqueue */
842 err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, msg, GFP_KERNEL); 842 err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, msg, GFP_KERNEL);
843 if (err < 0) { 843 if (err < 0) {
844 dev_err(dev, "failed to add a virtqueue buffer: %d\n", err); 844 dev_err(dev, "failed to add a virtqueue buffer: %d\n", err);
845 return; 845 return;
@@ -972,7 +972,7 @@ static int rpmsg_probe(struct virtio_device *vdev)
972 972
973 sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE); 973 sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE);
974 974
975 err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, cpu_addr, 975 err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, cpu_addr,
976 GFP_KERNEL); 976 GFP_KERNEL);
977 WARN_ON(err); /* sanity check; this can't really happen */ 977 WARN_ON(err); /* sanity check; this can't really happen */
978 } 978 }
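
The rpmsg conversion shows the two single-direction helpers in their simplest form: a message the remote processor will read is queued with virtqueue_add_outbuf(), a buffer it will fill with virtqueue_add_inbuf(), and both take one scatterlist array plus its length instead of the old out/in count pair. A minimal sketch (the function and buffer names are illustrative):

/* Sketch: queue one RX buffer and one TX message with the new helpers. */
static int rpmsg_queue_sketch(struct virtqueue *rvq, struct virtqueue *svq,
			      void *rx_buf, void *tx_msg, unsigned int buf_len)
{
	struct scatterlist sg;
	int err;

	/* The device writes into this buffer, so it is an "in" buffer. */
	sg_init_one(&sg, rx_buf, buf_len);
	err = virtqueue_add_inbuf(rvq, &sg, 1, rx_buf, GFP_KERNEL);
	if (err < 0)
		return err;

	/* The device only reads this message, so it is an "out" buffer. */
	sg_init_one(&sg, tx_msg, buf_len);
	return virtqueue_add_outbuf(svq, &sg, 1, tx_msg, GFP_KERNEL);
}
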
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 3449a1f8c656..2168258fb2c3 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -13,6 +13,8 @@
13 * 13 *
14 */ 14 */
15 15
16#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
17
16#include <linux/module.h> 18#include <linux/module.h>
17#include <linux/slab.h> 19#include <linux/slab.h>
18#include <linux/mempool.h> 20#include <linux/mempool.h>
@@ -20,12 +22,14 @@
20#include <linux/virtio_ids.h> 22#include <linux/virtio_ids.h>
21#include <linux/virtio_config.h> 23#include <linux/virtio_config.h>
22#include <linux/virtio_scsi.h> 24#include <linux/virtio_scsi.h>
25#include <linux/cpu.h>
23#include <scsi/scsi_host.h> 26#include <scsi/scsi_host.h>
24#include <scsi/scsi_device.h> 27#include <scsi/scsi_device.h>
25#include <scsi/scsi_cmnd.h> 28#include <scsi/scsi_cmnd.h>
26 29
27#define VIRTIO_SCSI_MEMPOOL_SZ 64 30#define VIRTIO_SCSI_MEMPOOL_SZ 64
28#define VIRTIO_SCSI_EVENT_LEN 8 31#define VIRTIO_SCSI_EVENT_LEN 8
32#define VIRTIO_SCSI_VQ_BASE 2
29 33
30/* Command queue element */ 34/* Command queue element */
31struct virtio_scsi_cmd { 35struct virtio_scsi_cmd {
@@ -57,27 +61,61 @@ struct virtio_scsi_vq {
57 struct virtqueue *vq; 61 struct virtqueue *vq;
58}; 62};
59 63
60/* Per-target queue state */ 64/*
65 * Per-target queue state.
66 *
67 * This struct holds the data needed by the queue steering policy. When a
68 * target is sent multiple requests, we need to drive them to the same queue so
69 * that FIFO processing order is kept. However, if a target was idle, we can
70 * choose a queue arbitrarily. In this case the queue is chosen according to
71 * the current VCPU, so the driver expects the number of request queues to be
72 * equal to the number of VCPUs. This makes it easy and fast to select the
73 * queue, and also lets the driver optimize the IRQ affinity for the virtqueues
74 * (each virtqueue's affinity is set to the CPU that "owns" the queue).
75 *
76 * An interesting effect of this policy is that only writes to req_vq need to
77 * take the tgt_lock. Read can be done outside the lock because:
78 *
79 * - writes of req_vq only occur when atomic_inc_return(&tgt->reqs) returns 1.
80 * In that case, no other CPU is reading req_vq: even if they were in
81 * virtscsi_queuecommand_multi, they would be spinning on tgt_lock.
82 *
83 * - reads of req_vq only occur when the target is not idle (reqs != 0).
84 * A CPU that enters virtscsi_queuecommand_multi will not modify req_vq.
85 *
86 * Similarly, decrements of reqs are never concurrent with writes of req_vq.
87 * Thus they can happen outside the tgt_lock, provided of course we make reqs
88 * an atomic_t.
89 */
61struct virtio_scsi_target_state { 90struct virtio_scsi_target_state {
62 /* Protects sg. Lock hierarchy is tgt_lock -> vq_lock. */ 91 /* This spinlock never held at the same time as vq_lock. */
63 spinlock_t tgt_lock; 92 spinlock_t tgt_lock;
64 93
65 /* For sglist construction when adding commands to the virtqueue. */ 94 /* Count of outstanding requests. */
66 struct scatterlist sg[]; 95 atomic_t reqs;
96
97 /* Currently active virtqueue for requests sent to this target. */
98 struct virtio_scsi_vq *req_vq;
67}; 99};
68 100
69/* Driver instance state */ 101/* Driver instance state */
70struct virtio_scsi { 102struct virtio_scsi {
71 struct virtio_device *vdev; 103 struct virtio_device *vdev;
72 104
73 struct virtio_scsi_vq ctrl_vq;
74 struct virtio_scsi_vq event_vq;
75 struct virtio_scsi_vq req_vq;
76
77 /* Get some buffers ready for event vq */ 105 /* Get some buffers ready for event vq */
78 struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN]; 106 struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
79 107
80 struct virtio_scsi_target_state *tgt[]; 108 u32 num_queues;
109
110 /* If the affinity hint is set for virtqueues */
111 bool affinity_hint_set;
112
113 /* CPU hotplug notifier */
114 struct notifier_block nb;
115
116 struct virtio_scsi_vq ctrl_vq;
117 struct virtio_scsi_vq event_vq;
118 struct virtio_scsi_vq req_vqs[];
81}; 119};
82 120
83static struct kmem_cache *virtscsi_cmd_cache; 121static struct kmem_cache *virtscsi_cmd_cache;
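
The steering policy described in the comment above boils down to a few lines: the 0 -> 1 transition of reqs is the only point where req_vq may change, and it happens under tgt_lock, so a busy target can read req_vq without taking the lock. Condensed into a sketch (the real implementation is virtscsi_pick_vq() later in this patch):

/* Condensed form of the queue steering rule described above. */
static struct virtio_scsi_vq *pick_vq_sketch(struct virtio_scsi *vscsi,
					     struct virtio_scsi_target_state *tgt)
{
	struct virtio_scsi_vq *vq;
	unsigned long flags;

	spin_lock_irqsave(&tgt->tgt_lock, flags);
	if (atomic_inc_return(&tgt->reqs) > 1) {
		/* Target already busy: req_vq cannot change until reqs hits 0. */
		vq = ACCESS_ONCE(tgt->req_vq);
	} else {
		/* Target was idle: we own req_vq, steer to this CPU's queue. */
		vq = &vscsi->req_vqs[smp_processor_id() % vscsi->num_queues];
		tgt->req_vq = vq;
	}
	spin_unlock_irqrestore(&tgt->tgt_lock, flags);

	return vq;
}
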
@@ -107,11 +145,13 @@ static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid)
107 * 145 *
108 * Called with vq_lock held. 146 * Called with vq_lock held.
109 */ 147 */
110static void virtscsi_complete_cmd(void *buf) 148static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
111{ 149{
112 struct virtio_scsi_cmd *cmd = buf; 150 struct virtio_scsi_cmd *cmd = buf;
113 struct scsi_cmnd *sc = cmd->sc; 151 struct scsi_cmnd *sc = cmd->sc;
114 struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd; 152 struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
153 struct virtio_scsi_target_state *tgt =
154 scsi_target(sc->device)->hostdata;
115 155
116 dev_dbg(&sc->device->sdev_gendev, 156 dev_dbg(&sc->device->sdev_gendev,
117 "cmd %p response %u status %#02x sense_len %u\n", 157 "cmd %p response %u status %#02x sense_len %u\n",
@@ -166,32 +206,71 @@ static void virtscsi_complete_cmd(void *buf)
166 206
167 mempool_free(cmd, virtscsi_cmd_pool); 207 mempool_free(cmd, virtscsi_cmd_pool);
168 sc->scsi_done(sc); 208 sc->scsi_done(sc);
209
210 atomic_dec(&tgt->reqs);
169} 211}
170 212
171static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf)) 213static void virtscsi_vq_done(struct virtio_scsi *vscsi,
214 struct virtio_scsi_vq *virtscsi_vq,
215 void (*fn)(struct virtio_scsi *vscsi, void *buf))
172{ 216{
173 void *buf; 217 void *buf;
174 unsigned int len; 218 unsigned int len;
219 unsigned long flags;
220 struct virtqueue *vq = virtscsi_vq->vq;
175 221
222 spin_lock_irqsave(&virtscsi_vq->vq_lock, flags);
176 do { 223 do {
177 virtqueue_disable_cb(vq); 224 virtqueue_disable_cb(vq);
178 while ((buf = virtqueue_get_buf(vq, &len)) != NULL) 225 while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
179 fn(buf); 226 fn(vscsi, buf);
180 } while (!virtqueue_enable_cb(vq)); 227 } while (!virtqueue_enable_cb(vq));
228 spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags);
181} 229}
182 230
183static void virtscsi_req_done(struct virtqueue *vq) 231static void virtscsi_req_done(struct virtqueue *vq)
184{ 232{
185 struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); 233 struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
186 struct virtio_scsi *vscsi = shost_priv(sh); 234 struct virtio_scsi *vscsi = shost_priv(sh);
187 unsigned long flags; 235 int index = vq->index - VIRTIO_SCSI_VQ_BASE;
236 struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index];
188 237
189 spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags); 238 /*
190 virtscsi_vq_done(vq, virtscsi_complete_cmd); 239 * Read req_vq before decrementing the reqs field in
191 spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags); 240 * virtscsi_complete_cmd.
241 *
242 * With barriers:
243 *
244 * CPU #0 virtscsi_queuecommand_multi (CPU #1)
245 * ------------------------------------------------------------
246 * lock vq_lock
247 * read req_vq
248 * read reqs (reqs = 1)
249 * write reqs (reqs = 0)
250 * increment reqs (reqs = 1)
251 * write req_vq
252 *
253 * Possible reordering without barriers:
254 *
255 * CPU #0 virtscsi_queuecommand_multi (CPU #1)
256 * ------------------------------------------------------------
257 * lock vq_lock
258 * read reqs (reqs = 1)
259 * write reqs (reqs = 0)
260 * increment reqs (reqs = 1)
261 * write req_vq
262 * read (wrong) req_vq
263 *
264 * We do not need a full smp_rmb, because req_vq is required to get
265 * to tgt->reqs: tgt is &vscsi->tgt[sc->device->id], where sc is stored
266 * in the virtqueue as the user token.
267 */
268 smp_read_barrier_depends();
269
270 virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd);
192}; 271};
193 272
194static void virtscsi_complete_free(void *buf) 273static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
195{ 274{
196 struct virtio_scsi_cmd *cmd = buf; 275 struct virtio_scsi_cmd *cmd = buf;
197 276
@@ -205,11 +284,8 @@ static void virtscsi_ctrl_done(struct virtqueue *vq)
205{ 284{
206 struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); 285 struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
207 struct virtio_scsi *vscsi = shost_priv(sh); 286 struct virtio_scsi *vscsi = shost_priv(sh);
208 unsigned long flags;
209 287
210 spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags); 288 virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free);
211 virtscsi_vq_done(vq, virtscsi_complete_free);
212 spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags);
213}; 289};
214 290
215static int virtscsi_kick_event(struct virtio_scsi *vscsi, 291static int virtscsi_kick_event(struct virtio_scsi *vscsi,
@@ -223,8 +299,8 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi,
223 299
224 spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); 300 spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
225 301
226 err = virtqueue_add_buf(vscsi->event_vq.vq, &sg, 0, 1, event_node, 302 err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node,
227 GFP_ATOMIC); 303 GFP_ATOMIC);
228 if (!err) 304 if (!err)
229 virtqueue_kick(vscsi->event_vq.vq); 305 virtqueue_kick(vscsi->event_vq.vq);
230 306
@@ -254,7 +330,7 @@ static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi)
254} 330}
255 331
256static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi, 332static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi,
257 struct virtio_scsi_event *event) 333 struct virtio_scsi_event *event)
258{ 334{
259 struct scsi_device *sdev; 335 struct scsi_device *sdev;
260 struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); 336 struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
@@ -332,7 +408,7 @@ static void virtscsi_handle_event(struct work_struct *work)
332 virtscsi_kick_event(vscsi, event_node); 408 virtscsi_kick_event(vscsi, event_node);
333} 409}
334 410
335static void virtscsi_complete_event(void *buf) 411static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf)
336{ 412{
337 struct virtio_scsi_event_node *event_node = buf; 413 struct virtio_scsi_event_node *event_node = buf;
338 414
@@ -344,82 +420,65 @@ static void virtscsi_event_done(struct virtqueue *vq)
344{ 420{
345 struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); 421 struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
346 struct virtio_scsi *vscsi = shost_priv(sh); 422 struct virtio_scsi *vscsi = shost_priv(sh);
347 unsigned long flags;
348 423
349 spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); 424 virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event);
350 virtscsi_vq_done(vq, virtscsi_complete_event);
351 spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags);
352}; 425};
353 426
354static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx,
355 struct scsi_data_buffer *sdb)
356{
357 struct sg_table *table = &sdb->table;
358 struct scatterlist *sg_elem;
359 unsigned int idx = *p_idx;
360 int i;
361
362 for_each_sg(table->sgl, sg_elem, table->nents, i)
363 sg[idx++] = *sg_elem;
364
365 *p_idx = idx;
366}
367
368/** 427/**
369 * virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist 428 * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue
370 * @vscsi : virtio_scsi state 429 * @vq : the struct virtqueue we're talking about
371 * @cmd : command structure 430 * @cmd : command structure
372 * @out_num : number of read-only elements
373 * @in_num : number of write-only elements
374 * @req_size : size of the request buffer 431 * @req_size : size of the request buffer
375 * @resp_size : size of the response buffer 432 * @resp_size : size of the response buffer
376 * 433 * @gfp : flags to use for memory allocations
377 * Called with tgt_lock held.
378 */ 434 */
379static void virtscsi_map_cmd(struct virtio_scsi_target_state *tgt, 435static int virtscsi_add_cmd(struct virtqueue *vq,
380 struct virtio_scsi_cmd *cmd, 436 struct virtio_scsi_cmd *cmd,
381 unsigned *out_num, unsigned *in_num, 437 size_t req_size, size_t resp_size, gfp_t gfp)
382 size_t req_size, size_t resp_size)
383{ 438{
384 struct scsi_cmnd *sc = cmd->sc; 439 struct scsi_cmnd *sc = cmd->sc;
385 struct scatterlist *sg = tgt->sg; 440 struct scatterlist *sgs[4], req, resp;
386 unsigned int idx = 0; 441 struct sg_table *out, *in;
442 unsigned out_num = 0, in_num = 0;
443
444 out = in = NULL;
445
446 if (sc && sc->sc_data_direction != DMA_NONE) {
447 if (sc->sc_data_direction != DMA_FROM_DEVICE)
448 out = &scsi_out(sc)->table;
449 if (sc->sc_data_direction != DMA_TO_DEVICE)
450 in = &scsi_in(sc)->table;
451 }
387 452
388 /* Request header. */ 453 /* Request header. */
389 sg_set_buf(&sg[idx++], &cmd->req, req_size); 454 sg_init_one(&req, &cmd->req, req_size);
455 sgs[out_num++] = &req;
390 456
391 /* Data-out buffer. */ 457 /* Data-out buffer. */
392 if (sc && sc->sc_data_direction != DMA_FROM_DEVICE) 458 if (out)
393 virtscsi_map_sgl(sg, &idx, scsi_out(sc)); 459 sgs[out_num++] = out->sgl;
394
395 *out_num = idx;
396 460
397 /* Response header. */ 461 /* Response header. */
398 sg_set_buf(&sg[idx++], &cmd->resp, resp_size); 462 sg_init_one(&resp, &cmd->resp, resp_size);
463 sgs[out_num + in_num++] = &resp;
399 464
400 /* Data-in buffer */ 465 /* Data-in buffer */
401 if (sc && sc->sc_data_direction != DMA_TO_DEVICE) 466 if (in)
402 virtscsi_map_sgl(sg, &idx, scsi_in(sc)); 467 sgs[out_num + in_num++] = in->sgl;
403 468
404 *in_num = idx - *out_num; 469 return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, gfp);
405} 470}
406 471
407static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt, 472static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq,
408 struct virtio_scsi_vq *vq,
409 struct virtio_scsi_cmd *cmd, 473 struct virtio_scsi_cmd *cmd,
410 size_t req_size, size_t resp_size, gfp_t gfp) 474 size_t req_size, size_t resp_size, gfp_t gfp)
411{ 475{
412 unsigned int out_num, in_num;
413 unsigned long flags; 476 unsigned long flags;
414 int err; 477 int err;
415 bool needs_kick = false; 478 bool needs_kick = false;
416 479
417 spin_lock_irqsave(&tgt->tgt_lock, flags); 480 spin_lock_irqsave(&vq->vq_lock, flags);
418 virtscsi_map_cmd(tgt, cmd, &out_num, &in_num, req_size, resp_size); 481 err = virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size, gfp);
419
420 spin_lock(&vq->vq_lock);
421 err = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd, gfp);
422 spin_unlock(&tgt->tgt_lock);
423 if (!err) 482 if (!err)
424 needs_kick = virtqueue_kick_prepare(vq->vq); 483 needs_kick = virtqueue_kick_prepare(vq->vq);
425 484
@@ -430,10 +489,10 @@ static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt,
430 return err; 489 return err;
431} 490}
432 491
433static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) 492static int virtscsi_queuecommand(struct virtio_scsi *vscsi,
493 struct virtio_scsi_vq *req_vq,
494 struct scsi_cmnd *sc)
434{ 495{
435 struct virtio_scsi *vscsi = shost_priv(sh);
436 struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
437 struct virtio_scsi_cmd *cmd; 496 struct virtio_scsi_cmd *cmd;
438 int ret; 497 int ret;
439 498
@@ -467,7 +526,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
467 BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE); 526 BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
468 memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len); 527 memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
469 528
470 if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd, 529 if (virtscsi_kick_cmd(req_vq, cmd,
471 sizeof cmd->req.cmd, sizeof cmd->resp.cmd, 530 sizeof cmd->req.cmd, sizeof cmd->resp.cmd,
472 GFP_ATOMIC) == 0) 531 GFP_ATOMIC) == 0)
473 ret = 0; 532 ret = 0;
@@ -478,14 +537,62 @@ out:
478 return ret; 537 return ret;
479} 538}
480 539
540static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
541 struct scsi_cmnd *sc)
542{
543 struct virtio_scsi *vscsi = shost_priv(sh);
544 struct virtio_scsi_target_state *tgt =
545 scsi_target(sc->device)->hostdata;
546
547 atomic_inc(&tgt->reqs);
548 return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
549}
550
551static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
552 struct virtio_scsi_target_state *tgt)
553{
554 struct virtio_scsi_vq *vq;
555 unsigned long flags;
556 u32 queue_num;
557
558 spin_lock_irqsave(&tgt->tgt_lock, flags);
559
560 /*
561 * The memory barrier after atomic_inc_return matches
562 * the smp_read_barrier_depends() in virtscsi_req_done.
563 */
564 if (atomic_inc_return(&tgt->reqs) > 1)
565 vq = ACCESS_ONCE(tgt->req_vq);
566 else {
567 queue_num = smp_processor_id();
568 while (unlikely(queue_num >= vscsi->num_queues))
569 queue_num -= vscsi->num_queues;
570
571 tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
572 }
573
574 spin_unlock_irqrestore(&tgt->tgt_lock, flags);
575 return vq;
576}
577
578static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
579 struct scsi_cmnd *sc)
580{
581 struct virtio_scsi *vscsi = shost_priv(sh);
582 struct virtio_scsi_target_state *tgt =
583 scsi_target(sc->device)->hostdata;
584 struct virtio_scsi_vq *req_vq = virtscsi_pick_vq(vscsi, tgt);
585
586 return virtscsi_queuecommand(vscsi, req_vq, sc);
587}
588
481static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd) 589static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
482{ 590{
483 DECLARE_COMPLETION_ONSTACK(comp); 591 DECLARE_COMPLETION_ONSTACK(comp);
484 struct virtio_scsi_target_state *tgt = vscsi->tgt[cmd->sc->device->id];
485 int ret = FAILED; 592 int ret = FAILED;
486 593
487 cmd->comp = &comp; 594 cmd->comp = &comp;
488 if (virtscsi_kick_cmd(tgt, &vscsi->ctrl_vq, cmd, 595 if (virtscsi_kick_cmd(&vscsi->ctrl_vq, cmd,
489 sizeof cmd->req.tmf, sizeof cmd->resp.tmf, 596 sizeof cmd->req.tmf, sizeof cmd->resp.tmf,
490 GFP_NOIO) < 0) 597 GFP_NOIO) < 0)
491 goto out; 598 goto out;
@@ -547,18 +654,57 @@ static int virtscsi_abort(struct scsi_cmnd *sc)
547 return virtscsi_tmf(vscsi, cmd); 654 return virtscsi_tmf(vscsi, cmd);
548} 655}
549 656
550static struct scsi_host_template virtscsi_host_template = { 657static int virtscsi_target_alloc(struct scsi_target *starget)
658{
659 struct virtio_scsi_target_state *tgt =
660 kmalloc(sizeof(*tgt), GFP_KERNEL);
661 if (!tgt)
662 return -ENOMEM;
663
664 spin_lock_init(&tgt->tgt_lock);
665 atomic_set(&tgt->reqs, 0);
666 tgt->req_vq = NULL;
667
668 starget->hostdata = tgt;
669 return 0;
670}
671
672static void virtscsi_target_destroy(struct scsi_target *starget)
673{
674 struct virtio_scsi_target_state *tgt = starget->hostdata;
675 kfree(tgt);
676}
677
678static struct scsi_host_template virtscsi_host_template_single = {
679 .module = THIS_MODULE,
680 .name = "Virtio SCSI HBA",
681 .proc_name = "virtio_scsi",
682 .this_id = -1,
683 .queuecommand = virtscsi_queuecommand_single,
684 .eh_abort_handler = virtscsi_abort,
685 .eh_device_reset_handler = virtscsi_device_reset,
686
687 .can_queue = 1024,
688 .dma_boundary = UINT_MAX,
689 .use_clustering = ENABLE_CLUSTERING,
690 .target_alloc = virtscsi_target_alloc,
691 .target_destroy = virtscsi_target_destroy,
692};
693
694static struct scsi_host_template virtscsi_host_template_multi = {
551 .module = THIS_MODULE, 695 .module = THIS_MODULE,
552 .name = "Virtio SCSI HBA", 696 .name = "Virtio SCSI HBA",
553 .proc_name = "virtio_scsi", 697 .proc_name = "virtio_scsi",
554 .queuecommand = virtscsi_queuecommand,
555 .this_id = -1, 698 .this_id = -1,
699 .queuecommand = virtscsi_queuecommand_multi,
556 .eh_abort_handler = virtscsi_abort, 700 .eh_abort_handler = virtscsi_abort,
557 .eh_device_reset_handler = virtscsi_device_reset, 701 .eh_device_reset_handler = virtscsi_device_reset,
558 702
559 .can_queue = 1024, 703 .can_queue = 1024,
560 .dma_boundary = UINT_MAX, 704 .dma_boundary = UINT_MAX,
561 .use_clustering = ENABLE_CLUSTERING, 705 .use_clustering = ENABLE_CLUSTERING,
706 .target_alloc = virtscsi_target_alloc,
707 .target_destroy = virtscsi_target_destroy,
562}; 708};
563 709
564#define virtscsi_config_get(vdev, fld) \ 710#define virtscsi_config_get(vdev, fld) \
@@ -578,29 +724,69 @@ static struct scsi_host_template virtscsi_host_template = {
578 &__val, sizeof(__val)); \ 724 &__val, sizeof(__val)); \
579 }) 725 })
580 726
581static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, 727static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
582 struct virtqueue *vq)
583{ 728{
584 spin_lock_init(&virtscsi_vq->vq_lock); 729 int i;
585 virtscsi_vq->vq = vq; 730 int cpu;
731
 732	/* In multiqueue mode, when the number of CPUs is equal
 733	 * to the number of request queues, we let the queues
 734	 * be private to one CPU by setting the affinity hint,
 735	 * to eliminate contention.
736 */
737 if ((vscsi->num_queues == 1 ||
738 vscsi->num_queues != num_online_cpus()) && affinity) {
739 if (vscsi->affinity_hint_set)
740 affinity = false;
741 else
742 return;
743 }
744
745 if (affinity) {
746 i = 0;
747 for_each_online_cpu(cpu) {
748 virtqueue_set_affinity(vscsi->req_vqs[i].vq, cpu);
749 i++;
750 }
751
752 vscsi->affinity_hint_set = true;
753 } else {
754 for (i = 0; i < vscsi->num_queues - VIRTIO_SCSI_VQ_BASE; i++)
755 virtqueue_set_affinity(vscsi->req_vqs[i].vq, -1);
756
757 vscsi->affinity_hint_set = false;
758 }
586} 759}
587 760
588static struct virtio_scsi_target_state *virtscsi_alloc_tgt( 761static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
589 struct virtio_device *vdev, int sg_elems)
590{ 762{
591 struct virtio_scsi_target_state *tgt; 763 get_online_cpus();
592 gfp_t gfp_mask = GFP_KERNEL; 764 __virtscsi_set_affinity(vscsi, affinity);
593 765 put_online_cpus();
594 /* We need extra sg elements at head and tail. */ 766}
595 tgt = kmalloc(sizeof(*tgt) + sizeof(tgt->sg[0]) * (sg_elems + 2),
596 gfp_mask);
597 767
598 if (!tgt) 768static int virtscsi_cpu_callback(struct notifier_block *nfb,
599 return NULL; 769 unsigned long action, void *hcpu)
770{
771 struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
772 switch(action) {
773 case CPU_ONLINE:
774 case CPU_ONLINE_FROZEN:
775 case CPU_DEAD:
776 case CPU_DEAD_FROZEN:
777 __virtscsi_set_affinity(vscsi, true);
778 break;
779 default:
780 break;
781 }
782 return NOTIFY_OK;
783}
600 784
601 spin_lock_init(&tgt->tgt_lock); 785static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
602 sg_init_table(tgt->sg, sg_elems + 2); 786 struct virtqueue *vq)
603 return tgt; 787{
788 spin_lock_init(&virtscsi_vq->vq_lock);
789 virtscsi_vq->vq = vq;
604} 790}
605 791
606static void virtscsi_scan(struct virtio_device *vdev) 792static void virtscsi_scan(struct virtio_device *vdev)
@@ -614,46 +800,56 @@ static void virtscsi_remove_vqs(struct virtio_device *vdev)
614{ 800{
615 struct Scsi_Host *sh = virtio_scsi_host(vdev); 801 struct Scsi_Host *sh = virtio_scsi_host(vdev);
616 struct virtio_scsi *vscsi = shost_priv(sh); 802 struct virtio_scsi *vscsi = shost_priv(sh);
617 u32 i, num_targets; 803
804 virtscsi_set_affinity(vscsi, false);
618 805
619 /* Stop all the virtqueues. */ 806 /* Stop all the virtqueues. */
620 vdev->config->reset(vdev); 807 vdev->config->reset(vdev);
621 808
622 num_targets = sh->max_id;
623 for (i = 0; i < num_targets; i++) {
624 kfree(vscsi->tgt[i]);
625 vscsi->tgt[i] = NULL;
626 }
627
628 vdev->config->del_vqs(vdev); 809 vdev->config->del_vqs(vdev);
629} 810}
630 811
631static int virtscsi_init(struct virtio_device *vdev, 812static int virtscsi_init(struct virtio_device *vdev,
632 struct virtio_scsi *vscsi, int num_targets) 813 struct virtio_scsi *vscsi)
633{ 814{
634 int err; 815 int err;
635 struct virtqueue *vqs[3]; 816 u32 i;
636 u32 i, sg_elems; 817 u32 num_vqs;
818 vq_callback_t **callbacks;
819 const char **names;
820 struct virtqueue **vqs;
821
822 num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE;
823 vqs = kmalloc(num_vqs * sizeof(struct virtqueue *), GFP_KERNEL);
824 callbacks = kmalloc(num_vqs * sizeof(vq_callback_t *), GFP_KERNEL);
825 names = kmalloc(num_vqs * sizeof(char *), GFP_KERNEL);
826
827 if (!callbacks || !vqs || !names) {
828 err = -ENOMEM;
829 goto out;
830 }
637 831
638 vq_callback_t *callbacks[] = { 832 callbacks[0] = virtscsi_ctrl_done;
639 virtscsi_ctrl_done, 833 callbacks[1] = virtscsi_event_done;
640 virtscsi_event_done, 834 names[0] = "control";
641 virtscsi_req_done 835 names[1] = "event";
642 }; 836 for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) {
643 const char *names[] = { 837 callbacks[i] = virtscsi_req_done;
644 "control", 838 names[i] = "request";
645 "event", 839 }
646 "request"
647 };
648 840
649 /* Discover virtqueues and write information to configuration. */ 841 /* Discover virtqueues and write information to configuration. */
650 err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names); 842 err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
651 if (err) 843 if (err)
652 return err; 844 goto out;
653 845
654 virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]); 846 virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]);
655 virtscsi_init_vq(&vscsi->event_vq, vqs[1]); 847 virtscsi_init_vq(&vscsi->event_vq, vqs[1]);
656 virtscsi_init_vq(&vscsi->req_vq, vqs[2]); 848 for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++)
849 virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE],
850 vqs[i]);
851
852 virtscsi_set_affinity(vscsi, true);
657 853
658 virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE); 854 virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
659 virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE); 855 virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
@@ -661,19 +857,12 @@ static int virtscsi_init(struct virtio_device *vdev,
661 if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) 857 if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
662 virtscsi_kick_event_all(vscsi); 858 virtscsi_kick_event_all(vscsi);
663 859
664 /* We need to know how many segments before we allocate. */
665 sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1;
666
667 for (i = 0; i < num_targets; i++) {
668 vscsi->tgt[i] = virtscsi_alloc_tgt(vdev, sg_elems);
669 if (!vscsi->tgt[i]) {
670 err = -ENOMEM;
671 goto out;
672 }
673 }
674 err = 0; 860 err = 0;
675 861
676out: 862out:
863 kfree(names);
864 kfree(callbacks);
865 kfree(vqs);
677 if (err) 866 if (err)
678 virtscsi_remove_vqs(vdev); 867 virtscsi_remove_vqs(vdev);
679 return err; 868 return err;
@@ -686,13 +875,21 @@ static int virtscsi_probe(struct virtio_device *vdev)
686 int err; 875 int err;
687 u32 sg_elems, num_targets; 876 u32 sg_elems, num_targets;
688 u32 cmd_per_lun; 877 u32 cmd_per_lun;
878 u32 num_queues;
879 struct scsi_host_template *hostt;
880
881 /* We need to know how many queues before we allocate. */
882 num_queues = virtscsi_config_get(vdev, num_queues) ? : 1;
689 883
690 /* Allocate memory and link the structs together. */
691 num_targets = virtscsi_config_get(vdev, max_target) + 1; 884 num_targets = virtscsi_config_get(vdev, max_target) + 1;
692 shost = scsi_host_alloc(&virtscsi_host_template,
693 sizeof(*vscsi)
694 + num_targets * sizeof(struct virtio_scsi_target_state));
695 885
886 if (num_queues == 1)
887 hostt = &virtscsi_host_template_single;
888 else
889 hostt = &virtscsi_host_template_multi;
890
891 shost = scsi_host_alloc(hostt,
892 sizeof(*vscsi) + sizeof(vscsi->req_vqs[0]) * num_queues);
696 if (!shost) 893 if (!shost)
697 return -ENOMEM; 894 return -ENOMEM;
698 895
@@ -700,12 +897,20 @@ static int virtscsi_probe(struct virtio_device *vdev)
700 shost->sg_tablesize = sg_elems; 897 shost->sg_tablesize = sg_elems;
701 vscsi = shost_priv(shost); 898 vscsi = shost_priv(shost);
702 vscsi->vdev = vdev; 899 vscsi->vdev = vdev;
900 vscsi->num_queues = num_queues;
703 vdev->priv = shost; 901 vdev->priv = shost;
704 902
705 err = virtscsi_init(vdev, vscsi, num_targets); 903 err = virtscsi_init(vdev, vscsi);
706 if (err) 904 if (err)
707 goto virtscsi_init_failed; 905 goto virtscsi_init_failed;
708 906
907 vscsi->nb.notifier_call = &virtscsi_cpu_callback;
908 err = register_hotcpu_notifier(&vscsi->nb);
909 if (err) {
910 pr_err("registering cpu notifier failed\n");
911 goto scsi_add_host_failed;
912 }
913
709 cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1; 914 cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
710 shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue); 915 shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
711 shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF; 916 shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
@@ -743,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev)
743 948
744 scsi_remove_host(shost); 949 scsi_remove_host(shost);
745 950
951 unregister_hotcpu_notifier(&vscsi->nb);
952
746 virtscsi_remove_vqs(vdev); 953 virtscsi_remove_vqs(vdev);
747 scsi_host_put(shost); 954 scsi_host_put(shost);
748} 955}
@@ -759,7 +966,7 @@ static int virtscsi_restore(struct virtio_device *vdev)
759 struct Scsi_Host *sh = virtio_scsi_host(vdev); 966 struct Scsi_Host *sh = virtio_scsi_host(vdev);
760 struct virtio_scsi *vscsi = shost_priv(sh); 967 struct virtio_scsi *vscsi = shost_priv(sh);
761 968
762 return virtscsi_init(vdev, vscsi, sh->max_id); 969 return virtscsi_init(vdev, vscsi);
763} 970}
764#endif 971#endif
765 972
@@ -794,8 +1001,7 @@ static int __init init(void)
794 1001
795 virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0); 1002 virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0);
796 if (!virtscsi_cmd_cache) { 1003 if (!virtscsi_cmd_cache) {
797 printk(KERN_ERR "kmem_cache_create() for " 1004 pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n");
798 "virtscsi_cmd_cache failed\n");
799 goto error; 1005 goto error;
800 } 1006 }
801 1007
@@ -804,8 +1010,7 @@ static int __init init(void)
804 mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ, 1010 mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ,
805 virtscsi_cmd_cache); 1011 virtscsi_cmd_cache);
806 if (!virtscsi_cmd_pool) { 1012 if (!virtscsi_cmd_pool) {
807 printk(KERN_ERR "mempool_create() for" 1013 pr_err("mempool_create() for virtscsi_cmd_pool failed\n");
808 "virtscsi_cmd_pool failed\n");
809 goto error; 1014 goto error;
810 } 1015 }
811 ret = register_virtio_driver(&virtio_scsi_driver); 1016 ret = register_virtio_driver(&virtio_scsi_driver);
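For reference, the calling convention that virtscsi_add_cmd() now relies on (one terminated scatterlist per direction, readable lists before writable ones) can be sketched as follows. This is a minimal illustration only; the my_req/my_resp types and the example_add_request() helper are hypothetical and not part of this patch.

#include <linux/scatterlist.h>
#include <linux/virtio.h>

struct my_req  { u32 type; };   /* hypothetical device-readable header */
struct my_resp { u8 status; };  /* hypothetical device-writable status */

/* Queue one request: readable header (+ optional payload), then writable status. */
static int example_add_request(struct virtqueue *vq, struct my_req *req,
			       struct my_resp *resp, struct scatterlist *data,
			       gfp_t gfp)
{
	struct scatterlist hdr, status, *sgs[3];
	unsigned int out_num = 0, in_num = 0;

	sg_init_one(&hdr, req, sizeof(*req));
	sgs[out_num++] = &hdr;
	if (data)		/* 'data' must be a terminated scatterlist */
		sgs[out_num++] = data;

	sg_init_one(&status, resp, sizeof(*resp));
	sgs[out_num + in_num++] = &status;

	/* Readable lists are passed first, writable lists after them. */
	return virtqueue_add_sgs(vq, sgs, out_num, in_num, req, gfp);
}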
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 26a64e5b8a58..8b9226da3f54 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -1,6 +1,7 @@
1config VHOST_NET 1config VHOST_NET
2 tristate "Host kernel accelerator for virtio net" 2 tristate "Host kernel accelerator for virtio net"
3 depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP) 3 depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
4 select VHOST_RING
4 ---help--- 5 ---help---
5 This kernel module can be loaded in host kernel to accelerate 6 This kernel module can be loaded in host kernel to accelerate
6 guest networking with virtio_net. Not to be confused with virtio_net 7 guest networking with virtio_net. Not to be confused with virtio_net
@@ -12,7 +13,14 @@ config VHOST_NET
12config VHOST_SCSI 13config VHOST_SCSI
13 tristate "VHOST_SCSI TCM fabric driver" 14 tristate "VHOST_SCSI TCM fabric driver"
14 depends on TARGET_CORE && EVENTFD && m 15 depends on TARGET_CORE && EVENTFD && m
16 select VHOST_RING
15 default n 17 default n
16 ---help--- 18 ---help---
17 Say M here to enable the vhost_scsi TCM fabric module 19 Say M here to enable the vhost_scsi TCM fabric module
18 for use with virtio-scsi guests 20 for use with virtio-scsi guests
21
22config VHOST_RING
23 tristate
24 ---help---
25 This option is selected by any driver which needs to access
26 the host side of a virtio ring.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index ef21d5fdfa7d..654e9afb11f5 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -3,3 +3,5 @@ vhost_net-y := vhost.o net.o
3 3
4obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o 4obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
5vhost_scsi-y := scsi.o 5vhost_scsi-y := scsi.o
6
7obj-$(CONFIG_VHOST_RING) += vringh.o
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index be65414d5bb1..1ee45bc85f67 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -282,7 +282,9 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl,
282 return vhost_test_reset_owner(n); 282 return vhost_test_reset_owner(n);
283 default: 283 default:
284 mutex_lock(&n->dev.mutex); 284 mutex_lock(&n->dev.mutex);
285 r = vhost_dev_ioctl(&n->dev, ioctl, arg); 285 r = vhost_dev_ioctl(&n->dev, ioctl, argp);
286 if (r == -ENOIOCTLCMD)
287 r = vhost_vring_ioctl(&n->dev, ioctl, argp);
286 vhost_test_flush(n); 288 vhost_test_flush(n);
287 mutex_unlock(&n->dev.mutex); 289 mutex_unlock(&n->dev.mutex);
288 return r; 290 return r;
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
new file mode 100644
index 000000000000..bff0775e258c
--- /dev/null
+++ b/drivers/vhost/vringh.c
@@ -0,0 +1,1007 @@
1/*
2 * Helpers for the host side of a virtio ring.
3 *
4 * Since these may be in userspace, we use (inline) accessors.
5 */
6#include <linux/vringh.h>
7#include <linux/virtio_ring.h>
8#include <linux/kernel.h>
9#include <linux/ratelimit.h>
10#include <linux/uaccess.h>
11#include <linux/slab.h>
12#include <linux/export.h>
13
14static __printf(1,2) __cold void vringh_bad(const char *fmt, ...)
15{
16 static DEFINE_RATELIMIT_STATE(vringh_rs,
17 DEFAULT_RATELIMIT_INTERVAL,
18 DEFAULT_RATELIMIT_BURST);
19 if (__ratelimit(&vringh_rs)) {
20 va_list ap;
21 va_start(ap, fmt);
22 printk(KERN_NOTICE "vringh:");
23 vprintk(fmt, ap);
24 va_end(ap);
25 }
26}
27
28/* Returns vring->num if empty, -ve on error. */
29static inline int __vringh_get_head(const struct vringh *vrh,
30 int (*getu16)(u16 *val, const u16 *p),
31 u16 *last_avail_idx)
32{
33 u16 avail_idx, i, head;
34 int err;
35
36 err = getu16(&avail_idx, &vrh->vring.avail->idx);
37 if (err) {
38 vringh_bad("Failed to access avail idx at %p",
39 &vrh->vring.avail->idx);
40 return err;
41 }
42
43 if (*last_avail_idx == avail_idx)
44 return vrh->vring.num;
45
46 /* Only get avail ring entries after they have been exposed by guest. */
47 virtio_rmb(vrh->weak_barriers);
48
49 i = *last_avail_idx & (vrh->vring.num - 1);
50
51 err = getu16(&head, &vrh->vring.avail->ring[i]);
52 if (err) {
53 vringh_bad("Failed to read head: idx %d address %p",
54 *last_avail_idx, &vrh->vring.avail->ring[i]);
55 return err;
56 }
57
58 if (head >= vrh->vring.num) {
59 vringh_bad("Guest says index %u > %u is available",
60 head, vrh->vring.num);
61 return -EINVAL;
62 }
63
64 (*last_avail_idx)++;
65 return head;
66}
67
68/* Copy some bytes to/from the iovec. Returns num copied. */
69static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
70 void *ptr, size_t len,
71 int (*xfer)(void *addr, void *ptr,
72 size_t len))
73{
74 int err, done = 0;
75
76 while (len && iov->i < iov->used) {
77 size_t partlen;
78
79 partlen = min(iov->iov[iov->i].iov_len, len);
80 err = xfer(iov->iov[iov->i].iov_base, ptr, partlen);
81 if (err)
82 return err;
83 done += partlen;
84 len -= partlen;
85 ptr += partlen;
86 iov->consumed += partlen;
87 iov->iov[iov->i].iov_len -= partlen;
88 iov->iov[iov->i].iov_base += partlen;
89
90 if (!iov->iov[iov->i].iov_len) {
91 /* Fix up old iov element then increment. */
92 iov->iov[iov->i].iov_len = iov->consumed;
93 iov->iov[iov->i].iov_base -= iov->consumed;
94
95 iov->consumed = 0;
96 iov->i++;
97 }
98 }
99 return done;
100}
101
102/* May reduce *len if range is shorter. */
103static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len,
104 struct vringh_range *range,
105 bool (*getrange)(struct vringh *,
106 u64, struct vringh_range *))
107{
108 if (addr < range->start || addr > range->end_incl) {
109 if (!getrange(vrh, addr, range))
110 return false;
111 }
112 BUG_ON(addr < range->start || addr > range->end_incl);
113
114 /* To end of memory? */
115 if (unlikely(addr + *len == 0)) {
116 if (range->end_incl == -1ULL)
117 return true;
118 goto truncate;
119 }
120
121 /* Otherwise, don't wrap. */
122 if (addr + *len < addr) {
123 vringh_bad("Wrapping descriptor %zu@0x%llx",
124 *len, (unsigned long long)addr);
125 return false;
126 }
127
128 if (unlikely(addr + *len - 1 > range->end_incl))
129 goto truncate;
130 return true;
131
132truncate:
133 *len = range->end_incl + 1 - addr;
134 return true;
135}
136
137static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len,
138 struct vringh_range *range,
139 bool (*getrange)(struct vringh *,
140 u64, struct vringh_range *))
141{
142 return true;
143}
144
145/* No reason for this code to be inline. */
146static int move_to_indirect(int *up_next, u16 *i, void *addr,
147 const struct vring_desc *desc,
148 struct vring_desc **descs, int *desc_max)
149{
150 /* Indirect tables can't have indirect. */
151 if (*up_next != -1) {
152 vringh_bad("Multilevel indirect %u->%u", *up_next, *i);
153 return -EINVAL;
154 }
155
156 if (unlikely(desc->len % sizeof(struct vring_desc))) {
157 vringh_bad("Strange indirect len %u", desc->len);
158 return -EINVAL;
159 }
160
161 /* We will check this when we follow it! */
162 if (desc->flags & VRING_DESC_F_NEXT)
163 *up_next = desc->next;
164 else
165 *up_next = -2;
166 *descs = addr;
167 *desc_max = desc->len / sizeof(struct vring_desc);
168
169 /* Now, start at the first indirect. */
170 *i = 0;
171 return 0;
172}
173
174static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp)
175{
176 struct kvec *new;
177 unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2;
178
179 if (new_num < 8)
180 new_num = 8;
181
182 flag = (iov->max_num & VRINGH_IOV_ALLOCATED);
183 if (flag)
184 new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp);
185 else {
186 new = kmalloc(new_num * sizeof(struct iovec), gfp);
187 if (new) {
188 memcpy(new, iov->iov,
189 iov->max_num * sizeof(struct iovec));
190 flag = VRINGH_IOV_ALLOCATED;
191 }
192 }
193 if (!new)
194 return -ENOMEM;
195 iov->iov = new;
196 iov->max_num = (new_num | flag);
197 return 0;
198}
199
200static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next,
201 struct vring_desc **descs, int *desc_max)
202{
203 u16 i = *up_next;
204
205 *up_next = -1;
206 *descs = vrh->vring.desc;
207 *desc_max = vrh->vring.num;
208 return i;
209}
210
211static int slow_copy(struct vringh *vrh, void *dst, const void *src,
212 bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
213 struct vringh_range *range,
214 bool (*getrange)(struct vringh *vrh,
215 u64,
216 struct vringh_range *)),
217 bool (*getrange)(struct vringh *vrh,
218 u64 addr,
219 struct vringh_range *r),
220 struct vringh_range *range,
221 int (*copy)(void *dst, const void *src, size_t len))
222{
223 size_t part, len = sizeof(struct vring_desc);
224
225 do {
226 u64 addr;
227 int err;
228
229 part = len;
230 addr = (u64)(unsigned long)src - range->offset;
231
232 if (!rcheck(vrh, addr, &part, range, getrange))
233 return -EINVAL;
234
235 err = copy(dst, src, part);
236 if (err)
237 return err;
238
239 dst += part;
240 src += part;
241 len -= part;
242 } while (len);
243 return 0;
244}
245
246static inline int
247__vringh_iov(struct vringh *vrh, u16 i,
248 struct vringh_kiov *riov,
249 struct vringh_kiov *wiov,
250 bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
251 struct vringh_range *range,
252 bool (*getrange)(struct vringh *, u64,
253 struct vringh_range *)),
254 bool (*getrange)(struct vringh *, u64, struct vringh_range *),
255 gfp_t gfp,
256 int (*copy)(void *dst, const void *src, size_t len))
257{
258 int err, count = 0, up_next, desc_max;
259 struct vring_desc desc, *descs;
260 struct vringh_range range = { -1ULL, 0 }, slowrange;
261 bool slow = false;
262
263 /* We start traversing vring's descriptor table. */
264 descs = vrh->vring.desc;
265 desc_max = vrh->vring.num;
266 up_next = -1;
267
268 if (riov)
269 riov->i = riov->used = 0;
270 else if (wiov)
271 wiov->i = wiov->used = 0;
272 else
273 /* You must want something! */
274 BUG();
275
276 for (;;) {
277 void *addr;
278 struct vringh_kiov *iov;
279 size_t len;
280
281 if (unlikely(slow))
282 err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange,
283 &slowrange, copy);
284 else
285 err = copy(&desc, &descs[i], sizeof(desc));
286 if (unlikely(err))
287 goto fail;
288
289 if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
290 /* Make sure it's OK, and get offset. */
291 len = desc.len;
292 if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
293 err = -EINVAL;
294 goto fail;
295 }
296
297 if (unlikely(len != desc.len)) {
298 slow = true;
299 /* We need to save this range to use offset */
300 slowrange = range;
301 }
302
303 addr = (void *)(long)(desc.addr + range.offset);
304 err = move_to_indirect(&up_next, &i, addr, &desc,
305 &descs, &desc_max);
306 if (err)
307 goto fail;
308 continue;
309 }
310
311 if (count++ == vrh->vring.num) {
312 vringh_bad("Descriptor loop in %p", descs);
313 err = -ELOOP;
314 goto fail;
315 }
316
317 if (desc.flags & VRING_DESC_F_WRITE)
318 iov = wiov;
319 else {
320 iov = riov;
321 if (unlikely(wiov && wiov->i)) {
322 vringh_bad("Readable desc %p after writable",
323 &descs[i]);
324 err = -EINVAL;
325 goto fail;
326 }
327 }
328
329 if (!iov) {
330 vringh_bad("Unexpected %s desc",
331 !wiov ? "writable" : "readable");
332 err = -EPROTO;
333 goto fail;
334 }
335
336 again:
337 /* Make sure it's OK, and get offset. */
338 len = desc.len;
339 if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
340 err = -EINVAL;
341 goto fail;
342 }
343 addr = (void *)(unsigned long)(desc.addr + range.offset);
344
345 if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) {
346 err = resize_iovec(iov, gfp);
347 if (err)
348 goto fail;
349 }
350
351 iov->iov[iov->used].iov_base = addr;
352 iov->iov[iov->used].iov_len = len;
353 iov->used++;
354
355 if (unlikely(len != desc.len)) {
356 desc.len -= len;
357 desc.addr += len;
358 goto again;
359 }
360
361 if (desc.flags & VRING_DESC_F_NEXT) {
362 i = desc.next;
363 } else {
364 /* Just in case we need to finish traversing above. */
365 if (unlikely(up_next > 0)) {
366 i = return_from_indirect(vrh, &up_next,
367 &descs, &desc_max);
368 slow = false;
369 } else
370 break;
371 }
372
373 if (i >= desc_max) {
374 vringh_bad("Chained index %u > %u", i, desc_max);
375 err = -EINVAL;
376 goto fail;
377 }
378 }
379
380 return 0;
381
382fail:
383 return err;
384}
385
386static inline int __vringh_complete(struct vringh *vrh,
387 const struct vring_used_elem *used,
388 unsigned int num_used,
389 int (*putu16)(u16 *p, u16 val),
390 int (*putused)(struct vring_used_elem *dst,
391 const struct vring_used_elem
392 *src, unsigned num))
393{
394 struct vring_used *used_ring;
395 int err;
396 u16 used_idx, off;
397
398 used_ring = vrh->vring.used;
399 used_idx = vrh->last_used_idx + vrh->completed;
400
401 off = used_idx % vrh->vring.num;
402
403 /* Compiler knows num_used == 1 sometimes, hence extra check */
404 if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) {
405 u16 part = vrh->vring.num - off;
406 err = putused(&used_ring->ring[off], used, part);
407 if (!err)
408 err = putused(&used_ring->ring[0], used + part,
409 num_used - part);
410 } else
411 err = putused(&used_ring->ring[off], used, num_used);
412
413 if (err) {
414 vringh_bad("Failed to write %u used entries %u at %p",
415 num_used, off, &used_ring->ring[off]);
416 return err;
417 }
418
419 /* Make sure buffer is written before we update index. */
420 virtio_wmb(vrh->weak_barriers);
421
422 err = putu16(&vrh->vring.used->idx, used_idx + num_used);
423 if (err) {
424 vringh_bad("Failed to update used index at %p",
425 &vrh->vring.used->idx);
426 return err;
427 }
428
429 vrh->completed += num_used;
430 return 0;
431}
432
433
434static inline int __vringh_need_notify(struct vringh *vrh,
435 int (*getu16)(u16 *val, const u16 *p))
436{
437 bool notify;
438 u16 used_event;
439 int err;
440
441 /* Flush out used index update. This is paired with the
442 * barrier that the Guest executes when enabling
443 * interrupts. */
444 virtio_mb(vrh->weak_barriers);
445
446 /* Old-style, without event indices. */
447 if (!vrh->event_indices) {
448 u16 flags;
449 err = getu16(&flags, &vrh->vring.avail->flags);
450 if (err) {
451 vringh_bad("Failed to get flags at %p",
452 &vrh->vring.avail->flags);
453 return err;
454 }
455 return (!(flags & VRING_AVAIL_F_NO_INTERRUPT));
456 }
457
458 /* Modern: we know when other side wants to know. */
459 err = getu16(&used_event, &vring_used_event(&vrh->vring));
460 if (err) {
461 vringh_bad("Failed to get used event idx at %p",
462 &vring_used_event(&vrh->vring));
463 return err;
464 }
465
466 /* Just in case we added so many that we wrap. */
467 if (unlikely(vrh->completed > 0xffff))
468 notify = true;
469 else
470 notify = vring_need_event(used_event,
471 vrh->last_used_idx + vrh->completed,
472 vrh->last_used_idx);
473
474 vrh->last_used_idx += vrh->completed;
475 vrh->completed = 0;
476 return notify;
477}
478
479static inline bool __vringh_notify_enable(struct vringh *vrh,
480 int (*getu16)(u16 *val, const u16 *p),
481 int (*putu16)(u16 *p, u16 val))
482{
483 u16 avail;
484
485 if (!vrh->event_indices) {
486 /* Old-school; update flags. */
487 if (putu16(&vrh->vring.used->flags, 0) != 0) {
488 vringh_bad("Clearing used flags %p",
489 &vrh->vring.used->flags);
490 return true;
491 }
492 } else {
493 if (putu16(&vring_avail_event(&vrh->vring),
494 vrh->last_avail_idx) != 0) {
495 vringh_bad("Updating avail event index %p",
496 &vring_avail_event(&vrh->vring));
497 return true;
498 }
499 }
500
501 /* They could have slipped one in as we were doing that: make
502 * sure it's written, then check again. */
503 virtio_mb(vrh->weak_barriers);
504
505 if (getu16(&avail, &vrh->vring.avail->idx) != 0) {
506 vringh_bad("Failed to check avail idx at %p",
507 &vrh->vring.avail->idx);
508 return true;
509 }
510
511 /* This is unlikely, so we just leave notifications enabled
512 * (if we're using event_indices, we'll only get one
513 * notification anyway). */
514 return avail == vrh->last_avail_idx;
515}
516
517static inline void __vringh_notify_disable(struct vringh *vrh,
518 int (*putu16)(u16 *p, u16 val))
519{
520 if (!vrh->event_indices) {
521 /* Old-school; update flags. */
522 if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY)) {
523 vringh_bad("Setting used flags %p",
524 &vrh->vring.used->flags);
525 }
526 }
527}
528
529/* Userspace access helpers: in this case, addresses are really userspace. */
530static inline int getu16_user(u16 *val, const u16 *p)
531{
532 return get_user(*val, (__force u16 __user *)p);
533}
534
535static inline int putu16_user(u16 *p, u16 val)
536{
537 return put_user(val, (__force u16 __user *)p);
538}
539
540static inline int copydesc_user(void *dst, const void *src, size_t len)
541{
542 return copy_from_user(dst, (__force void __user *)src, len) ?
543 -EFAULT : 0;
544}
545
546static inline int putused_user(struct vring_used_elem *dst,
547 const struct vring_used_elem *src,
548 unsigned int num)
549{
550 return copy_to_user((__force void __user *)dst, src,
551 sizeof(*dst) * num) ? -EFAULT : 0;
552}
553
554static inline int xfer_from_user(void *src, void *dst, size_t len)
555{
556 return copy_from_user(dst, (__force void __user *)src, len) ?
557 -EFAULT : 0;
558}
559
560static inline int xfer_to_user(void *dst, void *src, size_t len)
561{
562 return copy_to_user((__force void __user *)dst, src, len) ?
563 -EFAULT : 0;
564}
565
566/**
567 * vringh_init_user - initialize a vringh for a userspace vring.
568 * @vrh: the vringh to initialize.
569 * @features: the feature bits for this ring.
570 * @num: the number of elements.
571 * @weak_barriers: true if we only need memory barriers, not I/O.
 572 * @desc: the userspace descriptor pointer.
 573 * @avail: the userspace avail pointer.
 574 * @used: the userspace used pointer.
575 *
576 * Returns an error if num is invalid: you should check pointers
577 * yourself!
578 */
579int vringh_init_user(struct vringh *vrh, u32 features,
580 unsigned int num, bool weak_barriers,
581 struct vring_desc __user *desc,
582 struct vring_avail __user *avail,
583 struct vring_used __user *used)
584{
585 /* Sane power of 2 please! */
586 if (!num || num > 0xffff || (num & (num - 1))) {
587 vringh_bad("Bad ring size %u", num);
588 return -EINVAL;
589 }
590
591 vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
592 vrh->weak_barriers = weak_barriers;
593 vrh->completed = 0;
594 vrh->last_avail_idx = 0;
595 vrh->last_used_idx = 0;
596 vrh->vring.num = num;
597 /* vring expects kernel addresses, but only used via accessors. */
598 vrh->vring.desc = (__force struct vring_desc *)desc;
599 vrh->vring.avail = (__force struct vring_avail *)avail;
600 vrh->vring.used = (__force struct vring_used *)used;
601 return 0;
602}
603EXPORT_SYMBOL(vringh_init_user);
604
605/**
606 * vringh_getdesc_user - get next available descriptor from userspace ring.
607 * @vrh: the userspace vring.
608 * @riov: where to put the readable descriptors (or NULL)
609 * @wiov: where to put the writable descriptors (or NULL)
610 * @getrange: function to call to check ranges.
611 * @head: head index we received, for passing to vringh_complete_user().
612 *
613 * Returns 0 if there was no descriptor, 1 if there was, or -errno.
614 *
615 * Note that on error return, you can tell the difference between an
616 * invalid ring and a single invalid descriptor: in the former case,
617 * *head will be vrh->vring.num. You may be able to ignore an invalid
618 * descriptor, but there's not much you can do with an invalid ring.
619 *
620 * Note that you may need to clean up riov and wiov, even on error!
621 */
622int vringh_getdesc_user(struct vringh *vrh,
623 struct vringh_iov *riov,
624 struct vringh_iov *wiov,
625 bool (*getrange)(struct vringh *vrh,
626 u64 addr, struct vringh_range *r),
627 u16 *head)
628{
629 int err;
630
631 *head = vrh->vring.num;
632 err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx);
633 if (err < 0)
634 return err;
635
636 /* Empty... */
637 if (err == vrh->vring.num)
638 return 0;
639
 640 /* We need the layouts to be identical for this to work */
641 BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov));
642 BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) !=
643 offsetof(struct vringh_iov, iov));
644 BUILD_BUG_ON(offsetof(struct vringh_kiov, i) !=
645 offsetof(struct vringh_iov, i));
646 BUILD_BUG_ON(offsetof(struct vringh_kiov, used) !=
647 offsetof(struct vringh_iov, used));
648 BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) !=
649 offsetof(struct vringh_iov, max_num));
650 BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
651 BUILD_BUG_ON(offsetof(struct iovec, iov_base) !=
652 offsetof(struct kvec, iov_base));
653 BUILD_BUG_ON(offsetof(struct iovec, iov_len) !=
654 offsetof(struct kvec, iov_len));
655 BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base)
656 != sizeof(((struct kvec *)NULL)->iov_base));
657 BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len)
658 != sizeof(((struct kvec *)NULL)->iov_len));
659
660 *head = err;
661 err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov,
662 (struct vringh_kiov *)wiov,
663 range_check, getrange, GFP_KERNEL, copydesc_user);
664 if (err)
665 return err;
666
667 return 1;
668}
669EXPORT_SYMBOL(vringh_getdesc_user);
670
671/**
672 * vringh_iov_pull_user - copy bytes from vring_iov.
673 * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume)
674 * @dst: the place to copy.
675 * @len: the maximum length to copy.
676 *
677 * Returns the bytes copied <= len or a negative errno.
678 */
679ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len)
680{
681 return vringh_iov_xfer((struct vringh_kiov *)riov,
682 dst, len, xfer_from_user);
683}
684EXPORT_SYMBOL(vringh_iov_pull_user);
685
686/**
687 * vringh_iov_push_user - copy bytes into vring_iov.
688 * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume)
 689 * @src: the place to copy from.
690 * @len: the maximum length to copy.
691 *
692 * Returns the bytes copied <= len or a negative errno.
693 */
694ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
695 const void *src, size_t len)
696{
697 return vringh_iov_xfer((struct vringh_kiov *)wiov,
698 (void *)src, len, xfer_to_user);
699}
700EXPORT_SYMBOL(vringh_iov_push_user);
701
702/**
703 * vringh_abandon_user - we've decided not to handle the descriptor(s).
704 * @vrh: the vring.
 705 * @num: the number of descriptors to put back (i.e. the number of
 706 * vringh_getdesc_user() calls to undo).
 707 *
 708 * The next vringh_getdesc_user() will return the old descriptor(s) again.
709 */
710void vringh_abandon_user(struct vringh *vrh, unsigned int num)
711{
712 /* We only update vring_avail_event(vr) when we want to be notified,
713 * so we haven't changed that yet. */
714 vrh->last_avail_idx -= num;
715}
716EXPORT_SYMBOL(vringh_abandon_user);
717
718/**
719 * vringh_complete_user - we've finished with descriptor, publish it.
720 * @vrh: the vring.
721 * @head: the head as filled in by vringh_getdesc_user.
722 * @len: the length of data we have written.
723 *
724 * You should check vringh_need_notify_user() after one or more calls
725 * to this function.
726 */
727int vringh_complete_user(struct vringh *vrh, u16 head, u32 len)
728{
729 struct vring_used_elem used;
730
731 used.id = head;
732 used.len = len;
733 return __vringh_complete(vrh, &used, 1, putu16_user, putused_user);
734}
735EXPORT_SYMBOL(vringh_complete_user);
736
737/**
738 * vringh_complete_multi_user - we've finished with many descriptors.
739 * @vrh: the vring.
740 * @used: the head, length pairs.
741 * @num_used: the number of used elements.
742 *
743 * You should check vringh_need_notify_user() after one or more calls
744 * to this function.
745 */
746int vringh_complete_multi_user(struct vringh *vrh,
747 const struct vring_used_elem used[],
748 unsigned num_used)
749{
750 return __vringh_complete(vrh, used, num_used,
751 putu16_user, putused_user);
752}
753EXPORT_SYMBOL(vringh_complete_multi_user);
754
755/**
756 * vringh_notify_enable_user - we want to know if something changes.
757 * @vrh: the vring.
758 *
759 * This always enables notifications, but returns false if there are
760 * now more buffers available in the vring.
761 */
762bool vringh_notify_enable_user(struct vringh *vrh)
763{
764 return __vringh_notify_enable(vrh, getu16_user, putu16_user);
765}
766EXPORT_SYMBOL(vringh_notify_enable_user);
767
768/**
769 * vringh_notify_disable_user - don't tell us if something changes.
770 * @vrh: the vring.
771 *
772 * This is our normal running state: we disable and then only enable when
773 * we're going to sleep.
774 */
775void vringh_notify_disable_user(struct vringh *vrh)
776{
777 __vringh_notify_disable(vrh, putu16_user);
778}
779EXPORT_SYMBOL(vringh_notify_disable_user);
780
781/**
782 * vringh_need_notify_user - must we tell the other side about used buffers?
783 * @vrh: the vring we've called vringh_complete_user() on.
784 *
785 * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
786 */
787int vringh_need_notify_user(struct vringh *vrh)
788{
789 return __vringh_need_notify(vrh, getu16_user);
790}
791EXPORT_SYMBOL(vringh_need_notify_user);
792
793/* Kernelspace access helpers. */
794static inline int getu16_kern(u16 *val, const u16 *p)
795{
796 *val = ACCESS_ONCE(*p);
797 return 0;
798}
799
800static inline int putu16_kern(u16 *p, u16 val)
801{
802 ACCESS_ONCE(*p) = val;
803 return 0;
804}
805
806static inline int copydesc_kern(void *dst, const void *src, size_t len)
807{
808 memcpy(dst, src, len);
809 return 0;
810}
811
812static inline int putused_kern(struct vring_used_elem *dst,
813 const struct vring_used_elem *src,
814 unsigned int num)
815{
816 memcpy(dst, src, num * sizeof(*dst));
817 return 0;
818}
819
820static inline int xfer_kern(void *src, void *dst, size_t len)
821{
822 memcpy(dst, src, len);
823 return 0;
824}
825
826/**
827 * vringh_init_kern - initialize a vringh for a kernelspace vring.
828 * @vrh: the vringh to initialize.
829 * @features: the feature bits for this ring.
830 * @num: the number of elements.
831 * @weak_barriers: true if we only need memory barriers, not I/O.
 832 * @desc: the kernelspace descriptor pointer.
 833 * @avail: the kernelspace avail pointer.
 834 * @used: the kernelspace used pointer.
835 *
836 * Returns an error if num is invalid.
837 */
838int vringh_init_kern(struct vringh *vrh, u32 features,
839 unsigned int num, bool weak_barriers,
840 struct vring_desc *desc,
841 struct vring_avail *avail,
842 struct vring_used *used)
843{
844 /* Sane power of 2 please! */
845 if (!num || num > 0xffff || (num & (num - 1))) {
846 vringh_bad("Bad ring size %u", num);
847 return -EINVAL;
848 }
849
850 vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
851 vrh->weak_barriers = weak_barriers;
852 vrh->completed = 0;
853 vrh->last_avail_idx = 0;
854 vrh->last_used_idx = 0;
855 vrh->vring.num = num;
856 vrh->vring.desc = desc;
857 vrh->vring.avail = avail;
858 vrh->vring.used = used;
859 return 0;
860}
861EXPORT_SYMBOL(vringh_init_kern);
862
863/**
864 * vringh_getdesc_kern - get next available descriptor from kernelspace ring.
865 * @vrh: the kernelspace vring.
866 * @riov: where to put the readable descriptors (or NULL)
867 * @wiov: where to put the writable descriptors (or NULL)
868 * @head: head index we received, for passing to vringh_complete_kern().
869 * @gfp: flags for allocating larger riov/wiov.
870 *
871 * Returns 0 if there was no descriptor, 1 if there was, or -errno.
872 *
873 * Note that on error return, you can tell the difference between an
874 * invalid ring and a single invalid descriptor: in the former case,
875 * *head will be vrh->vring.num. You may be able to ignore an invalid
876 * descriptor, but there's not much you can do with an invalid ring.
877 *
878 * Note that you may need to clean up riov and wiov, even on error!
879 */
880int vringh_getdesc_kern(struct vringh *vrh,
881 struct vringh_kiov *riov,
882 struct vringh_kiov *wiov,
883 u16 *head,
884 gfp_t gfp)
885{
886 int err;
887
888 err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx);
889 if (err < 0)
890 return err;
891
892 /* Empty... */
893 if (err == vrh->vring.num)
894 return 0;
895
896 *head = err;
897 err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL,
898 gfp, copydesc_kern);
899 if (err)
900 return err;
901
902 return 1;
903}
904EXPORT_SYMBOL(vringh_getdesc_kern);
905
906/**
907 * vringh_iov_pull_kern - copy bytes from vring_iov.
908 * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume)
909 * @dst: the place to copy.
910 * @len: the maximum length to copy.
911 *
912 * Returns the bytes copied <= len or a negative errno.
913 */
914ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
915{
916 return vringh_iov_xfer(riov, dst, len, xfer_kern);
917}
918EXPORT_SYMBOL(vringh_iov_pull_kern);
919
920/**
921 * vringh_iov_push_kern - copy bytes into vring_iov.
922 * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
 923 * @src: the place to copy from.
924 * @len: the maximum length to copy.
925 *
926 * Returns the bytes copied <= len or a negative errno.
927 */
928ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
929 const void *src, size_t len)
930{
931 return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern);
932}
933EXPORT_SYMBOL(vringh_iov_push_kern);
934
935/**
936 * vringh_abandon_kern - we've decided not to handle the descriptor(s).
937 * @vrh: the vring.
 938 * @num: the number of descriptors to put back (i.e. the number of
 939 * vringh_getdesc_kern() calls to undo).
 940 *
 941 * The next vringh_getdesc_kern() will return the old descriptor(s) again.
942 */
943void vringh_abandon_kern(struct vringh *vrh, unsigned int num)
944{
945 /* We only update vring_avail_event(vr) when we want to be notified,
946 * so we haven't changed that yet. */
947 vrh->last_avail_idx -= num;
948}
949EXPORT_SYMBOL(vringh_abandon_kern);
950
951/**
952 * vringh_complete_kern - we've finished with descriptor, publish it.
953 * @vrh: the vring.
954 * @head: the head as filled in by vringh_getdesc_kern.
955 * @len: the length of data we have written.
956 *
957 * You should check vringh_need_notify_kern() after one or more calls
958 * to this function.
959 */
960int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len)
961{
962 struct vring_used_elem used;
963
964 used.id = head;
965 used.len = len;
966
967 return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern);
968}
969EXPORT_SYMBOL(vringh_complete_kern);
970
971/**
972 * vringh_notify_enable_kern - we want to know if something changes.
973 * @vrh: the vring.
974 *
975 * This always enables notifications, but returns false if there are
976 * now more buffers available in the vring.
977 */
978bool vringh_notify_enable_kern(struct vringh *vrh)
979{
980 return __vringh_notify_enable(vrh, getu16_kern, putu16_kern);
981}
982EXPORT_SYMBOL(vringh_notify_enable_kern);
983
984/**
985 * vringh_notify_disable_kern - don't tell us if something changes.
986 * @vrh: the vring.
987 *
988 * This is our normal running state: we disable and then only enable when
989 * we're going to sleep.
990 */
991void vringh_notify_disable_kern(struct vringh *vrh)
992{
993 __vringh_notify_disable(vrh, putu16_kern);
994}
995EXPORT_SYMBOL(vringh_notify_disable_kern);
996
997/**
998 * vringh_need_notify_kern - must we tell the other side about used buffers?
999 * @vrh: the vring we've called vringh_complete_kern() on.
1000 *
1001 * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
1002 */
1003int vringh_need_notify_kern(struct vringh *vrh)
1004{
1005 return __vringh_need_notify(vrh, getu16_kern);
1006}
1007EXPORT_SYMBOL(vringh_need_notify_kern);
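Taken together, the kernel-side half of this API is meant to be driven by a loop of roughly the following shape (the CAIF virtio driver added in this series is the in-tree user). This is a hedged sketch: example_service_ring(), the notify_guest() hook and the empty payload handling are placeholders, and the _user variants follow the same pattern with an extra getrange callback for validating guest addresses.

#include <linux/vringh.h>

/* Drain the available ring, completing each descriptor chain we consume. */
static void example_service_ring(struct vringh *vrh,
				 struct vringh_kiov *riov,
				 struct vringh_kiov *wiov,
				 void (*notify_guest)(void *), void *priv)
{
	u16 head;
	int err;

	for (;;) {
		err = vringh_getdesc_kern(vrh, riov, wiov, &head, GFP_ATOMIC);
		if (err <= 0)	/* 0: ring empty, <0: error */
			break;

		/* A real consumer would vringh_iov_pull_kern() the request
		 * out of riov and vringh_iov_push_kern() a reply into wiov. */

		if (vringh_complete_kern(vrh, head, 0 /* bytes written */))
			break;
	}

	/* Only interrupt the guest if it asked for it. */
	if (vringh_need_notify_kern(vrh) > 0)
		notify_guest(priv);
}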
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8dab163c5ef0..bd3ae324a1a2 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -108,7 +108,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
108 sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); 108 sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
109 109
110 /* We should always be able to add one buffer to an empty queue. */ 110 /* We should always be able to add one buffer to an empty queue. */
111 if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) 111 if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
112 BUG(); 112 BUG();
113 virtqueue_kick(vq); 113 virtqueue_kick(vq);
114 114
@@ -256,7 +256,7 @@ static void stats_handle_request(struct virtio_balloon *vb)
256 if (!virtqueue_get_buf(vq, &len)) 256 if (!virtqueue_get_buf(vq, &len))
257 return; 257 return;
258 sg_init_one(&sg, vb->stats, sizeof(vb->stats)); 258 sg_init_one(&sg, vb->stats, sizeof(vb->stats));
259 if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) 259 if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
260 BUG(); 260 BUG();
261 virtqueue_kick(vq); 261 virtqueue_kick(vq);
262} 262}
@@ -341,7 +341,7 @@ static int init_vqs(struct virtio_balloon *vb)
341 * use it to signal us later. 341 * use it to signal us later.
342 */ 342 */
343 sg_init_one(&sg, vb->stats, sizeof vb->stats); 343 sg_init_one(&sg, vb->stats, sizeof vb->stats);
344 if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL) 344 if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
345 < 0) 345 < 0)
346 BUG(); 346 BUG();
347 virtqueue_kick(vb->stats_vq); 347 virtqueue_kick(vb->stats_vq);
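The balloon conversion above is the simplest case of the new helpers: a single device-readable buffer posted with virtqueue_add_outbuf(). A minimal sketch of that pattern, with a hypothetical example_send() wrapper, looks like this:

#include <linux/scatterlist.h>
#include <linux/virtio.h>

/* Post one device-readable buffer and kick the device. */
static int example_send(struct virtqueue *vq, void *buf, size_t len, void *token)
{
	struct scatterlist sg;

	sg_init_one(&sg, buf, len);
	if (virtqueue_add_outbuf(vq, &sg, 1, token, GFP_KERNEL) < 0)
		return -ENOSPC;
	virtqueue_kick(vq);
	return 0;
}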
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index ffd7e7da5d3b..5217baf5528c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -24,27 +24,6 @@
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/hrtimer.h> 25#include <linux/hrtimer.h>
26 26
27/* virtio guest is communicating with a virtual "device" that actually runs on
28 * a host processor. Memory barriers are used to control SMP effects. */
29#ifdef CONFIG_SMP
30/* Where possible, use SMP barriers which are more lightweight than mandatory
31 * barriers, because mandatory barriers control MMIO effects on accesses
32 * through relaxed memory I/O windows (which virtio-pci does not use). */
33#define virtio_mb(vq) \
34 do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0)
35#define virtio_rmb(vq) \
36 do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0)
37#define virtio_wmb(vq) \
38 do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0)
39#else
40/* We must force memory ordering even if guest is UP since host could be
41 * running on another CPU, but SMP barriers are defined to barrier() in that
42 * configuration. So fall back to mandatory barriers instead. */
43#define virtio_mb(vq) mb()
44#define virtio_rmb(vq) rmb()
45#define virtio_wmb(vq) wmb()
46#endif
47
48#ifdef DEBUG 27#ifdef DEBUG
49/* For development, we want to crash whenever the ring is screwed. */ 28/* For development, we want to crash whenever the ring is screwed. */
50#define BAD_RING(_vq, fmt, args...) \ 29#define BAD_RING(_vq, fmt, args...) \
@@ -119,16 +98,36 @@ struct vring_virtqueue
119 98
120#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) 99#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
121 100
101static inline struct scatterlist *sg_next_chained(struct scatterlist *sg,
102 unsigned int *count)
103{
104 return sg_next(sg);
105}
106
107static inline struct scatterlist *sg_next_arr(struct scatterlist *sg,
108 unsigned int *count)
109{
110 if (--(*count) == 0)
111 return NULL;
112 return sg + 1;
113}
114
122/* Set up an indirect table of descriptors and add it to the queue. */ 115/* Set up an indirect table of descriptors and add it to the queue. */
123static int vring_add_indirect(struct vring_virtqueue *vq, 116static inline int vring_add_indirect(struct vring_virtqueue *vq,
124 struct scatterlist sg[], 117 struct scatterlist *sgs[],
125 unsigned int out, 118 struct scatterlist *(*next)
126 unsigned int in, 119 (struct scatterlist *, unsigned int *),
127 gfp_t gfp) 120 unsigned int total_sg,
121 unsigned int total_out,
122 unsigned int total_in,
123 unsigned int out_sgs,
124 unsigned int in_sgs,
125 gfp_t gfp)
128{ 126{
129 struct vring_desc *desc; 127 struct vring_desc *desc;
130 unsigned head; 128 unsigned head;
131 int i; 129 struct scatterlist *sg;
130 int i, n;
132 131
133 /* 132 /*
134 * We require lowmem mappings for the descriptors because 133 * We require lowmem mappings for the descriptors because
@@ -137,25 +136,31 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
137 */ 136 */
138 gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH); 137 gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH);
139 138
140 desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp); 139 desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
141 if (!desc) 140 if (!desc)
142 return -ENOMEM; 141 return -ENOMEM;
143 142
144 /* Transfer entries from the sg list into the indirect page */ 143 /* Transfer entries from the sg lists into the indirect page */
145 for (i = 0; i < out; i++) { 144 i = 0;
146 desc[i].flags = VRING_DESC_F_NEXT; 145 for (n = 0; n < out_sgs; n++) {
147 desc[i].addr = sg_phys(sg); 146 for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
148 desc[i].len = sg->length; 147 desc[i].flags = VRING_DESC_F_NEXT;
149 desc[i].next = i+1; 148 desc[i].addr = sg_phys(sg);
150 sg++; 149 desc[i].len = sg->length;
150 desc[i].next = i+1;
151 i++;
152 }
151 } 153 }
152 for (; i < (out + in); i++) { 154 for (; n < (out_sgs + in_sgs); n++) {
153 desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; 155 for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
154 desc[i].addr = sg_phys(sg); 156 desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
155 desc[i].len = sg->length; 157 desc[i].addr = sg_phys(sg);
156 desc[i].next = i+1; 158 desc[i].len = sg->length;
157 sg++; 159 desc[i].next = i+1;
160 i++;
161 }
158 } 162 }
163 BUG_ON(i != total_sg);
159 164
160 /* Last one doesn't continue. */ 165 /* Last one doesn't continue. */
161 desc[i-1].flags &= ~VRING_DESC_F_NEXT; 166 desc[i-1].flags &= ~VRING_DESC_F_NEXT;
@@ -176,29 +181,20 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
176 return head; 181 return head;
177} 182}
178 183
179/** 184static inline int virtqueue_add(struct virtqueue *_vq,
180 * virtqueue_add_buf - expose buffer to other end 185 struct scatterlist *sgs[],
181 * @vq: the struct virtqueue we're talking about. 186 struct scatterlist *(*next)
182 * @sg: the description of the buffer(s). 187 (struct scatterlist *, unsigned int *),
183 * @out_num: the number of sg readable by other side 188 unsigned int total_out,
184 * @in_num: the number of sg which are writable (after readable ones) 189 unsigned int total_in,
185 * @data: the token identifying the buffer. 190 unsigned int out_sgs,
186 * @gfp: how to do memory allocations (if necessary). 191 unsigned int in_sgs,
187 * 192 void *data,
188 * Caller must ensure we don't call this with other virtqueue operations 193 gfp_t gfp)
189 * at the same time (except where noted).
190 *
191 * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
192 */
193int virtqueue_add_buf(struct virtqueue *_vq,
194 struct scatterlist sg[],
195 unsigned int out,
196 unsigned int in,
197 void *data,
198 gfp_t gfp)
199{ 194{
200 struct vring_virtqueue *vq = to_vvq(_vq); 195 struct vring_virtqueue *vq = to_vvq(_vq);
201 unsigned int i, avail, uninitialized_var(prev); 196 struct scatterlist *sg;
197 unsigned int i, n, avail, uninitialized_var(prev), total_sg;
202 int head; 198 int head;
203 199
204 START_USE(vq); 200 START_USE(vq);
@@ -218,46 +214,54 @@ int virtqueue_add_buf(struct virtqueue *_vq,
218 } 214 }
219#endif 215#endif
220 216
217 total_sg = total_in + total_out;
218
221 /* If the host supports indirect descriptor tables, and we have multiple 219 /* If the host supports indirect descriptor tables, and we have multiple
222 * buffers, then go indirect. FIXME: tune this threshold */ 220 * buffers, then go indirect. FIXME: tune this threshold */
223 if (vq->indirect && (out + in) > 1 && vq->vq.num_free) { 221 if (vq->indirect && total_sg > 1 && vq->vq.num_free) {
224 head = vring_add_indirect(vq, sg, out, in, gfp); 222 head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
223 total_in,
224 out_sgs, in_sgs, gfp);
225 if (likely(head >= 0)) 225 if (likely(head >= 0))
226 goto add_head; 226 goto add_head;
227 } 227 }
228 228
229 BUG_ON(out + in > vq->vring.num); 229 BUG_ON(total_sg > vq->vring.num);
230 BUG_ON(out + in == 0); 230 BUG_ON(total_sg == 0);
231 231
232 if (vq->vq.num_free < out + in) { 232 if (vq->vq.num_free < total_sg) {
233 pr_debug("Can't add buf len %i - avail = %i\n", 233 pr_debug("Can't add buf len %i - avail = %i\n",
234 out + in, vq->vq.num_free); 234 total_sg, vq->vq.num_free);
235 /* FIXME: for historical reasons, we force a notify here if 235 /* FIXME: for historical reasons, we force a notify here if
236 * there are outgoing parts to the buffer. Presumably the 236 * there are outgoing parts to the buffer. Presumably the
237 * host should service the ring ASAP. */ 237 * host should service the ring ASAP. */
238 if (out) 238 if (out_sgs)
239 vq->notify(&vq->vq); 239 vq->notify(&vq->vq);
240 END_USE(vq); 240 END_USE(vq);
241 return -ENOSPC; 241 return -ENOSPC;
242 } 242 }
243 243
244 /* We're about to use some buffers from the free list. */ 244 /* We're about to use some buffers from the free list. */
245 vq->vq.num_free -= out + in; 245 vq->vq.num_free -= total_sg;
246 246
247 head = vq->free_head; 247 head = i = vq->free_head;
248 for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { 248 for (n = 0; n < out_sgs; n++) {
249 vq->vring.desc[i].flags = VRING_DESC_F_NEXT; 249 for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
250 vq->vring.desc[i].addr = sg_phys(sg); 250 vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
251 vq->vring.desc[i].len = sg->length; 251 vq->vring.desc[i].addr = sg_phys(sg);
252 prev = i; 252 vq->vring.desc[i].len = sg->length;
253 sg++; 253 prev = i;
254 i = vq->vring.desc[i].next;
255 }
254 } 256 }
255 for (; in; i = vq->vring.desc[i].next, in--) { 257 for (; n < (out_sgs + in_sgs); n++) {
256 vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; 258 for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
257 vq->vring.desc[i].addr = sg_phys(sg); 259 vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
258 vq->vring.desc[i].len = sg->length; 260 vq->vring.desc[i].addr = sg_phys(sg);
259 prev = i; 261 vq->vring.desc[i].len = sg->length;
260 sg++; 262 prev = i;
263 i = vq->vring.desc[i].next;
264 }
261 } 265 }
262 /* Last one doesn't continue. */ 266 /* Last one doesn't continue. */
263 vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT; 267 vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
@@ -276,7 +280,7 @@ add_head:
276 280
277 /* Descriptors and available array need to be set before we expose the 281 /* Descriptors and available array need to be set before we expose the
278 * new available array entries. */ 282 * new available array entries. */
279 virtio_wmb(vq); 283 virtio_wmb(vq->weak_barriers);
280 vq->vring.avail->idx++; 284 vq->vring.avail->idx++;
281 vq->num_added++; 285 vq->num_added++;
282 286
@@ -290,9 +294,122 @@ add_head:
290 294
291 return 0; 295 return 0;
292} 296}
297
298/**
299 * virtqueue_add_buf - expose buffer to other end
300 * @vq: the struct virtqueue we're talking about.
301 * @sg: the description of the buffer(s).
 302 * @out: the number of sg entries readable by the other side
 303 * @in: the number of sg entries which are writable (after the readable ones)
304 * @data: the token identifying the buffer.
305 * @gfp: how to do memory allocations (if necessary).
306 *
307 * Caller must ensure we don't call this with other virtqueue operations
308 * at the same time (except where noted).
309 *
310 * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
311 */
312int virtqueue_add_buf(struct virtqueue *_vq,
313 struct scatterlist sg[],
314 unsigned int out,
315 unsigned int in,
316 void *data,
317 gfp_t gfp)
318{
319 struct scatterlist *sgs[2];
320
321 sgs[0] = sg;
322 sgs[1] = sg + out;
323
324 return virtqueue_add(_vq, sgs, sg_next_arr,
325 out, in, out ? 1 : 0, in ? 1 : 0, data, gfp);
326}
293EXPORT_SYMBOL_GPL(virtqueue_add_buf); 327EXPORT_SYMBOL_GPL(virtqueue_add_buf);
294 328
295/** 329/**
330 * virtqueue_add_sgs - expose buffers to other end
331 * @vq: the struct virtqueue we're talking about.
332 * @sgs: array of terminated scatterlists.
 333 * @out_sgs: the number of scatterlists readable by the other side
 334 * @in_sgs: the number of scatterlists which are writable (after the readable ones)
335 * @data: the token identifying the buffer.
336 * @gfp: how to do memory allocations (if necessary).
337 *
338 * Caller must ensure we don't call this with other virtqueue operations
339 * at the same time (except where noted).
340 *
341 * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
342 */
343int virtqueue_add_sgs(struct virtqueue *_vq,
344 struct scatterlist *sgs[],
345 unsigned int out_sgs,
346 unsigned int in_sgs,
347 void *data,
348 gfp_t gfp)
349{
350 unsigned int i, total_out, total_in;
351
352 /* Count them first. */
353 for (i = total_out = total_in = 0; i < out_sgs; i++) {
354 struct scatterlist *sg;
355 for (sg = sgs[i]; sg; sg = sg_next(sg))
356 total_out++;
357 }
358 for (; i < out_sgs + in_sgs; i++) {
359 struct scatterlist *sg;
360 for (sg = sgs[i]; sg; sg = sg_next(sg))
361 total_in++;
362 }
363 return virtqueue_add(_vq, sgs, sg_next_chained,
364 total_out, total_in, out_sgs, in_sgs, data, gfp);
365}
366EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
367
368/**
369 * virtqueue_add_outbuf - expose output buffers to other end
370 * @vq: the struct virtqueue we're talking about.
 371 * @sg: scatterlist array (need not be terminated!)
 372 * @num: the number of entries in @sg readable by the other side
373 * @data: the token identifying the buffer.
374 * @gfp: how to do memory allocations (if necessary).
375 *
376 * Caller must ensure we don't call this with other virtqueue operations
377 * at the same time (except where noted).
378 *
379 * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
380 */
381int virtqueue_add_outbuf(struct virtqueue *vq,
382 struct scatterlist sg[], unsigned int num,
383 void *data,
384 gfp_t gfp)
385{
386 return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp);
387}
388EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
389
390/**
391 * virtqueue_add_inbuf - expose input buffers to other end
392 * @vq: the struct virtqueue we're talking about.
 393 * @sg: scatterlist array (need not be terminated!)
 394 * @num: the number of entries in @sg writable by the other side
395 * @data: the token identifying the buffer.
396 * @gfp: how to do memory allocations (if necessary).
397 *
398 * Caller must ensure we don't call this with other virtqueue operations
399 * at the same time (except where noted).
400 *
401 * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
402 */
403int virtqueue_add_inbuf(struct virtqueue *vq,
404 struct scatterlist sg[], unsigned int num,
405 void *data,
406 gfp_t gfp)
407{
408 return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp);
409}
410EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
411
412/**
296 * virtqueue_kick_prepare - first half of split virtqueue_kick call. 413 * virtqueue_kick_prepare - first half of split virtqueue_kick call.
297 * @vq: the struct virtqueue 414 * @vq: the struct virtqueue
298 * 415 *
@@ -312,7 +429,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
312 START_USE(vq); 429 START_USE(vq);
313 /* We need to expose available array entries before checking avail 430 /* We need to expose available array entries before checking avail
314 * event. */ 431 * event. */
315 virtio_mb(vq); 432 virtio_mb(vq->weak_barriers);
316 433
317 old = vq->vring.avail->idx - vq->num_added; 434 old = vq->vring.avail->idx - vq->num_added;
318 new = vq->vring.avail->idx; 435 new = vq->vring.avail->idx;
@@ -436,7 +553,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
436 } 553 }
437 554
438 /* Only get used array entries after they have been exposed by host. */ 555 /* Only get used array entries after they have been exposed by host. */
439 virtio_rmb(vq); 556 virtio_rmb(vq->weak_barriers);
440 557
441 last_used = (vq->last_used_idx & (vq->vring.num - 1)); 558 last_used = (vq->last_used_idx & (vq->vring.num - 1));
442 i = vq->vring.used->ring[last_used].id; 559 i = vq->vring.used->ring[last_used].id;
@@ -460,7 +577,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
460 * the read in the next get_buf call. */ 577 * the read in the next get_buf call. */
461 if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { 578 if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
462 vring_used_event(&vq->vring) = vq->last_used_idx; 579 vring_used_event(&vq->vring) = vq->last_used_idx;
463 virtio_mb(vq); 580 virtio_mb(vq->weak_barriers);
464 } 581 }
465 582
466#ifdef DEBUG 583#ifdef DEBUG
@@ -513,7 +630,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq)
513 * entry. Always do both to keep code simple. */ 630 * entry. Always do both to keep code simple. */
514 vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; 631 vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
515 vring_used_event(&vq->vring) = vq->last_used_idx; 632 vring_used_event(&vq->vring) = vq->last_used_idx;
516 virtio_mb(vq); 633 virtio_mb(vq->weak_barriers);
517 if (unlikely(more_used(vq))) { 634 if (unlikely(more_used(vq))) {
518 END_USE(vq); 635 END_USE(vq);
519 return false; 636 return false;
@@ -553,7 +670,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
553 /* TODO: tune this threshold */ 670 /* TODO: tune this threshold */
554 bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4; 671 bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4;
555 vring_used_event(&vq->vring) = vq->last_used_idx + bufs; 672 vring_used_event(&vq->vring) = vq->last_used_idx + bufs;
556 virtio_mb(vq); 673 virtio_mb(vq->weak_barriers);
557 if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) { 674 if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) {
558 END_USE(vq); 675 END_USE(vq);
559 return false; 676 return false;
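
The hunk above introduces virtqueue_add_sgs(), which takes an array of independently terminated scatterlists, readable ones first (out_sgs of them) followed by writable ones (in_sgs). Below is a minimal driver-side sketch of a caller, using a made-up request layout (a readable header plus a writable status byte); the function and variable names are illustrative only, not taken from any in-tree driver.

#include <linux/virtio.h>
#include <linux/scatterlist.h>

/* Sketch only: 'hdr' is read by the device, 'status' is written by it. */
static int queue_request(struct virtqueue *vq, void *hdr, size_t hdr_len,
			 u8 *status, void *token)
{
	struct scatterlist hdr_sg, status_sg;
	struct scatterlist *sgs[2];

	sg_init_one(&hdr_sg, hdr, hdr_len);
	sg_init_one(&status_sg, status, sizeof(*status));

	sgs[0] = &hdr_sg;	/* readable by the device */
	sgs[1] = &status_sg;	/* writable by the device */

	/* One readable scatterlist, one writable scatterlist. */
	return virtqueue_add_sgs(vq, sgs, 1, 1, token, GFP_ATOMIC);
}

On -ENOSPC a caller typically backs off until virtqueue_get_buf() has freed descriptors; the 9p transport change later in this series simply retries the request.
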
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 2d8bdaef9611..bfc47e0de81c 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -172,6 +172,22 @@ static inline void sg_mark_end(struct scatterlist *sg)
172} 172}
173 173
174/** 174/**
175 * sg_unmark_end - Undo setting the end of the scatterlist
 176 * @sg: SG entry
177 *
178 * Description:
179 * Removes the termination marker from the given entry of the scatterlist.
180 *
181 **/
182static inline void sg_unmark_end(struct scatterlist *sg)
183{
184#ifdef CONFIG_DEBUG_SG
185 BUG_ON(sg->sg_magic != SG_MAGIC);
186#endif
187 sg->page_link &= ~0x02;
188}
189
190/**
175 * sg_phys - Return physical address of an sg entry 191 * sg_phys - Return physical address of an sg entry
176 * @sg: SG entry 192 * @sg: SG entry
177 * 193 *
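
The new sg_unmark_end() exists so that a long-lived scatterlist array can be repacked for each request: a termination bit left mid-array by a previous, shorter request would otherwise end the new chain early. The 9p transport hunk below uses exactly this pattern in pack_sg_list(); here is a compact sketch of it with hypothetical names.

#include <linux/scatterlist.h>

/* Sketch: repack 'count' buffers into a reusable sg array. */
static int repack_sgs(struct scatterlist *sg, void *bufs[], size_t lens[],
		      unsigned int count)
{
	unsigned int i;

	for (i = 0; i < count; i++) {
		sg_unmark_end(&sg[i]);		/* clear any stale end marker */
		sg_set_buf(&sg[i], bufs[i], lens[i]);
	}
	if (count)
		sg_mark_end(&sg[count - 1]);	/* terminate the new chain */
	return count;
}
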
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 2d7a5e045908..9ff8645b7e0b 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -8,6 +8,7 @@
8#include <linux/device.h> 8#include <linux/device.h>
9#include <linux/mod_devicetable.h> 9#include <linux/mod_devicetable.h>
10#include <linux/gfp.h> 10#include <linux/gfp.h>
11#include <linux/vringh.h>
11 12
12/** 13/**
13 * virtqueue - a queue to register buffers for sending or receiving. 14 * virtqueue - a queue to register buffers for sending or receiving.
@@ -40,6 +41,23 @@ int virtqueue_add_buf(struct virtqueue *vq,
40 void *data, 41 void *data,
41 gfp_t gfp); 42 gfp_t gfp);
42 43
44int virtqueue_add_outbuf(struct virtqueue *vq,
45 struct scatterlist sg[], unsigned int num,
46 void *data,
47 gfp_t gfp);
48
49int virtqueue_add_inbuf(struct virtqueue *vq,
50 struct scatterlist sg[], unsigned int num,
51 void *data,
52 gfp_t gfp);
53
54int virtqueue_add_sgs(struct virtqueue *vq,
55 struct scatterlist *sgs[],
56 unsigned int out_sgs,
57 unsigned int in_sgs,
58 void *data,
59 gfp_t gfp);
60
43void virtqueue_kick(struct virtqueue *vq); 61void virtqueue_kick(struct virtqueue *vq);
44 62
45bool virtqueue_kick_prepare(struct virtqueue *vq); 63bool virtqueue_kick_prepare(struct virtqueue *vq);
@@ -64,6 +82,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *vq);
64 * @dev: underlying device. 82 * @dev: underlying device.
65 * @id: the device type identification (used to match it with a driver). 83 * @id: the device type identification (used to match it with a driver).
66 * @config: the configuration ops for this device. 84 * @config: the configuration ops for this device.
85 * @vringh_config: configuration ops for host vrings.
67 * @vqs: the list of virtqueues for this device. 86 * @vqs: the list of virtqueues for this device.
68 * @features: the features supported by both driver and device. 87 * @features: the features supported by both driver and device.
69 * @priv: private pointer for the driver's use. 88 * @priv: private pointer for the driver's use.
@@ -73,6 +92,7 @@ struct virtio_device {
73 struct device dev; 92 struct device dev;
74 struct virtio_device_id id; 93 struct virtio_device_id id;
75 const struct virtio_config_ops *config; 94 const struct virtio_config_ops *config;
95 const struct vringh_config_ops *vringh_config;
76 struct list_head vqs; 96 struct list_head vqs;
77 /* Note that this is a Linux set_bit-style bitmap. */ 97 /* Note that this is a Linux set_bit-style bitmap. */
78 unsigned long features[1]; 98 unsigned long features[1];
diff --git a/include/linux/virtio_caif.h b/include/linux/virtio_caif.h
new file mode 100644
index 000000000000..5d2d3124ca3d
--- /dev/null
+++ b/include/linux/virtio_caif.h
@@ -0,0 +1,24 @@
1/*
2 * Copyright (C) ST-Ericsson AB 2012
3 * Author: Sjur Brændeland <sjur.brandeland@stericsson.com>
4 *
5 * This header is BSD licensed so
6 * anyone can use the definitions to implement compatible remote processors
7 */
8
9#ifndef VIRTIO_CAIF_H
10#define VIRTIO_CAIF_H
11
12#include <linux/types.h>
13struct virtio_caif_transf_config {
14 u16 headroom;
15 u16 tailroom;
16 u32 mtu;
17 u8 reserved[4];
18};
19
20struct virtio_caif_config {
21 struct virtio_caif_transf_config uplink, downlink;
22 u8 reserved[8];
23};
24#endif
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index 63c6ea199519..ca3ad41c2c82 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -4,6 +4,63 @@
4#include <linux/irqreturn.h> 4#include <linux/irqreturn.h>
5#include <uapi/linux/virtio_ring.h> 5#include <uapi/linux/virtio_ring.h>
6 6
7/*
8 * Barriers in virtio are tricky. Non-SMP virtio guests can't assume
9 * they're not on an SMP host system, so they need to assume real
10 * barriers. Non-SMP virtio hosts could skip the barriers, but does
11 * anyone care?
12 *
13 * For virtio_pci on SMP, we don't need to order with respect to MMIO
14 * accesses through relaxed memory I/O windows, so smp_mb() et al are
15 * sufficient.
16 *
17 * For using virtio to talk to real devices (eg. other heterogeneous
18 * CPUs) we do need real barriers. In theory, we could be using both
19 * kinds of virtio, so it's a runtime decision, and the branch is
20 * actually quite cheap.
21 */
22
23#ifdef CONFIG_SMP
24static inline void virtio_mb(bool weak_barriers)
25{
26 if (weak_barriers)
27 smp_mb();
28 else
29 mb();
30}
31
32static inline void virtio_rmb(bool weak_barriers)
33{
34 if (weak_barriers)
35 smp_rmb();
36 else
37 rmb();
38}
39
40static inline void virtio_wmb(bool weak_barriers)
41{
42 if (weak_barriers)
43 smp_wmb();
44 else
45 wmb();
46}
47#else
48static inline void virtio_mb(bool weak_barriers)
49{
50 mb();
51}
52
53static inline void virtio_rmb(bool weak_barriers)
54{
55 rmb();
56}
57
58static inline void virtio_wmb(bool weak_barriers)
59{
60 wmb();
61}
62#endif
63
7struct virtio_device; 64struct virtio_device;
8struct virtqueue; 65struct virtqueue;
9 66
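
These helpers turn barrier strength into a per-ring runtime decision (the weak_barriers flag) instead of a compile-time one. The ordering they protect is the one visible in the virtio_ring.c hunk earlier: write the descriptors and the avail ring entry, then virtio_wmb(), then publish the new avail index. A condensed sketch of that producer-side ordering follows, assuming a vring_virtqueue with a weak_barriers field as in this series (the helper name is made up).

/* Sketch: make a descriptor chain visible before publishing its index. */
static void publish_avail(struct vring_virtqueue *vq, u16 head)
{
	u16 slot = vq->vring.avail->idx & (vq->vring.num - 1);

	vq->vring.avail->ring[slot] = head;

	/* Descriptors and the ring entry must be visible before idx moves. */
	virtio_wmb(vq->weak_barriers);
	vq->vring.avail->idx++;
}
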
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
new file mode 100644
index 000000000000..749cde28728b
--- /dev/null
+++ b/include/linux/vringh.h
@@ -0,0 +1,225 @@
1/*
2 * Linux host-side vring helpers; for when the kernel needs to access
3 * someone else's vring.
4 *
5 * Copyright IBM Corporation, 2013.
6 * Parts taken from drivers/vhost/vhost.c Copyright 2009 Red Hat, Inc.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 *
22 * Written by: Rusty Russell <rusty@rustcorp.com.au>
23 */
24#ifndef _LINUX_VRINGH_H
25#define _LINUX_VRINGH_H
26#include <uapi/linux/virtio_ring.h>
27#include <linux/uio.h>
28#include <linux/slab.h>
29#include <asm/barrier.h>
30
31/* virtio_ring with information needed for host access. */
32struct vringh {
33 /* Guest publishes used event idx (note: we always do). */
34 bool event_indices;
35
36 /* Can we get away with weak barriers? */
37 bool weak_barriers;
38
39 /* Last available index we saw (ie. where we're up to). */
40 u16 last_avail_idx;
41
42 /* Last index we used. */
43 u16 last_used_idx;
44
45 /* How many descriptors we've completed since last need_notify(). */
46 u32 completed;
47
48 /* The vring (note: it may contain user pointers!) */
49 struct vring vring;
50
51 /* The function to call to notify the guest about added buffers */
52 void (*notify)(struct vringh *);
53};
54
55/**
56 * struct vringh_config_ops - ops for creating a host vring from a virtio driver
57 * @find_vrhs: find the host vrings and instantiate them
58 * vdev: the virtio_device
59 * nhvrs: the number of host vrings to find
60 * hvrs: on success, includes new host vrings
61 * callbacks: array of driver callbacks, for each host vring
62 * include a NULL entry for vqs that do not need a callback
63 * Returns 0 on success or error status
64 * @del_vrhs: free the host vrings found by find_vrhs().
65 */
66struct virtio_device;
67typedef void vrh_callback_t(struct virtio_device *, struct vringh *);
68struct vringh_config_ops {
69 int (*find_vrhs)(struct virtio_device *vdev, unsigned nhvrs,
70 struct vringh *vrhs[], vrh_callback_t *callbacks[]);
71 void (*del_vrhs)(struct virtio_device *vdev);
72};
73
74/* The memory the vring can access, and what offset to apply. */
75struct vringh_range {
76 u64 start, end_incl;
77 u64 offset;
78};
79
80/**
81 * struct vringh_iov - iovec mangler.
82 *
83 * Mangles iovec in place, and restores it.
84 * Remaining data is iov + i, of used - i elements.
85 */
86struct vringh_iov {
87 struct iovec *iov;
88 size_t consumed; /* Within iov[i] */
89 unsigned i, used, max_num;
90};
91
92/**
 93 * struct vringh_kiov - kvec mangler.
94 *
95 * Mangles kvec in place, and restores it.
96 * Remaining data is iov + i, of used - i elements.
97 */
98struct vringh_kiov {
99 struct kvec *iov;
100 size_t consumed; /* Within iov[i] */
101 unsigned i, used, max_num;
102};
103
104/* Flag on max_num to indicate we're kmalloced. */
105#define VRINGH_IOV_ALLOCATED 0x8000000
106
107/* Helpers for userspace vrings. */
108int vringh_init_user(struct vringh *vrh, u32 features,
109 unsigned int num, bool weak_barriers,
110 struct vring_desc __user *desc,
111 struct vring_avail __user *avail,
112 struct vring_used __user *used);
113
114static inline void vringh_iov_init(struct vringh_iov *iov,
115 struct iovec *iovec, unsigned num)
116{
117 iov->used = iov->i = 0;
118 iov->consumed = 0;
119 iov->max_num = num;
120 iov->iov = iovec;
121}
122
123static inline void vringh_iov_reset(struct vringh_iov *iov)
124{
125 iov->iov[iov->i].iov_len += iov->consumed;
126 iov->iov[iov->i].iov_base -= iov->consumed;
127 iov->consumed = 0;
128 iov->i = 0;
129}
130
131static inline void vringh_iov_cleanup(struct vringh_iov *iov)
132{
133 if (iov->max_num & VRINGH_IOV_ALLOCATED)
134 kfree(iov->iov);
135 iov->max_num = iov->used = iov->i = iov->consumed = 0;
136 iov->iov = NULL;
137}
138
139/* Convert a descriptor into iovecs. */
140int vringh_getdesc_user(struct vringh *vrh,
141 struct vringh_iov *riov,
142 struct vringh_iov *wiov,
143 bool (*getrange)(struct vringh *vrh,
144 u64 addr, struct vringh_range *r),
145 u16 *head);
146
 147/* Copy bytes from readable vsg, consuming it (and incrementing riov->i). */
148ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len);
149
150/* Copy bytes into writable vsg, consuming it (and incrementing wiov->i). */
151ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
152 const void *src, size_t len);
153
154/* Mark a descriptor as used. */
155int vringh_complete_user(struct vringh *vrh, u16 head, u32 len);
156int vringh_complete_multi_user(struct vringh *vrh,
157 const struct vring_used_elem used[],
158 unsigned num_used);
159
160/* Pretend we've never seen descriptor (for easy error handling). */
161void vringh_abandon_user(struct vringh *vrh, unsigned int num);
162
163/* Do we need to fire the eventfd to notify the other side? */
164int vringh_need_notify_user(struct vringh *vrh);
165
166bool vringh_notify_enable_user(struct vringh *vrh);
167void vringh_notify_disable_user(struct vringh *vrh);
168
169/* Helpers for kernelspace vrings. */
170int vringh_init_kern(struct vringh *vrh, u32 features,
171 unsigned int num, bool weak_barriers,
172 struct vring_desc *desc,
173 struct vring_avail *avail,
174 struct vring_used *used);
175
176static inline void vringh_kiov_init(struct vringh_kiov *kiov,
177 struct kvec *kvec, unsigned num)
178{
179 kiov->used = kiov->i = 0;
180 kiov->consumed = 0;
181 kiov->max_num = num;
182 kiov->iov = kvec;
183}
184
185static inline void vringh_kiov_reset(struct vringh_kiov *kiov)
186{
187 kiov->iov[kiov->i].iov_len += kiov->consumed;
188 kiov->iov[kiov->i].iov_base -= kiov->consumed;
189 kiov->consumed = 0;
190 kiov->i = 0;
191}
192
193static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov)
194{
195 if (kiov->max_num & VRINGH_IOV_ALLOCATED)
196 kfree(kiov->iov);
197 kiov->max_num = kiov->used = kiov->i = kiov->consumed = 0;
198 kiov->iov = NULL;
199}
200
201int vringh_getdesc_kern(struct vringh *vrh,
202 struct vringh_kiov *riov,
203 struct vringh_kiov *wiov,
204 u16 *head,
205 gfp_t gfp);
206
207ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len);
208ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
209 const void *src, size_t len);
210void vringh_abandon_kern(struct vringh *vrh, unsigned int num);
211int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len);
212
213bool vringh_notify_enable_kern(struct vringh *vrh);
214void vringh_notify_disable_kern(struct vringh *vrh);
215
216int vringh_need_notify_kern(struct vringh *vrh);
217
218/* Notify the guest about buffers added to the used ring */
219static inline void vringh_notify(struct vringh *vrh)
220{
221 if (vrh->notify)
222 vrh->notify(vrh);
223}
224
225#endif /* _LINUX_VRINGH_H */
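
Taken together, the user-pointer helpers above compose into a simple host-side service loop: fetch a descriptor chain, pull the readable bytes, push a reply into the writable part, complete the head, and notify the guest if needed. A rough sketch follows, under the assumption that vringh_getdesc_user() returns 1 when it found a descriptor and 0 when the ring is empty; error handling is trimmed, and the function name, buffer sizes and request handling are made up.

#include <linux/kernel.h>
#include <linux/uio.h>
#include <linux/vringh.h>

/* Sketch: drain one guest-owned vring using the *_user helpers. */
static int service_vring(struct vringh *vrh,
			 bool (*getrange)(struct vringh *, u64,
					  struct vringh_range *))
{
	struct iovec riovec[8], wiovec[8];
	struct vringh_iov riov, wiov;
	char req[128], reply[128];
	u16 head;
	int err;

	vringh_iov_init(&riov, riovec, ARRAY_SIZE(riovec));
	vringh_iov_init(&wiov, wiovec, ARRAY_SIZE(wiovec));

	while ((err = vringh_getdesc_user(vrh, &riov, &wiov,
					  getrange, &head)) == 1) {
		ssize_t len = vringh_iov_pull_user(&riov, req, sizeof(req));
		if (len < 0)
			return len;

		/* ... act on 'req', build 'reply' ... */

		len = vringh_iov_push_user(&wiov, reply, sizeof(reply));
		if (len < 0)
			return len;

		err = vringh_complete_user(vrh, head, len);
		if (err)
			return err;

		if (vringh_need_notify_user(vrh) > 0)
			vringh_notify(vrh);	/* or signal the eventfd directly */

		/* Free any iov arrays the helpers had to allocate, then re-arm. */
		vringh_iov_cleanup(&riov);
		vringh_iov_cleanup(&wiov);
		vringh_iov_init(&riov, riovec, ARRAY_SIZE(riovec));
		vringh_iov_init(&wiov, wiovec, ARRAY_SIZE(wiovec));
	}
	return err;
}
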
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 652dc8bea921..5e26f61b5df5 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -52,8 +52,8 @@ struct virtio_balloon_config
52#define VIRTIO_BALLOON_S_NR 6 52#define VIRTIO_BALLOON_S_NR 6
53 53
54struct virtio_balloon_stat { 54struct virtio_balloon_stat {
55 u16 tag; 55 __u16 tag;
56 u64 val; 56 __u64 val;
57} __attribute__((packed)); 57} __attribute__((packed));
58 58
59#endif /* _LINUX_VIRTIO_BALLOON_H */ 59#endif /* _LINUX_VIRTIO_BALLOON_H */
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index a7630d04029f..284fc3a05f7b 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -38,5 +38,6 @@
38#define VIRTIO_ID_SCSI 8 /* virtio scsi */ 38#define VIRTIO_ID_SCSI 8 /* virtio scsi */
39#define VIRTIO_ID_9P 9 /* 9p virtio console */ 39#define VIRTIO_ID_9P 9 /* 9p virtio console */
40#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ 40#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */
41#define VIRTIO_ID_CAIF 12 /* Virtio caif */
41 42
42#endif /* _LINUX_VIRTIO_IDS_H */ 43#endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index de2e950a0a7a..e1c26b101830 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -194,11 +194,14 @@ static int pack_sg_list(struct scatterlist *sg, int start,
194 if (s > count) 194 if (s > count)
195 s = count; 195 s = count;
196 BUG_ON(index > limit); 196 BUG_ON(index > limit);
197 /* Make sure we don't terminate early. */
198 sg_unmark_end(&sg[index]);
197 sg_set_buf(&sg[index++], data, s); 199 sg_set_buf(&sg[index++], data, s);
198 count -= s; 200 count -= s;
199 data += s; 201 data += s;
200 } 202 }
201 203 if (index-start)
204 sg_mark_end(&sg[index - 1]);
202 return index-start; 205 return index-start;
203} 206}
204 207
@@ -236,12 +239,17 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit,
236 s = rest_of_page(data); 239 s = rest_of_page(data);
237 if (s > count) 240 if (s > count)
238 s = count; 241 s = count;
242 /* Make sure we don't terminate early. */
243 sg_unmark_end(&sg[index]);
239 sg_set_page(&sg[index++], pdata[i++], s, data_off); 244 sg_set_page(&sg[index++], pdata[i++], s, data_off);
240 data_off = 0; 245 data_off = 0;
241 data += s; 246 data += s;
242 count -= s; 247 count -= s;
243 nr_pages--; 248 nr_pages--;
244 } 249 }
250
251 if (index-start)
252 sg_mark_end(&sg[index - 1]);
245 return index - start; 253 return index - start;
246} 254}
247 255
@@ -256,9 +264,10 @@ static int
256p9_virtio_request(struct p9_client *client, struct p9_req_t *req) 264p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
257{ 265{
258 int err; 266 int err;
259 int in, out; 267 int in, out, out_sgs, in_sgs;
260 unsigned long flags; 268 unsigned long flags;
261 struct virtio_chan *chan = client->trans; 269 struct virtio_chan *chan = client->trans;
270 struct scatterlist *sgs[2];
262 271
263 p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); 272 p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n");
264 273
@@ -266,14 +275,19 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
266req_retry: 275req_retry:
267 spin_lock_irqsave(&chan->lock, flags); 276 spin_lock_irqsave(&chan->lock, flags);
268 277
278 out_sgs = in_sgs = 0;
269 /* Handle out VirtIO ring buffers */ 279 /* Handle out VirtIO ring buffers */
270 out = pack_sg_list(chan->sg, 0, 280 out = pack_sg_list(chan->sg, 0,
271 VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); 281 VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
282 if (out)
283 sgs[out_sgs++] = chan->sg;
272 284
273 in = pack_sg_list(chan->sg, out, 285 in = pack_sg_list(chan->sg, out,
274 VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); 286 VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity);
287 if (in)
288 sgs[out_sgs + in_sgs++] = chan->sg + out;
275 289
276 err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, 290 err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
277 GFP_ATOMIC); 291 GFP_ATOMIC);
278 if (err < 0) { 292 if (err < 0) {
279 if (err == -ENOSPC) { 293 if (err == -ENOSPC) {
@@ -289,7 +303,7 @@ req_retry:
289 } else { 303 } else {
290 spin_unlock_irqrestore(&chan->lock, flags); 304 spin_unlock_irqrestore(&chan->lock, flags);
291 p9_debug(P9_DEBUG_TRANS, 305 p9_debug(P9_DEBUG_TRANS,
292 "virtio rpc add_buf returned failure\n"); 306 "virtio rpc add_sgs returned failure\n");
293 return -EIO; 307 return -EIO;
294 } 308 }
295 } 309 }
@@ -351,11 +365,12 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
351 char *uidata, char *uodata, int inlen, 365 char *uidata, char *uodata, int inlen,
352 int outlen, int in_hdr_len, int kern_buf) 366 int outlen, int in_hdr_len, int kern_buf)
353{ 367{
354 int in, out, err; 368 int in, out, err, out_sgs, in_sgs;
355 unsigned long flags; 369 unsigned long flags;
356 int in_nr_pages = 0, out_nr_pages = 0; 370 int in_nr_pages = 0, out_nr_pages = 0;
357 struct page **in_pages = NULL, **out_pages = NULL; 371 struct page **in_pages = NULL, **out_pages = NULL;
358 struct virtio_chan *chan = client->trans; 372 struct virtio_chan *chan = client->trans;
373 struct scatterlist *sgs[4];
359 374
360 p9_debug(P9_DEBUG_TRANS, "virtio request\n"); 375 p9_debug(P9_DEBUG_TRANS, "virtio request\n");
361 376
@@ -396,13 +411,22 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
396 req->status = REQ_STATUS_SENT; 411 req->status = REQ_STATUS_SENT;
397req_retry_pinned: 412req_retry_pinned:
398 spin_lock_irqsave(&chan->lock, flags); 413 spin_lock_irqsave(&chan->lock, flags);
414
415 out_sgs = in_sgs = 0;
416
399 /* out data */ 417 /* out data */
400 out = pack_sg_list(chan->sg, 0, 418 out = pack_sg_list(chan->sg, 0,
401 VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); 419 VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
402 420
403 if (out_pages) 421 if (out)
422 sgs[out_sgs++] = chan->sg;
423
424 if (out_pages) {
425 sgs[out_sgs++] = chan->sg + out;
404 out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, 426 out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM,
405 out_pages, out_nr_pages, uodata, outlen); 427 out_pages, out_nr_pages, uodata, outlen);
428 }
429
406 /* 430 /*
407 * Take care of in data 431 * Take care of in data
408 * For example TREAD have 11. 432 * For example TREAD have 11.
@@ -412,11 +436,17 @@ req_retry_pinned:
412 */ 436 */
413 in = pack_sg_list(chan->sg, out, 437 in = pack_sg_list(chan->sg, out,
414 VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); 438 VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len);
415 if (in_pages) 439 if (in)
440 sgs[out_sgs + in_sgs++] = chan->sg + out;
441
442 if (in_pages) {
443 sgs[out_sgs + in_sgs++] = chan->sg + out + in;
416 in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, 444 in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM,
417 in_pages, in_nr_pages, uidata, inlen); 445 in_pages, in_nr_pages, uidata, inlen);
446 }
418 447
419 err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, 448 BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs));
449 err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
420 GFP_ATOMIC); 450 GFP_ATOMIC);
421 if (err < 0) { 451 if (err < 0) {
422 if (err == -ENOSPC) { 452 if (err == -ENOSPC) {
@@ -432,7 +462,7 @@ req_retry_pinned:
432 } else { 462 } else {
433 spin_unlock_irqrestore(&chan->lock, flags); 463 spin_unlock_irqrestore(&chan->lock, flags);
434 p9_debug(P9_DEBUG_TRANS, 464 p9_debug(P9_DEBUG_TRANS,
435 "virtio rpc add_buf returned failure\n"); 465 "virtio rpc add_sgs returned failure\n");
436 err = -EIO; 466 err = -EIO;
437 goto err_out; 467 goto err_out;
438 } 468 }
diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt
index 7203ace65e83..06e1f4649511 100644
--- a/tools/lguest/lguest.txt
+++ b/tools/lguest/lguest.txt
@@ -70,7 +70,7 @@ Running Lguest:
70 70
71- Run an lguest as root: 71- Run an lguest as root:
72 72
73 Documentation/virtual/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ 73 tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
74 --block=rootfile root=/dev/vda 74 --block=rootfile root=/dev/vda
75 75
76 Explanation: 76 Explanation:
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d1d442ed106a..3187c62d9814 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,12 +1,14 @@
1all: test mod 1all: test mod
2test: virtio_test 2test: virtio_test vringh_test
3virtio_test: virtio_ring.o virtio_test.o 3virtio_test: virtio_ring.o virtio_test.o
4CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -MMD 4vringh_test: vringh_test.o vringh.o virtio_ring.o
5vpath %.c ../../drivers/virtio 5
6CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE
7vpath %.c ../../drivers/virtio ../../drivers/vhost
6mod: 8mod:
7 ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test 9 ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test
8.PHONY: all test mod clean 10.PHONY: all test mod clean
9clean: 11clean:
10 ${RM} *.o vhost_test/*.o vhost_test/.*.cmd \ 12 ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
11 vhost_test/Module.symvers vhost_test/modules.order *.d 13 vhost_test/Module.symvers vhost_test/modules.order *.d
12-include *.d 14-include *.d
diff --git a/tools/virtio/asm/barrier.h b/tools/virtio/asm/barrier.h
new file mode 100644
index 000000000000..aff61e13306c
--- /dev/null
+++ b/tools/virtio/asm/barrier.h
@@ -0,0 +1,14 @@
1#if defined(__i386__) || defined(__x86_64__)
2#define barrier() asm volatile("" ::: "memory")
3#define mb() __sync_synchronize()
4
5#define smp_mb() mb()
6# define smp_rmb() barrier()
7# define smp_wmb() barrier()
8/* Weak barriers should be used. If not - it's a bug */
9# define rmb() abort()
10# define wmb() abort()
11#else
12#error Please fill in barrier macros
13#endif
14
diff --git a/tools/virtio/linux/bug.h b/tools/virtio/linux/bug.h
new file mode 100644
index 000000000000..fb94f0787c47
--- /dev/null
+++ b/tools/virtio/linux/bug.h
@@ -0,0 +1,10 @@
1#ifndef BUG_H
2#define BUG_H
3
4#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond))
5
6#define BUILD_BUG_ON(x)
7
8#define BUG() abort()
9
10#endif /* BUG_H */
diff --git a/tools/virtio/linux/err.h b/tools/virtio/linux/err.h
new file mode 100644
index 000000000000..e32eff8b2a14
--- /dev/null
+++ b/tools/virtio/linux/err.h
@@ -0,0 +1,26 @@
1#ifndef ERR_H
2#define ERR_H
3#define MAX_ERRNO 4095
4
5#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
6
7static inline void * __must_check ERR_PTR(long error)
8{
9 return (void *) error;
10}
11
12static inline long __must_check PTR_ERR(const void *ptr)
13{
14 return (long) ptr;
15}
16
17static inline long __must_check IS_ERR(const void *ptr)
18{
19 return IS_ERR_VALUE((unsigned long)ptr);
20}
21
22static inline long __must_check IS_ERR_OR_NULL(const void *ptr)
23{
24 return !ptr || IS_ERR_VALUE((unsigned long)ptr);
25}
26#endif /* ERR_H */
diff --git a/tools/virtio/linux/export.h b/tools/virtio/linux/export.h
new file mode 100644
index 000000000000..7311d326894a
--- /dev/null
+++ b/tools/virtio/linux/export.h
@@ -0,0 +1,5 @@
1#define EXPORT_SYMBOL(sym)
2#define EXPORT_SYMBOL_GPL(sym)
3#define EXPORT_SYMBOL_GPL_FUTURE(sym)
4#define EXPORT_UNUSED_SYMBOL(sym)
5#define EXPORT_UNUSED_SYMBOL_GPL(sym)
diff --git a/tools/virtio/linux/irqreturn.h b/tools/virtio/linux/irqreturn.h
new file mode 100644
index 000000000000..a3c4e7be7089
--- /dev/null
+++ b/tools/virtio/linux/irqreturn.h
@@ -0,0 +1 @@
#include "../../../include/linux/irqreturn.h"
diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h
new file mode 100644
index 000000000000..fba705963968
--- /dev/null
+++ b/tools/virtio/linux/kernel.h
@@ -0,0 +1,112 @@
1#ifndef KERNEL_H
2#define KERNEL_H
3#include <stdbool.h>
4#include <stdlib.h>
5#include <stddef.h>
6#include <stdio.h>
7#include <string.h>
8#include <assert.h>
9#include <stdarg.h>
10
11#include <linux/types.h>
12#include <linux/printk.h>
13#include <linux/bug.h>
14#include <errno.h>
15#include <unistd.h>
16#include <asm/barrier.h>
17
18#define CONFIG_SMP
19
20#define PAGE_SIZE getpagesize()
21#define PAGE_MASK (~(PAGE_SIZE-1))
22
23typedef unsigned long long dma_addr_t;
24typedef size_t __kernel_size_t;
25
26struct page {
27 unsigned long long dummy;
28};
29
30/* Physical == Virtual */
31#define virt_to_phys(p) ((unsigned long)p)
32#define phys_to_virt(a) ((void *)(unsigned long)(a))
33/* Page address: Virtual / 4K */
34#define page_to_phys(p) ((dma_addr_t)(unsigned long)(p))
35#define virt_to_page(p) ((struct page *)((unsigned long)p & PAGE_MASK))
36
37#define offset_in_page(p) (((unsigned long)p) % PAGE_SIZE)
38
39#define __printf(a,b) __attribute__((format(printf,a,b)))
40
41typedef enum {
42 GFP_KERNEL,
43 GFP_ATOMIC,
44 __GFP_HIGHMEM,
45 __GFP_HIGH
46} gfp_t;
47
48#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
49
50extern void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
51static inline void *kmalloc(size_t s, gfp_t gfp)
52{
53 if (__kmalloc_fake)
54 return __kmalloc_fake;
55 return malloc(s);
56}
57
58static inline void kfree(void *p)
59{
60 if (p >= __kfree_ignore_start && p < __kfree_ignore_end)
61 return;
62 free(p);
63}
64
65static inline void *krealloc(void *p, size_t s, gfp_t gfp)
66{
67 return realloc(p, s);
68}
69
70
71static inline unsigned long __get_free_page(gfp_t gfp)
72{
73 void *p;
74
75 posix_memalign(&p, PAGE_SIZE, PAGE_SIZE);
76 return (unsigned long)p;
77}
78
79static inline void free_page(unsigned long addr)
80{
81 free((void *)addr);
82}
83
84#define container_of(ptr, type, member) ({ \
85 const typeof( ((type *)0)->member ) *__mptr = (ptr); \
86 (type *)( (char *)__mptr - offsetof(type,member) );})
87
88#define uninitialized_var(x) x = x
89
90# ifndef likely
91# define likely(x) (__builtin_expect(!!(x), 1))
92# endif
93# ifndef unlikely
94# define unlikely(x) (__builtin_expect(!!(x), 0))
95# endif
96
97#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
98#ifdef DEBUG
99#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
100#else
101#define pr_debug(format, ...) do {} while (0)
102#endif
103#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
104#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
105
106#define min(x, y) ({ \
107 typeof(x) _min1 = (x); \
108 typeof(y) _min2 = (y); \
109 (void) (&_min1 == &_min2); \
110 _min1 < _min2 ? _min1 : _min2; })
111
112#endif /* KERNEL_H */
diff --git a/tools/virtio/linux/module.h b/tools/virtio/linux/module.h
index e69de29bb2d1..3039a7e972b6 100644
--- a/tools/virtio/linux/module.h
+++ b/tools/virtio/linux/module.h
@@ -0,0 +1 @@
#include <linux/export.h>
diff --git a/tools/virtio/linux/printk.h b/tools/virtio/linux/printk.h
new file mode 100644
index 000000000000..9f2423bd89c2
--- /dev/null
+++ b/tools/virtio/linux/printk.h
@@ -0,0 +1,4 @@
1#include "../../../include/linux/kern_levels.h"
2
3#define printk printf
4#define vprintk vprintf
diff --git a/tools/virtio/linux/ratelimit.h b/tools/virtio/linux/ratelimit.h
new file mode 100644
index 000000000000..dcce1725f90d
--- /dev/null
+++ b/tools/virtio/linux/ratelimit.h
@@ -0,0 +1,4 @@
1#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) int name = 0
2
3#define __ratelimit(x) (*(x))
4
diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h
new file mode 100644
index 000000000000..68c9e2adc996
--- /dev/null
+++ b/tools/virtio/linux/scatterlist.h
@@ -0,0 +1,189 @@
1#ifndef SCATTERLIST_H
2#define SCATTERLIST_H
3#include <linux/kernel.h>
4
5struct scatterlist {
6 unsigned long page_link;
7 unsigned int offset;
8 unsigned int length;
9 dma_addr_t dma_address;
10};
11
12/* Scatterlist helpers, stolen from linux/scatterlist.h */
13#define sg_is_chain(sg) ((sg)->page_link & 0x01)
14#define sg_is_last(sg) ((sg)->page_link & 0x02)
15#define sg_chain_ptr(sg) \
16 ((struct scatterlist *) ((sg)->page_link & ~0x03))
17
18/**
19 * sg_assign_page - Assign a given page to an SG entry
20 * @sg: SG entry
21 * @page: The page
22 *
23 * Description:
24 * Assign page to sg entry. Also see sg_set_page(), the most commonly used
25 * variant.
26 *
27 **/
28static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
29{
30 unsigned long page_link = sg->page_link & 0x3;
31
32 /*
33 * In order for the low bit stealing approach to work, pages
34 * must be aligned at a 32-bit boundary as a minimum.
35 */
36 BUG_ON((unsigned long) page & 0x03);
37#ifdef CONFIG_DEBUG_SG
38 BUG_ON(sg->sg_magic != SG_MAGIC);
39 BUG_ON(sg_is_chain(sg));
40#endif
41 sg->page_link = page_link | (unsigned long) page;
42}
43
44/**
45 * sg_set_page - Set sg entry to point at given page
46 * @sg: SG entry
47 * @page: The page
48 * @len: Length of data
49 * @offset: Offset into page
50 *
51 * Description:
52 * Use this function to set an sg entry pointing at a page, never assign
53 * the page directly. We encode sg table information in the lower bits
54 * of the page pointer. See sg_page() for looking up the page belonging
55 * to an sg entry.
56 *
57 **/
58static inline void sg_set_page(struct scatterlist *sg, struct page *page,
59 unsigned int len, unsigned int offset)
60{
61 sg_assign_page(sg, page);
62 sg->offset = offset;
63 sg->length = len;
64}
65
66static inline struct page *sg_page(struct scatterlist *sg)
67{
68#ifdef CONFIG_DEBUG_SG
69 BUG_ON(sg->sg_magic != SG_MAGIC);
70 BUG_ON(sg_is_chain(sg));
71#endif
72 return (struct page *)((sg)->page_link & ~0x3);
73}
74
75/*
76 * Loop over each sg element, following the pointer to a new list if necessary
77 */
78#define for_each_sg(sglist, sg, nr, __i) \
79 for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))
80
81/**
82 * sg_chain - Chain two sglists together
83 * @prv: First scatterlist
84 * @prv_nents: Number of entries in prv
85 * @sgl: Second scatterlist
86 *
87 * Description:
88 * Links @prv@ and @sgl@ together, to form a longer scatterlist.
89 *
90 **/
91static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
92 struct scatterlist *sgl)
93{
94 /*
95 * offset and length are unused for chain entry. Clear them.
96 */
97 prv[prv_nents - 1].offset = 0;
98 prv[prv_nents - 1].length = 0;
99
100 /*
101 * Set lowest bit to indicate a link pointer, and make sure to clear
102 * the termination bit if it happens to be set.
103 */
104 prv[prv_nents - 1].page_link = ((unsigned long) sgl | 0x01) & ~0x02;
105}
106
107/**
108 * sg_mark_end - Mark the end of the scatterlist
 109 * @sg: SG entry
110 *
111 * Description:
112 * Marks the passed in sg entry as the termination point for the sg
113 * table. A call to sg_next() on this entry will return NULL.
114 *
115 **/
116static inline void sg_mark_end(struct scatterlist *sg)
117{
118#ifdef CONFIG_DEBUG_SG
119 BUG_ON(sg->sg_magic != SG_MAGIC);
120#endif
121 /*
122 * Set termination bit, clear potential chain bit
123 */
124 sg->page_link |= 0x02;
125 sg->page_link &= ~0x01;
126}
127
128/**
129 * sg_unmark_end - Undo setting the end of the scatterlist
 130 * @sg: SG entry
131 *
132 * Description:
133 * Removes the termination marker from the given entry of the scatterlist.
134 *
135 **/
136static inline void sg_unmark_end(struct scatterlist *sg)
137{
138#ifdef CONFIG_DEBUG_SG
139 BUG_ON(sg->sg_magic != SG_MAGIC);
140#endif
141 sg->page_link &= ~0x02;
142}
143
144static inline struct scatterlist *sg_next(struct scatterlist *sg)
145{
146#ifdef CONFIG_DEBUG_SG
147 BUG_ON(sg->sg_magic != SG_MAGIC);
148#endif
149 if (sg_is_last(sg))
150 return NULL;
151
152 sg++;
153 if (unlikely(sg_is_chain(sg)))
154 sg = sg_chain_ptr(sg);
155
156 return sg;
157}
158
159static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents)
160{
161 memset(sgl, 0, sizeof(*sgl) * nents);
162#ifdef CONFIG_DEBUG_SG
163 {
164 unsigned int i;
165 for (i = 0; i < nents; i++)
166 sgl[i].sg_magic = SG_MAGIC;
167 }
168#endif
169 sg_mark_end(&sgl[nents - 1]);
170}
171
172static inline dma_addr_t sg_phys(struct scatterlist *sg)
173{
174 return page_to_phys(sg_page(sg)) + sg->offset;
175}
176
177static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
178 unsigned int buflen)
179{
180 sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
181}
182
183static inline void sg_init_one(struct scatterlist *sg,
184 const void *buf, unsigned int buflen)
185{
186 sg_init_table(sg, 1);
187 sg_set_buf(sg, buf, buflen);
188}
189#endif /* SCATTERLIST_H */
diff --git a/tools/virtio/linux/types.h b/tools/virtio/linux/types.h
new file mode 100644
index 000000000000..f8ebb9a2b3d6
--- /dev/null
+++ b/tools/virtio/linux/types.h
@@ -0,0 +1,28 @@
1#ifndef TYPES_H
2#define TYPES_H
3#include <stdint.h>
4
5#define __force
6#define __user
7#define __must_check
8#define __cold
9
10typedef uint64_t u64;
11typedef int64_t s64;
12typedef uint32_t u32;
13typedef int32_t s32;
14typedef uint16_t u16;
15typedef int16_t s16;
16typedef uint8_t u8;
17typedef int8_t s8;
18
19typedef uint64_t __u64;
20typedef int64_t __s64;
21typedef uint32_t __u32;
22typedef int32_t __s32;
23typedef uint16_t __u16;
24typedef int16_t __s16;
25typedef uint8_t __u8;
26typedef int8_t __s8;
27
28#endif /* TYPES_H */
diff --git a/tools/virtio/linux/uaccess.h b/tools/virtio/linux/uaccess.h
new file mode 100644
index 000000000000..0a578fe18653
--- /dev/null
+++ b/tools/virtio/linux/uaccess.h
@@ -0,0 +1,50 @@
1#ifndef UACCESS_H
2#define UACCESS_H
3extern void *__user_addr_min, *__user_addr_max;
4
5#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
6
7static inline void __chk_user_ptr(const volatile void *p, size_t size)
8{
9 assert(p >= __user_addr_min && p + size <= __user_addr_max);
10}
11
12#define put_user(x, ptr) \
13({ \
14 typeof(ptr) __pu_ptr = (ptr); \
15 __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \
16 ACCESS_ONCE(*(__pu_ptr)) = x; \
17 0; \
18})
19
20#define get_user(x, ptr) \
21({ \
22 typeof(ptr) __pu_ptr = (ptr); \
23 __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \
24 x = ACCESS_ONCE(*(__pu_ptr)); \
25 0; \
26})
27
28static void volatile_memcpy(volatile char *to, const volatile char *from,
29 unsigned long n)
30{
31 while (n--)
32 *(to++) = *(from++);
33}
34
35static inline int copy_from_user(void *to, const void __user volatile *from,
36 unsigned long n)
37{
38 __chk_user_ptr(from, n);
39 volatile_memcpy(to, from, n);
40 return 0;
41}
42
43static inline int copy_to_user(void __user volatile *to, const void *from,
44 unsigned long n)
45{
46 __chk_user_ptr(to, n);
47 volatile_memcpy(to, from, n);
48 return 0;
49}
50#endif /* UACCESS_H */
diff --git a/tools/virtio/linux/uio.h b/tools/virtio/linux/uio.h
new file mode 100644
index 000000000000..cd20f0ba3081
--- /dev/null
+++ b/tools/virtio/linux/uio.h
@@ -0,0 +1,3 @@
1#include <linux/kernel.h>
2
3#include "../../../include/linux/uio.h"
diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h
index 81847dd08bd0..cd801838156f 100644
--- a/tools/virtio/linux/virtio.h
+++ b/tools/virtio/linux/virtio.h
@@ -1,127 +1,7 @@
1#ifndef LINUX_VIRTIO_H 1#ifndef LINUX_VIRTIO_H
2#define LINUX_VIRTIO_H 2#define LINUX_VIRTIO_H
3 3#include <linux/scatterlist.h>
4#include <stdbool.h> 4#include <linux/kernel.h>
5#include <stdlib.h>
6#include <stddef.h>
7#include <stdio.h>
8#include <string.h>
9#include <assert.h>
10
11#include <linux/types.h>
12#include <errno.h>
13
14typedef unsigned long long dma_addr_t;
15
16struct scatterlist {
17 unsigned long page_link;
18 unsigned int offset;
19 unsigned int length;
20 dma_addr_t dma_address;
21};
22
23struct page {
24 unsigned long long dummy;
25};
26
27#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond))
28
29/* Physical == Virtual */
30#define virt_to_phys(p) ((unsigned long)p)
31#define phys_to_virt(a) ((void *)(unsigned long)(a))
32/* Page address: Virtual / 4K */
33#define virt_to_page(p) ((struct page*)((virt_to_phys(p) / 4096) * \
34 sizeof(struct page)))
35#define offset_in_page(p) (((unsigned long)p) % 4096)
36#define sg_phys(sg) ((sg->page_link & ~0x3) / sizeof(struct page) * 4096 + \
37 sg->offset)
38static inline void sg_mark_end(struct scatterlist *sg)
39{
40 /*
41 * Set termination bit, clear potential chain bit
42 */
43 sg->page_link |= 0x02;
44 sg->page_link &= ~0x01;
45}
46static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents)
47{
48 memset(sgl, 0, sizeof(*sgl) * nents);
49 sg_mark_end(&sgl[nents - 1]);
50}
51static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
52{
53 unsigned long page_link = sg->page_link & 0x3;
54
55 /*
56 * In order for the low bit stealing approach to work, pages
57 * must be aligned at a 32-bit boundary as a minimum.
58 */
59 BUG_ON((unsigned long) page & 0x03);
60 sg->page_link = page_link | (unsigned long) page;
61}
62
63static inline void sg_set_page(struct scatterlist *sg, struct page *page,
64 unsigned int len, unsigned int offset)
65{
66 sg_assign_page(sg, page);
67 sg->offset = offset;
68 sg->length = len;
69}
70
71static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
72 unsigned int buflen)
73{
74 sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
75}
76
77static inline void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen)
78{
79 sg_init_table(sg, 1);
80 sg_set_buf(sg, buf, buflen);
81}
82
83typedef __u16 u16;
84
85typedef enum {
86 GFP_KERNEL,
87 GFP_ATOMIC,
88} gfp_t;
89typedef enum {
90 IRQ_NONE,
91 IRQ_HANDLED
92} irqreturn_t;
93
94static inline void *kmalloc(size_t s, gfp_t gfp)
95{
96 return malloc(s);
97}
98
99static inline void kfree(void *p)
100{
101 free(p);
102}
103
104#define container_of(ptr, type, member) ({ \
105 const typeof( ((type *)0)->member ) *__mptr = (ptr); \
106 (type *)( (char *)__mptr - offsetof(type,member) );})
107
108#define uninitialized_var(x) x = x
109
110# ifndef likely
111# define likely(x) (__builtin_expect(!!(x), 1))
112# endif
113# ifndef unlikely
114# define unlikely(x) (__builtin_expect(!!(x), 0))
115# endif
116
117#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
118#ifdef DEBUG
119#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
120#else
121#define pr_debug(format, ...) do {} while (0)
122#endif
123#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
124#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
125 5
126/* TODO: empty stubs for now. Broken but enough for virtio_ring.c */ 6/* TODO: empty stubs for now. Broken but enough for virtio_ring.c */
127#define list_add_tail(a, b) do {} while (0) 7#define list_add_tail(a, b) do {} while (0)
@@ -131,6 +11,7 @@ static inline void kfree(void *p)
131#define BITS_PER_BYTE 8 11#define BITS_PER_BYTE 8
132#define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE) 12#define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE)
133#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) 13#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
14
134/* TODO: Not atomic as it should be: 15/* TODO: Not atomic as it should be:
135 * we don't use this for anything important. */ 16 * we don't use this for anything important. */
136static inline void clear_bit(int nr, volatile unsigned long *addr) 17static inline void clear_bit(int nr, volatile unsigned long *addr)
@@ -145,10 +26,6 @@ static inline int test_bit(int nr, const volatile unsigned long *addr)
145{ 26{
146 return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); 27 return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
147} 28}
148
149/* The only feature we care to support */
150#define virtio_has_feature(dev, feature) \
151 test_bit((feature), (dev)->features)
152/* end of stubs */ 29/* end of stubs */
153 30
154struct virtio_device { 31struct virtio_device {
@@ -163,39 +40,32 @@ struct virtqueue {
163 void (*callback)(struct virtqueue *vq); 40 void (*callback)(struct virtqueue *vq);
164 const char *name; 41 const char *name;
165 struct virtio_device *vdev; 42 struct virtio_device *vdev;
43 unsigned int index;
44 unsigned int num_free;
166 void *priv; 45 void *priv;
167}; 46};
168 47
169#define EXPORT_SYMBOL_GPL(__EXPORT_SYMBOL_GPL_name) \
170 void __EXPORT_SYMBOL_GPL##__EXPORT_SYMBOL_GPL_name() { \
171}
172#define MODULE_LICENSE(__MODULE_LICENSE_value) \ 48#define MODULE_LICENSE(__MODULE_LICENSE_value) \
173 const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value 49 const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value
174 50
175#define CONFIG_SMP
176
177#if defined(__i386__) || defined(__x86_64__)
178#define barrier() asm volatile("" ::: "memory")
179#define mb() __sync_synchronize()
180
181#define smp_mb() mb()
182# define smp_rmb() barrier()
183# define smp_wmb() barrier()
184/* Weak barriers should be used. If not - it's a bug */
185# define rmb() abort()
186# define wmb() abort()
187#else
188#error Please fill in barrier macros
189#endif
190
191/* Interfaces exported by virtio_ring. */ 51/* Interfaces exported by virtio_ring. */
192int virtqueue_add_buf(struct virtqueue *vq, 52int virtqueue_add_sgs(struct virtqueue *vq,
193 struct scatterlist sg[], 53 struct scatterlist *sgs[],
194 unsigned int out_num, 54 unsigned int out_sgs,
195 unsigned int in_num, 55 unsigned int in_sgs,
196 void *data, 56 void *data,
197 gfp_t gfp); 57 gfp_t gfp);
198 58
59int virtqueue_add_outbuf(struct virtqueue *vq,
60 struct scatterlist sg[], unsigned int num,
61 void *data,
62 gfp_t gfp);
63
64int virtqueue_add_inbuf(struct virtqueue *vq,
65 struct scatterlist sg[], unsigned int num,
66 void *data,
67 gfp_t gfp);
68
199void virtqueue_kick(struct virtqueue *vq); 69void virtqueue_kick(struct virtqueue *vq);
200 70
201void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); 71void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len);
@@ -206,7 +76,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
206bool virtqueue_enable_cb_delayed(struct virtqueue *vq); 76bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
207 77
208void *virtqueue_detach_unused_buf(struct virtqueue *vq); 78void *virtqueue_detach_unused_buf(struct virtqueue *vq);
209struct virtqueue *vring_new_virtqueue(unsigned int num, 79struct virtqueue *vring_new_virtqueue(unsigned int index,
80 unsigned int num,
210 unsigned int vring_align, 81 unsigned int vring_align,
211 struct virtio_device *vdev, 82 struct virtio_device *vdev,
212 bool weak_barriers, 83 bool weak_barriers,
diff --git a/tools/virtio/linux/virtio_config.h b/tools/virtio/linux/virtio_config.h
new file mode 100644
index 000000000000..5049967f99f7
--- /dev/null
+++ b/tools/virtio/linux/virtio_config.h
@@ -0,0 +1,6 @@
1#define VIRTIO_TRANSPORT_F_START 28
2#define VIRTIO_TRANSPORT_F_END 32
3
4#define virtio_has_feature(dev, feature) \
5 test_bit((feature), (dev)->features)
6
diff --git a/tools/virtio/linux/virtio_ring.h b/tools/virtio/linux/virtio_ring.h
new file mode 100644
index 000000000000..8949c4e2772c
--- /dev/null
+++ b/tools/virtio/linux/virtio_ring.h
@@ -0,0 +1 @@
#include "../../../include/linux/virtio_ring.h"
diff --git a/tools/virtio/linux/vringh.h b/tools/virtio/linux/vringh.h
new file mode 100644
index 000000000000..9348957be56e
--- /dev/null
+++ b/tools/virtio/linux/vringh.h
@@ -0,0 +1 @@
#include "../../../include/linux/vringh.h"
diff --git a/tools/virtio/uapi/linux/uio.h b/tools/virtio/uapi/linux/uio.h
new file mode 100644
index 000000000000..7230e9002207
--- /dev/null
+++ b/tools/virtio/uapi/linux/uio.h
@@ -0,0 +1 @@
#include <sys/uio.h>
diff --git a/tools/virtio/uapi/linux/virtio_config.h b/tools/virtio/uapi/linux/virtio_config.h
new file mode 100644
index 000000000000..4c86675f0159
--- /dev/null
+++ b/tools/virtio/uapi/linux/virtio_config.h
@@ -0,0 +1 @@
#include "../../../../include/uapi/linux/virtio_config.h"
diff --git a/tools/virtio/uapi/linux/virtio_ring.h b/tools/virtio/uapi/linux/virtio_ring.h
new file mode 100644
index 000000000000..4d99c78234d3
--- /dev/null
+++ b/tools/virtio/uapi/linux/virtio_ring.h
@@ -0,0 +1,4 @@
1#ifndef VIRTIO_RING_H
2#define VIRTIO_RING_H
3#include "../../../../include/uapi/linux/virtio_ring.h"
4#endif /* VIRTIO_RING_H */
diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c
index fcc9aa25fd08..da7a19558281 100644
--- a/tools/virtio/virtio_test.c
+++ b/tools/virtio/virtio_test.c
@@ -10,11 +10,15 @@
10#include <sys/stat.h> 10#include <sys/stat.h>
11#include <sys/types.h> 11#include <sys/types.h>
12#include <fcntl.h> 12#include <fcntl.h>
13#include <stdbool.h>
13#include <linux/vhost.h> 14#include <linux/vhost.h>
14#include <linux/virtio.h> 15#include <linux/virtio.h>
15#include <linux/virtio_ring.h> 16#include <linux/virtio_ring.h>
16#include "../../drivers/vhost/test.h" 17#include "../../drivers/vhost/test.h"
17 18
19/* Unused */
20void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
21
18struct vq_info { 22struct vq_info {
19 int kick; 23 int kick;
20 int call; 24 int call;
@@ -92,7 +96,8 @@ static void vq_info_add(struct vdev_info *dev, int num)
92 assert(r >= 0); 96 assert(r >= 0);
93 memset(info->ring, 0, vring_size(num, 4096)); 97 memset(info->ring, 0, vring_size(num, 4096));
94 vring_init(&info->vring, num, info->ring, 4096); 98 vring_init(&info->vring, num, info->ring, 4096);
95 info->vq = vring_new_virtqueue(info->vring.num, 4096, &dev->vdev, 99 info->vq = vring_new_virtqueue(info->idx,
100 info->vring.num, 4096, &dev->vdev,
96 true, info->ring, 101 true, info->ring,
97 vq_notify, vq_callback, "test"); 102 vq_notify, vq_callback, "test");
98 assert(info->vq); 103 assert(info->vq);
@@ -161,9 +166,9 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq,
161 do { 166 do {
162 if (started < bufs) { 167 if (started < bufs) {
163 sg_init_one(&sl, dev->buf, dev->buf_size); 168 sg_init_one(&sl, dev->buf, dev->buf_size);
164 r = virtqueue_add_buf(vq->vq, &sl, 1, 0, 169 r = virtqueue_add_outbuf(vq->vq, &sl, 1,
165 dev->buf + started, 170 dev->buf + started,
166 GFP_ATOMIC); 171 GFP_ATOMIC);
167 if (likely(r == 0)) { 172 if (likely(r == 0)) {
168 ++started; 173 ++started;
169 virtqueue_kick(vq->vq); 174 virtqueue_kick(vq->vq);
diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c
new file mode 100644
index 000000000000..d053ea40c001
--- /dev/null
+++ b/tools/virtio/vringh_test.c
@@ -0,0 +1,741 @@
 1/* Simple test of virtio code, entirely in userspace. */
2#define _GNU_SOURCE
3#include <sched.h>
4#include <err.h>
5#include <linux/kernel.h>
6#include <linux/err.h>
7#include <linux/virtio.h>
8#include <linux/vringh.h>
9#include <linux/virtio_ring.h>
10#include <linux/uaccess.h>
11#include <sys/types.h>
12#include <sys/stat.h>
13#include <sys/mman.h>
14#include <sys/wait.h>
15#include <fcntl.h>
16
17#define USER_MEM (1024*1024)
18void *__user_addr_min, *__user_addr_max;
19void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
20static u64 user_addr_offset;
21
22#define RINGSIZE 256
23#define ALIGN 4096
24
25static void never_notify_host(struct virtqueue *vq)
26{
27 abort();
28}
29
30static void never_callback_guest(struct virtqueue *vq)
31{
32 abort();
33}
34
35static bool getrange_iov(struct vringh *vrh, u64 addr, struct vringh_range *r)
36{
37 if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset)
38 return false;
39 if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset)
40 return false;
41
42 r->start = (u64)(unsigned long)__user_addr_min - user_addr_offset;
43 r->end_incl = (u64)(unsigned long)__user_addr_max - 1 - user_addr_offset;
44 r->offset = user_addr_offset;
45 return true;
46}
47
48/* We return single byte ranges. */
49static bool getrange_slow(struct vringh *vrh, u64 addr, struct vringh_range *r)
50{
51 if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset)
52 return false;
53 if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset)
54 return false;
55
56 r->start = addr;
57 r->end_incl = r->start;
58 r->offset = user_addr_offset;
59 return true;
60}
61
62struct guest_virtio_device {
63 struct virtio_device vdev;
64 int to_host_fd;
65 unsigned long notifies;
66};
67
68static void parallel_notify_host(struct virtqueue *vq)
69{
70 struct guest_virtio_device *gvdev;
71
72 gvdev = container_of(vq->vdev, struct guest_virtio_device, vdev);
73 write(gvdev->to_host_fd, "", 1);
74 gvdev->notifies++;
75}
76
77static void no_notify_host(struct virtqueue *vq)
78{
79}
80
81#define NUM_XFERS (10000000)
82
83/* We aim for two "distant" cpus. */
84static void find_cpus(unsigned int *first, unsigned int *last)
85{
86 unsigned int i;
87
88 *first = -1U;
89 *last = 0;
90 for (i = 0; i < 4096; i++) {
91 cpu_set_t set;
92 CPU_ZERO(&set);
93 CPU_SET(i, &set);
94 if (sched_setaffinity(getpid(), sizeof(set), &set) == 0) {
95 if (i < *first)
96 *first = i;
97 if (i > *last)
98 *last = i;
99 }
100 }
101}
102
103/* Opencoded version for fast mode */
104static inline int vringh_get_head(struct vringh *vrh, u16 *head)
105{
106 u16 avail_idx, i;
107 int err;
108
109 err = get_user(avail_idx, &vrh->vring.avail->idx);
110 if (err)
111 return err;
112
113 if (vrh->last_avail_idx == avail_idx)
114 return 0;
115
116 /* Only get avail ring entries after they have been exposed by guest. */
117 virtio_rmb(vrh->weak_barriers);
118
119 i = vrh->last_avail_idx & (vrh->vring.num - 1);
120
121 err = get_user(*head, &vrh->vring.avail->ring[i]);
122 if (err)
123 return err;
124
125 vrh->last_avail_idx++;
126 return 1;
127}
128
129static int parallel_test(unsigned long features,
130 bool (*getrange)(struct vringh *vrh,
131 u64 addr, struct vringh_range *r),
132 bool fast_vringh)
133{
134 void *host_map, *guest_map;
135 int fd, mapsize, to_guest[2], to_host[2];
136 unsigned long xfers = 0, notifies = 0, receives = 0;
137 unsigned int first_cpu, last_cpu;
138 cpu_set_t cpu_set;
139 char buf[128];
140
141 /* Create real file to mmap. */
142 fd = open("/tmp/vringh_test-file", O_RDWR|O_CREAT|O_TRUNC, 0600);
143 if (fd < 0)
144 err(1, "Opening /tmp/vringh_test-file");
145
146 /* Extra room at the end for some data, and indirects */
147 mapsize = vring_size(RINGSIZE, ALIGN)
148 + RINGSIZE * 2 * sizeof(int)
149 + RINGSIZE * 6 * sizeof(struct vring_desc);
150 mapsize = (mapsize + getpagesize() - 1) & ~(getpagesize() - 1);
151 ftruncate(fd, mapsize);
152
153 /* Parent and child use separate addresses, to check our mapping logic! */
154 host_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
155 guest_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
156
157 pipe(to_guest);
158 pipe(to_host);
159
160 CPU_ZERO(&cpu_set);
161 find_cpus(&first_cpu, &last_cpu);
162 printf("Using CPUS %u and %u\n", first_cpu, last_cpu);
163 fflush(stdout);
164
165 if (fork() != 0) {
166 struct vringh vrh;
167 int status, err, rlen = 0;
168 char rbuf[5];
169
170 /* We are the host: never access guest addresses! */
171 munmap(guest_map, mapsize);
172
173 __user_addr_min = host_map;
174 __user_addr_max = __user_addr_min + mapsize;
175 user_addr_offset = host_map - guest_map;
176 assert(user_addr_offset);
177
178 close(to_guest[0]);
179 close(to_host[1]);
180
181 vring_init(&vrh.vring, RINGSIZE, host_map, ALIGN);
182 vringh_init_user(&vrh, features, RINGSIZE, true,
183 vrh.vring.desc, vrh.vring.avail, vrh.vring.used);
184 CPU_SET(first_cpu, &cpu_set);
185 if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set))
186 errx(1, "Could not set affinity to cpu %u", first_cpu);
187
188 while (xfers < NUM_XFERS) {
189 struct iovec host_riov[2], host_wiov[2];
190 struct vringh_iov riov, wiov;
191 u16 head, written;
192
193 if (fast_vringh) {
194 for (;;) {
195 err = vringh_get_head(&vrh, &head);
196 if (err != 0)
197 break;
198 err = vringh_need_notify_user(&vrh);
199 if (err < 0)
200 errx(1, "vringh_need_notify_user: %i",
201 err);
202 if (err) {
203 write(to_guest[1], "", 1);
204 notifies++;
205 }
206 }
207 if (err != 1)
208 errx(1, "vringh_get_head");
209 written = 0;
210 goto complete;
211 } else {
212 vringh_iov_init(&riov,
213 host_riov,
214 ARRAY_SIZE(host_riov));
215 vringh_iov_init(&wiov,
216 host_wiov,
217 ARRAY_SIZE(host_wiov));
218
219 err = vringh_getdesc_user(&vrh, &riov, &wiov,
220 getrange, &head);
221 }
222 if (err == 0) {
223 err = vringh_need_notify_user(&vrh);
224 if (err < 0)
225 errx(1, "vringh_need_notify_user: %i",
226 err);
227 if (err) {
228 write(to_guest[1], "", 1);
229 notifies++;
230 }
231
232 if (!vringh_notify_enable_user(&vrh))
233 continue;
234
235 /* Swallow all notifies at once. */
236 if (read(to_host[0], buf, sizeof(buf)) < 1)
237 break;
238
239 vringh_notify_disable_user(&vrh);
240 receives++;
241 continue;
242 }
243 if (err != 1)
244 errx(1, "vringh_getdesc_user: %i", err);
245
246 /* We simply copy bytes. */
247 if (riov.used) {
248 rlen = vringh_iov_pull_user(&riov, rbuf,
249 sizeof(rbuf));
250 if (rlen != 4)
251 errx(1, "vringh_iov_pull_user: %i",
252 rlen);
253 assert(riov.i == riov.used);
254 written = 0;
255 } else {
256 err = vringh_iov_push_user(&wiov, rbuf, rlen);
257 if (err != rlen)
258 errx(1, "vringh_iov_push_user: %i",
259 err);
260 assert(wiov.i == wiov.used);
261 written = err;
262 }
263 complete:
264 xfers++;
265
266 err = vringh_complete_user(&vrh, head, written);
267 if (err != 0)
268 errx(1, "vringh_complete_user: %i", err);
269 }
270
271 err = vringh_need_notify_user(&vrh);
272 if (err < 0)
273 errx(1, "vringh_need_notify_user: %i", err);
274 if (err) {
275 write(to_guest[1], "", 1);
276 notifies++;
277 }
278 wait(&status);
279 if (!WIFEXITED(status))
280 errx(1, "Child died with signal %i?", WTERMSIG(status));
281 if (WEXITSTATUS(status) != 0)
282 errx(1, "Child exited %i?", WEXITSTATUS(status));
283 printf("Host: notified %lu, pinged %lu\n", notifies, receives);
284 return 0;
285 } else {
286 struct guest_virtio_device gvdev;
287 struct virtqueue *vq;
288 unsigned int *data;
289 struct vring_desc *indirects;
290 unsigned int finished = 0;
291
292 /* We pass sg[]s pointing into here, but we need RINGSIZE+1 */
293 data = guest_map + vring_size(RINGSIZE, ALIGN);
294 indirects = (void *)data + (RINGSIZE + 1) * 2 * sizeof(int);
295
296 /* We are the guest. */
297 munmap(host_map, mapsize);
298
299 close(to_guest[1]);
300 close(to_host[0]);
301
302 gvdev.vdev.features[0] = features;
303 gvdev.to_host_fd = to_host[1];
304 gvdev.notifies = 0;
305
306 CPU_SET(first_cpu, &cpu_set);
307 if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set))
308 err(1, "Could not set affinity to cpu %u", first_cpu);
309
310 vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &gvdev.vdev, true,
311 guest_map, fast_vringh ? no_notify_host
312 : parallel_notify_host,
313 never_callback_guest, "guest vq");
314
315 /* Don't kfree indirects. */
316 __kfree_ignore_start = indirects;
317 __kfree_ignore_end = indirects + RINGSIZE * 6;
318
319 while (xfers < NUM_XFERS) {
320 struct scatterlist sg[4];
321 unsigned int num_sg, len;
322 int *dbuf, err;
323 bool output = !(xfers % 2);
324
325 /* Consume bufs. */
326 while ((dbuf = virtqueue_get_buf(vq, &len)) != NULL) {
327 if (len == 4)
328 assert(*dbuf == finished - 1);
329 else if (!fast_vringh)
330 assert(*dbuf == finished);
331 finished++;
332 }
333
334 /* Produce a buffer. */
335 dbuf = data + (xfers % (RINGSIZE + 1));
336
337 if (output)
338 *dbuf = xfers;
339 else
340 *dbuf = -1;
341
342 switch ((xfers / sizeof(*dbuf)) % 4) {
343 case 0:
344 /* Nasty three-element sg list. */
345 sg_init_table(sg, num_sg = 3);
346 sg_set_buf(&sg[0], (void *)dbuf, 1);
347 sg_set_buf(&sg[1], (void *)dbuf + 1, 2);
348 sg_set_buf(&sg[2], (void *)dbuf + 3, 1);
349 break;
350 case 1:
351 sg_init_table(sg, num_sg = 2);
352 sg_set_buf(&sg[0], (void *)dbuf, 1);
353 sg_set_buf(&sg[1], (void *)dbuf + 1, 3);
354 break;
355 case 2:
356 sg_init_table(sg, num_sg = 1);
357 sg_set_buf(&sg[0], (void *)dbuf, 4);
358 break;
359 case 3:
360 sg_init_table(sg, num_sg = 4);
361 sg_set_buf(&sg[0], (void *)dbuf, 1);
362 sg_set_buf(&sg[1], (void *)dbuf + 1, 1);
363 sg_set_buf(&sg[2], (void *)dbuf + 2, 1);
364 sg_set_buf(&sg[3], (void *)dbuf + 3, 1);
365 break;
366 }
367
368 /* May allocate an indirect, so force it to allocate
369 * user addr */
370 __kmalloc_fake = indirects + (xfers % RINGSIZE) * 4;
371 if (output)
372 err = virtqueue_add_outbuf(vq, sg, num_sg, dbuf,
373 GFP_KERNEL);
374 else
375 err = virtqueue_add_inbuf(vq, sg, num_sg,
376 dbuf, GFP_KERNEL);
377
378 if (err == -ENOSPC) {
379 if (!virtqueue_enable_cb_delayed(vq))
380 continue;
381 /* Swallow all notifies at once. */
382 if (read(to_guest[0], buf, sizeof(buf)) < 1)
383 break;
384
385 receives++;
386 virtqueue_disable_cb(vq);
387 continue;
388 }
389
390 if (err)
391 errx(1, "virtqueue_add_in/outbuf: %i", err);
392
393 xfers++;
394 virtqueue_kick(vq);
395 }
396
397 /* Any extra? */
398 while (finished != xfers) {
399 int *dbuf;
400 unsigned int len;
401
402 /* Consume bufs. */
403 dbuf = virtqueue_get_buf(vq, &len);
404 if (dbuf) {
405 if (len == 4)
406 assert(*dbuf == finished - 1);
407 else
408 assert(len == 0);
409 finished++;
410 continue;
411 }
412
413 if (!virtqueue_enable_cb_delayed(vq))
414 continue;
415 if (read(to_guest[0], buf, sizeof(buf)) < 1)
416 break;
417
418 receives++;
419 virtqueue_disable_cb(vq);
420 }
421
422 printf("Guest: notified %lu, pinged %lu\n",
423 gvdev.notifies, receives);
424 vring_del_virtqueue(vq);
425 return 0;
426 }
427}
428
429int main(int argc, char *argv[])
430{
431 struct virtio_device vdev;
432 struct virtqueue *vq;
433 struct vringh vrh;
434 struct scatterlist guest_sg[RINGSIZE], *sgs[2];
435 struct iovec host_riov[2], host_wiov[2];
436 struct vringh_iov riov, wiov;
437 struct vring_used_elem used[RINGSIZE];
438 char buf[28];
439 u16 head;
440 int err;
441 unsigned i;
442 void *ret;
443 bool (*getrange)(struct vringh *vrh, u64 addr, struct vringh_range *r);
444 bool fast_vringh = false, parallel = false;
445
446 getrange = getrange_iov;
447 vdev.features[0] = 0;
448
449 while (argv[1]) {
450 if (strcmp(argv[1], "--indirect") == 0)
451 vdev.features[0] |= (1 << VIRTIO_RING_F_INDIRECT_DESC);
452 else if (strcmp(argv[1], "--eventidx") == 0)
453 vdev.features[0] |= (1 << VIRTIO_RING_F_EVENT_IDX);
454 else if (strcmp(argv[1], "--slow-range") == 0)
455 getrange = getrange_slow;
456 else if (strcmp(argv[1], "--fast-vringh") == 0)
457 fast_vringh = true;
458 else if (strcmp(argv[1], "--parallel") == 0)
459 parallel = true;
460 else
461 errx(1, "Unknown arg %s", argv[1]);
462 argv++;
463 }
464
465 if (parallel)
466 return parallel_test(vdev.features[0], getrange, fast_vringh);
467
468 if (posix_memalign(&__user_addr_min, PAGE_SIZE, USER_MEM) != 0)
469 abort();
470 __user_addr_max = __user_addr_min + USER_MEM;
471 memset(__user_addr_min, 0, vring_size(RINGSIZE, ALIGN));
472
473 /* Set up guest side. */
474 vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true,
475 __user_addr_min,
476 never_notify_host, never_callback_guest,
477 "guest vq");
478
479 /* Set up host side. */
480 vring_init(&vrh.vring, RINGSIZE, __user_addr_min, ALIGN);
481 vringh_init_user(&vrh, vdev.features[0], RINGSIZE, true,
482 vrh.vring.desc, vrh.vring.avail, vrh.vring.used);
483
484 /* No descriptor to get yet... */
485 err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
486 if (err != 0)
487 errx(1, "vringh_getdesc_user: %i", err);
488
489 /* Guest puts in a descriptor. */
490 memcpy(__user_addr_max - 1, "a", 1);
491 sg_init_table(guest_sg, 1);
492 sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1);
493 sg_init_table(guest_sg+1, 1);
494 sg_set_buf(&guest_sg[1], __user_addr_max - 3, 2);
495 sgs[0] = &guest_sg[0];
496 sgs[1] = &guest_sg[1];
497
498 /* May allocate an indirect, so force it to allocate user addr */
499 __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN);
500 err = virtqueue_add_sgs(vq, sgs, 1, 1, &err, GFP_KERNEL);
501 if (err)
502 errx(1, "virtqueue_add_sgs: %i", err);
503 __kmalloc_fake = NULL;
504
505	/* Host retrieves it. */
506 vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
507 vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
508
509 err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
510 if (err != 1)
511 errx(1, "vringh_getdesc_user: %i", err);
512
513 assert(riov.used == 1);
514 assert(riov.iov[0].iov_base == __user_addr_max - 1);
515 assert(riov.iov[0].iov_len == 1);
516 if (getrange != getrange_slow) {
517 assert(wiov.used == 1);
518 assert(wiov.iov[0].iov_base == __user_addr_max - 3);
519 assert(wiov.iov[0].iov_len == 2);
520 } else {
521 assert(wiov.used == 2);
522 assert(wiov.iov[0].iov_base == __user_addr_max - 3);
523 assert(wiov.iov[0].iov_len == 1);
524 assert(wiov.iov[1].iov_base == __user_addr_max - 2);
525 assert(wiov.iov[1].iov_len == 1);
526 }
527
528 err = vringh_iov_pull_user(&riov, buf, 5);
529 if (err != 1)
530 errx(1, "vringh_iov_pull_user: %i", err);
531 assert(buf[0] == 'a');
532 assert(riov.i == 1);
533 assert(vringh_iov_pull_user(&riov, buf, 5) == 0);
534
535 memcpy(buf, "bcdef", 5);
536 err = vringh_iov_push_user(&wiov, buf, 5);
537 if (err != 2)
538 errx(1, "vringh_iov_push_user: %i", err);
539 assert(memcmp(__user_addr_max - 3, "bc", 2) == 0);
540 assert(wiov.i == wiov.used);
541 assert(vringh_iov_push_user(&wiov, buf, 5) == 0);
542
543 /* Host is done. */
544 err = vringh_complete_user(&vrh, head, err);
545 if (err != 0)
546 errx(1, "vringh_complete_user: %i", err);
547
548 /* Guest should see used token now. */
549 __kfree_ignore_start = __user_addr_min + vring_size(RINGSIZE, ALIGN);
550 __kfree_ignore_end = __kfree_ignore_start + 1;
551 ret = virtqueue_get_buf(vq, &i);
552 if (ret != &err)
553 errx(1, "virtqueue_get_buf: %p", ret);
554 assert(i == 2);
555
556 /* Guest puts in a huge descriptor. */
557 sg_init_table(guest_sg, RINGSIZE);
558 for (i = 0; i < RINGSIZE; i++) {
559 sg_set_buf(&guest_sg[i],
560 __user_addr_max - USER_MEM/4, USER_MEM/4);
561 }
562
563 /* Fill contents with recognisable garbage. */
564 for (i = 0; i < USER_MEM/4; i++)
565 ((char *)__user_addr_max - USER_MEM/4)[i] = i;
566
567 /* This will allocate an indirect, so force it to allocate user addr */
568 __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN);
569 err = virtqueue_add_outbuf(vq, guest_sg, RINGSIZE, &err, GFP_KERNEL);
570 if (err)
571 errx(1, "virtqueue_add_outbuf (large): %i", err);
572 __kmalloc_fake = NULL;
573
574 /* Host picks it up (allocates new iov). */
575 vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
576 vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
577
578 err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
579 if (err != 1)
580 errx(1, "vringh_getdesc_user: %i", err);
581
582 assert(riov.max_num & VRINGH_IOV_ALLOCATED);
583 assert(riov.iov != host_riov);
584 if (getrange != getrange_slow)
585 assert(riov.used == RINGSIZE);
586 else
587 assert(riov.used == RINGSIZE * USER_MEM/4);
588
589 assert(!(wiov.max_num & VRINGH_IOV_ALLOCATED));
590 assert(wiov.used == 0);
591
592 /* Pull data back out (in odd chunks), should be as expected. */
593 for (i = 0; i < RINGSIZE * USER_MEM/4; i += 3) {
594 err = vringh_iov_pull_user(&riov, buf, 3);
595 if (err != 3 && i + err != RINGSIZE * USER_MEM/4)
596 errx(1, "vringh_iov_pull_user large: %i", err);
597 assert(buf[0] == (char)i);
598 assert(err < 2 || buf[1] == (char)(i + 1));
599 assert(err < 3 || buf[2] == (char)(i + 2));
600 }
601 assert(riov.i == riov.used);
602 vringh_iov_cleanup(&riov);
603 vringh_iov_cleanup(&wiov);
604
605 /* Complete using multi interface, just because we can. */
606 used[0].id = head;
607 used[0].len = 0;
608 err = vringh_complete_multi_user(&vrh, used, 1);
609 if (err)
610 errx(1, "vringh_complete_multi_user(1): %i", err);
611
612 /* Free up those descriptors. */
613 ret = virtqueue_get_buf(vq, &i);
614 if (ret != &err)
615 errx(1, "virtqueue_get_buf: %p", ret);
616
617 /* Add lots of descriptors. */
618 sg_init_table(guest_sg, 1);
619 sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1);
620 for (i = 0; i < RINGSIZE; i++) {
621 err = virtqueue_add_outbuf(vq, guest_sg, 1, &err, GFP_KERNEL);
622 if (err)
623 errx(1, "virtqueue_add_outbuf (multiple): %i", err);
624 }
625
626 /* Now get many, and consume them all at once. */
627 vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
628 vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
629
630 for (i = 0; i < RINGSIZE; i++) {
631 err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
632 if (err != 1)
633 errx(1, "vringh_getdesc_user: %i", err);
634 used[i].id = head;
635 used[i].len = 0;
636 }
637 /* Make sure it wraps around ring, to test! */
638 assert(vrh.vring.used->idx % RINGSIZE != 0);
639 err = vringh_complete_multi_user(&vrh, used, RINGSIZE);
640 if (err)
641 errx(1, "vringh_complete_multi_user: %i", err);
642
643 /* Free those buffers. */
644 for (i = 0; i < RINGSIZE; i++) {
645 unsigned len;
646 assert(virtqueue_get_buf(vq, &len) != NULL);
647 }
648
649 /* Test weird (but legal!) indirect. */
650 if (vdev.features[0] & (1 << VIRTIO_RING_F_INDIRECT_DESC)) {
651 char *data = __user_addr_max - USER_MEM/4;
652 struct vring_desc *d = __user_addr_max - USER_MEM/2;
653 struct vring vring;
654
655 /* Force creation of direct, which we modify. */
656 vdev.features[0] &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
657 vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true,
658 __user_addr_min,
659 never_notify_host,
660 never_callback_guest,
661 "guest vq");
662
663 sg_init_table(guest_sg, 4);
664 sg_set_buf(&guest_sg[0], d, sizeof(*d)*2);
665 sg_set_buf(&guest_sg[1], d + 2, sizeof(*d)*1);
666 sg_set_buf(&guest_sg[2], data + 6, 4);
667 sg_set_buf(&guest_sg[3], d + 3, sizeof(*d)*3);
668
669 err = virtqueue_add_outbuf(vq, guest_sg, 4, &err, GFP_KERNEL);
670 if (err)
671 errx(1, "virtqueue_add_outbuf (indirect): %i", err);
672
673 vring_init(&vring, RINGSIZE, __user_addr_min, ALIGN);
674
675 /* They're used in order, but double-check... */
676 assert(vring.desc[0].addr == (unsigned long)d);
677 assert(vring.desc[1].addr == (unsigned long)(d+2));
678 assert(vring.desc[2].addr == (unsigned long)data + 6);
679 assert(vring.desc[3].addr == (unsigned long)(d+3));
680 vring.desc[0].flags |= VRING_DESC_F_INDIRECT;
681 vring.desc[1].flags |= VRING_DESC_F_INDIRECT;
682 vring.desc[3].flags |= VRING_DESC_F_INDIRECT;
683
684 /* First indirect */
685 d[0].addr = (unsigned long)data;
686 d[0].len = 1;
687 d[0].flags = VRING_DESC_F_NEXT;
688 d[0].next = 1;
689 d[1].addr = (unsigned long)data + 1;
690 d[1].len = 2;
691 d[1].flags = 0;
692
693 /* Second indirect */
694 d[2].addr = (unsigned long)data + 3;
695 d[2].len = 3;
696 d[2].flags = 0;
697
698 /* Third indirect */
699 d[3].addr = (unsigned long)data + 10;
700 d[3].len = 5;
701 d[3].flags = VRING_DESC_F_NEXT;
702 d[3].next = 1;
703 d[4].addr = (unsigned long)data + 15;
704 d[4].len = 6;
705 d[4].flags = VRING_DESC_F_NEXT;
706 d[4].next = 2;
707 d[5].addr = (unsigned long)data + 21;
708 d[5].len = 7;
709 d[5].flags = 0;
710
711 /* Host picks it up (allocates new iov). */
712 vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
713 vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
714
715 err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
716 if (err != 1)
717 errx(1, "vringh_getdesc_user: %i", err);
718
719 if (head != 0)
720 errx(1, "vringh_getdesc_user: head %i not 0", head);
721
722 assert(riov.max_num & VRINGH_IOV_ALLOCATED);
723 if (getrange != getrange_slow)
724 assert(riov.used == 7);
725 else
726 assert(riov.used == 28);
727 err = vringh_iov_pull_user(&riov, buf, 29);
728 assert(err == 28);
729
730 /* Data should be linear. */
731 for (i = 0; i < err; i++)
732 assert(buf[i] == i);
733 vringh_iov_cleanup(&riov);
734 }
735
736 /* Don't leak memory... */
737 vring_del_virtqueue(vq);
738 free(__user_addr_min);
739
740 return 0;
741}
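
For reference, the host-side service loop that vringh_test.c exercises boils down to the sequence below: set up the ring view with vringh_init_user(), fetch a descriptor chain with vringh_getdesc_user(), copy data through vringh_iov_pull_user()/vringh_iov_push_user(), retire the chain with vringh_complete_user(), and kick the guest when vringh_need_notify_user() says so. This is a condensed sketch under the test's own assumptions (the RINGSIZE/ALIGN constants and a getrange callback like getrange_iov() above); the real test additionally does CPU pinning, the open-coded vringh_get_head() fast path, and pipe-based notification.

/* Condensed sketch of one pass of the host loop from the test above. */
#include <linux/kernel.h>
#include <linux/vringh.h>
#include <linux/virtio_ring.h>
#include <linux/uaccess.h>

static void serve_once(void *ring_base, unsigned long features,
		       bool (*getrange)(struct vringh *, u64,
					struct vringh_range *))
{
	struct iovec riov_space[2], wiov_space[2];
	struct vringh_iov riov, wiov;
	struct vringh vrh;
	char scratch[32];
	u16 head;
	int err, len;

	vring_init(&vrh.vring, RINGSIZE, ring_base, ALIGN);
	vringh_init_user(&vrh, features, RINGSIZE, true,
			 vrh.vring.desc, vrh.vring.avail, vrh.vring.used);

	vringh_iov_init(&riov, riov_space, ARRAY_SIZE(riov_space));
	vringh_iov_init(&wiov, wiov_space, ARRAY_SIZE(wiov_space));

	/* 1 = got a chain, 0 = ring empty, < 0 = error. */
	err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
	if (err != 1)
		return;

	/* Drain the readable part, echo it back into the writable part. */
	len = vringh_iov_pull_user(&riov, scratch, sizeof(scratch));
	if (len > 0)
		len = vringh_iov_push_user(&wiov, scratch, len);

	/* Retire the chain, then notify the guest only if it wants it. */
	vringh_complete_user(&vrh, head, len > 0 ? len : 0);
	if (vringh_need_notify_user(&vrh) > 0) {
		/* e.g. write a byte to the to_guest pipe, as the test does */
	}
}
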