-rw-r--r--Documentation/virtual/00-INDEX3
-rw-r--r--Documentation/virtual/virtio-spec.txt3210
-rw-r--r--MAINTAINERS1
-rw-r--r--arch/x86/include/asm/lguest.h17
-rw-r--r--block/blk-integrity.c2
-rw-r--r--block/blk-merge.c2
-rw-r--r--drivers/Makefile2
-rw-r--r--drivers/block/virtio_blk.c148
-rw-r--r--drivers/char/hw_random/virtio-rng.c2
-rw-r--r--drivers/char/virtio_console.c14
-rw-r--r--drivers/lguest/Kconfig5
-rw-r--r--drivers/lguest/core.c67
-rw-r--r--drivers/lguest/lg.h6
-rw-r--r--drivers/lguest/lguest_user.c6
-rw-r--r--drivers/lguest/page_tables.c567
-rw-r--r--drivers/lguest/x86/core.c7
-rw-r--r--drivers/net/caif/Kconfig14
-rw-r--r--drivers/net/caif/Makefile3
-rw-r--r--drivers/net/caif/caif_virtio.c790
-rw-r--r--drivers/net/virtio_net.c77
-rw-r--r--drivers/rpmsg/virtio_rpmsg_bus.c8
-rw-r--r--drivers/scsi/virtio_scsi.c487
-rw-r--r--drivers/vhost/Kconfig8
-rw-r--r--drivers/vhost/Makefile2
-rw-r--r--drivers/vhost/test.c4
-rw-r--r--drivers/vhost/vringh.c1007
-rw-r--r--drivers/virtio/virtio_balloon.c6
-rw-r--r--drivers/virtio/virtio_ring.c297
-rw-r--r--include/linux/scatterlist.h16
-rw-r--r--include/linux/virtio.h20
-rw-r--r--include/linux/virtio_caif.h24
-rw-r--r--include/linux/virtio_ring.h57
-rw-r--r--include/linux/vringh.h225
-rw-r--r--include/uapi/linux/virtio_balloon.h4
-rw-r--r--include/uapi/linux/virtio_ids.h1
-rw-r--r--net/9p/trans_virtio.c48
-rw-r--r--tools/lguest/lguest.txt2
-rw-r--r--tools/virtio/Makefile10
-rw-r--r--tools/virtio/asm/barrier.h14
-rw-r--r--tools/virtio/linux/bug.h10
-rw-r--r--tools/virtio/linux/err.h26
-rw-r--r--tools/virtio/linux/export.h5
-rw-r--r--tools/virtio/linux/irqreturn.h1
-rw-r--r--tools/virtio/linux/kernel.h112
-rw-r--r--tools/virtio/linux/module.h1
-rw-r--r--tools/virtio/linux/printk.h4
-rw-r--r--tools/virtio/linux/ratelimit.h4
-rw-r--r--tools/virtio/linux/scatterlist.h189
-rw-r--r--tools/virtio/linux/types.h28
-rw-r--r--tools/virtio/linux/uaccess.h50
-rw-r--r--tools/virtio/linux/uio.h3
-rw-r--r--tools/virtio/linux/virtio.h171
-rw-r--r--tools/virtio/linux/virtio_config.h6
-rw-r--r--tools/virtio/linux/virtio_ring.h1
-rw-r--r--tools/virtio/linux/vringh.h1
-rw-r--r--tools/virtio/uapi/linux/uio.h1
-rw-r--r--tools/virtio/uapi/linux/virtio_config.h1
-rw-r--r--tools/virtio/uapi/linux/virtio_ring.h4
-rw-r--r--tools/virtio/virtio_test.c13
-rw-r--r--tools/virtio/vringh_test.c741
60 files changed, 4481 insertions, 4074 deletions
diff --git a/Documentation/virtual/00-INDEX b/Documentation/virtual/00-INDEX
index 924bd462675e..e952d30bbf0f 100644
--- a/Documentation/virtual/00-INDEX
+++ b/Documentation/virtual/00-INDEX
@@ -6,6 +6,3 @@ kvm/
6 - Kernel Virtual Machine. See also http://linux-kvm.org
7uml/
8 - User Mode Linux, builds/runs Linux kernel as a userspace program.
9virtio.txt
10 - Text version of draft virtio spec.
11 See http://ozlabs.org/~rusty/virtio-spec
diff --git a/Documentation/virtual/virtio-spec.txt b/Documentation/virtual/virtio-spec.txt
deleted file mode 100644
index eb094039b50d..000000000000
--- a/Documentation/virtual/virtio-spec.txt
+++ /dev/null
@@ -1,3210 +0,0 @@
1[Generated file: see http://ozlabs.org/~rusty/virtio-spec/]
2Virtio PCI Card Specification
3v0.9.5 DRAFT
4-
5
6Rusty Russell <rusty@rustcorp.com.au> IBM Corporation (Editor)
7
82012 May 7.
9
10Purpose and Description
11
12This document describes the specifications of the “virtio” family
13of PCI devices. These devices
14are found in virtual environments,
15yet by design they are not all that different from physical PCI
16devices, and this document treats them as such. This allows the
17guest to use standard PCI drivers and discovery mechanisms.
18
19The purpose of virtio and this specification is that virtual
20environments and guests should have a straightforward, efficient,
21standard and extensible mechanism for virtual devices, rather
22than boutique per-environment or per-OS mechanisms.
23
24 Straightforward: Virtio PCI devices use normal PCI mechanisms
25 of interrupts and DMA which should be familiar to any device
26 driver author. There is no exotic page-flipping or COW
27 mechanism: it's just a PCI device.[footnote:
28This lack of page-sharing implies that the implementation of the
29device (e.g. the hypervisor or host) needs full access to the
30guest memory. Communication with untrusted parties (i.e.
31inter-guest communication) requires copying.
32]
33
34 Efficient: Virtio PCI devices consist of rings of descriptors
35 for input and output, which are neatly separated to avoid cache
36 effects from both guest and device writing to the same cache
37 lines.
38
39 Standard: Virtio PCI makes no assumptions about the environment
40 in which it operates, beyond supporting PCI. In fact the virtio
41 devices specified in the appendices do not require PCI at all:
42 they have been implemented on non-PCI buses.[footnote:
43The Linux implementation further separates the PCI virtio code
44from the specific virtio drivers: these drivers are shared with
45the non-PCI implementations (currently lguest and S/390).
46]
47
48 Extensible: Virtio PCI devices contain feature bits which are
49 acknowledged by the guest operating system during device setup.
50 This allows forwards and backwards compatibility: the device
51 offers all the features it knows about, and the driver
52 acknowledges those it understands and wishes to use.
53
54 Virtqueues
55
56The mechanism for bulk data transport on virtio PCI devices is
57pretentiously called a virtqueue. Each device can have zero or
58more virtqueues: for example, the network device has one for
59transmit and one for receive.
60
61Each virtqueue occupies two or more physically-contiguous pages
62(defined, for the purposes of this specification, as 4096 bytes),
63and consists of three parts:
64
65
66+-------------------+-----------------------------------+-----------+
67| Descriptor Table | Available Ring (padding) | Used Ring |
68+-------------------+-----------------------------------+-----------+
69
70
71When the driver wants to send a buffer to the device, it fills in
72a slot in the descriptor table (or chains several together), and
73writes the descriptor index into the available ring. It then
74notifies the device. When the device has finished a buffer, it
75writes the descriptor into the used ring, and sends an interrupt.
76
77Specification
78
79 PCI Discovery
80
81Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000
82through 0x103F inclusive is a virtio device[footnote:
83The actual value within this range is ignored
84]. The device must also have a Revision ID of 0 to match this
85specification.
86
87The Subsystem Device ID indicates which virtio device is
88supported by the device. The Subsystem Vendor ID should reflect
89the PCI Vendor ID of the environment (it's currently only used
90for informational purposes by the guest).
91
92
93+----------------------+--------------------+---------------+
94| Subsystem Device ID | Virtio Device | Specification |
95+----------------------+--------------------+---------------+
96+----------------------+--------------------+---------------+
97| 1 | network card | Appendix C |
98+----------------------+--------------------+---------------+
99| 2 | block device | Appendix D |
100+----------------------+--------------------+---------------+
101| 3 | console | Appendix E |
102+----------------------+--------------------+---------------+
103| 4 | entropy source | Appendix F |
104+----------------------+--------------------+---------------+
105| 5 | memory ballooning | Appendix G |
106+----------------------+--------------------+---------------+
107| 6 | ioMemory | - |
108+----------------------+--------------------+---------------+
109| 7 | rpmsg | Appendix H |
110+----------------------+--------------------+---------------+
111| 8 | SCSI host | Appendix I |
112+----------------------+--------------------+---------------+
113| 9 | 9P transport | - |
114+----------------------+--------------------+---------------+
115| 10 | mac80211 wlan | - |
116+----------------------+--------------------+---------------+
117
118
119 Device Configuration
120
121To configure the device, we use the first I/O region of the PCI
122device. This contains a virtio header followed by a
123device-specific region.
124
125There may be different widths of accesses to the I/O region; the “
126natural” access method for each field in the virtio header must
127be used (i.e. 32-bit accesses for 32-bit fields, etc), but the
128device-specific region can be accessed using any width accesses,
129and should obtain the same results.
130
131Note that this is possible because while the virtio header is PCI
132(i.e. little) endian, the device-specific region is encoded in
133the native endian of the guest (where such distinction is
134applicable).
135
136 Device Initialization Sequence<sub:Device-Initialization-Sequence>
137
138We start with an overview of device initialization, then expand
139on the details of the device and how each step is performed.
140
141 Reset the device. This is not required on initial start up.
142
143 The ACKNOWLEDGE status bit is set: we have noticed the device.
144
145 The DRIVER status bit is set: we know how to drive the device.
146
147 Device-specific setup, including reading the Device Feature
148 Bits, discovery of virtqueues for the device, optional MSI-X
149 setup, and reading and possibly writing the virtio
150 configuration space.
151
152 The subset of Device Feature Bits understood by the driver is
153 written to the device.
154
155 The DRIVER_OK status bit is set.
156
157 The device can now be used (ie. buffers added to the
158 virtqueues)[footnote:
159Historically, drivers have used the device before steps 5 and 6.
160This is only allowed if the driver does not use any features
161which would alter this early use of the device.
162]
163
164If any of these steps go irrecoverably wrong, the guest should
165set the FAILED status bit to indicate that it has given up on the
166device (it can reset the device later to restart if desired).
167
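As a minimal sketch (not from the specification text itself), the
status handshake above might look like this in C, assuming a
legacy virtio header at I/O port base iobase and a platform outb()
port accessor; the offset and helper names are illustrative:

#include <stdint.h>

/* Device Status at byte offset 18 of the virtio header (assumed). */
#define VIRTIO_PCI_STATUS         18

/* Device Status bits from the "Device Status" section below. */
#define VIRTIO_STATUS_ACKNOWLEDGE 1
#define VIRTIO_STATUS_DRIVER      2
#define VIRTIO_STATUS_DRIVER_OK   4
#define VIRTIO_STATUS_FAILED      128

extern void outb(uint8_t val, uint16_t port);   /* platform port I/O */

static void virtio_start(uint16_t iobase)
{
        /* 1. Reset the device. */
        outb(0, iobase + VIRTIO_PCI_STATUS);
        /* 2. We have noticed the device. */
        outb(VIRTIO_STATUS_ACKNOWLEDGE, iobase + VIRTIO_PCI_STATUS);
        /* 3. We know how to drive the device. */
        outb(VIRTIO_STATUS_ACKNOWLEDGE | VIRTIO_STATUS_DRIVER,
             iobase + VIRTIO_PCI_STATUS);
        /* 4-5. Device-specific setup and feature negotiation go here;
         *      on an unrecoverable error, write VIRTIO_STATUS_FAILED. */
        /* 6. Ready: buffers may now be added to the virtqueues. */
        outb(VIRTIO_STATUS_ACKNOWLEDGE | VIRTIO_STATUS_DRIVER |
             VIRTIO_STATUS_DRIVER_OK, iobase + VIRTIO_PCI_STATUS);
}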
168We now cover the fields required for general setup in detail.
169
170 Virtio Header
171
172The virtio header looks as follows:
173
174
175+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
176| Bits || 32 | 32 | 32 | 16 | 16 | 16 | 8 | 8 |
177+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
178| Read/Write || R | R+W | R+W | R | R+W | R+W | R+W | R |
179+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
180| Purpose || Device | Guest | Queue | Queue | Queue | Queue | Device | ISR |
181| || Features bits 0:31 | Features bits 0:31 | Address | Size | Select | Notify | Status | Status |
182+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
183
184
185If MSI-X is enabled for the device, two additional fields
186immediately follow this header:[footnote:
187ie. once you enable MSI-X on the device, the other fields move.
188If you turn it off again, they move back!
189]
190
191
192+------------++----------------+--------+
193| Bits || 16 | 16 |
194 +----------------+--------+
195+------------++----------------+--------+
196| Read/Write || R+W | R+W |
197+------------++----------------+--------+
198| Purpose || Configuration | Queue |
199| (MSI-X) || Vector | Vector |
200+------------++----------------+--------+
201
202
203Immediately following these general headers, there may be
204device-specific headers:
205
206
207+------------++--------------------+
208| Bits || Device Specific |
209 +--------------------+
210+------------++--------------------+
211| Read/Write || Device Specific |
212+------------++--------------------+
213| Purpose || Device Specific... |
214| || |
215+------------++--------------------+
216
217
218 Device Status
219
220The Device Status field is updated by the guest to indicate its
221progress. This provides a simple low-level diagnostic: it's most
222useful to imagine them hooked up to traffic lights on the console
223indicating the status of each device.
224
225The device can be reset by writing a 0 to this field, otherwise
226at least one bit should be set:
227
228 ACKNOWLEDGE (1) Indicates that the guest OS has found the
229 device and recognized it as a valid virtio device.
230
231 DRIVER (2) Indicates that the guest OS knows how to drive the
232 device. Under Linux, drivers can be loadable modules so there
233 may be a significant (or infinite) delay before setting this
234 bit.
235
236 DRIVER_OK (4) Indicates that the driver is set up and ready to
237 drive the device.
238
239 FAILED (128) Indicates that something went wrong in the guest,
240 and it has given up on the device. This could be an internal
241 error, or the driver didn't like the device for some reason, or
242 even a fatal error during device operation. The device must be
243 reset before attempting to re-initialize.
244
245 Feature Bits<sub:Feature-Bits>
246
247The first configuration field indicates the features that the
248device supports. The bits are allocated as follows:
249
250 0 to 23 Feature bits for the specific device type
251
252 24 to 31 Feature bits reserved for extensions to the queue and
253 feature negotiation mechanisms
254
255For example, feature bit 0 for a network device (i.e. Subsystem
256Device ID 1) indicates that the device supports checksumming of
257packets.
258
259The feature bits are negotiated: the device lists all the
260features it understands in the Device Features field, and the
261guest writes the subset that it understands into the Guest
262Features field. The only way to renegotiate is to reset the
263device.
264
265In particular, new fields in the device configuration header are
266indicated by offering a feature bit, so the guest can check
267before accessing that part of the configuration space.
268
269This allows for forwards and backwards compatibility: if the
270device is enhanced with a new feature bit, older guests will not
271write that feature bit back to the Guest Features field and it
272can go into backwards compatibility mode. Similarly, if a guest
273is enhanced with a feature that the device doesn't support, it
274will not see that feature bit in the Device Features field and
275can go into backwards compatibility mode (or, for poor
276implementations, set the FAILED Device Status bit).
277
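To illustrate the negotiation described above, a driver might read
and acknowledge feature bits roughly as follows (a sketch only;
the register offsets follow the virtio header table, and the
inl()/outl() port helpers are assumed):

#include <stdint.h>

#define VIRTIO_PCI_HOST_FEATURES  0  /* Device Features bits 0:31 (R)  */
#define VIRTIO_PCI_GUEST_FEATURES 4  /* Guest Features bits 0:31 (R+W) */

extern uint32_t inl(uint16_t port);             /* platform port I/O */
extern void outl(uint32_t val, uint16_t port);

/* Write back only the offered bits this driver understands and wants. */
static uint32_t negotiate_features(uint16_t iobase, uint32_t wanted)
{
        uint32_t offered  = inl(iobase + VIRTIO_PCI_HOST_FEATURES);
        uint32_t accepted = offered & wanted;

        outl(accepted, iobase + VIRTIO_PCI_GUEST_FEATURES);
        return accepted;
}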
278 Configuration/Queue Vectors
279
280When MSI-X capability is present and enabled in the device
281(through standard PCI configuration space) 4 bytes at byte offset
28220 are used to map configuration change and queue interrupts to
283MSI-X vectors. In this case, the ISR Status field is unused, and
284device specific configuration starts at byte offset 24 in virtio
285header structure. When MSI-X capability is not enabled, device
286specific configuration starts at byte offset 20 in virtio header.
287
288Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of
289Configuration/Queue Vector registers, maps interrupts triggered
290by the configuration change/selected queue events respectively to
291the corresponding MSI-X vector. To disable interrupts for a
292specific event type, unmap it by writing a special NO_VECTOR
293value:
294
295/* Vector value used to disable MSI for queue */
296
297#define VIRTIO_MSI_NO_VECTOR 0xffff
298
299Reading these registers returns vector mapped to a given event,
300or NO_VECTOR if unmapped. All queue and configuration change
301events are unmapped by default.
302
303Note that mapping an event to vector might require allocating
304internal device resources, and might fail. Devices report such
305failures by returning the NO_VECTOR value when the relevant
306Vector field is read. After mapping an event to vector, the
307driver must verify success by reading the Vector field value: on
308success, the previously written value is returned, and on
309failure, NO_VECTOR is returned. If a mapping failure is detected,
310the driver can retry mapping with fewer vectors, or disable MSI-X.
311
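A sketch of that verification step (the byte offsets match the 4
bytes at offset 20 mentioned above; the inw()/outw() helpers are
assumed):

#include <stdint.h>

#define VIRTIO_MSI_CONFIG_VECTOR 20     /* Configuration Vector          */
#define VIRTIO_MSI_QUEUE_VECTOR  22     /* Queue Vector (selected queue) */
#define VIRTIO_MSI_NO_VECTOR     0xffff

extern void outw(uint16_t val, uint16_t port);  /* platform port I/O */
extern uint16_t inw(uint16_t port);

/* Returns 0 on success, -1 if the device failed to allocate the vector. */
static int map_config_vector(uint16_t iobase, uint16_t msix_entry)
{
        outw(msix_entry, iobase + VIRTIO_MSI_CONFIG_VECTOR);
        if (inw(iobase + VIRTIO_MSI_CONFIG_VECTOR) == VIRTIO_MSI_NO_VECTOR)
                return -1;  /* retry with fewer vectors, or disable MSI-X */
        return 0;
}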
312 Virtqueue Configuration<sec:Virtqueue-Configuration>
313
314As a device can have zero or more virtqueues for bulk data
315transport (for example, the network driver has two), the driver
316needs to configure them as part of the device-specific
317configuration.
318
319This is done as follows, for each virtqueue a device has:
320
321 Write the virtqueue index (first queue is 0) to the Queue
322 Select field.
323
324 Read the virtqueue size from the Queue Size field, which is
325 always a power of 2. This controls how big the virtqueue is
326 (see below). If this field is 0, the virtqueue does not exist.
327
328 Allocate and zero virtqueue in contiguous physical memory, on a
329 4096 byte alignment. Write the physical address, divided by
330 4096 to the Queue Address field.[footnote:
331The 4096 is based on the x86 page size, but it's also large
332enough to ensure that the separate parts of the virtqueue are on
333separate cache lines.
334]
335
336 Optionally, if MSI-X capability is present and enabled on the
337 device, select a vector to use to request interrupts triggered
338 by virtqueue events. Write the MSI-X Table entry number
339 corresponding to this vector in Queue Vector field. Read the
340 Queue Vector field: on success, previously written value is
341 returned; on failure, NO_VECTOR value is returned.
342
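As a sketch of the per-virtqueue steps above (offsets follow the
virtio header table; the allocator and address-translation
helpers, and vring_size() from the formula just below, are assumed
to be available):

#include <stdint.h>
#include <string.h>

#define VIRTIO_PCI_QUEUE_PFN 8    /* Queue Address (32 bits, R+W) */
#define VIRTIO_PCI_QUEUE_NUM 12   /* Queue Size    (16 bits, R)   */
#define VIRTIO_PCI_QUEUE_SEL 14   /* Queue Select  (16 bits, R+W) */

extern void outw(uint16_t val, uint16_t port);
extern uint16_t inw(uint16_t port);
extern void outl(uint32_t val, uint16_t port);
extern unsigned vring_size(unsigned int qsz);         /* formula below */
extern void *alloc_pages_4k_aligned(unsigned bytes);  /* placeholder   */
extern uint64_t virt_to_phys(void *p);                /* placeholder   */

static int setup_virtqueue(uint16_t iobase, uint16_t index)
{
        uint16_t qsz;
        void *queue;

        outw(index, iobase + VIRTIO_PCI_QUEUE_SEL);
        qsz = inw(iobase + VIRTIO_PCI_QUEUE_NUM);
        if (qsz == 0)
                return -1;               /* this virtqueue does not exist */

        queue = alloc_pages_4k_aligned(vring_size(qsz));
        memset(queue, 0, vring_size(qsz));

        /* Physical address divided by 4096, as required above. */
        outl((uint32_t)(virt_to_phys(queue) / 4096),
             iobase + VIRTIO_PCI_QUEUE_PFN);
        return 0;
}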
343The Queue Size field controls the total number of bytes required
344for the virtqueue according to the following formula:
345
346#define ALIGN(x) (((x) + 4095) & ~4095)
347
348static inline unsigned vring_size(unsigned int qsz)
349
350{
351
352 return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(2
353+ qsz))
354
355 + ALIGN(sizeof(struct vring_used_elem)*qsz);
356
357}
358
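As a worked example (for illustration): with a Queue Size of 256,
this gives ALIGN(16*256 + 2*(2+256)) + ALIGN(8*256) = ALIGN(4612)
+ ALIGN(2048) = 8192 + 4096 = 12288 bytes, i.e. three 4096-byte
pages.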
359This currently wastes some space with padding, but also allows
360future extensions. The virtqueue layout structure looks like this
361(qsz is the Queue Size field, which is a variable, so this code
362won't compile):
363
364struct vring {
365
366 /* The actual descriptors (16 bytes each) */
367
368 struct vring_desc desc[qsz];
369
370
371
372 /* A ring of available descriptor heads with free-running
373index. */
374
375 struct vring_avail avail;
376
377
378
379 // Padding to the next 4096 boundary.
380
381 char pad[];
382
383
384
385 // A ring of used descriptor heads with free-running index.
386
387 struct vring_used used;
388
389};
390
391 A Note on Virtqueue Endianness
392
393Note that the endian of these fields and everything else in the
394virtqueue is the native endian of the guest, not little-endian as
395PCI normally is. This makes for simpler guest code, and it is
396assumed that the host already has to be deeply aware of the guest
397endian so such an “endian-aware” device is not a significant
398issue.
399
400 Descriptor Table
401
402The descriptor table refers to the buffers the guest is using for
403the device. The addresses are physical addresses, and the buffers
404can be chained via the next field. Each descriptor describes a
405buffer which is read-only or write-only, but a chain of
406descriptors can contain both read-only and write-only buffers.
407
408No descriptor chain may be more than 2^32 bytes long in total.
struct vring_desc {
409
410 /* Address (guest-physical). */
411
412 u64 addr;
413
414 /* Length. */
415
416 u32 len;
417
418/* This marks a buffer as continuing via the next field. */
419
420#define VRING_DESC_F_NEXT 1
421
422/* This marks a buffer as write-only (otherwise read-only). */
423
424#define VRING_DESC_F_WRITE 2
425
426/* This means the buffer contains a list of buffer descriptors.
427*/
428
429#define VRING_DESC_F_INDIRECT 4
430
431 /* The flags as indicated above. */
432
433 u16 flags;
434
435 /* Next field if flags & NEXT */
436
437 u16 next;
438
439};
440
441The number of descriptors in the table is specified by the Queue
442Size field for this virtqueue.
443
444 <sub:Indirect-Descriptors>Indirect Descriptors
445
446Some devices benefit by concurrently dispatching a large number
447of large requests. The VIRTIO_RING_F_INDIRECT_DESC feature can be
448used to allow this (see [cha:Reserved-Feature-Bits]). To increase
449ring capacity it is possible to store a table of indirect
450descriptors anywhere in memory, and insert a descriptor in main
451virtqueue (with flags&INDIRECT on) that refers to memory buffer
452containing this indirect descriptor table; fields addr and len
453refer to the indirect table address and length in bytes,
454respectively. The indirect table layout structure looks like this
455(len is the length of the descriptor that refers to this table,
456which is a variable, so this code won't compile):
457
458struct indirect_descriptor_table {
459
460 /* The actual descriptors (16 bytes each) */
461
462 struct vring_desc desc[len / 16];
463
464};
465
466The first indirect descriptor is located at start of the indirect
467descriptor table (index 0), additional indirect descriptors are
468chained by next field. An indirect descriptor without next field
469(with flags&NEXT off) signals the end of the indirect descriptor
470table, and transfers control back to the main virtqueue. An
471indirect descriptor can not refer to another indirect descriptor
472table (flags&INDIRECT must be off). A single indirect descriptor
473table can include both read-only and write-only descriptors;
474write-only flag (flags&WRITE) in the descriptor that refers to it
475is ignored.
476
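For illustration, publishing a two-element indirect table might
look like the sketch below; struct vring_desc and the flag values
repeat the Descriptor Table definitions above, and the
allocator/translation helpers are placeholders:

#include <stdint.h>

#define VRING_DESC_F_NEXT     1
#define VRING_DESC_F_WRITE    2
#define VRING_DESC_F_INDIRECT 4

struct vring_desc {            /* as defined in the Descriptor Table */
        uint64_t addr;
        uint32_t len;
        uint16_t flags;
        uint16_t next;
};

extern void *alloc_dma(unsigned bytes);   /* placeholder allocator */
extern uint64_t virt_to_phys(void *p);    /* placeholder           */

/* Publish a read-only request and a write-only response through one
 * main-ring descriptor whose INDIRECT flag points at a 2-entry table. */
static void fill_indirect(struct vring_desc *main_desc,
                          void *req, uint32_t req_len,
                          void *resp, uint32_t resp_len)
{
        struct vring_desc *tbl = alloc_dma(2 * sizeof(*tbl));

        tbl[0].addr  = virt_to_phys(req);
        tbl[0].len   = req_len;
        tbl[0].flags = VRING_DESC_F_NEXT;   /* read-only, chained       */
        tbl[0].next  = 1;

        tbl[1].addr  = virt_to_phys(resp);
        tbl[1].len   = resp_len;
        tbl[1].flags = VRING_DESC_F_WRITE;  /* write-only, ends table   */
        tbl[1].next  = 0;

        main_desc->addr  = virt_to_phys(tbl);
        main_desc->len   = 2 * sizeof(*tbl);   /* table length in bytes */
        main_desc->flags = VRING_DESC_F_INDIRECT;
}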
477 Available Ring
478
479The available ring refers to what descriptors we are offering the
480device: it refers to the head of a descriptor chain. The “flags”
481field is currently 0 or 1: 1 indicating that we do not need an
482interrupt when the device consumes a descriptor from the
483available ring. Alternatively, the guest can ask the device to
484delay interrupts until an entry with an index specified by the “
485used_event” field is written in the used ring (equivalently,
486until the idx field in the used ring will reach the value
487used_event + 1). The method employed by the device is controlled
488by the VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits]
489). This interrupt suppression is merely an optimization; it may
490not suppress interrupts entirely.
491
492The “idx” field indicates where we would put the next descriptor
493entry (modulo the ring size). This starts at 0, and increases.
494
495struct vring_avail {
496
497#define VRING_AVAIL_F_NO_INTERRUPT 1
498
499 u16 flags;
500
501 u16 idx;
502
503 u16 ring[qsz]; /* qsz is the Queue Size field read from device
504*/
505
506 u16 used_event;
507
508};
509
510 Used Ring
511
512The used ring is where the device returns buffers once it is done
513with them. The flags field can be used by the device to hint that
514no notification is necessary when the guest adds to the available
515ring. Alternatively, the “avail_event” field can be used by the
516device to hint that no notification is necessary until an entry
517with an index specified by the “avail_event” is written in the
518available ring (equivalently, until the idx field in the
519available ring will reach the value avail_event + 1). The method
520employed by the device is controlled by the guest through the
521VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits]
522). [footnote:
523These fields are kept here because this is the only part of the
524virtqueue written by the device
525].
526
527Each entry in the ring is a pair: the head entry of the
528descriptor chain describing the buffer (this matches an entry
529placed in the available ring by the guest earlier), and the total
530of bytes written into the buffer. The latter is extremely useful
531for guests using untrusted buffers: if you do not know exactly
532how much has been written by the device, you usually have to zero
533the buffer to ensure no data leakage occurs.
534
535/* u32 is used here for ids for padding reasons. */
536
537struct vring_used_elem {
538
539 /* Index of start of used descriptor chain. */
540
541 u32 id;
542
543 /* Total length of the descriptor chain which was used
544(written to) */
545
546 u32 len;
547
548};
549
550
551
552struct vring_used {
553
554#define VRING_USED_F_NO_NOTIFY 1
555
556 u16 flags;
557
558 u16 idx;
559
560 struct vring_used_elem ring[qsz];
561
562 u16 avail_event;
563
564};
565
566 Helpers for Managing Virtqueues
567
568The Linux Kernel Source code contains the definitions above and
569helper routines in a more usable form, in
570include/linux/virtio_ring.h. This was explicitly licensed by IBM
571and Red Hat under the (3-clause) BSD license so that it can be
572freely used by all other projects, and is reproduced (with slight
573variation to remove Linux assumptions) in Appendix A.
574
575 Device Operation<sec:Device-Operation>
576
577There are two parts to device operation: supplying new buffers to
578the device, and processing used buffers from the device. As an
579example, the virtio network device has two virtqueues: the
580transmit virtqueue and the receive virtqueue. The driver adds
581outgoing (read-only) packets to the transmit virtqueue, and then
582frees them after they are used. Similarly, incoming (write-only)
583buffers are added to the receive virtqueue, and processed after
584they are used.
585
586 Supplying Buffers to The Device
587
588Actual transfer of buffers from the guest OS to the device
589operates as follows:
590
591 Place the buffer(s) into free descriptor(s).
592
593 If there are no free descriptors, the guest may choose to
594 notify the device even if notifications are suppressed (to
595 reduce latency).[footnote:
596The Linux drivers do this only for read-only buffers: for
597write-only buffers, it is assumed that the driver is merely
598trying to keep the receive buffer ring full, and no notification
599of this expected condition is necessary.
600]
601
602 Place the id of the buffer in the next ring entry of the
603 available ring.
604
605 The steps (1) and (2) may be performed repeatedly if batching
606 is possible.
607
608 A memory barrier should be executed to ensure the device sees
609 the updated descriptor table and available ring before the next
610 step.
611
612 The available “idx” field should be increased by the number of
613 entries added to the available ring.
614
615 A memory barrier should be executed to ensure that we update
616 the idx field before checking for notification suppression.
617
618 If notifications are not suppressed, the device should be
619 notified of the new buffers.
620
621Note that the above code does not take precautions against the
622available ring buffer wrapping around: this is not possible since
623the ring buffer is the same size as the descriptor table, so step
624(1) will prevent such a condition.
625
626In addition, the maximum queue size is 32768 (it must be a power
627of 2 which fits in 16 bits), so the 16-bit “idx” value can always
628distinguish between a full and empty buffer.
629
630Here is a description of each stage in more detail.
631
632 Placing Buffers Into The Descriptor Table
633
634A buffer consists of zero or more read-only physically-contiguous
635elements followed by zero or more physically-contiguous
636write-only elements (it must have at least one element). This
637algorithm maps it into the descriptor table:
638
639 for each buffer element, b:
640
641 Get the next free descriptor table entry, d
642
643 Set d.addr to the physical address of the start of b
644
645 Set d.len to the length of b.
646
647 If b is write-only, set d.flags to VRING_DESC_F_WRITE,
648 otherwise 0.
649
650 If there is a buffer element after this:
651
652 Set d.next to the index of the next free descriptor element.
653
654 Set the VRING_DESC_F_NEXT bit in d.flags.
655
656In practice, the d.next fields are usually used to chain free
657descriptors, and a separate count kept to check there are enough
658free descriptors before beginning the mappings.
659
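A sketch of that algorithm, assuming the caller has already
verified that out + in free descriptors are available and that
free descriptors are chained through their next fields (the
element type and names are illustrative):

#include <stdint.h>

#define VRING_DESC_F_NEXT  1
#define VRING_DESC_F_WRITE 2

struct vring_desc {            /* as defined in the Descriptor Table */
        uint64_t addr;
        uint32_t len;
        uint16_t flags;
        uint16_t next;
};

struct buf_elem {              /* one physically-contiguous element */
        uint64_t phys;
        uint32_t len;
};

/* Map 'out' read-only elements followed by 'in' write-only elements,
 * starting at free descriptor 'head'.  Returns the chain head index. */
static uint16_t map_buffer(struct vring_desc *desc, uint16_t head,
                           const struct buf_elem *elem,
                           unsigned out, unsigned in)
{
        unsigned i, total = out + in;
        uint16_t d = head;

        for (i = 0; i < total; i++) {
                desc[d].addr  = elem[i].phys;
                desc[d].len   = elem[i].len;
                desc[d].flags = (i >= out) ? VRING_DESC_F_WRITE : 0;
                if (i + 1 < total) {
                        desc[d].flags |= VRING_DESC_F_NEXT;
                        d = desc[d].next;   /* free list chained via next */
                }
        }
        return head;
}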
660 Updating The Available Ring
661
662The head of the buffer we mapped is the first d in the algorithm
663above. A naive implementation would do the following:
664
665avail->ring[avail->idx % qsz] = head;
666
667However, in general we can add many descriptors before we update
668the “idx” field (at which point they become visible to the
669device), so we keep a counter of how many we've added:
670
671avail->ring[(avail->idx + added++) % qsz] = head;
672
673 Updating The Index Field
674
675Once the idx field of the virtqueue is updated, the device will
676be able to access the descriptor entries we've created and the
677memory they refer to. This is why a memory barrier is generally
678used before the idx update, to ensure it sees the most up-to-date
679copy.
680
681The idx field always increments, and we let it wrap naturally at
68265536:
683
684avail->idx += added;
685
686 <sub:Notifying-The-Device>Notifying The Device
687
688Device notification occurs by writing the 16-bit virtqueue index
689of this virtqueue to the Queue Notify field of the virtio header
690in the first I/O region of the PCI device. This can be expensive,
691however, so the device can suppress such notifications if it
692doesn't need them. We have to be careful to expose the new idx
693value before checking the suppression flag: it's OK to notify
694gratuitously, but not to omit a required notification. So again,
695we use a memory barrier here before reading the flags or the
696avail_event field.
697
698If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated, and if
699the VRING_USED_F_NO_NOTIFY flag is not set, we go ahead and write
700the virtqueue index to the Queue Notify field.
701
702If the VIRTIO_F_RING_EVENT_IDX feature is negotiated, we read the
703avail_event field in the used ring structure. If the
704available index crossed the avail_event field value since the
705last notification, we go ahead and write the virtqueue index to
706the Queue Notify field. The avail_event field wraps naturally at 65536 as well:
707
708(u16)(new_idx - avail_event - 1) < (u16)(new_idx - old_idx)
709
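Putting the two cases together, the notification decision can be
sketched as below; the port helper and parameter plumbing are
assumptions, and used_flags/avail_event are read only after the
memory barrier described above:

#include <stdint.h>

#define VRING_USED_F_NO_NOTIFY  1
#define VIRTIO_PCI_QUEUE_NOTIFY 16          /* Queue Notify offset */

extern void outw(uint16_t val, uint16_t port);  /* platform port I/O */

/* old_idx is avail->idx before the batch was published, new_idx after. */
static void maybe_notify(uint16_t iobase, uint16_t queue_index,
                         int event_idx, uint16_t used_flags,
                         uint16_t avail_event,
                         uint16_t old_idx, uint16_t new_idx)
{
        if (event_idx) {
                if (!((uint16_t)(new_idx - avail_event - 1) <
                      (uint16_t)(new_idx - old_idx)))
                        return;             /* device asked to wait */
        } else if (used_flags & VRING_USED_F_NO_NOTIFY) {
                return;                     /* notifications suppressed */
        }
        outw(queue_index, iobase + VIRTIO_PCI_QUEUE_NOTIFY);
}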
710 <sub:Receiving-Used-Buffers>Receiving Used Buffers From The
711 Device
712
713Once the device has used a buffer (read from or written to it, or
714parts of both, depending on the nature of the virtqueue and the
715device), it sends an interrupt, following an algorithm very
716similar to the algorithm used for the driver to send the device a
717buffer:
718
719 Write the head descriptor number to the next field in the used
720 ring.
721
722 Update the used ring idx.
723
724 Determine whether an interrupt is necessary:
725
726 If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated: check
727 if the VRING_AVAIL_F_NO_INTERRUPT flag is not set in
728 avail->flags
729
730 If the VIRTIO_F_RING_EVENT_IDX feature is negotiated: check
731 whether the used index crossed the used_event field value
732 since the last update. The used_event field wraps naturally
733 at 65536 as well: (u16)(new_idx - used_event - 1) < (u16)(new_idx - old_idx)
734
735 If an interrupt is necessary:
736
737 If MSI-X capability is disabled:
738
739 Set the lower bit of the ISR Status field for the device.
740
741 Send the appropriate PCI interrupt for the device.
742
743 If MSI-X capability is enabled:
744
745 Request the appropriate MSI-X interrupt message for the
746 device, Queue Vector field sets the MSI-X Table entry
747 number.
748
749 If Queue Vector field value is NO_VECTOR, no interrupt
750 message is requested for this event.
751
752The guest interrupt handler should:
753
754 If MSI-X capability is disabled: read the ISR Status field,
755 which will reset it to zero. If the lower bit is zero, the
756 interrupt was not for this device. Otherwise, the guest driver
757 should look through the used rings of each virtqueue for the
758 device, to see if any progress has been made by the device
759 which requires servicing.
760
761 If MSI-X capability is enabled: look through the used rings of
762 each virtqueue mapped to the specific MSI-X vector for the
763 device, to see if any progress has been made by the device
764 which requires servicing.
765
766For each ring, the guest should then disable interrupts by setting
767the VRING_AVAIL_F_NO_INTERRUPT flag in the avail structure, if required.
768It can then process the used ring entries, finally enabling interrupts
769by clearing the VRING_AVAIL_F_NO_INTERRUPT flag or updating the
770used_event field in the available structure. The guest should then
771execute a memory barrier, and then recheck the ring empty
772condition. This is necessary to handle the case where, after the
773last check and before enabling interrupts, an interrupt has been
774suppressed by the device:
775
776vring_disable_interrupts(vq);
777
778for (;;) {
779
780 if (vq->last_seen_used == vring->used.idx) {
781
782 vring_enable_interrupts(vq);
783
784 mb();
785
786 if (vq->last_seen_used == vring->used.idx)
787
788 break;
789
790 }
791
792 struct vring_used_elem *e =
793&vring->used.ring[vq->last_seen_used % vsz];
794
795 process_buffer(e);
796
797 vq->last_seen_used++;
798
799}
800
801 Dealing With Configuration Changes<sub:Dealing-With-Configuration>
802
803Some virtio PCI devices can change the device configuration
804state, as reflected in the virtio header in the PCI configuration
805space. In this case:
806
807 If MSI-X capability is disabled: an interrupt is delivered and
808 the second highest bit is set in the ISR Status field to
809 indicate that the driver should re-examine the configuration
810 space. Note that a single interrupt can indicate both that one
811 or more virtqueue has been used and that the configuration
812 space has changed: even if the config bit is set, virtqueues
813 must be scanned.
814
815 If MSI-X capability is enabled: an interrupt message is
816 requested. The Configuration Vector field sets the MSI-X Table
817 entry number to use. If Configuration Vector field value is
818 NO_VECTOR, no interrupt message is requested for this event.
819
820Creating New Device Types
821
822Various considerations are necessary when creating a new device
823type:
824
825 How Many Virtqueues?
826
827It is possible that a very simple device will operate entirely
828through its configuration space, but most will need at least one
829virtqueue in which it will place requests. A device with both
830input and output (eg. the console and network devices described here)
831needs two queues: one which the driver fills with buffers to
832receive input, and one into which the driver places buffers to
833transmit output.
834
835 What Configuration Space Layout?
836
837Configuration space is generally used for rarely-changing or
838initialization-time parameters. But it is a limited resource, so
839it might be better to use a virtqueue to update configuration
840information (the network device does this for filtering,
841otherwise the table in the config space could potentially be very
842large).
843
844Note that this space is generally the guest's native endian,
845rather than PCI's little-endian.
846
847 What Device Number?
848
849Currently device numbers are assigned quite freely: a simple
850request mail to the author of this document or the Linux
851virtualization mailing list[footnote:
852
853https://lists.linux-foundation.org/mailman/listinfo/virtualization
854] will be sufficient to secure a unique one.
855
856Meanwhile for experimental drivers, use 65535 and work backwards.
857
858 How many MSI-X vectors?
859
860Using the optional MSI-X capability, devices can speed up
861interrupt processing by removing the need for the guest driver to
862read the ISR Status register (which might be an expensive operation),
863reducing interrupt sharing between devices and queues within the
864device, and handling interrupts from multiple CPUs. However, some
865systems impose a limit (which might be as low as 256) on the
866total number of MSI-X vectors that can be allocated to all
867devices. Devices and/or device drivers should take this into
868account, limiting the number of vectors used unless the device is
869expected to cause a high volume of interrupts. Devices can
870control the number of vectors used by limiting the MSI-X Table
871Size or not presenting MSI-X capability in PCI configuration
872space. Drivers can control this by mapping events to as small a
873number of vectors as possible, or by disabling the MSI-X capability
874altogether.
875
876 Message Framing
877
878The descriptors used for a buffer should not affect the semantics
879of the message, except for the total length of the buffer. For
880example, a network buffer consists of a 10 byte header followed
881by the network packet. Whether this is presented in the ring
882descriptor chain as (say) a 10 byte buffer and a 1514 byte
883buffer, or a single 1524 byte buffer, or even three buffers,
884should have no effect.
885
886In particular, no implementation should use the descriptor
887boundaries to determine the size of any header in a request.[footnote:
888The current qemu device implementations mistakenly insist that
889the first descriptor cover the header in these cases exactly, so
890a cautious driver should arrange it so.
891]
892
893 Device Improvements
894
895Any change to configuration space, or new virtqueues, or
896behavioural changes, should be indicated by negotiation of a new
897feature bit. This establishes clarity[footnote:
898Even if it does mean documenting design or implementation
899mistakes!
900] and avoids future expansion problems.
901
902Clusters of functionality which are always implemented together
903can use a single bit, but if one feature makes sense without the
904others they should not be gratuitously grouped together to
905conserve feature bits. We can always extend the spec when the
906first person needs more than 24 feature bits for their device.
907
909
910Appendix A: virtio_ring.h
911
912#ifndef VIRTIO_RING_H
913
914#define VIRTIO_RING_H
915
916/* An interface for efficient virtio implementation.
917
918 *
919
920 * This header is BSD licensed so anyone can use the definitions
921
922 * to implement compatible drivers/servers.
923
924 *
925
926 * Copyright 2007, 2009, IBM Corporation
927
928 * Copyright 2011, Red Hat, Inc
929
930 * All rights reserved.
931
932 *
933
934 * Redistribution and use in source and binary forms, with or
935without
936
937 * modification, are permitted provided that the following
938conditions
939
940 * are met:
941
942 * 1. Redistributions of source code must retain the above
943copyright
944
945 * notice, this list of conditions and the following
946disclaimer.
947
948 * 2. Redistributions in binary form must reproduce the above
949copyright
950
951 * notice, this list of conditions and the following
952disclaimer in the
953
954 * documentation and/or other materials provided with the
955distribution.
956
957 * 3. Neither the name of IBM nor the names of its contributors
958
959 * may be used to endorse or promote products derived from
960this software
961
962 * without specific prior written permission.
963
964 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
965CONTRIBUTORS ``AS IS'' AND
966
967 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
968TO, THE
969
970 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
971PARTICULAR PURPOSE
972
973 * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE
974LIABLE
975
976 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
977CONSEQUENTIAL
978
979 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
980SUBSTITUTE GOODS
981
982 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
983INTERRUPTION)
984
985 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
986CONTRACT, STRICT
987
988 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
989IN ANY WAY
990
991 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
992POSSIBILITY OF
993
994 * SUCH DAMAGE.
995
996 */
997
998
999
1000/* This marks a buffer as continuing via the next field. */
1001
1002#define VRING_DESC_F_NEXT 1
1003
1004/* This marks a buffer as write-only (otherwise read-only). */
1005
1006#define VRING_DESC_F_WRITE 2
1007
1008
1009
1010/* The Host uses this in used->flags to advise the Guest: don't
1011kick me
1012
1013 * when you add a buffer. It's unreliable, so it's simply an
1014
1015 * optimization. Guest will still kick if it's out of buffers.
1016*/
1017
1018#define VRING_USED_F_NO_NOTIFY 1
1019
1020/* The Guest uses this in avail->flags to advise the Host: don't
1021
1022 * interrupt me when you consume a buffer. It's unreliable, so
1023it's
1024
1025 * simply an optimization. */
1026
1027#define VRING_AVAIL_F_NO_INTERRUPT 1
1028
1029
1030
1031/* Virtio ring descriptors: 16 bytes.
1032
1033 * These can chain together via "next". */
1034
1035struct vring_desc {
1036
1037 /* Address (guest-physical). */
1038
1039 uint64_t addr;
1040
1041 /* Length. */
1042
1043 uint32_t len;
1044
1045 /* The flags as indicated above. */
1046
1047 uint16_t flags;
1048
1049 /* We chain unused descriptors via this, too */
1050
1051 uint16_t next;
1052
1053};
1054
1055
1056
1057struct vring_avail {
1058
1059 uint16_t flags;
1060
1061 uint16_t idx;
1062
1063 uint16_t ring[];
1064
1065 uint16_t used_event;
1066
1067};
1068
1069
1070
1071/* u32 is used here for ids for padding reasons. */
1072
1073struct vring_used_elem {
1074
1075 /* Index of start of used descriptor chain. */
1076
1077 uint32_t id;
1078
1079 /* Total length of the descriptor chain which was written
1080to. */
1081
1082 uint32_t len;
1083
1084};
1085
1086
1087
1088struct vring_used {
1089
1090 uint16_t flags;
1091
1092 uint16_t idx;
1093
1094 struct vring_used_elem ring[];
1095
1096 uint16_t avail_event;
1097
1098};
1099
1100
1101
1102struct vring {
1103
1104 unsigned int num;
1105
1106
1107
1108 struct vring_desc *desc;
1109
1110 struct vring_avail *avail;
1111
1112 struct vring_used *used;
1113
1114};
1115
1116
1117
1118/* The standard layout for the ring is a continuous chunk of
1119memory which
1120
1121 * looks like this. We assume num is a power of 2.
1122
1123 *
1124
1125 * struct vring {
1126
1127 * // The actual descriptors (16 bytes each)
1128
1129 * struct vring_desc desc[num];
1130
1131 *
1132
1133 * // A ring of available descriptor heads with free-running
1134index.
1135
1136 * __u16 avail_flags;
1137
1138 * __u16 avail_idx;
1139
1140 * __u16 available[num];
1141
1142 *
1143
1144 * // Padding to the next align boundary.
1145
1146 * char pad[];
1147
1148 *
1149
1150 * // A ring of used descriptor heads with free-running
1151index.
1152
1153 * __u16 used_flags;
1154
1155 * __u16 EVENT_IDX;
1156
1157 * struct vring_used_elem used[num];
1158
1159 * };
1160
1161 * Note: for virtio PCI, align is 4096.
1162
1163 */
1164
1165static inline void vring_init(struct vring *vr, unsigned int num,
1166void *p,
1167
1168 unsigned long align)
1169
1170{
1171
1172 vr->num = num;
1173
1174 vr->desc = p;
1175
1176 vr->avail = p + num*sizeof(struct vring_desc);
1177
1178 vr->used = (void *)(((unsigned long)&vr->avail->ring[num]
1179
1180 + align-1)
1181
1182 & ~(align - 1));
1183
1184}
1185
1186
1187
1188static inline unsigned vring_size(unsigned int num, unsigned long
1189align)
1190
1191{
1192
1193 return ((sizeof(struct vring_desc)*num +
1194sizeof(uint16_t)*(2+num)
1195
1196 + align - 1) & ~(align - 1))
1197
1198 + sizeof(uint16_t)*3 + sizeof(struct
1199vring_used_elem)*num;
1200
1201}
1202
1203
1204
1205static inline int vring_need_event(uint16_t event_idx, uint16_t
1206new_idx, uint16_t old_idx)
1207
1208{
1209
1210 return (uint16_t)(new_idx - event_idx - 1) <
1211(uint16_t)(new_idx - old_idx);
1212
1213}
1214
1215#endif /* VIRTIO_RING_H */
1216
1217<cha:Reserved-Feature-Bits>Appendix B: Reserved Feature Bits
1218
1219Currently there are five device-independent feature bits defined:
1220
1221 VIRTIO_F_NOTIFY_ON_EMPTY (24) Negotiating this feature
1222 indicates that the driver wants an interrupt if the device runs
1223 out of available descriptors on a virtqueue, even though
1224 interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT
1225 flag or the used_event field. An example of this is the
1226 networking driver: it doesn't need to know every time a packet
1227 is transmitted, but it does need to free the transmitted
1228 packets a finite time after they are transmitted. It can avoid
1229 using a timer if the device interrupts it when all the packets
1230 are transmitted.
1231
1232 VIRTIO_F_RING_INDIRECT_DESC (28) Negotiating this feature
1233 indicates that the driver can use descriptors with the
1234 VRING_DESC_F_INDIRECT flag set, as described in [sub:Indirect-Descriptors]
1235 .
1236
1237 VIRTIO_F_RING_EVENT_IDX(29) This feature enables the used_event
1238 and the avail_event fields. If set, it indicates that the
1239 device should ignore the flags field in the available ring
1240 structure. Instead, the used_event field in this structure is
1241 used by guest to suppress device interrupts. Further, the
1242 driver should ignore the flags field in the used ring
1243 structure. Instead, the avail_event field in this structure is
1244 used by the device to suppress notifications. If unset, the
1245 driver should ignore the used_event field; the device should
1246 ignore the avail_event field; the flags field is used.
1247
1248Appendix C: Network Device
1249
1250The virtio network device is a virtual ethernet card, and is the
1251most complex of the devices supported so far by virtio. It has
1252enhanced rapidly and demonstrates clearly how support for new
1253features should be added to an existing device. Empty buffers are
1254placed in one virtqueue for receiving packets, and outgoing
1255packets are enqueued into another for transmission in that order.
1256A third command queue is used to control advanced filtering
1257features.
1258
1259 Configuration
1260
1261 Subsystem Device ID 1
1262
1263 Virtqueues 0:receiveq. 1:transmitq. 2:controlq[footnote:
1264Only if VIRTIO_NET_F_CTRL_VQ set
1265]
1266
1267 Feature bits
1268
1269 VIRTIO_NET_F_CSUM (0) Device handles packets with partial
1270 checksum
1271
1272 VIRTIO_NET_F_GUEST_CSUM (1) Guest handles packets with partial
1273 checksum
1274
1275 VIRTIO_NET_F_MAC (5) Device has given MAC address.
1276
1277 VIRTIO_NET_F_GSO (6) (Deprecated) device handles packets with
1278 any GSO type.[footnote:
1279It was supposed to indicate segmentation offload support, but
1280upon further investigation it became clear that multiple bits
1281were required.
1282]
1283
1284 VIRTIO_NET_F_GUEST_TSO4 (7) Guest can receive TSOv4.
1285
1286 VIRTIO_NET_F_GUEST_TSO6 (8) Guest can receive TSOv6.
1287
1288 VIRTIO_NET_F_GUEST_ECN (9) Guest can receive TSO with ECN.
1289
1290 VIRTIO_NET_F_GUEST_UFO (10) Guest can receive UFO.
1291
1292 VIRTIO_NET_F_HOST_TSO4 (11) Device can receive TSOv4.
1293
1294 VIRTIO_NET_F_HOST_TSO6 (12) Device can receive TSOv6.
1295
1296 VIRTIO_NET_F_HOST_ECN (13) Device can receive TSO with ECN.
1297
1298 VIRTIO_NET_F_HOST_UFO (14) Device can receive UFO.
1299
1300 VIRTIO_NET_F_MRG_RXBUF (15) Guest can merge receive buffers.
1301
1302 VIRTIO_NET_F_STATUS (16) Configuration status field is
1303 available.
1304
1305 VIRTIO_NET_F_CTRL_VQ (17) Control channel is available.
1306
1307 VIRTIO_NET_F_CTRL_RX (18) Control channel RX mode support.
1308
1309 VIRTIO_NET_F_CTRL_VLAN (19) Control channel VLAN filtering.
1310
1311 VIRTIO_NET_F_GUEST_ANNOUNCE(21) Guest can send gratuitous
1312 packets.
1313
1314 Device configuration layout Two configuration fields are
1315 currently defined. The mac address field always exists (though
1316 is only valid if VIRTIO_NET_F_MAC is set), and the status field
1317 only exists if VIRTIO_NET_F_STATUS is set. Two read-only bits
1318 are currently defined for the status field:
1319 VIRTIO_NET_S_LINK_UP and VIRTIO_NET_S_ANNOUNCE.
#define VIRTIO_NET_S_LINK_UP 1
1320
1321#define VIRTIO_NET_S_ANNOUNCE 2
1322
1323
1324
1325struct virtio_net_config {
1326
1327 u8 mac[6];
1328
1329 u16 status;
1330
1331};
1332
1333 Device Initialization
1334
1335 The initialization routine should identify the receive and
1336 transmission virtqueues.
1337
1338 If the VIRTIO_NET_F_MAC feature bit is set, the configuration
1339 space “mac” entry indicates the “physical” address of the
1340 network card, otherwise a private MAC address should be
1341 assigned. All guests are expected to negotiate this feature if
1342 it is set.
1343
1344 If the VIRTIO_NET_F_CTRL_VQ feature bit is negotiated, identify
1345 the control virtqueue.
1346
1347 If the VIRTIO_NET_F_STATUS feature bit is negotiated, the link
1348 status can be read from the bottom bit of the “status” config
1349 field. Otherwise, the link should be assumed active.
1350
1351 The receive virtqueue should be filled with receive buffers.
1352 This is described in detail below in “Setting Up Receive
1353 Buffers”.
1354
1355 A driver can indicate that it will generate checksumless
1356 packets by negotiating the VIRTIO_NET_F_CSUM feature. This “
1357 checksum offload” is a common feature on modern network cards.
1358
1359 If that feature is negotiated[footnote:
1360ie. VIRTIO_NET_F_HOST_TSO* and VIRTIO_NET_F_HOST_UFO are
1361dependent on VIRTIO_NET_F_CSUM; a device which offers the offload
1362features must offer the checksum feature, and a driver which
1363accepts the offload features must accept the checksum feature.
1364Similar logic applies to the VIRTIO_NET_F_GUEST_TSO4 features
1365depending on VIRTIO_NET_F_GUEST_CSUM.
1366], a driver can use TCP or UDP segmentation offload by
1367 negotiating the VIRTIO_NET_F_HOST_TSO4 (IPv4 TCP),
1368 VIRTIO_NET_F_HOST_TSO6 (IPv6 TCP) and VIRTIO_NET_F_HOST_UFO
1369 (UDP fragmentation) features. It should not send TCP packets
1370 requiring segmentation offload which have the Explicit
1371 Congestion Notification bit set, unless the
1372 VIRTIO_NET_F_HOST_ECN feature is negotiated.[footnote:
1373This is a common restriction in real, older network cards.
1374]
1375
1376 The converse features are also available: a driver can save the
1377 virtual device some work by negotiating these features.[footnote:
1378For example, a network packet transported between two guests on
1379the same system may not require checksumming at all, nor
1380segmentation, if both guests are amenable.
1381] The VIRTIO_NET_F_GUEST_CSUM feature indicates that partially
1382 checksummed packets can be received, and if it can do that then
1383 the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
1384 VIRTIO_NET_F_GUEST_UFO and VIRTIO_NET_F_GUEST_ECN are the input
1385 equivalents of the features described above. See “Receiving
1386 Packets” below.
1387
1388 Device Operation
1389
1390Packets are transmitted by placing them in the transmitq, and
1391buffers for incoming packets are placed in the receiveq. In each
1392case, the packet itself is preceded by a header:
1393
1394struct virtio_net_hdr {
1395
1396#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1
1397
1398 u8 flags;
1399
1400#define VIRTIO_NET_HDR_GSO_NONE 0
1401
1402#define VIRTIO_NET_HDR_GSO_TCPV4 1
1403
1404#define VIRTIO_NET_HDR_GSO_UDP 3
1405
1406#define VIRTIO_NET_HDR_GSO_TCPV6 4
1407
1408#define VIRTIO_NET_HDR_GSO_ECN 0x80
1409
1410 u8 gso_type;
1411
1412 u16 hdr_len;
1413
1414 u16 gso_size;
1415
1416 u16 csum_start;
1417
1418 u16 csum_offset;
1419
1420/* Only if VIRTIO_NET_F_MRG_RXBUF: */
1421
1422 u16 num_buffers;
1423
1424};
1425
1426The controlq is used to control device features such as
1427filtering.
1428
1429 Packet Transmission
1430
1431Transmitting a single packet is simple, but varies depending on
1432the different features the driver negotiated.
1433
1434 If the driver negotiated VIRTIO_NET_F_CSUM, and the packet has
1435 not been fully checksummed, then the virtio_net_hdr's fields
1436 are set as follows. Otherwise, the packet must be fully
1437 checksummed, and flags is zero.
1438
1439 flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set,
1440
1441 <ite:csum_start-is-set>csum_start is set to the offset within
1442 the packet to begin checksumming, and
1443
1444 csum_offset indicates how many bytes after the csum_start the
1445 new (16 bit ones' complement) checksum should be placed.[footnote:
1446For example, consider a partially checksummed TCP (IPv4) packet.
1447It will have a 14 byte ethernet header and 20 byte IP header
1448followed by the TCP header (with the TCP checksum field 16 bytes
1449into that header). csum_start will be 14+20 = 34 (the TCP
1450checksum includes the header), and csum_offset will be 16. The
1451value in the TCP checksum field should be initialized to the sum
1452of the TCP pseudo header, so that replacing it by the ones'
1453complement checksum of the TCP header and body will give the
1454correct result.
1455]
1456
1457 <enu:If-the-driver>If the driver negotiated
1458 VIRTIO_NET_F_HOST_TSO4, TSO6 or UFO, and the packet requires
1459 TCP segmentation or UDP fragmentation, then the “gso_type”
1460 field is set to VIRTIO_NET_HDR_GSO_TCPV4, TCPV6 or UDP.
1461 (Otherwise, it is set to VIRTIO_NET_HDR_GSO_NONE). In this
1462 case, packets larger than 1514 bytes can be transmitted: the
1463 metadata indicates how to replicate the packet header to cut it
1464 into smaller packets. The other gso fields are set:
1465
1466 hdr_len is a hint to the device as to how much of the header
1467 needs to be kept to copy into each packet, usually set to the
1468 length of the headers, including the transport header.[footnote:
1469Due to various bugs in implementations, this field is not useful
1470as a guarantee of the transport header size.
1471]
1472
1473 gso_size is the maximum size of each packet beyond that header
1474 (ie. MSS).
1475
1476 If the driver negotiated the VIRTIO_NET_F_HOST_ECN feature, the
1477 VIRTIO_NET_HDR_GSO_ECN bit may be set in “gso_type” as well,
1478 indicating that the TCP packet has the ECN bit set.[footnote:
1479This case is not handled by some older hardware, so is called out
1480specifically in the protocol.
1481]
1482
1483 If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature,
1484 the num_buffers field is set to zero.
1485
1486 The header and packet are added as one output buffer to the
1487 transmitq, and the device is notified of the new entry (see [sub:Notifying-The-Device]
1488 ).[footnote:
1489Note that the header will be two bytes longer for the
1490VIRTIO_NET_F_MRG_RXBUF case.
1491]
1492
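For example (a sketch only, reusing struct virtio_net_hdr and the
constants defined under Device Operation above), a partially
checksummed IPv4 TCP packet sent without segmentation offload
would carry a header prepared roughly like this:

#include <string.h>

static void prepare_tx_header(struct virtio_net_hdr *hdr)
{
        memset(hdr, 0, sizeof(*hdr));
        hdr->flags       = VIRTIO_NET_HDR_F_NEEDS_CSUM;
        hdr->gso_type    = VIRTIO_NET_HDR_GSO_NONE;
        hdr->csum_start  = 14 + 20; /* ethernet + IPv4 headers (see footnote) */
        hdr->csum_offset = 16;      /* TCP checksum offset within TCP header  */
}

The header and packet are then queued as one output buffer on the
transmitq, as described in the last step above.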
1493 Packet Transmission Interrupt
1494
1495Often a driver will suppress transmission interrupts using the
1496VRING_AVAIL_F_NO_INTERRUPT flag (see [sub:Receiving-Used-Buffers]
1497) and check for used packets in the transmit path of following
1498packets. However, it will still receive interrupts if the
1499VIRTIO_F_NOTIFY_ON_EMPTY feature is negotiated, indicating that
1500the transmission queue is completely emptied.
1501
1502The normal behavior in this interrupt handler is to retrieve the
1503new descriptors from the used ring and free the corresponding
1504headers and packets.
1505
1506 Setting Up Receive Buffers
1507
1508It is generally a good idea to keep the receive virtqueue as
1509fully populated as possible: if it runs out, network performance
1510will suffer.
1511
1512If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or
1513VIRTIO_NET_F_GUEST_UFO features are used, the Guest will need to
1514accept packets of up to 65550 bytes long (the maximum size of a
1515TCP or UDP packet, plus the 14 byte ethernet header), otherwise
15161514 bytes. So unless VIRTIO_NET_F_MRG_RXBUF is negotiated, every
1517buffer in the receive queue needs to be at least this length [footnote:
1518Obviously each one can be split across multiple descriptor
1519elements.
1520].
1521
1522If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at
1523least the size of the struct virtio_net_hdr.
1524
1525 Packet Receive Interrupt
1526
1527When a packet is copied into a buffer in the receiveq, the
1528optimal path is to disable further interrupts for the receiveq
1529(see [sub:Receiving-Used-Buffers]) and process packets until no
1530more are found, then re-enable them.
1531
1532Processing an incoming packet involves:
1533
1534 If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature,
1535 then the “num_buffers” field indicates how many descriptors
1536 this packet is spread over (including this one). This allows
1537 receipt of large packets without having to allocate large
1538 buffers. In this case, there will be at least “num_buffers” in
1539 the used ring, and they should be chained together to form a
1540 single packet. The other buffers will not begin with a struct
1541 virtio_net_hdr.
1542
1543 If the VIRTIO_NET_F_MRG_RXBUF feature was not negotiated, or
1544 the “num_buffers” field is one, then the entire packet will be
1545 contained within this buffer, immediately following the struct
1546 virtio_net_hdr.
1547
1548 If the VIRTIO_NET_F_GUEST_CSUM feature was negotiated, the
1549 VIRTIO_NET_HDR_F_NEEDS_CSUM bit in the “flags” field may be
1550 set: if so, the checksum on the packet is incomplete and the “
1551 csum_start” and “csum_offset” fields indicate how to calculate
1552 it (see [ite:csum_start-is-set]).
1553
1554 If the VIRTIO_NET_F_GUEST_TSO4, TSO6 or UFO options were
1555 negotiated, then the “gso_type” may be something other than
1556 VIRTIO_NET_HDR_GSO_NONE, and the “gso_size” field indicates the
1557 desired MSS (see [enu:If-the-driver]).
1558
1559 Control Virtqueue
1560
1561The driver uses the control virtqueue (if VIRTIO_NET_F_CTRL_VQ is
1562negotiated) to send commands to manipulate various features of
1563the device which would not easily map into the configuration
1564space.
1565
1566All commands are of the following form:
1567
1568struct virtio_net_ctrl {
1569
1570 u8 class;
1571
1572 u8 command;
1573
1574 u8 command-specific-data[];
1575
1576 u8 ack;
1577
1578};
1579
1580
1581
1582/* ack values */
1583
1584#define VIRTIO_NET_OK 0
1585
1586#define VIRTIO_NET_ERR 1
1587
1588The class, command and command-specific-data are set by the
1589driver, and the device sets the ack byte. There is little the
1590driver can do except issue a diagnostic if the ack byte is not
1591VIRTIO_NET_OK.
1592
1593 Packet Receive Filtering
1594
1595If the VIRTIO_NET_F_CTRL_RX feature is negotiated, the driver can
1596send control commands for promiscuous mode, multicast receiving,
1597and filtering of MAC addresses.
1598
1599Note that in general, these commands are best-effort: unwanted
1600packets may still arrive.
1601
1602 Setting Promiscuous Mode
1603
1604#define VIRTIO_NET_CTRL_RX 0
1605
1606 #define VIRTIO_NET_CTRL_RX_PROMISC 0
1607
1608 #define VIRTIO_NET_CTRL_RX_ALLMULTI 1
1609
1610The class VIRTIO_NET_CTRL_RX has two commands:
1611VIRTIO_NET_CTRL_RX_PROMISC turns promiscuous mode on and off, and
1612VIRTIO_NET_CTRL_RX_ALLMULTI turns all-multicast receive on and
1613off. The command-specific-data is one byte containing 0 (off) or
16141 (on).
1615
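For example, a driver could lay out a promiscuous-mode command as in the sketch below. The two-byte header struct and the virtqueue_send_ctrl() helper are illustrative stand-ins for the driver's own scatterlist and virtqueue plumbing, not something defined by this specification:

/* Illustrative sketch only. */
struct virtio_net_ctrl_hdr {
        u8 class;
        u8 cmd;
};

static int set_promisc(bool on)
{
        struct virtio_net_ctrl_hdr hdr;
        u8 onoff = on ? 1 : 0;   /* the one-byte command-specific-data */
        u8 ack = VIRTIO_NET_ERR; /* written by the device */

        hdr.class = VIRTIO_NET_CTRL_RX;
        hdr.cmd   = VIRTIO_NET_CTRL_RX_PROMISC;

        /* hdr and onoff go in as read-only buffers, ack as a write-only one. */
        virtqueue_send_ctrl(&hdr, sizeof(hdr), &onoff, sizeof(onoff),
                            &ack, sizeof(ack));
        return ack == VIRTIO_NET_OK ? 0 : -1;
}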
1616 Setting MAC Address Filtering
1617
1618struct virtio_net_ctrl_mac {
1619
1620 u32 entries;
1621
1622 u8 macs[entries][ETH_ALEN];
1623
1624};
1625
1626
1627
1628#define VIRTIO_NET_CTRL_MAC 1
1629
1630 #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0
1631
1632The device can filter incoming packets by any number of
1633destination MAC addresses.[footnote:
1634Since there are no guarantees, it can use a hash filter
1635or silently switch to allmulti or promiscuous mode if it is given
1636too many addresses.
1637] This table is set using the class VIRTIO_NET_CTRL_MAC and the
1638command VIRTIO_NET_CTRL_MAC_TABLE_SET. The command-specific-data
1639is two variable length tables of 6-byte MAC addresses. The first
1640table contains unicast addresses, and the second contains
1641multicast addresses.
1642
1643 VLAN Filtering
1644
1645If the driver negotiates the VIRTIO_NET_F_CTRL_VLAN feature, it
1646can control a VLAN filter table in the device.
1647
1648#define VIRTIO_NET_CTRL_VLAN 2
1649
1650 #define VIRTIO_NET_CTRL_VLAN_ADD 0
1651
1652 #define VIRTIO_NET_CTRL_VLAN_DEL 1
1653
1654Both the VIRTIO_NET_CTRL_VLAN_ADD and VIRTIO_NET_CTRL_VLAN_DEL
1655commands take a 16-bit VLAN id as the command-specific-data.
1656
1657 Gratuitous Packet Sending
1658
1659If the driver negotiates the VIRTIO_NET_F_GUEST_ANNOUNCE feature
1660(which depends on VIRTIO_NET_F_CTRL_VQ), the device can ask the
1661guest to send gratuitous packets; this is usually done after the
1662guest has been physically migrated and needs to announce its
1663presence on the new network links. (As the hypervisor does not
1664know the guest network configuration (eg. tagged vlan), it is
1665simplest to prod the guest in this way.)
1666
1667#define VIRTIO_NET_CTRL_ANNOUNCE 3
1668
1669 #define VIRTIO_NET_CTRL_ANNOUNCE_ACK 0
1670
1671The Guest needs to check the VIRTIO_NET_S_ANNOUNCE bit in the
1672status field when it notices a change of device configuration.
1673The VIRTIO_NET_CTRL_ANNOUNCE_ACK command is used to indicate that
1674the driver has received the notification; the device clears the
1675VIRTIO_NET_S_ANNOUNCE bit in the status field after it receives
1676this command.
1677
1678Processing this notification involves:
1679
1680 Sending the gratuitous packets, or marking that there are
1681 pending gratuitous packets to be sent and letting a deferred
1682 routine send them.
1683
1684 Sending VIRTIO_NET_CTRL_ANNOUNCE_ACK command through control
1685 vq.
1686
1687
1688
1689Appendix D: Block Device
1690
1691The virtio block device is a simple virtual block device (ie.
1692disk). Read and write requests (and other exotic requests) are
1693placed in the queue, and serviced (probably out of order) by the
1694device except where noted.
1695
1696 Configuration
1697
1698 Subsystem Device ID 2
1699
1700 Virtqueues 0:requestq.
1701
1702 Feature bits
1703
1704 VIRTIO_BLK_F_BARRIER (0) Host supports request barriers.
1705
1706 VIRTIO_BLK_F_SIZE_MAX (1) Maximum size of any single segment is
1707 in “size_max”.
1708
1709 VIRTIO_BLK_F_SEG_MAX (2) Maximum number of segments in a
1710 request is in “seg_max”.
1711
1712 VIRTIO_BLK_F_GEOMETRY (4) Disk-style geometry specified in “
1713 geometry”.
1714
1715 VIRTIO_BLK_F_RO (5) Device is read-only.
1716
1717 VIRTIO_BLK_F_BLK_SIZE (6) Block size of disk is in “blk_size”.
1718
1719 VIRTIO_BLK_F_SCSI (7) Device supports scsi packet commands.
1720
1721 VIRTIO_BLK_F_FLUSH (9) Cache flush command support.
1722
1723 Device configuration layout The capacity of the device
1724 (expressed in 512-byte sectors) is always present. The
1725 availability of the others depends on various feature bits
1726 as indicated above. struct virtio_blk_config {
1727
1728 u64 capacity;
1729
1730 u32 size_max;
1731
1732 u32 seg_max;
1733
1734 struct virtio_blk_geometry {
1735
1736 u16 cylinders;
1737
1738 u8 heads;
1739
1740 u8 sectors;
1741
1742 } geometry;
1743
1744 u32 blk_size;
1745
1746
1747
1748};
1749
1750 Device Initialization
1751
1752 The device size should be read from the “capacity”
1753 configuration field. No requests should be submitted which go
1754 beyond this limit.
1755
1756 If the VIRTIO_BLK_F_BLK_SIZE feature is negotiated, the
1757 blk_size field can be read to determine the optimal sector size
1758 for the driver to use. This does not affect the units used in
1759 the protocol (always 512 bytes), but awareness of the correct
1760 value can affect performance.
1761
1762 If the VIRTIO_BLK_F_RO feature is set by the device, any write
1763 requests will fail.
1764
1765 Device Operation
1766
1767The driver queues requests to the virtqueue, and they are used by
1768the device (not necessarily in order). Each request is of the form:
1769
1770struct virtio_blk_req {
1771
1772
1773
1774 u32 type;
1775
1776 u32 ioprio;
1777
1778 u64 sector;
1779
1780 char data[][512];
1781
1782 u8 status;
1783
1784};
1785
1786If the device has the VIRTIO_BLK_F_SCSI feature, it can also support
1787scsi packet command requests; each of these requests is of the form:struct virtio_scsi_pc_req {
1788
1789 u32 type;
1790
1791 u32 ioprio;
1792
1793 u64 sector;
1794
1795 char cmd[];
1796
1797 char data[][512];
1798
1799#define SCSI_SENSE_BUFFERSIZE 96
1800
1801 u8 sense[SCSI_SENSE_BUFFERSIZE];
1802
1803 u32 errors;
1804
1805 u32 data_len;
1806
1807 u32 sense_len;
1808
1809 u32 residual;
1810
1811 u8 status;
1812
1813};
1814
1815The type of the request is either a read (VIRTIO_BLK_T_IN), a
1816write (VIRTIO_BLK_T_OUT), a scsi packet command
1817(VIRTIO_BLK_T_SCSI_CMD or VIRTIO_BLK_T_SCSI_CMD_OUT[footnote:
1818the SCSI_CMD and SCSI_CMD_OUT types are equivalent, the device
1819does not distinguish between them
1820]) or a flush (VIRTIO_BLK_T_FLUSH or VIRTIO_BLK_T_FLUSH_OUT[footnote:
1821the FLUSH and FLUSH_OUT types are equivalent, the device does not
1822distinguish between them
1823]). If the device has the VIRTIO_BLK_F_BARRIER feature, the high bit
1824(VIRTIO_BLK_T_BARRIER) indicates that this request acts as a
1825barrier: all preceding requests must be complete before
1826this one, and all following requests must not be started until
1827this is complete. Note that a barrier does not flush caches in
1828the underlying backend device in the host, and thus does not
1829serve as a data consistency guarantee. The driver must use a
1830FLUSH request to flush the host cache.
1831
1832#define VIRTIO_BLK_T_IN 0
1833
1834#define VIRTIO_BLK_T_OUT 1
1835
1836#define VIRTIO_BLK_T_SCSI_CMD 2
1837
1838#define VIRTIO_BLK_T_SCSI_CMD_OUT 3
1839
1840#define VIRTIO_BLK_T_FLUSH 4
1841
1842#define VIRTIO_BLK_T_FLUSH_OUT 5
1843
1844#define VIRTIO_BLK_T_BARRIER 0x80000000
1845
1846The ioprio field is a hint about the relative priorities of
1847requests to the device: higher numbers indicate more important
1848requests.
1849
1850The sector number indicates the offset (multiplied by 512) where
1851the read or write is to occur. This field is unused and set to 0
1852for scsi packet commands and for flush commands.
1853
1854The cmd field is only present for scsi packet command requests,
1855and indicates the command to perform. This field must reside in a
1856single, separate read-only buffer; command length can be derived
1857from the length of this buffer.
1858
1859Note that these first three (four for scsi packet commands)
1860fields are always read-only: the data field is either read-only
1861or write-only, depending on the request. The size of the read or
1862write can be derived from the total size of the request buffers.
1863
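As an illustration of that buffer layout, a 4096-byte read of sector 8 could be described to the device with three buffers (a sketch; the outhdr struct simply names the first three fields of struct virtio_blk_req, and the u8/u32/u64 types are as used throughout this document):

/* Illustrative sketch only: the request split into its usual descriptors. */
struct virtio_blk_outhdr {     /* the first three, read-only fields */
        u32 type;
        u32 ioprio;
        u64 sector;
};

struct virtio_blk_outhdr hdr = {
        .type   = VIRTIO_BLK_T_IN,  /* read */
        .ioprio = 0,
        .sector = 8,                /* read starts at byte offset 8 * 512 */
};
u8 data[4096];                      /* write-only: filled in by the device */
u8 status;                          /* write-only: VIRTIO_BLK_S_* result */

/* Descriptor chain: hdr (read-only), data (write-only), status (write-only).
 * The read length is implied by the size of the data buffer. */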
1864The sense field is only present for scsi packet command requests,
1865and indicates the buffer for scsi sense data.
1866
1867The data_len field is only present for scsi packet command
1868requests, this field is deprecated, and should be ignored by the
1869driver. Historically, devices copied data length there.
1870
1871The sense_len field is only present for scsi packet command
1872requests and indicates the number of bytes actually written to
1873the sense buffer.
1874
1875The residual field is only present for scsi packet command
1876requests and indicates the residual size, calculated as data
1877length - number of bytes actually transferred.
1878
1879The final status byte is written by the device: either
1880VIRTIO_BLK_S_OK for success, VIRTIO_BLK_S_IOERR for host or guest
1881error, or VIRTIO_BLK_S_UNSUPP for a request unsupported by the host:#define VIRTIO_BLK_S_OK 0
1882
1883#define VIRTIO_BLK_S_IOERR 1
1884
1885#define VIRTIO_BLK_S_UNSUPP 2
1886
1887Historically, devices assumed that the fields type, ioprio and
1888sector reside in a single, separate read-only buffer; the fields
1889errors, data_len, sense_len and residual reside in a single,
1890separate write-only buffer; the sense field resides in a
1891separate write-only buffer of size 96 bytes, by itself;
1892and the status field resides in a separate
1893write-only buffer of size 1
1894byte, by itself.
1895
1896Appendix E: Console Device
1897
1898The virtio console device is a simple device for data input and
1899output. A device may have one or more ports. Each port has a pair
1900of input and output virtqueues. Moreover, a device has a pair of
1901control IO virtqueues. The control virtqueues are used to
1902communicate information between the device and the driver about
1903ports being opened and closed on either side of the connection,
1904indication from the host about whether a particular port is a
1905console port, adding new ports, port hot-plug/unplug, etc., and
1906indication from the guest about whether a port or a device was
1907successfully added, port open/close, etc. For data IO, one or
1908more empty buffers are placed in the receive queue for incoming
1909data and outgoing characters are placed in the transmit queue.
1910
1911 Configuration
1912
1913 Subsystem Device ID 3
1914
1915 Virtqueues 0:receiveq(port0). 1:transmitq(port0), 2:control
1916 receiveq[footnote:
1917Ports 2 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set
1918], 3:control transmitq, 4:receiveq(port1), 5:transmitq(port1),
1919 ...
1920
1921 Feature bits
1922
1923 VIRTIO_CONSOLE_F_SIZE (0) Configuration cols and rows fields
1924 are valid.
1925
1926 VIRTIO_CONSOLE_F_MULTIPORT(1) Device has support for multiple
1927 ports; configuration fields nr_ports and max_nr_ports are
1928 valid and control virtqueues will be used.
1929
1930 Device configuration layout The size of the console is supplied
1931 in the configuration space if the VIRTIO_CONSOLE_F_SIZE feature
1932 is set. Furthermore, if the VIRTIO_CONSOLE_F_MULTIPORT feature
1933 is set, the maximum number of ports supported by the device can
1934 be fetched.struct virtio_console_config {
1935
1936 u16 cols;
1937
1938 u16 rows;
1939
1940
1941
1942 u32 max_nr_ports;
1943
1944};
1945
1946 Device Initialization
1947
1948 If the VIRTIO_CONSOLE_F_SIZE feature is negotiated, the driver
1949 can read the console dimensions from the configuration fields.
1950
1951 If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the
1952 driver can spawn multiple ports, not all of which may be
1953 attached to a console. Some could be generic ports. In this
1954 case, the control virtqueues are enabled and according to the
1955 max_nr_ports configuration-space value, the appropriate number
1956 of virtqueues are created. A control message indicating the
1957 driver is ready is sent to the host. The host can then send
1958 control messages for adding new ports to the device. After
1959 creating and initializing each port, a
1960 VIRTIO_CONSOLE_PORT_READY control message is sent to the host
1961 for that port so the host can let us know of any additional
1962 configuration options set for that port.
1963
1964 The receiveq for each port is populated with one or more
1965 receive buffers.
1966
1967 Device Operation
1968
1969 For output, a buffer containing the characters is placed in the
1970 port's transmitq.[footnote:
1971Because this is high importance and low bandwidth, the current
1972Linux implementation polls for the buffer to be used, rather than
1973waiting for an interrupt, simplifying the implementation
1974significantly. However, for generic serial ports with the
1975O_NONBLOCK flag set, the polling limitation is relaxed and the
1976consumed buffers are freed upon the next write or poll call or
1977when a port is closed or hot-unplugged.
1978]
1979
1980 When a buffer is used in the receiveq (signalled by an
1981 interrupt), its contents are the input to the port associated
1982 with the virtqueue for which the notification was received.
1983
1984 If the driver negotiated the VIRTIO_CONSOLE_F_SIZE feature, a
1985 configuration change interrupt may occur. The updated size can
1986 be read from the configuration fields.
1987
1988 If the driver negotiated the VIRTIO_CONSOLE_F_MULTIPORT
1989 feature, active ports are announced by the host using the
1990 VIRTIO_CONSOLE_PORT_ADD control message. The same message is
1991 used for port hot-plug as well.
1992
1993 If the host specified a port `name', a sysfs attribute is
1994 created with the name filled in, so that udev rules can be
1995 written that can create a symlink from the port's name to the
1996 char device for port discovery by applications in the guest.
1997
1998 Changes to ports' state are effected by control messages.
1999 Appropriate action is taken on the port indicated in the
2000 control message. The layout of the control buffer structure
2001 and the associated events are:struct virtio_console_control {
2002
2003 uint32_t id; /* Port number */
2004
2005 uint16_t event; /* The kind of control event */
2006
2007 uint16_t value; /* Extra information for the event */
2008
2009};
2010
2011
2012
2013/* Some events for the internal messages (control packets) */
2014
2015
2016
2017#define VIRTIO_CONSOLE_DEVICE_READY 0
2018
2019#define VIRTIO_CONSOLE_PORT_ADD 1
2020
2021#define VIRTIO_CONSOLE_PORT_REMOVE 2
2022
2023#define VIRTIO_CONSOLE_PORT_READY 3
2024
2025#define VIRTIO_CONSOLE_CONSOLE_PORT 4
2026
2027#define VIRTIO_CONSOLE_RESIZE 5
2028
2029#define VIRTIO_CONSOLE_PORT_OPEN 6
2030
2031#define VIRTIO_CONSOLE_PORT_NAME 7
2032
2033Appendix F: Entropy Device
2034
2035The virtio entropy device supplies high-quality randomness for
2036guest use.
2037
2038 Configuration
2039
2040 Subsystem Device ID 4
2041
2042 Virtqueues 0:requestq.
2043
2044 Feature bits None currently defined
2045
2046 Device configuration layout None currently defined.
2047
2048 Device Initialization
2049
2050 The virtqueue is initialized
2051
2052 Device Operation
2053
2054When the driver requires random bytes, it places the descriptor
2055of one or more buffers in the queue. The device completely fills
2056each buffer with random data.
2057
2058Appendix G: Memory Balloon Device
2059
2060The virtio memory balloon device is a primitive device for
2061managing guest memory: the device asks for a certain amount of
2062memory, and the guest supplies it (or withdraws it, if the device
2063has more than it asks for). This allows the guest to adapt to
2064changes in allowance of underlying physical memory. If the
2065feature is negotiated, the device can also be used to communicate
2066guest memory statistics to the host.
2067
2068 Configuration
2069
2070 Subsystem Device ID 5
2071
2072 Virtqueues 0:inflateq. 1:deflateq. 2:statsq.[footnote:
2073Only if VIRTIO_BALLOON_F_STATS_VQ is set
2074]
2075
2076 Feature bits
2077
2078 VIRTIO_BALLOON_F_MUST_TELL_HOST (0) Host must be told before
2079 pages from the balloon are used.
2080
2081 VIRTIO_BALLOON_F_STATS_VQ (1) A virtqueue for reporting guest
2082 memory statistics is present.
2083
2084 Device configuration layout Both fields of this configuration
2085 are always available. Note that they are little endian, despite
2086 the convention that device fields are guest endian:struct virtio_balloon_config {
2087
2088 u32 num_pages;
2089
2090 u32 actual;
2091
2092};
2093
2094 Device Initialization
2095
2096 The inflate and deflate virtqueues are identified.
2097
2098 If the VIRTIO_BALLOON_F_STATS_VQ feature bit is negotiated:
2099
2100 Identify the stats virtqueue.
2101
2102 Add one empty buffer to the stats virtqueue and notify the
2103 host.
2104
2105Device operation begins immediately.
2106
2107 Device Operation
2108
2109 Memory Ballooning The device is driven by the receipt of a
2110 configuration change interrupt.
2111
2112 The “num_pages” configuration field is examined. If this is
2113 greater than the “actual” number of pages, memory must be given
2114 to the balloon. If it is less than the “actual” number of
2115 pages, memory may be taken back from the balloon for general
2116 use.
2117
2118 To supply memory to the balloon (aka. inflate):
2119
2120 The driver constructs an array of addresses of unused memory
2121 pages. These addresses are divided by 4096[footnote:
2122This is historical, and independent of the guest page size
2123] and the descriptor describing the resulting 32-bit array is
2124 added to the inflateq (a sketch of this step appears below).
2125
2126 To remove memory from the balloon (aka. deflate):
2127
2128 The driver constructs an array of addresses of memory pages it
2129 has previously given to the balloon, as described above. This
2130 descriptor is added to the deflateq.
2131
2132 If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is set, the
2133 guest may not use these requested pages until that descriptor
2134 in the deflateq has been used by the device.
2135
2136 Otherwise, the guest may begin to re-use pages previously given
2137 to the balloon before the device has acknowledged their
2138 withdrawal. [footnote:
2139In this case, deflation advice is merely a courtesy
2140]
2141
2142 In either case, once the device has completed the inflation or
2143 deflation, the “actual” field of the configuration should be
2144 updated to reflect the new number of pages in the balloon.[footnote:
2145As updates to configuration space are not atomic, this field
2146isn't particularly reliable, but can be used to diagnose buggy
2147guests.
2148]
2149
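A sketch of the inflate step described above, assuming a hypothetical add_buf_to_inflateq() stand-in for the driver's virtqueue add and kick calls:

/* Illustrative sketch only: convert page addresses into the 32-bit,
 * 4096-based frame numbers the device expects and queue them. */
#define BALLOON_PAGE_SHIFT 12            /* always 4096, whatever the guest page size */

static void inflate(const unsigned long *page_addrs, unsigned int nr)
{
        u32 pfns[256];
        unsigned int i;

        for (i = 0; i < nr && i < 256; i++)
                pfns[i] = page_addrs[i] >> BALLOON_PAGE_SHIFT;

        /* One read-only descriptor describing the whole array. */
        add_buf_to_inflateq(pfns, i * sizeof(pfns[0]));
}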
2150 Memory Statistics
2151
2152The stats virtqueue is atypical because communication is driven
2153by the device (not the driver). The channel becomes active at
2154driver initialization time when the driver adds an empty buffer
2155and notifies the device. A request for memory statistics proceeds
2156as follows:
2157
2158 The device pushes the buffer onto the used ring and sends an
2159 interrupt.
2160
2161 The driver pops the used buffer and discards it.
2162
2163 The driver collects memory statistics and writes them into a
2164 new buffer.
2165
2166 The driver adds the buffer to the virtqueue and notifies the
2167 device.
2168
2169 The device pops the buffer (retaining it to initiate a
2170 subsequent request) and consumes the statistics.
2171
2172 Memory Statistics Format Each statistic consists of a 16 bit
2173 tag and a 64 bit value. Both quantities are represented in the
2174 native endian of the guest. All statistics are optional and the
2175 driver may choose which ones to supply. To guarantee backwards
2176 compatibility, unsupported statistics should be omitted.
2177
2178 struct virtio_balloon_stat {
2179
2180#define VIRTIO_BALLOON_S_SWAP_IN 0
2181
2182#define VIRTIO_BALLOON_S_SWAP_OUT 1
2183
2184#define VIRTIO_BALLOON_S_MAJFLT 2
2185
2186#define VIRTIO_BALLOON_S_MINFLT 3
2187
2188#define VIRTIO_BALLOON_S_MEMFREE 4
2189
2190#define VIRTIO_BALLOON_S_MEMTOT 5
2191
2192 u16 tag;
2193
2194 u64 val;
2195
2196} __attribute__((packed));
2197
2198 Tags
2199
2200 VIRTIO_BALLOON_S_SWAP_IN The amount of memory that has been
2201 swapped in (in bytes).
2202
2203 VIRTIO_BALLOON_S_SWAP_OUT The amount of memory that has been
2204 swapped out to disk (in bytes).
2205
2206 VIRTIO_BALLOON_S_MAJFLT The number of major page faults that
2207 have occurred.
2208
2209 VIRTIO_BALLOON_S_MINFLT The number of minor page faults that
2210 have occurred.
2211
2212 VIRTIO_BALLOON_S_MEMFREE The amount of memory not being used
2213 for any purpose (in bytes).
2214
2215 VIRTIO_BALLOON_S_MEMTOT The total amount of memory available
2216 (in bytes).
2217
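For instance, a driver answering a statistics request might fill its buffer as in this sketch (the byte counts are assumed to come from the guest's own accounting; unsupported tags are simply left out):

/* Illustrative sketch only. */
static unsigned int fill_stats(struct virtio_balloon_stat *stats,
                               u64 free_bytes, u64 total_bytes)
{
        unsigned int n = 0;

        stats[n].tag = VIRTIO_BALLOON_S_MEMFREE;
        stats[n].val = free_bytes;       /* already expressed in bytes */
        n++;
        stats[n].tag = VIRTIO_BALLOON_S_MEMTOT;
        stats[n].val = total_bytes;
        n++;
        return n;    /* the buffer placed in the statsq is n * sizeof(*stats) bytes */
}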
2218Appendix H: Rpmsg: Remote Processor Messaging
2219
2220Virtio rpmsg devices represent remote processors on the system
2221which run in an asymmetric multi-processing (AMP) configuration, and
2222which are usually used to offload cpu-intensive tasks from the
2223main application processor (a typical SoC methodology).
2224
2225Virtio is being used to communicate with those remote processors;
2226empty buffers are placed in one virtqueue for receiving messages,
2227and non-empty buffers, containing outbound messages, are enqueued
2228in a second virtqueue for transmission.
2229
2230Numerous communication channels can be multiplexed over those two
2231virtqueues, so different entities, running on the application and
2232remote processor, can directly communicate in a point-to-point
2233fashion.
2234
2235 Configuration
2236
2237 Subsystem Device ID 7
2238
2239 Virtqueues 0:receiveq. 1:transmitq.
2240
2241 Feature bits
2242
2243 VIRTIO_RPMSG_F_NS (0) Device sends (and is capable of receiving)
2244 name service messages announcing the creation (or
2245 destruction) of a channel:/**
2246
2247 * struct rpmsg_ns_msg - dynamic name service announcement
2248message
2249
2250 * @name: name of remote service that is published
2251
2252 * @addr: address of remote service that is published
2253
2254 * @flags: indicates whether service is created or destroyed
2255
2256 *
2257
2258 * This message is sent across to publish a new service (or
2259announce
2260
2261 * about its removal). When we receive these messages, an
2262appropriate
2263
2264 * rpmsg channel (i.e device) is created/destroyed.
2265
2266 */
2267
2268struct rpmsg_ns_msg {
2269
2270 char name[RPMSG_NAME_SIZE];
2271
2272 u32 addr;
2273
2274 u32 flags;
2275
2276} __packed;
2277
2278
2279
2280/**
2281
2282 * enum rpmsg_ns_flags - dynamic name service announcement flags
2283
2284 *
2285
2286 * @RPMSG_NS_CREATE: a new remote service was just created
2287
2288 * @RPMSG_NS_DESTROY: a remote service was just destroyed
2289
2290 */
2291
2292enum rpmsg_ns_flags {
2293
2294 RPMSG_NS_CREATE = 0,
2295
2296 RPMSG_NS_DESTROY = 1,
2297
2298};
2299
2300 Device configuration layout
2301
2302At this point none are currently defined.
2303
2304 Device Initialization
2305
2306 The initialization routine should identify the receive and
2307 transmission virtqueues.
2308
2309 The receive virtqueue should be filled with receive buffers.
2310
2311 Device Operation
2312
2313Messages are transmitted by placing them in the transmitq, and
2314buffers for inbound messages are placed in the receiveq. In any
2315case, messages are always preceded by the following header: /**
2316
2317 * struct rpmsg_hdr - common header for all rpmsg messages
2318
2319 * @src: source address
2320
2321 * @dst: destination address
2322
2323 * @reserved: reserved for future use
2324
2325 * @len: length of payload (in bytes)
2326
2327 * @flags: message flags
2328
2329 * @data: @len bytes of message payload data
2330
2331 *
2332
2333 * Every message sent(/received) on the rpmsg bus begins with
2334this header.
2335
2336 */
2337
2338struct rpmsg_hdr {
2339
2340 u32 src;
2341
2342 u32 dst;
2343
2344 u32 reserved;
2345
2346 u16 len;
2347
2348 u16 flags;
2349
2350 u8 data[0];
2351
2352} __packed;
2353
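To illustrate, a message can be assembled in a transmit buffer as follows (a sketch; the buffer is assumed to be large enough for the header plus the payload, and the function name is not part of the protocol):

/* Illustrative sketch only. */
static unsigned int rpmsg_fill(void *buf, u32 src, u32 dst,
                               const void *payload, u16 len)
{
        struct rpmsg_hdr *hdr = buf;

        hdr->src      = src;
        hdr->dst      = dst;
        hdr->reserved = 0;
        hdr->len      = len;
        hdr->flags    = 0;
        memcpy(hdr->data, payload, len); /* payload immediately follows the header */

        return sizeof(*hdr) + len;       /* total length placed in the transmitq */
}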
2354Appendix I: SCSI Host Device
2355
2356The virtio SCSI host device groups together one or more virtual
2357logical units (such as disks), and allows communicating to them
2358using the SCSI protocol. An instance of the device represents a
2359SCSI host to which many targets and LUNs are attached.
2360
2361The virtio SCSI device services two kinds of requests:
2362
2363 command requests for a logical unit;
2364
2365 task management functions related to a logical unit, target or
2366 command.
2367
2368The device is also able to send out notifications about added and
2369removed logical units. Together, these capabilities provide a
2370SCSI transport protocol that uses virtqueues as the transfer
2371medium. In the transport protocol, the virtio driver acts as the
2372initiator, while the virtio SCSI host provides one or more
2373targets that receive and process the requests.
2374
2375 Configuration
2376
2377 Subsystem Device ID 8
2378
2379 Virtqueues 0:controlq; 1:eventq; 2..n:request queues.
2380
2381 Feature bits
2382
2383 VIRTIO_SCSI_F_INOUT (0) A single request can include both
2384 read-only and write-only data buffers.
2385
2386 VIRTIO_SCSI_F_HOTPLUG (1) The host should enable
2387 hot-plug/hot-unplug of new LUNs and targets on the SCSI bus.
2388
2389 Device configuration layout All fields of this configuration
2390 are always available. sense_size and cdb_size are writable by
2391 the guest.struct virtio_scsi_config {
2392
2393 u32 num_queues;
2394
2395 u32 seg_max;
2396
2397 u32 max_sectors;
2398
2399 u32 cmd_per_lun;
2400
2401 u32 event_info_size;
2402
2403 u32 sense_size;
2404
2405 u32 cdb_size;
2406
2407 u16 max_channel;
2408
2409 u16 max_target;
2410
2411 u32 max_lun;
2412
2413};
2414
2415 num_queues is the total number of request virtqueues exposed by
2416 the device. The driver is free to use only one request queue,
2417 or it can use more to achieve better performance.
2418
2419 seg_max is the maximum number of segments that can be in a
2420 command. A bidirectional command can include seg_max input
2421 segments and seg_max output segments.
2422
2423 max_sectors is a hint to the guest about the maximum transfer
2424 size it should use.
2425
2426 cmd_per_lun is a hint to the guest about the maximum number of
2427 linked commands it should send to one LUN. The actual value
2428 to be used is the minimum of cmd_per_lun and the virtqueue
2429 size.
2430
2431 event_info_size is the maximum size that the device will fill
2432 for buffers that the driver places in the eventq. The driver
2433 should always put buffers at least of this size. It is
2434 written by the device depending on the set of negotated
2435 features.
2436
2437 sense_size is the maximum size of the sense data that the
2438 device will write. The default value is written by the device
2439 and will always be 96, but the driver can modify it. It is
2440 restored to the default when the device is reset.
2441
2442 cdb_size is the maximum size of the CDB that the driver will
2443 write. The default value is written by the device and will
2444 always be 32, but the driver can likewise modify it. It is
2445 restored to the default when the device is reset.
2446
2447 max_channel, max_target and max_lun can be used by the driver
2448 as hints to constrain scanning the logical units on the
2449 host.
2450
2451 Device Initialization
2452
2453The initialization routine should first of all discover the
2454device's virtqueues.
2455
2456If the driver uses the eventq, it should then place at least one
2457buffer in the eventq.
2458
2459The driver can immediately issue requests (for example, INQUIRY
2460or REPORT LUNS) or task management functions (for example, I_T
2461RESET).
2462
2463 Device Operation: request queues
2464
2465The driver queues requests to an arbitrary request queue, and
2466they are used by the device on that same queue. It is the
2467responsibility of the driver to ensure strict request ordering
2468for commands placed on different queues, because they will be
2469consumed with no order constraints.
2470
2471Requests have the following format:
2472
2473struct virtio_scsi_req_cmd {
2474
2475 // Read-only
2476
2477 u8 lun[8];
2478
2479 u64 id;
2480
2481 u8 task_attr;
2482
2483 u8 prio;
2484
2485 u8 crn;
2486
2487 char cdb[cdb_size];
2488
2489 char dataout[];
2490
2491 // Write-only part
2492
2493 u32 sense_len;
2494
2495 u32 residual;
2496
2497 u16 status_qualifier;
2498
2499 u8 status;
2500
2501 u8 response;
2502
2503 u8 sense[sense_size];
2504
2505 char datain[];
2506
2507};
2508
2509
2510
2511/* command-specific response values */
2512
2513#define VIRTIO_SCSI_S_OK 0
2514
2515#define VIRTIO_SCSI_S_OVERRUN 1
2516
2517#define VIRTIO_SCSI_S_ABORTED 2
2518
2519#define VIRTIO_SCSI_S_BAD_TARGET 3
2520
2521#define VIRTIO_SCSI_S_RESET 4
2522
2523#define VIRTIO_SCSI_S_BUSY 5
2524
2525#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6
2526
2527#define VIRTIO_SCSI_S_TARGET_FAILURE 7
2528
2529#define VIRTIO_SCSI_S_NEXUS_FAILURE 8
2530
2531#define VIRTIO_SCSI_S_FAILURE 9
2532
2533
2534
2535/* task_attr */
2536
2537#define VIRTIO_SCSI_S_SIMPLE 0
2538
2539#define VIRTIO_SCSI_S_ORDERED 1
2540
2541#define VIRTIO_SCSI_S_HEAD 2
2542
2543#define VIRTIO_SCSI_S_ACA 3
2544
2545The lun field addresses a target and logical unit in the
2546virtio-scsi device's SCSI domain. The only supported format for
2547the LUN field is: first byte set to 1, second byte set to target,
2548third and fourth byte representing a single level LUN structure,
2549followed by four zero bytes. With this representation, a
2550virtio-scsi device can serve up to 256 targets and 16384 LUNs per
2551target.
2552
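In code, this addressing format can be produced as in the sketch below, using the SAM flat-space encoding for the single level LUN structure (illustrative only):

/* Illustrative sketch only: encode target and LUN into the 8-byte lun field. */
static void virtio_scsi_set_lun(u8 lun[8], u8 target, u16 unit)
{
        lun[0] = 1;                  /* fixed first byte */
        lun[1] = target;             /* up to 256 targets */
        lun[2] = (unit >> 8) | 0x40; /* single level LUN, flat-space addressing */
        lun[3] = unit & 0xff;        /* up to 16384 LUNs per target */
        memset(&lun[4], 0, 4);       /* remaining four bytes are zero */
}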
2553The id field is the command identifier (“tag”).
2554
2555task_attr, prio and crn should be left to zero. task_attr defines
2556the task attribute as in the table above, but all task attributes
2557may be mapped to SIMPLE by the device; crn may also be provided
2558by clients, but is generally expected to be 0. The maximum CRN
2559value defined by the protocol is 255, since CRN is stored in an
25608-bit integer.
2561
2562All of these fields are defined in SAM. They are always
2563read-only, as are the cdb and dataout fields. The cdb_size is
2564taken from the configuration space.
2565
2566sense and subsequent fields are always write-only. The sense_len
2567field indicates the number of bytes actually written to the sense
2568buffer. The residual field indicates the residual size,
2569calculated as “data_length - number_of_transferred_bytes”, for
2570read or write operations. For bidirectional commands, the
2571number_of_transferred_bytes includes both read and written bytes.
2572A residual field that is less than the size of datain means that
2573the dataout field was processed entirely. A residual field that
2574exceeds the size of datain means that the dataout field was
2575processed partially and the datain field was not processed at
2576all.
2577
2578The status byte is written by the device to be the status code as
2579defined in SAM.
2580
2581The response byte is written by the device to be one of the
2582following:
2583
2584 VIRTIO_SCSI_S_OK when the request was completed and the status
2585 byte is filled with a SCSI status code (not necessarily
2586 "GOOD").
2587
2588 VIRTIO_SCSI_S_OVERRUN if the content of the CDB requires
2589 transferring more data than is available in the data buffers.
2590
2591 VIRTIO_SCSI_S_ABORTED if the request was cancelled due to an
2592 ABORT TASK or ABORT TASK SET task management function.
2593
2594 VIRTIO_SCSI_S_BAD_TARGET if the request was never processed
2595 because the target indicated by the lun field does not exist.
2596
2597 VIRTIO_SCSI_S_RESET if the request was cancelled due to a bus
2598 or device reset (including a task management function).
2599
2600 VIRTIO_SCSI_S_TRANSPORT_FAILURE if the request failed due to a
2601 problem in the connection between the host and the target
2602 (severed link).
2603
2604 VIRTIO_SCSI_S_TARGET_FAILURE if the target is suffering a
2605 failure and the guest should not retry on other paths.
2606
2607 VIRTIO_SCSI_S_NEXUS_FAILURE if the nexus is suffering a failure
2608 but retrying on other paths might yield a different result.
2609
2610 VIRTIO_SCSI_S_BUSY if the request failed but retrying on the
2611 same path should work.
2612
2613 VIRTIO_SCSI_S_FAILURE for other host or guest error. In
2614 particular, if neither dataout nor datain is empty, and the
2615 VIRTIO_SCSI_F_INOUT feature has not been negotiated, the
2616 request will be immediately returned with a response equal to
2617 VIRTIO_SCSI_S_FAILURE.
2618
2619 Device Operation: controlq
2620
2621The controlq is used for other SCSI transport operations.
2622Requests have the following format:
2623
2624struct virtio_scsi_ctrl {
2625
2626 u32 type;
2627
2628 ...
2629
2630 u8 response;
2631
2632};
2633
2634
2635
2636/* response values valid for all commands */
2637
2638#define VIRTIO_SCSI_S_OK 0
2639
2640#define VIRTIO_SCSI_S_BAD_TARGET 3
2641
2642#define VIRTIO_SCSI_S_BUSY 5
2643
2644#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6
2645
2646#define VIRTIO_SCSI_S_TARGET_FAILURE 7
2647
2648#define VIRTIO_SCSI_S_NEXUS_FAILURE 8
2649
2650#define VIRTIO_SCSI_S_FAILURE 9
2651
2652#define VIRTIO_SCSI_S_INCORRECT_LUN 12
2653
2654The type identifies the remaining fields.
2655
2656The following commands are defined:
2657
2658 Task management function
2659#define VIRTIO_SCSI_T_TMF 0
2660
2661
2662
2663#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0
2664
2665#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1
2666
2667#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2
2668
2669#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3
2670
2671#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4
2672
2673#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5
2674
2675#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6
2676
2677#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7
2678
2679
2680
2681struct virtio_scsi_ctrl_tmf
2682
2683{
2684
2685 // Read-only part
2686
2687 u32 type;
2688
2689 u32 subtype;
2690
2691 u8 lun[8];
2692
2693 u64 id;
2694
2695 // Write-only part
2696
2697 u8 response;
2698
2699}
2700
2701
2702
2703/* command-specific response values */
2704
2705#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0
2706
2707#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10
2708
2709#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11
2710
2711 The type is VIRTIO_SCSI_T_TMF. All
2712 fields except response are filled by the driver. The subtype
2713 field must always be specified and identifies the requested
2714 task management function.
2715
2716 Other fields may be irrelevant for the requested TMF; if so,
2717 they are ignored but they should still be present. The lun
2718 field is in the same format specified for request queues; the
2719 single level LUN is ignored when the task management function
2720 addresses a whole I_T nexus. When relevant, the value of the id
2721 field is matched against the id values passed on the requestq.
2722
2723 The outcome of the task management function is written by the
2724 device in the response field. The command-specific response
2725 values map 1-to-1 with those defined in SAM.
2726
2727 Asynchronous notification query
2728#define VIRTIO_SCSI_T_AN_QUERY 1
2729
2730
2731
2732struct virtio_scsi_ctrl_an {
2733
2734 // Read-only part
2735
2736 u32 type;
2737
2738 u8 lun[8];
2739
2740 u32 event_requested;
2741
2742 // Write-only part
2743
2744 u32 event_actual;
2745
2746 u8 response;
2747
2748}
2749
2750
2751
2752#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2
2753
2754#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4
2755
2756#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8
2757
2758#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16
2759
2760#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32
2761
2762#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64
2763
2764 By sending this command, the driver asks the device which
2765 events the given LUN can report, as described in paragraphs 6.6
2766 and A.6 of the SCSI MMC specification. The driver writes the
2767 events it is interested in into the event_requested; the device
2768 responds by writing the events that it supports into
2769 event_actual.
2770
2771 The type is VIRTIO_SCSI_T_AN_QUERY. The lun and event_requested
2772 fields are written by the driver. The event_actual and response
2773 fields are written by the device.
2774
2775 No command-specific values are defined for the response byte.
2776
2777 Asynchronous notification subscription
2778#define VIRTIO_SCSI_T_AN_SUBSCRIBE 2
2779
2780
2781
2782struct virtio_scsi_ctrl_an {
2783
2784 // Read-only part
2785
2786 u32 type;
2787
2788 u8 lun[8];
2789
2790 u32 event_requested;
2791
2792 // Write-only part
2793
2794 u32 event_actual;
2795
2796 u8 response;
2797
2798}
2799
2800 By sending this command, the driver asks the specified LUN to
2801 report events for its physical interface, again as described in
2802 the SCSI MMC specification. The driver writes the events it is
2803 interested in into the event_requested; the device responds by
2804 writing the events that it supports into event_actual.
2805
2806 Event types are the same as for the asynchronous notification
2807 query message.
2808
2809 The type is VIRTIO_SCSI_T_AN_SUBSCRIBE. The lun and
2810 event_requested fields are written by the driver. The
2811 event_actual and response fields are written by the device.
2812
2813 No command-specific values are defined for the response byte.
2814
2815 Device Operation: eventq
2816
2817The eventq is used by the device to report information on logical
2818units that are attached to it. The driver should always leave a
2819few buffers ready in the eventq. In general, the device will not
2820queue events to cope with an empty eventq, and will end up
2821dropping events if it finds no buffer ready. However, when
2822reporting events for many LUNs (e.g. when a whole target
2823disappears), the device can throttle events to avoid dropping
2824them. For this reason, placing 10-15 buffers on the event queue
2825should be enough.
2826
2827Buffers are placed in the eventq and filled by the device when
2828interesting events occur. The buffers should be strictly
2829write-only (device-filled) and the size of the buffers should be
2830at least the value given in the device's configuration
2831information.
2832
2833Buffers returned by the device on the eventq will be referred to
2834as "events" in the rest of this section. Events have the
2835following format:
2836
2837#define VIRTIO_SCSI_T_EVENTS_MISSED 0x80000000
2838
2839
2840
2841struct virtio_scsi_event {
2842
2843 // Write-only part
2844
2845 u32 event;
2846
2847 ...
2848
2849}
2850
2851If bit 31 is set in the event field, the device failed to report
2852an event due to missing buffers. In this case, the driver should
2853poll the logical units for unit attention conditions, and/or do
2854whatever form of bus scan is appropriate for the guest operating
2855system.
2856
2857Other data that the device writes to the buffer depends on the
2858contents of the event field. The following events are defined:
2859
2860 No event
2861#define VIRTIO_SCSI_T_NO_EVENT 0
2862
2863 This event is fired in the following cases:
2864
2865 When the device detects in the eventq a buffer that is shorter
2866 than what is indicated in the configuration field, it might
2867 use it immediately and put this dummy value in the event
2868 field. A well-written driver will never observe this
2869 situation.
2870
2871 When events are dropped, the device may signal this event as
2872 soon as the driver makes a buffer available, in order to
2873 request action from the driver. In this case, of course, this
2874 event will be reported with the VIRTIO_SCSI_T_EVENTS_MISSED
2875 flag.
2876
2877 Transport reset
2878#define VIRTIO_SCSI_T_TRANSPORT_RESET 1
2879
2880
2881
2882struct virtio_scsi_event_reset {
2883
2884 // Write-only part
2885
2886 u32 event;
2887
2888 u8 lun[8];
2889
2890 u32 reason;
2891
2892}
2893
2894
2895
2896#define VIRTIO_SCSI_EVT_RESET_HARD 0
2897
2898#define VIRTIO_SCSI_EVT_RESET_RESCAN 1
2899
2900#define VIRTIO_SCSI_EVT_RESET_REMOVED 2
2901
2902 By sending this event, the device signals that a logical unit
2903 on a target has been reset, including the case of a new device
2904 appearing or disappearing on the bus. The device fills in all
2905 fields. The event field is set to
2906 VIRTIO_SCSI_T_TRANSPORT_RESET. The lun field addresses a
2907 logical unit in the SCSI host.
2908
2909 The reason value is one of the three #define values appearing
2910 above:
2911
2912 VIRTIO_SCSI_EVT_RESET_REMOVED (“LUN/target removed”) is used if
2913 the target or logical unit is no longer able to receive
2914 commands.
2915
2916 VIRTIO_SCSI_EVT_RESET_HARD (“LUN hard reset”) is used if the
2917 logical unit has been reset, but is still present.
2918
2919 VIRTIO_SCSI_EVT_RESET_RESCAN (“rescan LUN/target”) is used if a
2920 target or logical unit has just appeared on the device.
2921
2922 The “removed” and “rescan” events, when sent for LUN 0, may
2923 apply to the entire target. After receiving them the driver
2924 should ask the initiator to rescan the target, in order to
2925 detect the case when an entire target has appeared or
2926 disappeared. These two events will never be reported unless the
2927 VIRTIO_SCSI_F_HOTPLUG feature was negotiated between the host
2928 and the guest.
2929
2930 Events will also be reported via sense codes (this obviously
2931 does not apply to newly appeared buses or targets, since the
2932 application has never discovered them):
2933
2934 “LUN/target removed” maps to sense key ILLEGAL REQUEST, asc
2935 0x25, ascq 0x00 (LOGICAL UNIT NOT SUPPORTED)
2936
2937 “LUN hard reset” maps to sense key UNIT ATTENTION, asc 0x29
2938 (POWER ON, RESET OR BUS DEVICE RESET OCCURRED)
2939
2940 “rescan LUN/target” maps to sense key UNIT ATTENTION, asc 0x3f,
2941 ascq 0x0e (REPORTED LUNS DATA HAS CHANGED)
2942
2943 The preferred way to detect transport reset is always to use
2944 events, because sense codes are only seen by the driver when it
2945 sends a SCSI command to the logical unit or target. However, in
2946 case events are dropped, the initiator will still be able to
2947 synchronize with the actual state of the controller if the
2948 driver asks the initiator to rescan the SCSI bus. During the
2949 rescan, the initiator will be able to observe the above sense
2950 codes, and it will process them as if the driver had
2951 received the equivalent event.
2952
2953 Asynchronous notification
2954#define VIRTIO_SCSI_T_ASYNC_NOTIFY 2
2955
2956
2957
2958struct virtio_scsi_event_an {
2959
2960 // Write-only part
2961
2962 u32 event;
2963
2964 u8 lun[8];
2965
2966 u32 reason;
2967
2968}
2969
2970 By sending this event, the device signals that an asynchronous
2971 event was fired from a physical interface.
2972
2973 All fields are written by the device. The event field is set to
2974 VIRTIO_SCSI_T_ASYNC_NOTIFY. The lun field addresses a logical
2975 unit in the SCSI host. The reason field is a subset of the
2976 events that the driver has subscribed to via the "Asynchronous
2977 notification subscription" command.
2978
2979 When dropped events are reported, the driver should poll for
2980 asynchronous events manually using SCSI commands.
2981
2982Appendix X: virtio-mmio
2983
2984Virtual environments without PCI support (a common situation in
2985embedded device models) might use a simple memory mapped device (“
2986virtio-mmio”) instead of the PCI device.
2987
2988The memory mapped virtio device behaviour is based on the PCI
2989device specification. Therefore most operations, such as device
2990initialization, queue configuration and buffer transfers, are
2991nearly identical. Existing differences are described in the
2992following sections.
2993
2994 Device Initialization
2995
2996Instead of using the PCI IO space for the virtio header, the “
2997virtio-mmio” device provides a set of memory mapped control
2998registers, all 32 bits wide, followed by device-specific
2999configuration space. The following list presents their layout:
3000
3001 Offset from the device base address | Direction | Name
3002 Description
3003
3004 0x000 | R | MagicValue
3005 “virt” string.
3006
3007 0x004 | R | Version
3008 Device version number. Currently must be 1.
3009
3010 0x008 | R | DeviceID
3011 Virtio Subsystem Device ID (ie. 1 for network card).
3012
3013 0x00c | R | VendorID
3014 Virtio Subsystem Vendor ID.
3015
3016 0x010 | R | HostFeatures
3017 Flags representing features the device supports.
3018 Reading from this register returns 32 consecutive flag bits,
3019 first bit depending on the last value written to
3020 the HostFeaturesSel register. Access to this register
3021 returns bits HostFeaturesSel*32 to (HostFeaturesSel*32)+31,
3022 eg. feature bits 0 to 31 if HostFeaturesSel is set to 0
3023 and feature bits 32 to 63 if HostFeaturesSel is set
3024 to 1.
3025 Also see [sub:Feature-Bits]
3026
3027 0x014 | W | HostFeaturesSel
3028 Device (Host) features word selection.
3029 Writing to this register selects a set of 32 device feature bits
3030 accessible by reading from HostFeatures register. Device driver
3031 must write a value to the HostFeaturesSel register before
3032 reading from the HostFeatures register.
3033
3034 0x020 | W | GuestFeatures
3035 Flags representing device features understood and activated by
3036 the driver.
3037 Writing to this register sets 32 consecutive flag bits, first
3038 bit depending on the last value written to GuestFeaturesSel
3039 register. Access to this register sets bits
3040 GuestFeaturesSel*32 to (GuestFeaturesSel*32)+31,
3041 eg. feature bits 0 to 31 if GuestFeaturesSel is set to 0
3042 and feature bits 32 to 63 if GuestFeaturesSel is set
3043 to 1.
3044 Also see [sub:Feature-Bits]
3045
3046 0x024 | W | GuestFeaturesSel
3047 Activated (Guest) features word selection.
3048 Writing to this register selects a set of 32 activated feature
3049 bits accessible by writing to the GuestFeatures register.
3050 Device driver must write a value to the GuestFeaturesSel
3051 register before writing to the GuestFeatures register.
3052
3053 0x028 | W | GuestPageSize
3054 Guest page size.
3055 Device driver must write the guest page size in bytes to the
3056 register during initialization, before any queues are used.
3057 This value must be a power of 2 and is used by the Host to
3058 calculate Guest address of the first queue page (see QueuePFN).
3059
3060 0x030 | W | QueueSel
3061 Virtual queue index (first queue is 0).
3062 Writing to this register selects the virtual queue that the
3063 following operations on QueueNum, QueueAlign and QueuePFN apply
3064 to.
3065
3066 0x034 | R | QueueNumMax
3067 Maximum virtual queue size.
3068 Reading from the register returns the maximum size of the queue
3069 the Host is ready to process or zero (0x0) if the queue is not
3070 available. This applies to the queue selected by writing to
3071 QueueSel and is allowed only when QueuePFN is set to zero
3072 (0x0), so when the queue is not actively used.
3073
3074 0x038 | W | QueueNum
3075 Virtual queue size.
3076 Queue size is the number of elements in the queue, which determines
3077 the size of the descriptor table and of both available and used rings.
3078 Writing to this register notifies the Host what size of the
3079 queue the Guest will use. This applies to the queue selected by
3080 writing to QueueSel.
3081
3082 0x03c | W | QueueAlign
3083 Used Ring alignment in the virtual queue.
3084 Writing to this register notifies the Host about alignment
3085 boundary of the Used Ring in bytes. This value must be a power
3086 of 2 and applies to the queue selected by writing to QueueSel.
3087
3088 0x040 | RW | QueuePFN
3089 Guest physical page number of the virtual queue.
3090 Writing to this register notifies the Host about the location of
3091 the virtual queue in the Guest's physical address space. This value
3092 is the index number of a page starting with the queue
3093 Descriptor Table. Value zero (0x0) means physical address zero
3094 (0x00000000) and is illegal. When the Guest stops using the
3095 queue it must write zero (0x0) to this register.
3096 Reading from this register returns the currently used page
3097 number of the queue, therefore a value other than zero (0x0)
3098 means that the queue is in use.
3099 Both read and write accesses apply to the queue selected by
3100 writing to QueueSel.
3101
3102 0x050 | W | QueueNotify
3103 Queue notifier.
3104 Writing a queue index to this register notifies the Host that
3105 there are new buffers to process in the queue.
3106
3107 0x060 | R | InterruptStatus
3108 Interrupt status.
3109 Reading from this register returns a bit mask of interrupts
3110 asserted by the device. An interrupt is asserted if the
3111 corresponding bit is set, ie. equals one (1).
3112
3113 Bit 0 | Used Ring Update
3114This interrupt is asserted when the Host has updated the Used
3115 Ring in at least one of the active virtual queues.
3116
3117 Bit 1 | Configuration change
3118This interrupt is asserted when configuration of the device has
3119 changed.
3120
3121 0x064 | W | InterruptACK
3122 Interrupt acknowledge.
3123 Writing to this register notifies the Host that the Guest
3124 finished handling interrupts. Set bits in the value clear the
3125 corresponding bits of the InterruptStatus register.
3126
3127 0x070 | RW | Status
3128 Device status.
3129 Reading from this register returns the current device status
3130 flags.
3131 Writing non-zero values to this register sets the status flags,
3132 indicating the Guest progress. Writing zero (0x0) to this
3133 register triggers a device reset.
3134 Also see [sub:Device-Initialization-Sequence]
3135
3136 0x100+ | RW | Config
3137 Device-specific configuration space starts at offset 0x100
3138 and is accessed with byte alignment. Its meaning and size
3139 depends on the device and the driver.
3140
3141Virtual queue size is the number of elements in the queue, which
3142determines the size of the descriptor table and of both available
3143and used rings.
3144
3145The endianness of the registers follows the native endianness of
3146the Guest. Writing to registers described as “R” and reading from
3147registers described as “W” is not permitted and can cause
3148undefined behavior.
3149
3150The device initialization is performed as described in [sub:Device-Initialization-Sequence]
3151 with one exception: the Guest must notify the Host about its
3152page size, writing the size in bytes to the GuestPageSize register
3153before the initialization is finished.
3154
3155The memory mapped virtio devices generate a single interrupt only,
3156therefore no special configuration is required.
3157
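To illustrate the feature registers described above, a driver could read the full 64-bit device feature set and write back the features it accepts roughly as follows. This is a sketch only: the register offsets restate the list above, readl()/writel() stand for the platform's 32-bit MMIO accessors, and base is the mapped register block:

#define VIRTIO_MMIO_HOST_FEATURES      0x010
#define VIRTIO_MMIO_HOST_FEATURES_SEL  0x014
#define VIRTIO_MMIO_GUEST_FEATURES     0x020
#define VIRTIO_MMIO_GUEST_FEATURES_SEL 0x024

static u64 read_host_features(void *base)
{
        u64 features;

        writel(0, base + VIRTIO_MMIO_HOST_FEATURES_SEL);   /* select bits 0-31 */
        features = readl(base + VIRTIO_MMIO_HOST_FEATURES);
        writel(1, base + VIRTIO_MMIO_HOST_FEATURES_SEL);   /* select bits 32-63 */
        features |= (u64)readl(base + VIRTIO_MMIO_HOST_FEATURES) << 32;
        return features;
}

static void write_guest_features(void *base, u64 features)
{
        writel(0, base + VIRTIO_MMIO_GUEST_FEATURES_SEL);
        writel((u32)features, base + VIRTIO_MMIO_GUEST_FEATURES);
        writel(1, base + VIRTIO_MMIO_GUEST_FEATURES_SEL);
        writel((u32)(features >> 32), base + VIRTIO_MMIO_GUEST_FEATURES);
}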
3158 Virtqueue Configuration
3159
3160The virtual queue configuration is performed in a similar way to
3161the one described in [sec:Virtqueue-Configuration] with a few
3162additional operations:
3163
3164 Select the queue writing its index (first queue is 0) to the
3165 QueueSel register.
3166
3167 Check if the queue is not already in use: read the QueuePFN
3168 register; the returned value should be zero (0x0).
3169
3170 Read maximum queue size (number of elements) from the
3171 QueueNumMax register. If the returned value is zero (0x0) the
3172 queue is not available.
3173
3174 Allocate and zero the queue pages in contiguous virtual memory,
3175 aligning the Used Ring to an optimal boundary (usually page
3176 size). Size of the allocated queue may be smaller than or equal
3177 to the maximum size returned by the Host.
3178
3179 Notify the Host about the queue size by writing the size to
3180 QueueNum register.
3181
3182 Notify the Host about the used alignment by writing its value
3183 in bytes to QueueAlign register.
3184
3185 Write the physical number of the first page of the queue to the
3186 QueuePFN register.
3187
3188The queue and the device are ready to begin normal operations
3189now.
3190
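The procedure above can be condensed into a sketch like the following (same assumptions as the earlier register sketch; alloc_queue_pages() is a hypothetical allocator returning the physical address of zeroed, contiguous queue pages, and page_size is the value already written to GuestPageSize):

#define VIRTIO_MMIO_QUEUE_SEL     0x030
#define VIRTIO_MMIO_QUEUE_NUM_MAX 0x034
#define VIRTIO_MMIO_QUEUE_NUM     0x038
#define VIRTIO_MMIO_QUEUE_ALIGN   0x03c
#define VIRTIO_MMIO_QUEUE_PFN     0x040

static int setup_queue(void *base, u32 index, u32 page_size)
{
        u32 num;
        unsigned long queue_phys;

        writel(index, base + VIRTIO_MMIO_QUEUE_SEL);
        if (readl(base + VIRTIO_MMIO_QUEUE_PFN) != 0)
                return -1;                           /* queue already in use */

        num = readl(base + VIRTIO_MMIO_QUEUE_NUM_MAX);
        if (num == 0)
                return -1;                           /* queue not available */

        queue_phys = alloc_queue_pages(num, page_size);

        writel(num, base + VIRTIO_MMIO_QUEUE_NUM);
        writel(page_size, base + VIRTIO_MMIO_QUEUE_ALIGN);  /* Used Ring alignment */
        writel(queue_phys / page_size, base + VIRTIO_MMIO_QUEUE_PFN);
        return 0;
}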
3191 Device Operation
3192
3193The memory mapped virtio device behaves in the same way as
3194described in [sec:Device-Operation], with the following
3195exceptions:
3196
3197 The device is notified about new buffers available in a queue
3198 by writing the queue index to the QueueNotify register instead of the
3199 virtio header in PCI I/O space ([sub:Notifying-The-Device]).
3200
3201 The memory mapped virtio device uses a single, dedicated
3202 interrupt signal, which is raised when at least one of the
3203 interrupts described in the InterruptStatus register
3204 description is asserted. After receiving an interrupt, the
3205 driver must read the InterruptStatus register to check what
3206 caused the interrupt (see the register description). After the
3207 interrupt is handled, the driver must acknowledge it by writing
3208 a bit mask corresponding to the serviced interrupt to the
3209 InterruptACK register.
3210
diff --git a/MAINTAINERS b/MAINTAINERS
index ee468fac7dbf..b57e2765a342 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8743,6 +8743,7 @@ F: drivers/virtio/
8743F: drivers/net/virtio_net.c 8743F: drivers/net/virtio_net.c
8744F: drivers/block/virtio_blk.c 8744F: drivers/block/virtio_blk.c
8745F: include/linux/virtio_*.h 8745F: include/linux/virtio_*.h
8746F: include/uapi/linux/virtio_*.h
8746 8747
8747VIRTIO HOST (VHOST) 8748VIRTIO HOST (VHOST)
8748M: "Michael S. Tsirkin" <mst@redhat.com> 8749M: "Michael S. Tsirkin" <mst@redhat.com>
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 0d97deba1e35..e2d4a4afa8c3 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -11,18 +11,11 @@
11 11
12#define GUEST_PL 1 12#define GUEST_PL 1
13 13
14/* Every guest maps the core switcher code. */ 14/* Page for Switcher text itself, then two pages per cpu */
15#define SHARED_SWITCHER_PAGES \ 15#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids)
16 DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) 16
17/* Pages for switcher itself, then two pages per cpu */ 17/* Where we map the Switcher, in both Host and Guest. */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) 18extern unsigned long switcher_addr;
19
20/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */
21#ifdef CONFIG_X86_PAE
22#define SWITCHER_ADDR 0xFFE00000
23#else
24#define SWITCHER_ADDR 0xFFC00000
25#endif
26 19
27/* Found in switcher.S */ 20/* Found in switcher.S */
28extern unsigned long default_idt_entries[]; 21extern unsigned long default_idt_entries[];
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index dabd221857e1..03cf7179e8ef 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -110,7 +110,7 @@ new_segment:
110 if (!sg) 110 if (!sg)
111 sg = sglist; 111 sg = sglist;
112 else { 112 else {
113 sg->page_link &= ~0x02; 113 sg_unmark_end(sg);
114 sg = sg_next(sg); 114 sg = sg_next(sg);
115 } 115 }
116 116
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 936a110de0b9..5f2448253797 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -143,7 +143,7 @@ new_segment:
143 * termination bit to avoid doing a full 143 * termination bit to avoid doing a full
144 * sg_init_table() in drivers for each command. 144 * sg_init_table() in drivers for each command.
145 */ 145 */
146 (*sg)->page_link &= ~0x02; 146 sg_unmark_end(*sg);
147 *sg = sg_next(*sg); 147 *sg = sg_next(*sg);
148 } 148 }
149 149
diff --git a/drivers/Makefile b/drivers/Makefile
index 33360de63650..8e57688ebd95 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -124,7 +124,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/
124obj-$(CONFIG_OF) += of/ 124obj-$(CONFIG_OF) += of/
125obj-$(CONFIG_SSB) += ssb/ 125obj-$(CONFIG_SSB) += ssb/
126obj-$(CONFIG_BCMA) += bcma/ 126obj-$(CONFIG_BCMA) += bcma/
127obj-$(CONFIG_VHOST_NET) += vhost/ 127obj-$(CONFIG_VHOST_RING) += vhost/
128obj-$(CONFIG_VLYNQ) += vlynq/ 128obj-$(CONFIG_VLYNQ) += vlynq/
129obj-$(CONFIG_STAGING) += staging/ 129obj-$(CONFIG_STAGING) += staging/
130obj-y += platform/ 130obj-y += platform/
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 8ad21a25bc0d..64723953e1c9 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -100,96 +100,103 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
100 return vbr; 100 return vbr;
101} 101}
102 102
103static void virtblk_add_buf_wait(struct virtio_blk *vblk, 103static int __virtblk_add_req(struct virtqueue *vq,
104 struct virtblk_req *vbr, 104 struct virtblk_req *vbr,
105 unsigned long out, 105 struct scatterlist *data_sg,
106 unsigned long in) 106 bool have_data)
107{ 107{
108 DEFINE_WAIT(wait); 108 struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
109 unsigned int num_out = 0, num_in = 0;
110 int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT;
109 111
110 for (;;) { 112 sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
111 prepare_to_wait_exclusive(&vblk->queue_wait, &wait, 113 sgs[num_out++] = &hdr;
112 TASK_UNINTERRUPTIBLE);
113 114
114 spin_lock_irq(vblk->disk->queue->queue_lock); 115 /*
115 if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, 116 * If this is a packet command we need a couple of additional headers.
116 GFP_ATOMIC) < 0) { 117 * Behind the normal outhdr we put a segment with the scsi command
117 spin_unlock_irq(vblk->disk->queue->queue_lock); 118 * block, and before the normal inhdr we put the sense data and the
118 io_schedule(); 119 * inhdr with additional status information.
119 } else { 120 */
120 virtqueue_kick(vblk->vq); 121 if (type == VIRTIO_BLK_T_SCSI_CMD) {
121 spin_unlock_irq(vblk->disk->queue->queue_lock); 122 sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
122 break; 123 sgs[num_out++] = &cmd;
123 } 124 }
124 125
126 if (have_data) {
127 if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
128 sgs[num_out++] = data_sg;
129 else
130 sgs[num_out + num_in++] = data_sg;
125 } 131 }
126 132
127 finish_wait(&vblk->queue_wait, &wait); 133 if (type == VIRTIO_BLK_T_SCSI_CMD) {
134 sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
135 sgs[num_out + num_in++] = &sense;
136 sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
137 sgs[num_out + num_in++] = &inhdr;
138 }
139
140 sg_init_one(&status, &vbr->status, sizeof(vbr->status));
141 sgs[num_out + num_in++] = &status;
142
143 return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
128} 144}
129 145
130static inline void virtblk_add_req(struct virtblk_req *vbr, 146static void virtblk_add_req(struct virtblk_req *vbr, bool have_data)
131 unsigned int out, unsigned int in)
132{ 147{
133 struct virtio_blk *vblk = vbr->vblk; 148 struct virtio_blk *vblk = vbr->vblk;
149 DEFINE_WAIT(wait);
150 int ret;
134 151
135 spin_lock_irq(vblk->disk->queue->queue_lock); 152 spin_lock_irq(vblk->disk->queue->queue_lock);
136 if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, 153 while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg,
137 GFP_ATOMIC) < 0)) { 154 have_data)) < 0)) {
155 prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
156 TASK_UNINTERRUPTIBLE);
157
138 spin_unlock_irq(vblk->disk->queue->queue_lock); 158 spin_unlock_irq(vblk->disk->queue->queue_lock);
139 virtblk_add_buf_wait(vblk, vbr, out, in); 159 io_schedule();
140 return; 160 spin_lock_irq(vblk->disk->queue->queue_lock);
161
162 finish_wait(&vblk->queue_wait, &wait);
141 } 163 }
164
142 virtqueue_kick(vblk->vq); 165 virtqueue_kick(vblk->vq);
143 spin_unlock_irq(vblk->disk->queue->queue_lock); 166 spin_unlock_irq(vblk->disk->queue->queue_lock);
144} 167}
145 168
146static int virtblk_bio_send_flush(struct virtblk_req *vbr) 169static void virtblk_bio_send_flush(struct virtblk_req *vbr)
147{ 170{
148 unsigned int out = 0, in = 0;
149
150 vbr->flags |= VBLK_IS_FLUSH; 171 vbr->flags |= VBLK_IS_FLUSH;
151 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; 172 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
152 vbr->out_hdr.sector = 0; 173 vbr->out_hdr.sector = 0;
153 vbr->out_hdr.ioprio = 0; 174 vbr->out_hdr.ioprio = 0;
154 sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
155 sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status));
156
157 virtblk_add_req(vbr, out, in);
158 175
159 return 0; 176 virtblk_add_req(vbr, false);
160} 177}
161 178
162static int virtblk_bio_send_data(struct virtblk_req *vbr) 179static void virtblk_bio_send_data(struct virtblk_req *vbr)
163{ 180{
164 struct virtio_blk *vblk = vbr->vblk; 181 struct virtio_blk *vblk = vbr->vblk;
165 unsigned int num, out = 0, in = 0;
166 struct bio *bio = vbr->bio; 182 struct bio *bio = vbr->bio;
183 bool have_data;
167 184
168 vbr->flags &= ~VBLK_IS_FLUSH; 185 vbr->flags &= ~VBLK_IS_FLUSH;
169 vbr->out_hdr.type = 0; 186 vbr->out_hdr.type = 0;
170 vbr->out_hdr.sector = bio->bi_sector; 187 vbr->out_hdr.sector = bio->bi_sector;
171 vbr->out_hdr.ioprio = bio_prio(bio); 188 vbr->out_hdr.ioprio = bio_prio(bio);
172 189
173 sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); 190 if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) {
174 191 have_data = true;
175 num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out); 192 if (bio->bi_rw & REQ_WRITE)
176
177 sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
178 sizeof(vbr->status));
179
180 if (num) {
181 if (bio->bi_rw & REQ_WRITE) {
182 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; 193 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
183 out += num; 194 else
184 } else {
185 vbr->out_hdr.type |= VIRTIO_BLK_T_IN; 195 vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
186 in += num; 196 } else
187 } 197 have_data = false;
188 }
189 198
190 virtblk_add_req(vbr, out, in); 199 virtblk_add_req(vbr, have_data);
191
192 return 0;
193} 200}
194 201
195static void virtblk_bio_send_data_work(struct work_struct *work) 202static void virtblk_bio_send_data_work(struct work_struct *work)
@@ -298,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq)
298static bool do_req(struct request_queue *q, struct virtio_blk *vblk, 305static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
299 struct request *req) 306 struct request *req)
300{ 307{
301 unsigned long num, out = 0, in = 0; 308 unsigned int num;
302 struct virtblk_req *vbr; 309 struct virtblk_req *vbr;
303 310
304 vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); 311 vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
@@ -335,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
335 } 342 }
336 } 343 }
337 344
338 sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); 345 num = blk_rq_map_sg(q, vbr->req, vblk->sg);
339
340 /*
341 * If this is a packet command we need a couple of additional headers.
342 * Behind the normal outhdr we put a segment with the scsi command
343 * block, and before the normal inhdr we put the sense data and the
344 * inhdr with additional status information before the normal inhdr.
345 */
346 if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC)
347 sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
348
349 num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
350
351 if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
352 sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
353 sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
354 sizeof(vbr->in_hdr));
355 }
356
357 sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
358 sizeof(vbr->status));
359
360 if (num) { 346 if (num) {
361 if (rq_data_dir(vbr->req) == WRITE) { 347 if (rq_data_dir(vbr->req) == WRITE)
362 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; 348 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
363 out += num; 349 else
364 } else {
365 vbr->out_hdr.type |= VIRTIO_BLK_T_IN; 350 vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
366 in += num;
367 }
368 } 351 }
369 352
370 if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, 353 if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) {
371 GFP_ATOMIC) < 0) {
372 mempool_free(vbr, vblk->pool); 354 mempool_free(vbr, vblk->pool);
373 return false; 355 return false;
374 } 356 }
@@ -539,6 +521,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
539 struct virtio_device *vdev = vblk->vdev; 521 struct virtio_device *vdev = vblk->vdev;
540 struct request_queue *q = vblk->disk->queue; 522 struct request_queue *q = vblk->disk->queue;
541 char cap_str_2[10], cap_str_10[10]; 523 char cap_str_2[10], cap_str_10[10];
524 char *envp[] = { "RESIZE=1", NULL };
542 u64 capacity, size; 525 u64 capacity, size;
543 526
544 mutex_lock(&vblk->config_lock); 527 mutex_lock(&vblk->config_lock);
@@ -568,6 +551,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
568 551
569 set_capacity(vblk->disk, capacity); 552 set_capacity(vblk->disk, capacity);
570 revalidate_disk(vblk->disk); 553 revalidate_disk(vblk->disk);
554 kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
571done: 555done:
572 mutex_unlock(&vblk->config_lock); 556 mutex_unlock(&vblk->config_lock);
573} 557}
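The virtio_blk conversion above is built on the new virtqueue_add_sgs() API from this series: instead of one flat scatterlist with out entries followed by in entries, the caller passes an array of scatterlist pointers plus separate out/in counts. A minimal sketch of the calling convention, with a hypothetical header/status pair standing in for a real request:

#include <linux/scatterlist.h>
#include <linux/virtio.h>

/* Queue one readable header and one writable status byte (illustrative). */
static int example_add_request(struct virtqueue *vq, void *hdr, size_t hdr_len,
			       u8 *status, void *token)
{
	struct scatterlist hdr_sg, status_sg, *sgs[2];
	unsigned int num_out = 0, num_in = 0;

	sg_init_one(&hdr_sg, hdr, hdr_len);
	sgs[num_out++] = &hdr_sg;              /* device reads this */

	sg_init_one(&status_sg, status, sizeof(*status));
	sgs[num_out + num_in++] = &status_sg;  /* device writes this */

	/* All out entries must come before all in entries in sgs[]. */
	return virtqueue_add_sgs(vq, sgs, num_out, num_in, token, GFP_ATOMIC);
}

On success, the token is handed back by virtqueue_get_buf() once the device has consumed the buffers.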
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 6bf4d47324eb..ef46a9cfd832 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size)
47 sg_init_one(&sg, buf, size); 47 sg_init_one(&sg, buf, size);
48 48
49 /* There should always be room for one buffer. */ 49 /* There should always be room for one buffer. */
50 if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0) 50 if (virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL) < 0)
51 BUG(); 51 BUG();
52 52
53 virtqueue_kick(vq); 53 virtqueue_kick(vq);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index ce5f3fc25d6d..1b456fe9b87a 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -78,8 +78,8 @@ struct ports_driver_data {
78}; 78};
79static struct ports_driver_data pdrvdata; 79static struct ports_driver_data pdrvdata;
80 80
81DEFINE_SPINLOCK(pdrvdata_lock); 81static DEFINE_SPINLOCK(pdrvdata_lock);
82DECLARE_COMPLETION(early_console_added); 82static DECLARE_COMPLETION(early_console_added);
83 83
84/* This struct holds information that's relevant only for console ports */ 84/* This struct holds information that's relevant only for console ports */
85struct console { 85struct console {
@@ -503,7 +503,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf)
503 503
504 sg_init_one(sg, buf->buf, buf->size); 504 sg_init_one(sg, buf->buf, buf->size);
505 505
506 ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC); 506 ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC);
507 virtqueue_kick(vq); 507 virtqueue_kick(vq);
508 if (!ret) 508 if (!ret)
509 ret = vq->num_free; 509 ret = vq->num_free;
@@ -572,7 +572,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
572 sg_init_one(sg, &cpkt, sizeof(cpkt)); 572 sg_init_one(sg, &cpkt, sizeof(cpkt));
573 573
574 spin_lock(&portdev->c_ovq_lock); 574 spin_lock(&portdev->c_ovq_lock);
575 if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) { 575 if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) {
576 virtqueue_kick(vq); 576 virtqueue_kick(vq);
577 while (!virtqueue_get_buf(vq, &len)) 577 while (!virtqueue_get_buf(vq, &len))
578 cpu_relax(); 578 cpu_relax();
@@ -622,7 +622,7 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
622 622
623 reclaim_consumed_buffers(port); 623 reclaim_consumed_buffers(port);
624 624
625 err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC); 625 err = virtqueue_add_outbuf(out_vq, sg, nents, data, GFP_ATOMIC);
626 626
627 /* Tell Host to go! */ 627 /* Tell Host to go! */
628 virtqueue_kick(out_vq); 628 virtqueue_kick(out_vq);
@@ -1040,7 +1040,7 @@ static int port_fops_open(struct inode *inode, struct file *filp)
1040 spin_lock_irq(&port->inbuf_lock); 1040 spin_lock_irq(&port->inbuf_lock);
1041 if (port->guest_connected) { 1041 if (port->guest_connected) {
1042 spin_unlock_irq(&port->inbuf_lock); 1042 spin_unlock_irq(&port->inbuf_lock);
1043 ret = -EMFILE; 1043 ret = -EBUSY;
1044 goto out; 1044 goto out;
1045 } 1045 }
1046 1046
@@ -1202,7 +1202,7 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
1202 return hvc_instantiate(0, 0, &hv_ops); 1202 return hvc_instantiate(0, 0, &hv_ops);
1203} 1203}
1204 1204
1205int init_port_console(struct port *port) 1205static int init_port_console(struct port *port)
1206{ 1206{
1207 int ret; 1207 int ret;
1208 1208
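For the common single-direction case, the console and RNG hunks above switch to the new virtqueue_add_inbuf()/virtqueue_add_outbuf() helpers rather than open-coding out/in counts. A small sketch of both directions (buffer and queue names are illustrative):

#include <linux/scatterlist.h>
#include <linux/virtio.h>

/* Hand the device a buffer it may write into (e.g. incoming console data). */
static int example_post_inbuf(struct virtqueue *in_vq, void *buf, size_t len)
{
	struct scatterlist sg;

	sg_init_one(&sg, buf, len);
	return virtqueue_add_inbuf(in_vq, &sg, 1, buf, GFP_KERNEL);
}

/* Hand the device a buffer it will only read (e.g. outgoing console data). */
static int example_post_outbuf(struct virtqueue *out_vq, void *buf, size_t len)
{
	struct scatterlist sg;

	sg_init_one(&sg, buf, len);
	return virtqueue_add_outbuf(out_vq, &sg, 1, buf, GFP_KERNEL);
}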
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 89875ea19ade..ee035ec4526b 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -5,10 +5,9 @@ config LGUEST
5 ---help--- 5 ---help---
6 This is a very simple module which allows you to run 6 This is a very simple module which allows you to run
7 multiple instances of the same Linux kernel, using the 7 multiple instances of the same Linux kernel, using the
8 "lguest" command found in the Documentation/virtual/lguest 8 "lguest" command found in the tools/lguest directory.
9 directory.
10 9
11 Note that "lguest" is pronounced to rhyme with "fell quest", 10 Note that "lguest" is pronounced to rhyme with "fell quest",
12 not "rustyvisor". See Documentation/virtual/lguest/lguest.txt. 11 not "rustyvisor". See tools/lguest/lguest.txt.
13 12
14 If unsure, say N. If curious, say M. If masochistic, say Y. 13 If unsure, say N. If curious, say M. If masochistic, say Y.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index a5ebc0083d87..0bf1e4edf04d 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -20,9 +20,9 @@
20#include <asm/asm-offsets.h> 20#include <asm/asm-offsets.h>
21#include "lg.h" 21#include "lg.h"
22 22
23 23unsigned long switcher_addr;
24struct page **lg_switcher_pages;
24static struct vm_struct *switcher_vma; 25static struct vm_struct *switcher_vma;
25static struct page **switcher_page;
26 26
27/* This One Big lock protects all inter-guest data structures. */ 27/* This One Big lock protects all inter-guest data structures. */
28DEFINE_MUTEX(lguest_lock); 28DEFINE_MUTEX(lguest_lock);
@@ -52,13 +52,21 @@ static __init int map_switcher(void)
52 * easy. 52 * easy.
53 */ 53 */
54 54
55 /* We assume Switcher text fits into a single page. */
56 if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
57 printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
58 end_switcher_text - start_switcher_text);
59 return -EINVAL;
60 }
61
55 /* 62 /*
56 * We allocate an array of struct page pointers. map_vm_area() wants 63 * We allocate an array of struct page pointers. map_vm_area() wants
57 * this, rather than just an array of pages. 64 * this, rather than just an array of pages.
58 */ 65 */
59 switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, 66 lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
60 GFP_KERNEL); 67 * TOTAL_SWITCHER_PAGES,
61 if (!switcher_page) { 68 GFP_KERNEL);
69 if (!lg_switcher_pages) {
62 err = -ENOMEM; 70 err = -ENOMEM;
63 goto out; 71 goto out;
64 } 72 }
@@ -68,32 +76,29 @@ static __init int map_switcher(void)
68 * so we make sure they're zeroed. 76 * so we make sure they're zeroed.
69 */ 77 */
70 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { 78 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
71 switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); 79 lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
72 if (!switcher_page[i]) { 80 if (!lg_switcher_pages[i]) {
73 err = -ENOMEM; 81 err = -ENOMEM;
74 goto free_some_pages; 82 goto free_some_pages;
75 } 83 }
76 } 84 }
77 85
78 /* 86 /*
79 * First we check that the Switcher won't overlap the fixmap area at 87 * We place the Switcher underneath the fixmap area, which is the
80 * the top of memory. It's currently nowhere near, but it could have 88 * highest virtual address we can get. This is important, since we
81 * very strange effects if it ever happened. 89 * tell the Guest it can't access this memory, so we want its ceiling
90 * as high as possible.
82 */ 91 */
83 if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ 92 switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
84 err = -ENOMEM;
85 printk("lguest: mapping switcher would thwack fixmap\n");
86 goto free_pages;
87 }
88 93
89 /* 94 /*
90 * Now we reserve the "virtual memory area" we want: 0xFFC00000 95 * Now we reserve the "virtual memory area" we want. We might
91 * (SWITCHER_ADDR). We might not get it in theory, but in practice 96 * not get it in theory, but in practice it's worked so far.
92 * it's worked so far. The end address needs +1 because __get_vm_area 97 * The end address needs +1 because __get_vm_area allocates an
93 * allocates an extra guard page, so we need space for that. 98 * extra guard page, so we need space for that.
94 */ 99 */
95 switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, 100 switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
96 VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR 101 VM_ALLOC, switcher_addr, switcher_addr
97 + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); 102 + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
98 if (!switcher_vma) { 103 if (!switcher_vma) {
99 err = -ENOMEM; 104 err = -ENOMEM;
@@ -103,12 +108,12 @@ static __init int map_switcher(void)
103 108
104 /* 109 /*
105 * This code actually sets up the pages we've allocated to appear at 110 * This code actually sets up the pages we've allocated to appear at
106 * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the 111 * switcher_addr. map_vm_area() takes the vma we allocated above, the
107 * kind of pages we're mapping (kernel pages), and a pointer to our 112 * kind of pages we're mapping (kernel pages), and a pointer to our
108 * array of struct pages. It increments that pointer, but we don't 113 * array of struct pages. It increments that pointer, but we don't
109 * care. 114 * care.
110 */ 115 */
111 pagep = switcher_page; 116 pagep = lg_switcher_pages;
112 err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); 117 err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
113 if (err) { 118 if (err) {
114 printk("lguest: map_vm_area failed: %i\n", err); 119 printk("lguest: map_vm_area failed: %i\n", err);
@@ -133,8 +138,8 @@ free_pages:
133 i = TOTAL_SWITCHER_PAGES; 138 i = TOTAL_SWITCHER_PAGES;
134free_some_pages: 139free_some_pages:
135 for (--i; i >= 0; i--) 140 for (--i; i >= 0; i--)
136 __free_pages(switcher_page[i], 0); 141 __free_pages(lg_switcher_pages[i], 0);
137 kfree(switcher_page); 142 kfree(lg_switcher_pages);
138out: 143out:
139 return err; 144 return err;
140} 145}
@@ -149,8 +154,8 @@ static void unmap_switcher(void)
149 vunmap(switcher_vma->addr); 154 vunmap(switcher_vma->addr);
150 /* Now we just need to free the pages we copied the switcher into */ 155 /* Now we just need to free the pages we copied the switcher into */
151 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) 156 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
152 __free_pages(switcher_page[i], 0); 157 __free_pages(lg_switcher_pages[i], 0);
153 kfree(switcher_page); 158 kfree(lg_switcher_pages);
154} 159}
155 160
156/*H:032 161/*H:032
@@ -323,15 +328,10 @@ static int __init init(void)
323 if (err) 328 if (err)
324 goto out; 329 goto out;
325 330
326 /* Now we set up the pagetable implementation for the Guests. */
327 err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
328 if (err)
329 goto unmap;
330
331 /* We might need to reserve an interrupt vector. */ 331 /* We might need to reserve an interrupt vector. */
332 err = init_interrupts(); 332 err = init_interrupts();
333 if (err) 333 if (err)
334 goto free_pgtables; 334 goto unmap;
335 335
336 /* /dev/lguest needs to be registered. */ 336 /* /dev/lguest needs to be registered. */
337 err = lguest_device_init(); 337 err = lguest_device_init();
@@ -346,8 +346,6 @@ static int __init init(void)
346 346
347free_interrupts: 347free_interrupts:
348 free_interrupts(); 348 free_interrupts();
349free_pgtables:
350 free_pagetables();
351unmap: 349unmap:
352 unmap_switcher(); 350 unmap_switcher();
353out: 351out:
@@ -359,7 +357,6 @@ static void __exit fini(void)
359{ 357{
360 lguest_device_remove(); 358 lguest_device_remove();
361 free_interrupts(); 359 free_interrupts();
362 free_pagetables();
363 unmap_switcher(); 360 unmap_switcher();
364 361
365 lguest_arch_host_fini(); 362 lguest_arch_host_fini();
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 295df06e6590..2eef40be4c04 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -14,11 +14,10 @@
14 14
15#include <asm/lguest.h> 15#include <asm/lguest.h>
16 16
17void free_pagetables(void);
18int init_pagetables(struct page **switcher_page, unsigned int pages);
19
20struct pgdir { 17struct pgdir {
21 unsigned long gpgdir; 18 unsigned long gpgdir;
19 bool switcher_mapped;
20 int last_host_cpu;
22 pgd_t *pgdir; 21 pgd_t *pgdir;
23}; 22};
24 23
@@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg,
124 unsigned long addr, unsigned long len); 123 unsigned long addr, unsigned long len);
125void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); 124void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
126void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); 125void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
126extern struct page **lg_switcher_pages;
127 127
128/*H:035 128/*H:035
129 * Using memory-copy operations like that is usually inconvient, so we 129 * Using memory-copy operations like that is usually inconvient, so we
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index ff4a0bc9904d..4263f4cc8c55 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
250 */ 250 */
251static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) 251static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
252{ 252{
253 /* We have a limited number the number of CPUs in the lguest struct. */ 253 /* We have a limited number of CPUs in the lguest struct. */
254 if (id >= ARRAY_SIZE(cpu->lg->cpus)) 254 if (id >= ARRAY_SIZE(cpu->lg->cpus))
255 return -EINVAL; 255 return -EINVAL;
256 256
257 /* Set up this CPU's id, and pointer back to the lguest struct. */ 257 /* Set up this CPU's id, and pointer back to the lguest struct. */
258 cpu->id = id; 258 cpu->id = id;
259 cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); 259 cpu->lg = container_of(cpu, struct lguest, cpus[id]);
260 cpu->lg->nr_cpus++; 260 cpu->lg->nr_cpus++;
261 261
262 /* Each CPU has a timer it can set. */ 262 /* Each CPU has a timer it can set. */
@@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
270 if (!cpu->regs_page) 270 if (!cpu->regs_page)
271 return -ENOMEM; 271 return -ENOMEM;
272 272
273 /* We actually put the registers at the bottom of the page. */ 273 /* We actually put the registers at the end of the page. */
274 cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); 274 cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
275 275
276 /* 276 /*
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 864baabaee25..699187ab3800 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -7,7 +7,7 @@
7 * converted Guest pages when running the Guest. 7 * converted Guest pages when running the Guest.
8:*/ 8:*/
9 9
10/* Copyright (C) Rusty Russell IBM Corporation 2006. 10/* Copyright (C) Rusty Russell IBM Corporation 2013.
11 * GPL v2 and any later version */ 11 * GPL v2 and any later version */
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/gfp.h> 13#include <linux/gfp.h>
@@ -62,22 +62,11 @@
62 * will need the last pmd entry of the last pmd page. 62 * will need the last pmd entry of the last pmd page.
63 */ 63 */
64#ifdef CONFIG_X86_PAE 64#ifdef CONFIG_X86_PAE
65#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
66#define RESERVE_MEM 2U
67#define CHECK_GPGD_MASK _PAGE_PRESENT 65#define CHECK_GPGD_MASK _PAGE_PRESENT
68#else 66#else
69#define RESERVE_MEM 4U
70#define CHECK_GPGD_MASK _PAGE_TABLE 67#define CHECK_GPGD_MASK _PAGE_TABLE
71#endif 68#endif
72 69
73/*
74 * We actually need a separate PTE page for each CPU. Remember that after the
75 * Switcher code itself comes two pages for each CPU, and we don't want this
76 * CPU's guest to see the pages of any other CPU.
77 */
78static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
79#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
80
81/*H:320 70/*H:320
82 * The page table code is curly enough to need helper functions to keep it 71 * The page table code is curly enough to need helper functions to keep it
83 * clear and clean. The kernel itself provides many of them; one advantage 72 * clear and clean. The kernel itself provides many of them; one advantage
@@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
95{ 84{
96 unsigned int index = pgd_index(vaddr); 85 unsigned int index = pgd_index(vaddr);
97 86
98#ifndef CONFIG_X86_PAE
99 /* We kill any Guest trying to touch the Switcher addresses. */
100 if (index >= SWITCHER_PGD_INDEX) {
101 kill_guest(cpu, "attempt to access switcher pages");
102 index = 0;
103 }
104#endif
105 /* Return a pointer index'th pgd entry for the i'th page table. */ 87 /* Return a pointer index'th pgd entry for the i'th page table. */
106 return &cpu->lg->pgdirs[i].pgdir[index]; 88 return &cpu->lg->pgdirs[i].pgdir[index];
107} 89}
@@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
117 unsigned int index = pmd_index(vaddr); 99 unsigned int index = pmd_index(vaddr);
118 pmd_t *page; 100 pmd_t *page;
119 101
120 /* We kill any Guest trying to touch the Switcher addresses. */
121 if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
122 index >= SWITCHER_PMD_INDEX) {
123 kill_guest(cpu, "attempt to access switcher pages");
124 index = 0;
125 }
126
127 /* You should never call this if the PGD entry wasn't valid */ 102 /* You should never call this if the PGD entry wasn't valid */
128 BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); 103 BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
129 page = __va(pgd_pfn(spgd) << PAGE_SHIFT); 104 page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
@@ -275,122 +250,177 @@ static void release_pte(pte_t pte)
275} 250}
276/*:*/ 251/*:*/
277 252
278static void check_gpte(struct lg_cpu *cpu, pte_t gpte) 253static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
279{ 254{
280 if ((pte_flags(gpte) & _PAGE_PSE) || 255 if ((pte_flags(gpte) & _PAGE_PSE) ||
281 pte_pfn(gpte) >= cpu->lg->pfn_limit) 256 pte_pfn(gpte) >= cpu->lg->pfn_limit) {
282 kill_guest(cpu, "bad page table entry"); 257 kill_guest(cpu, "bad page table entry");
258 return false;
259 }
260 return true;
283} 261}
284 262
285static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) 263static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
286{ 264{
287 if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || 265 if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
288 (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) 266 (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
289 kill_guest(cpu, "bad page directory entry"); 267 kill_guest(cpu, "bad page directory entry");
268 return false;
269 }
270 return true;
290} 271}
291 272
292#ifdef CONFIG_X86_PAE 273#ifdef CONFIG_X86_PAE
293static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) 274static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
294{ 275{
295 if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || 276 if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
296 (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) 277 (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
297 kill_guest(cpu, "bad page middle directory entry"); 278 kill_guest(cpu, "bad page middle directory entry");
279 return false;
280 }
281 return true;
298} 282}
299#endif 283#endif
300 284
301/*H:330 285/*H:331
302 * (i) Looking up a page table entry when the Guest faults. 286 * This is the core routine to walk the shadow page tables and find the page
303 * 287 * table entry for a specific address.
304 * We saw this call in run_guest(): when we see a page fault in the Guest, we
305 * come here. That's because we only set up the shadow page tables lazily as
306 * they're needed, so we get page faults all the time and quietly fix them up
307 * and return to the Guest without it knowing.
308 * 288 *
309 * If we fixed up the fault (ie. we mapped the address), this routine returns 289 * If allocate is set, then we allocate any missing levels, setting the flags
310 * true. Otherwise, it was a real fault and we need to tell the Guest. 290 * on the new page directory and mid-level directories using the arguments
291 * (which are copied from the Guest's page table entries).
311 */ 292 */
312bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) 293static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
294 int pgd_flags, int pmd_flags)
313{ 295{
314 pgd_t gpgd;
315 pgd_t *spgd; 296 pgd_t *spgd;
316 unsigned long gpte_ptr;
317 pte_t gpte;
318 pte_t *spte;
319
320 /* Mid level for PAE. */ 297 /* Mid level for PAE. */
321#ifdef CONFIG_X86_PAE 298#ifdef CONFIG_X86_PAE
322 pmd_t *spmd; 299 pmd_t *spmd;
323 pmd_t gpmd;
324#endif 300#endif
325 301
326 /* First step: get the top-level Guest page table entry. */ 302 /* Get top level entry. */
327 if (unlikely(cpu->linear_pages)) {
328 /* Faking up a linear mapping. */
329 gpgd = __pgd(CHECK_GPGD_MASK);
330 } else {
331 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
332 /* Toplevel not present? We can't map it in. */
333 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
334 return false;
335 }
336
337 /* Now look at the matching shadow entry. */
338 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); 303 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
339 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 304 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
340 /* No shadow entry: allocate a new shadow PTE page. */ 305 /* No shadow entry: allocate a new shadow PTE page. */
341 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 306 unsigned long ptepage;
307
308 /* If they didn't want us to allocate anything, stop. */
309 if (!allocate)
310 return NULL;
311
312 ptepage = get_zeroed_page(GFP_KERNEL);
342 /* 313 /*
343 * This is not really the Guest's fault, but killing it is 314 * This is not really the Guest's fault, but killing it is
344 * simple for this corner case. 315 * simple for this corner case.
345 */ 316 */
346 if (!ptepage) { 317 if (!ptepage) {
347 kill_guest(cpu, "out of memory allocating pte page"); 318 kill_guest(cpu, "out of memory allocating pte page");
348 return false; 319 return NULL;
349 } 320 }
350 /* We check that the Guest pgd is OK. */
351 check_gpgd(cpu, gpgd);
352 /* 321 /*
353 * And we copy the flags to the shadow PGD entry. The page 322 * And we copy the flags to the shadow PGD entry. The page
354 * number in the shadow PGD is the page we just allocated. 323 * number in the shadow PGD is the page we just allocated.
355 */ 324 */
356 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); 325 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
357 } 326 }
358 327
328 /*
329 * Intel's Physical Address Extension actually uses three levels of
330 * page tables, so we need to look in the mid-level.
331 */
359#ifdef CONFIG_X86_PAE 332#ifdef CONFIG_X86_PAE
360 if (unlikely(cpu->linear_pages)) { 333 /* Now look at the mid-level shadow entry. */
361 /* Faking up a linear mapping. */
362 gpmd = __pmd(_PAGE_TABLE);
363 } else {
364 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
365 /* Middle level not present? We can't map it in. */
366 if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
367 return false;
368 }
369
370 /* Now look at the matching shadow entry. */
371 spmd = spmd_addr(cpu, *spgd, vaddr); 334 spmd = spmd_addr(cpu, *spgd, vaddr);
372 335
373 if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { 336 if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
374 /* No shadow entry: allocate a new shadow PTE page. */ 337 /* No shadow entry: allocate a new shadow PTE page. */
375 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 338 unsigned long ptepage;
339
340 /* If they didn't want us to allocate anything, stop. */
341 if (!allocate)
342 return NULL;
343
344 ptepage = get_zeroed_page(GFP_KERNEL);
376 345
377 /* 346 /*
378 * This is not really the Guest's fault, but killing it is 347 * This is not really the Guest's fault, but killing it is
379 * simple for this corner case. 348 * simple for this corner case.
380 */ 349 */
381 if (!ptepage) { 350 if (!ptepage) {
382 kill_guest(cpu, "out of memory allocating pte page"); 351 kill_guest(cpu, "out of memory allocating pmd page");
383 return false; 352 return NULL;
384 } 353 }
385 354
386 /* We check that the Guest pmd is OK. */
387 check_gpmd(cpu, gpmd);
388
389 /* 355 /*
390 * And we copy the flags to the shadow PMD entry. The page 356 * And we copy the flags to the shadow PMD entry. The page
391 * number in the shadow PMD is the page we just allocated. 357 * number in the shadow PMD is the page we just allocated.
392 */ 358 */
393 set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); 359 set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
360 }
361#endif
362
363 /* Get the pointer to the shadow PTE entry we're going to set. */
364 return spte_addr(cpu, *spgd, vaddr);
365}
366
367/*H:330
368 * (i) Looking up a page table entry when the Guest faults.
369 *
370 * We saw this call in run_guest(): when we see a page fault in the Guest, we
371 * come here. That's because we only set up the shadow page tables lazily as
372 * they're needed, so we get page faults all the time and quietly fix them up
373 * and return to the Guest without it knowing.
374 *
375 * If we fixed up the fault (ie. we mapped the address), this routine returns
376 * true. Otherwise, it was a real fault and we need to tell the Guest.
377 */
378bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
379{
380 unsigned long gpte_ptr;
381 pte_t gpte;
382 pte_t *spte;
383 pmd_t gpmd;
384 pgd_t gpgd;
385
386 /* We never demand page the Switcher, so trying is a mistake. */
387 if (vaddr >= switcher_addr)
388 return false;
389
390 /* First step: get the top-level Guest page table entry. */
391 if (unlikely(cpu->linear_pages)) {
392 /* Faking up a linear mapping. */
393 gpgd = __pgd(CHECK_GPGD_MASK);
394 } else {
395 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
396 /* Toplevel not present? We can't map it in. */
397 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
398 return false;
399
400 /*
401 * This kills the Guest if it has weird flags or tries to
402 * refer to a "physical" address outside the bounds.
403 */
404 if (!check_gpgd(cpu, gpgd))
405 return false;
406 }
407
408 /* This "mid-level" entry is only used for non-linear, PAE mode. */
409 gpmd = __pmd(_PAGE_TABLE);
410
411#ifdef CONFIG_X86_PAE
412 if (likely(!cpu->linear_pages)) {
413 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
414 /* Middle level not present? We can't map it in. */
415 if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
416 return false;
417
418 /*
419 * This kills the Guest if it has weird flags or tries to
420 * refer to a "physical" address outside the bounds.
421 */
422 if (!check_gpmd(cpu, gpmd))
423 return false;
394 } 424 }
395 425
396 /* 426 /*
@@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
433 * Check that the Guest PTE flags are OK, and the page number is below 463 * Check that the Guest PTE flags are OK, and the page number is below
434 * the pfn_limit (ie. not mapping the Launcher binary). 464 * the pfn_limit (ie. not mapping the Launcher binary).
435 */ 465 */
436 check_gpte(cpu, gpte); 466 if (!check_gpte(cpu, gpte))
467 return false;
437 468
438 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 469 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
439 gpte = pte_mkyoung(gpte); 470 gpte = pte_mkyoung(gpte);
@@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
441 gpte = pte_mkdirty(gpte); 472 gpte = pte_mkdirty(gpte);
442 473
443 /* Get the pointer to the shadow PTE entry we're going to set. */ 474 /* Get the pointer to the shadow PTE entry we're going to set. */
444 spte = spte_addr(cpu, *spgd, vaddr); 475 spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
476 if (!spte)
477 return false;
445 478
446 /* 479 /*
447 * If there was a valid shadow PTE entry here before, we release it. 480 * If there was a valid shadow PTE entry here before, we release it.
@@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
493 */ 526 */
494static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) 527static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
495{ 528{
496 pgd_t *spgd; 529 pte_t *spte;
497 unsigned long flags; 530 unsigned long flags;
498 531
499#ifdef CONFIG_X86_PAE 532 /* You can't put your stack in the Switcher! */
500 pmd_t *spmd; 533 if (vaddr >= switcher_addr)
501#endif
502 /* Look at the current top level entry: is it present? */
503 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
504 if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
505 return false; 534 return false;
506 535
507#ifdef CONFIG_X86_PAE 536 /* If there's no shadow PTE, it's not writable. */
508 spmd = spmd_addr(cpu, *spgd, vaddr); 537 spte = find_spte(cpu, vaddr, false, 0, 0);
509 if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) 538 if (!spte)
510 return false; 539 return false;
511#endif
512 540
513 /* 541 /*
514 * Check the flags on the pte entry itself: it must be present and 542 * Check the flags on the pte entry itself: it must be present and
515 * writable. 543 * writable.
516 */ 544 */
517 flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); 545 flags = pte_flags(*spte);
518
519 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 546 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
520} 547}
521 548
@@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
678 int *blank_pgdir) 705 int *blank_pgdir)
679{ 706{
680 unsigned int next; 707 unsigned int next;
681#ifdef CONFIG_X86_PAE
682 pmd_t *pmd_table;
683#endif
684 708
685 /* 709 /*
686 * We pick one entry at random to throw out. Choosing the Least 710 * We pick one entry at random to throw out. Choosing the Least
@@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
695 if (!cpu->lg->pgdirs[next].pgdir) 719 if (!cpu->lg->pgdirs[next].pgdir)
696 next = cpu->cpu_pgd; 720 next = cpu->cpu_pgd;
697 else { 721 else {
698#ifdef CONFIG_X86_PAE
699 /* 722 /*
700 * In PAE mode, allocate a pmd page and populate the 723 * This is a blank page, so there are no kernel
701 * last pgd entry. 724 * mappings: caller must map the stack!
702 */ 725 */
703 pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
704 if (!pmd_table) {
705 free_page((long)cpu->lg->pgdirs[next].pgdir);
706 set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
707 next = cpu->cpu_pgd;
708 } else {
709 set_pgd(cpu->lg->pgdirs[next].pgdir +
710 SWITCHER_PGD_INDEX,
711 __pgd(__pa(pmd_table) | _PAGE_PRESENT));
712 /*
713 * This is a blank page, so there are no kernel
714 * mappings: caller must map the stack!
715 */
716 *blank_pgdir = 1;
717 }
718#else
719 *blank_pgdir = 1; 726 *blank_pgdir = 1;
720#endif
721 } 727 }
722 } 728 }
723 /* Record which Guest toplevel this shadows. */ 729 /* Record which Guest toplevel this shadows. */
@@ -725,9 +731,50 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
725 /* Release all the non-kernel mappings. */ 731 /* Release all the non-kernel mappings. */
726 flush_user_mappings(cpu->lg, next); 732 flush_user_mappings(cpu->lg, next);
727 733
734 /* This hasn't run on any CPU at all. */
735 cpu->lg->pgdirs[next].last_host_cpu = -1;
736
728 return next; 737 return next;
729} 738}
730 739
740/*H:501
741 * We do need the Switcher code mapped at all times, so we allocate that
742 * part of the Guest page table here. We map the Switcher code immediately,
743 * but defer mapping of the guest register page and IDT/LDT etc page until
744 * just before we run the guest in map_switcher_in_guest().
745 *
746 * We *could* do this setup in map_switcher_in_guest(), but at that point
 747 * we have interrupts disabled, and allocating pages like that is fraught: we
748 * can't sleep if we need to free up some memory.
749 */
750static bool allocate_switcher_mapping(struct lg_cpu *cpu)
751{
752 int i;
753
754 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
755 pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
756 CHECK_GPGD_MASK, _PAGE_TABLE);
757 if (!pte)
758 return false;
759
760 /*
761 * Map the switcher page if not already there. It might
762 * already be there because we call allocate_switcher_mapping()
763 * in guest_set_pgd() just in case it did discard our Switcher
764 * mapping, but it probably didn't.
765 */
766 if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
767 /* Get a reference to the Switcher page. */
768 get_page(lg_switcher_pages[0]);
 769 /* Create a read-only, executable, kernel-style PTE */
770 set_pte(pte,
771 mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
772 }
773 }
774 cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
775 return true;
776}
777
731/*H:470 778/*H:470
732 * Finally, a routine which throws away everything: all PGD entries in all 779 * Finally, a routine which throws away everything: all PGD entries in all
733 * the shadow page tables, including the Guest's kernel mappings. This is used 780 * the shadow page tables, including the Guest's kernel mappings. This is used
@@ -738,28 +785,16 @@ static void release_all_pagetables(struct lguest *lg)
738 unsigned int i, j; 785 unsigned int i, j;
739 786
740 /* Every shadow pagetable this Guest has */ 787 /* Every shadow pagetable this Guest has */
741 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 788 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
742 if (lg->pgdirs[i].pgdir) { 789 if (!lg->pgdirs[i].pgdir)
743#ifdef CONFIG_X86_PAE 790 continue;
744 pgd_t *spgd; 791
745 pmd_t *pmdpage; 792 /* Every PGD entry. */
746 unsigned int k; 793 for (j = 0; j < PTRS_PER_PGD; j++)
747 794 release_pgd(lg->pgdirs[i].pgdir + j);
748 /* Get the last pmd page. */ 795 lg->pgdirs[i].switcher_mapped = false;
749 spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; 796 lg->pgdirs[i].last_host_cpu = -1;
750 pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 797 }
751
752 /*
753 * And release the pmd entries of that pmd page,
754 * except for the switcher pmd.
755 */
756 for (k = 0; k < SWITCHER_PMD_INDEX; k++)
757 release_pmd(&pmdpage[k]);
758#endif
759 /* Every PGD entry except the Switcher at the top */
760 for (j = 0; j < SWITCHER_PGD_INDEX; j++)
761 release_pgd(lg->pgdirs[i].pgdir + j);
762 }
763} 798}
764 799
765/* 800/*
@@ -773,6 +808,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
773 release_all_pagetables(cpu->lg); 808 release_all_pagetables(cpu->lg);
774 /* We need the Guest kernel stack mapped again. */ 809 /* We need the Guest kernel stack mapped again. */
775 pin_stack_pages(cpu); 810 pin_stack_pages(cpu);
811 /* And we need Switcher allocated. */
812 if (!allocate_switcher_mapping(cpu))
813 kill_guest(cpu, "Cannot populate switcher mapping");
776} 814}
777 815
778/*H:430 816/*H:430
@@ -808,9 +846,17 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
808 newpgdir = new_pgdir(cpu, pgtable, &repin); 846 newpgdir = new_pgdir(cpu, pgtable, &repin);
809 /* Change the current pgd index to the new one. */ 847 /* Change the current pgd index to the new one. */
810 cpu->cpu_pgd = newpgdir; 848 cpu->cpu_pgd = newpgdir;
811 /* If it was completely blank, we map in the Guest kernel stack */ 849 /*
850 * If it was completely blank, we map in the Guest kernel stack and
851 * the Switcher.
852 */
812 if (repin) 853 if (repin)
813 pin_stack_pages(cpu); 854 pin_stack_pages(cpu);
855
856 if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
857 if (!allocate_switcher_mapping(cpu))
858 kill_guest(cpu, "Cannot populate switcher mapping");
859 }
814} 860}
815/*:*/ 861/*:*/
816 862
@@ -865,7 +911,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
865 * micro-benchmark. 911 * micro-benchmark.
866 */ 912 */
867 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 913 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
868 check_gpte(cpu, gpte); 914 if (!check_gpte(cpu, gpte))
915 return;
869 set_pte(spte, 916 set_pte(spte,
870 gpte_to_spte(cpu, gpte, 917 gpte_to_spte(cpu, gpte,
871 pte_flags(gpte) & _PAGE_DIRTY)); 918 pte_flags(gpte) & _PAGE_DIRTY));
@@ -897,6 +944,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
897void guest_set_pte(struct lg_cpu *cpu, 944void guest_set_pte(struct lg_cpu *cpu,
898 unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 945 unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
899{ 946{
947 /* We don't let you remap the Switcher; we need it to get back! */
948 if (vaddr >= switcher_addr) {
949 kill_guest(cpu, "attempt to set pte into Switcher pages");
950 return;
951 }
952
900 /* 953 /*
901 * Kernel mappings must be changed on all top levels. Slow, but doesn't 954 * Kernel mappings must be changed on all top levels. Slow, but doesn't
902 * happen often. 955 * happen often.
@@ -933,14 +986,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
933{ 986{
934 int pgdir; 987 int pgdir;
935 988
936 if (idx >= SWITCHER_PGD_INDEX) 989 if (idx > PTRS_PER_PGD) {
990 kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
991 idx, PTRS_PER_PGD);
937 return; 992 return;
993 }
938 994
939 /* If they're talking about a page table we have a shadow for... */ 995 /* If they're talking about a page table we have a shadow for... */
940 pgdir = find_pgdir(lg, gpgdir); 996 pgdir = find_pgdir(lg, gpgdir);
941 if (pgdir < ARRAY_SIZE(lg->pgdirs)) 997 if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
942 /* ... throw it away. */ 998 /* ... throw it away. */
943 release_pgd(lg->pgdirs[pgdir].pgdir + idx); 999 release_pgd(lg->pgdirs[pgdir].pgdir + idx);
1000 /* That might have been the Switcher mapping, remap it. */
1001 if (!allocate_switcher_mapping(&lg->cpus[0])) {
1002 kill_guest(&lg->cpus[0],
1003 "Cannot populate switcher mapping");
1004 }
1005 }
944} 1006}
945 1007
946#ifdef CONFIG_X86_PAE 1008#ifdef CONFIG_X86_PAE
@@ -958,6 +1020,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
958 * we will populate on future faults. The Guest doesn't have any actual 1020 * we will populate on future faults. The Guest doesn't have any actual
959 * pagetables yet, so we set linear_pages to tell demand_page() to fake it 1021 * pagetables yet, so we set linear_pages to tell demand_page() to fake it
960 * for the moment. 1022 * for the moment.
1023 *
1024 * We do need the Switcher to be mapped at all times, so we allocate that
1025 * part of the Guest page table here.
961 */ 1026 */
962int init_guest_pagetable(struct lguest *lg) 1027int init_guest_pagetable(struct lguest *lg)
963{ 1028{
@@ -971,21 +1036,34 @@ int init_guest_pagetable(struct lguest *lg)
971 1036
972 /* We start with a linear mapping until the initialize. */ 1037 /* We start with a linear mapping until the initialize. */
973 cpu->linear_pages = true; 1038 cpu->linear_pages = true;
1039
1040 /* Allocate the page tables for the Switcher. */
1041 if (!allocate_switcher_mapping(cpu)) {
1042 release_all_pagetables(lg);
1043 return -ENOMEM;
1044 }
1045
974 return 0; 1046 return 0;
975} 1047}
976 1048
977/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 1049/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
978void page_table_guest_data_init(struct lg_cpu *cpu) 1050void page_table_guest_data_init(struct lg_cpu *cpu)
979{ 1051{
1052 /*
1053 * We tell the Guest that it can't use the virtual addresses
1054 * used by the Switcher. This trick is equivalent to 4GB -
1055 * switcher_addr.
1056 */
1057 u32 top = ~switcher_addr + 1;
1058
980 /* We get the kernel address: above this is all kernel memory. */ 1059 /* We get the kernel address: above this is all kernel memory. */
981 if (get_user(cpu->lg->kernel_address, 1060 if (get_user(cpu->lg->kernel_address,
982 &cpu->lg->lguest_data->kernel_address) 1061 &cpu->lg->lguest_data->kernel_address)
983 /* 1062 /*
984 * We tell the Guest that it can't use the top 2 or 4 MB 1063 * We tell the Guest that it can't use the top virtual
985 * of virtual addresses used by the Switcher. 1064 * addresses (used by the Switcher).
986 */ 1065 */
987 || put_user(RESERVE_MEM * 1024 * 1024, 1066 || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
988 &cpu->lg->lguest_data->reserve_mem)) {
989 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 1067 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
990 return; 1068 return;
991 } 1069 }
@@ -995,12 +1073,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
995 * "pgd_index(lg->kernel_address)". This assumes it won't hit the 1073 * "pgd_index(lg->kernel_address)". This assumes it won't hit the
996 * Switcher mappings, so check that now. 1074 * Switcher mappings, so check that now.
997 */ 1075 */
998#ifdef CONFIG_X86_PAE 1076 if (cpu->lg->kernel_address >= switcher_addr)
999 if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
1000 pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
1001#else
1002 if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
1003#endif
1004 kill_guest(cpu, "bad kernel address %#lx", 1077 kill_guest(cpu, "bad kernel address %#lx",
1005 cpu->lg->kernel_address); 1078 cpu->lg->kernel_address);
1006} 1079}
@@ -1017,102 +1090,96 @@ void free_guest_pagetable(struct lguest *lg)
1017 free_page((long)lg->pgdirs[i].pgdir); 1090 free_page((long)lg->pgdirs[i].pgdir);
1018} 1091}
1019 1092
1020/*H:480 1093/*H:481
1021 * (vi) Mapping the Switcher when the Guest is about to run. 1094 * This clears the Switcher mappings for cpu #i.
1022 *
1023 * The Switcher and the two pages for this CPU need to be visible in the
1024 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
1025 * for each CPU already set up, we just need to hook them in now we know which
1026 * Guest is about to run on this CPU.
1027 */ 1095 */
1028void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) 1096static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
1029{ 1097{
1030 pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages); 1098 unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
1031 pte_t regs_pte; 1099 pte_t *pte;
1032 1100
1033#ifdef CONFIG_X86_PAE 1101 /* Clear the mappings for both pages. */
1034 pmd_t switcher_pmd; 1102 pte = find_spte(cpu, base, false, 0, 0);
1035 pmd_t *pmd_table; 1103 release_pte(*pte);
1036 1104 set_pte(pte, __pte(0));
1037 switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT,
1038 PAGE_KERNEL_EXEC);
1039
1040 /* Figure out where the pmd page is, by reading the PGD, and converting
1041 * it to a virtual address. */
1042 pmd_table = __va(pgd_pfn(cpu->lg->
1043 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
1044 << PAGE_SHIFT);
1045 /* Now write it into the shadow page table. */
1046 set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
1047#else
1048 pgd_t switcher_pgd;
1049 1105
1050 /* 1106 pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
1051 * Make the last PGD entry for this Guest point to the Switcher's PTE 1107 release_pte(*pte);
1052 * page for this CPU (with appropriate flags). 1108 set_pte(pte, __pte(0));
1053 */
1054 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
1055
1056 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
1057
1058#endif
1059 /*
1060 * We also change the Switcher PTE page. When we're running the Guest,
1061 * we want the Guest's "regs" page to appear where the first Switcher
1062 * page for this CPU is. This is an optimization: when the Switcher
1063 * saves the Guest registers, it saves them into the first page of this
1064 * CPU's "struct lguest_pages": if we make sure the Guest's register
1065 * page is already mapped there, we don't have to copy them out
1066 * again.
1067 */
1068 regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
1069 set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
1070} 1109}
1071/*:*/
1072 1110
1073static void free_switcher_pte_pages(void) 1111/*H:480
1074{ 1112 * (vi) Mapping the Switcher when the Guest is about to run.
1075 unsigned int i; 1113 *
1076 1114 * The Switcher and the two pages for this CPU need to be visible in the Guest
1077 for_each_possible_cpu(i) 1115 * (and not the pages for other CPUs).
1078 free_page((long)switcher_pte_page(i));
1079}
1080
1081/*H:520
1082 * Setting up the Switcher PTE page for given CPU is fairly easy, given
1083 * the CPU number and the "struct page"s for the Switcher code itself.
1084 * 1116 *
1085 * Currently the Switcher is less than a page long, so "pages" is always 1. 1117 * The pages for the pagetables have all been allocated before: we just need
1118 * to make sure the actual PTEs are up-to-date for the CPU we're about to run
1119 * on.
1086 */ 1120 */
1087static __init void populate_switcher_pte_page(unsigned int cpu, 1121void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
1088 struct page *switcher_page[],
1089 unsigned int pages)
1090{ 1122{
1091 unsigned int i; 1123 unsigned long base;
1092 pte_t *pte = switcher_pte_page(cpu); 1124 struct page *percpu_switcher_page, *regs_page;
1125 pte_t *pte;
1126 struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
1127
1128 /* Switcher page should always be mapped by now! */
1129 BUG_ON(!pgdir->switcher_mapped);
1130
1131 /*
1132 * Remember that we have two pages for each Host CPU, so we can run a
1133 * Guest on each CPU without them interfering. We need to make sure
1134 * those pages are mapped correctly in the Guest, but since we usually
1135 * run on the same CPU, we cache that, and only update the mappings
1136 * when we move.
1137 */
1138 if (pgdir->last_host_cpu == raw_smp_processor_id())
1139 return;
1093 1140
1094 /* The first entries are easy: they map the Switcher code. */ 1141 /* -1 means unknown so we remove everything. */
1095 for (i = 0; i < pages; i++) { 1142 if (pgdir->last_host_cpu == -1) {
1096 set_pte(&pte[i], mk_pte(switcher_page[i], 1143 unsigned int i;
1097 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); 1144 for_each_possible_cpu(i)
1145 remove_switcher_percpu_map(cpu, i);
1146 } else {
1147 /* We know exactly what CPU mapping to remove. */
1148 remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
1098 } 1149 }
1099 1150
1100 /* The only other thing we map is this CPU's pair of pages. */ 1151 /*
1101 i = pages + cpu*2; 1152 * When we're running the Guest, we want the Guest's "regs" page to
1102 1153 * appear where the first Switcher page for this CPU is. This is an
1103 /* First page (Guest registers) is writable from the Guest */ 1154 * optimization: when the Switcher saves the Guest registers, it saves
1104 set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), 1155 * them into the first page of this CPU's "struct lguest_pages": if we
1105 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); 1156 * make sure the Guest's register page is already mapped there, we
1157 * don't have to copy them out again.
1158 */
1159 /* Find the shadow PTE for this regs page. */
1160 base = switcher_addr + PAGE_SIZE
1161 + raw_smp_processor_id() * sizeof(struct lguest_pages);
1162 pte = find_spte(cpu, base, false, 0, 0);
1163 regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
1164 get_page(regs_page);
1165 set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
1106 1166
1107 /* 1167 /*
1108 * The second page contains the "struct lguest_ro_state", and is 1168 * We map the second page of the struct lguest_pages read-only in
1109 * read-only. 1169 * the Guest: the IDT, GDT and other things it's not supposed to
1170 * change.
1110 */ 1171 */
1111 set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), 1172 pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
1112 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); 1173 percpu_switcher_page
1174 = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
1175 get_page(percpu_switcher_page);
1176 set_pte(pte, mk_pte(percpu_switcher_page,
1177 __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
1178
1179 pgdir->last_host_cpu = raw_smp_processor_id();
1113} 1180}
1114 1181
1115/* 1182/*H:490
1116 * We've made it through the page table code. Perhaps our tired brains are 1183 * We've made it through the page table code. Perhaps our tired brains are
1117 * still processing the details, or perhaps we're simply glad it's over. 1184 * still processing the details, or perhaps we're simply glad it's over.
1118 * 1185 *
@@ -1124,29 +1191,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
1124 * 1191 *
1125 * There is just one file remaining in the Host. 1192 * There is just one file remaining in the Host.
1126 */ 1193 */
1127
1128/*H:510
1129 * At boot or module load time, init_pagetables() allocates and populates
1130 * the Switcher PTE page for each CPU.
1131 */
1132__init int init_pagetables(struct page **switcher_page, unsigned int pages)
1133{
1134 unsigned int i;
1135
1136 for_each_possible_cpu(i) {
1137 switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
1138 if (!switcher_pte_page(i)) {
1139 free_switcher_pte_pages();
1140 return -ENOMEM;
1141 }
1142 populate_switcher_pte_page(i, switcher_page, pages);
1143 }
1144 return 0;
1145}
1146/*:*/
1147
1148/* Cleaning up simply involves freeing the PTE page for each CPU. */
1149void free_pagetables(void)
1150{
1151 free_switcher_pte_pages();
1152}
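
The key property of map_switcher_in_guest() above is that the per-pgdir Switcher mapping is rebuilt lazily: the shadow PTEs for the per-CPU pages are only rewritten when the Guest runs on a different Host CPU than last time, and last_host_cpu == -1 means "unknown, scrub every CPU's slot". A rough sketch of that caching pattern, where map_cpu_pages() is a hypothetical stand-in for the two set_pte() calls in the patch:

/* Sketch only: the shape of the last_host_cpu cache used above. */
static void map_switcher_sketch(struct lg_cpu *cpu, struct pgdir *pgdir)
{
	int me = raw_smp_processor_id();

	if (pgdir->last_host_cpu == me)
		return;				/* Fast path: mapping still valid. */

	if (pgdir->last_host_cpu == -1) {
		/* Unknown state: remove every CPU's per-CPU mapping. */
		unsigned int i;

		for_each_possible_cpu(i)
			remove_switcher_percpu_map(cpu, i);
	} else {
		/* We know exactly which CPU's mapping to remove. */
		remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
	}

	/* Hypothetical helper: maps the regs page (RW) and the RO state page. */
	map_cpu_pages(cpu, me);

	pgdir->last_host_cpu = me;
}
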
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 4af12e1844d5..f0a3347b6441 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -59,14 +59,13 @@ static struct {
59/* Offset from where switcher.S was compiled to where we've copied it */ 59/* Offset from where switcher.S was compiled to where we've copied it */
60static unsigned long switcher_offset(void) 60static unsigned long switcher_offset(void)
61{ 61{
62 return SWITCHER_ADDR - (unsigned long)start_switcher_text; 62 return switcher_addr - (unsigned long)start_switcher_text;
63} 63}
64 64
65/* This cpu's struct lguest_pages. */ 65/* This cpu's struct lguest_pages (after the Switcher text page) */
66static struct lguest_pages *lguest_pages(unsigned int cpu) 66static struct lguest_pages *lguest_pages(unsigned int cpu)
67{ 67{
68 return &(((struct lguest_pages *) 68 return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
69 (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
70} 69}
71 70
72static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); 71static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig
index a966128c2a7a..7ffc756131a2 100644
--- a/drivers/net/caif/Kconfig
+++ b/drivers/net/caif/Kconfig
@@ -40,3 +40,17 @@ config CAIF_HSI
40 The caif low level driver for CAIF over HSI. 40 The caif low level driver for CAIF over HSI.
41 Be aware that if you enable this then you also need to 41 Be aware that if you enable this then you also need to
42 enable a low-level HSI driver. 42 enable a low-level HSI driver.
43
44config CAIF_VIRTIO
45 tristate "CAIF virtio transport driver"
46 depends on CAIF
47 select VHOST_RING
48 select VIRTIO
49 select GENERIC_ALLOCATOR
50 default n
51 ---help---
52 The caif driver for CAIF over Virtio.
53
54if CAIF_VIRTIO
55source "drivers/vhost/Kconfig"
56endif
diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile
index 15a9d2fc753d..9bbd45391f6c 100644
--- a/drivers/net/caif/Makefile
+++ b/drivers/net/caif/Makefile
@@ -9,3 +9,6 @@ obj-$(CONFIG_CAIF_SPI_SLAVE) += cfspi_slave.o
9 9
10# HSI interface 10# HSI interface
11obj-$(CONFIG_CAIF_HSI) += caif_hsi.o 11obj-$(CONFIG_CAIF_HSI) += caif_hsi.o
12
13# Virtio interface
14obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o
diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
new file mode 100644
index 000000000000..b9ed1288ce2d
--- /dev/null
+++ b/drivers/net/caif/caif_virtio.c
@@ -0,0 +1,790 @@
1/*
2 * Copyright (C) ST-Ericsson AB 2013
3 * Authors: Vicram Arv
4 * Dmitry Tarnyagin <dmitry.tarnyagin@lockless.no>
5 * Sjur Brendeland
6 * License terms: GNU General Public License (GPL) version 2
7 */
8#include <linux/module.h>
9#include <linux/if_arp.h>
10#include <linux/virtio.h>
11#include <linux/vringh.h>
12#include <linux/debugfs.h>
13#include <linux/spinlock.h>
14#include <linux/genalloc.h>
15#include <linux/interrupt.h>
16#include <linux/netdevice.h>
17#include <linux/rtnetlink.h>
18#include <linux/virtio_ids.h>
19#include <linux/virtio_caif.h>
20#include <linux/virtio_ring.h>
21#include <linux/dma-mapping.h>
22#include <net/caif/caif_dev.h>
23#include <linux/virtio_config.h>
24
25MODULE_LICENSE("GPL v2");
26MODULE_AUTHOR("Vicram Arv");
27MODULE_AUTHOR("Sjur Brendeland");
28MODULE_DESCRIPTION("Virtio CAIF Driver");
29
30/* NAPI schedule quota */
31#define CFV_DEFAULT_QUOTA 32
32
33/* Defaults used if virtio config space is unavailable */
34#define CFV_DEF_MTU_SIZE 4096
35#define CFV_DEF_HEADROOM 32
36#define CFV_DEF_TAILROOM 32
37
38/* Required IP header alignment */
39#define IP_HDR_ALIGN 4
40
41/* struct cfv_napi_context - NAPI context info
42 * @riov: IOV holding data read from the ring. Note that riov may
43 * still hold data when cfv_rx_poll() returns.
44 * @head: Last descriptor ID we received from vringh_getdesc_kern.
45 * We use this to put the descriptor back on the used ring. USHRT_MAX is
46 * used to indicate an invalid head-id.
47 */
48struct cfv_napi_context {
49 struct vringh_kiov riov;
50 unsigned short head;
51};
52
53/* struct cfv_stats - statistics for debugfs
54 * @rx_napi_complete: Number of NAPI completions (RX)
55 * @rx_napi_resched: Number of calls where the full quota was used (RX)
56 * @rx_nomem: Number of SKB alloc failures (RX)
57 * @rx_kicks: Number of RX kicks
58 * @tx_full_ring: Number of times the TX ring was full
59 * @tx_no_mem: Number of times TX went out of memory
60 * @tx_flow_on: Number of flow-on events (TX)
61 * @tx_kicks: Number of TX kicks
62 */
63struct cfv_stats {
64 u32 rx_napi_complete;
65 u32 rx_napi_resched;
66 u32 rx_nomem;
67 u32 rx_kicks;
68 u32 tx_full_ring;
69 u32 tx_no_mem;
70 u32 tx_flow_on;
71 u32 tx_kicks;
72};
73
74/* struct cfv_info - Caif Virtio control structure
75 * @cfdev: caif common header
76 * @vdev: Associated virtio device
77 * @vr_rx: rx/downlink host vring
78 * @vq_tx: tx/uplink virtqueue
79 * @ndev: CAIF link layer device
80 * @watermark_tx: indicates the number of free descriptors needed
81 * to reopen the TX queues after overload.
82 * @tx_lock: protects vq_tx from concurrent use
83 * @tx_release_tasklet: Tasklet for freeing consumed TX buffers
84 * @napi: Napi context used in cfv_rx_poll()
85 * @ctx: Context data used in cfv_rx_poll()
86 * @tx_hr: transmit headroom
87 * @rx_hr: receive headroom
88 * @tx_tr: transmit tail room
89 * @rx_tr: receive tail room
90 * @mtu: transmit max size
91 * @mru: receive max size
92 * @allocsz: size of dma memory reserved for TX buffers
93 * @alloc_addr: virtual address to dma memory for TX buffers
94 * @alloc_dma: dma address to dma memory for TX buffers
95 * @genpool: Gen Pool used for allocating TX buffers
96 * @reserved_mem: Pointer to memory reserve allocated from genpool
97 * @reserved_size: Size of memory reserve allocated from genpool
98 * @stats: Statistics exposed in debugfs
99 * @debugfs: Debugfs dentry for statistic counters
100 */
101struct cfv_info {
102 struct caif_dev_common cfdev;
103 struct virtio_device *vdev;
104 struct vringh *vr_rx;
105 struct virtqueue *vq_tx;
106 struct net_device *ndev;
107 unsigned int watermark_tx;
108 /* Protect access to vq_tx */
109 spinlock_t tx_lock;
110 struct tasklet_struct tx_release_tasklet;
111 struct napi_struct napi;
112 struct cfv_napi_context ctx;
113 u16 tx_hr;
114 u16 rx_hr;
115 u16 tx_tr;
116 u16 rx_tr;
117 u32 mtu;
118 u32 mru;
119 size_t allocsz;
120 void *alloc_addr;
121 dma_addr_t alloc_dma;
122 struct gen_pool *genpool;
123 unsigned long reserved_mem;
124 size_t reserved_size;
125 struct cfv_stats stats;
126 struct dentry *debugfs;
127};
128
129/* struct buf_info - maintains transmit buffer data handle
130 * @size: size of transmit buffer
131 * @dma_handle: handle to allocated dma device memory area
132 * @vaddr: virtual address mapping to allocated memory area
133 */
134struct buf_info {
135 size_t size;
136 u8 *vaddr;
137};
138
139/* Called from virtio device, in IRQ context */
140static void cfv_release_cb(struct virtqueue *vq_tx)
141{
142 struct cfv_info *cfv = vq_tx->vdev->priv;
143
144 ++cfv->stats.tx_kicks;
145 tasklet_schedule(&cfv->tx_release_tasklet);
146}
147
148static void free_buf_info(struct cfv_info *cfv, struct buf_info *buf_info)
149{
150 if (!buf_info)
151 return;
152 gen_pool_free(cfv->genpool, (unsigned long) buf_info->vaddr,
153 buf_info->size);
154 kfree(buf_info);
155}
156
157/* This is invoked whenever the remote processor has completed processing
158 * a TX msg we just sent, and has put the buffer back on the used ring.
159 */
160static void cfv_release_used_buf(struct virtqueue *vq_tx)
161{
162 struct cfv_info *cfv = vq_tx->vdev->priv;
163 unsigned long flags;
164
165 BUG_ON(vq_tx != cfv->vq_tx);
166
167 for (;;) {
168 unsigned int len;
169 struct buf_info *buf_info;
170
171 /* Get used buffer from used ring to recycle used descriptors */
172 spin_lock_irqsave(&cfv->tx_lock, flags);
173 buf_info = virtqueue_get_buf(vq_tx, &len);
174 spin_unlock_irqrestore(&cfv->tx_lock, flags);
175
176 /* Stop looping if there are no more buffers to free */
177 if (!buf_info)
178 break;
179
180 free_buf_info(cfv, buf_info);
181
182 /* watermark_tx indicates if we previously stopped the tx
183		 * queues. If we have enough free slots in the virtio ring,
184 * re-establish memory reserved and open up tx queues.
185 */
186 if (cfv->vq_tx->num_free <= cfv->watermark_tx)
187 continue;
188
189 /* Re-establish memory reserve */
190 if (cfv->reserved_mem == 0 && cfv->genpool)
191 cfv->reserved_mem =
192 gen_pool_alloc(cfv->genpool,
193 cfv->reserved_size);
194
195 /* Open up the tx queues */
196 if (cfv->reserved_mem) {
197 cfv->watermark_tx =
198 virtqueue_get_vring_size(cfv->vq_tx);
199 netif_tx_wake_all_queues(cfv->ndev);
200 /* Buffers are recycled in cfv_netdev_tx, so
201 * disable notifications when queues are opened.
202 */
203 virtqueue_disable_cb(cfv->vq_tx);
204 ++cfv->stats.tx_flow_on;
205 } else {
206 /* if no memory reserve, wait for more free slots */
207 WARN_ON(cfv->watermark_tx >
208 virtqueue_get_vring_size(cfv->vq_tx));
209 cfv->watermark_tx +=
210 virtqueue_get_vring_size(cfv->vq_tx) / 4;
211 }
212 }
213}
214
215/* Allocate a SKB and copy packet data to it */
216static struct sk_buff *cfv_alloc_and_copy_skb(int *err,
217 struct cfv_info *cfv,
218 u8 *frm, u32 frm_len)
219{
220 struct sk_buff *skb;
221 u32 cfpkt_len, pad_len;
222
223 *err = 0;
224	/* Verify that the frame fits the MRU and exceeds the down-link headers */
225 if (frm_len > cfv->mru || frm_len <= cfv->rx_hr + cfv->rx_tr) {
226 netdev_err(cfv->ndev,
227 "Invalid frmlen:%u mtu:%u hr:%d tr:%d\n",
228 frm_len, cfv->mru, cfv->rx_hr,
229 cfv->rx_tr);
230 *err = -EPROTO;
231 return NULL;
232 }
233
234 cfpkt_len = frm_len - (cfv->rx_hr + cfv->rx_tr);
235 pad_len = (unsigned long)(frm + cfv->rx_hr) & (IP_HDR_ALIGN - 1);
236
237 skb = netdev_alloc_skb(cfv->ndev, frm_len + pad_len);
238 if (!skb) {
239 *err = -ENOMEM;
240 return NULL;
241 }
242
243 skb_reserve(skb, cfv->rx_hr + pad_len);
244
245 memcpy(skb_put(skb, cfpkt_len), frm + cfv->rx_hr, cfpkt_len);
246 return skb;
247}
248
249/* Get packets from the host vring */
250static int cfv_rx_poll(struct napi_struct *napi, int quota)
251{
252 struct cfv_info *cfv = container_of(napi, struct cfv_info, napi);
253 int rxcnt = 0;
254 int err = 0;
255 void *buf;
256 struct sk_buff *skb;
257 struct vringh_kiov *riov = &cfv->ctx.riov;
258 unsigned int skb_len;
259
260again:
261 do {
262 skb = NULL;
263
264 /* Put the previous iovec back on the used ring and
265 * fetch a new iovec if we have processed all elements.
266 */
267 if (riov->i == riov->used) {
268 if (cfv->ctx.head != USHRT_MAX) {
269 vringh_complete_kern(cfv->vr_rx,
270 cfv->ctx.head,
271 0);
272 cfv->ctx.head = USHRT_MAX;
273 }
274
275 err = vringh_getdesc_kern(
276 cfv->vr_rx,
277 riov,
278 NULL,
279 &cfv->ctx.head,
280 GFP_ATOMIC);
281
282 if (err <= 0)
283 goto exit;
284 }
285
286 buf = phys_to_virt((unsigned long) riov->iov[riov->i].iov_base);
287 /* TODO: Add check on valid buffer address */
288
289 skb = cfv_alloc_and_copy_skb(&err, cfv, buf,
290 riov->iov[riov->i].iov_len);
291 if (unlikely(err))
292 goto exit;
293
294 /* Push received packet up the stack. */
295 skb_len = skb->len;
296 skb->protocol = htons(ETH_P_CAIF);
297 skb_reset_mac_header(skb);
298 skb->dev = cfv->ndev;
299 err = netif_receive_skb(skb);
300 if (unlikely(err)) {
301 ++cfv->ndev->stats.rx_dropped;
302 } else {
303 ++cfv->ndev->stats.rx_packets;
304 cfv->ndev->stats.rx_bytes += skb_len;
305 }
306
307 ++riov->i;
308 ++rxcnt;
309 } while (rxcnt < quota);
310
311 ++cfv->stats.rx_napi_resched;
312 goto out;
313
314exit:
315 switch (err) {
316 case 0:
317 ++cfv->stats.rx_napi_complete;
318
319		/* Really out of packets? (stolen from virtio_net) */
320 napi_complete(napi);
321 if (unlikely(!vringh_notify_enable_kern(cfv->vr_rx)) &&
322 napi_schedule_prep(napi)) {
323 vringh_notify_disable_kern(cfv->vr_rx);
324 __napi_schedule(napi);
325 goto again;
326 }
327 break;
328
329 case -ENOMEM:
330 ++cfv->stats.rx_nomem;
331 dev_kfree_skb(skb);
332 /* Stop NAPI poll on OOM, we hope to be polled later */
333 napi_complete(napi);
334 vringh_notify_enable_kern(cfv->vr_rx);
335 break;
336
337 default:
338 /* We're doomed, any modem fault is fatal */
339 netdev_warn(cfv->ndev, "Bad ring, disable device\n");
340 cfv->ndev->stats.rx_dropped = riov->used - riov->i;
341 napi_complete(napi);
342 vringh_notify_disable_kern(cfv->vr_rx);
343 netif_carrier_off(cfv->ndev);
344 break;
345 }
346out:
347 if (rxcnt && vringh_need_notify_kern(cfv->vr_rx) > 0)
348 vringh_notify(cfv->vr_rx);
349 return rxcnt;
350}
351
352static void cfv_recv(struct virtio_device *vdev, struct vringh *vr_rx)
353{
354 struct cfv_info *cfv = vdev->priv;
355
356 ++cfv->stats.rx_kicks;
357 vringh_notify_disable_kern(cfv->vr_rx);
358 napi_schedule(&cfv->napi);
359}
360
361static void cfv_destroy_genpool(struct cfv_info *cfv)
362{
363 if (cfv->alloc_addr)
364 dma_free_coherent(cfv->vdev->dev.parent->parent,
365 cfv->allocsz, cfv->alloc_addr,
366 cfv->alloc_dma);
367
368 if (!cfv->genpool)
369 return;
370 gen_pool_free(cfv->genpool, cfv->reserved_mem,
371 cfv->reserved_size);
372 gen_pool_destroy(cfv->genpool);
373 cfv->genpool = NULL;
374}
375
376static int cfv_create_genpool(struct cfv_info *cfv)
377{
378 int err;
379
380 /* dma_alloc can only allocate whole pages, and we need a more
381	 * fine-grained allocation so we use genpool. We ask for space needed
382	 * by IP and a full ring. If the dma allocation fails we retry with a
383 * smaller allocation size.
384 */
385 err = -ENOMEM;
386 cfv->allocsz = (virtqueue_get_vring_size(cfv->vq_tx) *
387 (ETH_DATA_LEN + cfv->tx_hr + cfv->tx_tr) * 11)/10;
388 if (cfv->allocsz <= (num_possible_cpus() + 1) * cfv->ndev->mtu)
389 return -EINVAL;
390
391 for (;;) {
392 if (cfv->allocsz <= num_possible_cpus() * cfv->ndev->mtu) {
393 netdev_info(cfv->ndev, "Not enough device memory\n");
394 return -ENOMEM;
395 }
396
397 cfv->alloc_addr = dma_alloc_coherent(
398 cfv->vdev->dev.parent->parent,
399 cfv->allocsz, &cfv->alloc_dma,
400 GFP_ATOMIC);
401 if (cfv->alloc_addr)
402 break;
403
404 cfv->allocsz = (cfv->allocsz * 3) >> 2;
405 }
406
407 netdev_dbg(cfv->ndev, "Allocated %zd bytes from dma-memory\n",
408 cfv->allocsz);
409
410	/* Allocate on 128-byte boundaries (1 << 7) */
411 cfv->genpool = gen_pool_create(7, -1);
412 if (!cfv->genpool)
413 goto err;
414
415 err = gen_pool_add_virt(cfv->genpool, (unsigned long)cfv->alloc_addr,
416 (phys_addr_t)virt_to_phys(cfv->alloc_addr),
417 cfv->allocsz, -1);
418 if (err)
419 goto err;
420
421 /* Reserve some memory for low memory situations. If we hit the roof
422 * in the memory pool, we stop TX flow and release the reserve.
423 */
424 cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu;
425 cfv->reserved_mem = gen_pool_alloc(cfv->genpool,
426 cfv->reserved_size);
427 if (!cfv->reserved_mem) {
428 err = -ENOMEM;
429 goto err;
430 }
431
432 cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx);
433 return 0;
434err:
435 cfv_destroy_genpool(cfv);
436 return err;
437}
438
439/* Enable the CAIF interface and allocate the memory-pool */
440static int cfv_netdev_open(struct net_device *netdev)
441{
442 struct cfv_info *cfv = netdev_priv(netdev);
443
444 if (cfv_create_genpool(cfv))
445 return -ENOMEM;
446
447 netif_carrier_on(netdev);
448 napi_enable(&cfv->napi);
449
450 /* Schedule NAPI to read any pending packets */
451 napi_schedule(&cfv->napi);
452 return 0;
453}
454
455/* Disable the CAIF interface and free the memory-pool */
456static int cfv_netdev_close(struct net_device *netdev)
457{
458 struct cfv_info *cfv = netdev_priv(netdev);
459 unsigned long flags;
460 struct buf_info *buf_info;
461
462 /* Disable interrupts, queues and NAPI polling */
463 netif_carrier_off(netdev);
464 virtqueue_disable_cb(cfv->vq_tx);
465 vringh_notify_disable_kern(cfv->vr_rx);
466 napi_disable(&cfv->napi);
467
468	/* Release any TX buffers on both used and available rings */
469 cfv_release_used_buf(cfv->vq_tx);
470 spin_lock_irqsave(&cfv->tx_lock, flags);
471 while ((buf_info = virtqueue_detach_unused_buf(cfv->vq_tx)))
472 free_buf_info(cfv, buf_info);
473 spin_unlock_irqrestore(&cfv->tx_lock, flags);
474
475 /* Release all dma allocated memory and destroy the pool */
476 cfv_destroy_genpool(cfv);
477 return 0;
478}
479
480/* Allocate a buffer in dma-memory and copy skb to it */
481static struct buf_info *cfv_alloc_and_copy_to_shm(struct cfv_info *cfv,
482 struct sk_buff *skb,
483 struct scatterlist *sg)
484{
485 struct caif_payload_info *info = (void *)&skb->cb;
486 struct buf_info *buf_info = NULL;
487 u8 pad_len, hdr_ofs;
488
489 if (!cfv->genpool)
490 goto err;
491
492 if (unlikely(cfv->tx_hr + skb->len + cfv->tx_tr > cfv->mtu)) {
493 netdev_warn(cfv->ndev, "Invalid packet len (%d > %d)\n",
494 cfv->tx_hr + skb->len + cfv->tx_tr, cfv->mtu);
495 goto err;
496 }
497
498 buf_info = kmalloc(sizeof(struct buf_info), GFP_ATOMIC);
499 if (unlikely(!buf_info))
500 goto err;
501
502	/* Make the IP header aligned in the buffer */
503 hdr_ofs = cfv->tx_hr + info->hdr_len;
504 pad_len = hdr_ofs & (IP_HDR_ALIGN - 1);
505 buf_info->size = cfv->tx_hr + skb->len + cfv->tx_tr + pad_len;
506
507 /* allocate dma memory buffer */
508 buf_info->vaddr = (void *)gen_pool_alloc(cfv->genpool, buf_info->size);
509 if (unlikely(!buf_info->vaddr))
510 goto err;
511
512 /* copy skbuf contents to send buffer */
513 skb_copy_bits(skb, 0, buf_info->vaddr + cfv->tx_hr + pad_len, skb->len);
514 sg_init_one(sg, buf_info->vaddr + pad_len,
515 skb->len + cfv->tx_hr + cfv->rx_hr);
516
517 return buf_info;
518err:
519 kfree(buf_info);
520 return NULL;
521}
522
523/* Put the CAIF packet on the virtio ring and kick the receiver */
524static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev)
525{
526 struct cfv_info *cfv = netdev_priv(netdev);
527 struct buf_info *buf_info;
528 struct scatterlist sg;
529 unsigned long flags;
530 bool flow_off = false;
531 int ret;
532
533 /* garbage collect released buffers */
534 cfv_release_used_buf(cfv->vq_tx);
535 spin_lock_irqsave(&cfv->tx_lock, flags);
536
537	/* The flow-off check takes the number of CPUs into account to make
538	 * sure the virtqueue cannot be overfilled in any possible SMP condition.
539	 *
540	 * Flow-on is triggered when sufficient buffers are freed.
541 */
542 if (unlikely(cfv->vq_tx->num_free <= num_present_cpus())) {
543 flow_off = true;
544 cfv->stats.tx_full_ring++;
545 }
546
547 /* If we run out of memory, we release the memory reserve and retry
548 * allocation.
549 */
550 buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
551 if (unlikely(!buf_info)) {
552 cfv->stats.tx_no_mem++;
553 flow_off = true;
554
555 if (cfv->reserved_mem && cfv->genpool) {
556 gen_pool_free(cfv->genpool, cfv->reserved_mem,
557 cfv->reserved_size);
558 cfv->reserved_mem = 0;
559 buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
560 }
561 }
562
563 if (unlikely(flow_off)) {
564		/* Turn flow on when 1/4 of the descriptors are released */
565 cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx) / 4;
566 /* Enable notifications of recycled TX buffers */
567 virtqueue_enable_cb(cfv->vq_tx);
568 netif_tx_stop_all_queues(netdev);
569 }
570
571 if (unlikely(!buf_info)) {
572		/* If the memory reserve does its job, this shouldn't happen */
573 netdev_warn(cfv->ndev, "Out of gen_pool memory\n");
574 goto err;
575 }
576
577 ret = virtqueue_add_outbuf(cfv->vq_tx, &sg, 1, buf_info, GFP_ATOMIC);
578 if (unlikely((ret < 0))) {
579 /* If flow control works, this shouldn't happen */
580 netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n",
581 ret);
582 goto err;
583 }
584
585 /* update netdev statistics */
586 cfv->ndev->stats.tx_packets++;
587 cfv->ndev->stats.tx_bytes += skb->len;
588 spin_unlock_irqrestore(&cfv->tx_lock, flags);
589
590 /* tell the remote processor it has a pending message to read */
591 virtqueue_kick(cfv->vq_tx);
592
593 dev_kfree_skb(skb);
594 return NETDEV_TX_OK;
595err:
596 spin_unlock_irqrestore(&cfv->tx_lock, flags);
597 cfv->ndev->stats.tx_dropped++;
598 free_buf_info(cfv, buf_info);
599 dev_kfree_skb(skb);
600 return NETDEV_TX_OK;
601}
602
603static void cfv_tx_release_tasklet(unsigned long drv)
604{
605 struct cfv_info *cfv = (struct cfv_info *)drv;
606 cfv_release_used_buf(cfv->vq_tx);
607}
608
609static const struct net_device_ops cfv_netdev_ops = {
610 .ndo_open = cfv_netdev_open,
611 .ndo_stop = cfv_netdev_close,
612 .ndo_start_xmit = cfv_netdev_tx,
613};
614
615static void cfv_netdev_setup(struct net_device *netdev)
616{
617 netdev->netdev_ops = &cfv_netdev_ops;
618 netdev->type = ARPHRD_CAIF;
619 netdev->tx_queue_len = 100;
620 netdev->flags = IFF_POINTOPOINT | IFF_NOARP;
621 netdev->mtu = CFV_DEF_MTU_SIZE;
622 netdev->destructor = free_netdev;
623}
624
625/* Create debugfs counters for the device */
626static inline void debugfs_init(struct cfv_info *cfv)
627{
628 cfv->debugfs =
629 debugfs_create_dir(netdev_name(cfv->ndev), NULL);
630
631 if (IS_ERR(cfv->debugfs))
632 return;
633
634 debugfs_create_u32("rx-napi-complete", S_IRUSR, cfv->debugfs,
635 &cfv->stats.rx_napi_complete);
636 debugfs_create_u32("rx-napi-resched", S_IRUSR, cfv->debugfs,
637 &cfv->stats.rx_napi_resched);
638 debugfs_create_u32("rx-nomem", S_IRUSR, cfv->debugfs,
639 &cfv->stats.rx_nomem);
640 debugfs_create_u32("rx-kicks", S_IRUSR, cfv->debugfs,
641 &cfv->stats.rx_kicks);
642 debugfs_create_u32("tx-full-ring", S_IRUSR, cfv->debugfs,
643 &cfv->stats.tx_full_ring);
644 debugfs_create_u32("tx-no-mem", S_IRUSR, cfv->debugfs,
645 &cfv->stats.tx_no_mem);
646 debugfs_create_u32("tx-kicks", S_IRUSR, cfv->debugfs,
647 &cfv->stats.tx_kicks);
648 debugfs_create_u32("tx-flow-on", S_IRUSR, cfv->debugfs,
649 &cfv->stats.tx_flow_on);
650}
651
652/* Set up CAIF for a virtio device */
653static int cfv_probe(struct virtio_device *vdev)
654{
655 vq_callback_t *vq_cbs = cfv_release_cb;
656 vrh_callback_t *vrh_cbs = cfv_recv;
657 const char *names = "output";
658 const char *cfv_netdev_name = "cfvrt";
659 struct net_device *netdev;
660 struct cfv_info *cfv;
661 int err = -EINVAL;
662
663 netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name,
664 cfv_netdev_setup);
665 if (!netdev)
666 return -ENOMEM;
667
668 cfv = netdev_priv(netdev);
669 cfv->vdev = vdev;
670 cfv->ndev = netdev;
671
672 spin_lock_init(&cfv->tx_lock);
673
674 /* Get the RX virtio ring. This is a "host side vring". */
675 err = -ENODEV;
676 if (!vdev->vringh_config || !vdev->vringh_config->find_vrhs)
677 goto err;
678
679 err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs);
680 if (err)
681 goto err;
682
683 /* Get the TX virtio ring. This is a "guest side vring". */
684 err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names);
685 if (err)
686 goto err;
687
688 /* Get the CAIF configuration from virtio config space, if available */
689#define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \
690 ((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \
691 &_var, \
692 FIELD_SIZEOF(struct virtio_caif_transf_config, _f)))
693
694 if (vdev->config->get) {
695 GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom);
696 GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom);
697 GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom);
698 GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom);
699 GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu);
700 GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu);
701 } else {
702 cfv->tx_hr = CFV_DEF_HEADROOM;
703 cfv->rx_hr = CFV_DEF_HEADROOM;
704 cfv->tx_tr = CFV_DEF_TAILROOM;
705 cfv->rx_tr = CFV_DEF_TAILROOM;
706 cfv->mtu = CFV_DEF_MTU_SIZE;
707 cfv->mru = CFV_DEF_MTU_SIZE;
708 }
709
710 netdev->needed_headroom = cfv->tx_hr;
711 netdev->needed_tailroom = cfv->tx_tr;
712
713 /* Disable buffer release interrupts unless we have stopped TX queues */
714 virtqueue_disable_cb(cfv->vq_tx);
715
716 netdev->mtu = cfv->mtu - cfv->tx_tr;
717 vdev->priv = cfv;
718
719 /* Initialize NAPI poll context data */
720 vringh_kiov_init(&cfv->ctx.riov, NULL, 0);
721 cfv->ctx.head = USHRT_MAX;
722 netif_napi_add(netdev, &cfv->napi, cfv_rx_poll, CFV_DEFAULT_QUOTA);
723
724 tasklet_init(&cfv->tx_release_tasklet,
725 cfv_tx_release_tasklet,
726 (unsigned long)cfv);
727
728 /* Carrier is off until netdevice is opened */
729 netif_carrier_off(netdev);
730
731 /* register Netdev */
732 err = register_netdev(netdev);
733 if (err) {
734 dev_err(&vdev->dev, "Unable to register netdev (%d)\n", err);
735 goto err;
736 }
737
738 debugfs_init(cfv);
739
740 return 0;
741err:
742 netdev_warn(cfv->ndev, "CAIF Virtio probe failed:%d\n", err);
743
744 if (cfv->vr_rx)
745 vdev->vringh_config->del_vrhs(cfv->vdev);
746 if (cfv->vdev)
747 vdev->config->del_vqs(cfv->vdev);
748 free_netdev(netdev);
749 return err;
750}
751
752static void cfv_remove(struct virtio_device *vdev)
753{
754 struct cfv_info *cfv = vdev->priv;
755
756 rtnl_lock();
757 dev_close(cfv->ndev);
758 rtnl_unlock();
759
760 tasklet_kill(&cfv->tx_release_tasklet);
761 debugfs_remove_recursive(cfv->debugfs);
762
763 vringh_kiov_cleanup(&cfv->ctx.riov);
764 vdev->config->reset(vdev);
765 vdev->vringh_config->del_vrhs(cfv->vdev);
766 cfv->vr_rx = NULL;
767 vdev->config->del_vqs(cfv->vdev);
768 unregister_netdev(cfv->ndev);
769}
770
771static struct virtio_device_id id_table[] = {
772 { VIRTIO_ID_CAIF, VIRTIO_DEV_ANY_ID },
773 { 0 },
774};
775
776static unsigned int features[] = {
777};
778
779static struct virtio_driver caif_virtio_driver = {
780 .feature_table = features,
781 .feature_table_size = ARRAY_SIZE(features),
782 .driver.name = KBUILD_MODNAME,
783 .driver.owner = THIS_MODULE,
784 .id_table = id_table,
785 .probe = cfv_probe,
786 .remove = cfv_remove,
787};
788
789module_virtio_driver(caif_virtio_driver);
790MODULE_DEVICE_TABLE(virtio, id_table);
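
The receive side above is built on the host-side vring ("vringh") API introduced earlier in this series rather than on a virtqueue. Stripped of the NAPI bookkeeping and SKB handling, the consume pattern cfv_rx_poll() follows looks roughly like the sketch below (the helper name is illustrative and error handling is reduced to the bare minimum):

/* Sketch of the vringh consume loop used by cfv_rx_poll() above. */
static void consume_host_vring(struct vringh *vrh, struct vringh_kiov *riov)
{
	u16 head;
	int ret;

	for (;;) {
		/* Pull the next available descriptor chain into riov. */
		ret = vringh_getdesc_kern(vrh, riov, NULL, &head, GFP_ATOMIC);
		if (ret <= 0)
			break;			/* 0: ring empty, <0: bad ring */

		while (riov->i < riov->used) {
			/* riov->iov[riov->i] is one buffer segment; a real
			 * consumer would copy it out here.
			 */
			riov->i++;
		}

		/* Put the chain back on the used ring (nothing written back). */
		vringh_complete_kern(vrh, head, 0);
	}

	/* Notify the other side about consumed buffers if it wants to know. */
	if (vringh_need_notify_kern(vrh) > 0)
		vringh_notify(vrh);
}
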
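
On the transmit side the driver combines a gen_pool reserve with a watermark on free ring descriptors: cfv_netdev_tx() stops the queues when the ring is nearly full or the pool runs dry, and cfv_release_used_buf() reopens them once enough descriptors have been recycled and the reserve has been re-taken. A simplified outline of that policy (cfv_tx_flow_sketch() is a hypothetical helper, not part of the patch):

/* Simplified outline of the TX flow control implemented above. */
static void cfv_tx_flow_sketch(struct cfv_info *cfv, bool alloc_failed)
{
	unsigned int ring_size = virtqueue_get_vring_size(cfv->vq_tx);

	if (cfv->vq_tx->num_free <= num_present_cpus() || alloc_failed) {
		/* Flow off: reopen once 1/4 of the descriptors are free again. */
		cfv->watermark_tx = ring_size / 4;
		virtqueue_enable_cb(cfv->vq_tx);	/* want release callbacks */
		netif_tx_stop_all_queues(cfv->ndev);
	} else if (cfv->vq_tx->num_free > cfv->watermark_tx &&
		   cfv->reserved_mem) {
		/* Flow on: reserve re-established, mute release callbacks. */
		cfv->watermark_tx = ring_size;
		virtqueue_disable_cb(cfv->vq_tx);
		netif_tx_wake_all_queues(cfv->ndev);
	}
}
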
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 50077753a0e5..3c23fdc27bf0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -39,7 +39,6 @@ module_param(gso, bool, 0444);
39#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) 39#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
40#define GOOD_COPY_LEN 128 40#define GOOD_COPY_LEN 128
41 41
42#define VIRTNET_SEND_COMMAND_SG_MAX 2
43#define VIRTNET_DRIVER_VERSION "1.0.0" 42#define VIRTNET_DRIVER_VERSION "1.0.0"
44 43
45struct virtnet_stats { 44struct virtnet_stats {
@@ -444,7 +443,7 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
444 443
445 skb_to_sgvec(skb, rq->sg + 1, 0, skb->len); 444 skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
446 445
447 err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp); 446 err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
448 if (err < 0) 447 if (err < 0)
449 dev_kfree_skb(skb); 448 dev_kfree_skb(skb);
450 449
@@ -489,8 +488,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
489 488
490 /* chain first in list head */ 489 /* chain first in list head */
491 first->private = (unsigned long)list; 490 first->private = (unsigned long)list;
492 err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2, 491 err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
493 first, gfp); 492 first, gfp);
494 if (err < 0) 493 if (err < 0)
495 give_pages(rq, first); 494 give_pages(rq, first);
496 495
@@ -508,7 +507,7 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
508 507
509 sg_init_one(rq->sg, page_address(page), PAGE_SIZE); 508 sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
510 509
511 err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp); 510 err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp);
512 if (err < 0) 511 if (err < 0)
513 give_pages(rq, page); 512 give_pages(rq, page);
514 513
@@ -582,7 +581,7 @@ static void refill_work(struct work_struct *work)
582 bool still_empty; 581 bool still_empty;
583 int i; 582 int i;
584 583
585 for (i = 0; i < vi->max_queue_pairs; i++) { 584 for (i = 0; i < vi->curr_queue_pairs; i++) {
586 struct receive_queue *rq = &vi->rq[i]; 585 struct receive_queue *rq = &vi->rq[i];
587 586
588 napi_disable(&rq->napi); 587 napi_disable(&rq->napi);
@@ -637,7 +636,7 @@ static int virtnet_open(struct net_device *dev)
637 struct virtnet_info *vi = netdev_priv(dev); 636 struct virtnet_info *vi = netdev_priv(dev);
638 int i; 637 int i;
639 638
640 for (i = 0; i < vi->max_queue_pairs; i++) { 639 for (i = 0; i < vi->curr_queue_pairs; i++) {
641 /* Make sure we have some buffers: if oom use wq. */ 640 /* Make sure we have some buffers: if oom use wq. */
642 if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) 641 if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
643 schedule_delayed_work(&vi->refill, 0); 642 schedule_delayed_work(&vi->refill, 0);
@@ -711,8 +710,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
711 sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr); 710 sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);
712 711
713 num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1; 712 num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
714 return virtqueue_add_buf(sq->vq, sq->sg, num_sg, 713 return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
715 0, skb, GFP_ATOMIC);
716} 714}
717 715
718static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) 716static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -767,32 +765,35 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
767 * never fail unless improperly formated. 765 * never fail unless improperly formated.
768 */ 766 */
769static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, 767static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
770 struct scatterlist *data, int out, int in) 768 struct scatterlist *out,
769 struct scatterlist *in)
771{ 770{
772 struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; 771 struct scatterlist *sgs[4], hdr, stat;
773 struct virtio_net_ctrl_hdr ctrl; 772 struct virtio_net_ctrl_hdr ctrl;
774 virtio_net_ctrl_ack status = ~0; 773 virtio_net_ctrl_ack status = ~0;
775 unsigned int tmp; 774 unsigned out_num = 0, in_num = 0, tmp;
776 int i;
777 775
778 /* Caller should know better */ 776 /* Caller should know better */
779 BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) || 777 BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
780 (out + in > VIRTNET_SEND_COMMAND_SG_MAX));
781
782 out++; /* Add header */
783 in++; /* Add return status */
784 778
785 ctrl.class = class; 779 ctrl.class = class;
786 ctrl.cmd = cmd; 780 ctrl.cmd = cmd;
781 /* Add header */
782 sg_init_one(&hdr, &ctrl, sizeof(ctrl));
783 sgs[out_num++] = &hdr;
787 784
788 sg_init_table(sg, out + in); 785 if (out)
786 sgs[out_num++] = out;
787 if (in)
788 sgs[out_num + in_num++] = in;
789 789
790 sg_set_buf(&sg[0], &ctrl, sizeof(ctrl)); 790 /* Add return status. */
791 for_each_sg(data, s, out + in - 2, i) 791 sg_init_one(&stat, &status, sizeof(status));
792 sg_set_buf(&sg[i + 1], sg_virt(s), s->length); 792 sgs[out_num + in_num++] = &stat;
793 sg_set_buf(&sg[out + in - 1], &status, sizeof(status));
794 793
795 BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0); 794 BUG_ON(out_num + in_num > ARRAY_SIZE(sgs));
795 BUG_ON(virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC)
796 < 0);
796 797
797 virtqueue_kick(vi->cvq); 798 virtqueue_kick(vi->cvq);
798 799
@@ -821,7 +822,7 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
821 sg_init_one(&sg, addr->sa_data, dev->addr_len); 822 sg_init_one(&sg, addr->sa_data, dev->addr_len);
822 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, 823 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
823 VIRTIO_NET_CTRL_MAC_ADDR_SET, 824 VIRTIO_NET_CTRL_MAC_ADDR_SET,
824 &sg, 1, 0)) { 825 &sg, NULL)) {
825 dev_warn(&vdev->dev, 826 dev_warn(&vdev->dev,
826 "Failed to set mac address by vq command.\n"); 827 "Failed to set mac address by vq command.\n");
827 return -EINVAL; 828 return -EINVAL;
@@ -889,8 +890,7 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi)
889{ 890{
890 rtnl_lock(); 891 rtnl_lock();
891 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE, 892 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
892 VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, 893 VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, NULL))
893 0, 0))
894 dev_warn(&vi->dev->dev, "Failed to ack link announce.\n"); 894 dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
895 rtnl_unlock(); 895 rtnl_unlock();
896} 896}
@@ -900,6 +900,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
900 struct scatterlist sg; 900 struct scatterlist sg;
901 struct virtio_net_ctrl_mq s; 901 struct virtio_net_ctrl_mq s;
902 struct net_device *dev = vi->dev; 902 struct net_device *dev = vi->dev;
903 int i;
903 904
904 if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) 905 if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
905 return 0; 906 return 0;
@@ -908,12 +909,16 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
908 sg_init_one(&sg, &s, sizeof(s)); 909 sg_init_one(&sg, &s, sizeof(s));
909 910
910 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, 911 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
911 VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)){ 912 VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, NULL)) {
912 dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", 913 dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
913 queue_pairs); 914 queue_pairs);
914 return -EINVAL; 915 return -EINVAL;
915 } else 916 } else {
917 for (i = vi->curr_queue_pairs; i < queue_pairs; i++)
918 if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
919 schedule_delayed_work(&vi->refill, 0);
916 vi->curr_queue_pairs = queue_pairs; 920 vi->curr_queue_pairs = queue_pairs;
921 }
917 922
918 return 0; 923 return 0;
919} 924}
@@ -955,7 +960,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
955 960
956 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, 961 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
957 VIRTIO_NET_CTRL_RX_PROMISC, 962 VIRTIO_NET_CTRL_RX_PROMISC,
958 sg, 1, 0)) 963 sg, NULL))
959 dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", 964 dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
960 promisc ? "en" : "dis"); 965 promisc ? "en" : "dis");
961 966
@@ -963,7 +968,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
963 968
964 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, 969 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
965 VIRTIO_NET_CTRL_RX_ALLMULTI, 970 VIRTIO_NET_CTRL_RX_ALLMULTI,
966 sg, 1, 0)) 971 sg, NULL))
967 dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", 972 dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
968 allmulti ? "en" : "dis"); 973 allmulti ? "en" : "dis");
969 974
@@ -1000,7 +1005,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
1000 1005
1001 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, 1006 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1002 VIRTIO_NET_CTRL_MAC_TABLE_SET, 1007 VIRTIO_NET_CTRL_MAC_TABLE_SET,
1003 sg, 2, 0)) 1008 sg, NULL))
1004 dev_warn(&dev->dev, "Failed to set MAC fitler table.\n"); 1009 dev_warn(&dev->dev, "Failed to set MAC fitler table.\n");
1005 1010
1006 kfree(buf); 1011 kfree(buf);
@@ -1015,7 +1020,7 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev,
1015 sg_init_one(&sg, &vid, sizeof(vid)); 1020 sg_init_one(&sg, &vid, sizeof(vid));
1016 1021
1017 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, 1022 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1018 VIRTIO_NET_CTRL_VLAN_ADD, &sg, 1, 0)) 1023 VIRTIO_NET_CTRL_VLAN_ADD, &sg, NULL))
1019 dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid); 1024 dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
1020 return 0; 1025 return 0;
1021} 1026}
@@ -1029,7 +1034,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
1029 sg_init_one(&sg, &vid, sizeof(vid)); 1034 sg_init_one(&sg, &vid, sizeof(vid));
1030 1035
1031 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, 1036 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1032 VIRTIO_NET_CTRL_VLAN_DEL, &sg, 1, 0)) 1037 VIRTIO_NET_CTRL_VLAN_DEL, &sg, NULL))
1033 dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); 1038 dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
1034 return 0; 1039 return 0;
1035} 1040}
@@ -1570,7 +1575,7 @@ static int virtnet_probe(struct virtio_device *vdev)
1570 } 1575 }
1571 1576
1572 /* Last of all, set up some receive buffers. */ 1577 /* Last of all, set up some receive buffers. */
1573 for (i = 0; i < vi->max_queue_pairs; i++) { 1578 for (i = 0; i < vi->curr_queue_pairs; i++) {
1574 try_fill_recv(&vi->rq[i], GFP_KERNEL); 1579 try_fill_recv(&vi->rq[i], GFP_KERNEL);
1575 1580
1576 /* If we didn't even get one input buffer, we're useless. */ 1581 /* If we didn't even get one input buffer, we're useless. */
@@ -1694,7 +1699,7 @@ static int virtnet_restore(struct virtio_device *vdev)
1694 1699
1695 netif_device_attach(vi->dev); 1700 netif_device_attach(vi->dev);
1696 1701
1697 for (i = 0; i < vi->max_queue_pairs; i++) 1702 for (i = 0; i < vi->curr_queue_pairs; i++)
1698 if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) 1703 if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
1699 schedule_delayed_work(&vi->refill, 0); 1704 schedule_delayed_work(&vi->refill, 0);
1700 1705
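
Most of the churn in this file is mechanical: virtqueue_add_buf(vq, sg, out, in, data, gfp), which took one flat scatterlist with the readable entries first, becomes virtqueue_add_inbuf()/virtqueue_add_outbuf() for single-direction buffers and virtqueue_add_sgs() where a request mixes directions, as in the control queue. A condensed sketch of the new convention, modelled on virtnet_send_command() above (the helper name and parameters are illustrative):

/* Sketch of the virtqueue_add_sgs() calling convention used above. */
static int ctrl_add_sketch(struct virtqueue *cvq,
			   struct virtio_net_ctrl_hdr *ctrl,
			   struct scatterlist *payload,	/* may be NULL */
			   virtio_net_ctrl_ack *status,
			   void *token)
{
	struct scatterlist hdr, stat, *sgs[3];
	unsigned int out_num = 0, in_num = 0;

	/* Device-readable entries come first in sgs[]. */
	sg_init_one(&hdr, ctrl, sizeof(*ctrl));
	sgs[out_num++] = &hdr;
	if (payload)
		sgs[out_num++] = payload;

	/* Device-writable entries follow. */
	sg_init_one(&stat, status, sizeof(*status));
	sgs[out_num + in_num++] = &stat;

	/* out_num readable scatterlists, then in_num writable ones. */
	return virtqueue_add_sgs(cvq, sgs, out_num, in_num, token, GFP_ATOMIC);
}
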
diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c
index 7861f1119b7d..56fceafec9ec 100644
--- a/drivers/rpmsg/virtio_rpmsg_bus.c
+++ b/drivers/rpmsg/virtio_rpmsg_bus.c
@@ -757,14 +757,14 @@ int rpmsg_send_offchannel_raw(struct rpmsg_channel *rpdev, u32 src, u32 dst,
757 mutex_lock(&vrp->tx_lock); 757 mutex_lock(&vrp->tx_lock);
758 758
759 /* add message to the remote processor's virtqueue */ 759 /* add message to the remote processor's virtqueue */
760 err = virtqueue_add_buf(vrp->svq, &sg, 1, 0, msg, GFP_KERNEL); 760 err = virtqueue_add_outbuf(vrp->svq, &sg, 1, msg, GFP_KERNEL);
761 if (err) { 761 if (err) {
762 /* 762 /*
763 * need to reclaim the buffer here, otherwise it's lost 763 * need to reclaim the buffer here, otherwise it's lost
764 * (memory won't leak, but rpmsg won't use it again for TX). 764 * (memory won't leak, but rpmsg won't use it again for TX).
765 * this will wait for a buffer management overhaul. 765 * this will wait for a buffer management overhaul.
766 */ 766 */
767 dev_err(dev, "virtqueue_add_buf failed: %d\n", err); 767 dev_err(dev, "virtqueue_add_outbuf failed: %d\n", err);
768 goto out; 768 goto out;
769 } 769 }
770 770
@@ -839,7 +839,7 @@ static void rpmsg_recv_done(struct virtqueue *rvq)
839 sg_init_one(&sg, msg, RPMSG_BUF_SIZE); 839 sg_init_one(&sg, msg, RPMSG_BUF_SIZE);
840 840
841 /* add the buffer back to the remote processor's virtqueue */ 841 /* add the buffer back to the remote processor's virtqueue */
842 err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, msg, GFP_KERNEL); 842 err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, msg, GFP_KERNEL);
843 if (err < 0) { 843 if (err < 0) {
844 dev_err(dev, "failed to add a virtqueue buffer: %d\n", err); 844 dev_err(dev, "failed to add a virtqueue buffer: %d\n", err);
845 return; 845 return;
@@ -972,7 +972,7 @@ static int rpmsg_probe(struct virtio_device *vdev)
972 972
973 sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE); 973 sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE);
974 974
975 err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, cpu_addr, 975 err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, cpu_addr,
976 GFP_KERNEL); 976 GFP_KERNEL);
977 WARN_ON(err); /* sanity check; this can't really happen */ 977 WARN_ON(err); /* sanity check; this can't really happen */
978 } 978 }
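
The rpmsg conversion shows the two single-direction helpers in their simplest form: a message the remote processor will read is queued with virtqueue_add_outbuf(), a buffer it will fill with virtqueue_add_inbuf(), and both take one scatterlist array plus its length instead of the old out/in count pair. A minimal sketch (the function and buffer names are illustrative):

/* Sketch: queue one RX buffer and one TX message with the new helpers. */
static int rpmsg_queue_sketch(struct virtqueue *rvq, struct virtqueue *svq,
			      void *rx_buf, void *tx_msg, unsigned int buf_len)
{
	struct scatterlist sg;
	int err;

	/* The device writes into this buffer, so it is an "in" buffer. */
	sg_init_one(&sg, rx_buf, buf_len);
	err = virtqueue_add_inbuf(rvq, &sg, 1, rx_buf, GFP_KERNEL);
	if (err < 0)
		return err;

	/* The device only reads this message, so it is an "out" buffer. */
	sg_init_one(&sg, tx_msg, buf_len);
	return virtqueue_add_outbuf(svq, &sg, 1, tx_msg, GFP_KERNEL);
}
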
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 3449a1f8c656..2168258fb2c3 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -13,6 +13,8 @@
13 * 13 *
14 */ 14 */
15 15
16#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
17
16#include <linux/module.h> 18#include <linux/module.h>
17#include <linux/slab.h> 19#include <linux/slab.h>
18#include <linux/mempool.h> 20#include <linux/mempool.h>
@@ -20,12 +22,14 @@
20#include <linux/virtio_ids.h> 22#include <linux/virtio_ids.h>
21#include <linux/virtio_config.h> 23#include <linux/virtio_config.h>
22#include <linux/virtio_scsi.h> 24#include <linux/virtio_scsi.h>
25#include <linux/cpu.h>
23#include <scsi/scsi_host.h> 26#include <scsi/scsi_host.h>
24#include <scsi/scsi_device.h> 27#include <scsi/scsi_device.h>
25#include <scsi/scsi_cmnd.h> 28#include <scsi/scsi_cmnd.h>
26 29
27#define VIRTIO_SCSI_MEMPOOL_SZ 64 30#define VIRTIO_SCSI_MEMPOOL_SZ 64
28#define VIRTIO_SCSI_EVENT_LEN 8 31#define VIRTIO_SCSI_EVENT_LEN 8
32#define VIRTIO_SCSI_VQ_BASE 2
29 33
30/* Command queue element */ 34/* Command queue element */
31struct virtio_scsi_cmd { 35struct virtio_scsi_cmd {
@@ -57,27 +61,61 @@ struct virtio_scsi_vq {
57 struct virtqueue *vq; 61 struct virtqueue *vq;
58}; 62};
59 63
60/* Per-target queue state */ 64/*
65 * Per-target queue state.
66 *
67 * This struct holds the data needed by the queue steering policy. When a
68 * target is sent multiple requests, we need to drive them to the same queue so
69 * that FIFO processing order is kept. However, if a target was idle, we can
70 * choose a queue arbitrarily. In this case the queue is chosen according to
71 * the current VCPU, so the driver expects the number of request queues to be
72 * equal to the number of VCPUs. This makes it easy and fast to select the
73 * queue, and also lets the driver optimize the IRQ affinity for the virtqueues
74 * (each virtqueue's affinity is set to the CPU that "owns" the queue).
75 *
76 * An interesting effect of this policy is that only writes to req_vq need to
77 * take the tgt_lock. Read can be done outside the lock because:
78 *
79 * - writes of req_vq only occur when atomic_inc_return(&tgt->reqs) returns 1.
80 * In that case, no other CPU is reading req_vq: even if they were in
81 * virtscsi_queuecommand_multi, they would be spinning on tgt_lock.
82 *
83 * - reads of req_vq only occur when the target is not idle (reqs != 0).
84 * A CPU that enters virtscsi_queuecommand_multi will not modify req_vq.
85 *
86 * Similarly, decrements of reqs are never concurrent with writes of req_vq.
87 * Thus they can happen outside the tgt_lock, provided of course we make reqs
88 * an atomic_t.
89 */
61struct virtio_scsi_target_state { 90struct virtio_scsi_target_state {
62 /* Protects sg. Lock hierarchy is tgt_lock -> vq_lock. */ 91 /* This spinlock never held at the same time as vq_lock. */
63 spinlock_t tgt_lock; 92 spinlock_t tgt_lock;
64 93
65 /* For sglist construction when adding commands to the virtqueue. */ 94 /* Count of outstanding requests. */
66 struct scatterlist sg[]; 95 atomic_t reqs;
96
97 /* Currently active virtqueue for requests sent to this target. */
98 struct virtio_scsi_vq *req_vq;
67}; 99};
68 100
69/* Driver instance state */ 101/* Driver instance state */
70struct virtio_scsi { 102struct virtio_scsi {
71 struct virtio_device *vdev; 103 struct virtio_device *vdev;
72 104
73 struct virtio_scsi_vq ctrl_vq;
74 struct virtio_scsi_vq event_vq;
75 struct virtio_scsi_vq req_vq;
76
77 /* Get some buffers ready for event vq */ 105 /* Get some buffers ready for event vq */
78 struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN]; 106 struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
79 107
80 struct virtio_scsi_target_state *tgt[]; 108 u32 num_queues;
109
110 /* If the affinity hint is set for virtqueues */
111 bool affinity_hint_set;
112
113 /* CPU hotplug notifier */
114 struct notifier_block nb;
115
116 struct virtio_scsi_vq ctrl_vq;
117 struct virtio_scsi_vq event_vq;
118 struct virtio_scsi_vq req_vqs[];
81}; 119};
82 120
83static struct kmem_cache *virtscsi_cmd_cache; 121static struct kmem_cache *virtscsi_cmd_cache;
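
The steering policy described in the comment above boils down to a few lines: the 0 -> 1 transition of reqs is the only point where req_vq may change, and it happens under tgt_lock, so a busy target can read req_vq without taking the lock. Condensed into a sketch (the real implementation is virtscsi_pick_vq() later in this patch):

/* Condensed form of the queue steering rule described above. */
static struct virtio_scsi_vq *pick_vq_sketch(struct virtio_scsi *vscsi,
					     struct virtio_scsi_target_state *tgt)
{
	struct virtio_scsi_vq *vq;
	unsigned long flags;

	spin_lock_irqsave(&tgt->tgt_lock, flags);
	if (atomic_inc_return(&tgt->reqs) > 1) {
		/* Target already busy: req_vq cannot change until reqs hits 0. */
		vq = ACCESS_ONCE(tgt->req_vq);
	} else {
		/* Target was idle: we own req_vq, steer to this CPU's queue. */
		vq = &vscsi->req_vqs[smp_processor_id() % vscsi->num_queues];
		tgt->req_vq = vq;
	}
	spin_unlock_irqrestore(&tgt->tgt_lock, flags);

	return vq;
}
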
@@ -107,11 +145,13 @@ static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid)
107 * 145 *
108 * Called with vq_lock held. 146 * Called with vq_lock held.
109 */ 147 */
110static void virtscsi_complete_cmd(void *buf) 148static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
111{ 149{
112 struct virtio_scsi_cmd *cmd = buf; 150 struct virtio_scsi_cmd *cmd = buf;
113 struct scsi_cmnd *sc = cmd->sc; 151 struct scsi_cmnd *sc = cmd->sc;
114 struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd; 152 struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
153 struct virtio_scsi_target_state *tgt =
154 scsi_target(sc->device)->hostdata;
115 155
116 dev_dbg(&sc->device->sdev_gendev, 156 dev_dbg(&sc->device->sdev_gendev,
117 "cmd %p response %u status %#02x sense_len %u\n", 157 "cmd %p response %u status %#02x sense_len %u\n",
@@ -166,32 +206,71 @@ static void virtscsi_complete_cmd(void *buf)
166 206
167 mempool_free(cmd, virtscsi_cmd_pool); 207 mempool_free(cmd, virtscsi_cmd_pool);
168 sc->scsi_done(sc); 208 sc->scsi_done(sc);
209
210 atomic_dec(&tgt->reqs);
169} 211}
170 212
171static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf)) 213static void virtscsi_vq_done(struct virtio_scsi *vscsi,
214 struct virtio_scsi_vq *virtscsi_vq,
215 void (*fn)(struct virtio_scsi *vscsi, void *buf))
172{ 216{
173 void *buf; 217 void *buf;
174 unsigned int len; 218 unsigned int len;
219 unsigned long flags;
220 struct virtqueue *vq = virtscsi_vq->vq;
175 221
222 spin_lock_irqsave(&virtscsi_vq->vq_lock, flags);
176 do { 223 do {
177 virtqueue_disable_cb(vq); 224 virtqueue_disable_cb(vq);
178 while ((buf = virtqueue_get_buf(vq, &len)) != NULL) 225 while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
179 fn(buf); 226 fn(vscsi, buf);
180 } while (!virtqueue_enable_cb(vq)); 227 } while (!virtqueue_enable_cb(vq));
228 spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags);
181} 229}
182 230
183static void virtscsi_req_done(struct virtqueue *vq) 231static void virtscsi_req_done(struct virtqueue *vq)
184{ 232{
185 struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); 233 struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
186 struct virtio_scsi *vscsi = shost_priv(sh); 234 struct virtio_scsi *vscsi = shost_priv(sh);
187 unsigned long flags; 235 int index = vq->index - VIRTIO_SCSI_VQ_BASE;
236 struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index];
188 237
189 spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags); 238 /*
190 virtscsi_vq_done(vq, virtscsi_complete_cmd); 239 * Read req_vq before decrementing the reqs field in
191 spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags); 240 * virtscsi_complete_cmd.
241 *
242 * With barriers:
243 *
244 * CPU #0 virtscsi_queuecommand_multi (CPU #1)
245 * ------------------------------------------------------------
246 * lock vq_lock
247 * read req_vq
248 * read reqs (reqs = 1)
249 * write reqs (reqs = 0)
250 * increment reqs (reqs = 1)
251 * write req_vq
252 *
253 * Possible reordering without barriers:
254 *
255 * CPU #0 virtscsi_queuecommand_multi (CPU #1)
256 * ------------------------------------------------------------
257 * lock vq_lock
258 * read reqs (reqs = 1)
259 * write reqs (reqs = 0)
260 * increment reqs (reqs = 1)
261 * write req_vq
262 * read (wrong) req_vq
263 *
264 * We do not need a full smp_rmb, because req_vq is required to get
265 * to tgt->reqs: tgt is &vscsi->tgt[sc->device->id], where sc is stored
266 * in the virtqueue as the user token.
267 */
268 smp_read_barrier_depends();
269
270 virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd);
192}; 271};
193 272
194static void virtscsi_complete_free(void *buf) 273static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
195{ 274{
196 struct virtio_scsi_cmd *cmd = buf; 275 struct virtio_scsi_cmd *cmd = buf;
197 276
@@ -205,11 +284,8 @@ static void virtscsi_ctrl_done(struct virtqueue *vq)
205{ 284{
206 struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); 285 struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
207 struct virtio_scsi *vscsi = shost_priv(sh); 286 struct virtio_scsi *vscsi = shost_priv(sh);
208 unsigned long flags;
209 287
210 spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags); 288 virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free);
211 virtscsi_vq_done(vq, virtscsi_complete_free);
212 spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags);
213}; 289};
214 290
215static int virtscsi_kick_event(struct virtio_scsi *vscsi, 291static int virtscsi_kick_event(struct virtio_scsi *vscsi,
@@ -223,8 +299,8 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi,
223 299
224 spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); 300 spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
225 301
226 err = virtqueue_add_buf(vscsi->event_vq.vq, &sg, 0, 1, event_node, 302 err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node,
227 GFP_ATOMIC); 303 GFP_ATOMIC);
228 if (!err) 304 if (!err)
229 virtqueue_kick(vscsi->event_vq.vq); 305 virtqueue_kick(vscsi->event_vq.vq);
230 306
@@ -254,7 +330,7 @@ static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi)
254} 330}
255 331
256static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi, 332static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi,
257 struct virtio_scsi_event *event) 333 struct virtio_scsi_event *event)
258{ 334{
259 struct scsi_device *sdev; 335 struct scsi_device *sdev;
260 struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); 336 struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
@@ -332,7 +408,7 @@ static void virtscsi_handle_event(struct work_struct *work)
332 virtscsi_kick_event(vscsi, event_node); 408 virtscsi_kick_event(vscsi, event_node);
333} 409}
334 410
335static void virtscsi_complete_event(void *buf) 411static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf)
336{ 412{
337 struct virtio_scsi_event_node *event_node = buf; 413 struct virtio_scsi_event_node *event_node = buf;
338 414
@@ -344,82 +420,65 @@ static void virtscsi_event_done(struct virtqueue *vq)
344{ 420{
345 struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); 421 struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
346 struct virtio_scsi *vscsi = shost_priv(sh); 422 struct virtio_scsi *vscsi = shost_priv(sh);
347 unsigned long flags;
348 423
349 spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); 424 virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event);
350 virtscsi_vq_done(vq, virtscsi_complete_event);
351 spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags);
352}; 425};
353 426
354static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx,
355 struct scsi_data_buffer *sdb)
356{
357 struct sg_table *table = &sdb->table;
358 struct scatterlist *sg_elem;
359 unsigned int idx = *p_idx;
360 int i;
361
362 for_each_sg(table->sgl, sg_elem, table->nents, i)
363 sg[idx++] = *sg_elem;
364
365 *p_idx = idx;
366}
367
368/** 427/**
369 * virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist 428 * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue
370 * @vscsi : virtio_scsi state 429 * @vq : the struct virtqueue we're talking about
371 * @cmd : command structure 430 * @cmd : command structure
372 * @out_num : number of read-only elements
373 * @in_num : number of write-only elements
374 * @req_size : size of the request buffer 431 * @req_size : size of the request buffer
375 * @resp_size : size of the response buffer 432 * @resp_size : size of the response buffer
376 * 433 * @gfp : flags to use for memory allocations
377 * Called with tgt_lock held.
378 */ 434 */
379static void virtscsi_map_cmd(struct virtio_scsi_target_state *tgt, 435static int virtscsi_add_cmd(struct virtqueue *vq,
380 struct virtio_scsi_cmd *cmd, 436 struct virtio_scsi_cmd *cmd,
381 unsigned *out_num, unsigned *in_num, 437 size_t req_size, size_t resp_size, gfp_t gfp)
382 size_t req_size, size_t resp_size)
383{ 438{
384 struct scsi_cmnd *sc = cmd->sc; 439 struct scsi_cmnd *sc = cmd->sc;
385 struct scatterlist *sg = tgt->sg; 440 struct scatterlist *sgs[4], req, resp;
386 unsigned int idx = 0; 441 struct sg_table *out, *in;
442 unsigned out_num = 0, in_num = 0;
443
444 out = in = NULL;
445
446 if (sc && sc->sc_data_direction != DMA_NONE) {
447 if (sc->sc_data_direction != DMA_FROM_DEVICE)
448 out = &scsi_out(sc)->table;
449 if (sc->sc_data_direction != DMA_TO_DEVICE)
450 in = &scsi_in(sc)->table;
451 }
387 452
388 /* Request header. */ 453 /* Request header. */
389 sg_set_buf(&sg[idx++], &cmd->req, req_size); 454 sg_init_one(&req, &cmd->req, req_size);
455 sgs[out_num++] = &req;
390 456
391 /* Data-out buffer. */ 457 /* Data-out buffer. */
392 if (sc && sc->sc_data_direction != DMA_FROM_DEVICE) 458 if (out)
393 virtscsi_map_sgl(sg, &idx, scsi_out(sc)); 459 sgs[out_num++] = out->sgl;
394
395 *out_num = idx;
396 460
397 /* Response header. */ 461 /* Response header. */
398 sg_set_buf(&sg[idx++], &cmd->resp, resp_size); 462 sg_init_one(&resp, &cmd->resp, resp_size);
463 sgs[out_num + in_num++] = &resp;
399 464
400 /* Data-in buffer */ 465 /* Data-in buffer */
401 if (sc && sc->sc_data_direction != DMA_TO_DEVICE) 466 if (in)
402 virtscsi_map_sgl(sg, &idx, scsi_in(sc)); 467 sgs[out_num + in_num++] = in->sgl;
403 468
404 *in_num = idx - *out_num; 469 return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, gfp);
405} 470}
406 471
407static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt, 472static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq,
408 struct virtio_scsi_vq *vq,
409 struct virtio_scsi_cmd *cmd, 473 struct virtio_scsi_cmd *cmd,
410 size_t req_size, size_t resp_size, gfp_t gfp) 474 size_t req_size, size_t resp_size, gfp_t gfp)
411{ 475{
412 unsigned int out_num, in_num;
413 unsigned long flags; 476 unsigned long flags;
414 int err; 477 int err;
415 bool needs_kick = false; 478 bool needs_kick = false;
416 479
417 spin_lock_irqsave(&tgt->tgt_lock, flags); 480 spin_lock_irqsave(&vq->vq_lock, flags);
418 virtscsi_map_cmd(tgt, cmd, &out_num, &in_num, req_size, resp_size); 481 err = virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size, gfp);
419
420 spin_lock(&vq->vq_lock);
421 err = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd, gfp);
422 spin_unlock(&tgt->tgt_lock);
423 if (!err) 482 if (!err)
424 needs_kick = virtqueue_kick_prepare(vq->vq); 483 needs_kick = virtqueue_kick_prepare(vq->vq);
425 484
@@ -430,10 +489,10 @@ static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt,
430 return err; 489 return err;
431} 490}
432 491
433static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) 492static int virtscsi_queuecommand(struct virtio_scsi *vscsi,
493 struct virtio_scsi_vq *req_vq,
494 struct scsi_cmnd *sc)
434{ 495{
435 struct virtio_scsi *vscsi = shost_priv(sh);
436 struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
437 struct virtio_scsi_cmd *cmd; 496 struct virtio_scsi_cmd *cmd;
438 int ret; 497 int ret;
439 498
@@ -467,7 +526,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
467 BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE); 526 BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
468 memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len); 527 memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
469 528
470 if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd, 529 if (virtscsi_kick_cmd(req_vq, cmd,
471 sizeof cmd->req.cmd, sizeof cmd->resp.cmd, 530 sizeof cmd->req.cmd, sizeof cmd->resp.cmd,
472 GFP_ATOMIC) == 0) 531 GFP_ATOMIC) == 0)
473 ret = 0; 532 ret = 0;
@@ -478,14 +537,62 @@ out:
478 return ret; 537 return ret;
479} 538}
480 539
540static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
541 struct scsi_cmnd *sc)
542{
543 struct virtio_scsi *vscsi = shost_priv(sh);
544 struct virtio_scsi_target_state *tgt =
545 scsi_target(sc->device)->hostdata;
546
547 atomic_inc(&tgt->reqs);
548 return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
549}
550
551static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
552 struct virtio_scsi_target_state *tgt)
553{
554 struct virtio_scsi_vq *vq;
555 unsigned long flags;
556 u32 queue_num;
557
558 spin_lock_irqsave(&tgt->tgt_lock, flags);
559
560 /*
561 * The memory barrier after atomic_inc_return matches
562 * the smp_read_barrier_depends() in virtscsi_req_done.
563 */
564 if (atomic_inc_return(&tgt->reqs) > 1)
565 vq = ACCESS_ONCE(tgt->req_vq);
566 else {
567 queue_num = smp_processor_id();
568 while (unlikely(queue_num >= vscsi->num_queues))
569 queue_num -= vscsi->num_queues;
570
571 tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
572 }
573
574 spin_unlock_irqrestore(&tgt->tgt_lock, flags);
575 return vq;
576}
577
578static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
579 struct scsi_cmnd *sc)
580{
581 struct virtio_scsi *vscsi = shost_priv(sh);
582 struct virtio_scsi_target_state *tgt =
583 scsi_target(sc->device)->hostdata;
584 struct virtio_scsi_vq *req_vq = virtscsi_pick_vq(vscsi, tgt);
585
586 return virtscsi_queuecommand(vscsi, req_vq, sc);
587}
588
481static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd) 589static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
482{ 590{
483 DECLARE_COMPLETION_ONSTACK(comp); 591 DECLARE_COMPLETION_ONSTACK(comp);
484 struct virtio_scsi_target_state *tgt = vscsi->tgt[cmd->sc->device->id];
485 int ret = FAILED; 592 int ret = FAILED;
486 593
487 cmd->comp = &comp; 594 cmd->comp = &comp;
488 if (virtscsi_kick_cmd(tgt, &vscsi->ctrl_vq, cmd, 595 if (virtscsi_kick_cmd(&vscsi->ctrl_vq, cmd,
489 sizeof cmd->req.tmf, sizeof cmd->resp.tmf, 596 sizeof cmd->req.tmf, sizeof cmd->resp.tmf,
490 GFP_NOIO) < 0) 597 GFP_NOIO) < 0)
491 goto out; 598 goto out;
@@ -547,18 +654,57 @@ static int virtscsi_abort(struct scsi_cmnd *sc)
547 return virtscsi_tmf(vscsi, cmd); 654 return virtscsi_tmf(vscsi, cmd);
548} 655}
549 656
550static struct scsi_host_template virtscsi_host_template = { 657static int virtscsi_target_alloc(struct scsi_target *starget)
658{
659 struct virtio_scsi_target_state *tgt =
660 kmalloc(sizeof(*tgt), GFP_KERNEL);
661 if (!tgt)
662 return -ENOMEM;
663
664 spin_lock_init(&tgt->tgt_lock);
665 atomic_set(&tgt->reqs, 0);
666 tgt->req_vq = NULL;
667
668 starget->hostdata = tgt;
669 return 0;
670}
671
672static void virtscsi_target_destroy(struct scsi_target *starget)
673{
674 struct virtio_scsi_target_state *tgt = starget->hostdata;
675 kfree(tgt);
676}
677
678static struct scsi_host_template virtscsi_host_template_single = {
679 .module = THIS_MODULE,
680 .name = "Virtio SCSI HBA",
681 .proc_name = "virtio_scsi",
682 .this_id = -1,
683 .queuecommand = virtscsi_queuecommand_single,
684 .eh_abort_handler = virtscsi_abort,
685 .eh_device_reset_handler = virtscsi_device_reset,
686
687 .can_queue = 1024,
688 .dma_boundary = UINT_MAX,
689 .use_clustering = ENABLE_CLUSTERING,
690 .target_alloc = virtscsi_target_alloc,
691 .target_destroy = virtscsi_target_destroy,
692};
693
694static struct scsi_host_template virtscsi_host_template_multi = {
551 .module = THIS_MODULE, 695 .module = THIS_MODULE,
552 .name = "Virtio SCSI HBA", 696 .name = "Virtio SCSI HBA",
553 .proc_name = "virtio_scsi", 697 .proc_name = "virtio_scsi",
554 .queuecommand = virtscsi_queuecommand,
555 .this_id = -1, 698 .this_id = -1,
699 .queuecommand = virtscsi_queuecommand_multi,
556 .eh_abort_handler = virtscsi_abort, 700 .eh_abort_handler = virtscsi_abort,
557 .eh_device_reset_handler = virtscsi_device_reset, 701 .eh_device_reset_handler = virtscsi_device_reset,
558 702
559 .can_queue = 1024, 703 .can_queue = 1024,
560 .dma_boundary = UINT_MAX, 704 .dma_boundary = UINT_MAX,
561 .use_clustering = ENABLE_CLUSTERING, 705 .use_clustering = ENABLE_CLUSTERING,
706 .target_alloc = virtscsi_target_alloc,
707 .target_destroy = virtscsi_target_destroy,
562}; 708};
563 709
564#define virtscsi_config_get(vdev, fld) \ 710#define virtscsi_config_get(vdev, fld) \
@@ -578,29 +724,69 @@ static struct scsi_host_template virtscsi_host_template = {
578 &__val, sizeof(__val)); \ 724 &__val, sizeof(__val)); \
579 }) 725 })
580 726
581static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, 727static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
582 struct virtqueue *vq)
583{ 728{
584 spin_lock_init(&virtscsi_vq->vq_lock); 729 int i;
585 virtscsi_vq->vq = vq; 730 int cpu;
731
 732	/* In multiqueue mode, when the number of CPUs is equal
 733	 * to the number of request queues, we let the queues
 734	 * be private to one CPU by setting the affinity hint,
 735	 * to eliminate contention.
736 */
737 if ((vscsi->num_queues == 1 ||
738 vscsi->num_queues != num_online_cpus()) && affinity) {
739 if (vscsi->affinity_hint_set)
740 affinity = false;
741 else
742 return;
743 }
744
745 if (affinity) {
746 i = 0;
747 for_each_online_cpu(cpu) {
748 virtqueue_set_affinity(vscsi->req_vqs[i].vq, cpu);
749 i++;
750 }
751
752 vscsi->affinity_hint_set = true;
753 } else {
754 for (i = 0; i < vscsi->num_queues - VIRTIO_SCSI_VQ_BASE; i++)
755 virtqueue_set_affinity(vscsi->req_vqs[i].vq, -1);
756
757 vscsi->affinity_hint_set = false;
758 }
586} 759}
587 760
588static struct virtio_scsi_target_state *virtscsi_alloc_tgt( 761static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
589 struct virtio_device *vdev, int sg_elems)
590{ 762{
591 struct virtio_scsi_target_state *tgt; 763 get_online_cpus();
592 gfp_t gfp_mask = GFP_KERNEL; 764 __virtscsi_set_affinity(vscsi, affinity);
593 765 put_online_cpus();
594 /* We need extra sg elements at head and tail. */ 766}
595 tgt = kmalloc(sizeof(*tgt) + sizeof(tgt->sg[0]) * (sg_elems + 2),
596 gfp_mask);
597 767
598 if (!tgt) 768static int virtscsi_cpu_callback(struct notifier_block *nfb,
599 return NULL; 769 unsigned long action, void *hcpu)
770{
771 struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
772 switch(action) {
773 case CPU_ONLINE:
774 case CPU_ONLINE_FROZEN:
775 case CPU_DEAD:
776 case CPU_DEAD_FROZEN:
777 __virtscsi_set_affinity(vscsi, true);
778 break;
779 default:
780 break;
781 }
782 return NOTIFY_OK;
783}
600 784
601 spin_lock_init(&tgt->tgt_lock); 785static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
602 sg_init_table(tgt->sg, sg_elems + 2); 786 struct virtqueue *vq)
603 return tgt; 787{
788 spin_lock_init(&virtscsi_vq->vq_lock);
789 virtscsi_vq->vq = vq;
604} 790}
605 791
606static void virtscsi_scan(struct virtio_device *vdev) 792static void virtscsi_scan(struct virtio_device *vdev)
@@ -614,46 +800,56 @@ static void virtscsi_remove_vqs(struct virtio_device *vdev)
614{ 800{
615 struct Scsi_Host *sh = virtio_scsi_host(vdev); 801 struct Scsi_Host *sh = virtio_scsi_host(vdev);
616 struct virtio_scsi *vscsi = shost_priv(sh); 802 struct virtio_scsi *vscsi = shost_priv(sh);
617 u32 i, num_targets; 803
804 virtscsi_set_affinity(vscsi, false);
618 805
619 /* Stop all the virtqueues. */ 806 /* Stop all the virtqueues. */
620 vdev->config->reset(vdev); 807 vdev->config->reset(vdev);
621 808
622 num_targets = sh->max_id;
623 for (i = 0; i < num_targets; i++) {
624 kfree(vscsi->tgt[i]);
625 vscsi->tgt[i] = NULL;
626 }
627
628 vdev->config->del_vqs(vdev); 809 vdev->config->del_vqs(vdev);
629} 810}
630 811
631static int virtscsi_init(struct virtio_device *vdev, 812static int virtscsi_init(struct virtio_device *vdev,
632 struct virtio_scsi *vscsi, int num_targets) 813 struct virtio_scsi *vscsi)
633{ 814{
634 int err; 815 int err;
635 struct virtqueue *vqs[3]; 816 u32 i;
636 u32 i, sg_elems; 817 u32 num_vqs;
818 vq_callback_t **callbacks;
819 const char **names;
820 struct virtqueue **vqs;
821
822 num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE;
823 vqs = kmalloc(num_vqs * sizeof(struct virtqueue *), GFP_KERNEL);
824 callbacks = kmalloc(num_vqs * sizeof(vq_callback_t *), GFP_KERNEL);
825 names = kmalloc(num_vqs * sizeof(char *), GFP_KERNEL);
826
827 if (!callbacks || !vqs || !names) {
828 err = -ENOMEM;
829 goto out;
830 }
637 831
638 vq_callback_t *callbacks[] = { 832 callbacks[0] = virtscsi_ctrl_done;
639 virtscsi_ctrl_done, 833 callbacks[1] = virtscsi_event_done;
640 virtscsi_event_done, 834 names[0] = "control";
641 virtscsi_req_done 835 names[1] = "event";
642 }; 836 for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) {
643 const char *names[] = { 837 callbacks[i] = virtscsi_req_done;
644 "control", 838 names[i] = "request";
645 "event", 839 }
646 "request"
647 };
648 840
649 /* Discover virtqueues and write information to configuration. */ 841 /* Discover virtqueues and write information to configuration. */
650 err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names); 842 err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
651 if (err) 843 if (err)
652 return err; 844 goto out;
653 845
654 virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]); 846 virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]);
655 virtscsi_init_vq(&vscsi->event_vq, vqs[1]); 847 virtscsi_init_vq(&vscsi->event_vq, vqs[1]);
656 virtscsi_init_vq(&vscsi->req_vq, vqs[2]); 848 for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++)
849 virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE],
850 vqs[i]);
851
852 virtscsi_set_affinity(vscsi, true);
657 853
658 virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE); 854 virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
659 virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE); 855 virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
@@ -661,19 +857,12 @@ static int virtscsi_init(struct virtio_device *vdev,
661 if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) 857 if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
662 virtscsi_kick_event_all(vscsi); 858 virtscsi_kick_event_all(vscsi);
663 859
664 /* We need to know how many segments before we allocate. */
665 sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1;
666
667 for (i = 0; i < num_targets; i++) {
668 vscsi->tgt[i] = virtscsi_alloc_tgt(vdev, sg_elems);
669 if (!vscsi->tgt[i]) {
670 err = -ENOMEM;
671 goto out;
672 }
673 }
674 err = 0; 860 err = 0;
675 861
676out: 862out:
863 kfree(names);
864 kfree(callbacks);
865 kfree(vqs);
677 if (err) 866 if (err)
678 virtscsi_remove_vqs(vdev); 867 virtscsi_remove_vqs(vdev);
679 return err; 868 return err;
@@ -686,13 +875,21 @@ static int virtscsi_probe(struct virtio_device *vdev)
686 int err; 875 int err;
687 u32 sg_elems, num_targets; 876 u32 sg_elems, num_targets;
688 u32 cmd_per_lun; 877 u32 cmd_per_lun;
878 u32 num_queues;
879 struct scsi_host_template *hostt;
880
881 /* We need to know how many queues before we allocate. */
882 num_queues = virtscsi_config_get(vdev, num_queues) ? : 1;
689 883
690 /* Allocate memory and link the structs together. */
691 num_targets = virtscsi_config_get(vdev, max_target) + 1; 884 num_targets = virtscsi_config_get(vdev, max_target) + 1;
692 shost = scsi_host_alloc(&virtscsi_host_template,
693 sizeof(*vscsi)
694 + num_targets * sizeof(struct virtio_scsi_target_state));
695 885
886 if (num_queues == 1)
887 hostt = &virtscsi_host_template_single;
888 else
889 hostt = &virtscsi_host_template_multi;
890
891 shost = scsi_host_alloc(hostt,
892 sizeof(*vscsi) + sizeof(vscsi->req_vqs[0]) * num_queues);
696 if (!shost) 893 if (!shost)
697 return -ENOMEM; 894 return -ENOMEM;
698 895
@@ -700,12 +897,20 @@ static int virtscsi_probe(struct virtio_device *vdev)
700 shost->sg_tablesize = sg_elems; 897 shost->sg_tablesize = sg_elems;
701 vscsi = shost_priv(shost); 898 vscsi = shost_priv(shost);
702 vscsi->vdev = vdev; 899 vscsi->vdev = vdev;
900 vscsi->num_queues = num_queues;
703 vdev->priv = shost; 901 vdev->priv = shost;
704 902
705 err = virtscsi_init(vdev, vscsi, num_targets); 903 err = virtscsi_init(vdev, vscsi);
706 if (err) 904 if (err)
707 goto virtscsi_init_failed; 905 goto virtscsi_init_failed;
708 906
907 vscsi->nb.notifier_call = &virtscsi_cpu_callback;
908 err = register_hotcpu_notifier(&vscsi->nb);
909 if (err) {
910 pr_err("registering cpu notifier failed\n");
911 goto scsi_add_host_failed;
912 }
913
709 cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1; 914 cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
710 shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue); 915 shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
711 shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF; 916 shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
@@ -743,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev)
743 948
744 scsi_remove_host(shost); 949 scsi_remove_host(shost);
745 950
951 unregister_hotcpu_notifier(&vscsi->nb);
952
746 virtscsi_remove_vqs(vdev); 953 virtscsi_remove_vqs(vdev);
747 scsi_host_put(shost); 954 scsi_host_put(shost);
748} 955}
@@ -759,7 +966,7 @@ static int virtscsi_restore(struct virtio_device *vdev)
759 struct Scsi_Host *sh = virtio_scsi_host(vdev); 966 struct Scsi_Host *sh = virtio_scsi_host(vdev);
760 struct virtio_scsi *vscsi = shost_priv(sh); 967 struct virtio_scsi *vscsi = shost_priv(sh);
761 968
762 return virtscsi_init(vdev, vscsi, sh->max_id); 969 return virtscsi_init(vdev, vscsi);
763} 970}
764#endif 971#endif
765 972
@@ -794,8 +1001,7 @@ static int __init init(void)
794 1001
795 virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0); 1002 virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0);
796 if (!virtscsi_cmd_cache) { 1003 if (!virtscsi_cmd_cache) {
797 printk(KERN_ERR "kmem_cache_create() for " 1004 pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n");
798 "virtscsi_cmd_cache failed\n");
799 goto error; 1005 goto error;
800 } 1006 }
801 1007
@@ -804,8 +1010,7 @@ static int __init init(void)
804 mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ, 1010 mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ,
805 virtscsi_cmd_cache); 1011 virtscsi_cmd_cache);
806 if (!virtscsi_cmd_pool) { 1012 if (!virtscsi_cmd_pool) {
807 printk(KERN_ERR "mempool_create() for" 1013 pr_err("mempool_create() for virtscsi_cmd_pool failed\n");
808 "virtscsi_cmd_pool failed\n");
809 goto error; 1014 goto error;
810 } 1015 }
811 ret = register_virtio_driver(&virtio_scsi_driver); 1016 ret = register_virtio_driver(&virtio_scsi_driver);
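For reference, the calling convention that virtscsi_add_cmd() now relies on (one terminated scatterlist per direction, readable lists before writable ones) can be sketched as follows. This is a minimal illustration only; the my_req/my_resp types and the example_add_request() helper are hypothetical and not part of this patch.

#include <linux/scatterlist.h>
#include <linux/virtio.h>

struct my_req  { u32 type; };   /* hypothetical device-readable header */
struct my_resp { u8 status; };  /* hypothetical device-writable status */

/* Queue one request: readable header (+ optional payload), then writable status. */
static int example_add_request(struct virtqueue *vq, struct my_req *req,
			       struct my_resp *resp, struct scatterlist *data,
			       gfp_t gfp)
{
	struct scatterlist hdr, status, *sgs[3];
	unsigned int out_num = 0, in_num = 0;

	sg_init_one(&hdr, req, sizeof(*req));
	sgs[out_num++] = &hdr;
	if (data)		/* 'data' must be a terminated scatterlist */
		sgs[out_num++] = data;

	sg_init_one(&status, resp, sizeof(*resp));
	sgs[out_num + in_num++] = &status;

	/* Readable lists are passed first, writable lists after them. */
	return virtqueue_add_sgs(vq, sgs, out_num, in_num, req, gfp);
}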
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 26a64e5b8a58..8b9226da3f54 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -1,6 +1,7 @@
1config VHOST_NET 1config VHOST_NET
2 tristate "Host kernel accelerator for virtio net" 2 tristate "Host kernel accelerator for virtio net"
3 depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP) 3 depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
4 select VHOST_RING
4 ---help--- 5 ---help---
5 This kernel module can be loaded in host kernel to accelerate 6 This kernel module can be loaded in host kernel to accelerate
6 guest networking with virtio_net. Not to be confused with virtio_net 7 guest networking with virtio_net. Not to be confused with virtio_net
@@ -12,7 +13,14 @@ config VHOST_NET
12config VHOST_SCSI 13config VHOST_SCSI
13 tristate "VHOST_SCSI TCM fabric driver" 14 tristate "VHOST_SCSI TCM fabric driver"
14 depends on TARGET_CORE && EVENTFD && m 15 depends on TARGET_CORE && EVENTFD && m
16 select VHOST_RING
15 default n 17 default n
16 ---help--- 18 ---help---
17 Say M here to enable the vhost_scsi TCM fabric module 19 Say M here to enable the vhost_scsi TCM fabric module
18 for use with virtio-scsi guests 20 for use with virtio-scsi guests
21
22config VHOST_RING
23 tristate
24 ---help---
25 This option is selected by any driver which needs to access
26 the host side of a virtio ring.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index ef21d5fdfa7d..654e9afb11f5 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -3,3 +3,5 @@ vhost_net-y := vhost.o net.o
3 3
4obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o 4obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
5vhost_scsi-y := scsi.o 5vhost_scsi-y := scsi.o
6
7obj-$(CONFIG_VHOST_RING) += vringh.o
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index be65414d5bb1..1ee45bc85f67 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -282,7 +282,9 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl,
282 return vhost_test_reset_owner(n); 282 return vhost_test_reset_owner(n);
283 default: 283 default:
284 mutex_lock(&n->dev.mutex); 284 mutex_lock(&n->dev.mutex);
285 r = vhost_dev_ioctl(&n->dev, ioctl, arg); 285 r = vhost_dev_ioctl(&n->dev, ioctl, argp);
286 if (r == -ENOIOCTLCMD)
287 r = vhost_vring_ioctl(&n->dev, ioctl, argp);
286 vhost_test_flush(n); 288 vhost_test_flush(n);
287 mutex_unlock(&n->dev.mutex); 289 mutex_unlock(&n->dev.mutex);
288 return r; 290 return r;
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
new file mode 100644
index 000000000000..bff0775e258c
--- /dev/null
+++ b/drivers/vhost/vringh.c
@@ -0,0 +1,1007 @@
1/*
2 * Helpers for the host side of a virtio ring.
3 *
4 * Since these may be in userspace, we use (inline) accessors.
5 */
6#include <linux/vringh.h>
7#include <linux/virtio_ring.h>
8#include <linux/kernel.h>
9#include <linux/ratelimit.h>
10#include <linux/uaccess.h>
11#include <linux/slab.h>
12#include <linux/export.h>
13
14static __printf(1,2) __cold void vringh_bad(const char *fmt, ...)
15{
16 static DEFINE_RATELIMIT_STATE(vringh_rs,
17 DEFAULT_RATELIMIT_INTERVAL,
18 DEFAULT_RATELIMIT_BURST);
19 if (__ratelimit(&vringh_rs)) {
20 va_list ap;
21 va_start(ap, fmt);
22 printk(KERN_NOTICE "vringh:");
23 vprintk(fmt, ap);
24 va_end(ap);
25 }
26}
27
28/* Returns vring->num if empty, -ve on error. */
29static inline int __vringh_get_head(const struct vringh *vrh,
30 int (*getu16)(u16 *val, const u16 *p),
31 u16 *last_avail_idx)
32{
33 u16 avail_idx, i, head;
34 int err;
35
36 err = getu16(&avail_idx, &vrh->vring.avail->idx);
37 if (err) {
38 vringh_bad("Failed to access avail idx at %p",
39 &vrh->vring.avail->idx);
40 return err;
41 }
42
43 if (*last_avail_idx == avail_idx)
44 return vrh->vring.num;
45
46 /* Only get avail ring entries after they have been exposed by guest. */
47 virtio_rmb(vrh->weak_barriers);
48
49 i = *last_avail_idx & (vrh->vring.num - 1);
50
51 err = getu16(&head, &vrh->vring.avail->ring[i]);
52 if (err) {
53 vringh_bad("Failed to read head: idx %d address %p",
54 *last_avail_idx, &vrh->vring.avail->ring[i]);
55 return err;
56 }
57
58 if (head >= vrh->vring.num) {
59 vringh_bad("Guest says index %u > %u is available",
60 head, vrh->vring.num);
61 return -EINVAL;
62 }
63
64 (*last_avail_idx)++;
65 return head;
66}
67
68/* Copy some bytes to/from the iovec. Returns num copied. */
69static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
70 void *ptr, size_t len,
71 int (*xfer)(void *addr, void *ptr,
72 size_t len))
73{
74 int err, done = 0;
75
76 while (len && iov->i < iov->used) {
77 size_t partlen;
78
79 partlen = min(iov->iov[iov->i].iov_len, len);
80 err = xfer(iov->iov[iov->i].iov_base, ptr, partlen);
81 if (err)
82 return err;
83 done += partlen;
84 len -= partlen;
85 ptr += partlen;
86 iov->consumed += partlen;
87 iov->iov[iov->i].iov_len -= partlen;
88 iov->iov[iov->i].iov_base += partlen;
89
90 if (!iov->iov[iov->i].iov_len) {
91 /* Fix up old iov element then increment. */
92 iov->iov[iov->i].iov_len = iov->consumed;
93 iov->iov[iov->i].iov_base -= iov->consumed;
94
95 iov->consumed = 0;
96 iov->i++;
97 }
98 }
99 return done;
100}
101
102/* May reduce *len if range is shorter. */
103static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len,
104 struct vringh_range *range,
105 bool (*getrange)(struct vringh *,
106 u64, struct vringh_range *))
107{
108 if (addr < range->start || addr > range->end_incl) {
109 if (!getrange(vrh, addr, range))
110 return false;
111 }
112 BUG_ON(addr < range->start || addr > range->end_incl);
113
114 /* To end of memory? */
115 if (unlikely(addr + *len == 0)) {
116 if (range->end_incl == -1ULL)
117 return true;
118 goto truncate;
119 }
120
121 /* Otherwise, don't wrap. */
122 if (addr + *len < addr) {
123 vringh_bad("Wrapping descriptor %zu@0x%llx",
124 *len, (unsigned long long)addr);
125 return false;
126 }
127
128 if (unlikely(addr + *len - 1 > range->end_incl))
129 goto truncate;
130 return true;
131
132truncate:
133 *len = range->end_incl + 1 - addr;
134 return true;
135}
136
137static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len,
138 struct vringh_range *range,
139 bool (*getrange)(struct vringh *,
140 u64, struct vringh_range *))
141{
142 return true;
143}
144
145/* No reason for this code to be inline. */
146static int move_to_indirect(int *up_next, u16 *i, void *addr,
147 const struct vring_desc *desc,
148 struct vring_desc **descs, int *desc_max)
149{
150 /* Indirect tables can't have indirect. */
151 if (*up_next != -1) {
152 vringh_bad("Multilevel indirect %u->%u", *up_next, *i);
153 return -EINVAL;
154 }
155
156 if (unlikely(desc->len % sizeof(struct vring_desc))) {
157 vringh_bad("Strange indirect len %u", desc->len);
158 return -EINVAL;
159 }
160
161 /* We will check this when we follow it! */
162 if (desc->flags & VRING_DESC_F_NEXT)
163 *up_next = desc->next;
164 else
165 *up_next = -2;
166 *descs = addr;
167 *desc_max = desc->len / sizeof(struct vring_desc);
168
169 /* Now, start at the first indirect. */
170 *i = 0;
171 return 0;
172}
173
174static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp)
175{
176 struct kvec *new;
177 unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2;
178
179 if (new_num < 8)
180 new_num = 8;
181
182 flag = (iov->max_num & VRINGH_IOV_ALLOCATED);
183 if (flag)
184 new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp);
185 else {
186 new = kmalloc(new_num * sizeof(struct iovec), gfp);
187 if (new) {
188 memcpy(new, iov->iov,
189 iov->max_num * sizeof(struct iovec));
190 flag = VRINGH_IOV_ALLOCATED;
191 }
192 }
193 if (!new)
194 return -ENOMEM;
195 iov->iov = new;
196 iov->max_num = (new_num | flag);
197 return 0;
198}
199
200static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next,
201 struct vring_desc **descs, int *desc_max)
202{
203 u16 i = *up_next;
204
205 *up_next = -1;
206 *descs = vrh->vring.desc;
207 *desc_max = vrh->vring.num;
208 return i;
209}
210
211static int slow_copy(struct vringh *vrh, void *dst, const void *src,
212 bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
213 struct vringh_range *range,
214 bool (*getrange)(struct vringh *vrh,
215 u64,
216 struct vringh_range *)),
217 bool (*getrange)(struct vringh *vrh,
218 u64 addr,
219 struct vringh_range *r),
220 struct vringh_range *range,
221 int (*copy)(void *dst, const void *src, size_t len))
222{
223 size_t part, len = sizeof(struct vring_desc);
224
225 do {
226 u64 addr;
227 int err;
228
229 part = len;
230 addr = (u64)(unsigned long)src - range->offset;
231
232 if (!rcheck(vrh, addr, &part, range, getrange))
233 return -EINVAL;
234
235 err = copy(dst, src, part);
236 if (err)
237 return err;
238
239 dst += part;
240 src += part;
241 len -= part;
242 } while (len);
243 return 0;
244}
245
246static inline int
247__vringh_iov(struct vringh *vrh, u16 i,
248 struct vringh_kiov *riov,
249 struct vringh_kiov *wiov,
250 bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
251 struct vringh_range *range,
252 bool (*getrange)(struct vringh *, u64,
253 struct vringh_range *)),
254 bool (*getrange)(struct vringh *, u64, struct vringh_range *),
255 gfp_t gfp,
256 int (*copy)(void *dst, const void *src, size_t len))
257{
258 int err, count = 0, up_next, desc_max;
259 struct vring_desc desc, *descs;
260 struct vringh_range range = { -1ULL, 0 }, slowrange;
261 bool slow = false;
262
263 /* We start traversing vring's descriptor table. */
264 descs = vrh->vring.desc;
265 desc_max = vrh->vring.num;
266 up_next = -1;
267
268 if (riov)
269 riov->i = riov->used = 0;
270 else if (wiov)
271 wiov->i = wiov->used = 0;
272 else
273 /* You must want something! */
274 BUG();
275
276 for (;;) {
277 void *addr;
278 struct vringh_kiov *iov;
279 size_t len;
280
281 if (unlikely(slow))
282 err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange,
283 &slowrange, copy);
284 else
285 err = copy(&desc, &descs[i], sizeof(desc));
286 if (unlikely(err))
287 goto fail;
288
289 if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
290 /* Make sure it's OK, and get offset. */
291 len = desc.len;
292 if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
293 err = -EINVAL;
294 goto fail;
295 }
296
297 if (unlikely(len != desc.len)) {
298 slow = true;
299 /* We need to save this range to use offset */
300 slowrange = range;
301 }
302
303 addr = (void *)(long)(desc.addr + range.offset);
304 err = move_to_indirect(&up_next, &i, addr, &desc,
305 &descs, &desc_max);
306 if (err)
307 goto fail;
308 continue;
309 }
310
311 if (count++ == vrh->vring.num) {
312 vringh_bad("Descriptor loop in %p", descs);
313 err = -ELOOP;
314 goto fail;
315 }
316
317 if (desc.flags & VRING_DESC_F_WRITE)
318 iov = wiov;
319 else {
320 iov = riov;
321 if (unlikely(wiov && wiov->i)) {
322 vringh_bad("Readable desc %p after writable",
323 &descs[i]);
324 err = -EINVAL;
325 goto fail;
326 }
327 }
328
329 if (!iov) {
330 vringh_bad("Unexpected %s desc",
331 !wiov ? "writable" : "readable");
332 err = -EPROTO;
333 goto fail;
334 }
335
336 again:
337 /* Make sure it's OK, and get offset. */
338 len = desc.len;
339 if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
340 err = -EINVAL;
341 goto fail;
342 }
343 addr = (void *)(unsigned long)(desc.addr + range.offset);
344
345 if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) {
346 err = resize_iovec(iov, gfp);
347 if (err)
348 goto fail;
349 }
350
351 iov->iov[iov->used].iov_base = addr;
352 iov->iov[iov->used].iov_len = len;
353 iov->used++;
354
355 if (unlikely(len != desc.len)) {
356 desc.len -= len;
357 desc.addr += len;
358 goto again;
359 }
360
361 if (desc.flags & VRING_DESC_F_NEXT) {
362 i = desc.next;
363 } else {
364 /* Just in case we need to finish traversing above. */
365 if (unlikely(up_next > 0)) {
366 i = return_from_indirect(vrh, &up_next,
367 &descs, &desc_max);
368 slow = false;
369 } else
370 break;
371 }
372
373 if (i >= desc_max) {
374 vringh_bad("Chained index %u > %u", i, desc_max);
375 err = -EINVAL;
376 goto fail;
377 }
378 }
379
380 return 0;
381
382fail:
383 return err;
384}
385
386static inline int __vringh_complete(struct vringh *vrh,
387 const struct vring_used_elem *used,
388 unsigned int num_used,
389 int (*putu16)(u16 *p, u16 val),
390 int (*putused)(struct vring_used_elem *dst,
391 const struct vring_used_elem
392 *src, unsigned num))
393{
394 struct vring_used *used_ring;
395 int err;
396 u16 used_idx, off;
397
398 used_ring = vrh->vring.used;
399 used_idx = vrh->last_used_idx + vrh->completed;
400
401 off = used_idx % vrh->vring.num;
402
403 /* Compiler knows num_used == 1 sometimes, hence extra check */
404 if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) {
405 u16 part = vrh->vring.num - off;
406 err = putused(&used_ring->ring[off], used, part);
407 if (!err)
408 err = putused(&used_ring->ring[0], used + part,
409 num_used - part);
410 } else
411 err = putused(&used_ring->ring[off], used, num_used);
412
413 if (err) {
414 vringh_bad("Failed to write %u used entries %u at %p",
415 num_used, off, &used_ring->ring[off]);
416 return err;
417 }
418
419 /* Make sure buffer is written before we update index. */
420 virtio_wmb(vrh->weak_barriers);
421
422 err = putu16(&vrh->vring.used->idx, used_idx + num_used);
423 if (err) {
424 vringh_bad("Failed to update used index at %p",
425 &vrh->vring.used->idx);
426 return err;
427 }
428
429 vrh->completed += num_used;
430 return 0;
431}
432
433
434static inline int __vringh_need_notify(struct vringh *vrh,
435 int (*getu16)(u16 *val, const u16 *p))
436{
437 bool notify;
438 u16 used_event;
439 int err;
440
441 /* Flush out used index update. This is paired with the
442 * barrier that the Guest executes when enabling
443 * interrupts. */
444 virtio_mb(vrh->weak_barriers);
445
446 /* Old-style, without event indices. */
447 if (!vrh->event_indices) {
448 u16 flags;
449 err = getu16(&flags, &vrh->vring.avail->flags);
450 if (err) {
451 vringh_bad("Failed to get flags at %p",
452 &vrh->vring.avail->flags);
453 return err;
454 }
455 return (!(flags & VRING_AVAIL_F_NO_INTERRUPT));
456 }
457
458 /* Modern: we know when other side wants to know. */
459 err = getu16(&used_event, &vring_used_event(&vrh->vring));
460 if (err) {
461 vringh_bad("Failed to get used event idx at %p",
462 &vring_used_event(&vrh->vring));
463 return err;
464 }
465
466 /* Just in case we added so many that we wrap. */
467 if (unlikely(vrh->completed > 0xffff))
468 notify = true;
469 else
470 notify = vring_need_event(used_event,
471 vrh->last_used_idx + vrh->completed,
472 vrh->last_used_idx);
473
474 vrh->last_used_idx += vrh->completed;
475 vrh->completed = 0;
476 return notify;
477}
478
479static inline bool __vringh_notify_enable(struct vringh *vrh,
480 int (*getu16)(u16 *val, const u16 *p),
481 int (*putu16)(u16 *p, u16 val))
482{
483 u16 avail;
484
485 if (!vrh->event_indices) {
486 /* Old-school; update flags. */
487 if (putu16(&vrh->vring.used->flags, 0) != 0) {
488 vringh_bad("Clearing used flags %p",
489 &vrh->vring.used->flags);
490 return true;
491 }
492 } else {
493 if (putu16(&vring_avail_event(&vrh->vring),
494 vrh->last_avail_idx) != 0) {
495 vringh_bad("Updating avail event index %p",
496 &vring_avail_event(&vrh->vring));
497 return true;
498 }
499 }
500
501 /* They could have slipped one in as we were doing that: make
502 * sure it's written, then check again. */
503 virtio_mb(vrh->weak_barriers);
504
505 if (getu16(&avail, &vrh->vring.avail->idx) != 0) {
506 vringh_bad("Failed to check avail idx at %p",
507 &vrh->vring.avail->idx);
508 return true;
509 }
510
511 /* This is unlikely, so we just leave notifications enabled
512 * (if we're using event_indices, we'll only get one
513 * notification anyway). */
514 return avail == vrh->last_avail_idx;
515}
516
517static inline void __vringh_notify_disable(struct vringh *vrh,
518 int (*putu16)(u16 *p, u16 val))
519{
520 if (!vrh->event_indices) {
521 /* Old-school; update flags. */
522 if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY)) {
523 vringh_bad("Setting used flags %p",
524 &vrh->vring.used->flags);
525 }
526 }
527}
528
529/* Userspace access helpers: in this case, addresses are really userspace. */
530static inline int getu16_user(u16 *val, const u16 *p)
531{
532 return get_user(*val, (__force u16 __user *)p);
533}
534
535static inline int putu16_user(u16 *p, u16 val)
536{
537 return put_user(val, (__force u16 __user *)p);
538}
539
540static inline int copydesc_user(void *dst, const void *src, size_t len)
541{
542 return copy_from_user(dst, (__force void __user *)src, len) ?
543 -EFAULT : 0;
544}
545
546static inline int putused_user(struct vring_used_elem *dst,
547 const struct vring_used_elem *src,
548 unsigned int num)
549{
550 return copy_to_user((__force void __user *)dst, src,
551 sizeof(*dst) * num) ? -EFAULT : 0;
552}
553
554static inline int xfer_from_user(void *src, void *dst, size_t len)
555{
556 return copy_from_user(dst, (__force void __user *)src, len) ?
557 -EFAULT : 0;
558}
559
560static inline int xfer_to_user(void *dst, void *src, size_t len)
561{
562 return copy_to_user((__force void __user *)dst, src, len) ?
563 -EFAULT : 0;
564}
565
566/**
567 * vringh_init_user - initialize a vringh for a userspace vring.
568 * @vrh: the vringh to initialize.
569 * @features: the feature bits for this ring.
570 * @num: the number of elements.
571 * @weak_barriers: true if we only need memory barriers, not I/O.
 572 * @desc: the userspace descriptor pointer.
 573 * @avail: the userspace avail pointer.
 574 * @used: the userspace used pointer.
575 *
576 * Returns an error if num is invalid: you should check pointers
577 * yourself!
578 */
579int vringh_init_user(struct vringh *vrh, u32 features,
580 unsigned int num, bool weak_barriers,
581 struct vring_desc __user *desc,
582 struct vring_avail __user *avail,
583 struct vring_used __user *used)
584{
585 /* Sane power of 2 please! */
586 if (!num || num > 0xffff || (num & (num - 1))) {
587 vringh_bad("Bad ring size %u", num);
588 return -EINVAL;
589 }
590
591 vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
592 vrh->weak_barriers = weak_barriers;
593 vrh->completed = 0;
594 vrh->last_avail_idx = 0;
595 vrh->last_used_idx = 0;
596 vrh->vring.num = num;
597 /* vring expects kernel addresses, but only used via accessors. */
598 vrh->vring.desc = (__force struct vring_desc *)desc;
599 vrh->vring.avail = (__force struct vring_avail *)avail;
600 vrh->vring.used = (__force struct vring_used *)used;
601 return 0;
602}
603EXPORT_SYMBOL(vringh_init_user);
604
605/**
606 * vringh_getdesc_user - get next available descriptor from userspace ring.
607 * @vrh: the userspace vring.
608 * @riov: where to put the readable descriptors (or NULL)
609 * @wiov: where to put the writable descriptors (or NULL)
610 * @getrange: function to call to check ranges.
611 * @head: head index we received, for passing to vringh_complete_user().
612 *
613 * Returns 0 if there was no descriptor, 1 if there was, or -errno.
614 *
615 * Note that on error return, you can tell the difference between an
616 * invalid ring and a single invalid descriptor: in the former case,
617 * *head will be vrh->vring.num. You may be able to ignore an invalid
618 * descriptor, but there's not much you can do with an invalid ring.
619 *
620 * Note that you may need to clean up riov and wiov, even on error!
621 */
622int vringh_getdesc_user(struct vringh *vrh,
623 struct vringh_iov *riov,
624 struct vringh_iov *wiov,
625 bool (*getrange)(struct vringh *vrh,
626 u64 addr, struct vringh_range *r),
627 u16 *head)
628{
629 int err;
630
631 *head = vrh->vring.num;
632 err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx);
633 if (err < 0)
634 return err;
635
636 /* Empty... */
637 if (err == vrh->vring.num)
638 return 0;
639
 640 /* We need the layouts to be identical for this to work */
641 BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov));
642 BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) !=
643 offsetof(struct vringh_iov, iov));
644 BUILD_BUG_ON(offsetof(struct vringh_kiov, i) !=
645 offsetof(struct vringh_iov, i));
646 BUILD_BUG_ON(offsetof(struct vringh_kiov, used) !=
647 offsetof(struct vringh_iov, used));
648 BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) !=
649 offsetof(struct vringh_iov, max_num));
650 BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
651 BUILD_BUG_ON(offsetof(struct iovec, iov_base) !=
652 offsetof(struct kvec, iov_base));
653 BUILD_BUG_ON(offsetof(struct iovec, iov_len) !=
654 offsetof(struct kvec, iov_len));
655 BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base)
656 != sizeof(((struct kvec *)NULL)->iov_base));
657 BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len)
658 != sizeof(((struct kvec *)NULL)->iov_len));
659
660 *head = err;
661 err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov,
662 (struct vringh_kiov *)wiov,
663 range_check, getrange, GFP_KERNEL, copydesc_user);
664 if (err)
665 return err;
666
667 return 1;
668}
669EXPORT_SYMBOL(vringh_getdesc_user);
670
671/**
672 * vringh_iov_pull_user - copy bytes from vring_iov.
673 * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume)
674 * @dst: the place to copy.
675 * @len: the maximum length to copy.
676 *
677 * Returns the bytes copied <= len or a negative errno.
678 */
679ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len)
680{
681 return vringh_iov_xfer((struct vringh_kiov *)riov,
682 dst, len, xfer_from_user);
683}
684EXPORT_SYMBOL(vringh_iov_pull_user);
685
686/**
687 * vringh_iov_push_user - copy bytes into vring_iov.
688 * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume)
 689 * @src: the place to copy from.
690 * @len: the maximum length to copy.
691 *
692 * Returns the bytes copied <= len or a negative errno.
693 */
694ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
695 const void *src, size_t len)
696{
697 return vringh_iov_xfer((struct vringh_kiov *)wiov,
698 (void *)src, len, xfer_to_user);
699}
700EXPORT_SYMBOL(vringh_iov_push_user);
701
702/**
703 * vringh_abandon_user - we've decided not to handle the descriptor(s).
704 * @vrh: the vring.
 705 * @num: the number of descriptors to put back (i.e. the number of
 706 * vringh_getdesc_user() calls to undo).
 707 *
 708 * The next vringh_getdesc_user() will return the old descriptor(s) again.
709 */
710void vringh_abandon_user(struct vringh *vrh, unsigned int num)
711{
712 /* We only update vring_avail_event(vr) when we want to be notified,
713 * so we haven't changed that yet. */
714 vrh->last_avail_idx -= num;
715}
716EXPORT_SYMBOL(vringh_abandon_user);
717
718/**
719 * vringh_complete_user - we've finished with descriptor, publish it.
720 * @vrh: the vring.
721 * @head: the head as filled in by vringh_getdesc_user.
722 * @len: the length of data we have written.
723 *
724 * You should check vringh_need_notify_user() after one or more calls
725 * to this function.
726 */
727int vringh_complete_user(struct vringh *vrh, u16 head, u32 len)
728{
729 struct vring_used_elem used;
730
731 used.id = head;
732 used.len = len;
733 return __vringh_complete(vrh, &used, 1, putu16_user, putused_user);
734}
735EXPORT_SYMBOL(vringh_complete_user);
736
737/**
738 * vringh_complete_multi_user - we've finished with many descriptors.
739 * @vrh: the vring.
740 * @used: the head, length pairs.
741 * @num_used: the number of used elements.
742 *
743 * You should check vringh_need_notify_user() after one or more calls
744 * to this function.
745 */
746int vringh_complete_multi_user(struct vringh *vrh,
747 const struct vring_used_elem used[],
748 unsigned num_used)
749{
750 return __vringh_complete(vrh, used, num_used,
751 putu16_user, putused_user);
752}
753EXPORT_SYMBOL(vringh_complete_multi_user);
754
755/**
756 * vringh_notify_enable_user - we want to know if something changes.
757 * @vrh: the vring.
758 *
759 * This always enables notifications, but returns false if there are
760 * now more buffers available in the vring.
761 */
762bool vringh_notify_enable_user(struct vringh *vrh)
763{
764 return __vringh_notify_enable(vrh, getu16_user, putu16_user);
765}
766EXPORT_SYMBOL(vringh_notify_enable_user);
767
768/**
769 * vringh_notify_disable_user - don't tell us if something changes.
770 * @vrh: the vring.
771 *
772 * This is our normal running state: we disable and then only enable when
773 * we're going to sleep.
774 */
775void vringh_notify_disable_user(struct vringh *vrh)
776{
777 __vringh_notify_disable(vrh, putu16_user);
778}
779EXPORT_SYMBOL(vringh_notify_disable_user);
780
781/**
782 * vringh_need_notify_user - must we tell the other side about used buffers?
783 * @vrh: the vring we've called vringh_complete_user() on.
784 *
785 * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
786 */
787int vringh_need_notify_user(struct vringh *vrh)
788{
789 return __vringh_need_notify(vrh, getu16_user);
790}
791EXPORT_SYMBOL(vringh_need_notify_user);
792
793/* Kernelspace access helpers. */
794static inline int getu16_kern(u16 *val, const u16 *p)
795{
796 *val = ACCESS_ONCE(*p);
797 return 0;
798}
799
800static inline int putu16_kern(u16 *p, u16 val)
801{
802 ACCESS_ONCE(*p) = val;
803 return 0;
804}
805
806static inline int copydesc_kern(void *dst, const void *src, size_t len)
807{
808 memcpy(dst, src, len);
809 return 0;
810}
811
812static inline int putused_kern(struct vring_used_elem *dst,
813 const struct vring_used_elem *src,
814 unsigned int num)
815{
816 memcpy(dst, src, num * sizeof(*dst));
817 return 0;
818}
819
820static inline int xfer_kern(void *src, void *dst, size_t len)
821{
822 memcpy(dst, src, len);
823 return 0;
824}
825
826/**
827 * vringh_init_kern - initialize a vringh for a kernelspace vring.
828 * @vrh: the vringh to initialize.
829 * @features: the feature bits for this ring.
830 * @num: the number of elements.
831 * @weak_barriers: true if we only need memory barriers, not I/O.
 832 * @desc: the kernelspace descriptor pointer.
 833 * @avail: the kernelspace avail pointer.
 834 * @used: the kernelspace used pointer.
835 *
836 * Returns an error if num is invalid.
837 */
838int vringh_init_kern(struct vringh *vrh, u32 features,
839 unsigned int num, bool weak_barriers,
840 struct vring_desc *desc,
841 struct vring_avail *avail,
842 struct vring_used *used)
843{
844 /* Sane power of 2 please! */
845 if (!num || num > 0xffff || (num & (num - 1))) {
846 vringh_bad("Bad ring size %u", num);
847 return -EINVAL;
848 }
849
850 vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
851 vrh->weak_barriers = weak_barriers;
852 vrh->completed = 0;
853 vrh->last_avail_idx = 0;
854 vrh->last_used_idx = 0;
855 vrh->vring.num = num;
856 vrh->vring.desc = desc;
857 vrh->vring.avail = avail;
858 vrh->vring.used = used;
859 return 0;
860}
861EXPORT_SYMBOL(vringh_init_kern);
862
863/**
864 * vringh_getdesc_kern - get next available descriptor from kernelspace ring.
865 * @vrh: the kernelspace vring.
866 * @riov: where to put the readable descriptors (or NULL)
867 * @wiov: where to put the writable descriptors (or NULL)
868 * @head: head index we received, for passing to vringh_complete_kern().
869 * @gfp: flags for allocating larger riov/wiov.
870 *
871 * Returns 0 if there was no descriptor, 1 if there was, or -errno.
872 *
873 * Note that on error return, you can tell the difference between an
874 * invalid ring and a single invalid descriptor: in the former case,
875 * *head will be vrh->vring.num. You may be able to ignore an invalid
876 * descriptor, but there's not much you can do with an invalid ring.
877 *
878 * Note that you may need to clean up riov and wiov, even on error!
879 */
880int vringh_getdesc_kern(struct vringh *vrh,
881 struct vringh_kiov *riov,
882 struct vringh_kiov *wiov,
883 u16 *head,
884 gfp_t gfp)
885{
886 int err;
887
888 err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx);
889 if (err < 0)
890 return err;
891
892 /* Empty... */
893 if (err == vrh->vring.num)
894 return 0;
895
896 *head = err;
897 err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL,
898 gfp, copydesc_kern);
899 if (err)
900 return err;
901
902 return 1;
903}
904EXPORT_SYMBOL(vringh_getdesc_kern);
905
906/**
907 * vringh_iov_pull_kern - copy bytes from vring_iov.
908 * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume)
909 * @dst: the place to copy.
910 * @len: the maximum length to copy.
911 *
912 * Returns the bytes copied <= len or a negative errno.
913 */
914ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
915{
916 return vringh_iov_xfer(riov, dst, len, xfer_kern);
917}
918EXPORT_SYMBOL(vringh_iov_pull_kern);
919
920/**
921 * vringh_iov_push_kern - copy bytes into vring_iov.
922 * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
 923 * @src: the place to copy from.
924 * @len: the maximum length to copy.
925 *
926 * Returns the bytes copied <= len or a negative errno.
927 */
928ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
929 const void *src, size_t len)
930{
931 return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern);
932}
933EXPORT_SYMBOL(vringh_iov_push_kern);
934
935/**
936 * vringh_abandon_kern - we've decided not to handle the descriptor(s).
937 * @vrh: the vring.
 938 * @num: the number of descriptors to put back (i.e. the number of
 939 * vringh_getdesc_kern() calls to undo).
 940 *
 941 * The next vringh_getdesc_kern() will return the old descriptor(s) again.
942 */
943void vringh_abandon_kern(struct vringh *vrh, unsigned int num)
944{
945 /* We only update vring_avail_event(vr) when we want to be notified,
946 * so we haven't changed that yet. */
947 vrh->last_avail_idx -= num;
948}
949EXPORT_SYMBOL(vringh_abandon_kern);
950
951/**
952 * vringh_complete_kern - we've finished with descriptor, publish it.
953 * @vrh: the vring.
954 * @head: the head as filled in by vringh_getdesc_kern.
955 * @len: the length of data we have written.
956 *
957 * You should check vringh_need_notify_kern() after one or more calls
958 * to this function.
959 */
960int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len)
961{
962 struct vring_used_elem used;
963
964 used.id = head;
965 used.len = len;
966
967 return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern);
968}
969EXPORT_SYMBOL(vringh_complete_kern);
970
971/**
972 * vringh_notify_enable_kern - we want to know if something changes.
973 * @vrh: the vring.
974 *
975 * This always enables notifications, but returns false if there are
976 * now more buffers available in the vring.
977 */
978bool vringh_notify_enable_kern(struct vringh *vrh)
979{
980 return __vringh_notify_enable(vrh, getu16_kern, putu16_kern);
981}
982EXPORT_SYMBOL(vringh_notify_enable_kern);
983
984/**
985 * vringh_notify_disable_kern - don't tell us if something changes.
986 * @vrh: the vring.
987 *
988 * This is our normal running state: we disable and then only enable when
989 * we're going to sleep.
990 */
991void vringh_notify_disable_kern(struct vringh *vrh)
992{
993 __vringh_notify_disable(vrh, putu16_kern);
994}
995EXPORT_SYMBOL(vringh_notify_disable_kern);
996
997/**
998 * vringh_need_notify_kern - must we tell the other side about used buffers?
999 * @vrh: the vring we've called vringh_complete_kern() on.
1000 *
1001 * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
1002 */
1003int vringh_need_notify_kern(struct vringh *vrh)
1004{
1005 return __vringh_need_notify(vrh, getu16_kern);
1006}
1007EXPORT_SYMBOL(vringh_need_notify_kern);
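Taken together, the kernel-side half of this API is meant to be driven by a loop of roughly the following shape (the CAIF virtio driver added in this series is the in-tree user). This is a hedged sketch: example_service_ring(), the notify_guest() hook and the empty payload handling are placeholders, and the _user variants follow the same pattern with an extra getrange callback for validating guest addresses.

#include <linux/vringh.h>

/* Drain the available ring, completing each descriptor chain we consume. */
static void example_service_ring(struct vringh *vrh,
				 struct vringh_kiov *riov,
				 struct vringh_kiov *wiov,
				 void (*notify_guest)(void *), void *priv)
{
	u16 head;
	int err;

	for (;;) {
		err = vringh_getdesc_kern(vrh, riov, wiov, &head, GFP_ATOMIC);
		if (err <= 0)	/* 0: ring empty, <0: error */
			break;

		/* A real consumer would vringh_iov_pull_kern() the request
		 * out of riov and vringh_iov_push_kern() a reply into wiov. */

		if (vringh_complete_kern(vrh, head, 0 /* bytes written */))
			break;
	}

	/* Only interrupt the guest if it asked for it. */
	if (vringh_need_notify_kern(vrh) > 0)
		notify_guest(priv);
}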
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8dab163c5ef0..bd3ae324a1a2 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -108,7 +108,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
108 sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); 108 sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
109 109
110 /* We should always be able to add one buffer to an empty queue. */ 110 /* We should always be able to add one buffer to an empty queue. */
111 if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) 111 if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
112 BUG(); 112 BUG();
113 virtqueue_kick(vq); 113 virtqueue_kick(vq);
114 114
@@ -256,7 +256,7 @@ static void stats_handle_request(struct virtio_balloon *vb)
256 if (!virtqueue_get_buf(vq, &len)) 256 if (!virtqueue_get_buf(vq, &len))
257 return; 257 return;
258 sg_init_one(&sg, vb->stats, sizeof(vb->stats)); 258 sg_init_one(&sg, vb->stats, sizeof(vb->stats));
259 if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) 259 if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
260 BUG(); 260 BUG();
261 virtqueue_kick(vq); 261 virtqueue_kick(vq);
262} 262}
@@ -341,7 +341,7 @@ static int init_vqs(struct virtio_balloon *vb)
341 * use it to signal us later. 341 * use it to signal us later.
342 */ 342 */
343 sg_init_one(&sg, vb->stats, sizeof vb->stats); 343 sg_init_one(&sg, vb->stats, sizeof vb->stats);
344 if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL) 344 if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
345 < 0) 345 < 0)
346 BUG(); 346 BUG();
347 virtqueue_kick(vb->stats_vq); 347 virtqueue_kick(vb->stats_vq);
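The balloon conversion above is the simplest case of the new helpers: a single device-readable buffer posted with virtqueue_add_outbuf(). A minimal sketch of that pattern, with a hypothetical example_send() wrapper, looks like this:

#include <linux/scatterlist.h>
#include <linux/virtio.h>

/* Post one device-readable buffer and kick the device. */
static int example_send(struct virtqueue *vq, void *buf, size_t len, void *token)
{
	struct scatterlist sg;

	sg_init_one(&sg, buf, len);
	if (virtqueue_add_outbuf(vq, &sg, 1, token, GFP_KERNEL) < 0)
		return -ENOSPC;
	virtqueue_kick(vq);
	return 0;
}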
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index ffd7e7da5d3b..5217baf5528c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -24,27 +24,6 @@
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/hrtimer.h> 25#include <linux/hrtimer.h>
26 26
27/* virtio guest is communicating with a virtual "device" that actually runs on
28 * a host processor. Memory barriers are used to control SMP effects. */
29#ifdef CONFIG_SMP
30/* Where possible, use SMP barriers which are more lightweight than mandatory
31 * barriers, because mandatory barriers control MMIO effects on accesses
32 * through relaxed memory I/O windows (which virtio-pci does not use). */
33#define virtio_mb(vq) \
34 do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0)
35#define virtio_rmb(vq) \
36 do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0)
37#define virtio_wmb(vq) \
38 do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0)
39#else
40/* We must force memory ordering even if guest is UP since host could be
41 * running on another CPU, but SMP barriers are defined to barrier() in that
42 * configuration. So fall back to mandatory barriers instead. */
43#define virtio_mb(vq) mb()
44#define virtio_rmb(vq) rmb()
45#define virtio_wmb(vq) wmb()
46#endif
47
48#ifdef DEBUG 27#ifdef DEBUG
49/* For development, we want to crash whenever the ring is screwed. */ 28/* For development, we want to crash whenever the ring is screwed. */
50#define BAD_RING(_vq, fmt, args...) \ 29#define BAD_RING(_vq, fmt, args...) \
@@ -119,16 +98,36 @@ struct vring_virtqueue
119 98
120#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) 99#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
121 100
101static inline struct scatterlist *sg_next_chained(struct scatterlist *sg,
102 unsigned int *count)
103{
104 return sg_next(sg);
105}
106
107static inline struct scatterlist *sg_next_arr(struct scatterlist *sg,
108 unsigned int *count)
109{
110 if (--(*count) == 0)
111 return NULL;
112 return sg + 1;
113}
114
122/* Set up an indirect table of descriptors and add it to the queue. */ 115/* Set up an indirect table of descriptors and add it to the queue. */
123static int vring_add_indirect(struct vring_virtqueue *vq, 116static inline int vring_add_indirect(struct vring_virtqueue *vq,
124 struct scatterlist sg[], 117 struct scatterlist *sgs[],
125 unsigned int out, 118 struct scatterlist *(*next)
126 unsigned int in, 119 (struct scatterlist *, unsigned int *),
127 gfp_t gfp) 120 unsigned int total_sg,
121 unsigned int total_out,
122 unsigned int total_in,
123 unsigned int out_sgs,
124 unsigned int in_sgs,
125 gfp_t gfp)
128{ 126{
129 struct vring_desc *desc; 127 struct vring_desc *desc;
130 unsigned head; 128 unsigned head;
131 int i; 129 struct scatterlist *sg;
130 int i, n;
132 131
133 /* 132 /*
134 * We require lowmem mappings for the descriptors because 133 * We require lowmem mappings for the descriptors because
@@ -137,25 +136,31 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
137 */ 136 */
138 gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH); 137 gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH);
139 138
140 desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp); 139 desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
141 if (!desc) 140 if (!desc)
142 return -ENOMEM; 141 return -ENOMEM;
143 142
144 /* Transfer entries from the sg list into the indirect page */ 143 /* Transfer entries from the sg lists into the indirect page */
145 for (i = 0; i < out; i++) { 144 i = 0;
146 desc[i].flags = VRING_DESC_F_NEXT; 145 for (n = 0; n < out_sgs; n++) {
147 desc[i].addr = sg_phys(sg); 146 for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
148 desc[i].len = sg->length; 147 desc[i].flags = VRING_DESC_F_NEXT;
149 desc[i].next = i+1; 148 desc[i].addr = sg_phys(sg);
150 sg++; 149 desc[i].len = sg->length;
150 desc[i].next = i+1;
151 i++;
152 }
151 } 153 }
152 for (; i < (out + in); i++) { 154 for (; n < (out_sgs + in_sgs); n++) {
153 desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; 155 for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
154 desc[i].addr = sg_phys(sg); 156 desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
155 desc[i].len = sg->length; 157 desc[i].addr = sg_phys(sg);
156 desc[i].next = i+1; 158 desc[i].len = sg->length;
157 sg++; 159 desc[i].next = i+1;
160 i++;
161 }
158 } 162 }
163 BUG_ON(i != total_sg);
159 164
160 /* Last one doesn't continue. */ 165 /* Last one doesn't continue. */
161 desc[i-1].flags &= ~VRING_DESC_F_NEXT; 166 desc[i-1].flags &= ~VRING_DESC_F_NEXT;
@@ -176,29 +181,20 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
176 return head; 181 return head;
177} 182}
178 183
179/** 184static inline int virtqueue_add(struct virtqueue *_vq,
180 * virtqueue_add_buf - expose buffer to other end 185 struct scatterlist *sgs[],
181 * @vq: the struct virtqueue we're talking about. 186 struct scatterlist *(*next)
182 * @sg: the description of the buffer(s). 187 (struct scatterlist *, unsigned int *),
183 * @out_num: the number of sg readable by other side 188 unsigned int total_out,
184 * @in_num: the number of sg which are writable (after readable ones) 189 unsigned int total_in,
185 * @data: the token identifying the buffer. 190 unsigned int out_sgs,
186 * @gfp: how to do memory allocations (if necessary). 191 unsigned int in_sgs,
187 * 192 void *data,
188 * Caller must ensure we don't call this with other virtqueue operations 193 gfp_t gfp)
189 * at the same time (except where noted).
190 *
191 * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
192 */
193int virtqueue_add_buf(struct virtqueue *_vq,
194 struct scatterlist sg[],
195 unsigned int out,
196 unsigned int in,
197 void *data,
198 gfp_t gfp)
199{ 194{
200 struct vring_virtqueue *vq = to_vvq(_vq); 195 struct vring_virtqueue *vq = to_vvq(_vq);
201 unsigned int i, avail, uninitialized_var(prev); 196 struct scatterlist *sg;
197 unsigned int i, n, avail, uninitialized_var(prev), total_sg;
202 int head; 198 int head;
203 199
204 START_USE(vq); 200 START_USE(vq);
@@ -218,46 +214,54 @@ int virtqueue_add_buf(struct virtqueue *_vq,
218 } 214 }
219#endif 215#endif
220 216
217 total_sg = total_in + total_out;
218
221 /* If the host supports indirect descriptor tables, and we have multiple 219 /* If the host supports indirect descriptor tables, and we have multiple
222 * buffers, then go indirect. FIXME: tune this threshold */ 220 * buffers, then go indirect. FIXME: tune this threshold */
223 if (vq->indirect && (out + in) > 1 && vq->vq.num_free) { 221 if (vq->indirect && total_sg > 1 && vq->vq.num_free) {
224 head = vring_add_indirect(vq, sg, out, in, gfp); 222 head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
223 total_in,
224 out_sgs, in_sgs, gfp);
225 if (likely(head >= 0)) 225 if (likely(head >= 0))
226 goto add_head; 226 goto add_head;
227 } 227 }
228 228
229 BUG_ON(out + in > vq->vring.num); 229 BUG_ON(total_sg > vq->vring.num);
230 BUG_ON(out + in == 0); 230 BUG_ON(total_sg == 0);
231 231
232 if (vq->vq.num_free < out + in) { 232 if (vq->vq.num_free < total_sg) {
233 pr_debug("Can't add buf len %i - avail = %i\n", 233 pr_debug("Can't add buf len %i - avail = %i\n",
234 out + in, vq->vq.num_free); 234 total_sg, vq->vq.num_free);
235 /* FIXME: for historical reasons, we force a notify here if 235 /* FIXME: for historical reasons, we force a notify here if
236 * there are outgoing parts to the buffer. Presumably the 236 * there are outgoing parts to the buffer. Presumably the
237 * host should service the ring ASAP. */ 237 * host should service the ring ASAP. */
238 if (out) 238 if (out_sgs)
239 vq->notify(&vq->vq); 239 vq->notify(&vq->vq);
240 END_USE(vq); 240 END_USE(vq);
241 return -ENOSPC; 241 return -ENOSPC;
242 } 242 }
243 243
244 /* We're about to use some buffers from the free list. */ 244 /* We're about to use some buffers from the free list. */
245 vq->vq.num_free -= out + in; 245 vq->vq.num_free -= total_sg;
246 246
247 head = vq->free_head; 247 head = i = vq->free_head;
248 for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { 248 for (n = 0; n < out_sgs; n++) {
249 vq->vring.desc[i].flags = VRING_DESC_F_NEXT; 249 for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
250 vq->vring.desc[i].addr = sg_phys(sg); 250 vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
251 vq->vring.desc[i].len = sg->length; 251 vq->vring.desc[i].addr = sg_phys(sg);
252 prev = i; 252 vq->vring.desc[i].len = sg->length;
253 sg++; 253 prev = i;
254 i = vq->vring.desc[i].next;
255 }
254 } 256 }
255 for (; in; i = vq->vring.desc[i].next, in--) { 257 for (; n < (out_sgs + in_sgs); n++) {
256 vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; 258 for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
257 vq->vring.desc[i].addr = sg_phys(sg); 259 vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
258 vq->vring.desc[i].len = sg->length; 260 vq->vring.desc[i].addr = sg_phys(sg);
259 prev = i; 261 vq->vring.desc[i].len = sg->length;
260 sg++; 262 prev = i;
263 i = vq->vring.desc[i].next;
264 }
261 } 265 }
262 /* Last one doesn't continue. */ 266 /* Last one doesn't continue. */
263 vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT; 267 vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
@@ -276,7 +280,7 @@ add_head:
276 280
277 /* Descriptors and available array need to be set before we expose the 281 /* Descriptors and available array need to be set before we expose the
278 * new available array entries. */ 282 * new available array entries. */
279 virtio_wmb(vq); 283 virtio_wmb(vq->weak_barriers);
280 vq->vring.avail->idx++; 284 vq->vring.avail->idx++;
281 vq->num_added++; 285 vq->num_added++;
282 286
@@ -290,9 +294,122 @@ add_head:
290 294
291 return 0; 295 return 0;
292} 296}
297
298/**
299 * virtqueue_add_buf - expose buffer to other end
300 * @vq: the struct virtqueue we're talking about.
301 * @sg: the description of the buffer(s).
 302 * @out: the number of sg entries readable by the other side
 303 * @in: the number of sg entries which are writable (after the readable ones)
304 * @data: the token identifying the buffer.
305 * @gfp: how to do memory allocations (if necessary).
306 *
307 * Caller must ensure we don't call this with other virtqueue operations
308 * at the same time (except where noted).
309 *
310 * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
311 */
312int virtqueue_add_buf(struct virtqueue *_vq,
313 struct scatterlist sg[],
314 unsigned int out,
315 unsigned int in,
316 void *data,
317 gfp_t gfp)
318{
319 struct scatterlist *sgs[2];
320
321 sgs[0] = sg;
322 sgs[1] = sg + out;
323
324 return virtqueue_add(_vq, sgs, sg_next_arr,
325 out, in, out ? 1 : 0, in ? 1 : 0, data, gfp);
326}
293EXPORT_SYMBOL_GPL(virtqueue_add_buf); 327EXPORT_SYMBOL_GPL(virtqueue_add_buf);
294 328
295/** 329/**
330 * virtqueue_add_sgs - expose buffers to other end
331 * @vq: the struct virtqueue we're talking about.
332 * @sgs: array of terminated scatterlists.
 333 * @out_sgs: the number of scatterlists readable by the other side
 334 * @in_sgs: the number of scatterlists which are writable (after the readable ones)
335 * @data: the token identifying the buffer.
336 * @gfp: how to do memory allocations (if necessary).
337 *
338 * Caller must ensure we don't call this with other virtqueue operations
339 * at the same time (except where noted).
340 *
341 * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
342 */
343int virtqueue_add_sgs(struct virtqueue *_vq,
344 struct scatterlist *sgs[],
345 unsigned int out_sgs,
346 unsigned int in_sgs,
347 void *data,
348 gfp_t gfp)
349{
350 unsigned int i, total_out, total_in;
351
352 /* Count them first. */
353 for (i = total_out = total_in = 0; i < out_sgs; i++) {
354 struct scatterlist *sg;
355 for (sg = sgs[i]; sg; sg = sg_next(sg))
356 total_out++;
357 }
358 for (; i < out_sgs + in_sgs; i++) {
359 struct scatterlist *sg;
360 for (sg = sgs[i]; sg; sg = sg_next(sg))
361 total_in++;
362 }
363 return virtqueue_add(_vq, sgs, sg_next_chained,
364 total_out, total_in, out_sgs, in_sgs, data, gfp);
365}
366EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
367
368/**
369 * virtqueue_add_outbuf - expose output buffers to other end
370 * @vq: the struct virtqueue we're talking about.
 371 * @sg: scatterlist array (need not be terminated!)
 372 * @num: the number of entries in @sg readable by the other side
373 * @data: the token identifying the buffer.
374 * @gfp: how to do memory allocations (if necessary).
375 *
376 * Caller must ensure we don't call this with other virtqueue operations
377 * at the same time (except where noted).
378 *
379 * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
380 */
381int virtqueue_add_outbuf(struct virtqueue *vq,
382 struct scatterlist sg[], unsigned int num,
383 void *data,
384 gfp_t gfp)
385{
386 return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp);
387}
388EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
389
390/**
391 * virtqueue_add_inbuf - expose input buffers to other end
392 * @vq: the struct virtqueue we're talking about.
 393 * @sg: scatterlist array (need not be terminated!)
 394 * @num: the number of entries in @sg writable by the other side
395 * @data: the token identifying the buffer.
396 * @gfp: how to do memory allocations (if necessary).
397 *
398 * Caller must ensure we don't call this with other virtqueue operations
399 * at the same time (except where noted).
400 *
401 * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
402 */
403int virtqueue_add_inbuf(struct virtqueue *vq,
404 struct scatterlist sg[], unsigned int num,
405 void *data,
406 gfp_t gfp)
407{
408 return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp);
409}
410EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
411
412/**
296 * virtqueue_kick_prepare - first half of split virtqueue_kick call. 413 * virtqueue_kick_prepare - first half of split virtqueue_kick call.
297 * @vq: the struct virtqueue 414 * @vq: the struct virtqueue
298 * 415 *
@@ -312,7 +429,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
312 START_USE(vq); 429 START_USE(vq);
313 /* We need to expose available array entries before checking avail 430 /* We need to expose available array entries before checking avail
314 * event. */ 431 * event. */
315 virtio_mb(vq); 432 virtio_mb(vq->weak_barriers);
316 433
317 old = vq->vring.avail->idx - vq->num_added; 434 old = vq->vring.avail->idx - vq->num_added;
318 new = vq->vring.avail->idx; 435 new = vq->vring.avail->idx;
@@ -436,7 +553,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
436 } 553 }
437 554
438 /* Only get used array entries after they have been exposed by host. */ 555 /* Only get used array entries after they have been exposed by host. */
439 virtio_rmb(vq); 556 virtio_rmb(vq->weak_barriers);
440 557
441 last_used = (vq->last_used_idx & (vq->vring.num - 1)); 558 last_used = (vq->last_used_idx & (vq->vring.num - 1));
442 i = vq->vring.used->ring[last_used].id; 559 i = vq->vring.used->ring[last_used].id;
@@ -460,7 +577,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
460 * the read in the next get_buf call. */ 577 * the read in the next get_buf call. */
461 if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { 578 if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
462 vring_used_event(&vq->vring) = vq->last_used_idx; 579 vring_used_event(&vq->vring) = vq->last_used_idx;
463 virtio_mb(vq); 580 virtio_mb(vq->weak_barriers);
464 } 581 }
465 582
466#ifdef DEBUG 583#ifdef DEBUG
@@ -513,7 +630,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq)
513 * entry. Always do both to keep code simple. */ 630 * entry. Always do both to keep code simple. */
514 vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; 631 vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
515 vring_used_event(&vq->vring) = vq->last_used_idx; 632 vring_used_event(&vq->vring) = vq->last_used_idx;
516 virtio_mb(vq); 633 virtio_mb(vq->weak_barriers);
517 if (unlikely(more_used(vq))) { 634 if (unlikely(more_used(vq))) {
518 END_USE(vq); 635 END_USE(vq);
519 return false; 636 return false;
@@ -553,7 +670,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
553 /* TODO: tune this threshold */ 670 /* TODO: tune this threshold */
554 bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4; 671 bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4;
555 vring_used_event(&vq->vring) = vq->last_used_idx + bufs; 672 vring_used_event(&vq->vring) = vq->last_used_idx + bufs;
556 virtio_mb(vq); 673 virtio_mb(vq->weak_barriers);
557 if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) { 674 if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) {
558 END_USE(vq); 675 END_USE(vq);
559 return false; 676 return false;
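
The hunk above introduces virtqueue_add_sgs(), which takes an array of independently terminated scatterlists, readable ones first (out_sgs of them) followed by writable ones (in_sgs). Below is a minimal driver-side sketch of a caller, using a made-up request layout (a readable header plus a writable status byte); the function and variable names are illustrative only, not taken from any in-tree driver.

#include <linux/virtio.h>
#include <linux/scatterlist.h>

/* Sketch only: 'hdr' is read by the device, 'status' is written by it. */
static int queue_request(struct virtqueue *vq, void *hdr, size_t hdr_len,
			 u8 *status, void *token)
{
	struct scatterlist hdr_sg, status_sg;
	struct scatterlist *sgs[2];

	sg_init_one(&hdr_sg, hdr, hdr_len);
	sg_init_one(&status_sg, status, sizeof(*status));

	sgs[0] = &hdr_sg;	/* readable by the device */
	sgs[1] = &status_sg;	/* writable by the device */

	/* One readable scatterlist, one writable scatterlist. */
	return virtqueue_add_sgs(vq, sgs, 1, 1, token, GFP_ATOMIC);
}

On -ENOSPC a caller typically backs off until virtqueue_get_buf() has freed descriptors; the 9p transport change later in this series simply retries the request.
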
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 2d8bdaef9611..bfc47e0de81c 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -172,6 +172,22 @@ static inline void sg_mark_end(struct scatterlist *sg)
172} 172}
173 173
174/** 174/**
175 * sg_unmark_end - Undo setting the end of the scatterlist
 176 * @sg: SG entry
177 *
178 * Description:
179 * Removes the termination marker from the given entry of the scatterlist.
180 *
181 **/
182static inline void sg_unmark_end(struct scatterlist *sg)
183{
184#ifdef CONFIG_DEBUG_SG
185 BUG_ON(sg->sg_magic != SG_MAGIC);
186#endif
187 sg->page_link &= ~0x02;
188}
189
190/**
175 * sg_phys - Return physical address of an sg entry 191 * sg_phys - Return physical address of an sg entry
176 * @sg: SG entry 192 * @sg: SG entry
177 * 193 *
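
The new sg_unmark_end() exists so that a long-lived scatterlist array can be repacked for each request: a termination bit left mid-array by a previous, shorter request would otherwise end the new chain early. The 9p transport hunk below uses exactly this pattern in pack_sg_list(); here is a compact sketch of it with hypothetical names.

#include <linux/scatterlist.h>

/* Sketch: repack 'count' buffers into a reusable sg array. */
static int repack_sgs(struct scatterlist *sg, void *bufs[], size_t lens[],
		      unsigned int count)
{
	unsigned int i;

	for (i = 0; i < count; i++) {
		sg_unmark_end(&sg[i]);		/* clear any stale end marker */
		sg_set_buf(&sg[i], bufs[i], lens[i]);
	}
	if (count)
		sg_mark_end(&sg[count - 1]);	/* terminate the new chain */
	return count;
}
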
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 2d7a5e045908..9ff8645b7e0b 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -8,6 +8,7 @@
8#include <linux/device.h> 8#include <linux/device.h>
9#include <linux/mod_devicetable.h> 9#include <linux/mod_devicetable.h>
10#include <linux/gfp.h> 10#include <linux/gfp.h>
11#include <linux/vringh.h>
11 12
12/** 13/**
13 * virtqueue - a queue to register buffers for sending or receiving. 14 * virtqueue - a queue to register buffers for sending or receiving.
@@ -40,6 +41,23 @@ int virtqueue_add_buf(struct virtqueue *vq,
40 void *data, 41 void *data,
41 gfp_t gfp); 42 gfp_t gfp);
42 43
44int virtqueue_add_outbuf(struct virtqueue *vq,
45 struct scatterlist sg[], unsigned int num,
46 void *data,
47 gfp_t gfp);
48
49int virtqueue_add_inbuf(struct virtqueue *vq,
50 struct scatterlist sg[], unsigned int num,
51 void *data,
52 gfp_t gfp);
53
54int virtqueue_add_sgs(struct virtqueue *vq,
55 struct scatterlist *sgs[],
56 unsigned int out_sgs,
57 unsigned int in_sgs,
58 void *data,
59 gfp_t gfp);
60
43void virtqueue_kick(struct virtqueue *vq); 61void virtqueue_kick(struct virtqueue *vq);
44 62
45bool virtqueue_kick_prepare(struct virtqueue *vq); 63bool virtqueue_kick_prepare(struct virtqueue *vq);
@@ -64,6 +82,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *vq);
64 * @dev: underlying device. 82 * @dev: underlying device.
65 * @id: the device type identification (used to match it with a driver). 83 * @id: the device type identification (used to match it with a driver).
66 * @config: the configuration ops for this device. 84 * @config: the configuration ops for this device.
85 * @vringh_config: configuration ops for host vrings.
67 * @vqs: the list of virtqueues for this device. 86 * @vqs: the list of virtqueues for this device.
68 * @features: the features supported by both driver and device. 87 * @features: the features supported by both driver and device.
69 * @priv: private pointer for the driver's use. 88 * @priv: private pointer for the driver's use.
@@ -73,6 +92,7 @@ struct virtio_device {
73 struct device dev; 92 struct device dev;
74 struct virtio_device_id id; 93 struct virtio_device_id id;
75 const struct virtio_config_ops *config; 94 const struct virtio_config_ops *config;
95 const struct vringh_config_ops *vringh_config;
76 struct list_head vqs; 96 struct list_head vqs;
77 /* Note that this is a Linux set_bit-style bitmap. */ 97 /* Note that this is a Linux set_bit-style bitmap. */
78 unsigned long features[1]; 98 unsigned long features[1];
diff --git a/include/linux/virtio_caif.h b/include/linux/virtio_caif.h
new file mode 100644
index 000000000000..5d2d3124ca3d
--- /dev/null
+++ b/include/linux/virtio_caif.h
@@ -0,0 +1,24 @@
1/*
2 * Copyright (C) ST-Ericsson AB 2012
3 * Author: Sjur Brændeland <sjur.brandeland@stericsson.com>
4 *
5 * This header is BSD licensed so
6 * anyone can use the definitions to implement compatible remote processors
7 */
8
9#ifndef VIRTIO_CAIF_H
10#define VIRTIO_CAIF_H
11
12#include <linux/types.h>
13struct virtio_caif_transf_config {
14 u16 headroom;
15 u16 tailroom;
16 u32 mtu;
17 u8 reserved[4];
18};
19
20struct virtio_caif_config {
21 struct virtio_caif_transf_config uplink, downlink;
22 u8 reserved[8];
23};
24#endif
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index 63c6ea199519..ca3ad41c2c82 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -4,6 +4,63 @@
4#include <linux/irqreturn.h> 4#include <linux/irqreturn.h>
5#include <uapi/linux/virtio_ring.h> 5#include <uapi/linux/virtio_ring.h>
6 6
7/*
8 * Barriers in virtio are tricky. Non-SMP virtio guests can't assume
9 * they're not on an SMP host system, so they need to assume real
10 * barriers. Non-SMP virtio hosts could skip the barriers, but does
11 * anyone care?
12 *
13 * For virtio_pci on SMP, we don't need to order with respect to MMIO
14 * accesses through relaxed memory I/O windows, so smp_mb() et al are
15 * sufficient.
16 *
17 * For using virtio to talk to real devices (eg. other heterogeneous
18 * CPUs) we do need real barriers. In theory, we could be using both
19 * kinds of virtio, so it's a runtime decision, and the branch is
20 * actually quite cheap.
21 */
22
23#ifdef CONFIG_SMP
24static inline void virtio_mb(bool weak_barriers)
25{
26 if (weak_barriers)
27 smp_mb();
28 else
29 mb();
30}
31
32static inline void virtio_rmb(bool weak_barriers)
33{
34 if (weak_barriers)
35 smp_rmb();
36 else
37 rmb();
38}
39
40static inline void virtio_wmb(bool weak_barriers)
41{
42 if (weak_barriers)
43 smp_wmb();
44 else
45 wmb();
46}
47#else
48static inline void virtio_mb(bool weak_barriers)
49{
50 mb();
51}
52
53static inline void virtio_rmb(bool weak_barriers)
54{
55 rmb();
56}
57
58static inline void virtio_wmb(bool weak_barriers)
59{
60 wmb();
61}
62#endif
63
7struct virtio_device; 64struct virtio_device;
8struct virtqueue; 65struct virtqueue;
9 66
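
These helpers turn barrier strength into a per-ring runtime decision (the weak_barriers flag) instead of a compile-time one. The ordering they protect is the one visible in the virtio_ring.c hunk earlier: write the descriptors and the avail ring entry, then virtio_wmb(), then publish the new avail index. A condensed sketch of that producer-side ordering follows, assuming a vring_virtqueue with a weak_barriers field as in this series (the helper name is made up).

/* Sketch: make a descriptor chain visible before publishing its index. */
static void publish_avail(struct vring_virtqueue *vq, u16 head)
{
	u16 slot = vq->vring.avail->idx & (vq->vring.num - 1);

	vq->vring.avail->ring[slot] = head;

	/* Descriptors and the ring entry must be visible before idx moves. */
	virtio_wmb(vq->weak_barriers);
	vq->vring.avail->idx++;
}
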
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
new file mode 100644
index 000000000000..749cde28728b
--- /dev/null
+++ b/include/linux/vringh.h
@@ -0,0 +1,225 @@
1/*
2 * Linux host-side vring helpers; for when the kernel needs to access
3 * someone else's vring.
4 *
5 * Copyright IBM Corporation, 2013.
6 * Parts taken from drivers/vhost/vhost.c Copyright 2009 Red Hat, Inc.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 *
22 * Written by: Rusty Russell <rusty@rustcorp.com.au>
23 */
24#ifndef _LINUX_VRINGH_H
25#define _LINUX_VRINGH_H
26#include <uapi/linux/virtio_ring.h>
27#include <linux/uio.h>
28#include <linux/slab.h>
29#include <asm/barrier.h>
30
31/* virtio_ring with information needed for host access. */
32struct vringh {
33 /* Guest publishes used event idx (note: we always do). */
34 bool event_indices;
35
36 /* Can we get away with weak barriers? */
37 bool weak_barriers;
38
39 /* Last available index we saw (ie. where we're up to). */
40 u16 last_avail_idx;
41
42 /* Last index we used. */
43 u16 last_used_idx;
44
45 /* How many descriptors we've completed since last need_notify(). */
46 u32 completed;
47
48 /* The vring (note: it may contain user pointers!) */
49 struct vring vring;
50
51 /* The function to call to notify the guest about added buffers */
52 void (*notify)(struct vringh *);
53};
54
55/**
56 * struct vringh_config_ops - ops for creating a host vring from a virtio driver
57 * @find_vrhs: find the host vrings and instantiate them
58 * vdev: the virtio_device
59 * nhvrs: the number of host vrings to find
60 * hvrs: on success, includes new host vrings
61 * callbacks: array of driver callbacks, for each host vring
62 * include a NULL entry for vqs that do not need a callback
63 * Returns 0 on success or error status
64 * @del_vrhs: free the host vrings found by find_vrhs().
65 */
66struct virtio_device;
67typedef void vrh_callback_t(struct virtio_device *, struct vringh *);
68struct vringh_config_ops {
69 int (*find_vrhs)(struct virtio_device *vdev, unsigned nhvrs,
70 struct vringh *vrhs[], vrh_callback_t *callbacks[]);
71 void (*del_vrhs)(struct virtio_device *vdev);
72};
73
74/* The memory the vring can access, and what offset to apply. */
75struct vringh_range {
76 u64 start, end_incl;
77 u64 offset;
78};
79
80/**
81 * struct vringh_iov - iovec mangler.
82 *
83 * Mangles iovec in place, and restores it.
84 * Remaining data is iov + i, of used - i elements.
85 */
86struct vringh_iov {
87 struct iovec *iov;
88 size_t consumed; /* Within iov[i] */
89 unsigned i, used, max_num;
90};
91
92/**
 93 * struct vringh_kiov - kvec mangler.
94 *
95 * Mangles kvec in place, and restores it.
96 * Remaining data is iov + i, of used - i elements.
97 */
98struct vringh_kiov {
99 struct kvec *iov;
100 size_t consumed; /* Within iov[i] */
101 unsigned i, used, max_num;
102};
103
104/* Flag on max_num to indicate we're kmalloced. */
105#define VRINGH_IOV_ALLOCATED 0x8000000
106
107/* Helpers for userspace vrings. */
108int vringh_init_user(struct vringh *vrh, u32 features,
109 unsigned int num, bool weak_barriers,
110 struct vring_desc __user *desc,
111 struct vring_avail __user *avail,
112 struct vring_used __user *used);
113
114static inline void vringh_iov_init(struct vringh_iov *iov,
115 struct iovec *iovec, unsigned num)
116{
117 iov->used = iov->i = 0;
118 iov->consumed = 0;
119 iov->max_num = num;
120 iov->iov = iovec;
121}
122
123static inline void vringh_iov_reset(struct vringh_iov *iov)
124{
125 iov->iov[iov->i].iov_len += iov->consumed;
126 iov->iov[iov->i].iov_base -= iov->consumed;
127 iov->consumed = 0;
128 iov->i = 0;
129}
130
131static inline void vringh_iov_cleanup(struct vringh_iov *iov)
132{
133 if (iov->max_num & VRINGH_IOV_ALLOCATED)
134 kfree(iov->iov);
135 iov->max_num = iov->used = iov->i = iov->consumed = 0;
136 iov->iov = NULL;
137}
138
139/* Convert a descriptor into iovecs. */
140int vringh_getdesc_user(struct vringh *vrh,
141 struct vringh_iov *riov,
142 struct vringh_iov *wiov,
143 bool (*getrange)(struct vringh *vrh,
144 u64 addr, struct vringh_range *r),
145 u16 *head);
146
 147/* Copy bytes from readable vsg, consuming it (and incrementing riov->i). */
148ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len);
149
150/* Copy bytes into writable vsg, consuming it (and incrementing wiov->i). */
151ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
152 const void *src, size_t len);
153
154/* Mark a descriptor as used. */
155int vringh_complete_user(struct vringh *vrh, u16 head, u32 len);
156int vringh_complete_multi_user(struct vringh *vrh,
157 const struct vring_used_elem used[],
158 unsigned num_used);
159
160/* Pretend we've never seen descriptor (for easy error handling). */
161void vringh_abandon_user(struct vringh *vrh, unsigned int num);
162
163/* Do we need to fire the eventfd to notify the other side? */
164int vringh_need_notify_user(struct vringh *vrh);
165
166bool vringh_notify_enable_user(struct vringh *vrh);
167void vringh_notify_disable_user(struct vringh *vrh);
168
169/* Helpers for kernelspace vrings. */
170int vringh_init_kern(struct vringh *vrh, u32 features,
171 unsigned int num, bool weak_barriers,
172 struct vring_desc *desc,
173 struct vring_avail *avail,
174 struct vring_used *used);
175
176static inline void vringh_kiov_init(struct vringh_kiov *kiov,
177 struct kvec *kvec, unsigned num)
178{
179 kiov->used = kiov->i = 0;
180 kiov->consumed = 0;
181 kiov->max_num = num;
182 kiov->iov = kvec;
183}
184
185static inline void vringh_kiov_reset(struct vringh_kiov *kiov)
186{
187 kiov->iov[kiov->i].iov_len += kiov->consumed;
188 kiov->iov[kiov->i].iov_base -= kiov->consumed;
189 kiov->consumed = 0;
190 kiov->i = 0;
191}
192
193static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov)
194{
195 if (kiov->max_num & VRINGH_IOV_ALLOCATED)
196 kfree(kiov->iov);
197 kiov->max_num = kiov->used = kiov->i = kiov->consumed = 0;
198 kiov->iov = NULL;
199}
200
201int vringh_getdesc_kern(struct vringh *vrh,
202 struct vringh_kiov *riov,
203 struct vringh_kiov *wiov,
204 u16 *head,
205 gfp_t gfp);
206
207ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len);
208ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
209 const void *src, size_t len);
210void vringh_abandon_kern(struct vringh *vrh, unsigned int num);
211int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len);
212
213bool vringh_notify_enable_kern(struct vringh *vrh);
214void vringh_notify_disable_kern(struct vringh *vrh);
215
216int vringh_need_notify_kern(struct vringh *vrh);
217
218/* Notify the guest about buffers added to the used ring */
219static inline void vringh_notify(struct vringh *vrh)
220{
221 if (vrh->notify)
222 vrh->notify(vrh);
223}
224
225#endif /* _LINUX_VRINGH_H */
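
Taken together, the user-pointer helpers above compose into a simple host-side service loop: fetch a descriptor chain, pull the readable bytes, push a reply into the writable part, complete the head, and notify the guest if needed. A rough sketch follows, under the assumption that vringh_getdesc_user() returns 1 when it found a descriptor and 0 when the ring is empty; error handling is trimmed, and the function name, buffer sizes and request handling are made up.

#include <linux/kernel.h>
#include <linux/uio.h>
#include <linux/vringh.h>

/* Sketch: drain one guest-owned vring using the *_user helpers. */
static int service_vring(struct vringh *vrh,
			 bool (*getrange)(struct vringh *, u64,
					  struct vringh_range *))
{
	struct iovec riovec[8], wiovec[8];
	struct vringh_iov riov, wiov;
	char req[128], reply[128];
	u16 head;
	int err;

	vringh_iov_init(&riov, riovec, ARRAY_SIZE(riovec));
	vringh_iov_init(&wiov, wiovec, ARRAY_SIZE(wiovec));

	while ((err = vringh_getdesc_user(vrh, &riov, &wiov,
					  getrange, &head)) == 1) {
		ssize_t len = vringh_iov_pull_user(&riov, req, sizeof(req));
		if (len < 0)
			return len;

		/* ... act on 'req', build 'reply' ... */

		len = vringh_iov_push_user(&wiov, reply, sizeof(reply));
		if (len < 0)
			return len;

		err = vringh_complete_user(vrh, head, len);
		if (err)
			return err;

		if (vringh_need_notify_user(vrh) > 0)
			vringh_notify(vrh);	/* or signal the eventfd directly */

		/* Free any iov arrays the helpers had to allocate, then re-arm. */
		vringh_iov_cleanup(&riov);
		vringh_iov_cleanup(&wiov);
		vringh_iov_init(&riov, riovec, ARRAY_SIZE(riovec));
		vringh_iov_init(&wiov, wiovec, ARRAY_SIZE(wiovec));
	}
	return err;
}
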
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 652dc8bea921..5e26f61b5df5 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -52,8 +52,8 @@ struct virtio_balloon_config
52#define VIRTIO_BALLOON_S_NR 6 52#define VIRTIO_BALLOON_S_NR 6
53 53
54struct virtio_balloon_stat { 54struct virtio_balloon_stat {
55 u16 tag; 55 __u16 tag;
56 u64 val; 56 __u64 val;
57} __attribute__((packed)); 57} __attribute__((packed));
58 58
59#endif /* _LINUX_VIRTIO_BALLOON_H */ 59#endif /* _LINUX_VIRTIO_BALLOON_H */
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index a7630d04029f..284fc3a05f7b 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -38,5 +38,6 @@
38#define VIRTIO_ID_SCSI 8 /* virtio scsi */ 38#define VIRTIO_ID_SCSI 8 /* virtio scsi */
39#define VIRTIO_ID_9P 9 /* 9p virtio console */ 39#define VIRTIO_ID_9P 9 /* 9p virtio console */
40#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ 40#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */
41#define VIRTIO_ID_CAIF 12 /* Virtio caif */
41 42
42#endif /* _LINUX_VIRTIO_IDS_H */ 43#endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index de2e950a0a7a..e1c26b101830 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -194,11 +194,14 @@ static int pack_sg_list(struct scatterlist *sg, int start,
194 if (s > count) 194 if (s > count)
195 s = count; 195 s = count;
196 BUG_ON(index > limit); 196 BUG_ON(index > limit);
197 /* Make sure we don't terminate early. */
198 sg_unmark_end(&sg[index]);
197 sg_set_buf(&sg[index++], data, s); 199 sg_set_buf(&sg[index++], data, s);
198 count -= s; 200 count -= s;
199 data += s; 201 data += s;
200 } 202 }
201 203 if (index-start)
204 sg_mark_end(&sg[index - 1]);
202 return index-start; 205 return index-start;
203} 206}
204 207
@@ -236,12 +239,17 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit,
236 s = rest_of_page(data); 239 s = rest_of_page(data);
237 if (s > count) 240 if (s > count)
238 s = count; 241 s = count;
242 /* Make sure we don't terminate early. */
243 sg_unmark_end(&sg[index]);
239 sg_set_page(&sg[index++], pdata[i++], s, data_off); 244 sg_set_page(&sg[index++], pdata[i++], s, data_off);
240 data_off = 0; 245 data_off = 0;
241 data += s; 246 data += s;
242 count -= s; 247 count -= s;
243 nr_pages--; 248 nr_pages--;
244 } 249 }
250
251 if (index-start)
252 sg_mark_end(&sg[index - 1]);
245 return index - start; 253 return index - start;
246} 254}
247 255
@@ -256,9 +264,10 @@ static int
256p9_virtio_request(struct p9_client *client, struct p9_req_t *req) 264p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
257{ 265{
258 int err; 266 int err;
259 int in, out; 267 int in, out, out_sgs, in_sgs;
260 unsigned long flags; 268 unsigned long flags;
261 struct virtio_chan *chan = client->trans; 269 struct virtio_chan *chan = client->trans;
270 struct scatterlist *sgs[2];
262 271
263 p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); 272 p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n");
264 273
@@ -266,14 +275,19 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
266req_retry: 275req_retry:
267 spin_lock_irqsave(&chan->lock, flags); 276 spin_lock_irqsave(&chan->lock, flags);
268 277
278 out_sgs = in_sgs = 0;
269 /* Handle out VirtIO ring buffers */ 279 /* Handle out VirtIO ring buffers */
270 out = pack_sg_list(chan->sg, 0, 280 out = pack_sg_list(chan->sg, 0,
271 VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); 281 VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
282 if (out)
283 sgs[out_sgs++] = chan->sg;
272 284
273 in = pack_sg_list(chan->sg, out, 285 in = pack_sg_list(chan->sg, out,
274 VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); 286 VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity);
287 if (in)
288 sgs[out_sgs + in_sgs++] = chan->sg + out;
275 289
276 err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, 290 err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
277 GFP_ATOMIC); 291 GFP_ATOMIC);
278 if (err < 0) { 292 if (err < 0) {
279 if (err == -ENOSPC) { 293 if (err == -ENOSPC) {
@@ -289,7 +303,7 @@ req_retry:
289 } else { 303 } else {
290 spin_unlock_irqrestore(&chan->lock, flags); 304 spin_unlock_irqrestore(&chan->lock, flags);
291 p9_debug(P9_DEBUG_TRANS, 305 p9_debug(P9_DEBUG_TRANS,
292 "virtio rpc add_buf returned failure\n"); 306 "virtio rpc add_sgs returned failure\n");
293 return -EIO; 307 return -EIO;
294 } 308 }
295 } 309 }
@@ -351,11 +365,12 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
351 char *uidata, char *uodata, int inlen, 365 char *uidata, char *uodata, int inlen,
352 int outlen, int in_hdr_len, int kern_buf) 366 int outlen, int in_hdr_len, int kern_buf)
353{ 367{
354 int in, out, err; 368 int in, out, err, out_sgs, in_sgs;
355 unsigned long flags; 369 unsigned long flags;
356 int in_nr_pages = 0, out_nr_pages = 0; 370 int in_nr_pages = 0, out_nr_pages = 0;
357 struct page **in_pages = NULL, **out_pages = NULL; 371 struct page **in_pages = NULL, **out_pages = NULL;
358 struct virtio_chan *chan = client->trans; 372 struct virtio_chan *chan = client->trans;
373 struct scatterlist *sgs[4];
359 374
360 p9_debug(P9_DEBUG_TRANS, "virtio request\n"); 375 p9_debug(P9_DEBUG_TRANS, "virtio request\n");
361 376
@@ -396,13 +411,22 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
396 req->status = REQ_STATUS_SENT; 411 req->status = REQ_STATUS_SENT;
397req_retry_pinned: 412req_retry_pinned:
398 spin_lock_irqsave(&chan->lock, flags); 413 spin_lock_irqsave(&chan->lock, flags);
414
415 out_sgs = in_sgs = 0;
416
399 /* out data */ 417 /* out data */
400 out = pack_sg_list(chan->sg, 0, 418 out = pack_sg_list(chan->sg, 0,
401 VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); 419 VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
402 420
403 if (out_pages) 421 if (out)
422 sgs[out_sgs++] = chan->sg;
423
424 if (out_pages) {
425 sgs[out_sgs++] = chan->sg + out;
404 out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, 426 out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM,
405 out_pages, out_nr_pages, uodata, outlen); 427 out_pages, out_nr_pages, uodata, outlen);
428 }
429
406 /* 430 /*
407 * Take care of in data 431 * Take care of in data
408 * For example TREAD have 11. 432 * For example TREAD have 11.
@@ -412,11 +436,17 @@ req_retry_pinned:
412 */ 436 */
413 in = pack_sg_list(chan->sg, out, 437 in = pack_sg_list(chan->sg, out,
414 VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); 438 VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len);
415 if (in_pages) 439 if (in)
440 sgs[out_sgs + in_sgs++] = chan->sg + out;
441
442 if (in_pages) {
443 sgs[out_sgs + in_sgs++] = chan->sg + out + in;
416 in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, 444 in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM,
417 in_pages, in_nr_pages, uidata, inlen); 445 in_pages, in_nr_pages, uidata, inlen);
446 }
418 447
419 err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, 448 BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs));
449 err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
420 GFP_ATOMIC); 450 GFP_ATOMIC);
421 if (err < 0) { 451 if (err < 0) {
422 if (err == -ENOSPC) { 452 if (err == -ENOSPC) {
@@ -432,7 +462,7 @@ req_retry_pinned:
432 } else { 462 } else {
433 spin_unlock_irqrestore(&chan->lock, flags); 463 spin_unlock_irqrestore(&chan->lock, flags);
434 p9_debug(P9_DEBUG_TRANS, 464 p9_debug(P9_DEBUG_TRANS,
435 "virtio rpc add_buf returned failure\n"); 465 "virtio rpc add_sgs returned failure\n");
436 err = -EIO; 466 err = -EIO;
437 goto err_out; 467 goto err_out;
438 } 468 }
diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt
index 7203ace65e83..06e1f4649511 100644
--- a/tools/lguest/lguest.txt
+++ b/tools/lguest/lguest.txt
@@ -70,7 +70,7 @@ Running Lguest:
70 70
71- Run an lguest as root: 71- Run an lguest as root:
72 72
73 Documentation/virtual/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ 73 tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
74 --block=rootfile root=/dev/vda 74 --block=rootfile root=/dev/vda
75 75
76 Explanation: 76 Explanation:
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d1d442ed106a..3187c62d9814 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,12 +1,14 @@
1all: test mod 1all: test mod
2test: virtio_test 2test: virtio_test vringh_test
3virtio_test: virtio_ring.o virtio_test.o 3virtio_test: virtio_ring.o virtio_test.o
4CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -MMD 4vringh_test: vringh_test.o vringh.o virtio_ring.o
5vpath %.c ../../drivers/virtio 5
6CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE
7vpath %.c ../../drivers/virtio ../../drivers/vhost
6mod: 8mod:
7 ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test 9 ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test
8.PHONY: all test mod clean 10.PHONY: all test mod clean
9clean: 11clean:
10 ${RM} *.o vhost_test/*.o vhost_test/.*.cmd \ 12 ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
11 vhost_test/Module.symvers vhost_test/modules.order *.d 13 vhost_test/Module.symvers vhost_test/modules.order *.d
12-include *.d 14-include *.d
diff --git a/tools/virtio/asm/barrier.h b/tools/virtio/asm/barrier.h
new file mode 100644
index 000000000000..aff61e13306c
--- /dev/null
+++ b/tools/virtio/asm/barrier.h
@@ -0,0 +1,14 @@
1#if defined(__i386__) || defined(__x86_64__)
2#define barrier() asm volatile("" ::: "memory")
3#define mb() __sync_synchronize()
4
5#define smp_mb() mb()
6# define smp_rmb() barrier()
7# define smp_wmb() barrier()
8/* Weak barriers should be used. If not - it's a bug */
9# define rmb() abort()
10# define wmb() abort()
11#else
12#error Please fill in barrier macros
13#endif
14
diff --git a/tools/virtio/linux/bug.h b/tools/virtio/linux/bug.h
new file mode 100644
index 000000000000..fb94f0787c47
--- /dev/null
+++ b/tools/virtio/linux/bug.h
@@ -0,0 +1,10 @@
1#ifndef BUG_H
2#define BUG_H
3
4#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond))
5
6#define BUILD_BUG_ON(x)
7
8#define BUG() abort()
9
10#endif /* BUG_H */
diff --git a/tools/virtio/linux/err.h b/tools/virtio/linux/err.h
new file mode 100644
index 000000000000..e32eff8b2a14
--- /dev/null
+++ b/tools/virtio/linux/err.h
@@ -0,0 +1,26 @@
1#ifndef ERR_H
2#define ERR_H
3#define MAX_ERRNO 4095
4
5#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
6
7static inline void * __must_check ERR_PTR(long error)
8{
9 return (void *) error;
10}
11
12static inline long __must_check PTR_ERR(const void *ptr)
13{
14 return (long) ptr;
15}
16
17static inline long __must_check IS_ERR(const void *ptr)
18{
19 return IS_ERR_VALUE((unsigned long)ptr);
20}
21
22static inline long __must_check IS_ERR_OR_NULL(const void *ptr)
23{
24 return !ptr || IS_ERR_VALUE((unsigned long)ptr);
25}
26#endif /* ERR_H */
diff --git a/tools/virtio/linux/export.h b/tools/virtio/linux/export.h
new file mode 100644
index 000000000000..7311d326894a
--- /dev/null
+++ b/tools/virtio/linux/export.h
@@ -0,0 +1,5 @@
1#define EXPORT_SYMBOL(sym)
2#define EXPORT_SYMBOL_GPL(sym)
3#define EXPORT_SYMBOL_GPL_FUTURE(sym)
4#define EXPORT_UNUSED_SYMBOL(sym)
5#define EXPORT_UNUSED_SYMBOL_GPL(sym)
diff --git a/tools/virtio/linux/irqreturn.h b/tools/virtio/linux/irqreturn.h
new file mode 100644
index 000000000000..a3c4e7be7089
--- /dev/null
+++ b/tools/virtio/linux/irqreturn.h
@@ -0,0 +1 @@
#include "../../../include/linux/irqreturn.h"
diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h
new file mode 100644
index 000000000000..fba705963968
--- /dev/null
+++ b/tools/virtio/linux/kernel.h
@@ -0,0 +1,112 @@
1#ifndef KERNEL_H
2#define KERNEL_H
3#include <stdbool.h>
4#include <stdlib.h>
5#include <stddef.h>
6#include <stdio.h>
7#include <string.h>
8#include <assert.h>
9#include <stdarg.h>
10
11#include <linux/types.h>
12#include <linux/printk.h>
13#include <linux/bug.h>
14#include <errno.h>
15#include <unistd.h>
16#include <asm/barrier.h>
17
18#define CONFIG_SMP
19
20#define PAGE_SIZE getpagesize()
21#define PAGE_MASK (~(PAGE_SIZE-1))
22
23typedef unsigned long long dma_addr_t;
24typedef size_t __kernel_size_t;
25
26struct page {
27 unsigned long long dummy;
28};
29
30/* Physical == Virtual */
31#define virt_to_phys(p) ((unsigned long)p)
32#define phys_to_virt(a) ((void *)(unsigned long)(a))
33/* Page address: Virtual / 4K */
34#define page_to_phys(p) ((dma_addr_t)(unsigned long)(p))
35#define virt_to_page(p) ((struct page *)((unsigned long)p & PAGE_MASK))
36
37#define offset_in_page(p) (((unsigned long)p) % PAGE_SIZE)
38
39#define __printf(a,b) __attribute__((format(printf,a,b)))
40
41typedef enum {
42 GFP_KERNEL,
43 GFP_ATOMIC,
44 __GFP_HIGHMEM,
45 __GFP_HIGH
46} gfp_t;
47
48#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
49
50extern void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
51static inline void *kmalloc(size_t s, gfp_t gfp)
52{
53 if (__kmalloc_fake)
54 return __kmalloc_fake;
55 return malloc(s);
56}
57
58static inline void kfree(void *p)
59{
60 if (p >= __kfree_ignore_start && p < __kfree_ignore_end)
61 return;
62 free(p);
63}
64
65static inline void *krealloc(void *p, size_t s, gfp_t gfp)
66{
67 return realloc(p, s);
68}
69
70
71static inline unsigned long __get_free_page(gfp_t gfp)
72{
73 void *p;
74
75 posix_memalign(&p, PAGE_SIZE, PAGE_SIZE);
76 return (unsigned long)p;
77}
78
79static inline void free_page(unsigned long addr)
80{
81 free((void *)addr);
82}
83
84#define container_of(ptr, type, member) ({ \
85 const typeof( ((type *)0)->member ) *__mptr = (ptr); \
86 (type *)( (char *)__mptr - offsetof(type,member) );})
87
88#define uninitialized_var(x) x = x
89
90# ifndef likely
91# define likely(x) (__builtin_expect(!!(x), 1))
92# endif
93# ifndef unlikely
94# define unlikely(x) (__builtin_expect(!!(x), 0))
95# endif
96
97#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
98#ifdef DEBUG
99#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
100#else
101#define pr_debug(format, ...) do {} while (0)
102#endif
103#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
104#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
105
106#define min(x, y) ({ \
107 typeof(x) _min1 = (x); \
108 typeof(y) _min2 = (y); \
109 (void) (&_min1 == &_min2); \
110 _min1 < _min2 ? _min1 : _min2; })
111
112#endif /* KERNEL_H */
diff --git a/tools/virtio/linux/module.h b/tools/virtio/linux/module.h
index e69de29bb2d1..3039a7e972b6 100644
--- a/tools/virtio/linux/module.h
+++ b/tools/virtio/linux/module.h
@@ -0,0 +1 @@
#include <linux/export.h>
diff --git a/tools/virtio/linux/printk.h b/tools/virtio/linux/printk.h
new file mode 100644
index 000000000000..9f2423bd89c2
--- /dev/null
+++ b/tools/virtio/linux/printk.h
@@ -0,0 +1,4 @@
1#include "../../../include/linux/kern_levels.h"
2
3#define printk printf
4#define vprintk vprintf
diff --git a/tools/virtio/linux/ratelimit.h b/tools/virtio/linux/ratelimit.h
new file mode 100644
index 000000000000..dcce1725f90d
--- /dev/null
+++ b/tools/virtio/linux/ratelimit.h
@@ -0,0 +1,4 @@
1#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) int name = 0
2
3#define __ratelimit(x) (*(x))
4
diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h
new file mode 100644
index 000000000000..68c9e2adc996
--- /dev/null
+++ b/tools/virtio/linux/scatterlist.h
@@ -0,0 +1,189 @@
1#ifndef SCATTERLIST_H
2#define SCATTERLIST_H
3#include <linux/kernel.h>
4
5struct scatterlist {
6 unsigned long page_link;
7 unsigned int offset;
8 unsigned int length;
9 dma_addr_t dma_address;
10};
11
12/* Scatterlist helpers, stolen from linux/scatterlist.h */
13#define sg_is_chain(sg) ((sg)->page_link & 0x01)
14#define sg_is_last(sg) ((sg)->page_link & 0x02)
15#define sg_chain_ptr(sg) \
16 ((struct scatterlist *) ((sg)->page_link & ~0x03))
17
18/**
19 * sg_assign_page - Assign a given page to an SG entry
20 * @sg: SG entry
21 * @page: The page
22 *
23 * Description:
24 * Assign page to sg entry. Also see sg_set_page(), the most commonly used
25 * variant.
26 *
27 **/
28static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
29{
30 unsigned long page_link = sg->page_link & 0x3;
31
32 /*
33 * In order for the low bit stealing approach to work, pages
34 * must be aligned at a 32-bit boundary as a minimum.
35 */
36 BUG_ON((unsigned long) page & 0x03);
37#ifdef CONFIG_DEBUG_SG
38 BUG_ON(sg->sg_magic != SG_MAGIC);
39 BUG_ON(sg_is_chain(sg));
40#endif
41 sg->page_link = page_link | (unsigned long) page;
42}
43
44/**
45 * sg_set_page - Set sg entry to point at given page
46 * @sg: SG entry
47 * @page: The page
48 * @len: Length of data
49 * @offset: Offset into page
50 *
51 * Description:
52 * Use this function to set an sg entry pointing at a page, never assign
53 * the page directly. We encode sg table information in the lower bits
54 * of the page pointer. See sg_page() for looking up the page belonging
55 * to an sg entry.
56 *
57 **/
58static inline void sg_set_page(struct scatterlist *sg, struct page *page,
59 unsigned int len, unsigned int offset)
60{
61 sg_assign_page(sg, page);
62 sg->offset = offset;
63 sg->length = len;
64}
65
66static inline struct page *sg_page(struct scatterlist *sg)
67{
68#ifdef CONFIG_DEBUG_SG
69 BUG_ON(sg->sg_magic != SG_MAGIC);
70 BUG_ON(sg_is_chain(sg));
71#endif
72 return (struct page *)((sg)->page_link & ~0x3);
73}
74
75/*
76 * Loop over each sg element, following the pointer to a new list if necessary
77 */
78#define for_each_sg(sglist, sg, nr, __i) \
79 for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))
80
81/**
82 * sg_chain - Chain two sglists together
83 * @prv: First scatterlist
84 * @prv_nents: Number of entries in prv
85 * @sgl: Second scatterlist
86 *
87 * Description:
88 * Links @prv@ and @sgl@ together, to form a longer scatterlist.
89 *
90 **/
91static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
92 struct scatterlist *sgl)
93{
94 /*
95 * offset and length are unused for chain entry. Clear them.
96 */
97 prv[prv_nents - 1].offset = 0;
98 prv[prv_nents - 1].length = 0;
99
100 /*
101 * Set lowest bit to indicate a link pointer, and make sure to clear
102 * the termination bit if it happens to be set.
103 */
104 prv[prv_nents - 1].page_link = ((unsigned long) sgl | 0x01) & ~0x02;
105}
106
107/**
108 * sg_mark_end - Mark the end of the scatterlist
 109 * @sg: SG entry
110 *
111 * Description:
112 * Marks the passed in sg entry as the termination point for the sg
113 * table. A call to sg_next() on this entry will return NULL.
114 *
115 **/
116static inline void sg_mark_end(struct scatterlist *sg)
117{
118#ifdef CONFIG_DEBUG_SG
119 BUG_ON(sg->sg_magic != SG_MAGIC);
120#endif
121 /*
122 * Set termination bit, clear potential chain bit
123 */
124 sg->page_link |= 0x02;
125 sg->page_link &= ~0x01;
126}
127
128/**
129 * sg_unmark_end - Undo setting the end of the scatterlist
 130 * @sg: SG entry
131 *
132 * Description:
133 * Removes the termination marker from the given entry of the scatterlist.
134 *
135 **/
136static inline void sg_unmark_end(struct scatterlist *sg)
137{
138#ifdef CONFIG_DEBUG_SG
139 BUG_ON(sg->sg_magic != SG_MAGIC);
140#endif
141 sg->page_link &= ~0x02;
142}
143
144static inline struct scatterlist *sg_next(struct scatterlist *sg)
145{
146#ifdef CONFIG_DEBUG_SG
147 BUG_ON(sg->sg_magic != SG_MAGIC);
148#endif
149 if (sg_is_last(sg))
150 return NULL;
151
152 sg++;
153 if (unlikely(sg_is_chain(sg)))
154 sg = sg_chain_ptr(sg);
155
156 return sg;
157}
158
159static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents)
160{
161 memset(sgl, 0, sizeof(*sgl) * nents);
162#ifdef CONFIG_DEBUG_SG
163 {
164 unsigned int i;
165 for (i = 0; i < nents; i++)
166 sgl[i].sg_magic = SG_MAGIC;
167 }
168#endif
169 sg_mark_end(&sgl[nents - 1]);
170}
171
172static inline dma_addr_t sg_phys(struct scatterlist *sg)
173{
174 return page_to_phys(sg_page(sg)) + sg->offset;
175}
176
177static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
178 unsigned int buflen)
179{
180 sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
181}
182
183static inline void sg_init_one(struct scatterlist *sg,
184 const void *buf, unsigned int buflen)
185{
186 sg_init_table(sg, 1);
187 sg_set_buf(sg, buf, buflen);
188}
189#endif /* SCATTERLIST_H */
diff --git a/tools/virtio/linux/types.h b/tools/virtio/linux/types.h
new file mode 100644
index 000000000000..f8ebb9a2b3d6
--- /dev/null
+++ b/tools/virtio/linux/types.h
@@ -0,0 +1,28 @@
1#ifndef TYPES_H
2#define TYPES_H
3#include <stdint.h>
4
5#define __force
6#define __user
7#define __must_check
8#define __cold
9
10typedef uint64_t u64;
11typedef int64_t s64;
12typedef uint32_t u32;
13typedef int32_t s32;
14typedef uint16_t u16;
15typedef int16_t s16;
16typedef uint8_t u8;
17typedef int8_t s8;
18
19typedef uint64_t __u64;
20typedef int64_t __s64;
21typedef uint32_t __u32;
22typedef int32_t __s32;
23typedef uint16_t __u16;
24typedef int16_t __s16;
25typedef uint8_t __u8;
26typedef int8_t __s8;
27
28#endif /* TYPES_H */
diff --git a/tools/virtio/linux/uaccess.h b/tools/virtio/linux/uaccess.h
new file mode 100644
index 000000000000..0a578fe18653
--- /dev/null
+++ b/tools/virtio/linux/uaccess.h
@@ -0,0 +1,50 @@
1#ifndef UACCESS_H
2#define UACCESS_H
3extern void *__user_addr_min, *__user_addr_max;
4
5#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
6
7static inline void __chk_user_ptr(const volatile void *p, size_t size)
8{
9 assert(p >= __user_addr_min && p + size <= __user_addr_max);
10}
11
12#define put_user(x, ptr) \
13({ \
14 typeof(ptr) __pu_ptr = (ptr); \
15 __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \
16 ACCESS_ONCE(*(__pu_ptr)) = x; \
17 0; \
18})
19
20#define get_user(x, ptr) \
21({ \
22 typeof(ptr) __pu_ptr = (ptr); \
23 __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \
24 x = ACCESS_ONCE(*(__pu_ptr)); \
25 0; \
26})
27
28static void volatile_memcpy(volatile char *to, const volatile char *from,
29 unsigned long n)
30{
31 while (n--)
32 *(to++) = *(from++);
33}
34
35static inline int copy_from_user(void *to, const void __user volatile *from,
36 unsigned long n)
37{
38 __chk_user_ptr(from, n);
39 volatile_memcpy(to, from, n);
40 return 0;
41}
42
43static inline int copy_to_user(void __user volatile *to, const void *from,
44 unsigned long n)
45{
46 __chk_user_ptr(to, n);
47 volatile_memcpy(to, from, n);
48 return 0;
49}
50#endif /* UACCESS_H */
diff --git a/tools/virtio/linux/uio.h b/tools/virtio/linux/uio.h
new file mode 100644
index 000000000000..cd20f0ba3081
--- /dev/null
+++ b/tools/virtio/linux/uio.h
@@ -0,0 +1,3 @@
1#include <linux/kernel.h>
2
3#include "../../../include/linux/uio.h"
diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h
index 81847dd08bd0..cd801838156f 100644
--- a/tools/virtio/linux/virtio.h
+++ b/tools/virtio/linux/virtio.h
@@ -1,127 +1,7 @@
1#ifndef LINUX_VIRTIO_H 1#ifndef LINUX_VIRTIO_H
2#define LINUX_VIRTIO_H 2#define LINUX_VIRTIO_H
3 3#include <linux/scatterlist.h>
4#include <stdbool.h> 4#include <linux/kernel.h>
5#include <stdlib.h>
6#include <stddef.h>
7#include <stdio.h>
8#include <string.h>
9#include <assert.h>
10
11#include <linux/types.h>
12#include <errno.h>
13
14typedef unsigned long long dma_addr_t;
15
16struct scatterlist {
17 unsigned long page_link;
18 unsigned int offset;
19 unsigned int length;
20 dma_addr_t dma_address;
21};
22
23struct page {
24 unsigned long long dummy;
25};
26
27#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond))
28
29/* Physical == Virtual */
30#define virt_to_phys(p) ((unsigned long)p)
31#define phys_to_virt(a) ((void *)(unsigned long)(a))
32/* Page address: Virtual / 4K */
33#define virt_to_page(p) ((struct page*)((virt_to_phys(p) / 4096) * \
34 sizeof(struct page)))
35#define offset_in_page(p) (((unsigned long)p) % 4096)
36#define sg_phys(sg) ((sg->page_link & ~0x3) / sizeof(struct page) * 4096 + \
37 sg->offset)
38static inline void sg_mark_end(struct scatterlist *sg)
39{
40 /*
41 * Set termination bit, clear potential chain bit
42 */
43 sg->page_link |= 0x02;
44 sg->page_link &= ~0x01;
45}
46static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents)
47{
48 memset(sgl, 0, sizeof(*sgl) * nents);
49 sg_mark_end(&sgl[nents - 1]);
50}
51static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
52{
53 unsigned long page_link = sg->page_link & 0x3;
54
55 /*
56 * In order for the low bit stealing approach to work, pages
57 * must be aligned at a 32-bit boundary as a minimum.
58 */
59 BUG_ON((unsigned long) page & 0x03);
60 sg->page_link = page_link | (unsigned long) page;
61}
62
63static inline void sg_set_page(struct scatterlist *sg, struct page *page,
64 unsigned int len, unsigned int offset)
65{
66 sg_assign_page(sg, page);
67 sg->offset = offset;
68 sg->length = len;
69}
70
71static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
72 unsigned int buflen)
73{
74 sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
75}
76
77static inline void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen)
78{
79 sg_init_table(sg, 1);
80 sg_set_buf(sg, buf, buflen);
81}
82
83typedef __u16 u16;
84
85typedef enum {
86 GFP_KERNEL,
87 GFP_ATOMIC,
88} gfp_t;
89typedef enum {
90 IRQ_NONE,
91 IRQ_HANDLED
92} irqreturn_t;
93
94static inline void *kmalloc(size_t s, gfp_t gfp)
95{
96 return malloc(s);
97}
98
99static inline void kfree(void *p)
100{
101 free(p);
102}
103
104#define container_of(ptr, type, member) ({ \
105 const typeof( ((type *)0)->member ) *__mptr = (ptr); \
106 (type *)( (char *)__mptr - offsetof(type,member) );})
107
108#define uninitialized_var(x) x = x
109
110# ifndef likely
111# define likely(x) (__builtin_expect(!!(x), 1))
112# endif
113# ifndef unlikely
114# define unlikely(x) (__builtin_expect(!!(x), 0))
115# endif
116
117#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
118#ifdef DEBUG
119#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
120#else
121#define pr_debug(format, ...) do {} while (0)
122#endif
123#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
124#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
125 5
126/* TODO: empty stubs for now. Broken but enough for virtio_ring.c */ 6/* TODO: empty stubs for now. Broken but enough for virtio_ring.c */
127#define list_add_tail(a, b) do {} while (0) 7#define list_add_tail(a, b) do {} while (0)
@@ -131,6 +11,7 @@ static inline void kfree(void *p)
131#define BITS_PER_BYTE 8 11#define BITS_PER_BYTE 8
132#define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE) 12#define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE)
133#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) 13#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
14
134/* TODO: Not atomic as it should be: 15/* TODO: Not atomic as it should be:
135 * we don't use this for anything important. */ 16 * we don't use this for anything important. */
136static inline void clear_bit(int nr, volatile unsigned long *addr) 17static inline void clear_bit(int nr, volatile unsigned long *addr)
@@ -145,10 +26,6 @@ static inline int test_bit(int nr, const volatile unsigned long *addr)
145{ 26{
146 return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); 27 return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
147} 28}
148
149/* The only feature we care to support */
150#define virtio_has_feature(dev, feature) \
151 test_bit((feature), (dev)->features)
152/* end of stubs */ 29/* end of stubs */
153 30
154struct virtio_device { 31struct virtio_device {
@@ -163,39 +40,32 @@ struct virtqueue {
163 void (*callback)(struct virtqueue *vq); 40 void (*callback)(struct virtqueue *vq);
164 const char *name; 41 const char *name;
165 struct virtio_device *vdev; 42 struct virtio_device *vdev;
43 unsigned int index;
44 unsigned int num_free;
166 void *priv; 45 void *priv;
167}; 46};
168 47
169#define EXPORT_SYMBOL_GPL(__EXPORT_SYMBOL_GPL_name) \
170 void __EXPORT_SYMBOL_GPL##__EXPORT_SYMBOL_GPL_name() { \
171}
172#define MODULE_LICENSE(__MODULE_LICENSE_value) \ 48#define MODULE_LICENSE(__MODULE_LICENSE_value) \
173 const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value 49 const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value
174 50
175#define CONFIG_SMP
176
177#if defined(__i386__) || defined(__x86_64__)
178#define barrier() asm volatile("" ::: "memory")
179#define mb() __sync_synchronize()
180
181#define smp_mb() mb()
182# define smp_rmb() barrier()
183# define smp_wmb() barrier()
184/* Weak barriers should be used. If not - it's a bug */
185# define rmb() abort()
186# define wmb() abort()
187#else
188#error Please fill in barrier macros
189#endif
190
191/* Interfaces exported by virtio_ring. */ 51/* Interfaces exported by virtio_ring. */
192int virtqueue_add_buf(struct virtqueue *vq, 52int virtqueue_add_sgs(struct virtqueue *vq,
193 struct scatterlist sg[], 53 struct scatterlist *sgs[],
194 unsigned int out_num, 54 unsigned int out_sgs,
195 unsigned int in_num, 55 unsigned int in_sgs,
196 void *data, 56 void *data,
197 gfp_t gfp); 57 gfp_t gfp);
198 58
59int virtqueue_add_outbuf(struct virtqueue *vq,
60 struct scatterlist sg[], unsigned int num,
61 void *data,
62 gfp_t gfp);
63
64int virtqueue_add_inbuf(struct virtqueue *vq,
65 struct scatterlist sg[], unsigned int num,
66 void *data,
67 gfp_t gfp);
68
199void virtqueue_kick(struct virtqueue *vq); 69void virtqueue_kick(struct virtqueue *vq);
200 70
201void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); 71void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len);
@@ -206,7 +76,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
206bool virtqueue_enable_cb_delayed(struct virtqueue *vq); 76bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
207 77
208void *virtqueue_detach_unused_buf(struct virtqueue *vq); 78void *virtqueue_detach_unused_buf(struct virtqueue *vq);
209struct virtqueue *vring_new_virtqueue(unsigned int num, 79struct virtqueue *vring_new_virtqueue(unsigned int index,
80 unsigned int num,
210 unsigned int vring_align, 81 unsigned int vring_align,
211 struct virtio_device *vdev, 82 struct virtio_device *vdev,
212 bool weak_barriers, 83 bool weak_barriers,
diff --git a/tools/virtio/linux/virtio_config.h b/tools/virtio/linux/virtio_config.h
new file mode 100644
index 000000000000..5049967f99f7
--- /dev/null
+++ b/tools/virtio/linux/virtio_config.h
@@ -0,0 +1,6 @@
1#define VIRTIO_TRANSPORT_F_START 28
2#define VIRTIO_TRANSPORT_F_END 32
3
4#define virtio_has_feature(dev, feature) \
5 test_bit((feature), (dev)->features)
6
diff --git a/tools/virtio/linux/virtio_ring.h b/tools/virtio/linux/virtio_ring.h
new file mode 100644
index 000000000000..8949c4e2772c
--- /dev/null
+++ b/tools/virtio/linux/virtio_ring.h
@@ -0,0 +1 @@
#include "../../../include/linux/virtio_ring.h"
diff --git a/tools/virtio/linux/vringh.h b/tools/virtio/linux/vringh.h
new file mode 100644
index 000000000000..9348957be56e
--- /dev/null
+++ b/tools/virtio/linux/vringh.h
@@ -0,0 +1 @@
#include "../../../include/linux/vringh.h"
diff --git a/tools/virtio/uapi/linux/uio.h b/tools/virtio/uapi/linux/uio.h
new file mode 100644
index 000000000000..7230e9002207
--- /dev/null
+++ b/tools/virtio/uapi/linux/uio.h
@@ -0,0 +1 @@
#include <sys/uio.h>
diff --git a/tools/virtio/uapi/linux/virtio_config.h b/tools/virtio/uapi/linux/virtio_config.h
new file mode 100644
index 000000000000..4c86675f0159
--- /dev/null
+++ b/tools/virtio/uapi/linux/virtio_config.h
@@ -0,0 +1 @@
#include "../../../../include/uapi/linux/virtio_config.h"
diff --git a/tools/virtio/uapi/linux/virtio_ring.h b/tools/virtio/uapi/linux/virtio_ring.h
new file mode 100644
index 000000000000..4d99c78234d3
--- /dev/null
+++ b/tools/virtio/uapi/linux/virtio_ring.h
@@ -0,0 +1,4 @@
1#ifndef VIRTIO_RING_H
2#define VIRTIO_RING_H
3#include "../../../../include/uapi/linux/virtio_ring.h"
4#endif /* VIRTIO_RING_H */
diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c
index fcc9aa25fd08..da7a19558281 100644
--- a/tools/virtio/virtio_test.c
+++ b/tools/virtio/virtio_test.c
@@ -10,11 +10,15 @@
10#include <sys/stat.h> 10#include <sys/stat.h>
11#include <sys/types.h> 11#include <sys/types.h>
12#include <fcntl.h> 12#include <fcntl.h>
13#include <stdbool.h>
13#include <linux/vhost.h> 14#include <linux/vhost.h>
14#include <linux/virtio.h> 15#include <linux/virtio.h>
15#include <linux/virtio_ring.h> 16#include <linux/virtio_ring.h>
16#include "../../drivers/vhost/test.h" 17#include "../../drivers/vhost/test.h"
17 18
19/* Unused */
20void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
21
18struct vq_info { 22struct vq_info {
19 int kick; 23 int kick;
20 int call; 24 int call;
@@ -92,7 +96,8 @@ static void vq_info_add(struct vdev_info *dev, int num)
92 assert(r >= 0); 96 assert(r >= 0);
93 memset(info->ring, 0, vring_size(num, 4096)); 97 memset(info->ring, 0, vring_size(num, 4096));
94 vring_init(&info->vring, num, info->ring, 4096); 98 vring_init(&info->vring, num, info->ring, 4096);
95 info->vq = vring_new_virtqueue(info->vring.num, 4096, &dev->vdev, 99 info->vq = vring_new_virtqueue(info->idx,
100 info->vring.num, 4096, &dev->vdev,
96 true, info->ring, 101 true, info->ring,
97 vq_notify, vq_callback, "test"); 102 vq_notify, vq_callback, "test");
98 assert(info->vq); 103 assert(info->vq);
@@ -161,9 +166,9 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq,
161 do { 166 do {
162 if (started < bufs) { 167 if (started < bufs) {
163 sg_init_one(&sl, dev->buf, dev->buf_size); 168 sg_init_one(&sl, dev->buf, dev->buf_size);
164 r = virtqueue_add_buf(vq->vq, &sl, 1, 0, 169 r = virtqueue_add_outbuf(vq->vq, &sl, 1,
165 dev->buf + started, 170 dev->buf + started,
166 GFP_ATOMIC); 171 GFP_ATOMIC);
167 if (likely(r == 0)) { 172 if (likely(r == 0)) {
168 ++started; 173 ++started;
169 virtqueue_kick(vq->vq); 174 virtqueue_kick(vq->vq);
diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c
new file mode 100644
index 000000000000..d053ea40c001
--- /dev/null
+++ b/tools/virtio/vringh_test.c
@@ -0,0 +1,741 @@
 1/* Simple test of virtio code, entirely in userspace. */
2#define _GNU_SOURCE
3#include <sched.h>
4#include <err.h>
5#include <linux/kernel.h>
6#include <linux/err.h>
7#include <linux/virtio.h>
8#include <linux/vringh.h>
9#include <linux/virtio_ring.h>
10#include <linux/uaccess.h>
11#include <sys/types.h>
12#include <sys/stat.h>
13#include <sys/mman.h>
14#include <sys/wait.h>
15#include <fcntl.h>
16
17#define USER_MEM (1024*1024)
18void *__user_addr_min, *__user_addr_max;
19void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
20static u64 user_addr_offset;
21
22#define RINGSIZE 256
23#define ALIGN 4096
24
25static void never_notify_host(struct virtqueue *vq)
26{
27 abort();
28}
29
30static void never_callback_guest(struct virtqueue *vq)
31{
32 abort();
33}
34
35static bool getrange_iov(struct vringh *vrh, u64 addr, struct vringh_range *r)
36{
37 if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset)
38 return false;
39 if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset)
40 return false;
41
42 r->start = (u64)(unsigned long)__user_addr_min - user_addr_offset;
43 r->end_incl = (u64)(unsigned long)__user_addr_max - 1 - user_addr_offset;
44 r->offset = user_addr_offset;
45 return true;
46}
47
48/* We return single byte ranges. */
49static bool getrange_slow(struct vringh *vrh, u64 addr, struct vringh_range *r)
50{
51 if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset)
52 return false;
53 if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset)
54 return false;
55
56 r->start = addr;
57 r->end_incl = r->start;
58 r->offset = user_addr_offset;
59 return true;
60}
61
62struct guest_virtio_device {
63 struct virtio_device vdev;
64 int to_host_fd;
65 unsigned long notifies;
66};
67
68static void parallel_notify_host(struct virtqueue *vq)
69{
70 struct guest_virtio_device *gvdev;
71
72 gvdev = container_of(vq->vdev, struct guest_virtio_device, vdev);
73 write(gvdev->to_host_fd, "", 1);
74 gvdev->notifies++;
75}
76
77static void no_notify_host(struct virtqueue *vq)
78{
79}
80
81#define NUM_XFERS (10000000)
82
83/* We aim for two "distant" cpus. */
84static void find_cpus(unsigned int *first, unsigned int *last)
85{
86 unsigned int i;
87
88 *first = -1U;
89 *last = 0;
90 for (i = 0; i < 4096; i++) {
91 cpu_set_t set;
92 CPU_ZERO(&set);
93 CPU_SET(i, &set);
94 if (sched_setaffinity(getpid(), sizeof(set), &set) == 0) {
95 if (i < *first)
96 *first = i;
97 if (i > *last)
98 *last = i;
99 }
100 }
101}
102
103/* Opencoded version for fast mode */
104static inline int vringh_get_head(struct vringh *vrh, u16 *head)
105{
106 u16 avail_idx, i;
107 int err;
108
109 err = get_user(avail_idx, &vrh->vring.avail->idx);
110 if (err)
111 return err;
112
113 if (vrh->last_avail_idx == avail_idx)
114 return 0;
115
116 /* Only get avail ring entries after they have been exposed by guest. */
117 virtio_rmb(vrh->weak_barriers);
118
119 i = vrh->last_avail_idx & (vrh->vring.num - 1);
120
121 err = get_user(*head, &vrh->vring.avail->ring[i]);
122 if (err)
123 return err;
124
125 vrh->last_avail_idx++;
126 return 1;
127}
128
129static int parallel_test(unsigned long features,
130 bool (*getrange)(struct vringh *vrh,
131 u64 addr, struct vringh_range *r),
132 bool fast_vringh)
133{
134 void *host_map, *guest_map;
135 int fd, mapsize, to_guest[2], to_host[2];
136 unsigned long xfers = 0, notifies = 0, receives = 0;
137 unsigned int first_cpu, last_cpu;
138 cpu_set_t cpu_set;
139 char buf[128];
140
141 /* Create real file to mmap. */
142 fd = open("/tmp/vringh_test-file", O_RDWR|O_CREAT|O_TRUNC, 0600);
143 if (fd < 0)
144 err(1, "Opening /tmp/vringh_test-file");
145
146 /* Extra room at the end for some data, and indirects */
147 mapsize = vring_size(RINGSIZE, ALIGN)
148 + RINGSIZE * 2 * sizeof(int)
149 + RINGSIZE * 6 * sizeof(struct vring_desc);
150 mapsize = (mapsize + getpagesize() - 1) & ~(getpagesize() - 1);
151 ftruncate(fd, mapsize);
152
153 /* Parent and child use separate addresses, to check our mapping logic! */
154 host_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
155 guest_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
156
157 pipe(to_guest);
158 pipe(to_host);
159
160 CPU_ZERO(&cpu_set);
161 find_cpus(&first_cpu, &last_cpu);
162 printf("Using CPUS %u and %u\n", first_cpu, last_cpu);
163 fflush(stdout);
164
165 if (fork() != 0) {
166 struct vringh vrh;
167 int status, err, rlen = 0;
168 char rbuf[5];
169
170 /* We are the host: never access guest addresses! */
171 munmap(guest_map, mapsize);
172
173 __user_addr_min = host_map;
174 __user_addr_max = __user_addr_min + mapsize;
175 user_addr_offset = host_map - guest_map;
176 assert(user_addr_offset);
177
178 close(to_guest[0]);
179 close(to_host[1]);
180
181 vring_init(&vrh.vring, RINGSIZE, host_map, ALIGN);
182 vringh_init_user(&vrh, features, RINGSIZE, true,
183 vrh.vring.desc, vrh.vring.avail, vrh.vring.used);
184 CPU_SET(first_cpu, &cpu_set);
185 if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set))
186 errx(1, "Could not set affinity to cpu %u", first_cpu);
187
188 while (xfers < NUM_XFERS) {
189 struct iovec host_riov[2], host_wiov[2];
190 struct vringh_iov riov, wiov;
191 u16 head, written;
192
193 if (fast_vringh) {
194 for (;;) {
195 err = vringh_get_head(&vrh, &head);
196 if (err != 0)
197 break;
198 err = vringh_need_notify_user(&vrh);
199 if (err < 0)
200 errx(1, "vringh_need_notify_user: %i",
201 err);
202 if (err) {
203 write(to_guest[1], "", 1);
204 notifies++;
205 }
206 }
207 if (err != 1)
208 errx(1, "vringh_get_head");
209 written = 0;
210 goto complete;
211 } else {
212 vringh_iov_init(&riov,
213 host_riov,
214 ARRAY_SIZE(host_riov));
215 vringh_iov_init(&wiov,
216 host_wiov,
217 ARRAY_SIZE(host_wiov));
218
219 err = vringh_getdesc_user(&vrh, &riov, &wiov,
220 getrange, &head);
221 }
222 if (err == 0) {
223 err = vringh_need_notify_user(&vrh);
224 if (err < 0)
225 errx(1, "vringh_need_notify_user: %i",
226 err);
227 if (err) {
228 write(to_guest[1], "", 1);
229 notifies++;
230 }
231
232 if (!vringh_notify_enable_user(&vrh))
233 continue;
234
235 /* Swallow all notifies at once. */
236 if (read(to_host[0], buf, sizeof(buf)) < 1)
237 break;
238
239 vringh_notify_disable_user(&vrh);
240 receives++;
241 continue;
242 }
243 if (err != 1)
244 errx(1, "vringh_getdesc_user: %i", err);
245
246 /* We simply copy bytes. */
247 if (riov.used) {
248 rlen = vringh_iov_pull_user(&riov, rbuf,
249 sizeof(rbuf));
250 if (rlen != 4)
251 errx(1, "vringh_iov_pull_user: %i",
252 rlen);
253 assert(riov.i == riov.used);
254 written = 0;
255 } else {
256 err = vringh_iov_push_user(&wiov, rbuf, rlen);
257 if (err != rlen)
258 errx(1, "vringh_iov_push_user: %i",
259 err);
260 assert(wiov.i == wiov.used);
261 written = err;
262 }
263 complete:
264 xfers++;
265
266 err = vringh_complete_user(&vrh, head, written);
267 if (err != 0)
268 errx(1, "vringh_complete_user: %i", err);
269 }
270
271 err = vringh_need_notify_user(&vrh);
272 if (err < 0)
273 errx(1, "vringh_need_notify_user: %i", err);
274 if (err) {
275 write(to_guest[1], "", 1);
276 notifies++;
277 }
278 wait(&status);
279 if (!WIFEXITED(status))
280 errx(1, "Child died with signal %i?", WTERMSIG(status));
281 if (WEXITSTATUS(status) != 0)
282 errx(1, "Child exited %i?", WEXITSTATUS(status));
283 printf("Host: notified %lu, pinged %lu\n", notifies, receives);
284 return 0;
285 } else {
286 struct guest_virtio_device gvdev;
287 struct virtqueue *vq;
288 unsigned int *data;
289 struct vring_desc *indirects;
290 unsigned int finished = 0;
291
292 /* We pass sg[]s pointing into here, but we need RINGSIZE+1 */
293 data = guest_map + vring_size(RINGSIZE, ALIGN);
294 indirects = (void *)data + (RINGSIZE + 1) * 2 * sizeof(int);
295
296 /* We are the guest. */
297 munmap(host_map, mapsize);
298
299 close(to_guest[1]);
300 close(to_host[0]);
301
302 gvdev.vdev.features[0] = features;
303 gvdev.to_host_fd = to_host[1];
304 gvdev.notifies = 0;
305
306 CPU_SET(first_cpu, &cpu_set);
307 if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set))
308 err(1, "Could not set affinity to cpu %u", first_cpu);
309
310 vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &gvdev.vdev, true,
311 guest_map, fast_vringh ? no_notify_host
312 : parallel_notify_host,
313 never_callback_guest, "guest vq");
314
315 /* Don't kfree indirects. */
316 __kfree_ignore_start = indirects;
317 __kfree_ignore_end = indirects + RINGSIZE * 6;
318
319 while (xfers < NUM_XFERS) {
320 struct scatterlist sg[4];
321 unsigned int num_sg, len;
322 int *dbuf, err;
323 bool output = !(xfers % 2);
324
325 /* Consume bufs. */
326 while ((dbuf = virtqueue_get_buf(vq, &len)) != NULL) {
327 if (len == 4)
328 assert(*dbuf == finished - 1);
329 else if (!fast_vringh)
330 assert(*dbuf == finished);
331 finished++;
332 }
333
334 /* Produce a buffer. */
335 dbuf = data + (xfers % (RINGSIZE + 1));
336
337 if (output)
338 *dbuf = xfers;
339 else
340 *dbuf = -1;
341
342 switch ((xfers / sizeof(*dbuf)) % 4) {
343 case 0:
344 /* Nasty three-element sg list. */
345 sg_init_table(sg, num_sg = 3);
346 sg_set_buf(&sg[0], (void *)dbuf, 1);
347 sg_set_buf(&sg[1], (void *)dbuf + 1, 2);
348 sg_set_buf(&sg[2], (void *)dbuf + 3, 1);
349 break;
350 case 1:
351 sg_init_table(sg, num_sg = 2);
352 sg_set_buf(&sg[0], (void *)dbuf, 1);
353 sg_set_buf(&sg[1], (void *)dbuf + 1, 3);
354 break;
355 case 2:
356 sg_init_table(sg, num_sg = 1);
357 sg_set_buf(&sg[0], (void *)dbuf, 4);
358 break;
359 case 3:
360 sg_init_table(sg, num_sg = 4);
361 sg_set_buf(&sg[0], (void *)dbuf, 1);
362 sg_set_buf(&sg[1], (void *)dbuf + 1, 1);
363 sg_set_buf(&sg[2], (void *)dbuf + 2, 1);
364 sg_set_buf(&sg[3], (void *)dbuf + 3, 1);
365 break;
366 }
367
368 /* May allocate an indirect, so force it to allocate
369 * user addr */
370 __kmalloc_fake = indirects + (xfers % RINGSIZE) * 4;
371 if (output)
372 err = virtqueue_add_outbuf(vq, sg, num_sg, dbuf,
373 GFP_KERNEL);
374 else
375 err = virtqueue_add_inbuf(vq, sg, num_sg,
376 dbuf, GFP_KERNEL);
377
378 if (err == -ENOSPC) {
379 if (!virtqueue_enable_cb_delayed(vq))
380 continue;
381 /* Swallow all notifies at once. */
382 if (read(to_guest[0], buf, sizeof(buf)) < 1)
383 break;
384
385 receives++;
386 virtqueue_disable_cb(vq);
387 continue;
388 }
389
390 if (err)
391 errx(1, "virtqueue_add_in/outbuf: %i", err);
392
393 xfers++;
394 virtqueue_kick(vq);
395 }
396
397 /* Any extra? */
398 while (finished != xfers) {
399 int *dbuf;
400 unsigned int len;
401
402 /* Consume bufs. */
403 dbuf = virtqueue_get_buf(vq, &len);
404 if (dbuf) {
405 if (len == 4)
406 assert(*dbuf == finished - 1);
407 else
408 assert(len == 0);
409 finished++;
410 continue;
411 }
412
413 if (!virtqueue_enable_cb_delayed(vq))
414 continue;
415 if (read(to_guest[0], buf, sizeof(buf)) < 1)
416 break;
417
418 receives++;
419 virtqueue_disable_cb(vq);
420 }
421
422 printf("Guest: notified %lu, pinged %lu\n",
423 gvdev.notifies, receives);
424 vring_del_virtqueue(vq);
425 return 0;
426 }
427}
428
429int main(int argc, char *argv[])
430{
431 struct virtio_device vdev;
432 struct virtqueue *vq;
433 struct vringh vrh;
434 struct scatterlist guest_sg[RINGSIZE], *sgs[2];
435 struct iovec host_riov[2], host_wiov[2];
436 struct vringh_iov riov, wiov;
437 struct vring_used_elem used[RINGSIZE];
438 char buf[28];
439 u16 head;
440 int err;
441 unsigned i;
442 void *ret;
443 bool (*getrange)(struct vringh *vrh, u64 addr, struct vringh_range *r);
444 bool fast_vringh = false, parallel = false;
445
446 getrange = getrange_iov;
447 vdev.features[0] = 0;
448
449 while (argv[1]) {
450 if (strcmp(argv[1], "--indirect") == 0)
451 vdev.features[0] |= (1 << VIRTIO_RING_F_INDIRECT_DESC);
452 else if (strcmp(argv[1], "--eventidx") == 0)
453 vdev.features[0] |= (1 << VIRTIO_RING_F_EVENT_IDX);
454 else if (strcmp(argv[1], "--slow-range") == 0)
455 getrange = getrange_slow;
456 else if (strcmp(argv[1], "--fast-vringh") == 0)
457 fast_vringh = true;
458 else if (strcmp(argv[1], "--parallel") == 0)
459 parallel = true;
460 else
461 errx(1, "Unknown arg %s", argv[1]);
462 argv++;
463 }
464
465 if (parallel)
466 return parallel_test(vdev.features[0], getrange, fast_vringh);
467
468 if (posix_memalign(&__user_addr_min, PAGE_SIZE, USER_MEM) != 0)
469 abort();
470 __user_addr_max = __user_addr_min + USER_MEM;
471 memset(__user_addr_min, 0, vring_size(RINGSIZE, ALIGN));
472
473 /* Set up guest side. */
474 vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true,
475 __user_addr_min,
476 never_notify_host, never_callback_guest,
477 "guest vq");
478
479 /* Set up host side. */
480 vring_init(&vrh.vring, RINGSIZE, __user_addr_min, ALIGN);
481 vringh_init_user(&vrh, vdev.features[0], RINGSIZE, true,
482 vrh.vring.desc, vrh.vring.avail, vrh.vring.used);
483
484 /* No descriptor to get yet... */
485 err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
486 if (err != 0)
487 errx(1, "vringh_getdesc_user: %i", err);
488
489 /* Guest puts in a descriptor. */
490 memcpy(__user_addr_max - 1, "a", 1);
491 sg_init_table(guest_sg, 1);
492 sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1);
493 sg_init_table(guest_sg+1, 1);
494 sg_set_buf(&guest_sg[1], __user_addr_max - 3, 2);
495 sgs[0] = &guest_sg[0];
496 sgs[1] = &guest_sg[1];
497
498 /* May allocate an indirect, so force it to allocate user addr */
499 __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN);
500 err = virtqueue_add_sgs(vq, sgs, 1, 1, &err, GFP_KERNEL);
501 if (err)
502 errx(1, "virtqueue_add_sgs: %i", err);
503 __kmalloc_fake = NULL;
504
505	/* Host retrieves it. */
506 vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
507 vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
508
509 err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
510 if (err != 1)
511 errx(1, "vringh_getdesc_user: %i", err);
512
513 assert(riov.used == 1);
514 assert(riov.iov[0].iov_base == __user_addr_max - 1);
515 assert(riov.iov[0].iov_len == 1);
516 if (getrange != getrange_slow) {
517 assert(wiov.used == 1);
518 assert(wiov.iov[0].iov_base == __user_addr_max - 3);
519 assert(wiov.iov[0].iov_len == 2);
520 } else {
521 assert(wiov.used == 2);
522 assert(wiov.iov[0].iov_base == __user_addr_max - 3);
523 assert(wiov.iov[0].iov_len == 1);
524 assert(wiov.iov[1].iov_base == __user_addr_max - 2);
525 assert(wiov.iov[1].iov_len == 1);
526 }
527
528 err = vringh_iov_pull_user(&riov, buf, 5);
529 if (err != 1)
530 errx(1, "vringh_iov_pull_user: %i", err);
531 assert(buf[0] == 'a');
532 assert(riov.i == 1);
533 assert(vringh_iov_pull_user(&riov, buf, 5) == 0);
534
535 memcpy(buf, "bcdef", 5);
536 err = vringh_iov_push_user(&wiov, buf, 5);
537 if (err != 2)
538 errx(1, "vringh_iov_push_user: %i", err);
539 assert(memcmp(__user_addr_max - 3, "bc", 2) == 0);
540 assert(wiov.i == wiov.used);
541 assert(vringh_iov_push_user(&wiov, buf, 5) == 0);
542
543 /* Host is done. */
544 err = vringh_complete_user(&vrh, head, err);
545 if (err != 0)
546 errx(1, "vringh_complete_user: %i", err);
547
548 /* Guest should see used token now. */
549 __kfree_ignore_start = __user_addr_min + vring_size(RINGSIZE, ALIGN);
550 __kfree_ignore_end = __kfree_ignore_start + 1;
551 ret = virtqueue_get_buf(vq, &i);
552 if (ret != &err)
553 errx(1, "virtqueue_get_buf: %p", ret);
554 assert(i == 2);
555
556 /* Guest puts in a huge descriptor. */
557 sg_init_table(guest_sg, RINGSIZE);
558 for (i = 0; i < RINGSIZE; i++) {
559 sg_set_buf(&guest_sg[i],
560 __user_addr_max - USER_MEM/4, USER_MEM/4);
561 }
562
563 /* Fill contents with recognisable garbage. */
564 for (i = 0; i < USER_MEM/4; i++)
565 ((char *)__user_addr_max - USER_MEM/4)[i] = i;
566
567 /* This will allocate an indirect, so force it to allocate user addr */
568 __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN);
569 err = virtqueue_add_outbuf(vq, guest_sg, RINGSIZE, &err, GFP_KERNEL);
570 if (err)
571 errx(1, "virtqueue_add_outbuf (large): %i", err);
572 __kmalloc_fake = NULL;
573
574 /* Host picks it up (allocates new iov). */
575 vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
576 vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
577
578 err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
579 if (err != 1)
580 errx(1, "vringh_getdesc_user: %i", err);
581
582 assert(riov.max_num & VRINGH_IOV_ALLOCATED);
583 assert(riov.iov != host_riov);
584 if (getrange != getrange_slow)
585 assert(riov.used == RINGSIZE);
586 else
587 assert(riov.used == RINGSIZE * USER_MEM/4);
588
589 assert(!(wiov.max_num & VRINGH_IOV_ALLOCATED));
590 assert(wiov.used == 0);
591
592 /* Pull data back out (in odd chunks), should be as expected. */
593 for (i = 0; i < RINGSIZE * USER_MEM/4; i += 3) {
594 err = vringh_iov_pull_user(&riov, buf, 3);
595 if (err != 3 && i + err != RINGSIZE * USER_MEM/4)
596 errx(1, "vringh_iov_pull_user large: %i", err);
597 assert(buf[0] == (char)i);
598 assert(err < 2 || buf[1] == (char)(i + 1));
599 assert(err < 3 || buf[2] == (char)(i + 2));
600 }
601 assert(riov.i == riov.used);
602 vringh_iov_cleanup(&riov);
603 vringh_iov_cleanup(&wiov);
604
605 /* Complete using multi interface, just because we can. */
606 used[0].id = head;
607 used[0].len = 0;
608 err = vringh_complete_multi_user(&vrh, used, 1);
609 if (err)
610 errx(1, "vringh_complete_multi_user(1): %i", err);
611
612 /* Free up those descriptors. */
613 ret = virtqueue_get_buf(vq, &i);
614 if (ret != &err)
615 errx(1, "virtqueue_get_buf: %p", ret);
616
617 /* Add lots of descriptors. */
618 sg_init_table(guest_sg, 1);
619 sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1);
620 for (i = 0; i < RINGSIZE; i++) {
621 err = virtqueue_add_outbuf(vq, guest_sg, 1, &err, GFP_KERNEL);
622 if (err)
623 errx(1, "virtqueue_add_outbuf (multiple): %i", err);
624 }
625
626 /* Now get many, and consume them all at once. */
627 vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
628 vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
629
630 for (i = 0; i < RINGSIZE; i++) {
631 err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
632 if (err != 1)
633 errx(1, "vringh_getdesc_user: %i", err);
634 used[i].id = head;
635 used[i].len = 0;
636 }
637 /* Make sure it wraps around ring, to test! */
638 assert(vrh.vring.used->idx % RINGSIZE != 0);
639 err = vringh_complete_multi_user(&vrh, used, RINGSIZE);
640 if (err)
641 errx(1, "vringh_complete_multi_user: %i", err);
642
643 /* Free those buffers. */
644 for (i = 0; i < RINGSIZE; i++) {
645 unsigned len;
646 assert(virtqueue_get_buf(vq, &len) != NULL);
647 }
648
649 /* Test weird (but legal!) indirect. */
650 if (vdev.features[0] & (1 << VIRTIO_RING_F_INDIRECT_DESC)) {
651 char *data = __user_addr_max - USER_MEM/4;
652 struct vring_desc *d = __user_addr_max - USER_MEM/2;
653 struct vring vring;
654
655 /* Force creation of direct, which we modify. */
656 vdev.features[0] &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
657 vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true,
658 __user_addr_min,
659 never_notify_host,
660 never_callback_guest,
661 "guest vq");
662
663 sg_init_table(guest_sg, 4);
664 sg_set_buf(&guest_sg[0], d, sizeof(*d)*2);
665 sg_set_buf(&guest_sg[1], d + 2, sizeof(*d)*1);
666 sg_set_buf(&guest_sg[2], data + 6, 4);
667 sg_set_buf(&guest_sg[3], d + 3, sizeof(*d)*3);
668
669 err = virtqueue_add_outbuf(vq, guest_sg, 4, &err, GFP_KERNEL);
670 if (err)
671 errx(1, "virtqueue_add_outbuf (indirect): %i", err);
672
673 vring_init(&vring, RINGSIZE, __user_addr_min, ALIGN);
674
675 /* They're used in order, but double-check... */
676 assert(vring.desc[0].addr == (unsigned long)d);
677 assert(vring.desc[1].addr == (unsigned long)(d+2));
678 assert(vring.desc[2].addr == (unsigned long)data + 6);
679 assert(vring.desc[3].addr == (unsigned long)(d+3));
680 vring.desc[0].flags |= VRING_DESC_F_INDIRECT;
681 vring.desc[1].flags |= VRING_DESC_F_INDIRECT;
682 vring.desc[3].flags |= VRING_DESC_F_INDIRECT;
683
684 /* First indirect */
685 d[0].addr = (unsigned long)data;
686 d[0].len = 1;
687 d[0].flags = VRING_DESC_F_NEXT;
688 d[0].next = 1;
689 d[1].addr = (unsigned long)data + 1;
690 d[1].len = 2;
691 d[1].flags = 0;
692
693 /* Second indirect */
694 d[2].addr = (unsigned long)data + 3;
695 d[2].len = 3;
696 d[2].flags = 0;
697
698 /* Third indirect */
699 d[3].addr = (unsigned long)data + 10;
700 d[3].len = 5;
701 d[3].flags = VRING_DESC_F_NEXT;
702 d[3].next = 1;
703 d[4].addr = (unsigned long)data + 15;
704 d[4].len = 6;
705 d[4].flags = VRING_DESC_F_NEXT;
706 d[4].next = 2;
707 d[5].addr = (unsigned long)data + 21;
708 d[5].len = 7;
709 d[5].flags = 0;
710
711 /* Host picks it up (allocates new iov). */
712 vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
713 vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
714
715 err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
716 if (err != 1)
717 errx(1, "vringh_getdesc_user: %i", err);
718
719 if (head != 0)
720 errx(1, "vringh_getdesc_user: head %i not 0", head);
721
722 assert(riov.max_num & VRINGH_IOV_ALLOCATED);
723 if (getrange != getrange_slow)
724 assert(riov.used == 7);
725 else
726 assert(riov.used == 28);
727 err = vringh_iov_pull_user(&riov, buf, 29);
728 assert(err == 28);
729
730 /* Data should be linear. */
731 for (i = 0; i < err; i++)
732 assert(buf[i] == i);
733 vringh_iov_cleanup(&riov);
734 }
735
736 /* Don't leak memory... */
737 vring_del_virtqueue(vq);
738 free(__user_addr_min);
739
740 return 0;
741}
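
For reference, the host-side service loop that vringh_test.c exercises boils down to the sequence below: set up the ring view with vringh_init_user(), fetch a descriptor chain with vringh_getdesc_user(), copy data through vringh_iov_pull_user()/vringh_iov_push_user(), retire the chain with vringh_complete_user(), and kick the guest when vringh_need_notify_user() says so. This is a condensed sketch under the test's own assumptions (the RINGSIZE/ALIGN constants and a getrange callback like getrange_iov() above); the real test additionally does CPU pinning, the open-coded vringh_get_head() fast path, and pipe-based notification.

/* Condensed sketch of one pass of the host loop from the test above. */
#include <linux/kernel.h>
#include <linux/vringh.h>
#include <linux/virtio_ring.h>
#include <linux/uaccess.h>

static void serve_once(void *ring_base, unsigned long features,
		       bool (*getrange)(struct vringh *, u64,
					struct vringh_range *))
{
	struct iovec riov_space[2], wiov_space[2];
	struct vringh_iov riov, wiov;
	struct vringh vrh;
	char scratch[32];
	u16 head;
	int err, len;

	vring_init(&vrh.vring, RINGSIZE, ring_base, ALIGN);
	vringh_init_user(&vrh, features, RINGSIZE, true,
			 vrh.vring.desc, vrh.vring.avail, vrh.vring.used);

	vringh_iov_init(&riov, riov_space, ARRAY_SIZE(riov_space));
	vringh_iov_init(&wiov, wiov_space, ARRAY_SIZE(wiov_space));

	/* 1 = got a chain, 0 = ring empty, < 0 = error. */
	err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
	if (err != 1)
		return;

	/* Drain the readable part, echo it back into the writable part. */
	len = vringh_iov_pull_user(&riov, scratch, sizeof(scratch));
	if (len > 0)
		len = vringh_iov_push_user(&wiov, scratch, len);

	/* Retire the chain, then notify the guest only if it wants it. */
	vringh_complete_user(&vrh, head, len > 0 ? len : 0);
	if (vringh_need_notify_user(&vrh) > 0) {
		/* e.g. write a byte to the to_guest pipe, as the test does */
	}
}
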