60 files changed, 4481 insertions, 4074 deletions
diff --git a/Documentation/virtual/00-INDEX b/Documentation/virtual/00-INDEX
index 924bd462675e..e952d30bbf0f 100644
--- a/Documentation/virtual/00-INDEX
+++ b/Documentation/virtual/00-INDEX
@@ -6,6 +6,3 @@ kvm/
   - Kernel Virtual Machine. See also http://linux-kvm.org
 uml/
   - User Mode Linux, builds/runs Linux kernel as a userspace program.
-virtio.txt
-  - Text version of draft virtio spec.
-    See http://ozlabs.org/~rusty/virtio-spec
diff --git a/Documentation/virtual/virtio-spec.txt b/Documentation/virtual/virtio-spec.txt
deleted file mode 100644
index eb094039b50d..000000000000
--- a/Documentation/virtual/virtio-spec.txt
+++ /dev/null
@@ -1,3210 +0,0 @@
1 | [Generated file: see http://ozlabs.org/~rusty/virtio-spec/] | ||
2 | Virtio PCI Card Specification | ||
3 | v0.9.5 DRAFT | ||
4 | - | ||
5 | |||
6 | Rusty Russell <rusty@rustcorp.com.au> IBM Corporation (Editor) | ||
7 | |||
8 | 2012 May 7. | ||
9 | |||
10 | Purpose and Description | ||
11 | |||
This document describes the specifications of the “virtio” family
of PCI devices. These devices are found in virtual environments,
yet by design they are not all that different from physical PCI
devices, and this document treats them as such. This allows the
guest to use standard PCI drivers and discovery mechanisms.
18 | |||
19 | The purpose of virtio and this specification is that virtual | ||
20 | environments and guests should have a straightforward, efficient, | ||
21 | standard and extensible mechanism for virtual devices, rather | ||
22 | than boutique per-environment or per-OS mechanisms. | ||
23 | |||
24 | Straightforward: Virtio PCI devices use normal PCI mechanisms | ||
25 | of interrupts and DMA which should be familiar to any device | ||
26 | driver author. There is no exotic page-flipping or COW | ||
27 | mechanism: it's just a PCI device.[footnote: | ||
28 | This lack of page-sharing implies that the implementation of the | ||
29 | device (e.g. the hypervisor or host) needs full access to the | ||
30 | guest memory. Communication with untrusted parties (i.e. | ||
31 | inter-guest communication) requires copying. | ||
32 | ] | ||
33 | |||
34 | Efficient: Virtio PCI devices consist of rings of descriptors | ||
35 | for input and output, which are neatly separated to avoid cache | ||
36 | effects from both guest and device writing to the same cache | ||
37 | lines. | ||
38 | |||
39 | Standard: Virtio PCI makes no assumptions about the environment | ||
40 | in which it operates, beyond supporting PCI. In fact the virtio | ||
41 | devices specified in the appendices do not require PCI at all: | ||
42 | they have been implemented on non-PCI buses.[footnote: | ||
43 | The Linux implementation further separates the PCI virtio code | ||
44 | from the specific virtio drivers: these drivers are shared with | ||
45 | the non-PCI implementations (currently lguest and S/390). | ||
46 | ] | ||
47 | |||
48 | Extensible: Virtio PCI devices contain feature bits which are | ||
49 | acknowledged by the guest operating system during device setup. | ||
50 | This allows forwards and backwards compatibility: the device | ||
51 | offers all the features it knows about, and the driver | ||
52 | acknowledges those it understands and wishes to use. | ||
53 | |||
54 | Virtqueues | ||
55 | |||
56 | The mechanism for bulk data transport on virtio PCI devices is | ||
57 | pretentiously called a virtqueue. Each device can have zero or | ||
58 | more virtqueues: for example, the network device has one for | ||
59 | transmit and one for receive. | ||
60 | |||
61 | Each virtqueue occupies two or more physically-contiguous pages | ||
62 | (defined, for the purposes of this specification, as 4096 bytes), | ||
63 | and consists of three parts: | ||
64 | |||
65 | |||
66 | +-------------------+-----------------------------------+-----------+ | ||
67 | | Descriptor Table | Available Ring (padding) | Used Ring | | ||
68 | +-------------------+-----------------------------------+-----------+ | ||
69 | |||
70 | |||
71 | When the driver wants to send a buffer to the device, it fills in | ||
72 | a slot in the descriptor table (or chains several together), and | ||
73 | writes the descriptor index into the available ring. It then | ||
74 | notifies the device. When the device has finished a buffer, it | ||
75 | writes the descriptor into the used ring, and sends an interrupt. | ||
76 | |||
77 | Specification | ||
78 | |||
79 | PCI Discovery | ||
80 | |||
81 | Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000 | ||
82 | through 0x103F inclusive is a virtio device[footnote: | ||
83 | The actual value within this range is ignored | ||
84 | ]. The device must also have a Revision ID of 0 to match this | ||
85 | specification. | ||
86 | |||
87 | The Subsystem Device ID indicates which virtio device is | ||
88 | supported by the device. The Subsystem Vendor ID should reflect | ||
89 | the PCI Vendor ID of the environment (it's currently only used | ||
90 | for informational purposes by the guest). | ||
91 | |||
92 | |||
+----------------------+--------------------+---------------+
| Subsystem Device ID  | Virtio Device      | Specification |
+----------------------+--------------------+---------------+
| 1                    | network card       | Appendix C    |
+----------------------+--------------------+---------------+
| 2                    | block device       | Appendix D    |
+----------------------+--------------------+---------------+
| 3                    | console            | Appendix E    |
+----------------------+--------------------+---------------+
| 4                    | entropy source     | Appendix F    |
+----------------------+--------------------+---------------+
| 5                    | memory ballooning  | Appendix G    |
+----------------------+--------------------+---------------+
| 6                    | ioMemory           | -             |
+----------------------+--------------------+---------------+
| 7                    | rpmsg              | Appendix H    |
+----------------------+--------------------+---------------+
| 8                    | SCSI host          | Appendix I    |
+----------------------+--------------------+---------------+
| 9                    | 9P transport       | -             |
+----------------------+--------------------+---------------+
| 10                   | mac80211 wlan      | -             |
+----------------------+--------------------+---------------+
117 | |||
118 | |||
119 | Device Configuration | ||
120 | |||
121 | To configure the device, we use the first I/O region of the PCI | ||
122 | device. This contains a virtio header followed by a | ||
123 | device-specific region. | ||
124 | |||
There may be different widths of accesses to the I/O region; the
“natural” access method for each field in the virtio header must
be used (i.e. 32-bit accesses for 32-bit fields, etc), but the
device-specific region can be accessed using any access width,
and should yield the same results.
130 | |||
131 | Note that this is possible because while the virtio header is PCI | ||
132 | (i.e. little) endian, the device-specific region is encoded in | ||
133 | the native endian of the guest (where such distinction is | ||
134 | applicable). | ||
135 | |||
136 | Device Initialization Sequence<sub:Device-Initialization-Sequence> | ||
137 | |||
We start with an overview of device initialization, then expand
on the details of the device and how each step is performed.
140 | |||
1. Reset the device. This is not required on initial start up.

2. The ACKNOWLEDGE status bit is set: we have noticed the device.

3. The DRIVER status bit is set: we know how to drive the device.

4. Device-specific setup, including reading the Device Feature
   Bits, discovery of virtqueues for the device, optional MSI-X
   setup, and reading and possibly writing the virtio
   configuration space.

5. The subset of Device Feature Bits understood by the driver is
   written to the device.

6. The DRIVER_OK status bit is set.

7. The device can now be used (ie. buffers added to the
   virtqueues)[footnote:
   Historically, drivers have used the device before steps 5 and 6.
   This is only allowed if the driver does not use any features
   which would alter this early use of the device.
   ]
163 | |||
164 | If any of these steps go irrecoverably wrong, the guest should | ||
165 | set the FAILED status bit to indicate that it has given up on the | ||
166 | device (it can reset the device later to restart if desired). | ||
167 | |||
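To make the sequence concrete, here is a minimal sketch in C of
these steps for a legacy virtio PCI device. The io_read*()/io_write*()
port accessors and the "base" I/O address are assumptions; the
register offsets follow from the virtio header table in the next
section.

#include <stdint.h>

#define VIRTIO_DEVICE_FEATURES  0       /* 32-bit, R   */
#define VIRTIO_GUEST_FEATURES   4       /* 32-bit, R+W */
#define VIRTIO_DEVICE_STATUS    18      /* 8-bit,  R+W */

#define STATUS_ACKNOWLEDGE      1
#define STATUS_DRIVER           2
#define STATUS_DRIVER_OK        4
#define STATUS_FAILED           128

extern uint32_t io_read32(unsigned long addr);          /* assumed helpers */
extern void io_write32(unsigned long addr, uint32_t v);
extern void io_write8(unsigned long addr, uint8_t v);

static void virtio_device_init(unsigned long base, uint32_t driver_features)
{
        uint32_t features;

        io_write8(base + VIRTIO_DEVICE_STATUS, 0);              /* 1: reset */
        io_write8(base + VIRTIO_DEVICE_STATUS, STATUS_ACKNOWLEDGE);    /* 2 */
        io_write8(base + VIRTIO_DEVICE_STATUS,
                  STATUS_ACKNOWLEDGE | STATUS_DRIVER);          /* 3 */

        /* 4: device-specific setup (virtqueues, MSI-X, config space)
         *    would happen here. */

        /* 5: acknowledge the subset of feature bits we understand. */
        features = io_read32(base + VIRTIO_DEVICE_FEATURES);
        io_write32(base + VIRTIO_GUEST_FEATURES, features & driver_features);

        /* 6: tell the device we are ready. */
        io_write8(base + VIRTIO_DEVICE_STATUS,
                  STATUS_ACKNOWLEDGE | STATUS_DRIVER | STATUS_DRIVER_OK);
}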
168 | We now cover the fields required for general setup in detail. | ||
169 | |||
170 | Virtio Header | ||
171 | |||
172 | The virtio header looks as follows: | ||
173 | |||
174 | |||
175 | +------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ | ||
176 | | Bits || 32 | 32 | 32 | 16 | 16 | 16 | 8 | 8 | | ||
177 | +------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ | ||
178 | | Read/Write || R | R+W | R+W | R | R+W | R+W | R+W | R | | ||
179 | +------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ | ||
180 | | Purpose || Device | Guest | Queue | Queue | Queue | Queue | Device | ISR | | ||
181 | | || Features bits 0:31 | Features bits 0:31 | Address | Size | Select | Notify | Status | Status | | ||
182 | +------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ | ||
183 | |||
184 | |||
185 | If MSI-X is enabled for the device, two additional fields | ||
186 | immediately follow this header:[footnote: | ||
187 | ie. once you enable MSI-X on the device, the other fields move. | ||
188 | If you turn it off again, they move back! | ||
189 | ] | ||
190 | |||
191 | |||
+------------++----------------+--------+
| Bits       || 16             | 16     |
+------------++----------------+--------+
| Read/Write || R+W            | R+W    |
+------------++----------------+--------+
| Purpose    || Configuration  | Queue  |
| (MSI-X)    || Vector         | Vector |
+------------++----------------+--------+
201 | |||
202 | |||
203 | Immediately following these general headers, there may be | ||
204 | device-specific headers: | ||
205 | |||
206 | |||
+------------++--------------------+
| Bits       || Device Specific    |
+------------++--------------------+
| Read/Write || Device Specific    |
+------------++--------------------+
| Purpose    || Device Specific... |
+------------++--------------------+
216 | |||
217 | |||
218 | Device Status | ||
219 | |||
220 | The Device Status field is updated by the guest to indicate its | ||
221 | progress. This provides a simple low-level diagnostic: it's most | ||
222 | useful to imagine them hooked up to traffic lights on the console | ||
223 | indicating the status of each device. | ||
224 | |||
225 | The device can be reset by writing a 0 to this field, otherwise | ||
226 | at least one bit should be set: | ||
227 | |||
228 | ACKNOWLEDGE (1) Indicates that the guest OS has found the | ||
229 | device and recognized it as a valid virtio device. | ||
230 | |||
231 | DRIVER (2) Indicates that the guest OS knows how to drive the | ||
232 | device. Under Linux, drivers can be loadable modules so there | ||
233 | may be a significant (or infinite) delay before setting this | ||
234 | bit. | ||
235 | |||
236 | DRIVER_OK (4) Indicates that the driver is set up and ready to | ||
237 | drive the device. | ||
238 | |||
239 | FAILED (128) Indicates that something went wrong in the guest, | ||
240 | and it has given up on the device. This could be an internal | ||
241 | error, or the driver didn't like the device for some reason, or | ||
242 | even a fatal error during device operation. The device must be | ||
243 | reset before attempting to re-initialize. | ||
244 | |||
245 | Feature Bits<sub:Feature-Bits> | ||
246 | |||
The first configuration field indicates the features that the
device supports. The bits are allocated as follows:
249 | |||
0 to 23  Feature bits for the specific device type

24 to 31 Feature bits reserved for extensions to the queue and
         feature negotiation mechanisms
254 | |||
255 | For example, feature bit 0 for a network device (i.e. Subsystem | ||
256 | Device ID 1) indicates that the device supports checksumming of | ||
257 | packets. | ||
258 | |||
259 | The feature bits are negotiated: the device lists all the | ||
260 | features it understands in the Device Features field, and the | ||
261 | guest writes the subset that it understands into the Guest | ||
262 | Features field. The only way to renegotiate is to reset the | ||
263 | device. | ||
264 | |||
265 | In particular, new fields in the device configuration header are | ||
266 | indicated by offering a feature bit, so the guest can check | ||
267 | before accessing that part of the configuration space. | ||
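For illustration only (the feature bit and the config-space offset
below are hypothetical, and io_read32()/io_read16() are the assumed
accessors from the earlier sketch), a driver would gate its access
like this:

/* Sketch: read a config field only if the device offered the
 * (hypothetical) feature bit guarding it. */
#define MY_F_EXTRA_FIELD        (1u << 5)       /* hypothetical */

uint16_t extra = 0;
uint32_t device_features = io_read32(base + VIRTIO_DEVICE_FEATURES);
if (device_features & MY_F_EXTRA_FIELD) {
        /* hypothetical field at offset 2 of the device-specific region */
        extra = io_read16(base + dev_cfg_base + 2);
}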
268 | |||
269 | This allows for forwards and backwards compatibility: if the | ||
270 | device is enhanced with a new feature bit, older guests will not | ||
271 | write that feature bit back to the Guest Features field and it | ||
272 | can go into backwards compatibility mode. Similarly, if a guest | ||
273 | is enhanced with a feature that the device doesn't support, it | ||
274 | will not see that feature bit in the Device Features field and | ||
275 | can go into backwards compatibility mode (or, for poor | ||
276 | implementations, set the FAILED Device Status bit). | ||
277 | |||
278 | Configuration/Queue Vectors | ||
279 | |||
When MSI-X capability is present and enabled in the device
(through standard PCI configuration space) 4 bytes at byte offset
20 are used to map configuration change and queue interrupts to
MSI-X vectors. In this case, the ISR Status field is unused, and
device specific configuration starts at byte offset 24 in the
virtio header structure. When MSI-X capability is not enabled,
device specific configuration starts at byte offset 20 in the
virtio header.
287 | |||
288 | Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of | ||
289 | Configuration/Queue Vector registers, maps interrupts triggered | ||
290 | by the configuration change/selected queue events respectively to | ||
291 | the corresponding MSI-X vector. To disable interrupts for a | ||
292 | specific event type, unmap it by writing a special NO_VECTOR | ||
293 | value: | ||
294 | |||
295 | /* Vector value used to disable MSI for queue */ | ||
296 | |||
297 | #define VIRTIO_MSI_NO_VECTOR 0xffff | ||
298 | |||
299 | Reading these registers returns vector mapped to a given event, | ||
300 | or NO_VECTOR if unmapped. All queue and configuration change | ||
301 | events are unmapped by default. | ||
302 | |||
Note that mapping an event to a vector might require allocating
internal device resources, and might fail. Devices report such
failures by returning the NO_VECTOR value when the relevant
Vector field is read. After mapping an event to a vector, the
driver must verify success by reading the Vector field value: on
success, the previously written value is returned, and on
failure, NO_VECTOR is returned. If a mapping failure is detected,
the driver can retry mapping with fewer vectors, or disable MSI-X.
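As a sketch (again with assumed io_read16()/io_write16() port
accessors), mapping a queue event to a vector and verifying the
mapping looks like this; the offsets follow from the header layout
above (Queue Select at byte 14, the two 16-bit vector fields at
byte 20):

#define VIRTIO_QUEUE_SELECT             14
#define VIRTIO_MSI_CONFIG_VECTOR        20
#define VIRTIO_MSI_QUEUE_VECTOR         22

extern uint16_t io_read16(unsigned long addr);          /* assumed helpers */
extern void io_write16(unsigned long addr, uint16_t v);

static int map_queue_vector(unsigned long base, uint16_t queue, uint16_t vec)
{
        io_write16(base + VIRTIO_QUEUE_SELECT, queue);
        io_write16(base + VIRTIO_MSI_QUEUE_VECTOR, vec);
        /* Mapping can fail if the device is out of internal resources. */
        if (io_read16(base + VIRTIO_MSI_QUEUE_VECTOR) == VIRTIO_MSI_NO_VECTOR)
                return -1;      /* retry with fewer vectors, or disable MSI-X */
        return 0;
}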
311 | |||
312 | Virtqueue Configuration<sec:Virtqueue-Configuration> | ||
313 | |||
314 | As a device can have zero or more virtqueues for bulk data | ||
315 | transport (for example, the network driver has two), the driver | ||
316 | needs to configure them as part of the device-specific | ||
317 | configuration. | ||
318 | |||
This is done as follows, for each virtqueue a device has:

1. Write the virtqueue index (first queue is 0) to the Queue
   Select field.

2. Read the virtqueue size from the Queue Size field, which is
   always a power of 2. This controls how big the virtqueue is
   (see below). If this field is 0, the virtqueue does not exist.

3. Allocate and zero the virtqueue in contiguous physical memory,
   on a 4096 byte alignment. Write the physical address, divided
   by 4096, to the Queue Address field.[footnote:
   The 4096 is based on the x86 page size, but it's also large
   enough to ensure that the separate parts of the virtqueue are on
   separate cache lines.
   ]

4. Optionally, if MSI-X capability is present and enabled on the
   device, select a vector to use to request interrupts triggered
   by virtqueue events. Write the MSI-X Table entry number
   corresponding to this vector in the Queue Vector field. Read the
   Queue Vector field: on success, the previously written value is
   returned; on failure, the NO_VECTOR value is returned.
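Putting these steps together, a minimal sketch follows; vring_size()
is the formula defined just below, while alloc_pages_zeroed() and
virt_to_phys() are assumed platform helpers (as are the io_* accessors
from the earlier sketches):

#define VIRTIO_QUEUE_ADDRESS    8       /* 32-bit, R+W */
#define VIRTIO_QUEUE_SIZE       12      /* 16-bit, R   */

static int virtio_setup_queue(unsigned long base, uint16_t index)
{
        uint16_t qsz;
        void *ring;

        io_write16(base + VIRTIO_QUEUE_SELECT, index);  /* step 1 */
        qsz = io_read16(base + VIRTIO_QUEUE_SIZE);      /* step 2 */
        if (qsz == 0)
                return -1;      /* this virtqueue does not exist */

        /* step 3: 4096-byte aligned, zeroed, physically contiguous. */
        ring = alloc_pages_zeroed(vring_size(qsz));
        io_write32(base + VIRTIO_QUEUE_ADDRESS, virt_to_phys(ring) / 4096);
        return 0;
}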
342 | |||
343 | The Queue Size field controls the total number of bytes required | ||
344 | for the virtqueue according to the following formula: | ||
345 | |||
#define ALIGN(x) (((x) + 4095) & ~4095)

static inline unsigned vring_size(unsigned int qsz)
{
        return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(2 + qsz))
             + ALIGN(sizeof(struct vring_used_elem)*qsz);
}
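As a worked example of this formula, take a Queue Size of 256: the
descriptor table plus available ring occupy 16*256 + 2*(2 + 256) =
4612 bytes, which ALIGN rounds up to 8192; the used ring occupies
8*256 = 2048 bytes, rounded up to 4096. The whole virtqueue therefore
needs 12288 bytes, i.e. three 4096-byte pages.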
358 | |||
359 | This currently wastes some space with padding, but also allows | ||
360 | future extensions. The virtqueue layout structure looks like this | ||
361 | (qsz is the Queue Size field, which is a variable, so this code | ||
362 | won't compile): | ||
363 | |||
struct vring {
        /* The actual descriptors (16 bytes each) */
        struct vring_desc desc[qsz];

        /* A ring of available descriptor heads with free-running
           index. */
        struct vring_avail avail;

        // Padding to the next 4096 boundary.
        char pad[];

        // A ring of used descriptor heads with free-running index.
        struct vring_used used;
};
390 | |||
391 | A Note on Virtqueue Endianness | ||
392 | |||
393 | Note that the endian of these fields and everything else in the | ||
394 | virtqueue is the native endian of the guest, not little-endian as | ||
395 | PCI normally is. This makes for simpler guest code, and it is | ||
396 | assumed that the host already has to be deeply aware of the guest | ||
397 | endian so such an “endian-aware” device is not a significant | ||
398 | issue. | ||
399 | |||
400 | Descriptor Table | ||
401 | |||
402 | The descriptor table refers to the buffers the guest is using for | ||
403 | the device. The addresses are physical addresses, and the buffers | ||
404 | can be chained via the next field. Each descriptor describes a | ||
405 | buffer which is read-only or write-only, but a chain of | ||
406 | descriptors can contain both read-only and write-only buffers. | ||
407 | |||
No descriptor chain may be more than 2^32 bytes long in total.

struct vring_desc {
        /* Address (guest-physical). */
        u64 addr;
        /* Length. */
        u32 len;
/* This marks a buffer as continuing via the next field. */
#define VRING_DESC_F_NEXT       1
/* This marks a buffer as write-only (otherwise read-only). */
#define VRING_DESC_F_WRITE      2
/* This means the buffer contains a list of buffer descriptors. */
#define VRING_DESC_F_INDIRECT   4
        /* The flags as indicated above. */
        u16 flags;
        /* Next field if flags & NEXT */
        u16 next;
};
440 | |||
441 | The number of descriptors in the table is specified by the Queue | ||
442 | Size field for this virtqueue. | ||
443 | |||
444 | <sub:Indirect-Descriptors>Indirect Descriptors | ||
445 | |||
Some devices benefit by concurrently dispatching a large number
of large requests. The VIRTIO_RING_F_INDIRECT_DESC feature can be
used to allow this (see [cha:Reserved-Feature-Bits]). To increase
ring capacity it is possible to store a table of indirect
descriptors anywhere in memory, and insert a descriptor in the
main virtqueue (with flags&INDIRECT on) that refers to a memory
buffer containing this indirect descriptor table; the addr and len
fields refer to the indirect table address and length in bytes,
respectively. The indirect table layout structure looks like this
(len is the length of the descriptor that refers to this table,
which is a variable, so this code won't compile):
457 | |||
struct indirect_descriptor_table {
        /* The actual descriptors (16 bytes each) */
        struct vring_desc desc[len / 16];
};
465 | |||
The first indirect descriptor is located at the start of the
indirect descriptor table (index 0); additional indirect
descriptors are chained by the next field. An indirect descriptor
without a next field (with flags&NEXT off) signals the end of the
indirect descriptor table, and transfers control back to the main
virtqueue. An indirect descriptor can not refer to another
indirect descriptor table (flags&INDIRECT must be off). A single
indirect descriptor table can include both read-only and
write-only descriptors; the write-only flag (flags&WRITE) in the
descriptor that refers to it is ignored.
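As a sketch, a driver might build a two-element indirect table like
this (virt_to_phys() is an assumed helper, and the table memory is
assumed to be already allocated and visible to the device):

/* Sketch: describe a request with one read-only and one write-only
 * element through a single slot in the main descriptor table. */
static void fill_indirect(struct vring_desc *main_desc,
                          struct vring_desc *table,
                          uint64_t out_addr, uint32_t out_len,
                          uint64_t in_addr, uint32_t in_len)
{
        /* Read-only element, chained to the next table entry. */
        table[0].addr = out_addr;
        table[0].len = out_len;
        table[0].flags = VRING_DESC_F_NEXT;
        table[0].next = 1;

        /* Write-only element; the missing NEXT flag ends the table. */
        table[1].addr = in_addr;
        table[1].len = in_len;
        table[1].flags = VRING_DESC_F_WRITE;

        /* The main-ring descriptor refers to the table itself. */
        main_desc->addr = virt_to_phys(table);
        main_desc->len = 2 * sizeof(struct vring_desc);
        main_desc->flags = VRING_DESC_F_INDIRECT;
}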
476 | |||
477 | Available Ring | ||
478 | |||
479 | The available ring refers to what descriptors we are offering the | ||
480 | device: it refers to the head of a descriptor chain. The “flags” | ||
481 | field is currently 0 or 1: 1 indicating that we do not need an | ||
482 | interrupt when the device consumes a descriptor from the | ||
483 | available ring. Alternatively, the guest can ask the device to | ||
484 | delay interrupts until an entry with an index specified by the “ | ||
485 | used_event” field is written in the used ring (equivalently, | ||
486 | until the idx field in the used ring will reach the value | ||
487 | used_event + 1). The method employed by the device is controlled | ||
488 | by the VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits] | ||
489 | ). This interrupt suppression is merely an optimization; it may | ||
490 | not suppress interrupts entirely. | ||
491 | |||
492 | The “idx” field indicates where we would put the next descriptor | ||
493 | entry (modulo the ring size). This starts at 0, and increases. | ||
494 | |||
struct vring_avail {
#define VRING_AVAIL_F_NO_INTERRUPT 1
        u16 flags;
        u16 idx;
        u16 ring[qsz]; /* qsz is the Queue Size field read from device */
        u16 used_event;
};
509 | |||
510 | Used Ring | ||
511 | |||
512 | The used ring is where the device returns buffers once it is done | ||
513 | with them. The flags field can be used by the device to hint that | ||
514 | no notification is necessary when the guest adds to the available | ||
515 | ring. Alternatively, the “avail_event” field can be used by the | ||
516 | device to hint that no notification is necessary until an entry | ||
517 | with an index specified by the “avail_event” is written in the | ||
518 | available ring (equivalently, until the idx field in the | ||
519 | available ring will reach the value avail_event + 1). The method | ||
520 | employed by the device is controlled by the guest through the | ||
521 | VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits] | ||
522 | ). [footnote: | ||
523 | These fields are kept here because this is the only part of the | ||
524 | virtqueue written by the device | ||
525 | ]. | ||
526 | |||
527 | Each entry in the ring is a pair: the head entry of the | ||
528 | descriptor chain describing the buffer (this matches an entry | ||
529 | placed in the available ring by the guest earlier), and the total | ||
530 | of bytes written into the buffer. The latter is extremely useful | ||
531 | for guests using untrusted buffers: if you do not know exactly | ||
532 | how much has been written by the device, you usually have to zero | ||
533 | the buffer to ensure no data leakage occurs. | ||
534 | |||
/* u32 is used here for ids for padding reasons. */
struct vring_used_elem {
        /* Index of start of used descriptor chain. */
        u32 id;
        /* Total length of the descriptor chain which was used
           (written to) */
        u32 len;
};

struct vring_used {
#define VRING_USED_F_NO_NOTIFY 1
        u16 flags;
        u16 idx;
        struct vring_used_elem ring[qsz];
        u16 avail_event;
};
565 | |||
566 | Helpers for Managing Virtqueues | ||
567 | |||
568 | The Linux Kernel Source code contains the definitions above and | ||
569 | helper routines in a more usable form, in | ||
570 | include/linux/virtio_ring.h. This was explicitly licensed by IBM | ||
571 | and Red Hat under the (3-clause) BSD license so that it can be | ||
572 | freely used by all other projects, and is reproduced (with slight | ||
573 | variation to remove Linux assumptions) in Appendix A. | ||
574 | |||
575 | Device Operation<sec:Device-Operation> | ||
576 | |||
577 | There are two parts to device operation: supplying new buffers to | ||
578 | the device, and processing used buffers from the device. As an | ||
579 | example, the virtio network device has two virtqueues: the | ||
580 | transmit virtqueue and the receive virtqueue. The driver adds | ||
581 | outgoing (read-only) packets to the transmit virtqueue, and then | ||
582 | frees them after they are used. Similarly, incoming (write-only) | ||
583 | buffers are added to the receive virtqueue, and processed after | ||
584 | they are used. | ||
585 | |||
586 | Supplying Buffers to The Device | ||
587 | |||
588 | Actual transfer of buffers from the guest OS to the device | ||
589 | operates as follows: | ||
590 | |||
1. Place the buffer(s) into free descriptor(s).

   If there are no free descriptors, the guest may choose to
   notify the device even if notifications are suppressed (to
   reduce latency).[footnote:
   The Linux drivers do this only for read-only buffers: for
   write-only buffers, it is assumed that the driver is merely
   trying to keep the receive buffer ring full, and no notification
   of this expected condition is necessary.
   ]

2. Place the id of the buffer in the next ring entry of the
   available ring.

3. The steps (1) and (2) may be performed repeatedly if batching
   is possible.

4. A memory barrier should be executed to ensure the device sees
   the updated descriptor table and available ring before the next
   step.

5. The available “idx” field should be increased by the number of
   entries added to the available ring.

6. A memory barrier should be executed to ensure that we update
   the idx field before checking for notification suppression.

7. If notifications are not suppressed, the device should be
   notified of the new buffers.
620 | |||
621 | Note that the above code does not take precautions against the | ||
622 | available ring buffer wrapping around: this is not possible since | ||
623 | the ring buffer is the same size as the descriptor table, so step | ||
624 | (1) will prevent such a condition. | ||
625 | |||
626 | In addition, the maximum queue size is 32768 (it must be a power | ||
627 | of 2 which fits in 16 bits), so the 16-bit “idx” value can always | ||
628 | distinguish between a full and empty buffer. | ||
629 | |||
630 | Here is a description of each stage in more detail. | ||
631 | |||
632 | Placing Buffers Into The Descriptor Table | ||
633 | |||
634 | A buffer consists of zero or more read-only physically-contiguous | ||
635 | elements followed by zero or more physically-contiguous | ||
636 | write-only elements (it must have at least one element). This | ||
637 | algorithm maps it into the descriptor table: | ||
638 | |||
for each buffer element, b:

1. Get the next free descriptor table entry, d

2. Set d.addr to the physical address of the start of b

3. Set d.len to the length of b.

4. If b is write-only, set d.flags to VRING_DESC_F_WRITE,
   otherwise 0.

5. If there is a buffer element after this:

   - Set d.next to the index of the next free descriptor element.

   - Set the VRING_DESC_F_NEXT bit in d.flags.
655 | |||
656 | In practice, the d.next fields are usually used to chain free | ||
657 | descriptors, and a separate count kept to check there are enough | ||
658 | free descriptors before beginning the mappings. | ||
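A minimal sketch of this algorithm follows; free-list handling is
simplified to consecutive descriptors (a real driver chains free
entries via d.next as noted above), and struct buf_elem is a
hypothetical description of one buffer element:

struct buf_elem {
        uint64_t phys;  /* physical address of this element */
        uint32_t len;
        int write;      /* nonzero for a write-only element */
};

/* Sketch: map a buffer's n elements into the descriptor table
 * starting at free_head, returning the head index to publish in the
 * available ring. */
static uint16_t map_buffer(struct vring_desc *desc, uint16_t free_head,
                           const struct buf_elem *el, unsigned int n)
{
        uint16_t head = free_head;
        unsigned int i;

        for (i = 0; i < n; i++) {
                struct vring_desc *d = &desc[free_head];

                d->addr = el[i].phys;
                d->len = el[i].len;
                d->flags = el[i].write ? VRING_DESC_F_WRITE : 0;
                if (i + 1 < n) {        /* chain to the next element */
                        d->next = free_head + 1;        /* simplified */
                        d->flags |= VRING_DESC_F_NEXT;
                        free_head = d->next;
                }
        }
        return head;
}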
659 | |||
660 | Updating The Available Ring | ||
661 | |||
662 | The head of the buffer we mapped is the first d in the algorithm | ||
663 | above. A naive implementation would do the following: | ||
664 | |||
665 | avail->ring[avail->idx % qsz] = head; | ||
666 | |||
667 | However, in general we can add many descriptors before we update | ||
668 | the “idx” field (at which point they become visible to the | ||
669 | device), so we keep a counter of how many we've added: | ||
670 | |||
671 | avail->ring[(avail->idx + added++) % qsz] = head; | ||
672 | |||
673 | Updating The Index Field | ||
674 | |||
675 | Once the idx field of the virtqueue is updated, the device will | ||
676 | be able to access the descriptor entries we've created and the | ||
677 | memory they refer to. This is why a memory barrier is generally | ||
678 | used before the idx update, to ensure it sees the most up-to-date | ||
679 | copy. | ||
680 | |||
681 | The idx field always increments, and we let it wrap naturally at | ||
682 | 65536: | ||
683 | |||
684 | avail->idx += added; | ||
685 | |||
686 | <sub:Notifying-The-Device>Notifying The Device | ||
687 | |||
688 | Device notification occurs by writing the 16-bit virtqueue index | ||
689 | of this virtqueue to the Queue Notify field of the virtio header | ||
690 | in the first I/O region of the PCI device. This can be expensive, | ||
691 | however, so the device can suppress such notifications if it | ||
692 | doesn't need them. We have to be careful to expose the new idx | ||
693 | value before checking the suppression flag: it's OK to notify | ||
694 | gratuitously, but not to omit a required notification. So again, | ||
695 | we use a memory barrier here before reading the flags or the | ||
696 | avail_event field. | ||
697 | |||
If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated, and if
the VRING_USED_F_NO_NOTIFY flag is not set, we go ahead and write
the virtqueue index to the Queue Notify field.

If the VIRTIO_F_RING_EVENT_IDX feature is negotiated, we read the
avail_event field in the used ring structure. If the available
index crossed the avail_event field value since the last
notification, we go ahead and write the virtqueue index to the
Queue Notify field. The avail_event field wraps naturally at
65536 as well:
707 | |||
708 | (u16)(new_idx - avail_event - 1) < (u16)(new_idx - old_idx) | ||
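In code, the decision might look like the following sketch, where
used_flags and avail_event have been read from the used ring after
the barrier described above, and vring_need_event() is the helper
reproduced in Appendix A:

/* Sketch: after bumping avail->idx from old_idx to new_idx (and after
 * the memory barrier), decide whether to write the Queue Notify
 * field. */
static int need_notify(int event_idx_negotiated, uint16_t used_flags,
                       uint16_t avail_event,
                       uint16_t old_idx, uint16_t new_idx)
{
        if (event_idx_negotiated)
                return vring_need_event(avail_event, new_idx, old_idx);
        return !(used_flags & VRING_USED_F_NO_NOTIFY);
}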
709 | |||
710 | <sub:Receiving-Used-Buffers>Receiving Used Buffers From The | ||
711 | Device | ||
712 | |||
713 | Once the device has used a buffer (read from or written to it, or | ||
714 | parts of both, depending on the nature of the virtqueue and the | ||
715 | device), it sends an interrupt, following an algorithm very | ||
716 | similar to the algorithm used for the driver to send the device a | ||
717 | buffer: | ||
718 | |||
719 | Write the head descriptor number to the next field in the used | ||
720 | ring. | ||
721 | |||
722 | Update the used ring idx. | ||
723 | |||
724 | Determine whether an interrupt is necessary: | ||
725 | |||
If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated: check
that the VRING_AVAIL_F_NO_INTERRUPT flag is not set in
avail->flags.

If the VIRTIO_F_RING_EVENT_IDX feature is negotiated: check
whether the used index crossed the used_event field value
since the last update. The used_event field wraps naturally
at 65536 as well:

(u16)(new_idx - used_event - 1) < (u16)(new_idx - old_idx)
734 | |||
735 | If an interrupt is necessary: | ||
736 | |||
737 | If MSI-X capability is disabled: | ||
738 | |||
739 | Set the lower bit of the ISR Status field for the device. | ||
740 | |||
741 | Send the appropriate PCI interrupt for the device. | ||
742 | |||
743 | If MSI-X capability is enabled: | ||
744 | |||
745 | Request the appropriate MSI-X interrupt message for the | ||
746 | device, Queue Vector field sets the MSI-X Table entry | ||
747 | number. | ||
748 | |||
749 | If Queue Vector field value is NO_VECTOR, no interrupt | ||
750 | message is requested for this event. | ||
751 | |||
752 | The guest interrupt handler should: | ||
753 | |||
754 | If MSI-X capability is disabled: read the ISR Status field, | ||
755 | which will reset it to zero. If the lower bit is zero, the | ||
756 | interrupt was not for this device. Otherwise, the guest driver | ||
757 | should look through the used rings of each virtqueue for the | ||
758 | device, to see if any progress has been made by the device | ||
759 | which requires servicing. | ||
760 | |||
761 | If MSI-X capability is enabled: look through the used rings of | ||
762 | each virtqueue mapped to the specific MSI-X vector for the | ||
763 | device, to see if any progress has been made by the device | ||
764 | which requires servicing. | ||
765 | |||
For each ring, the guest should then disable interrupts by setting
the VRING_AVAIL_F_NO_INTERRUPT flag in the avail structure, if
required. It can then process used ring entries, finally re-enabling
interrupts by clearing the VRING_AVAIL_F_NO_INTERRUPT flag or
updating the used_event field in the available structure. The guest
should then execute a memory barrier, and then recheck the ring
empty condition. This is necessary to handle the case where, after
the last check and before enabling interrupts, an interrupt has been
suppressed by the device:
775 | |||
vring_disable_interrupts(vq);

for (;;) {
        struct vring_used_elem *e;

        if (vq->last_seen_used == vring.used->idx) {
                /* Ring looks empty: re-enable interrupts, then
                   recheck in case a buffer arrived meanwhile. */
                vring_enable_interrupts(vq);
                mb();
                if (vq->last_seen_used == vring.used->idx)
                        break;
                vring_disable_interrupts(vq);
        }

        e = &vring.used->ring[vq->last_seen_used % vsz];
        process_buffer(e);
        vq->last_seen_used++;
}
800 | |||
801 | Dealing With Configuration Changes<sub:Dealing-With-Configuration> | ||
802 | |||
803 | Some virtio PCI devices can change the device configuration | ||
804 | state, as reflected in the virtio header in the PCI configuration | ||
805 | space. In this case: | ||
806 | |||
If MSI-X capability is disabled: an interrupt is delivered and
the second highest bit is set in the ISR Status field to
indicate that the driver should re-examine the configuration
space. Note that a single interrupt can indicate both that one
or more virtqueue has been used and that the configuration
space has changed: even if the config bit is set, virtqueues
must be scanned.
814 | |||
815 | If MSI-X capability is enabled: an interrupt message is | ||
816 | requested. The Configuration Vector field sets the MSI-X Table | ||
817 | entry number to use. If Configuration Vector field value is | ||
818 | NO_VECTOR, no interrupt message is requested for this event. | ||
819 | |||
820 | Creating New Device Types | ||
821 | |||
822 | Various considerations are necessary when creating a new device | ||
823 | type: | ||
824 | |||
825 | How Many Virtqueues? | ||
826 | |||
827 | It is possible that a very simple device will operate entirely | ||
828 | through its configuration space, but most will need at least one | ||
virtqueue in which it will place requests. A device with both
input and output (eg. console and network devices described here)
needs two queues: one which the driver fills with buffers to
receive input, and one in which the driver places buffers to
transmit output.
834 | |||
835 | What Configuration Space Layout? | ||
836 | |||
837 | Configuration space is generally used for rarely-changing or | ||
838 | initialization-time parameters. But it is a limited resource, so | ||
839 | it might be better to use a virtqueue to update configuration | ||
840 | information (the network device does this for filtering, | ||
841 | otherwise the table in the config space could potentially be very | ||
842 | large). | ||
843 | |||
844 | Note that this space is generally the guest's native endian, | ||
845 | rather than PCI's little-endian. | ||
846 | |||
847 | What Device Number? | ||
848 | |||
849 | Currently device numbers are assigned quite freely: a simple | ||
850 | request mail to the author of this document or the Linux | ||
851 | virtualization mailing list[footnote: | ||
852 | |||
853 | https://lists.linux-foundation.org/mailman/listinfo/virtualization | ||
854 | ] will be sufficient to secure a unique one. | ||
855 | |||
856 | Meanwhile for experimental drivers, use 65535 and work backwards. | ||
857 | |||
858 | How many MSI-X vectors? | ||
859 | |||
Using the optional MSI-X capability, devices can speed up
interrupt processing by removing the need for the guest driver to
read the ISR Status register (which might be an expensive
operation), by reducing interrupt sharing between devices and
queues within the device, and by handling interrupts from multiple
CPUs. However, some systems impose a limit (which might be as low
as 256) on the total number of MSI-X vectors that can be allocated
to all devices. Devices and/or device drivers should take this
into account, limiting the number of vectors used unless the
device is expected to cause a high volume of interrupts. Devices
can control the number of vectors used by limiting the MSI-X Table
Size or by not presenting MSI-X capability in PCI configuration
space. Drivers can control this by mapping events to as small a
number of vectors as possible, or by disabling the MSI-X
capability altogether.
875 | |||
876 | Message Framing | ||
877 | |||
The descriptors used for a buffer should not affect the semantics
of the message, except for the total length of the buffer. For
880 | example, a network buffer consists of a 10 byte header followed | ||
881 | by the network packet. Whether this is presented in the ring | ||
882 | descriptor chain as (say) a 10 byte buffer and a 1514 byte | ||
883 | buffer, or a single 1524 byte buffer, or even three buffers, | ||
884 | should have no effect. | ||
885 | |||
886 | In particular, no implementation should use the descriptor | ||
887 | boundaries to determine the size of any header in a request.[footnote: | ||
888 | The current qemu device implementations mistakenly insist that | ||
889 | the first descriptor cover the header in these cases exactly, so | ||
890 | a cautious driver should arrange it so. | ||
891 | ] | ||
892 | |||
893 | Device Improvements | ||
894 | |||
895 | Any change to configuration space, or new virtqueues, or | ||
896 | behavioural changes, should be indicated by negotiation of a new | ||
897 | feature bit. This establishes clarity[footnote: | ||
898 | Even if it does mean documenting design or implementation | ||
899 | mistakes! | ||
900 | ] and avoids future expansion problems. | ||
901 | |||
902 | Clusters of functionality which are always implemented together | ||
903 | can use a single bit, but if one feature makes sense without the | ||
904 | others they should not be gratuitously grouped together to | ||
905 | conserve feature bits. We can always extend the spec when the | ||
906 | first person needs more than 24 feature bits for their device. | ||
907 | |||
909 | |||
910 | Appendix A: virtio_ring.h | ||
911 | |||
#ifndef VIRTIO_RING_H
#define VIRTIO_RING_H
/* An interface for efficient virtio implementation.
 *
 * This header is BSD licensed so anyone can use the definitions
 * to implement compatible drivers/servers.
 *
 * Copyright 2007, 2009, IBM Corporation
 * Copyright 2011, Red Hat, Inc
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of IBM nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/* This marks a buffer as continuing via the next field. */
#define VRING_DESC_F_NEXT       1
/* This marks a buffer as write-only (otherwise read-only). */
#define VRING_DESC_F_WRITE      2

/* The Host uses this in used->flags to advise the Guest: don't kick me
 * when you add a buffer.  It's unreliable, so it's simply an
 * optimization.  Guest will still kick if it's out of buffers. */
#define VRING_USED_F_NO_NOTIFY  1
/* The Guest uses this in avail->flags to advise the Host: don't
 * interrupt me when you consume a buffer.  It's unreliable, so it's
 * simply an optimization. */
#define VRING_AVAIL_F_NO_INTERRUPT      1

/* Virtio ring descriptors: 16 bytes.
 * These can chain together via "next". */
struct vring_desc {
        /* Address (guest-physical). */
        uint64_t addr;
        /* Length. */
        uint32_t len;
        /* The flags as indicated above. */
        uint16_t flags;
        /* We chain unused descriptors via this, too */
        uint16_t next;
};

struct vring_avail {
        uint16_t flags;
        uint16_t idx;
        uint16_t ring[];
        /* Illustrative only: C does not allow a member after a
         * flexible array member; used_event actually lives just past
         * the end of the ring. */
        uint16_t used_event;
};

/* u32 is used here for ids for padding reasons. */
struct vring_used_elem {
        /* Index of start of used descriptor chain. */
        uint32_t id;
        /* Total length of the descriptor chain which was written to. */
        uint32_t len;
};

struct vring_used {
        uint16_t flags;
        uint16_t idx;
        struct vring_used_elem ring[];
        /* Illustrative only, as above: avail_event lives just past
         * the end of the ring. */
        uint16_t avail_event;
};

struct vring {
        unsigned int num;

        struct vring_desc *desc;
        struct vring_avail *avail;
        struct vring_used *used;
};

/* The standard layout for the ring is a continuous chunk of memory which
 * looks like this.  We assume num is a power of 2.
 *
 * struct vring {
 *      // The actual descriptors (16 bytes each)
 *      struct vring_desc desc[num];
 *
 *      // A ring of available descriptor heads with free-running index.
 *      __u16 avail_flags;
 *      __u16 avail_idx;
 *      __u16 available[num];
 *
 *      // Padding to the next align boundary.
 *      char pad[];
 *
 *      // A ring of used descriptor heads with free-running index.
 *      __u16 used_flags;
 *      __u16 used_idx;
 *      struct vring_used_elem used[num];
 * };
 * Note: for virtio PCI, align is 4096.
 */
static inline void vring_init(struct vring *vr, unsigned int num, void *p,
                              unsigned long align)
{
        vr->num = num;
        vr->desc = p;
        vr->avail = p + num*sizeof(struct vring_desc);
        vr->used = (void *)(((unsigned long)&vr->avail->ring[num]
                             + align-1)
                            & ~(align - 1));
}

static inline unsigned vring_size(unsigned int num, unsigned long align)
{
        return ((sizeof(struct vring_desc)*num + sizeof(uint16_t)*(2+num)
                 + align - 1) & ~(align - 1))
                + sizeof(uint16_t)*3 + sizeof(struct vring_used_elem)*num;
}

static inline int vring_need_event(uint16_t event_idx, uint16_t new_idx,
                                   uint16_t old_idx)
{
        return (uint16_t)(new_idx - event_idx - 1) <
                (uint16_t)(new_idx - old_idx);
}
#endif /* VIRTIO_RING_H */
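A short usage sketch of these helpers (alloc_contiguous() is an
assumed allocator returning zeroed, physically contiguous, suitably
aligned memory):

struct vring vr;
unsigned int num = 256;         /* Queue Size read from the device */
void *p = alloc_contiguous(vring_size(num, 4096));

vring_init(&vr, num, p, 4096);
/* vr.desc, vr.avail and vr.used now point into p, laid out exactly
 * as in the comment above; virt_to_phys(p)/4096 is what goes into
 * the Queue Address field. */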
1216 | |||
1217 | <cha:Reserved-Feature-Bits>Appendix B: Reserved Feature Bits | ||
1218 | |||
Currently there are three device-independent feature bits defined:
1220 | |||
1221 | VIRTIO_F_NOTIFY_ON_EMPTY (24) Negotiating this feature | ||
1222 | indicates that the driver wants an interrupt if the device runs | ||
1223 | out of available descriptors on a virtqueue, even though | ||
1224 | interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT | ||
1225 | flag or the used_event field. An example of this is the | ||
1226 | networking driver: it doesn't need to know every time a packet | ||
1227 | is transmitted, but it does need to free the transmitted | ||
1228 | packets a finite time after they are transmitted. It can avoid | ||
1229 | using a timer if the device interrupts it when all the packets | ||
1230 | are transmitted. | ||
1231 | |||
1232 | VIRTIO_F_RING_INDIRECT_DESC (28) Negotiating this feature | ||
1233 | indicates that the driver can use descriptors with the | ||
1234 | VRING_DESC_F_INDIRECT flag set, as described in [sub:Indirect-Descriptors] | ||
1235 | . | ||
1236 | |||
VIRTIO_F_RING_EVENT_IDX (29) This feature enables the used_event
and the avail_event fields. If set, it indicates that the
device should ignore the flags field in the available ring
structure. Instead, the used_event field in this structure is
used by the guest to suppress device interrupts. Further, the
driver should ignore the flags field in the used ring
structure. Instead, the avail_event field in this structure is
used by the device to suppress notifications. If unset, the
driver should ignore the used_event field; the device should
ignore the avail_event field; the flags field is used.
1248 | Appendix C: Network Device | ||
1249 | |||
The virtio network device is a virtual ethernet card, and is the
most complex of the devices supported so far by virtio. It has
been enhanced rapidly and demonstrates clearly how support for new
features should be added to an existing device. Empty buffers are
1254 | placed in one virtqueue for receiving packets, and outgoing | ||
1255 | packets are enqueued into another for transmission in that order. | ||
1256 | A third command queue is used to control advanced filtering | ||
1257 | features. | ||
1258 | |||
1259 | Configuration | ||
1260 | |||
1261 | Subsystem Device ID 1 | ||
1262 | |||
1263 | Virtqueues 0:receiveq. 1:transmitq. 2:controlq[footnote: | ||
1264 | Only if VIRTIO_NET_F_CTRL_VQ set | ||
1265 | ] | ||
1266 | |||
1267 | Feature bits | ||
1268 | |||
1269 | VIRTIO_NET_F_CSUM (0) Device handles packets with partial | ||
1270 | checksum | ||
1271 | |||
1272 | VIRTIO_NET_F_GUEST_CSUM (1) Guest handles packets with partial | ||
1273 | checksum | ||
1274 | |||
1275 | VIRTIO_NET_F_MAC (5) Device has given MAC address. | ||
1276 | |||
1277 | VIRTIO_NET_F_GSO (6) (Deprecated) device handles packets with | ||
1278 | any GSO type.[footnote: | ||
1279 | It was supposed to indicate segmentation offload support, but | ||
1280 | upon further investigation it became clear that multiple bits | ||
1281 | were required. | ||
1282 | ] | ||
1283 | |||
1284 | VIRTIO_NET_F_GUEST_TSO4 (7) Guest can receive TSOv4. | ||
1285 | |||
1286 | VIRTIO_NET_F_GUEST_TSO6 (8) Guest can receive TSOv6. | ||
1287 | |||
1288 | VIRTIO_NET_F_GUEST_ECN (9) Guest can receive TSO with ECN. | ||
1289 | |||
1290 | VIRTIO_NET_F_GUEST_UFO (10) Guest can receive UFO. | ||
1291 | |||
1292 | VIRTIO_NET_F_HOST_TSO4 (11) Device can receive TSOv4. | ||
1293 | |||
1294 | VIRTIO_NET_F_HOST_TSO6 (12) Device can receive TSOv6. | ||
1295 | |||
1296 | VIRTIO_NET_F_HOST_ECN (13) Device can receive TSO with ECN. | ||
1297 | |||
1298 | VIRTIO_NET_F_HOST_UFO (14) Device can receive UFO. | ||
1299 | |||
1300 | VIRTIO_NET_F_MRG_RXBUF (15) Guest can merge receive buffers. | ||
1301 | |||
1302 | VIRTIO_NET_F_STATUS (16) Configuration status field is | ||
1303 | available. | ||
1304 | |||
1305 | VIRTIO_NET_F_CTRL_VQ (17) Control channel is available. | ||
1306 | |||
1307 | VIRTIO_NET_F_CTRL_RX (18) Control channel RX mode support. | ||
1308 | |||
1309 | VIRTIO_NET_F_CTRL_VLAN (19) Control channel VLAN filtering. | ||
1310 | |||
VIRTIO_NET_F_GUEST_ANNOUNCE (21) Guest can send gratuitous
packets.
1313 | |||
Device configuration layout Two configuration fields are
currently defined. The mac address field always exists (though
is only valid if VIRTIO_NET_F_MAC is set), and the status field
only exists if VIRTIO_NET_F_STATUS is set. Two read-only bits
are currently defined for the status field:
VIRTIO_NET_S_LINK_UP and VIRTIO_NET_S_ANNOUNCE.

#define VIRTIO_NET_S_LINK_UP    1
#define VIRTIO_NET_S_ANNOUNCE   2

struct virtio_net_config {
        u8 mac[6];
        u16 status;
};
1332 | |||
1333 | Device Initialization | ||
1334 | |||
1335 | The initialization routine should identify the receive and | ||
1336 | transmission virtqueues. | ||
1337 | |||
If the VIRTIO_NET_F_MAC feature bit is set, the configuration
space “mac” entry indicates the “physical” address of the
network card, otherwise a private MAC address should be
assigned. All guests are expected to negotiate this feature if
it is set.
1343 | |||
1344 | If the VIRTIO_NET_F_CTRL_VQ feature bit is negotiated, identify | ||
1345 | the control virtqueue. | ||
1346 | |||
1347 | If the VIRTIO_NET_F_STATUS feature bit is negotiated, the link | ||
1348 | status can be read from the bottom bit of the “status” config | ||
1349 | field. Otherwise, the link should be assumed active. | ||
1350 | |||
1351 | The receive virtqueue should be filled with receive buffers. | ||
1352 | This is described in detail below in “Setting Up Receive | ||
1353 | Buffers”. | ||
1354 | |||
A driver can indicate that it will generate checksumless
packets by negotiating the VIRTIO_NET_F_CSUM feature. This “
checksum offload” is a common feature on modern network cards.
1358 | |||
If that feature is negotiated[footnote:
ie. VIRTIO_NET_F_HOST_TSO* and VIRTIO_NET_F_HOST_UFO are
dependent on VIRTIO_NET_F_CSUM; a device which offers the offload
features must offer the checksum feature, and a driver which
accepts the offload features must accept the checksum feature.
Similar logic applies to the VIRTIO_NET_F_GUEST_TSO4 features
depending on VIRTIO_NET_F_GUEST_CSUM.
1366 | ], a driver can use TCP or UDP segmentation offload by | ||
1367 | negotiating the VIRTIO_NET_F_HOST_TSO4 (IPv4 TCP), | ||
1368 | VIRTIO_NET_F_HOST_TSO6 (IPv6 TCP) and VIRTIO_NET_F_HOST_UFO | ||
1369 | (UDP fragmentation) features. It should not send TCP packets | ||
1370 | requiring segmentation offload which have the Explicit | ||
1371 | Congestion Notification bit set, unless the | ||
1372 | VIRTIO_NET_F_HOST_ECN feature is negotiated.[footnote: | ||
1373 | This is a common restriction in real, older network cards. | ||
1374 | ] | ||
1375 | |||
1376 | The converse features are also available: a driver can save the | ||
1377 | virtual device some work by negotiating these features.[footnote: | ||
1378 | For example, a network packet transported between two guests on | ||
1379 | the same system may not require checksumming at all, nor | ||
1380 | segmentation, if both guests are amenable. | ||
] The VIRTIO_NET_F_GUEST_CSUM feature indicates that partially
checksummed packets can be received; if it is negotiated, the
VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
VIRTIO_NET_F_GUEST_UFO and VIRTIO_NET_F_GUEST_ECN features are
the input equivalents of the features described above. See
“Receiving Packets” below.
1387 | |||
1388 | Device Operation | ||
1389 | |||
1390 | Packets are transmitted by placing them in the transmitq, and | ||
1391 | buffers for incoming packets are placed in the receiveq. In each | ||
1392 | case, the packet itself is preceded by a header: | ||
1393 | |||
1394 | struct virtio_net_hdr { | ||
1395 | |||
1396 | #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 | ||
1397 | |||
1398 | u8 flags; | ||
1399 | |||
1400 | #define VIRTIO_NET_HDR_GSO_NONE 0 | ||
1401 | |||
1402 | #define VIRTIO_NET_HDR_GSO_TCPV4 1 | ||
1403 | |||
1404 | #define VIRTIO_NET_HDR_GSO_UDP 3 | ||
1405 | |||
1406 | #define VIRTIO_NET_HDR_GSO_TCPV6 4 | ||
1407 | |||
1408 | #define VIRTIO_NET_HDR_GSO_ECN 0x80 | ||
1409 | |||
1410 | u8 gso_type; | ||
1411 | |||
1412 | u16 hdr_len; | ||
1413 | |||
1414 | u16 gso_size; | ||
1415 | |||
1416 | u16 csum_start; | ||
1417 | |||
1418 | u16 csum_offset; | ||
1419 | |||
1420 | /* Only if VIRTIO_NET_F_MRG_RXBUF: */ | ||
1421 | |||
u16 num_buffers;
1423 | |||
1424 | }; | ||
1425 | |||
1426 | The controlq is used to control device features such as | ||
1427 | filtering. | ||
1428 | |||
1429 | Packet Transmission | ||
1430 | |||
1431 | Transmitting a single packet is simple, but varies depending on | ||
1432 | the different features the driver negotiated. | ||
1433 | |||
1434 | If the driver negotiated VIRTIO_NET_F_CSUM, and the packet has | ||
1435 | not been fully checksummed, then the virtio_net_hdr's fields | ||
1436 | are set as follows. Otherwise, the packet must be fully | ||
1437 | checksummed, and flags is zero. | ||
1438 | |||
flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM bit set,
1440 | |||
1441 | <ite:csum_start-is-set>csum_start is set to the offset within | ||
1442 | the packet to begin checksumming, and | ||
1443 | |||
1444 | csum_offset indicates how many bytes after the csum_start the | ||
1445 | new (16 bit ones' complement) checksum should be placed.[footnote: | ||
1446 | For example, consider a partially checksummed TCP (IPv4) packet. | ||
1447 | It will have a 14 byte ethernet header and 20 byte IP header | ||
1448 | followed by the TCP header (with the TCP checksum field 16 bytes | ||
1449 | into that header). csum_start will be 14+20 = 34 (the TCP | ||
1450 | checksum includes the header), and csum_offset will be 16. The | ||
1451 | value in the TCP checksum field should be initialized to the sum | ||
1452 | of the TCP pseudo header, so that replacing it by the ones' | ||
1453 | complement checksum of the TCP header and body will give the | ||
1454 | correct result. | ||
1455 | ] | ||
1456 | |||
1457 | <enu:If-the-driver>If the driver negotiated | ||
1458 | VIRTIO_NET_F_HOST_TSO4, TSO6 or UFO, and the packet requires | ||
1459 | TCP segmentation or UDP fragmentation, then the “gso_type” | ||
1460 | field is set to VIRTIO_NET_HDR_GSO_TCPV4, TCPV6 or UDP. | ||
1461 | (Otherwise, it is set to VIRTIO_NET_HDR_GSO_NONE). In this | ||
1462 | case, packets larger than 1514 bytes can be transmitted: the | ||
1463 | metadata indicates how to replicate the packet header to cut it | ||
1464 | into smaller packets. The other gso fields are set: | ||
1465 | |||
1466 | hdr_len is a hint to the device as to how much of the header | ||
1467 | needs to be kept to copy into each packet, usually set to the | ||
1468 | length of the headers, including the transport header.[footnote: | ||
1469 | Due to various bugs in implementations, this field is not useful | ||
1470 | as a guarantee of the transport header size. | ||
1471 | ] | ||
1472 | |||
1473 | gso_size is the maximum size of each packet beyond that header | ||
1474 | (ie. MSS). | ||
1475 | |||
1476 | If the driver negotiated the VIRTIO_NET_F_HOST_ECN feature, the | ||
1477 | VIRTIO_NET_HDR_GSO_ECN bit may be set in “gso_type” as well, | ||
1478 | indicating that the TCP packet has the ECN bit set.[footnote: | ||
1479 | This case is not handled by some older hardware, so is called out | ||
1480 | specifically in the protocol. | ||
1481 | ] | ||
1482 | |||
1483 | If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, | ||
1484 | the num_buffers field is set to zero. | ||
1485 | |||
1486 | The header and packet are added as one output buffer to the | ||
1487 | transmitq, and the device is notified of the new entry (see [sub:Notifying-The-Device] | ||
1488 | ).[footnote: | ||
1489 | Note that the header will be two bytes longer for the | ||
1490 | VIRTIO_NET_F_MRG_RXBUF case. | ||
1491 | ] | ||
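
To make the above concrete, here is a minimal sketch of filling
the header for the footnote's partially checksummed TCP (IPv4)
packet, first without segmentation and then (with
VIRTIO_NET_F_HOST_TSO4 negotiated) as a TSO packet; the header
length and the MSS of 1448 are illustrative values, not
mandated by this specification:

#include <stdint.h>
#include <string.h>

#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1
#define VIRTIO_NET_HDR_GSO_NONE     0
#define VIRTIO_NET_HDR_GSO_TCPV4    1

struct virtio_net_hdr {
    uint8_t  flags;
    uint8_t  gso_type;
    uint16_t hdr_len;
    uint16_t gso_size;
    uint16_t csum_start;
    uint16_t csum_offset;
    /* u16 num_buffers would follow if VIRTIO_NET_F_MRG_RXBUF */
};

/* Partially checksummed TCP/IPv4 packet, no segmentation needed. */
static void fill_csum_hdr(struct virtio_net_hdr *hdr)
{
    memset(hdr, 0, sizeof(*hdr));
    hdr->flags       = VIRTIO_NET_HDR_F_NEEDS_CSUM;
    hdr->gso_type    = VIRTIO_NET_HDR_GSO_NONE;
    hdr->csum_start  = 14 + 20; /* ethernet + IP headers */
    hdr->csum_offset = 16;      /* TCP checksum field offset */
}

/* Same packet, but requiring TCP segmentation (TSOv4). */
static void fill_tso_hdr(struct virtio_net_hdr *hdr)
{
    fill_csum_hdr(hdr);           /* TSO packets carry partial checksums */
    hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
    hdr->hdr_len  = 14 + 20 + 20; /* through the TCP header (a hint) */
    hdr->gso_size = 1448;         /* MSS: payload per resulting packet */
}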
1492 | |||
1493 | Packet Transmission Interrupt | ||
1494 | |||
1495 | Often a driver will suppress transmission interrupts using the | ||
1496 | VRING_AVAIL_F_NO_INTERRUPT flag (see [sub:Receiving-Used-Buffers] | ||
1497 | ) and check for used packets in the transmit path of following | ||
1498 | packets. However, it will still receive interrupts if the | ||
1499 | VIRTIO_F_NOTIFY_ON_EMPTY feature is negotiated, indicating that | ||
1500 | the transmission queue is completely emptied. | ||
1501 | |||
The normal behavior in this interrupt handler is to retrieve
any new descriptors from the used ring and free the
corresponding headers and packets.
1505 | |||
1506 | Setting Up Receive Buffers | ||
1507 | |||
1508 | It is generally a good idea to keep the receive virtqueue as | ||
1509 | fully populated as possible: if it runs out, network performance | ||
1510 | will suffer. | ||
1511 | |||
1512 | If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or | ||
1513 | VIRTIO_NET_F_GUEST_UFO features are used, the Guest will need to | ||
accept packets up to 65550 bytes long (the maximum size of a
1515 | TCP or UDP packet, plus the 14 byte ethernet header), otherwise | ||
1516 | 1514 bytes. So unless VIRTIO_NET_F_MRG_RXBUF is negotiated, every | ||
1517 | buffer in the receive queue needs to be at least this length [footnote: | ||
1518 | Obviously each one can be split across multiple descriptor | ||
1519 | elements. | ||
1520 | ]. | ||
1521 | |||
1522 | If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at | ||
1523 | least the size of the struct virtio_net_hdr. | ||
1524 | |||
1525 | Packet Receive Interrupt | ||
1526 | |||
1527 | When a packet is copied into a buffer in the receiveq, the | ||
1528 | optimal path is to disable further interrupts for the receiveq | ||
1529 | (see [sub:Receiving-Used-Buffers]) and process packets until no | ||
1530 | more are found, then re-enable them. | ||
1531 | |||
Processing an incoming packet involves the following (a sketch
of the merged receive-buffer case follows this list):
1533 | |||
1534 | If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, | ||
1535 | then the “num_buffers” field indicates how many descriptors | ||
1536 | this packet is spread over (including this one). This allows | ||
1537 | receipt of large packets without having to allocate large | ||
1538 | buffers. In this case, there will be at least “num_buffers” in | ||
1539 | the used ring, and they should be chained together to form a | ||
1540 | single packet. The other buffers will not begin with a struct | ||
1541 | virtio_net_hdr. | ||
1542 | |||
1543 | If the VIRTIO_NET_F_MRG_RXBUF feature was not negotiated, or | ||
1544 | the “num_buffers” field is one, then the entire packet will be | ||
1545 | contained within this buffer, immediately following the struct | ||
1546 | virtio_net_hdr. | ||
1547 | |||
1548 | If the VIRTIO_NET_F_GUEST_CSUM feature was negotiated, the | ||
1549 | VIRTIO_NET_HDR_F_NEEDS_CSUM bit in the “flags” field may be | ||
1550 | set: if so, the checksum on the packet is incomplete and the “ | ||
1551 | csum_start” and “csum_offset” fields indicate how to calculate | ||
1552 | it (see [ite:csum_start-is-set]). | ||
1553 | |||
1554 | If the VIRTIO_NET_F_GUEST_TSO4, TSO6 or UFO options were | ||
1555 | negotiated, then the “gso_type” may be something other than | ||
1556 | VIRTIO_NET_HDR_GSO_NONE, and the “gso_size” field indicates the | ||
1557 | desired MSS (see [enu:If-the-driver]). | ||
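
Here is the promised sketch of the merged receive-buffer case;
next_used_buffer(), buf lengths and chain_into_packet() are
hypothetical helpers around the used ring and the driver's
packet assembly:

#include <stdint.h>

struct virtio_net_hdr_mrg {
    uint8_t  flags;
    uint8_t  gso_type;
    uint16_t hdr_len;
    uint16_t gso_size;
    uint16_t csum_start;
    uint16_t csum_offset;
    uint16_t num_buffers; /* only present with VIRTIO_NET_F_MRG_RXBUF */
};

extern void *next_used_buffer(uint32_t *len);            /* hypothetical */
extern void chain_into_packet(void *data, uint32_t len); /* hypothetical */

/* Assemble one packet spread over hdr->num_buffers used buffers.
 * Only the first buffer begins with the header; the others carry
 * raw packet data. */
static void gather_packet(void *first, uint32_t first_len)
{
    struct virtio_net_hdr_mrg *hdr = first;

    chain_into_packet((uint8_t *)first + sizeof(*hdr),
                      first_len - sizeof(*hdr));
    for (uint16_t i = 1; i < hdr->num_buffers; i++) {
        uint32_t len;
        void *buf = next_used_buffer(&len);
        chain_into_packet(buf, len);
    }
}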
1558 | |||
1559 | Control Virtqueue | ||
1560 | |||
The driver uses the control virtqueue (if VIRTIO_NET_F_CTRL_VQ is
1562 | negotiated) to send commands to manipulate various features of | ||
1563 | the device which would not easily map into the configuration | ||
1564 | space. | ||
1565 | |||
1566 | All commands are of the following form: | ||
1567 | |||
1568 | struct virtio_net_ctrl { | ||
1569 | |||
1570 | u8 class; | ||
1571 | |||
1572 | u8 command; | ||
1573 | |||
1574 | u8 command-specific-data[]; | ||
1575 | |||
1576 | u8 ack; | ||
1577 | |||
1578 | }; | ||
1579 | |||
1580 | |||
1581 | |||
1582 | /* ack values */ | ||
1583 | |||
1584 | #define VIRTIO_NET_OK 0 | ||
1585 | |||
1586 | #define VIRTIO_NET_ERR 1 | ||
1587 | |||
1588 | The class, command and command-specific-data are set by the | ||
driver, and the device sets the ack byte. There is little the
driver can do except issue a diagnostic if the ack byte is not
1591 | VIRTIO_NET_OK. | ||
1592 | |||
1593 | Packet Receive Filtering | ||
1594 | |||
1595 | If the VIRTIO_NET_F_CTRL_RX feature is negotiated, the driver can | ||
1596 | send control commands for promiscuous mode, multicast receiving, | ||
1597 | and filtering of MAC addresses. | ||
1598 | |||
1599 | Note that in general, these commands are best-effort: unwanted | ||
1600 | packets may still arrive. | ||
1601 | |||
1602 | Setting Promiscuous Mode | ||
1603 | |||
1604 | #define VIRTIO_NET_CTRL_RX 0 | ||
1605 | |||
1606 | #define VIRTIO_NET_CTRL_RX_PROMISC 0 | ||
1607 | |||
1608 | #define VIRTIO_NET_CTRL_RX_ALLMULTI 1 | ||
1609 | |||
1610 | The class VIRTIO_NET_CTRL_RX has two commands: | ||
1611 | VIRTIO_NET_CTRL_RX_PROMISC turns promiscuous mode on and off, and | ||
1612 | VIRTIO_NET_CTRL_RX_ALLMULTI turns all-multicast receive on and | ||
1613 | off. The command-specific-data is one byte containing 0 (off) or | ||
1614 | 1 (on). | ||
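
As an illustration, a driver might issue this command as
follows; ctrlq_send() is a hypothetical helper that places the
class/command/data parts on the control virtqueue as read-only
buffers and the ack byte as a write-only buffer, then waits for
the device to use them:

#include <stdint.h>

#define VIRTIO_NET_CTRL_RX         0
#define VIRTIO_NET_CTRL_RX_PROMISC 0
#define VIRTIO_NET_OK              0

/* Hypothetical helper around the control virtqueue. */
extern void ctrlq_send(uint8_t class, uint8_t command,
                       const void *data, uint32_t len, uint8_t *ack);

static int set_promiscuous(int on)
{
    uint8_t onoff = on ? 1 : 0; /* command-specific-data: one byte */
    uint8_t ack = 0xff;

    ctrlq_send(VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_PROMISC,
               &onoff, sizeof(onoff), &ack);
    return ack == VIRTIO_NET_OK ? 0 : -1;
}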
1615 | |||
1616 | Setting MAC Address Filtering | ||
1617 | |||
1618 | struct virtio_net_ctrl_mac { | ||
1619 | |||
1620 | u32 entries; | ||
1621 | |||
1622 | u8 macs[entries][ETH_ALEN]; | ||
1623 | |||
1624 | }; | ||
1625 | |||
1626 | |||
1627 | |||
1628 | #define VIRTIO_NET_CTRL_MAC 1 | ||
1629 | |||
1630 | #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0 | ||
1631 | |||
1632 | The device can filter incoming packets by any number of | ||
1633 | destination MAC addresses.[footnote: | ||
1634 | Since there are no guarantees, it can use a hash filter | ||
or silently switch to allmulti or promiscuous mode if it is given
1636 | too many addresses. | ||
1637 | ] This table is set using the class VIRTIO_NET_CTRL_MAC and the | ||
1638 | command VIRTIO_NET_CTRL_MAC_TABLE_SET. The command-specific-data | ||
1639 | is two variable length tables of 6-byte MAC addresses. The first | ||
1640 | table contains unicast addresses, and the second contains | ||
1641 | multicast addresses. | ||
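
A sketch of building the command-specific-data: two
virtio_net_ctrl_mac tables laid out back to back (unicast
first, then multicast); fill_mac_tables() and its flat buffer
are illustrative only:

#include <stdint.h>
#include <string.h>

#define ETH_ALEN 6

/* One variable length table; two are sent back to back, with no
 * padding between them. */
struct virtio_net_ctrl_mac {
    uint32_t entries;
    uint8_t  macs[][ETH_ALEN];
} __attribute__((packed));

/* Lay out the unicast table immediately followed by the multicast
 * table in buf; returns the number of bytes used. */
static size_t fill_mac_tables(uint8_t *buf,
                              const uint8_t (*uc)[ETH_ALEN], uint32_t n_uc,
                              const uint8_t (*mc)[ETH_ALEN], uint32_t n_mc)
{
    struct virtio_net_ctrl_mac *t = (void *)buf;

    t->entries = n_uc;
    memcpy(t->macs, uc, (size_t)n_uc * ETH_ALEN);

    t = (void *)(buf + sizeof(uint32_t) + (size_t)n_uc * ETH_ALEN);
    t->entries = n_mc;
    memcpy(t->macs, mc, (size_t)n_mc * ETH_ALEN);

    return 2 * sizeof(uint32_t) + (size_t)(n_uc + n_mc) * ETH_ALEN;
}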
1642 | |||
1643 | VLAN Filtering | ||
1644 | |||
If the driver negotiates the VIRTIO_NET_F_CTRL_VLAN feature, it
1646 | can control a VLAN filter table in the device. | ||
1647 | |||
1648 | #define VIRTIO_NET_CTRL_VLAN 2 | ||
1649 | |||
1650 | #define VIRTIO_NET_CTRL_VLAN_ADD 0 | ||
1651 | |||
1652 | #define VIRTIO_NET_CTRL_VLAN_DEL 1 | ||
1653 | |||
1654 | Both the VIRTIO_NET_CTRL_VLAN_ADD and VIRTIO_NET_CTRL_VLAN_DEL | ||
commands take a 16-bit VLAN id as the command-specific-data.
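
For example, reusing the hypothetical ctrlq_send() helper from
the promiscuous-mode sketch:

#include <stdint.h>

#define VIRTIO_NET_CTRL_VLAN     2
#define VIRTIO_NET_CTRL_VLAN_ADD 0
#define VIRTIO_NET_OK            0

extern void ctrlq_send(uint8_t class, uint8_t command,
                       const void *data, uint32_t len, uint8_t *ack);

static int vlan_add(uint16_t vid)
{
    uint8_t ack = 0xff;

    /* command-specific-data: the 16-bit VLAN id */
    ctrlq_send(VIRTIO_NET_CTRL_VLAN, VIRTIO_NET_CTRL_VLAN_ADD,
               &vid, sizeof(vid), &ack);
    return ack == VIRTIO_NET_OK ? 0 : -1;
}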
1656 | |||
1657 | Gratuitous Packet Sending | ||
1658 | |||
If the driver negotiates the VIRTIO_NET_F_GUEST_ANNOUNCE feature
(which depends on VIRTIO_NET_F_CTRL_VQ), the device can ask the
guest to send gratuitous packets; this is usually done after
the guest has been physically migrated and needs to announce
its presence on the new network links. (As the hypervisor does
not have knowledge of the guest network configuration (eg.
tagged vlan), it is simplest to prod the guest in this way.)
1666 | |||
1667 | #define VIRTIO_NET_CTRL_ANNOUNCE 3 | ||
1668 | |||
1669 | #define VIRTIO_NET_CTRL_ANNOUNCE_ACK 0 | ||
1670 | |||
The guest needs to check the VIRTIO_NET_S_ANNOUNCE bit in the
status field when it notices a change of device configuration.
The VIRTIO_NET_CTRL_ANNOUNCE_ACK command is used to indicate
that the driver has received the notification; the device
clears the VIRTIO_NET_S_ANNOUNCE bit in the status field after
it receives this command.
1677 | |||
1678 | Processing this notification involves: | ||
1679 | |||
Sending the gratuitous packets, or marking that there are
pending gratuitous packets to be sent and letting a deferred
routine send them.

Sending the VIRTIO_NET_CTRL_ANNOUNCE_ACK command through the
control virtqueue.
1688 | |||
1689 | Appendix D: Block Device | ||
1690 | |||
1691 | The virtio block device is a simple virtual block device (ie. | ||
1692 | disk). Read and write requests (and other exotic requests) are | ||
1693 | placed in the queue, and serviced (probably out of order) by the | ||
1694 | device except where noted. | ||
1695 | |||
1696 | Configuration | ||
1697 | |||
1698 | Subsystem Device ID 2 | ||
1699 | |||
1700 | Virtqueues 0:requestq. | ||
1701 | |||
1702 | Feature bits | ||
1703 | |||
1704 | VIRTIO_BLK_F_BARRIER (0) Host supports request barriers. | ||
1705 | |||
1706 | VIRTIO_BLK_F_SIZE_MAX (1) Maximum size of any single segment is | ||
1707 | in “size_max”. | ||
1708 | |||
1709 | VIRTIO_BLK_F_SEG_MAX (2) Maximum number of segments in a | ||
1710 | request is in “seg_max”. | ||
1711 | |||
1712 | VIRTIO_BLK_F_GEOMETRY (4) Disk-style geometry specified in “ | ||
1713 | geometry”. | ||
1714 | |||
1715 | VIRTIO_BLK_F_RO (5) Device is read-only. | ||
1716 | |||
1717 | VIRTIO_BLK_F_BLK_SIZE (6) Block size of disk is in “blk_size”. | ||
1718 | |||
1719 | VIRTIO_BLK_F_SCSI (7) Device supports scsi packet commands. | ||
1720 | |||
1721 | VIRTIO_BLK_F_FLUSH (9) Cache flush command support. | ||
1722 | |||
1723 | Device configuration layout The capacity of the device | ||
1724 | (expressed in 512-byte sectors) is always present. The | ||
availability of the others depends on various feature bits
as indicated above.

struct virtio_blk_config {
1727 | |||
1728 | u64 capacity; | ||
1729 | |||
1730 | u32 size_max; | ||
1731 | |||
1732 | u32 seg_max; | ||
1733 | |||
1734 | struct virtio_blk_geometry { | ||
1735 | |||
1736 | u16 cylinders; | ||
1737 | |||
1738 | u8 heads; | ||
1739 | |||
1740 | u8 sectors; | ||
1741 | |||
1742 | } geometry; | ||
1743 | |||
1744 | u32 blk_size; | ||
1745 | |||
1746 | |||
1747 | |||
1748 | }; | ||
1749 | |||
1750 | Device Initialization | ||
1751 | |||
1752 | The device size should be read from the “capacity” | ||
configuration field. No requests should be submitted which go
beyond this limit.
1755 | |||
1756 | If the VIRTIO_BLK_F_BLK_SIZE feature is negotiated, the | ||
1757 | blk_size field can be read to determine the optimal sector size | ||
for the driver to use. This does not affect the units used in
the protocol (always 512 bytes), but awareness of the correct
value can affect performance.
1761 | |||
1762 | If the VIRTIO_BLK_F_RO feature is set by the device, any write | ||
1763 | requests will fail. | ||
1764 | |||
1765 | Device Operation | ||
1766 | |||
1767 | The driver queues requests to the virtqueue, and they are used by | ||
the device (not necessarily in order). Each request is of the form:
1769 | |||
1770 | struct virtio_blk_req { | ||
1771 | |||
1772 | |||
1773 | |||
1774 | u32 type; | ||
1775 | |||
1776 | u32 ioprio; | ||
1777 | |||
1778 | u64 sector; | ||
1779 | |||
1780 | char data[][512]; | ||
1781 | |||
1782 | u8 status; | ||
1783 | |||
1784 | }; | ||
1785 | |||
If the device has the VIRTIO_BLK_F_SCSI feature, it can also support
scsi packet command requests; each of these requests is of the form:

struct virtio_scsi_pc_req {
1788 | |||
1789 | u32 type; | ||
1790 | |||
1791 | u32 ioprio; | ||
1792 | |||
1793 | u64 sector; | ||
1794 | |||
1795 | char cmd[]; | ||
1796 | |||
1797 | char data[][512]; | ||
1798 | |||
1799 | #define SCSI_SENSE_BUFFERSIZE 96 | ||
1800 | |||
1801 | u8 sense[SCSI_SENSE_BUFFERSIZE]; | ||
1802 | |||
1803 | u32 errors; | ||
1804 | |||
1805 | u32 data_len; | ||
1806 | |||
1807 | u32 sense_len; | ||
1808 | |||
1809 | u32 residual; | ||
1810 | |||
1811 | u8 status; | ||
1812 | |||
1813 | }; | ||
1814 | |||
The type of the request is either a read (VIRTIO_BLK_T_IN), a
write (VIRTIO_BLK_T_OUT), a scsi packet command
(VIRTIO_BLK_T_SCSI_CMD or VIRTIO_BLK_T_SCSI_CMD_OUT[footnote:
the SCSI_CMD and SCSI_CMD_OUT types are equivalent; the device
does not distinguish between them
]) or a flush (VIRTIO_BLK_T_FLUSH or VIRTIO_BLK_T_FLUSH_OUT[footnote:
the FLUSH and FLUSH_OUT types are equivalent; the device does not
distinguish between them
]). If the device has the VIRTIO_BLK_F_BARRIER feature, the high
bit (VIRTIO_BLK_T_BARRIER) indicates that this request acts as
a barrier: all preceding requests must be complete before this
one, and all following requests must not be started until this
is complete. Note that a barrier does not flush caches in the
underlying backend device in the host, and thus does not serve
as a data consistency guarantee. The driver must use a FLUSH
request to flush the host cache.
1831 | |||
1832 | #define VIRTIO_BLK_T_IN 0 | ||
1833 | |||
1834 | #define VIRTIO_BLK_T_OUT 1 | ||
1835 | |||
1836 | #define VIRTIO_BLK_T_SCSI_CMD 2 | ||
1837 | |||
1838 | #define VIRTIO_BLK_T_SCSI_CMD_OUT 3 | ||
1839 | |||
1840 | #define VIRTIO_BLK_T_FLUSH 4 | ||
1841 | |||
1842 | #define VIRTIO_BLK_T_FLUSH_OUT 5 | ||
1843 | |||
1844 | #define VIRTIO_BLK_T_BARRIER 0x80000000 | ||
1845 | |||
1846 | The ioprio field is a hint about the relative priorities of | ||
1847 | requests to the device: higher numbers indicate more important | ||
1848 | requests. | ||
1849 | |||
1850 | The sector number indicates the offset (multiplied by 512) where | ||
1851 | the read or write is to occur. This field is unused and set to 0 | ||
1852 | for scsi packet commands and for flush commands. | ||
1853 | |||
1854 | The cmd field is only present for scsi packet command requests, | ||
1855 | and indicates the command to perform. This field must reside in a | ||
1856 | single, separate read-only buffer; command length can be derived | ||
1857 | from the length of this buffer. | ||
1858 | |||
1859 | Note that these first three (four for scsi packet commands) | ||
1860 | fields are always read-only: the data field is either read-only | ||
1861 | or write-only, depending on the request. The size of the read or | ||
1862 | write can be derived from the total size of the request buffers. | ||
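
As an illustration, a read of 8 sectors starting at sector 1024
could be set up as follows; blk_queue_request() is a
hypothetical helper that adds the header as a read-only buffer
and the data and status as write-only buffers, per the note
above:

#include <stdint.h>

#define VIRTIO_BLK_T_IN 0

struct virtio_blk_req_hdr { /* the first three request fields */
    uint32_t type;
    uint32_t ioprio;
    uint64_t sector;
};

/* Hypothetical helper: queues hdr (read-only), data and status
 * (write-only) on the requestq and notifies the device. */
extern void blk_queue_request(const struct virtio_blk_req_hdr *hdr,
                              void *data, uint32_t data_len,
                              uint8_t *status);

static void read_sectors(void *data, uint64_t start, uint32_t count,
                         uint8_t *status)
{
    struct virtio_blk_req_hdr hdr = {
        .type   = VIRTIO_BLK_T_IN, /* read */
        .ioprio = 0,
        .sector = start,           /* byte offset = start * 512 */
    };
    blk_queue_request(&hdr, data, count * 512, status);
}

/* e.g. read_sectors(buf, 1024, 8, &status); */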
1863 | |||
1864 | The sense field is only present for scsi packet command requests, | ||
1865 | and indicates the buffer for scsi sense data. | ||
1866 | |||
The data_len field is only present for scsi packet command
requests; it is deprecated and should be ignored by the
driver. Historically, devices copied data length there.
1870 | |||
1871 | The sense_len field is only present for scsi packet command | ||
1872 | requests and indicates the number of bytes actually written to | ||
1873 | the sense buffer. | ||
1874 | |||
1875 | The residual field is only present for scsi packet command | ||
1876 | requests and indicates the residual size, calculated as data | ||
1877 | length - number of bytes actually transferred. | ||
1878 | |||
1879 | The final status byte is written by the device: either | ||
1880 | VIRTIO_BLK_S_OK for success, VIRTIO_BLK_S_IOERR for host or guest | ||
error or VIRTIO_BLK_S_UNSUPP for a request unsupported by the host:

#define VIRTIO_BLK_S_OK 0
1882 | |||
1883 | #define VIRTIO_BLK_S_IOERR 1 | ||
1884 | |||
1885 | #define VIRTIO_BLK_S_UNSUPP 2 | ||
1886 | |||
Historically, devices assumed that the fields type, ioprio and
sector reside in a single, separate read-only buffer; the fields
errors, data_len, sense_len and residual reside in a single,
separate write-only buffer; the sense field in a separate
write-only buffer of size 96 bytes, by itself; and the status
field in a separate write-only buffer of size 1 byte, by itself.
1895 | |||
1896 | Appendix E: Console Device | ||
1897 | |||
1898 | The virtio console device is a simple device for data input and | ||
1899 | output. A device may have one or more ports. Each port has a pair | ||
1900 | of input and output virtqueues. Moreover, a device has a pair of | ||
1901 | control IO virtqueues. The control virtqueues are used to | ||
1902 | communicate information between the device and the driver about | ||
1903 | ports being opened and closed on either side of the connection, | ||
1904 | indication from the host about whether a particular port is a | ||
1905 | console port, adding new ports, port hot-plug/unplug, etc., and | ||
1906 | indication from the guest about whether a port or a device was | ||
successfully added, port open/close, etc. For data IO, one or
1908 | more empty buffers are placed in the receive queue for incoming | ||
1909 | data and outgoing characters are placed in the transmit queue. | ||
1910 | |||
1911 | Configuration | ||
1912 | |||
1913 | Subsystem Device ID 3 | ||
1914 | |||
Virtqueues 0:receiveq(port0), 1:transmitq(port0), 2:control
receiveq[footnote:
Virtqueues 2 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set
1918 | ], 3:control transmitq, 4:receiveq(port1), 5:transmitq(port1), | ||
1919 | ... | ||
1920 | |||
1921 | Feature bits | ||
1922 | |||
1923 | VIRTIO_CONSOLE_F_SIZE (0) Configuration cols and rows fields | ||
1924 | are valid. | ||
1925 | |||
VIRTIO_CONSOLE_F_MULTIPORT (1) Device has support for multiple
1927 | ports; configuration fields nr_ports and max_nr_ports are | ||
1928 | valid and control virtqueues will be used. | ||
1929 | |||
1930 | Device configuration layout The size of the console is supplied | ||
1931 | in the configuration space if the VIRTIO_CONSOLE_F_SIZE feature | ||
1932 | is set. Furthermore, if the VIRTIO_CONSOLE_F_MULTIPORT feature | ||
1933 | is set, the maximum number of ports supported by the device can | ||
be fetched.

struct virtio_console_config {
1935 | |||
1936 | u16 cols; | ||
1937 | |||
1938 | u16 rows; | ||
1939 | |||
1940 | |||
1941 | |||
1942 | u32 max_nr_ports; | ||
1943 | |||
1944 | }; | ||
1945 | |||
1946 | Device Initialization | ||
1947 | |||
1948 | If the VIRTIO_CONSOLE_F_SIZE feature is negotiated, the driver | ||
1949 | can read the console dimensions from the configuration fields. | ||
1950 | |||
If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the
driver can spawn multiple ports, not all of which may be
attached to a console. Some could be generic ports. In this
case, the control virtqueues are enabled, and the appropriate
number of virtqueues is created according to the max_nr_ports
configuration-space value. A control message indicating that
the driver is ready is sent to the host. The host can then send
control messages for adding new ports to the device. After
creating and initializing each port, a
VIRTIO_CONSOLE_PORT_READY control message is sent to the host
for that port so the host can let us know of any additional
configuration options set for that port.
1963 | |||
1964 | The receiveq for each port is populated with one or more | ||
1965 | receive buffers. | ||
1966 | |||
1967 | Device Operation | ||
1968 | |||
1969 | For output, a buffer containing the characters is placed in the | ||
1970 | port's transmitq.[footnote: | ||
1971 | Because this is high importance and low bandwidth, the current | ||
1972 | Linux implementation polls for the buffer to be used, rather than | ||
1973 | waiting for an interrupt, simplifying the implementation | ||
1974 | significantly. However, for generic serial ports with the | ||
1975 | O_NONBLOCK flag set, the polling limitation is relaxed and the | ||
1976 | consumed buffers are freed upon the next write or poll call or | ||
1977 | when a port is closed or hot-unplugged. | ||
1978 | ] | ||
1979 | |||
When a buffer is used in the receiveq (signalled by an
interrupt), the contents are the input to the port associated
with the virtqueue for which the notification was received.
1983 | |||
1984 | If the driver negotiated the VIRTIO_CONSOLE_F_SIZE feature, a | ||
1985 | configuration change interrupt may occur. The updated size can | ||
1986 | be read from the configuration fields. | ||
1987 | |||
1988 | If the driver negotiated the VIRTIO_CONSOLE_F_MULTIPORT | ||
1989 | feature, active ports are announced by the host using the | ||
1990 | VIRTIO_CONSOLE_PORT_ADD control message. The same message is | ||
1991 | used for port hot-plug as well. | ||
1992 | |||
1993 | If the host specified a port `name', a sysfs attribute is | ||
1994 | created with the name filled in, so that udev rules can be | ||
1995 | written that can create a symlink from the port's name to the | ||
1996 | char device for port discovery by applications in the guest. | ||
1997 | |||
1998 | Changes to ports' state are effected by control messages. | ||
1999 | Appropriate action is taken on the port indicated in the | ||
2000 | control message. The layout of the structure of the control | ||
buffer and the events associated are:

struct virtio_console_control {
2002 | |||
2003 | uint32_t id; /* Port number */ | ||
2004 | |||
2005 | uint16_t event; /* The kind of control event */ | ||
2006 | |||
2007 | uint16_t value; /* Extra information for the event */ | ||
2008 | |||
2009 | }; | ||
2010 | |||
2011 | |||
2012 | |||
2013 | /* Some events for the internal messages (control packets) */ | ||
2014 | |||
2015 | |||
2016 | |||
2017 | #define VIRTIO_CONSOLE_DEVICE_READY 0 | ||
2018 | |||
2019 | #define VIRTIO_CONSOLE_PORT_ADD 1 | ||
2020 | |||
2021 | #define VIRTIO_CONSOLE_PORT_REMOVE 2 | ||
2022 | |||
2023 | #define VIRTIO_CONSOLE_PORT_READY 3 | ||
2024 | |||
2025 | #define VIRTIO_CONSOLE_CONSOLE_PORT 4 | ||
2026 | |||
2027 | #define VIRTIO_CONSOLE_RESIZE 5 | ||
2028 | |||
2029 | #define VIRTIO_CONSOLE_PORT_OPEN 6 | ||
2030 | |||
2031 | #define VIRTIO_CONSOLE_PORT_NAME 7 | ||
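
As an illustration, a driver might announce readiness and
acknowledge a newly added port roughly as follows; ctrl_tx_send()
is a hypothetical helper that places the message on the control
transmitq, and the value of 1 follows the convention above of
the guest reporting success:

#include <stdint.h>

#define VIRTIO_CONSOLE_DEVICE_READY 0
#define VIRTIO_CONSOLE_PORT_READY   3

struct virtio_console_control {
    uint32_t id;    /* port number */
    uint16_t event; /* the kind of control event */
    uint16_t value; /* extra information for the event */
};

extern void ctrl_tx_send(const struct virtio_console_control *msg); /* hypothetical */

static void announce_driver_ready(void)
{
    struct virtio_console_control msg = { 0, VIRTIO_CONSOLE_DEVICE_READY, 1 };
    ctrl_tx_send(&msg);
}

/* After receiving a VIRTIO_CONSOLE_PORT_ADD for this port. */
static void ack_port_add(uint32_t port)
{
    struct virtio_console_control msg = { port, VIRTIO_CONSOLE_PORT_READY, 1 };
    ctrl_tx_send(&msg);
}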
2032 | |||
2033 | Appendix F: Entropy Device | ||
2034 | |||
2035 | The virtio entropy device supplies high-quality randomness for | ||
2036 | guest use. | ||
2037 | |||
2038 | Configuration | ||
2039 | |||
2040 | Subsystem Device ID 4 | ||
2041 | |||
2042 | Virtqueues 0:requestq. | ||
2043 | |||
2044 | Feature bits None currently defined | ||
2045 | |||
2046 | Device configuration layout None currently defined. | ||
2047 | |||
2048 | Device Initialization | ||
2049 | |||
The virtqueue is initialized.
2051 | |||
2052 | Device Operation | ||
2053 | |||
When the driver requires random bytes, it places the descriptor
of one or more buffers in the queue. Each buffer will be
completely filled with random data by the device.
2057 | |||
2058 | Appendix G: Memory Balloon Device | ||
2059 | |||
2060 | The virtio memory balloon device is a primitive device for | ||
2061 | managing guest memory: the device asks for a certain amount of | ||
2062 | memory, and the guest supplies it (or withdraws it, if the device | ||
2063 | has more than it asks for). This allows the guest to adapt to | ||
2064 | changes in allowance of underlying physical memory. If the | ||
2065 | feature is negotiated, the device can also be used to communicate | ||
2066 | guest memory statistics to the host. | ||
2067 | |||
2068 | Configuration | ||
2069 | |||
2070 | Subsystem Device ID 5 | ||
2071 | |||
2072 | Virtqueues 0:inflateq. 1:deflateq. 2:statsq.[footnote: | ||
Only if VIRTIO_BALLOON_F_STATS_VQ set
2074 | ] | ||
2075 | |||
2076 | Feature bits | ||
2077 | |||
2078 | VIRTIO_BALLOON_F_MUST_TELL_HOST (0) Host must be told before | ||
2079 | pages from the balloon are used. | ||
2080 | |||
2081 | VIRTIO_BALLOON_F_STATS_VQ (1) A virtqueue for reporting guest | ||
2082 | memory statistics is present. | ||
2083 | |||
Device configuration layout Both fields of this configuration
are always available. Note that they are little endian, despite
the convention that device fields are guest endian:

struct virtio_balloon_config {
2087 | |||
2088 | u32 num_pages; | ||
2089 | |||
2090 | u32 actual; | ||
2091 | |||
2092 | }; | ||
2093 | |||
2094 | Device Initialization | ||
2095 | |||
2096 | The inflate and deflate virtqueues are identified. | ||
2097 | |||
2098 | If the VIRTIO_BALLOON_F_STATS_VQ feature bit is negotiated: | ||
2099 | |||
2100 | Identify the stats virtqueue. | ||
2101 | |||
2102 | Add one empty buffer to the stats virtqueue and notify the | ||
2103 | host. | ||
2104 | |||
2105 | Device operation begins immediately. | ||
2106 | |||
2107 | Device Operation | ||
2108 | |||
2109 | Memory Ballooning The device is driven by the receipt of a | ||
2110 | configuration change interrupt. | ||
2111 | |||
2112 | The “num_pages” configuration field is examined. If this is | ||
2113 | greater than the “actual” number of pages, memory must be given | ||
2114 | to the balloon. If it is less than the “actual” number of | ||
2115 | pages, memory may be taken back from the balloon for general | ||
2116 | use. | ||
2117 | |||
2118 | To supply memory to the balloon (aka. inflate): | ||
2119 | |||
2120 | The driver constructs an array of addresses of unused memory | ||
2121 | pages. These addresses are divided by 4096[footnote: | ||
2122 | This is historical, and independent of the guest page size | ||
2123 | ] and the descriptor describing the resulting 32-bit array is | ||
2124 | added to the inflateq. | ||
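
A sketch of inflation under these rules; alloc_unused_page()
and add_buf_to_inflateq() are hypothetical helpers, the latter
adding the array as one output descriptor and notifying the
device:

#include <stdint.h>

extern uint64_t alloc_unused_page(void);                     /* hypothetical */
extern void add_buf_to_inflateq(uint32_t *pfns, uint32_t n); /* hypothetical */

/* Give up to 256 pages to the balloon in one pass. */
static void inflate(uint32_t npages)
{
    /* static: the array must stay valid until the device has
     * used the descriptor */
    static uint32_t pfns[256];
    uint32_t i, n = npages < 256 ? npages : 256;

    /* 32-bit page numbers: address divided by 4096, regardless
     * of the guest page size */
    for (i = 0; i < n; i++)
        pfns[i] = (uint32_t)(alloc_unused_page() / 4096);

    add_buf_to_inflateq(pfns, n);
    /* ... once the device has used the buffer, update the
     * "actual" configuration field. */
}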
2125 | |||
2126 | To remove memory from the balloon (aka. deflate): | ||
2127 | |||
2128 | The driver constructs an array of addresses of memory pages it | ||
2129 | has previously given to the balloon, as described above. This | ||
2130 | descriptor is added to the deflateq. | ||
2131 | |||
2132 | If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is set, the | ||
2133 | guest may not use these requested pages until that descriptor | ||
2134 | in the deflateq has been used by the device. | ||
2135 | |||
2136 | Otherwise, the guest may begin to re-use pages previously given | ||
2137 | to the balloon before the device has acknowledged their | ||
withdrawal. [footnote:
2139 | In this case, deflation advice is merely a courtesy | ||
2140 | ] | ||
2141 | |||
2142 | In either case, once the device has completed the inflation or | ||
2143 | deflation, the “actual” field of the configuration should be | ||
2144 | updated to reflect the new number of pages in the balloon.[footnote: | ||
2145 | As updates to configuration space are not atomic, this field | ||
2146 | isn't particularly reliable, but can be used to diagnose buggy | ||
2147 | guests. | ||
2148 | ] | ||
2149 | |||
2150 | Memory Statistics | ||
2151 | |||
2152 | The stats virtqueue is atypical because communication is driven | ||
2153 | by the device (not the driver). The channel becomes active at | ||
2154 | driver initialization time when the driver adds an empty buffer | ||
2155 | and notifies the device. A request for memory statistics proceeds | ||
2156 | as follows: | ||
2157 | |||
2158 | The device pushes the buffer onto the used ring and sends an | ||
2159 | interrupt. | ||
2160 | |||
2161 | The driver pops the used buffer and discards it. | ||
2162 | |||
2163 | The driver collects memory statistics and writes them into a | ||
2164 | new buffer. | ||
2165 | |||
2166 | The driver adds the buffer to the virtqueue and notifies the | ||
2167 | device. | ||
2168 | |||
2169 | The device pops the buffer (retaining it to initiate a | ||
2170 | subsequent request) and consumes the statistics. | ||
2171 | |||
2172 | Memory Statistics Format Each statistic consists of a 16 bit | ||
2173 | tag and a 64 bit value. Both quantities are represented in the | ||
2174 | native endian of the guest. All statistics are optional and the | ||
2175 | driver may choose which ones to supply. To guarantee backwards | ||
2176 | compatibility, unsupported statistics should be omitted. | ||
2177 | |||
2178 | struct virtio_balloon_stat { | ||
2179 | |||
2180 | #define VIRTIO_BALLOON_S_SWAP_IN 0 | ||
2181 | |||
2182 | #define VIRTIO_BALLOON_S_SWAP_OUT 1 | ||
2183 | |||
2184 | #define VIRTIO_BALLOON_S_MAJFLT 2 | ||
2185 | |||
2186 | #define VIRTIO_BALLOON_S_MINFLT 3 | ||
2187 | |||
2188 | #define VIRTIO_BALLOON_S_MEMFREE 4 | ||
2189 | |||
2190 | #define VIRTIO_BALLOON_S_MEMTOT 5 | ||
2191 | |||
2192 | u16 tag; | ||
2193 | |||
2194 | u64 val; | ||
2195 | |||
2196 | } __attribute__((packed)); | ||
2197 | |||
2198 | Tags | ||
2199 | |||
2200 | VIRTIO_BALLOON_S_SWAP_IN The amount of memory that has been | ||
2201 | swapped in (in bytes). | ||
2202 | |||
2203 | VIRTIO_BALLOON_S_SWAP_OUT The amount of memory that has been | ||
2204 | swapped out to disk (in bytes). | ||
2205 | |||
2206 | VIRTIO_BALLOON_S_MAJFLT The number of major page faults that | ||
2207 | have occurred. | ||
2208 | |||
2209 | VIRTIO_BALLOON_S_MINFLT The number of minor page faults that | ||
2210 | have occurred. | ||
2211 | |||
2212 | VIRTIO_BALLOON_S_MEMFREE The amount of memory not being used | ||
2213 | for any purpose (in bytes). | ||
2214 | |||
2215 | VIRTIO_BALLOON_S_MEMTOT The total amount of memory available | ||
2216 | (in bytes). | ||
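
For example, a driver answering a statistics request might fill
the buffer as follows; the two values are placeholders for
numbers the driver would collect from its memory-management
subsystem:

#include <stdint.h>

#define VIRTIO_BALLOON_S_MEMFREE 4
#define VIRTIO_BALLOON_S_MEMTOT  5

struct virtio_balloon_stat {
    uint16_t tag;
    uint64_t val;
} __attribute__((packed));

/* Fill stats into buf; returns the number of entries written.
 * Unsupported statistics are simply omitted. */
static unsigned fill_stats(struct virtio_balloon_stat *buf)
{
    unsigned n = 0;

    buf[n].tag   = VIRTIO_BALLOON_S_MEMFREE;
    buf[n++].val = 123ULL << 20;  /* free memory in bytes (placeholder) */
    buf[n].tag   = VIRTIO_BALLOON_S_MEMTOT;
    buf[n++].val = 1024ULL << 20; /* total memory in bytes (placeholder) */
    return n;
}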
2217 | |||
2218 | Appendix H: Rpmsg: Remote Processor Messaging | ||
2219 | |||
2220 | Virtio rpmsg devices represent remote processors on the system | ||
2221 | which run in asymmetric multi-processing (AMP) configuration, and | ||
2222 | which are usually used to offload cpu-intensive tasks from the | ||
2223 | main application processor (a typical SoC methodology). | ||
2224 | |||
2225 | Virtio is being used to communicate with those remote processors; | ||
2226 | empty buffers are placed in one virtqueue for receiving messages, | ||
2227 | and non-empty buffers, containing outbound messages, are enqueued | ||
2228 | in a second virtqueue for transmission. | ||
2229 | |||
2230 | Numerous communication channels can be multiplexed over those two | ||
2231 | virtqueues, so different entities, running on the application and | ||
2232 | remote processor, can directly communicate in a point-to-point | ||
2233 | fashion. | ||
2234 | |||
2235 | Configuration | ||
2236 | |||
2237 | Subsystem Device ID 7 | ||
2238 | |||
2239 | Virtqueues 0:receiveq. 1:transmitq. | ||
2240 | |||
2241 | Feature bits | ||
2242 | |||
VIRTIO_RPMSG_F_NS (0) Device sends (and is capable of receiving)
name service messages announcing the creation (or
destruction) of a channel:

/**
2246 | |||
2247 | * struct rpmsg_ns_msg - dynamic name service announcement | ||
2248 | message | ||
2249 | |||
2250 | * @name: name of remote service that is published | ||
2251 | |||
2252 | * @addr: address of remote service that is published | ||
2253 | |||
2254 | * @flags: indicates whether service is created or destroyed | ||
2255 | |||
2256 | * | ||
2257 | |||
2258 | * This message is sent across to publish a new service (or | ||
2259 | announce | ||
2260 | |||
* about its removal). When we receive these messages, an
2262 | appropriate | ||
2263 | |||
2264 | * rpmsg channel (i.e device) is created/destroyed. | ||
2265 | |||
2266 | */ | ||
2267 | |||
struct rpmsg_ns_msg {
2269 | |||
2270 | char name[RPMSG_NAME_SIZE]; | ||
2271 | |||
2272 | u32 addr; | ||
2273 | |||
2274 | u32 flags; | ||
2275 | |||
2276 | } __packed; | ||
2277 | |||
2278 | |||
2279 | |||
2280 | /** | ||
2281 | |||
2282 | * enum rpmsg_ns_flags - dynamic name service announcement flags | ||
2283 | |||
2284 | * | ||
2285 | |||
2286 | * @RPMSG_NS_CREATE: a new remote service was just created | ||
2287 | |||
2288 | * @RPMSG_NS_DESTROY: a remote service was just destroyed | ||
2289 | |||
2290 | */ | ||
2291 | |||
2292 | enum rpmsg_ns_flags { | ||
2293 | |||
2294 | RPMSG_NS_CREATE = 0, | ||
2295 | |||
2296 | RPMSG_NS_DESTROY = 1, | ||
2297 | |||
2298 | }; | ||
2299 | |||
2300 | Device configuration layout | ||
2301 | |||
At this point, none are currently defined.
2303 | |||
2304 | Device Initialization | ||
2305 | |||
2306 | The initialization routine should identify the receive and | ||
2307 | transmission virtqueues. | ||
2308 | |||
2309 | The receive virtqueue should be filled with receive buffers. | ||
2310 | |||
2311 | Device Operation | ||
2312 | |||
2313 | Messages are transmitted by placing them in the transmitq, and | ||
2314 | buffers for inbound messages are placed in the receiveq. In any | ||
case, messages are always preceded by the following header:

/**
2316 | |||
2317 | * struct rpmsg_hdr - common header for all rpmsg messages | ||
2318 | |||
2319 | * @src: source address | ||
2320 | |||
2321 | * @dst: destination address | ||
2322 | |||
2323 | * @reserved: reserved for future use | ||
2324 | |||
2325 | * @len: length of payload (in bytes) | ||
2326 | |||
2327 | * @flags: message flags | ||
2328 | |||
2329 | * @data: @len bytes of message payload data | ||
2330 | |||
2331 | * | ||
2332 | |||
2333 | * Every message sent(/received) on the rpmsg bus begins with | ||
2334 | this header. | ||
2335 | |||
2336 | */ | ||
2337 | |||
2338 | struct rpmsg_hdr { | ||
2339 | |||
2340 | u32 src; | ||
2341 | |||
2342 | u32 dst; | ||
2343 | |||
2344 | u32 reserved; | ||
2345 | |||
2346 | u16 len; | ||
2347 | |||
2348 | u16 flags; | ||
2349 | |||
2350 | u8 data[0]; | ||
2351 | |||
2352 | } __packed; | ||
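
A sketch of transmitting one message under this scheme;
rpmsg_queue_tx() is a hypothetical helper that adds the buffer
to the transmitq and notifies the remote processor:

#include <stdint.h>
#include <string.h>

struct rpmsg_hdr {
    uint32_t src;
    uint32_t dst;
    uint32_t reserved;
    uint16_t len;
    uint16_t flags;
    uint8_t  data[0];
} __attribute__((packed));

extern void rpmsg_queue_tx(void *buf, uint32_t len); /* hypothetical */

static void rpmsg_send(uint32_t src, uint32_t dst,
                       const void *payload, uint16_t len)
{
    static uint8_t buf[512]; /* one transmit buffer (sketch only) */
    struct rpmsg_hdr *hdr = (struct rpmsg_hdr *)buf;

    hdr->src      = src;
    hdr->dst      = dst;
    hdr->reserved = 0;
    hdr->len      = len; /* payload length in bytes */
    hdr->flags    = 0;
    memcpy(hdr->data, payload, len);

    rpmsg_queue_tx(buf, sizeof(*hdr) + len);
}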
2353 | |||
2354 | Appendix I: SCSI Host Device | ||
2355 | |||
2356 | The virtio SCSI host device groups together one or more virtual | ||
2357 | logical units (such as disks), and allows communicating to them | ||
2358 | using the SCSI protocol. An instance of the device represents a | ||
2359 | SCSI host to which many targets and LUNs are attached. | ||
2360 | |||
2361 | The virtio SCSI device services two kinds of requests: | ||
2362 | |||
2363 | command requests for a logical unit; | ||
2364 | |||
2365 | task management functions related to a logical unit, target or | ||
2366 | command. | ||
2367 | |||
2368 | The device is also able to send out notifications about added and | ||
2369 | removed logical units. Together, these capabilities provide a | ||
2370 | SCSI transport protocol that uses virtqueues as the transfer | ||
2371 | medium. In the transport protocol, the virtio driver acts as the | ||
2372 | initiator, while the virtio SCSI host provides one or more | ||
2373 | targets that receive and process the requests. | ||
2374 | |||
2375 | Configuration | ||
2376 | |||
2377 | Subsystem Device ID 8 | ||
2378 | |||
2379 | Virtqueues 0:controlq; 1:eventq; 2..n:request queues. | ||
2380 | |||
2381 | Feature bits | ||
2382 | |||
2383 | VIRTIO_SCSI_F_INOUT (0) A single request can include both | ||
2384 | read-only and write-only data buffers. | ||
2385 | |||
2386 | VIRTIO_SCSI_F_HOTPLUG (1) The host should enable | ||
2387 | hot-plug/hot-unplug of new LUNs and targets on the SCSI bus. | ||
2388 | |||
2389 | Device configuration layout All fields of this configuration | ||
2390 | are always available. sense_size and cdb_size are writable by | ||
the guest.

struct virtio_scsi_config {
2392 | |||
2393 | u32 num_queues; | ||
2394 | |||
2395 | u32 seg_max; | ||
2396 | |||
2397 | u32 max_sectors; | ||
2398 | |||
2399 | u32 cmd_per_lun; | ||
2400 | |||
2401 | u32 event_info_size; | ||
2402 | |||
2403 | u32 sense_size; | ||
2404 | |||
2405 | u32 cdb_size; | ||
2406 | |||
2407 | u16 max_channel; | ||
2408 | |||
2409 | u16 max_target; | ||
2410 | |||
2411 | u32 max_lun; | ||
2412 | |||
2413 | }; | ||
2414 | |||
2415 | num_queues is the total number of request virtqueues exposed by | ||
2416 | the device. The driver is free to use only one request queue, | ||
2417 | or it can use more to achieve better performance. | ||
2418 | |||
2419 | seg_max is the maximum number of segments that can be in a | ||
2420 | command. A bidirectional command can include seg_max input | ||
2421 | segments and seg_max output segments. | ||
2422 | |||
2423 | max_sectors is a hint to the guest about the maximum transfer | ||
2424 | size it should use. | ||
2425 | |||
2426 | cmd_per_lun is a hint to the guest about the maximum number of | ||
2427 | linked commands it should send to one LUN. The actual value | ||
2428 | to be used is the minimum of cmd_per_lun and the virtqueue | ||
2429 | size. | ||
2430 | |||
event_info_size is the maximum size that the device will fill
for buffers that the driver places in the eventq. The driver
should always put buffers of at least this size. It is
written by the device depending on the set of negotiated
features.
2436 | |||
2437 | sense_size is the maximum size of the sense data that the | ||
2438 | device will write. The default value is written by the device | ||
2439 | and will always be 96, but the driver can modify it. It is | ||
2440 | restored to the default when the device is reset. | ||
2441 | |||
2442 | cdb_size is the maximum size of the CDB that the driver will | ||
2443 | write. The default value is written by the device and will | ||
2444 | always be 32, but the driver can likewise modify it. It is | ||
2445 | restored to the default when the device is reset. | ||
2446 | |||
2447 | max_channel, max_target and max_lun can be used by the driver | ||
2448 | as hints to constrain scanning the logical units on the | ||
host.
2450 | |||
2451 | Device Initialization | ||
2452 | |||
2453 | The initialization routine should first of all discover the | ||
2454 | device's virtqueues. | ||
2455 | |||
If the driver uses the eventq, it should then place at least
one buffer in the eventq.
2458 | |||
2459 | The driver can immediately issue requests (for example, INQUIRY | ||
2460 | or REPORT LUNS) or task management functions (for example, I_T | ||
2461 | RESET). | ||
2462 | |||
2463 | Device Operation: request queues | ||
2464 | |||
2465 | The driver queues requests to an arbitrary request queue, and | ||
2466 | they are used by the device on that same queue. It is the | ||
2467 | responsibility of the driver to ensure strict request ordering | ||
2468 | for commands placed on different queues, because they will be | ||
2469 | consumed with no order constraints. | ||
2470 | |||
2471 | Requests have the following format: | ||
2472 | |||
2473 | struct virtio_scsi_req_cmd { | ||
2474 | |||
2475 | // Read-only | ||
2476 | |||
2477 | u8 lun[8]; | ||
2478 | |||
2479 | u64 id; | ||
2480 | |||
2481 | u8 task_attr; | ||
2482 | |||
2483 | u8 prio; | ||
2484 | |||
2485 | u8 crn; | ||
2486 | |||
2487 | char cdb[cdb_size]; | ||
2488 | |||
2489 | char dataout[]; | ||
2490 | |||
2491 | // Write-only part | ||
2492 | |||
2493 | u32 sense_len; | ||
2494 | |||
2495 | u32 residual; | ||
2496 | |||
2497 | u16 status_qualifier; | ||
2498 | |||
2499 | u8 status; | ||
2500 | |||
2501 | u8 response; | ||
2502 | |||
2503 | u8 sense[sense_size]; | ||
2504 | |||
2505 | char datain[]; | ||
2506 | |||
2507 | }; | ||
2508 | |||
2509 | |||
2510 | |||
2511 | /* command-specific response values */ | ||
2512 | |||
2513 | #define VIRTIO_SCSI_S_OK 0 | ||
2514 | |||
2515 | #define VIRTIO_SCSI_S_OVERRUN 1 | ||
2516 | |||
2517 | #define VIRTIO_SCSI_S_ABORTED 2 | ||
2518 | |||
2519 | #define VIRTIO_SCSI_S_BAD_TARGET 3 | ||
2520 | |||
2521 | #define VIRTIO_SCSI_S_RESET 4 | ||
2522 | |||
2523 | #define VIRTIO_SCSI_S_BUSY 5 | ||
2524 | |||
2525 | #define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 | ||
2526 | |||
2527 | #define VIRTIO_SCSI_S_TARGET_FAILURE 7 | ||
2528 | |||
2529 | #define VIRTIO_SCSI_S_NEXUS_FAILURE 8 | ||
2530 | |||
2531 | #define VIRTIO_SCSI_S_FAILURE 9 | ||
2532 | |||
2533 | |||
2534 | |||
2535 | /* task_attr */ | ||
2536 | |||
2537 | #define VIRTIO_SCSI_S_SIMPLE 0 | ||
2538 | |||
2539 | #define VIRTIO_SCSI_S_ORDERED 1 | ||
2540 | |||
2541 | #define VIRTIO_SCSI_S_HEAD 2 | ||
2542 | |||
2543 | #define VIRTIO_SCSI_S_ACA 3 | ||
2544 | |||
2545 | The lun field addresses a target and logical unit in the | ||
2546 | virtio-scsi device's SCSI domain. The only supported format for | ||
2547 | the LUN field is: first byte set to 1, second byte set to target, | ||
2548 | third and fourth byte representing a single level LUN structure, | ||
2549 | followed by four zero bytes. With this representation, a | ||
2550 | virtio-scsi device can serve up to 256 targets and 16384 LUNs per | ||
2551 | target. | ||
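
A sketch of encoding the lun field in this format. The 0x40 in
the third byte is the SAM “single level LUN structure” address
method; the spec text above only names the format, so treat
that detail as an assumption based on SAM:

#include <stdint.h>

/* First byte 1, second byte target, bytes 2-3 a single level
 * LUN structure, remaining four bytes zero. */
static void encode_lun(uint8_t lun[8], uint8_t target, uint16_t lun_id)
{
    lun[0] = 1;
    lun[1] = target;
    lun[2] = 0x40 | ((lun_id >> 8) & 0x3f); /* single level LUN structure */
    lun[3] = lun_id & 0xff;
    lun[4] = lun[5] = lun[6] = lun[7] = 0;
}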
2552 | |||
2553 | The id field is the command identifier (“tag”). | ||
2554 | |||
2555 | task_attr, prio and crn should be left to zero. task_attr defines | ||
2556 | the task attribute as in the table above, but all task attributes | ||
2557 | may be mapped to SIMPLE by the device; crn may also be provided | ||
2558 | by clients, but is generally expected to be 0. The maximum CRN | ||
2559 | value defined by the protocol is 255, since CRN is stored in an | ||
2560 | 8-bit integer. | ||
2561 | |||
2562 | All of these fields are defined in SAM. They are always | ||
read-only, as are the cdb and dataout fields. The cdb_size is
2564 | taken from the configuration space. | ||
2565 | |||
2566 | sense and subsequent fields are always write-only. The sense_len | ||
2567 | field indicates the number of bytes actually written to the sense | ||
2568 | buffer. The residual field indicates the residual size, | ||
2569 | calculated as “data_length - number_of_transferred_bytes”, for | ||
2570 | read or write operations. For bidirectional commands, the | ||
2571 | number_of_transferred_bytes includes both read and written bytes. | ||
2572 | A residual field that is less than the size of datain means that | ||
2573 | the dataout field was processed entirely. A residual field that | ||
2574 | exceeds the size of datain means that the dataout field was | ||
2575 | processed partially and the datain field was not processed at | ||
2576 | all. | ||
2577 | |||
2578 | The status byte is written by the device to be the status code as | ||
2579 | defined in SAM. | ||
2580 | |||
2581 | The response byte is written by the device to be one of the | ||
2582 | following: | ||
2583 | |||
2584 | VIRTIO_SCSI_S_OK when the request was completed and the status | ||
2585 | byte is filled with a SCSI status code (not necessarily | ||
2586 | "GOOD"). | ||
2587 | |||
2588 | VIRTIO_SCSI_S_OVERRUN if the content of the CDB requires | ||
2589 | transferring more data than is available in the data buffers. | ||
2590 | |||
2591 | VIRTIO_SCSI_S_ABORTED if the request was cancelled due to an | ||
2592 | ABORT TASK or ABORT TASK SET task management function. | ||
2593 | |||
2594 | VIRTIO_SCSI_S_BAD_TARGET if the request was never processed | ||
2595 | because the target indicated by the lun field does not exist. | ||
2596 | |||
2597 | VIRTIO_SCSI_S_RESET if the request was cancelled due to a bus | ||
2598 | or device reset (including a task management function). | ||
2599 | |||
2600 | VIRTIO_SCSI_S_TRANSPORT_FAILURE if the request failed due to a | ||
2601 | problem in the connection between the host and the target | ||
2602 | (severed link). | ||
2603 | |||
2604 | VIRTIO_SCSI_S_TARGET_FAILURE if the target is suffering a | ||
2605 | failure and the guest should not retry on other paths. | ||
2606 | |||
2607 | VIRTIO_SCSI_S_NEXUS_FAILURE if the nexus is suffering a failure | ||
2608 | but retrying on other paths might yield a different result. | ||
2609 | |||
2610 | VIRTIO_SCSI_S_BUSY if the request failed but retrying on the | ||
2611 | same path should work. | ||
2612 | |||
2613 | VIRTIO_SCSI_S_FAILURE for other host or guest error. In | ||
2614 | particular, if neither dataout nor datain is empty, and the | ||
2615 | VIRTIO_SCSI_F_INOUT feature has not been negotiated, the | ||
2616 | request will be immediately returned with a response equal to | ||
2617 | VIRTIO_SCSI_S_FAILURE. | ||
2618 | |||
2619 | Device Operation: controlq | ||
2620 | |||
2621 | The controlq is used for other SCSI transport operations. | ||
2622 | Requests have the following format: | ||
2623 | |||
2624 | struct virtio_scsi_ctrl { | ||
2625 | |||
2626 | u32 type; | ||
2627 | |||
2628 | ... | ||
2629 | |||
2630 | u8 response; | ||
2631 | |||
2632 | }; | ||
2633 | |||
2634 | |||
2635 | |||
2636 | /* response values valid for all commands */ | ||
2637 | |||
2638 | #define VIRTIO_SCSI_S_OK 0 | ||
2639 | |||
2640 | #define VIRTIO_SCSI_S_BAD_TARGET 3 | ||
2641 | |||
2642 | #define VIRTIO_SCSI_S_BUSY 5 | ||
2643 | |||
2644 | #define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 | ||
2645 | |||
2646 | #define VIRTIO_SCSI_S_TARGET_FAILURE 7 | ||
2647 | |||
2648 | #define VIRTIO_SCSI_S_NEXUS_FAILURE 8 | ||
2649 | |||
2650 | #define VIRTIO_SCSI_S_FAILURE 9 | ||
2651 | |||
2652 | #define VIRTIO_SCSI_S_INCORRECT_LUN 12 | ||
2653 | |||
2654 | The type identifies the remaining fields. | ||
2655 | |||
2656 | The following commands are defined: | ||
2657 | |||
2658 | Task management function | ||
2659 | #define VIRTIO_SCSI_T_TMF 0 | ||
2660 | |||
2661 | |||
2662 | |||
2663 | #define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 | ||
2664 | |||
2665 | #define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 | ||
2666 | |||
2667 | #define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 | ||
2668 | |||
2669 | #define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 | ||
2670 | |||
2671 | #define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 | ||
2672 | |||
2673 | #define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 | ||
2674 | |||
2675 | #define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 | ||
2676 | |||
2677 | #define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 | ||
2678 | |||
2679 | |||
2680 | |||
2681 | struct virtio_scsi_ctrl_tmf | ||
2682 | |||
2683 | { | ||
2684 | |||
2685 | // Read-only part | ||
2686 | |||
2687 | u32 type; | ||
2688 | |||
2689 | u32 subtype; | ||
2690 | |||
2691 | u8 lun[8]; | ||
2692 | |||
2693 | u64 id; | ||
2694 | |||
2695 | // Write-only part | ||
2696 | |||
2697 | u8 response; | ||
2698 | |||
2699 | } | ||
2700 | |||
2701 | |||
2702 | |||
2703 | /* command-specific response values */ | ||
2704 | |||
2705 | #define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 | ||
2706 | |||
2707 | #define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 | ||
2708 | |||
2709 | #define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 | ||
2710 | |||
The type is VIRTIO_SCSI_T_TMF. All fields except response are
filled by the driver. The subtype field must always be
specified and identifies the requested task management
function.
2715 | |||
2716 | Other fields may be irrelevant for the requested TMF; if so, | ||
2717 | they are ignored but they should still be present. The lun | ||
2718 | field is in the same format specified for request queues; the | ||
2719 | single level LUN is ignored when the task management function | ||
2720 | addresses a whole I_T nexus. When relevant, the value of the id | ||
2721 | field is matched against the id values passed on the requestq. | ||
2722 | |||
2723 | The outcome of the task management function is written by the | ||
2724 | device in the response field. The command-specific response | ||
2725 | values map 1-to-1 with those defined in SAM. | ||
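
For instance, aborting a single command might look like the
following sketch (encode_lun() is the LUN helper sketched in
the request-queue section; controlq_send() is a hypothetical
helper that places the structure on the controlq and waits for
it to be used):

#include <stdint.h>

#define VIRTIO_SCSI_T_TMF            0
#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0

struct virtio_scsi_ctrl_tmf {
    /* read-only part */
    uint32_t type;
    uint32_t subtype;
    uint8_t  lun[8];
    uint64_t id;
    /* write-only part */
    uint8_t  response;
};

extern void encode_lun(uint8_t lun[8], uint8_t target, uint16_t lun_id);
extern void controlq_send(struct virtio_scsi_ctrl_tmf *tmf); /* hypothetical */

static uint8_t abort_task(uint8_t target, uint16_t lun_id, uint64_t tag)
{
    struct virtio_scsi_ctrl_tmf tmf = {
        .type    = VIRTIO_SCSI_T_TMF,
        .subtype = VIRTIO_SCSI_T_TMF_ABORT_TASK,
        .id      = tag, /* matched against the id passed on the requestq */
    };

    encode_lun(tmf.lun, target, lun_id);
    controlq_send(&tmf);
    return tmf.response;
}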
2726 | |||
2727 | Asynchronous notification query | ||
2728 | #define VIRTIO_SCSI_T_AN_QUERY 1 | ||
2729 | |||
2730 | |||
2731 | |||
2732 | struct virtio_scsi_ctrl_an { | ||
2733 | |||
2734 | // Read-only part | ||
2735 | |||
2736 | u32 type; | ||
2737 | |||
2738 | u8 lun[8]; | ||
2739 | |||
2740 | u32 event_requested; | ||
2741 | |||
2742 | // Write-only part | ||
2743 | |||
2744 | u32 event_actual; | ||
2745 | |||
2746 | u8 response; | ||
2747 | |||
2748 | } | ||
2749 | |||
2750 | |||
2751 | |||
2752 | #define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 | ||
2753 | |||
2754 | #define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 | ||
2755 | |||
2756 | #define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 | ||
2757 | |||
2758 | #define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 | ||
2759 | |||
2760 | #define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 | ||
2761 | |||
2762 | #define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 | ||
2763 | |||
2764 | By sending this command, the driver asks the device which | ||
2765 | events the given LUN can report, as described in paragraphs 6.6 | ||
2766 | and A.6 of the SCSI MMC specification. The driver writes the | ||
2767 | events it is interested in into the event_requested; the device | ||
2768 | responds by writing the events that it supports into | ||
2769 | event_actual. | ||
2770 | |||
2771 | The type is VIRTIO_SCSI_T_AN_QUERY. The lun and event_requested | ||
2772 | fields are written by the driver. The event_actual and response | ||
2773 | fields are written by the device. | ||
2774 | |||
2775 | No command-specific values are defined for the response byte. | ||
2776 | |||
2777 | Asynchronous notification subscription | ||
2778 | #define VIRTIO_SCSI_T_AN_SUBSCRIBE 2 | ||
2779 | |||
2780 | |||
2781 | |||
2782 | struct virtio_scsi_ctrl_an { | ||
2783 | |||
2784 | // Read-only part | ||
2785 | |||
2786 | u32 type; | ||
2787 | |||
2788 | u8 lun[8]; | ||
2789 | |||
2790 | u32 event_requested; | ||
2791 | |||
2792 | // Write-only part | ||
2793 | |||
2794 | u32 event_actual; | ||
2795 | |||
2796 | u8 response; | ||
2797 | |||
2798 | } | ||
2799 | |||
2800 | By sending this command, the driver asks the specified LUN to | ||
2801 | report events for its physical interface, again as described in | ||
2802 | the SCSI MMC specification. The driver writes the events it is | ||
2803 | interested in into the event_requested; the device responds by | ||
2804 | writing the events that it supports into event_actual. | ||
2805 | |||
2806 | Event types are the same as for the asynchronous notification | ||
2807 | query message. | ||
2808 | |||
2809 | The type is VIRTIO_SCSI_T_AN_SUBSCRIBE. The lun and | ||
2810 | event_requested fields are written by the driver. The | ||
2811 | event_actual and response fields are written by the device. | ||
2812 | |||
2813 | No command-specific values are defined for the response byte. | ||
2814 | |||
2815 | Device Operation: eventq | ||
2816 | |||
2817 | The eventq is used by the device to report information on logical | ||
2818 | units that are attached to it. The driver should always leave a | ||
2819 | few buffers ready in the eventq. In general, the device will not | ||
2820 | queue events to cope with an empty eventq, and will end up | ||
2821 | dropping events if it finds no buffer ready. However, when | ||
2822 | reporting events for many LUNs (e.g. when a whole target | ||
2823 | disappears), the device can throttle events to avoid dropping | ||
2824 | them. For this reason, placing 10-15 buffers on the event queue | ||
2825 | should be enough. | ||
2826 | |||
2827 | Buffers are placed in the eventq and filled by the device when | ||
2828 | interesting events occur. The buffers should be strictly | ||
2829 | write-only (device-filled) and the size of the buffers should be | ||
2830 | at least the value given in the device's configuration | ||
2831 | information. | ||
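A minimal sketch of keeping the eventq populated, using the Linux virtqueue API. The virtio_scsi structure, its event_vq field and the event_size value cached from the device's configuration are assumptions of this sketch, not names from the specification:

	static void fill_eventq(struct virtio_scsi *vscsi)
	{
		int i;

		/* 10-15 buffers should be enough; see the guidance above. */
		for (i = 0; i < 12; i++) {
			struct scatterlist sg;
			void *buf;

			/* At least the size given in the device configuration. */
			buf = kzalloc(vscsi->event_size, GFP_KERNEL);
			if (!buf)
				break;
			sg_init_one(&sg, buf, vscsi->event_size);
			/* Strictly device-filled, so a single "in" descriptor. */
			if (virtqueue_add_inbuf(vscsi->event_vq, &sg, 1,
						buf, GFP_KERNEL) < 0) {
				kfree(buf);
				break;
			}
		}
		virtqueue_kick(vscsi->event_vq);
	}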
2832 | |||
2833 | Buffers returned by the device on the eventq will be referred to | ||
2834 | as "events" in the rest of this section. Events have the | ||
2835 | following format: | ||
2836 | |||
2837 | #define VIRTIO_SCSI_T_EVENTS_MISSED 0x80000000 | ||
2838 | |||
2839 | |||
2840 | |||
2841 | struct virtio_scsi_event { | ||
2842 | |||
2843 | // Write-only part | ||
2844 | |||
2845 | u32 event; | ||
2846 | |||
2847 | ... | ||
2848 | |||
2849 | } | ||
2850 | |||
2851 | If bit 31 is set in the event field, the device failed to report | ||
2852 | an event due to missing buffers. In this case, the driver should | ||
2853 | poll the logical units for unit attention conditions, and/or do | ||
2854 | whatever form of bus scan is appropriate for the guest operating | ||
2855 | system. | ||
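A sketch of the recovery path just described; scan_for_unit_attention() stands in for whatever polling or bus-scan mechanism the guest operating system provides:

	struct virtio_scsi_event *ev = buf;	/* buffer returned on the eventq */

	if (ev->event & VIRTIO_SCSI_T_EVENTS_MISSED) {
		/* Events were dropped: resynchronize by polling/rescanning. */
		scan_for_unit_attention();	/* hypothetical */
		ev->event &= ~VIRTIO_SCSI_T_EVENTS_MISSED;
	}
	/* ...then dispatch on the event types defined below. */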
2856 | |||
2857 | Other data that the device writes to the buffer depends on the | ||
2858 | contents of the event field. The following events are defined: | ||
2859 | |||
2860 | No event | ||
2861 | #define VIRTIO_SCSI_T_NO_EVENT 0 | ||
2862 | |||
2863 | This event is fired in the following cases: | ||
2864 | |||
2865 | When the device detects a buffer in the eventq that is shorter | ||
2866 | than indicated in the configuration field, it may use the | ||
2867 | buffer immediately and put this dummy value in the event | ||
2868 | field. A well-written driver will never observe this | ||
2869 | situation. | ||
2870 | |||
2871 | When events are dropped, the device may signal this event as | ||
2872 | soon as the driver makes a buffer available, in order to | ||
2873 | request action from the driver. In this case, of course, this | ||
2874 | event will be reported with the VIRTIO_SCSI_T_EVENTS_MISSED | ||
2875 | flag. | ||
2876 | |||
2877 | Transport reset | ||
2878 | #define VIRTIO_SCSI_T_TRANSPORT_RESET 1 | ||
2879 | |||
2880 | |||
2881 | |||
2882 | struct virtio_scsi_event_reset { | ||
2883 | |||
2884 | // Write-only part | ||
2885 | |||
2886 | u32 event; | ||
2887 | |||
2888 | u8 lun[8]; | ||
2889 | |||
2890 | u32 reason; | ||
2891 | |||
2892 | } | ||
2893 | |||
2894 | |||
2895 | |||
2896 | #define VIRTIO_SCSI_EVT_RESET_HARD 0 | ||
2897 | |||
2898 | #define VIRTIO_SCSI_EVT_RESET_RESCAN 1 | ||
2899 | |||
2900 | #define VIRTIO_SCSI_EVT_RESET_REMOVED 2 | ||
2901 | |||
2902 | By sending this event, the device signals that a logical unit | ||
2903 | on a target has been reset, including the case of a new device | ||
2904 | appearing or disappearing on the bus. The device fills in all | ||
2905 | fields. The event field is set to | ||
2906 | VIRTIO_SCSI_T_TRANSPORT_RESET. The lun field addresses a | ||
2907 | logical unit in the SCSI host. | ||
2908 | |||
2909 | The reason value is one of the three #define values appearing | ||
2910 | above: | ||
2911 | |||
2912 | VIRTIO_SCSI_EVT_RESET_REMOVED (“LUN/target removed”) is used if | ||
2913 | the target or logical unit is no longer able to receive | ||
2914 | commands. | ||
2915 | |||
2916 | VIRTIO_SCSI_EVT_RESET_HARD (“LUN hard reset”) is used if the | ||
2917 | logical unit has been reset, but is still present. | ||
2918 | |||
2919 | VIRTIO_SCSI_EVT_RESET_RESCAN (“rescan LUN/target”) is used if a | ||
2920 | target or logical unit has just appeared on the device. | ||
2921 | |||
2922 | The “removed” and “rescan” events, when sent for LUN 0, may | ||
2923 | apply to the entire target. After receiving them the driver | ||
2924 | should ask the initiator to rescan the target, in order to | ||
2925 | detect the case when an entire target has appeared or | ||
2926 | disappeared. These two events will never be reported unless the | ||
2927 | VIRTIO_SCSI_F_HOTPLUG feature was negotiated between the host | ||
2928 | and the guest. | ||
2929 | |||
2930 | Events will also be reported via sense codes (this obviously | ||
2931 | does not apply to newly appeared buses or targets, since the | ||
2932 | application has never discovered them): | ||
2933 | |||
2934 | “LUN/target removed” maps to sense key ILLEGAL REQUEST, asc | ||
2935 | 0x25, ascq 0x00 (LOGICAL UNIT NOT SUPPORTED) | ||
2936 | |||
2937 | “LUN hard reset” maps to sense key UNIT ATTENTION, asc 0x29 | ||
2938 | (POWER ON, RESET OR BUS DEVICE RESET OCCURRED) | ||
2939 | |||
2940 | “rescan LUN/target” maps to sense key UNIT ATTENTION, asc 0x3f, | ||
2941 | ascq 0x0e (REPORTED LUNS DATA HAS CHANGED) | ||
2942 | |||
2943 | The preferred way to detect transport reset is always to use | ||
2944 | events, because sense codes are only seen by the driver when it | ||
2945 | sends a SCSI command to the logical unit or target. However, in | ||
2946 | case events are dropped, the initiator will still be able to | ||
2947 | synchronize with the actual state of the controller if the | ||
2948 | driver asks the initiator to rescan the SCSI bus. During the | ||
2949 | rescan, the initiator will be able to observe the above sense | ||
2950 | codes, and it will process them as if the driver had | ||
2951 | received the equivalent event. | ||
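A sketch of acting on the reason codes above; the remove_lun(), reset_lun() and rescan_target() helpers are hypothetical hooks into the guest OS's SCSI layer:

	struct virtio_scsi_event_reset *ev = buf;

	switch (ev->reason) {
	case VIRTIO_SCSI_EVT_RESET_REMOVED:
		remove_lun(ev->lun);	/* can no longer receive commands */
		break;
	case VIRTIO_SCSI_EVT_RESET_HARD:
		reset_lun(ev->lun);	/* still present, but was reset */
		break;
	case VIRTIO_SCSI_EVT_RESET_RESCAN:
		rescan_target(ev->lun);	/* new LUN/target appeared */
		break;
	}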
2952 | |||
2953 | Asynchronous notification | ||
2954 | #define VIRTIO_SCSI_T_ASYNC_NOTIFY 2 | ||
2955 | |||
2956 | |||
2957 | |||
2958 | struct virtio_scsi_event_an { | ||
2959 | |||
2960 | // Write-only part | ||
2961 | |||
2962 | u32 event; | ||
2963 | |||
2964 | u8 lun[8]; | ||
2965 | |||
2966 | u32 reason; | ||
2967 | |||
2968 | } | ||
2969 | |||
2970 | By sending this event, the device signals that an asynchronous | ||
2971 | event was fired from a physical interface. | ||
2972 | |||
2973 | All fields are written by the device. The event field is set to | ||
2974 | VIRTIO_SCSI_T_ASYNC_NOTIFY. The lun field addresses a logical | ||
2975 | unit in the SCSI host. The reason field is a subset of the | ||
2976 | events that the driver has subscribed to via the "Asynchronous | ||
2977 | notification subscription" command. | ||
2978 | |||
2979 | When dropped events are reported, the driver should poll for | ||
2980 | asynchronous events manually using SCSI commands. | ||
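A sketch of consuming such an event; handle_media_change() is a hypothetical per-LUN handler in the driver:

	struct virtio_scsi_event_an *ev = buf;

	if (ev->reason & VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE)
		handle_media_change(ev->lun);	/* hypothetical */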
2981 | |||
2982 | Appendix X: virtio-mmio | ||
2983 | |||
2984 | Virtual environments without PCI support (a common situation in | ||
2985 | embedded device models) might use a simple memory mapped device | ||
2986 | (“virtio-mmio”) instead of the PCI device. | ||
2987 | |||
2988 | The memory mapped virtio device behaviour is based on the PCI | ||
2989 | device specification. Therefore most operations, including | ||
2990 | device initialization, queue configuration and buffer | ||
2991 | transfers, are nearly identical. The differences are described | ||
2992 | in the following sections. | ||
2993 | |||
2994 | Device Initialization | ||
2995 | |||
2996 | Instead of using the PCI I/O space for the virtio header, the | ||
2997 | “virtio-mmio” device provides a set of memory mapped control | ||
2998 | registers, all 32 bits wide, followed by device-specific | ||
2999 | configuration space. The following list presents their layout: | ||
3000 | |||
3001 | Offset from the device base address | Direction | Name | ||
3002 | Description | ||
3003 | |||
3004 | 0x000 | R | MagicValue | ||
3005 | “virt” string. | ||
3006 | |||
3007 | 0x004 | R | Version | ||
3008 | Device version number. Currently must be 1. | ||
3009 | |||
3010 | 0x008 | R | DeviceID | ||
3011 | Virtio Subsystem Device ID (e.g. 1 for a network card). | ||
3012 | |||
3013 | 0x00c | R | VendorID | ||
3014 | Virtio Subsystem Vendor ID. | ||
3015 | |||
3016 | 0x010 | R | HostFeatures | ||
3017 | Flags representing features the device supports. | ||
3018 | Reading from this register returns 32 consecutive flag bits, | ||
3019 | the first bit depending on the last value written to the | ||
3020 | HostFeaturesSel register. Access to this register returns | ||
3021 | bits HostFeaturesSel*32 to (HostFeaturesSel*32)+31, | ||
3022 | e.g. feature bits 0 to 31 if HostFeaturesSel is set to 0 | ||
3023 | and feature bits 32 to 63 if HostFeaturesSel is set to 1 | ||
3024 | (see the example after this register list). | ||
3025 | Also see [sub:Feature-Bits]. | ||
3026 | |||
3027 | 0x014 | W | HostFeaturesSel | ||
3028 | Device (Host) features word selection. | ||
3029 | Writing to this register selects a set of 32 device feature bits | ||
3030 | accessible by reading from the HostFeatures register. The device | ||
3031 | driver must write a value to the HostFeaturesSel register before | ||
3032 | reading from the HostFeatures register. | ||
3033 | |||
3034 | 0x020 | W | GuestFeatures | ||
3035 | Flags representing device features understood and activated by | ||
3036 | the driver. | ||
3037 | Writing to this register sets 32 consecutive flag bits, the | ||
3038 | first bit depending on the last value written to the | ||
3039 | GuestFeaturesSel register. Access to this register sets | ||
3040 | bits GuestFeaturesSel*32 to (GuestFeaturesSel*32)+31, | ||
3041 | e.g. feature bits 0 to 31 if GuestFeaturesSel is set to 0 | ||
3042 | and feature bits 32 to 63 if GuestFeaturesSel is set to 1. | ||
3043 | Also see [sub:Feature-Bits]. | ||
3045 | |||
3046 | 0x024 | W | GuestFeaturesSel | ||
3047 | Activated (Guest) features word selection. | ||
3048 | Writing to this register selects a set of 32 activated feature | ||
3049 | bits accessible by writing to the GuestFeatures register. | ||
3050 | The device driver must write a value to the GuestFeaturesSel | ||
3051 | register before writing to the GuestFeatures register. | ||
3052 | |||
3053 | 0x028 | W | GuestPageSize | ||
3054 | Guest page size. | ||
3055 | The device driver must write the guest page size in bytes to | ||
3056 | this register during initialization, before any queues are used. | ||
3057 | This value must be a power of 2 and is used by the Host to | ||
3058 | calculate Guest address of the first queue page (see QueuePFN). | ||
3059 | |||
3060 | 0x030 | W | QueueSel | ||
3061 | Virtual queue index (first queue is 0). | ||
3062 | Writing to this register selects the virtual queue that the | ||
3063 | following operations on QueueNum, QueueAlign and QueuePFN apply | ||
3064 | to. | ||
3065 | |||
3066 | 0x034 | R | QueueNumMax | ||
3067 | Maximum virtual queue size. | ||
3068 | Reading from the register returns the maximum size of the queue | ||
3069 | the Host is ready to process or zero (0x0) if the queue is not | ||
3070 | available. This applies to the queue selected by writing to | ||
3071 | QueueSel and is allowed only when QueuePFN is set to zero | ||
3072 | (0x0), so when the queue is not actively used. | ||
3073 | |||
3074 | 0x038 | W | QueueNum | ||
3075 | Virtual queue size. | ||
3076 | The queue size is the number of elements in the queue, and | ||
3077 | therefore determines the size of the descriptor table and of | ||
3078 | both the available and used rings. Writing to this register | ||
3079 | notifies the Host of the queue size the Guest will use. This | ||
3080 | applies to the queue selected by writing to QueueSel. | ||
3081 | |||
3082 | 0x03c | W | QueueAlign | ||
3083 | Used Ring alignment in the virtual queue. | ||
3084 | Writing to this register notifies the Host about alignment | ||
3085 | boundary of the Used Ring in bytes. This value must be a power | ||
3086 | of 2 and applies to the queue selected by writing to QueueSel. | ||
3087 | |||
3088 | 0x040 | RW | QueuePFN | ||
3089 | Guest physical page number of the virtual queue. | ||
3090 | Writing to this register notifies the Host about the location of | ||
3091 | the virtual queue in the Guest's physical address space. This value | ||
3092 | is the index number of a page starting with the queue | ||
3093 | Descriptor Table. Value zero (0x0) means physical address zero | ||
3094 | (0x00000000) and is illegal. When the Guest stops using the | ||
3095 | queue it must write zero (0x0) to this register. | ||
3096 | Reading from this register returns the currently used page | ||
3097 | number of the queue, therefore a value other than zero (0x0) | ||
3098 | means that the queue is in use. | ||
3099 | Both read and write accesses apply to the queue selected by | ||
3100 | writing to QueueSel. | ||
3101 | |||
3102 | 0x050 | W | QueueNotify | ||
3103 | Queue notifier. | ||
3104 | Writing a queue index to this register notifies the Host that | ||
3105 | there are new buffers to process in the queue. | ||
3106 | |||
3107 | 0x060 | R | InterruptStatus | ||
3108 | Interrupt status. | ||
3109 | Reading from this register returns a bit mask of interrupts | ||
3110 | asserted by the device. An interrupt is asserted if the | ||
3111 | corresponding bit is set, i.e. equals one (1). | ||
3112 | |||
3113 | Bit 0 | Used Ring Update | ||
3114 | This interrupt is asserted when the Host has updated the Used | ||
3115 | Ring in at least one of the active virtual queues. | ||
3116 | |||
3117 | Bit 1 | Configuration change | ||
3118 | This interrupt is asserted when configuration of the device has | ||
3119 | changed. | ||
3120 | |||
3121 | 0x064 | W | InterruptACK | ||
3122 | Interrupt acknowledge. | ||
3123 | Writing to this register notifies the Host that the Guest | ||
3124 | has finished handling interrupts. Set bits in the value clear the | ||
3125 | corresponding bits of the InterruptStatus register. | ||
3126 | |||
3127 | 0x070 | RW | Status | ||
3128 | Device status. | ||
3129 | Reading from this register returns the current device status | ||
3130 | flags. | ||
3131 | Writing non-zero values to this register sets the status flags, | ||
3132 | indicating the Guest's progress. Writing zero (0x0) to this | ||
3133 | register triggers a device reset. | ||
3134 | Also see [sub:Device-Initialization-Sequence] | ||
3135 | |||
3136 | 0x100+ | RW | Config | ||
3137 | Device-specific configuration space starts at offset 0x100 | ||
3138 | and is accessed with byte alignment. Its meaning and size | ||
3139 | depend on the device and the driver. | ||
3140 | |||
3141 | The virtual queue size is the number of elements in the queue; | ||
3142 | it therefore determines the size of the descriptor table and of | ||
3143 | both the available and used rings. | ||
3144 | |||
3145 | The endianness of the registers follows the native endianness of | ||
3146 | the Guest. Writing to registers described as “R” and reading from | ||
3147 | registers described as “W” is not permitted and can cause | ||
3148 | undefined behavior. | ||
3149 | |||
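As referenced from the HostFeatures description above, the following is a minimal sketch, not part of the specification, of 64-bit feature negotiation through the windowed feature registers. Register offsets are taken from the list above; base is an assumed ioremap()ed pointer to the device:

	static u64 read_host_features(void __iomem *base)
	{
		u64 f;

		writel(0, base + 0x014);		/* HostFeaturesSel = 0 */
		f = readl(base + 0x010);		/* HostFeatures: bits 0-31 */
		writel(1, base + 0x014);		/* HostFeaturesSel = 1 */
		f |= (u64)readl(base + 0x010) << 32;	/* bits 32-63 */
		return f;
	}

	static void write_guest_features(void __iomem *base, u64 f)
	{
		writel(0, base + 0x024);		/* GuestFeaturesSel = 0 */
		writel((u32)f, base + 0x020);		/* GuestFeatures: bits 0-31 */
		writel(1, base + 0x024);		/* GuestFeaturesSel = 1 */
		writel((u32)(f >> 32), base + 0x020);	/* bits 32-63 */
	}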
3150 | The device initialization is performed as described in [sub:Device-Initialization-Sequence] | ||
3151 | with one exception: the Guest must notify the Host about its | ||
3152 | page size, writing the size in bytes to the GuestPageSize | ||
3153 | register before the initialization is finished. | ||
3154 | |||
3155 | The memory mapped virtio device generates a single interrupt | ||
3156 | only, therefore no special interrupt configuration is required. | ||
3157 | |||
3158 | Virtqueue Configuration | ||
3159 | |||
3160 | The virtual queue configuration is performed in a similar way to | ||
3161 | the one described in [sec:Virtqueue-Configuration], with a few | ||
3162 | additional operations (a sketch of the sequence follows this list): | ||
3163 | |||
3164 | Select the queue by writing its index (first queue is 0) to the | ||
3165 | QueueSel register. | ||
3166 | |||
3167 | Check that the queue is not already in use: read the QueuePFN | ||
3168 | register; the returned value should be zero (0x0). | ||
3169 | |||
3170 | Read the maximum queue size (number of elements) from the | ||
3171 | QueueNumMax register. If the returned value is zero (0x0), the | ||
3172 | queue is not available. | ||
3173 | |||
3174 | Allocate and zero the queue pages in contiguous virtual memory, | ||
3175 | aligning the Used Ring to an optimal boundary (usually page | ||
3176 | size). The size of the allocated queue may be smaller than or | ||
3177 | equal to the maximum size returned by the Host. | ||
3178 | |||
3179 | Notify the Host about the queue size by writing the size to the | ||
3180 | QueueNum register. | ||
3181 | |||
3182 | Notify the Host about the Used Ring alignment by writing its | ||
3183 | value in bytes to the QueueAlign register. | ||
3184 | |||
3185 | Write the physical number of the first page of the queue to the | ||
3186 | QueuePFN register. | ||
3187 | |||
3188 | The queue and the device are now ready for normal operation. | ||
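A sketch of the sequence above, assuming a Linux guest: base is an assumed ioremap()ed pointer, GuestPageSize is assumed to have been written during device initialization, and vring_size()/alloc_pages_exact() stand in for whatever allocator the driver uses:

	static int setup_queue(void __iomem *base, u32 index)
	{
		u32 num;
		void *queue;
		size_t size;

		writel(index, base + 0x030);		/* QueueSel */
		if (readl(base + 0x040))		/* QueuePFN != 0: in use */
			return -EBUSY;

		num = readl(base + 0x034);		/* QueueNumMax */
		if (!num)
			return -ENOENT;			/* queue not available */

		/* Descriptor table + available and used rings, zeroed. */
		size = PAGE_ALIGN(vring_size(num, PAGE_SIZE));
		queue = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
		if (!queue)
			return -ENOMEM;

		writel(num, base + 0x038);		/* QueueNum */
		writel(PAGE_SIZE, base + 0x03c);	/* QueueAlign */
		/* Physical page number of the first queue page. */
		writel(virt_to_phys(queue) >> PAGE_SHIFT, base + 0x040);
		return 0;
	}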
3190 | |||
3191 | Device Operation | ||
3192 | |||
3193 | The memory mapped virtio device behaves in the same way as | ||
3194 | described in [sec:Device-Operation], with the following | ||
3195 | exceptions: | ||
3196 | |||
3197 | The device is notified about new buffers available in a queue | ||
3198 | by writing the queue index to the QueueNotify register instead | ||
3199 | of the virtio header in PCI I/O space ([sub:Notifying-The-Device]). | ||
3200 | |||
3201 | The memory mapped virtio device uses a single, dedicated | ||
3202 | interrupt signal, which is raised when at least one of the | ||
3203 | interrupts described in the InterruptStatus register | ||
3204 | description is asserted. After receiving an interrupt, the | ||
3205 | driver must read the InterruptStatus register to check what | ||
3206 | caused the interrupt (see the register description). After the | ||
3207 | interrupt is handled, the driver must acknowledge it by writing | ||
3208 | a bit mask corresponding to the serviced interrupt to the | ||
3209 | InterruptACK register, as sketched below. | ||
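A sketch of the interrupt handling sequence just described; vq_serviced() and config_changed() are hypothetical driver callbacks, and base is the assumed ioremap()ed device pointer:

	static irqreturn_t vm_interrupt(int irq, void *opaque)
	{
		void __iomem *base = opaque;
		u32 status;

		status = readl(base + 0x060);	/* InterruptStatus */
		if (!status)
			return IRQ_NONE;

		if (status & 0x1)		/* bit 0: Used Ring update */
			vq_serviced();
		if (status & 0x2)		/* bit 1: configuration change */
			config_changed();

		/* Acknowledge: clear the bits we just serviced. */
		writel(status, base + 0x064);	/* InterruptACK */
		return IRQ_HANDLED;
	}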
3210 | |||
diff --git a/MAINTAINERS b/MAINTAINERS index ee468fac7dbf..b57e2765a342 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
@@ -8743,6 +8743,7 @@ F: drivers/virtio/ | |||
8743 | F: drivers/net/virtio_net.c | 8743 | F: drivers/net/virtio_net.c |
8744 | F: drivers/block/virtio_blk.c | 8744 | F: drivers/block/virtio_blk.c |
8745 | F: include/linux/virtio_*.h | 8745 | F: include/linux/virtio_*.h |
8746 | F: include/uapi/linux/virtio_*.h | ||
8746 | 8747 | ||
8747 | VIRTIO HOST (VHOST) | 8748 | VIRTIO HOST (VHOST) |
8748 | M: "Michael S. Tsirkin" <mst@redhat.com> | 8749 | M: "Michael S. Tsirkin" <mst@redhat.com> |
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 0d97deba1e35..e2d4a4afa8c3 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h | |||
@@ -11,18 +11,11 @@ | |||
11 | 11 | ||
12 | #define GUEST_PL 1 | 12 | #define GUEST_PL 1 |
13 | 13 | ||
14 | /* Every guest maps the core switcher code. */ | 14 | /* Page for Switcher text itself, then two pages per cpu */ |
15 | #define SHARED_SWITCHER_PAGES \ | 15 | #define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids) |
16 | DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) | 16 | |
17 | /* Pages for switcher itself, then two pages per cpu */ | 17 | /* Where we map the Switcher, in both Host and Guest. */ |
18 | #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) | 18 | extern unsigned long switcher_addr; |
19 | |||
20 | /* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ | ||
21 | #ifdef CONFIG_X86_PAE | ||
22 | #define SWITCHER_ADDR 0xFFE00000 | ||
23 | #else | ||
24 | #define SWITCHER_ADDR 0xFFC00000 | ||
25 | #endif | ||
26 | 19 | ||
27 | /* Found in switcher.S */ | 20 | /* Found in switcher.S */ |
28 | extern unsigned long default_idt_entries[]; | 21 | extern unsigned long default_idt_entries[]; |
diff --git a/block/blk-integrity.c b/block/blk-integrity.c index dabd221857e1..03cf7179e8ef 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c | |||
@@ -110,7 +110,7 @@ new_segment: | |||
110 | if (!sg) | 110 | if (!sg) |
111 | sg = sglist; | 111 | sg = sglist; |
112 | else { | 112 | else { |
113 | sg->page_link &= ~0x02; | 113 | sg_unmark_end(sg); |
114 | sg = sg_next(sg); | 114 | sg = sg_next(sg); |
115 | } | 115 | } |
116 | 116 | ||
diff --git a/block/blk-merge.c b/block/blk-merge.c index 936a110de0b9..5f2448253797 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c | |||
@@ -143,7 +143,7 @@ new_segment: | |||
143 | * termination bit to avoid doing a full | 143 | * termination bit to avoid doing a full |
144 | * sg_init_table() in drivers for each command. | 144 | * sg_init_table() in drivers for each command. |
145 | */ | 145 | */ |
146 | (*sg)->page_link &= ~0x02; | 146 | sg_unmark_end(*sg); |
147 | *sg = sg_next(*sg); | 147 | *sg = sg_next(*sg); |
148 | } | 148 | } |
149 | 149 | ||
diff --git a/drivers/Makefile b/drivers/Makefile index 33360de63650..8e57688ebd95 100644 --- a/drivers/Makefile +++ b/drivers/Makefile | |||
@@ -124,7 +124,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/ | |||
124 | obj-$(CONFIG_OF) += of/ | 124 | obj-$(CONFIG_OF) += of/ |
125 | obj-$(CONFIG_SSB) += ssb/ | 125 | obj-$(CONFIG_SSB) += ssb/ |
126 | obj-$(CONFIG_BCMA) += bcma/ | 126 | obj-$(CONFIG_BCMA) += bcma/ |
127 | obj-$(CONFIG_VHOST_NET) += vhost/ | 127 | obj-$(CONFIG_VHOST_RING) += vhost/ |
128 | obj-$(CONFIG_VLYNQ) += vlynq/ | 128 | obj-$(CONFIG_VLYNQ) += vlynq/ |
129 | obj-$(CONFIG_STAGING) += staging/ | 129 | obj-$(CONFIG_STAGING) += staging/ |
130 | obj-y += platform/ | 130 | obj-y += platform/ |
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 8ad21a25bc0d..64723953e1c9 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c | |||
@@ -100,96 +100,103 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, | |||
100 | return vbr; | 100 | return vbr; |
101 | } | 101 | } |
102 | 102 | ||
103 | static void virtblk_add_buf_wait(struct virtio_blk *vblk, | 103 | static int __virtblk_add_req(struct virtqueue *vq, |
104 | struct virtblk_req *vbr, | 104 | struct virtblk_req *vbr, |
105 | unsigned long out, | 105 | struct scatterlist *data_sg, |
106 | unsigned long in) | 106 | bool have_data) |
107 | { | 107 | { |
108 | DEFINE_WAIT(wait); | 108 | struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6]; |
109 | unsigned int num_out = 0, num_in = 0; | ||
110 | int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT; | ||
109 | 111 | ||
110 | for (;;) { | 112 | sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); |
111 | prepare_to_wait_exclusive(&vblk->queue_wait, &wait, | 113 | sgs[num_out++] = &hdr; |
112 | TASK_UNINTERRUPTIBLE); | ||
113 | 114 | ||
114 | spin_lock_irq(vblk->disk->queue->queue_lock); | 115 | /* |
115 | if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, | 116 | * If this is a packet command we need a couple of additional headers. |
116 | GFP_ATOMIC) < 0) { | 117 | * Behind the normal outhdr we put a segment with the scsi command |
117 | spin_unlock_irq(vblk->disk->queue->queue_lock); | 118 | * block, and before the normal inhdr we put the sense data and the |
118 | io_schedule(); | 119 | * inhdr with additional status information. |
119 | } else { | 120 | */ |
120 | virtqueue_kick(vblk->vq); | 121 | if (type == VIRTIO_BLK_T_SCSI_CMD) { |
121 | spin_unlock_irq(vblk->disk->queue->queue_lock); | 122 | sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len); |
122 | break; | 123 | sgs[num_out++] = &cmd; |
123 | } | 124 | } |
124 | 125 | ||
126 | if (have_data) { | ||
127 | if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT) | ||
128 | sgs[num_out++] = data_sg; | ||
129 | else | ||
130 | sgs[num_out + num_in++] = data_sg; | ||
125 | } | 131 | } |
126 | 132 | ||
127 | finish_wait(&vblk->queue_wait, &wait); | 133 | if (type == VIRTIO_BLK_T_SCSI_CMD) { |
134 | sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE); | ||
135 | sgs[num_out + num_in++] = &sense; | ||
136 | sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr)); | ||
137 | sgs[num_out + num_in++] = &inhdr; | ||
138 | } | ||
139 | |||
140 | sg_init_one(&status, &vbr->status, sizeof(vbr->status)); | ||
141 | sgs[num_out + num_in++] = &status; | ||
142 | |||
143 | return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); | ||
128 | } | 144 | } |
129 | 145 | ||
130 | static inline void virtblk_add_req(struct virtblk_req *vbr, | 146 | static void virtblk_add_req(struct virtblk_req *vbr, bool have_data) |
131 | unsigned int out, unsigned int in) | ||
132 | { | 147 | { |
133 | struct virtio_blk *vblk = vbr->vblk; | 148 | struct virtio_blk *vblk = vbr->vblk; |
149 | DEFINE_WAIT(wait); | ||
150 | int ret; | ||
134 | 151 | ||
135 | spin_lock_irq(vblk->disk->queue->queue_lock); | 152 | spin_lock_irq(vblk->disk->queue->queue_lock); |
136 | if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, | 153 | while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg, |
137 | GFP_ATOMIC) < 0)) { | 154 | have_data)) < 0)) { |
155 | prepare_to_wait_exclusive(&vblk->queue_wait, &wait, | ||
156 | TASK_UNINTERRUPTIBLE); | ||
157 | |||
138 | spin_unlock_irq(vblk->disk->queue->queue_lock); | 158 | spin_unlock_irq(vblk->disk->queue->queue_lock); |
139 | virtblk_add_buf_wait(vblk, vbr, out, in); | 159 | io_schedule(); |
140 | return; | 160 | spin_lock_irq(vblk->disk->queue->queue_lock); |
161 | |||
162 | finish_wait(&vblk->queue_wait, &wait); | ||
141 | } | 163 | } |
164 | |||
142 | virtqueue_kick(vblk->vq); | 165 | virtqueue_kick(vblk->vq); |
143 | spin_unlock_irq(vblk->disk->queue->queue_lock); | 166 | spin_unlock_irq(vblk->disk->queue->queue_lock); |
144 | } | 167 | } |
145 | 168 | ||
146 | static int virtblk_bio_send_flush(struct virtblk_req *vbr) | 169 | static void virtblk_bio_send_flush(struct virtblk_req *vbr) |
147 | { | 170 | { |
148 | unsigned int out = 0, in = 0; | ||
149 | |||
150 | vbr->flags |= VBLK_IS_FLUSH; | 171 | vbr->flags |= VBLK_IS_FLUSH; |
151 | vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; | 172 | vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; |
152 | vbr->out_hdr.sector = 0; | 173 | vbr->out_hdr.sector = 0; |
153 | vbr->out_hdr.ioprio = 0; | 174 | vbr->out_hdr.ioprio = 0; |
154 | sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); | ||
155 | sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status)); | ||
156 | |||
157 | virtblk_add_req(vbr, out, in); | ||
158 | 175 | ||
159 | return 0; | 176 | virtblk_add_req(vbr, false); |
160 | } | 177 | } |
161 | 178 | ||
162 | static int virtblk_bio_send_data(struct virtblk_req *vbr) | 179 | static void virtblk_bio_send_data(struct virtblk_req *vbr) |
163 | { | 180 | { |
164 | struct virtio_blk *vblk = vbr->vblk; | 181 | struct virtio_blk *vblk = vbr->vblk; |
165 | unsigned int num, out = 0, in = 0; | ||
166 | struct bio *bio = vbr->bio; | 182 | struct bio *bio = vbr->bio; |
183 | bool have_data; | ||
167 | 184 | ||
168 | vbr->flags &= ~VBLK_IS_FLUSH; | 185 | vbr->flags &= ~VBLK_IS_FLUSH; |
169 | vbr->out_hdr.type = 0; | 186 | vbr->out_hdr.type = 0; |
170 | vbr->out_hdr.sector = bio->bi_sector; | 187 | vbr->out_hdr.sector = bio->bi_sector; |
171 | vbr->out_hdr.ioprio = bio_prio(bio); | 188 | vbr->out_hdr.ioprio = bio_prio(bio); |
172 | 189 | ||
173 | sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); | 190 | if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) { |
174 | 191 | have_data = true; | |
175 | num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out); | 192 | if (bio->bi_rw & REQ_WRITE) |
176 | |||
177 | sg_set_buf(&vbr->sg[num + out + in++], &vbr->status, | ||
178 | sizeof(vbr->status)); | ||
179 | |||
180 | if (num) { | ||
181 | if (bio->bi_rw & REQ_WRITE) { | ||
182 | vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; | 193 | vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; |
183 | out += num; | 194 | else |
184 | } else { | ||
185 | vbr->out_hdr.type |= VIRTIO_BLK_T_IN; | 195 | vbr->out_hdr.type |= VIRTIO_BLK_T_IN; |
186 | in += num; | 196 | } else |
187 | } | 197 | have_data = false; |
188 | } | ||
189 | 198 | ||
190 | virtblk_add_req(vbr, out, in); | 199 | virtblk_add_req(vbr, have_data); |
191 | |||
192 | return 0; | ||
193 | } | 200 | } |
194 | 201 | ||
195 | static void virtblk_bio_send_data_work(struct work_struct *work) | 202 | static void virtblk_bio_send_data_work(struct work_struct *work) |
@@ -298,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq) | |||
298 | static bool do_req(struct request_queue *q, struct virtio_blk *vblk, | 305 | static bool do_req(struct request_queue *q, struct virtio_blk *vblk, |
299 | struct request *req) | 306 | struct request *req) |
300 | { | 307 | { |
301 | unsigned long num, out = 0, in = 0; | 308 | unsigned int num; |
302 | struct virtblk_req *vbr; | 309 | struct virtblk_req *vbr; |
303 | 310 | ||
304 | vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); | 311 | vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); |
@@ -335,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, | |||
335 | } | 342 | } |
336 | } | 343 | } |
337 | 344 | ||
338 | sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); | 345 | num = blk_rq_map_sg(q, vbr->req, vblk->sg); |
339 | |||
340 | /* | ||
341 | * If this is a packet command we need a couple of additional headers. | ||
342 | * Behind the normal outhdr we put a segment with the scsi command | ||
343 | * block, and before the normal inhdr we put the sense data and the | ||
344 | * inhdr with additional status information before the normal inhdr. | ||
345 | */ | ||
346 | if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) | ||
347 | sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len); | ||
348 | |||
349 | num = blk_rq_map_sg(q, vbr->req, vblk->sg + out); | ||
350 | |||
351 | if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) { | ||
352 | sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE); | ||
353 | sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr, | ||
354 | sizeof(vbr->in_hdr)); | ||
355 | } | ||
356 | |||
357 | sg_set_buf(&vblk->sg[num + out + in++], &vbr->status, | ||
358 | sizeof(vbr->status)); | ||
359 | |||
360 | if (num) { | 346 | if (num) { |
361 | if (rq_data_dir(vbr->req) == WRITE) { | 347 | if (rq_data_dir(vbr->req) == WRITE) |
362 | vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; | 348 | vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; |
363 | out += num; | 349 | else |
364 | } else { | ||
365 | vbr->out_hdr.type |= VIRTIO_BLK_T_IN; | 350 | vbr->out_hdr.type |= VIRTIO_BLK_T_IN; |
366 | in += num; | ||
367 | } | ||
368 | } | 351 | } |
369 | 352 | ||
370 | if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, | 353 | if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) { |
371 | GFP_ATOMIC) < 0) { | ||
372 | mempool_free(vbr, vblk->pool); | 354 | mempool_free(vbr, vblk->pool); |
373 | return false; | 355 | return false; |
374 | } | 356 | } |
@@ -539,6 +521,7 @@ static void virtblk_config_changed_work(struct work_struct *work) | |||
539 | struct virtio_device *vdev = vblk->vdev; | 521 | struct virtio_device *vdev = vblk->vdev; |
540 | struct request_queue *q = vblk->disk->queue; | 522 | struct request_queue *q = vblk->disk->queue; |
541 | char cap_str_2[10], cap_str_10[10]; | 523 | char cap_str_2[10], cap_str_10[10]; |
524 | char *envp[] = { "RESIZE=1", NULL }; | ||
542 | u64 capacity, size; | 525 | u64 capacity, size; |
543 | 526 | ||
544 | mutex_lock(&vblk->config_lock); | 527 | mutex_lock(&vblk->config_lock); |
@@ -568,6 +551,7 @@ static void virtblk_config_changed_work(struct work_struct *work) | |||
568 | 551 | ||
569 | set_capacity(vblk->disk, capacity); | 552 | set_capacity(vblk->disk, capacity); |
570 | revalidate_disk(vblk->disk); | 553 | revalidate_disk(vblk->disk); |
554 | kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp); | ||
571 | done: | 555 | done: |
572 | mutex_unlock(&vblk->config_lock); | 556 | mutex_unlock(&vblk->config_lock); |
573 | } | 557 | } |
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 6bf4d47324eb..ef46a9cfd832 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c | |||
@@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size) | |||
47 | sg_init_one(&sg, buf, size); | 47 | sg_init_one(&sg, buf, size); |
48 | 48 | ||
49 | /* There should always be room for one buffer. */ | 49 | /* There should always be room for one buffer. */ |
50 | if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0) | 50 | if (virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL) < 0) |
51 | BUG(); | 51 | BUG(); |
52 | 52 | ||
53 | virtqueue_kick(vq); | 53 | virtqueue_kick(vq); |
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index ce5f3fc25d6d..1b456fe9b87a 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c | |||
@@ -78,8 +78,8 @@ struct ports_driver_data { | |||
78 | }; | 78 | }; |
79 | static struct ports_driver_data pdrvdata; | 79 | static struct ports_driver_data pdrvdata; |
80 | 80 | ||
81 | DEFINE_SPINLOCK(pdrvdata_lock); | 81 | static DEFINE_SPINLOCK(pdrvdata_lock); |
82 | DECLARE_COMPLETION(early_console_added); | 82 | static DECLARE_COMPLETION(early_console_added); |
83 | 83 | ||
84 | /* This struct holds information that's relevant only for console ports */ | 84 | /* This struct holds information that's relevant only for console ports */ |
85 | struct console { | 85 | struct console { |
@@ -503,7 +503,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf) | |||
503 | 503 | ||
504 | sg_init_one(sg, buf->buf, buf->size); | 504 | sg_init_one(sg, buf->buf, buf->size); |
505 | 505 | ||
506 | ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC); | 506 | ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC); |
507 | virtqueue_kick(vq); | 507 | virtqueue_kick(vq); |
508 | if (!ret) | 508 | if (!ret) |
509 | ret = vq->num_free; | 509 | ret = vq->num_free; |
@@ -572,7 +572,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id, | |||
572 | sg_init_one(sg, &cpkt, sizeof(cpkt)); | 572 | sg_init_one(sg, &cpkt, sizeof(cpkt)); |
573 | 573 | ||
574 | spin_lock(&portdev->c_ovq_lock); | 574 | spin_lock(&portdev->c_ovq_lock); |
575 | if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) { | 575 | if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) { |
576 | virtqueue_kick(vq); | 576 | virtqueue_kick(vq); |
577 | while (!virtqueue_get_buf(vq, &len)) | 577 | while (!virtqueue_get_buf(vq, &len)) |
578 | cpu_relax(); | 578 | cpu_relax(); |
@@ -622,7 +622,7 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg, | |||
622 | 622 | ||
623 | reclaim_consumed_buffers(port); | 623 | reclaim_consumed_buffers(port); |
624 | 624 | ||
625 | err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC); | 625 | err = virtqueue_add_outbuf(out_vq, sg, nents, data, GFP_ATOMIC); |
626 | 626 | ||
627 | /* Tell Host to go! */ | 627 | /* Tell Host to go! */ |
628 | virtqueue_kick(out_vq); | 628 | virtqueue_kick(out_vq); |
@@ -1040,7 +1040,7 @@ static int port_fops_open(struct inode *inode, struct file *filp) | |||
1040 | spin_lock_irq(&port->inbuf_lock); | 1040 | spin_lock_irq(&port->inbuf_lock); |
1041 | if (port->guest_connected) { | 1041 | if (port->guest_connected) { |
1042 | spin_unlock_irq(&port->inbuf_lock); | 1042 | spin_unlock_irq(&port->inbuf_lock); |
1043 | ret = -EMFILE; | 1043 | ret = -EBUSY; |
1044 | goto out; | 1044 | goto out; |
1045 | } | 1045 | } |
1046 | 1046 | ||
@@ -1202,7 +1202,7 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)) | |||
1202 | return hvc_instantiate(0, 0, &hv_ops); | 1202 | return hvc_instantiate(0, 0, &hv_ops); |
1203 | } | 1203 | } |
1204 | 1204 | ||
1205 | int init_port_console(struct port *port) | 1205 | static int init_port_console(struct port *port) |
1206 | { | 1206 | { |
1207 | int ret; | 1207 | int ret; |
1208 | 1208 | ||
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index 89875ea19ade..ee035ec4526b 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig | |||
@@ -5,10 +5,9 @@ config LGUEST | |||
5 | ---help--- | 5 | ---help--- |
6 | This is a very simple module which allows you to run | 6 | This is a very simple module which allows you to run |
7 | multiple instances of the same Linux kernel, using the | 7 | multiple instances of the same Linux kernel, using the |
8 | "lguest" command found in the Documentation/virtual/lguest | 8 | "lguest" command found in the tools/lguest directory. |
9 | directory. | ||
10 | 9 | ||
11 | Note that "lguest" is pronounced to rhyme with "fell quest", | 10 | Note that "lguest" is pronounced to rhyme with "fell quest", |
12 | not "rustyvisor". See Documentation/virtual/lguest/lguest.txt. | 11 | not "rustyvisor". See tools/lguest/lguest.txt. |
13 | 12 | ||
14 | If unsure, say N. If curious, say M. If masochistic, say Y. | 13 | If unsure, say N. If curious, say M. If masochistic, say Y. |
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index a5ebc0083d87..0bf1e4edf04d 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c | |||
@@ -20,9 +20,9 @@ | |||
20 | #include <asm/asm-offsets.h> | 20 | #include <asm/asm-offsets.h> |
21 | #include "lg.h" | 21 | #include "lg.h" |
22 | 22 | ||
23 | 23 | unsigned long switcher_addr; | |
24 | struct page **lg_switcher_pages; | ||
24 | static struct vm_struct *switcher_vma; | 25 | static struct vm_struct *switcher_vma; |
25 | static struct page **switcher_page; | ||
26 | 26 | ||
27 | /* This One Big lock protects all inter-guest data structures. */ | 27 | /* This One Big lock protects all inter-guest data structures. */ |
28 | DEFINE_MUTEX(lguest_lock); | 28 | DEFINE_MUTEX(lguest_lock); |
@@ -52,13 +52,21 @@ static __init int map_switcher(void) | |||
52 | * easy. | 52 | * easy. |
53 | */ | 53 | */ |
54 | 54 | ||
55 | /* We assume Switcher text fits into a single page. */ | ||
56 | if (end_switcher_text - start_switcher_text > PAGE_SIZE) { | ||
57 | printk(KERN_ERR "lguest: switcher text too large (%zu)\n", | ||
58 | end_switcher_text - start_switcher_text); | ||
59 | return -EINVAL; | ||
60 | } | ||
61 | |||
55 | /* | 62 | /* |
56 | * We allocate an array of struct page pointers. map_vm_area() wants | 63 | * We allocate an array of struct page pointers. map_vm_area() wants |
57 | * this, rather than just an array of pages. | 64 | * this, rather than just an array of pages. |
58 | */ | 65 | */ |
59 | switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, | 66 | lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0]) |
60 | GFP_KERNEL); | 67 | * TOTAL_SWITCHER_PAGES, |
61 | if (!switcher_page) { | 68 | GFP_KERNEL); |
69 | if (!lg_switcher_pages) { | ||
62 | err = -ENOMEM; | 70 | err = -ENOMEM; |
63 | goto out; | 71 | goto out; |
64 | } | 72 | } |
@@ -68,32 +76,29 @@ static __init int map_switcher(void) | |||
68 | * so we make sure they're zeroed. | 76 | * so we make sure they're zeroed. |
69 | */ | 77 | */ |
70 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { | 78 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { |
71 | switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); | 79 | lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); |
72 | if (!switcher_page[i]) { | 80 | if (!lg_switcher_pages[i]) { |
73 | err = -ENOMEM; | 81 | err = -ENOMEM; |
74 | goto free_some_pages; | 82 | goto free_some_pages; |
75 | } | 83 | } |
76 | } | 84 | } |
77 | 85 | ||
78 | /* | 86 | /* |
79 | * First we check that the Switcher won't overlap the fixmap area at | 87 | * We place the Switcher underneath the fixmap area, which is the |
80 | * the top of memory. It's currently nowhere near, but it could have | 88 | * highest virtual address we can get. This is important, since we |
81 | * very strange effects if it ever happened. | 89 | * tell the Guest it can't access this memory, so we want its ceiling |
90 | * as high as possible. | ||
82 | */ | 91 | */ |
83 | if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ | 92 | switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE; |
84 | err = -ENOMEM; | ||
85 | printk("lguest: mapping switcher would thwack fixmap\n"); | ||
86 | goto free_pages; | ||
87 | } | ||
88 | 93 | ||
89 | /* | 94 | /* |
90 | * Now we reserve the "virtual memory area" we want: 0xFFC00000 | 95 | * Now we reserve the "virtual memory area" we want. We might |
91 | * (SWITCHER_ADDR). We might not get it in theory, but in practice | 96 | * not get it in theory, but in practice it's worked so far. |
92 | * it's worked so far. The end address needs +1 because __get_vm_area | 97 | * The end address needs +1 because __get_vm_area allocates an |
93 | * allocates an extra guard page, so we need space for that. | 98 | * extra guard page, so we need space for that. |
94 | */ | 99 | */ |
95 | switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, | 100 | switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, |
96 | VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR | 101 | VM_ALLOC, switcher_addr, switcher_addr |
97 | + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); | 102 | + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); |
98 | if (!switcher_vma) { | 103 | if (!switcher_vma) { |
99 | err = -ENOMEM; | 104 | err = -ENOMEM; |
@@ -103,12 +108,12 @@ static __init int map_switcher(void) | |||
103 | 108 | ||
104 | /* | 109 | /* |
105 | * This code actually sets up the pages we've allocated to appear at | 110 | * This code actually sets up the pages we've allocated to appear at |
106 | * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the | 111 | * switcher_addr. map_vm_area() takes the vma we allocated above, the |
107 | * kind of pages we're mapping (kernel pages), and a pointer to our | 112 | * kind of pages we're mapping (kernel pages), and a pointer to our |
108 | * array of struct pages. It increments that pointer, but we don't | 113 | * array of struct pages. It increments that pointer, but we don't |
109 | * care. | 114 | * care. |
110 | */ | 115 | */ |
111 | pagep = switcher_page; | 116 | pagep = lg_switcher_pages; |
112 | err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); | 117 | err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); |
113 | if (err) { | 118 | if (err) { |
114 | printk("lguest: map_vm_area failed: %i\n", err); | 119 | printk("lguest: map_vm_area failed: %i\n", err); |
@@ -133,8 +138,8 @@ free_pages: | |||
133 | i = TOTAL_SWITCHER_PAGES; | 138 | i = TOTAL_SWITCHER_PAGES; |
134 | free_some_pages: | 139 | free_some_pages: |
135 | for (--i; i >= 0; i--) | 140 | for (--i; i >= 0; i--) |
136 | __free_pages(switcher_page[i], 0); | 141 | __free_pages(lg_switcher_pages[i], 0); |
137 | kfree(switcher_page); | 142 | kfree(lg_switcher_pages); |
138 | out: | 143 | out: |
139 | return err; | 144 | return err; |
140 | } | 145 | } |
@@ -149,8 +154,8 @@ static void unmap_switcher(void) | |||
149 | vunmap(switcher_vma->addr); | 154 | vunmap(switcher_vma->addr); |
150 | /* Now we just need to free the pages we copied the switcher into */ | 155 | /* Now we just need to free the pages we copied the switcher into */ |
151 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) | 156 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) |
152 | __free_pages(switcher_page[i], 0); | 157 | __free_pages(lg_switcher_pages[i], 0); |
153 | kfree(switcher_page); | 158 | kfree(lg_switcher_pages); |
154 | } | 159 | } |
155 | 160 | ||
156 | /*H:032 | 161 | /*H:032 |
@@ -323,15 +328,10 @@ static int __init init(void) | |||
323 | if (err) | 328 | if (err) |
324 | goto out; | 329 | goto out; |
325 | 330 | ||
326 | /* Now we set up the pagetable implementation for the Guests. */ | ||
327 | err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); | ||
328 | if (err) | ||
329 | goto unmap; | ||
330 | |||
331 | /* We might need to reserve an interrupt vector. */ | 331 | /* We might need to reserve an interrupt vector. */ |
332 | err = init_interrupts(); | 332 | err = init_interrupts(); |
333 | if (err) | 333 | if (err) |
334 | goto free_pgtables; | 334 | goto unmap; |
335 | 335 | ||
336 | /* /dev/lguest needs to be registered. */ | 336 | /* /dev/lguest needs to be registered. */ |
337 | err = lguest_device_init(); | 337 | err = lguest_device_init(); |
@@ -346,8 +346,6 @@ static int __init init(void) | |||
346 | 346 | ||
347 | free_interrupts: | 347 | free_interrupts: |
348 | free_interrupts(); | 348 | free_interrupts(); |
349 | free_pgtables: | ||
350 | free_pagetables(); | ||
351 | unmap: | 349 | unmap: |
352 | unmap_switcher(); | 350 | unmap_switcher(); |
353 | out: | 351 | out: |
@@ -359,7 +357,6 @@ static void __exit fini(void) | |||
359 | { | 357 | { |
360 | lguest_device_remove(); | 358 | lguest_device_remove(); |
361 | free_interrupts(); | 359 | free_interrupts(); |
362 | free_pagetables(); | ||
363 | unmap_switcher(); | 360 | unmap_switcher(); |
364 | 361 | ||
365 | lguest_arch_host_fini(); | 362 | lguest_arch_host_fini(); |
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 295df06e6590..2eef40be4c04 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h | |||
@@ -14,11 +14,10 @@ | |||
14 | 14 | ||
15 | #include <asm/lguest.h> | 15 | #include <asm/lguest.h> |
16 | 16 | ||
17 | void free_pagetables(void); | ||
18 | int init_pagetables(struct page **switcher_page, unsigned int pages); | ||
19 | |||
20 | struct pgdir { | 17 | struct pgdir { |
21 | unsigned long gpgdir; | 18 | unsigned long gpgdir; |
19 | bool switcher_mapped; | ||
20 | int last_host_cpu; | ||
22 | pgd_t *pgdir; | 21 | pgd_t *pgdir; |
23 | }; | 22 | }; |
24 | 23 | ||
@@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg, | |||
124 | unsigned long addr, unsigned long len); | 123 | unsigned long addr, unsigned long len); |
125 | void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); | 124 | void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); |
126 | void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); | 125 | void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); |
126 | extern struct page **lg_switcher_pages; | ||
127 | 127 | ||
128 | /*H:035 | 128 | /*H:035 |
129 | * Using memory-copy operations like that is usually inconvient, so we | 129 | * Using memory-copy operations like that is usually inconvient, so we |
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index ff4a0bc9904d..4263f4cc8c55 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c | |||
@@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | |||
250 | */ | 250 | */ |
251 | static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) | 251 | static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) |
252 | { | 252 | { |
253 | /* We have a limited number the number of CPUs in the lguest struct. */ | 253 | /* We have a limited number of CPUs in the lguest struct. */ |
254 | if (id >= ARRAY_SIZE(cpu->lg->cpus)) | 254 | if (id >= ARRAY_SIZE(cpu->lg->cpus)) |
255 | return -EINVAL; | 255 | return -EINVAL; |
256 | 256 | ||
257 | /* Set up this CPU's id, and pointer back to the lguest struct. */ | 257 | /* Set up this CPU's id, and pointer back to the lguest struct. */ |
258 | cpu->id = id; | 258 | cpu->id = id; |
259 | cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); | 259 | cpu->lg = container_of(cpu, struct lguest, cpus[id]); |
260 | cpu->lg->nr_cpus++; | 260 | cpu->lg->nr_cpus++; |
261 | 261 | ||
262 | /* Each CPU has a timer it can set. */ | 262 | /* Each CPU has a timer it can set. */ |
@@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) | |||
270 | if (!cpu->regs_page) | 270 | if (!cpu->regs_page) |
271 | return -ENOMEM; | 271 | return -ENOMEM; |
272 | 272 | ||
273 | /* We actually put the registers at the bottom of the page. */ | 273 | /* We actually put the registers at the end of the page. */ |
274 | cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); | 274 | cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); |
275 | 275 | ||
276 | /* | 276 | /* |
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 864baabaee25..699187ab3800 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * converted Guest pages when running the Guest. | 7 | * converted Guest pages when running the Guest. |
8 | :*/ | 8 | :*/ |
9 | 9 | ||
10 | /* Copyright (C) Rusty Russell IBM Corporation 2006. | 10 | /* Copyright (C) Rusty Russell IBM Corporation 2013. |
11 | * GPL v2 and any later version */ | 11 | * GPL v2 and any later version */ |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/gfp.h> | 13 | #include <linux/gfp.h> |
@@ -62,22 +62,11 @@ | |||
62 | * will need the last pmd entry of the last pmd page. | 62 | * will need the last pmd entry of the last pmd page. |
63 | */ | 63 | */ |
64 | #ifdef CONFIG_X86_PAE | 64 | #ifdef CONFIG_X86_PAE |
65 | #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) | ||
66 | #define RESERVE_MEM 2U | ||
67 | #define CHECK_GPGD_MASK _PAGE_PRESENT | 65 | #define CHECK_GPGD_MASK _PAGE_PRESENT |
68 | #else | 66 | #else |
69 | #define RESERVE_MEM 4U | ||
70 | #define CHECK_GPGD_MASK _PAGE_TABLE | 67 | #define CHECK_GPGD_MASK _PAGE_TABLE |
71 | #endif | 68 | #endif |
72 | 69 | ||
73 | /* | ||
74 | * We actually need a separate PTE page for each CPU. Remember that after the | ||
75 | * Switcher code itself comes two pages for each CPU, and we don't want this | ||
76 | * CPU's guest to see the pages of any other CPU. | ||
77 | */ | ||
78 | static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); | ||
79 | #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) | ||
80 | |||
81 | /*H:320 | 70 | /*H:320 |
82 | * The page table code is curly enough to need helper functions to keep it | 71 | * The page table code is curly enough to need helper functions to keep it |
83 | * clear and clean. The kernel itself provides many of them; one advantage | 72 | * clear and clean. The kernel itself provides many of them; one advantage |
@@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) | |||
95 | { | 84 | { |
96 | unsigned int index = pgd_index(vaddr); | 85 | unsigned int index = pgd_index(vaddr); |
97 | 86 | ||
98 | #ifndef CONFIG_X86_PAE | ||
99 | /* We kill any Guest trying to touch the Switcher addresses. */ | ||
100 | if (index >= SWITCHER_PGD_INDEX) { | ||
101 | kill_guest(cpu, "attempt to access switcher pages"); | ||
102 | index = 0; | ||
103 | } | ||
104 | #endif | ||
105 | /* Return a pointer index'th pgd entry for the i'th page table. */ | 87 | /* Return a pointer index'th pgd entry for the i'th page table. */ |
106 | return &cpu->lg->pgdirs[i].pgdir[index]; | 88 | return &cpu->lg->pgdirs[i].pgdir[index]; |
107 | } | 89 | } |
@@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) | |||
117 | unsigned int index = pmd_index(vaddr); | 99 | unsigned int index = pmd_index(vaddr); |
118 | pmd_t *page; | 100 | pmd_t *page; |
119 | 101 | ||
120 | /* We kill any Guest trying to touch the Switcher addresses. */ | ||
121 | if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && | ||
122 | index >= SWITCHER_PMD_INDEX) { | ||
123 | kill_guest(cpu, "attempt to access switcher pages"); | ||
124 | index = 0; | ||
125 | } | ||
126 | |||
127 | /* You should never call this if the PGD entry wasn't valid */ | 102 | /* You should never call this if the PGD entry wasn't valid */ |
128 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); | 103 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); |
129 | page = __va(pgd_pfn(spgd) << PAGE_SHIFT); | 104 | page = __va(pgd_pfn(spgd) << PAGE_SHIFT); |
@@ -275,122 +250,177 @@ static void release_pte(pte_t pte) | |||
275 | } | 250 | } |
276 | /*:*/ | 251 | /*:*/ |
277 | 252 | ||
278 | static void check_gpte(struct lg_cpu *cpu, pte_t gpte) | 253 | static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) |
279 | { | 254 | { |
280 | if ((pte_flags(gpte) & _PAGE_PSE) || | 255 | if ((pte_flags(gpte) & _PAGE_PSE) || |
281 | pte_pfn(gpte) >= cpu->lg->pfn_limit) | 256 | pte_pfn(gpte) >= cpu->lg->pfn_limit) { |
282 | kill_guest(cpu, "bad page table entry"); | 257 | kill_guest(cpu, "bad page table entry"); |
258 | return false; | ||
259 | } | ||
260 | return true; | ||
283 | } | 261 | } |
284 | 262 | ||
285 | static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) | 263 | static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) |
286 | { | 264 | { |
287 | if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || | 265 | if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || |
288 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) | 266 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) { |
289 | kill_guest(cpu, "bad page directory entry"); | 267 | kill_guest(cpu, "bad page directory entry"); |
268 | return false; | ||
269 | } | ||
270 | return true; | ||
290 | } | 271 | } |
291 | 272 | ||
292 | #ifdef CONFIG_X86_PAE | 273 | #ifdef CONFIG_X86_PAE |
293 | static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) | 274 | static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) |
294 | { | 275 | { |
295 | if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || | 276 | if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || |
296 | (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) | 277 | (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) { |
297 | kill_guest(cpu, "bad page middle directory entry"); | 278 | kill_guest(cpu, "bad page middle directory entry"); |
279 | return false; | ||
280 | } | ||
281 | return true; | ||
298 | } | 282 | } |
299 | #endif | 283 | #endif |
300 | 284 | ||
301 | /*H:330 | 285 | /*H:331 |
302 | * (i) Looking up a page table entry when the Guest faults. | 286 | * This is the core routine to walk the shadow page tables and find the page |
303 | * | 287 | * table entry for a specific address. |
304 | * We saw this call in run_guest(): when we see a page fault in the Guest, we | ||
305 | * come here. That's because we only set up the shadow page tables lazily as | ||
306 | * they're needed, so we get page faults all the time and quietly fix them up | ||
307 | * and return to the Guest without it knowing. | ||
308 | * | 288 | * |
309 | * If we fixed up the fault (ie. we mapped the address), this routine returns | 289 | * If allocate is set, then we allocate any missing levels, setting the flags |
310 | * true. Otherwise, it was a real fault and we need to tell the Guest. | 290 | * on the new page directory and mid-level directories using the arguments |
291 | * (which are copied from the Guest's page table entries). | ||
311 | */ | 292 | */ |
312 | bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | 293 | static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, |
294 | int pgd_flags, int pmd_flags) | ||
313 | { | 295 | { |
314 | pgd_t gpgd; | ||
315 | pgd_t *spgd; | 296 | pgd_t *spgd; |
316 | unsigned long gpte_ptr; | ||
317 | pte_t gpte; | ||
318 | pte_t *spte; | ||
319 | |||
320 | /* Mid level for PAE. */ | 297 | /* Mid level for PAE. */ |
321 | #ifdef CONFIG_X86_PAE | 298 | #ifdef CONFIG_X86_PAE |
322 | pmd_t *spmd; | 299 | pmd_t *spmd; |
323 | pmd_t gpmd; | ||
324 | #endif | 300 | #endif |
325 | 301 | ||
326 | /* First step: get the top-level Guest page table entry. */ | 302 | /* Get top level entry. */ |
327 | if (unlikely(cpu->linear_pages)) { | ||
328 | /* Faking up a linear mapping. */ | ||
329 | gpgd = __pgd(CHECK_GPGD_MASK); | ||
330 | } else { | ||
331 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | ||
332 | /* Toplevel not present? We can't map it in. */ | ||
333 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) | ||
334 | return false; | ||
335 | } | ||
336 | |||
337 | /* Now look at the matching shadow entry. */ | ||
338 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); | 303 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); |
339 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { | 304 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { |
340 | /* No shadow entry: allocate a new shadow PTE page. */ | 305 | /* No shadow entry: allocate a new shadow PTE page. */ |
341 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | 306 | unsigned long ptepage; |
307 | |||
308 | /* If they didn't want us to allocate anything, stop. */ | ||
309 | if (!allocate) | ||
310 | return NULL; | ||
311 | |||
312 | ptepage = get_zeroed_page(GFP_KERNEL); | ||
342 | /* | 313 | /* |
343 | * This is not really the Guest's fault, but killing it is | 314 | * This is not really the Guest's fault, but killing it is |
344 | * simple for this corner case. | 315 | * simple for this corner case. |
345 | */ | 316 | */ |
346 | if (!ptepage) { | 317 | if (!ptepage) { |
347 | kill_guest(cpu, "out of memory allocating pte page"); | 318 | kill_guest(cpu, "out of memory allocating pte page"); |
348 | return false; | 319 | return NULL; |
349 | } | 320 | } |
350 | /* We check that the Guest pgd is OK. */ | ||
351 | check_gpgd(cpu, gpgd); | ||
352 | /* | 321 | /* |
353 | * And we copy the flags to the shadow PGD entry. The page | 322 | * And we copy the flags to the shadow PGD entry. The page |
354 | * number in the shadow PGD is the page we just allocated. | 323 | * number in the shadow PGD is the page we just allocated. |
355 | */ | 324 | */ |
356 | set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); | 325 | set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags)); |
357 | } | 326 | } |
358 | 327 | ||
328 | /* | ||
329 | * Intel's Physical Address Extension actually uses three levels of | ||
330 | * page tables, so we need to look in the mid-level. | ||
331 | */ | ||
359 | #ifdef CONFIG_X86_PAE | 332 | #ifdef CONFIG_X86_PAE |
360 | if (unlikely(cpu->linear_pages)) { | 333 | /* Now look at the mid-level shadow entry. */ |
361 | /* Faking up a linear mapping. */ | ||
362 | gpmd = __pmd(_PAGE_TABLE); | ||
363 | } else { | ||
364 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
365 | /* Middle level not present? We can't map it in. */ | ||
366 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
367 | return false; | ||
368 | } | ||
369 | |||
370 | /* Now look at the matching shadow entry. */ | ||
371 | spmd = spmd_addr(cpu, *spgd, vaddr); | 334 | spmd = spmd_addr(cpu, *spgd, vaddr); |
372 | 335 | ||
373 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { | 336 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { |
374 | /* No shadow entry: allocate a new shadow PTE page. */ | 337 | /* No shadow entry: allocate a new shadow PTE page. */ |
375 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | 338 | unsigned long ptepage; |
339 | |||
340 | /* If they didn't want us to allocate anything, stop. */ | ||
341 | if (!allocate) | ||
342 | return NULL; | ||
343 | |||
344 | ptepage = get_zeroed_page(GFP_KERNEL); | ||
376 | 345 | ||
377 | /* | 346 | /* |
378 | * This is not really the Guest's fault, but killing it is | 347 | * This is not really the Guest's fault, but killing it is |
379 | * simple for this corner case. | 348 | * simple for this corner case. |
380 | */ | 349 | */ |
381 | if (!ptepage) { | 350 | if (!ptepage) { |
382 | kill_guest(cpu, "out of memory allocating pte page"); | 351 | kill_guest(cpu, "out of memory allocating pmd page"); |
383 | return false; | 352 | return NULL; |
384 | } | 353 | } |
385 | 354 | ||
386 | /* We check that the Guest pmd is OK. */ | ||
387 | check_gpmd(cpu, gpmd); | ||
388 | |||
389 | /* | 355 | /* |
390 | * And we copy the flags to the shadow PMD entry. The page | 356 | * And we copy the flags to the shadow PMD entry. The page |
391 | * number in the shadow PMD is the page we just allocated. | 357 | * number in the shadow PMD is the page we just allocated. |
392 | */ | 358 | */ |
393 | set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); | 359 | set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags)); |
360 | } | ||
361 | #endif | ||
362 | |||
363 | /* Get the pointer to the shadow PTE entry we're going to set. */ | ||
364 | return spte_addr(cpu, *spgd, vaddr); | ||
365 | } | ||
366 | |||
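A quick sketch of the two ways callers use find_spte() (the helper name example_is_shadow_mapped is hypothetical, not part of this patch): allocating callers pass the Guest's directory flags so missing levels inherit them; lookup-only callers pass false and must treat a NULL return as "not shadowed".

	/* Lookup-only: never allocates; NULL means no shadow mapping exists. */
	static bool example_is_shadow_mapped(struct lg_cpu *cpu, unsigned long vaddr)
	{
		pte_t *spte = find_spte(cpu, vaddr, false, 0, 0);
		return spte && (pte_flags(*spte) & _PAGE_PRESENT);
	}

	/* Allocating, as demand_page() below does:
	 *	spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
	 */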
367 | /*H:330 | ||
368 | * (i) Looking up a page table entry when the Guest faults. | ||
369 | * | ||
370 | * We saw this call in run_guest(): when we see a page fault in the Guest, we | ||
371 | * come here. That's because we only set up the shadow page tables lazily as | ||
372 | * they're needed, so we get page faults all the time and quietly fix them up | ||
373 | * and return to the Guest without it knowing. | ||
374 | * | ||
375 | * If we fixed up the fault (ie. we mapped the address), this routine returns | ||
376 | * true. Otherwise, it was a real fault and we need to tell the Guest. | ||
377 | */ | ||
378 | bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | ||
379 | { | ||
380 | unsigned long gpte_ptr; | ||
381 | pte_t gpte; | ||
382 | pte_t *spte; | ||
383 | pmd_t gpmd; | ||
384 | pgd_t gpgd; | ||
385 | |||
386 | /* We never demand page the Switcher, so trying is a mistake. */ | ||
387 | if (vaddr >= switcher_addr) | ||
388 | return false; | ||
389 | |||
390 | /* First step: get the top-level Guest page table entry. */ | ||
391 | if (unlikely(cpu->linear_pages)) { | ||
392 | /* Faking up a linear mapping. */ | ||
393 | gpgd = __pgd(CHECK_GPGD_MASK); | ||
394 | } else { | ||
395 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | ||
396 | /* Toplevel not present? We can't map it in. */ | ||
397 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) | ||
398 | return false; | ||
399 | |||
400 | /* | ||
401 | * This kills the Guest if it has weird flags or tries to | ||
402 | * refer to a "physical" address outside the bounds. | ||
403 | */ | ||
404 | if (!check_gpgd(cpu, gpgd)) | ||
405 | return false; | ||
406 | } | ||
407 | |||
408 | /* This "mid-level" entry is only used for non-linear, PAE mode. */ | ||
409 | gpmd = __pmd(_PAGE_TABLE); | ||
410 | |||
411 | #ifdef CONFIG_X86_PAE | ||
412 | if (likely(!cpu->linear_pages)) { | ||
413 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
414 | /* Middle level not present? We can't map it in. */ | ||
415 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
416 | return false; | ||
417 | |||
418 | /* | ||
419 | * This kills the Guest if it has weird flags or tries to | ||
420 | * refer to a "physical" address outside the bounds. | ||
421 | */ | ||
422 | if (!check_gpmd(cpu, gpmd)) | ||
423 | return false; | ||
394 | } | 424 | } |
395 | 425 | ||
396 | /* | 426 | /* |
@@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
433 | * Check that the Guest PTE flags are OK, and the page number is below | 463 | * Check that the Guest PTE flags are OK, and the page number is below |
434 | * the pfn_limit (ie. not mapping the Launcher binary). | 464 | * the pfn_limit (ie. not mapping the Launcher binary). |
435 | */ | 465 | */ |
436 | check_gpte(cpu, gpte); | 466 | if (!check_gpte(cpu, gpte)) |
467 | return false; | ||
437 | 468 | ||
438 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ | 469 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ |
439 | gpte = pte_mkyoung(gpte); | 470 | gpte = pte_mkyoung(gpte); |
@@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
441 | gpte = pte_mkdirty(gpte); | 472 | gpte = pte_mkdirty(gpte); |
442 | 473 | ||
443 | /* Get the pointer to the shadow PTE entry we're going to set. */ | 474 | /* Get the pointer to the shadow PTE entry we're going to set. */ |
444 | spte = spte_addr(cpu, *spgd, vaddr); | 475 | spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd)); |
476 | if (!spte) | ||
477 | return false; | ||
445 | 478 | ||
446 | /* | 479 | /* |
447 | * If there was a valid shadow PTE entry here before, we release it. | 480 | * If there was a valid shadow PTE entry here before, we release it. |
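For reference, the errcode demand_page() receives is the hardware-defined x86 page-fault error code; the write test feeding pte_mkdirty() above relies on bit 1:

	/* x86 page-fault error code bits:
	 *   bit 0 (0x1): protection violation (the page was present)
	 *   bit 1 (0x2): the faulting access was a write
	 *   bit 2 (0x4): the fault happened in user mode
	 */
	if (errcode & 2)		/* write fault: pre-dirty the Guest PTE */
		gpte = pte_mkdirty(gpte);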
@@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
493 | */ | 526 | */ |
494 | static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) | 527 | static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) |
495 | { | 528 | { |
496 | pgd_t *spgd; | 529 | pte_t *spte; |
497 | unsigned long flags; | 530 | unsigned long flags; |
498 | 531 | ||
499 | #ifdef CONFIG_X86_PAE | 532 | /* You can't put your stack in the Switcher! */ |
500 | pmd_t *spmd; | 533 | if (vaddr >= switcher_addr) |
501 | #endif | ||
502 | /* Look at the current top level entry: is it present? */ | ||
503 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); | ||
504 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) | ||
505 | return false; | 534 | return false; |
506 | 535 | ||
507 | #ifdef CONFIG_X86_PAE | 536 | /* If there's no shadow PTE, it's not writable. */ |
508 | spmd = spmd_addr(cpu, *spgd, vaddr); | 537 | spte = find_spte(cpu, vaddr, false, 0, 0); |
509 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) | 538 | if (!spte) |
510 | return false; | 539 | return false; |
511 | #endif | ||
512 | 540 | ||
513 | /* | 541 | /* |
514 | * Check the flags on the pte entry itself: it must be present and | 542 | * Check the flags on the pte entry itself: it must be present and |
515 | * writable. | 543 | * writable. |
516 | */ | 544 | */ |
517 | flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); | 545 | flags = pte_flags(*spte); |
518 | |||
519 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); | 546 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); |
520 | } | 547 | } |
521 | 548 | ||
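To see how these two routines work together: pin_stack_pages() (elsewhere in this file) keeps the Guest's kernel stack mapped with logic along the lines of this sketch (example_pin_page is a hypothetical name):

	/* Pin one stack page, faulting it in as a write if necessary. */
	static void example_pin_page(struct lg_cpu *cpu, unsigned long vaddr)
	{
		/* errcode 2 means "write fault": demand_page() maps it writable. */
		if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
			kill_guest(cpu, "bad stack page %#lx", vaddr);
	}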
@@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
678 | int *blank_pgdir) | 705 | int *blank_pgdir) |
679 | { | 706 | { |
680 | unsigned int next; | 707 | unsigned int next; |
681 | #ifdef CONFIG_X86_PAE | ||
682 | pmd_t *pmd_table; | ||
683 | #endif | ||
684 | 708 | ||
685 | /* | 709 | /* |
686 | * We pick one entry at random to throw out. Choosing the Least | 710 | * We pick one entry at random to throw out. Choosing the Least |
@@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
695 | if (!cpu->lg->pgdirs[next].pgdir) | 719 | if (!cpu->lg->pgdirs[next].pgdir) |
696 | next = cpu->cpu_pgd; | 720 | next = cpu->cpu_pgd; |
697 | else { | 721 | else { |
698 | #ifdef CONFIG_X86_PAE | ||
699 | /* | 722 | /* |
700 | * In PAE mode, allocate a pmd page and populate the | 723 | * This is a blank page, so there are no kernel |
701 | * last pgd entry. | 724 | * mappings: caller must map the stack! |
702 | */ | 725 | */ |
703 | pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); | ||
704 | if (!pmd_table) { | ||
705 | free_page((long)cpu->lg->pgdirs[next].pgdir); | ||
706 | set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); | ||
707 | next = cpu->cpu_pgd; | ||
708 | } else { | ||
709 | set_pgd(cpu->lg->pgdirs[next].pgdir + | ||
710 | SWITCHER_PGD_INDEX, | ||
711 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
712 | /* | ||
713 | * This is a blank page, so there are no kernel | ||
714 | * mappings: caller must map the stack! | ||
715 | */ | ||
716 | *blank_pgdir = 1; | ||
717 | } | ||
718 | #else | ||
719 | *blank_pgdir = 1; | 726 | *blank_pgdir = 1; |
720 | #endif | ||
721 | } | 727 | } |
722 | } | 728 | } |
723 | /* Record which Guest toplevel this shadows. */ | 729 | /* Record which Guest toplevel this shadows. */ |
@@ -725,9 +731,50 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
725 | /* Release all the non-kernel mappings. */ | 731 | /* Release all the non-kernel mappings. */ |
726 | flush_user_mappings(cpu->lg, next); | 732 | flush_user_mappings(cpu->lg, next); |
727 | 733 | ||
734 | /* This hasn't run on any CPU at all. */ | ||
735 | cpu->lg->pgdirs[next].last_host_cpu = -1; | ||
736 | |||
728 | return next; | 737 | return next; |
729 | } | 738 | } |
730 | 739 | ||
740 | /*H:501 | ||
741 | * We do need the Switcher code mapped at all times, so we allocate that | ||
742 | * part of the Guest page table here. We map the Switcher code immediately, | ||
743 | * but defer mapping of the guest register page and IDT/LDT etc. page until | ||
744 | * just before we run the guest in map_switcher_in_guest(). | ||
745 | * | ||
746 | * We *could* do this setup in map_switcher_in_guest(), but at that point | ||
747 | * we have interrupts disabled, and allocating pages then is fraught: we | ||
748 | * can't sleep if we need to free up some memory. | ||
749 | */ | ||
750 | static bool allocate_switcher_mapping(struct lg_cpu *cpu) | ||
751 | { | ||
752 | int i; | ||
753 | |||
754 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { | ||
755 | pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, | ||
756 | CHECK_GPGD_MASK, _PAGE_TABLE); | ||
757 | if (!pte) | ||
758 | return false; | ||
759 | |||
760 | /* | ||
761 | * Map the switcher page if not already there. It might | ||
762 | * already be there because we call allocate_switcher_mapping() | ||
763 | * in guest_set_pgd() just in case it did discard our Switcher | ||
764 | * mapping, but it probably didn't. | ||
765 | */ | ||
766 | if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) { | ||
767 | /* Get a reference to the Switcher page. */ | ||
768 | get_page(lg_switcher_pages[0]); | ||
769 | /* Create a read-only, executable, kernel-style PTE */ | ||
770 | set_pte(pte, | ||
771 | mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); | ||
772 | } | ||
773 | } | ||
774 | cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true; | ||
775 | return true; | ||
776 | } | ||
777 | |||
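The loop above assumes a fixed virtual layout for the Switcher region, which the rest of this file indexes with the same arithmetic; as a sketch:

	/*
	 * switcher_addr + 0*PAGE_SIZE: Switcher text (read-only, executable)
	 * switcher_addr + 1*PAGE_SIZE: host CPU 0 regs page    \ one struct
	 * switcher_addr + 2*PAGE_SIZE: host CPU 0 ro_state page / lguest_pages
	 * switcher_addr + 3*PAGE_SIZE: host CPU 1 regs page, and so on.
	 *
	 * So the pair of pages for host CPU i starts at:
	 */
	unsigned long base = switcher_addr + PAGE_SIZE + i * 2 * PAGE_SIZE;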
731 | /*H:470 | 778 | /*H:470 |
732 | * Finally, a routine which throws away everything: all PGD entries in all | 779 | * Finally, a routine which throws away everything: all PGD entries in all |
733 | * the shadow page tables, including the Guest's kernel mappings. This is used | 780 | * the shadow page tables, including the Guest's kernel mappings. This is used |
@@ -738,28 +785,16 @@ static void release_all_pagetables(struct lguest *lg) | |||
738 | unsigned int i, j; | 785 | unsigned int i, j; |
739 | 786 | ||
740 | /* Every shadow pagetable this Guest has */ | 787 | /* Every shadow pagetable this Guest has */ |
741 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 788 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) { |
742 | if (lg->pgdirs[i].pgdir) { | 789 | if (!lg->pgdirs[i].pgdir) |
743 | #ifdef CONFIG_X86_PAE | 790 | continue; |
744 | pgd_t *spgd; | 791 | |
745 | pmd_t *pmdpage; | 792 | /* Every PGD entry. */ |
746 | unsigned int k; | 793 | for (j = 0; j < PTRS_PER_PGD; j++) |
747 | 794 | release_pgd(lg->pgdirs[i].pgdir + j); | |
748 | /* Get the last pmd page. */ | 795 | lg->pgdirs[i].switcher_mapped = false; |
749 | spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; | 796 | lg->pgdirs[i].last_host_cpu = -1; |
750 | pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); | 797 | } |
751 | |||
752 | /* | ||
753 | * And release the pmd entries of that pmd page, | ||
754 | * except for the switcher pmd. | ||
755 | */ | ||
756 | for (k = 0; k < SWITCHER_PMD_INDEX; k++) | ||
757 | release_pmd(&pmdpage[k]); | ||
758 | #endif | ||
759 | /* Every PGD entry except the Switcher at the top */ | ||
760 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) | ||
761 | release_pgd(lg->pgdirs[i].pgdir + j); | ||
762 | } | ||
763 | } | 798 | } |
764 | 799 | ||
765 | /* | 800 | /* |
@@ -773,6 +808,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) | |||
773 | release_all_pagetables(cpu->lg); | 808 | release_all_pagetables(cpu->lg); |
774 | /* We need the Guest kernel stack mapped again. */ | 809 | /* We need the Guest kernel stack mapped again. */ |
775 | pin_stack_pages(cpu); | 810 | pin_stack_pages(cpu); |
811 | /* And we need Switcher allocated. */ | ||
812 | if (!allocate_switcher_mapping(cpu)) | ||
813 | kill_guest(cpu, "Cannot populate switcher mapping"); | ||
776 | } | 814 | } |
777 | 815 | ||
778 | /*H:430 | 816 | /*H:430 |
@@ -808,9 +846,17 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) | |||
808 | newpgdir = new_pgdir(cpu, pgtable, &repin); | 846 | newpgdir = new_pgdir(cpu, pgtable, &repin); |
809 | /* Change the current pgd index to the new one. */ | 847 | /* Change the current pgd index to the new one. */ |
810 | cpu->cpu_pgd = newpgdir; | 848 | cpu->cpu_pgd = newpgdir; |
811 | /* If it was completely blank, we map in the Guest kernel stack */ | 849 | /* |
850 | * If it was completely blank, we map in the Guest kernel stack and | ||
851 | * the Switcher. | ||
852 | */ | ||
812 | if (repin) | 853 | if (repin) |
813 | pin_stack_pages(cpu); | 854 | pin_stack_pages(cpu); |
855 | |||
856 | if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) { | ||
857 | if (!allocate_switcher_mapping(cpu)) | ||
858 | kill_guest(cpu, "Cannot populate switcher mapping"); | ||
859 | } | ||
814 | } | 860 | } |
815 | /*:*/ | 861 | /*:*/ |
816 | 862 | ||
@@ -865,7 +911,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, | |||
865 | * micro-benchmark. | 911 | * micro-benchmark. |
866 | */ | 912 | */ |
867 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { | 913 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { |
868 | check_gpte(cpu, gpte); | 914 | if (!check_gpte(cpu, gpte)) |
915 | return; | ||
869 | set_pte(spte, | 916 | set_pte(spte, |
870 | gpte_to_spte(cpu, gpte, | 917 | gpte_to_spte(cpu, gpte, |
871 | pte_flags(gpte) & _PAGE_DIRTY)); | 918 | pte_flags(gpte) & _PAGE_DIRTY)); |
@@ -897,6 +944,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, | |||
897 | void guest_set_pte(struct lg_cpu *cpu, | 944 | void guest_set_pte(struct lg_cpu *cpu, |
898 | unsigned long gpgdir, unsigned long vaddr, pte_t gpte) | 945 | unsigned long gpgdir, unsigned long vaddr, pte_t gpte) |
899 | { | 946 | { |
947 | /* We don't let you remap the Switcher; we need it to get back! */ | ||
948 | if (vaddr >= switcher_addr) { | ||
949 | kill_guest(cpu, "attempt to set pte into Switcher pages"); | ||
950 | return; | ||
951 | } | ||
952 | |||
900 | /* | 953 | /* |
901 | * Kernel mappings must be changed on all top levels. Slow, but doesn't | 954 | * Kernel mappings must be changed on all top levels. Slow, but doesn't |
902 | * happen often. | 955 | * happen often. |
@@ -933,14 +986,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) | |||
933 | { | 986 | { |
934 | int pgdir; | 987 | int pgdir; |
935 | 988 | ||
936 | if (idx >= SWITCHER_PGD_INDEX) | 989 | if (idx >= PTRS_PER_PGD) { |
990 | kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u", | ||
991 | idx, PTRS_PER_PGD); | ||
937 | return; | 992 | return; |
993 | } | ||
938 | 994 | ||
939 | /* If they're talking about a page table we have a shadow for... */ | 995 | /* If they're talking about a page table we have a shadow for... */ |
940 | pgdir = find_pgdir(lg, gpgdir); | 996 | pgdir = find_pgdir(lg, gpgdir); |
941 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) | 997 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) { |
942 | /* ... throw it away. */ | 998 | /* ... throw it away. */ |
943 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); | 999 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); |
1000 | /* That might have been the Switcher mapping, remap it. */ | ||
1001 | if (!allocate_switcher_mapping(&lg->cpus[0])) { | ||
1002 | kill_guest(&lg->cpus[0], | ||
1003 | "Cannot populate switcher mapping"); | ||
1004 | } | ||
1005 | } | ||
944 | } | 1006 | } |
945 | 1007 | ||
946 | #ifdef CONFIG_X86_PAE | 1008 | #ifdef CONFIG_X86_PAE |
@@ -958,6 +1020,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) | |||
958 | * we will populate on future faults. The Guest doesn't have any actual | 1020 | * we will populate on future faults. The Guest doesn't have any actual |
959 | * pagetables yet, so we set linear_pages to tell demand_page() to fake it | 1021 | * pagetables yet, so we set linear_pages to tell demand_page() to fake it |
960 | * for the moment. | 1022 | * for the moment. |
1023 | * | ||
1024 | * We do need the Switcher to be mapped at all times, so we allocate that | ||
1025 | * part of the Guest page table here. | ||
961 | */ | 1026 | */ |
962 | int init_guest_pagetable(struct lguest *lg) | 1027 | int init_guest_pagetable(struct lguest *lg) |
963 | { | 1028 | { |
@@ -971,21 +1036,34 @@ int init_guest_pagetable(struct lguest *lg) | |||
971 | 1036 | ||
972 | /* We start with a linear mapping until the initialize. */ | 1037 | /* We start with a linear mapping until the initialize. */ |
973 | cpu->linear_pages = true; | 1038 | cpu->linear_pages = true; |
1039 | |||
1040 | /* Allocate the page tables for the Switcher. */ | ||
1041 | if (!allocate_switcher_mapping(cpu)) { | ||
1042 | release_all_pagetables(lg); | ||
1043 | return -ENOMEM; | ||
1044 | } | ||
1045 | |||
974 | return 0; | 1046 | return 0; |
975 | } | 1047 | } |
976 | 1048 | ||
977 | /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ | 1049 | /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ |
978 | void page_table_guest_data_init(struct lg_cpu *cpu) | 1050 | void page_table_guest_data_init(struct lg_cpu *cpu) |
979 | { | 1051 | { |
1052 | /* | ||
1053 | * We tell the Guest that it can't use the virtual addresses | ||
1054 | * used by the Switcher. This trick is equivalent to 4GB - | ||
1055 | * switcher_addr. | ||
1056 | */ | ||
1057 | u32 top = ~switcher_addr + 1; | ||
1058 | |||
980 | /* We get the kernel address: above this is all kernel memory. */ | 1059 | /* We get the kernel address: above this is all kernel memory. */ |
981 | if (get_user(cpu->lg->kernel_address, | 1060 | if (get_user(cpu->lg->kernel_address, |
982 | &cpu->lg->lguest_data->kernel_address) | 1061 | &cpu->lg->lguest_data->kernel_address) |
983 | /* | 1062 | /* |
984 | * We tell the Guest that it can't use the top 2 or 4 MB | 1063 | * We tell the Guest that it can't use the top virtual |
985 | * of virtual addresses used by the Switcher. | 1064 | * addresses (used by the Switcher). |
986 | */ | 1065 | */ |
987 | || put_user(RESERVE_MEM * 1024 * 1024, | 1066 | || put_user(top, &cpu->lg->lguest_data->reserve_mem)) { |
988 | &cpu->lg->lguest_data->reserve_mem)) { | ||
989 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); | 1067 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); |
990 | return; | 1068 | return; |
991 | } | 1069 | } |
@@ -995,12 +1073,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu) | |||
995 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the | 1073 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the |
996 | * Switcher mappings, so check that now. | 1074 | * Switcher mappings, so check that now. |
997 | */ | 1075 | */ |
998 | #ifdef CONFIG_X86_PAE | 1076 | if (cpu->lg->kernel_address >= switcher_addr) |
999 | if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && | ||
1000 | pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) | ||
1001 | #else | ||
1002 | if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) | ||
1003 | #endif | ||
1004 | kill_guest(cpu, "bad kernel address %#lx", | 1077 | kill_guest(cpu, "bad kernel address %#lx", |
1005 | cpu->lg->kernel_address); | 1078 | cpu->lg->kernel_address); |
1006 | } | 1079 | } |
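The "~switcher_addr + 1" trick above is plain two's-complement arithmetic: for a u32, ~x + 1 == -x == 2^32 - x. A worked example with a hypothetical switcher_addr:

	u32 top = ~0xFF000000u + 1;	/* == 0x01000000: the Guest loses the top
					 * 16MB, i.e. 4GB - switcher_addr. */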
@@ -1017,102 +1090,96 @@ void free_guest_pagetable(struct lguest *lg) | |||
1017 | free_page((long)lg->pgdirs[i].pgdir); | 1090 | free_page((long)lg->pgdirs[i].pgdir); |
1018 | } | 1091 | } |
1019 | 1092 | ||
1020 | /*H:480 | 1093 | /*H:481 |
1021 | * (vi) Mapping the Switcher when the Guest is about to run. | 1094 | * This clears the Switcher mappings for cpu #i. |
1022 | * | ||
1023 | * The Switcher and the two pages for this CPU need to be visible in the | ||
1024 | * Guest (and not the pages for other CPUs). We have the appropriate PTE pages | ||
1025 | * for each CPU already set up, we just need to hook them in now we know which | ||
1026 | * Guest is about to run on this CPU. | ||
1027 | */ | 1095 | */ |
1028 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | 1096 | static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i) |
1029 | { | 1097 | { |
1030 | pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages); | 1098 | unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2; |
1031 | pte_t regs_pte; | 1099 | pte_t *pte; |
1032 | 1100 | ||
1033 | #ifdef CONFIG_X86_PAE | 1101 | /* Clear the mappings for both pages. */ |
1034 | pmd_t switcher_pmd; | 1102 | pte = find_spte(cpu, base, false, 0, 0); |
1035 | pmd_t *pmd_table; | 1103 | release_pte(*pte); |
1036 | 1104 | set_pte(pte, __pte(0)); | |
1037 | switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, | ||
1038 | PAGE_KERNEL_EXEC); | ||
1039 | |||
1040 | /* Figure out where the pmd page is, by reading the PGD, and converting | ||
1041 | * it to a virtual address. */ | ||
1042 | pmd_table = __va(pgd_pfn(cpu->lg-> | ||
1043 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) | ||
1044 | << PAGE_SHIFT); | ||
1045 | /* Now write it into the shadow page table. */ | ||
1046 | set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); | ||
1047 | #else | ||
1048 | pgd_t switcher_pgd; | ||
1049 | 1105 | ||
1050 | /* | 1106 | pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); |
1051 | * Make the last PGD entry for this Guest point to the Switcher's PTE | 1107 | release_pte(*pte); |
1052 | * page for this CPU (with appropriate flags). | 1108 | set_pte(pte, __pte(0)); |
1053 | */ | ||
1054 | switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); | ||
1055 | |||
1056 | cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; | ||
1057 | |||
1058 | #endif | ||
1059 | /* | ||
1060 | * We also change the Switcher PTE page. When we're running the Guest, | ||
1061 | * we want the Guest's "regs" page to appear where the first Switcher | ||
1062 | * page for this CPU is. This is an optimization: when the Switcher | ||
1063 | * saves the Guest registers, it saves them into the first page of this | ||
1064 | * CPU's "struct lguest_pages": if we make sure the Guest's register | ||
1065 | * page is already mapped there, we don't have to copy them out | ||
1066 | * again. | ||
1067 | */ | ||
1068 | regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL); | ||
1069 | set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte); | ||
1070 | } | 1109 | } |
1071 | /*:*/ | ||
1072 | 1110 | ||
1073 | static void free_switcher_pte_pages(void) | 1111 | /*H:480 |
1074 | { | 1112 | * (vi) Mapping the Switcher when the Guest is about to run. |
1075 | unsigned int i; | 1113 | * |
1076 | 1114 | * The Switcher and the two pages for this CPU need to be visible in the Guest | |
1077 | for_each_possible_cpu(i) | 1115 | * (and not the pages for other CPUs). |
1078 | free_page((long)switcher_pte_page(i)); | ||
1079 | } | ||
1080 | |||
1081 | /*H:520 | ||
1082 | * Setting up the Switcher PTE page for given CPU is fairly easy, given | ||
1083 | * the CPU number and the "struct page"s for the Switcher code itself. | ||
1084 | * | 1116 | * |
1085 | * Currently the Switcher is less than a page long, so "pages" is always 1. | 1117 | * The pages for the pagetables have all been allocated before: we just need |
1118 | * to make sure the actual PTEs are up-to-date for the CPU we're about to run | ||
1119 | * on. | ||
1086 | */ | 1120 | */ |
1087 | static __init void populate_switcher_pte_page(unsigned int cpu, | 1121 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) |
1088 | struct page *switcher_page[], | ||
1089 | unsigned int pages) | ||
1090 | { | 1122 | { |
1091 | unsigned int i; | 1123 | unsigned long base; |
1092 | pte_t *pte = switcher_pte_page(cpu); | 1124 | struct page *percpu_switcher_page, *regs_page; |
1125 | pte_t *pte; | ||
1126 | struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd]; | ||
1127 | |||
1128 | /* Switcher page should always be mapped by now! */ | ||
1129 | BUG_ON(!pgdir->switcher_mapped); | ||
1130 | |||
1131 | /* | ||
1132 | * Remember that we have two pages for each Host CPU, so we can run a | ||
1133 | * Guest on each CPU without them interfering. We need to make sure | ||
1134 | * those pages are mapped correctly in the Guest, but since we usually | ||
1135 | * run on the same CPU, we cache that, and only update the mappings | ||
1136 | * when we move. | ||
1137 | */ | ||
1138 | if (pgdir->last_host_cpu == raw_smp_processor_id()) | ||
1139 | return; | ||
1093 | 1140 | ||
1094 | /* The first entries are easy: they map the Switcher code. */ | 1141 | /* -1 means unknown so we remove everything. */ |
1095 | for (i = 0; i < pages; i++) { | 1142 | if (pgdir->last_host_cpu == -1) { |
1096 | set_pte(&pte[i], mk_pte(switcher_page[i], | 1143 | unsigned int i; |
1097 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); | 1144 | for_each_possible_cpu(i) |
1145 | remove_switcher_percpu_map(cpu, i); | ||
1146 | } else { | ||
1147 | /* We know exactly what CPU mapping to remove. */ | ||
1148 | remove_switcher_percpu_map(cpu, pgdir->last_host_cpu); | ||
1098 | } | 1149 | } |
1099 | 1150 | ||
1100 | /* The only other thing we map is this CPU's pair of pages. */ | 1151 | /* |
1101 | i = pages + cpu*2; | 1152 | * When we're running the Guest, we want the Guest's "regs" page to |
1102 | 1153 | * appear where the first Switcher page for this CPU is. This is an | |
1103 | /* First page (Guest registers) is writable from the Guest */ | 1154 | * optimization: when the Switcher saves the Guest registers, it saves |
1104 | set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), | 1155 | * them into the first page of this CPU's "struct lguest_pages": if we |
1105 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); | 1156 | * make sure the Guest's register page is already mapped there, we |
1157 | * don't have to copy them out again. | ||
1158 | */ | ||
1159 | /* Find the shadow PTE for this regs page. */ | ||
1160 | base = switcher_addr + PAGE_SIZE | ||
1161 | + raw_smp_processor_id() * sizeof(struct lguest_pages); | ||
1162 | pte = find_spte(cpu, base, false, 0, 0); | ||
1163 | regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT); | ||
1164 | get_page(regs_page); | ||
1165 | set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL))); | ||
1106 | 1166 | ||
1107 | /* | 1167 | /* |
1108 | * The second page contains the "struct lguest_ro_state", and is | 1168 | * We map the second page of the struct lguest_pages read-only in |
1109 | * read-only. | 1169 | * the Guest: the IDT, GDT and other things it's not supposed to |
1170 | * change. | ||
1110 | */ | 1171 | */ |
1111 | set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), | 1172 | pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); |
1112 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); | 1173 | percpu_switcher_page |
1174 | = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; | ||
1175 | get_page(percpu_switcher_page); | ||
1176 | set_pte(pte, mk_pte(percpu_switcher_page, | ||
1177 | __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); | ||
1178 | |||
1179 | pgdir->last_host_cpu = raw_smp_processor_id(); | ||
1113 | } | 1180 | } |
1114 | 1181 | ||
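The base arithmetic here agrees with allocate_switcher_mapping() because struct lguest_pages is exactly two pages: the file uses i * PAGE_SIZE * 2 and raw_smp_processor_id() * sizeof(struct lguest_pages) interchangeably. For host CPU n, a sketch:

	unsigned long regs_addr = switcher_addr + PAGE_SIZE
				  + n * sizeof(struct lguest_pages);
	unsigned long ro_addr   = regs_addr + PAGE_SIZE; /* read-only second page */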
1115 | /* | 1182 | /*H:490 |
1116 | * We've made it through the page table code. Perhaps our tired brains are | 1183 | * We've made it through the page table code. Perhaps our tired brains are |
1117 | * still processing the details, or perhaps we're simply glad it's over. | 1184 | * still processing the details, or perhaps we're simply glad it's over. |
1118 | * | 1185 | * |
@@ -1124,29 +1191,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu, | |||
1124 | * | 1191 | * |
1125 | * There is just one file remaining in the Host. | 1192 | * There is just one file remaining in the Host. |
1126 | */ | 1193 | */ |
1127 | |||
1128 | /*H:510 | ||
1129 | * At boot or module load time, init_pagetables() allocates and populates | ||
1130 | * the Switcher PTE page for each CPU. | ||
1131 | */ | ||
1132 | __init int init_pagetables(struct page **switcher_page, unsigned int pages) | ||
1133 | { | ||
1134 | unsigned int i; | ||
1135 | |||
1136 | for_each_possible_cpu(i) { | ||
1137 | switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); | ||
1138 | if (!switcher_pte_page(i)) { | ||
1139 | free_switcher_pte_pages(); | ||
1140 | return -ENOMEM; | ||
1141 | } | ||
1142 | populate_switcher_pte_page(i, switcher_page, pages); | ||
1143 | } | ||
1144 | return 0; | ||
1145 | } | ||
1146 | /*:*/ | ||
1147 | |||
1148 | /* Cleaning up simply involves freeing the PTE page for each CPU. */ | ||
1149 | void free_pagetables(void) | ||
1150 | { | ||
1151 | free_switcher_pte_pages(); | ||
1152 | } | ||
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 4af12e1844d5..f0a3347b6441 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
@@ -59,14 +59,13 @@ static struct { | |||
59 | /* Offset from where switcher.S was compiled to where we've copied it */ | 59 | /* Offset from where switcher.S was compiled to where we've copied it */ |
60 | static unsigned long switcher_offset(void) | 60 | static unsigned long switcher_offset(void) |
61 | { | 61 | { |
62 | return SWITCHER_ADDR - (unsigned long)start_switcher_text; | 62 | return switcher_addr - (unsigned long)start_switcher_text; |
63 | } | 63 | } |
64 | 64 | ||
65 | /* This cpu's struct lguest_pages. */ | 65 | /* This cpu's struct lguest_pages (after the Switcher text page) */ |
66 | static struct lguest_pages *lguest_pages(unsigned int cpu) | 66 | static struct lguest_pages *lguest_pages(unsigned int cpu) |
67 | { | 67 | { |
68 | return &(((struct lguest_pages *) | 68 | return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]); |
69 | (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); | ||
70 | } | 69 | } |
71 | 70 | ||
72 | static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); | 71 | static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); |
diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig index a966128c2a7a..7ffc756131a2 100644 --- a/drivers/net/caif/Kconfig +++ b/drivers/net/caif/Kconfig | |||
@@ -40,3 +40,17 @@ config CAIF_HSI | |||
40 | The caif low level driver for CAIF over HSI. | 40 | The caif low level driver for CAIF over HSI. |
41 | Be aware that if you enable this then you also need to | 41 | Be aware that if you enable this then you also need to |
42 | enable a low-level HSI driver. | 42 | enable a low-level HSI driver. |
43 | |||
44 | config CAIF_VIRTIO | ||
45 | tristate "CAIF virtio transport driver" | ||
46 | depends on CAIF | ||
47 | select VHOST_RING | ||
48 | select VIRTIO | ||
49 | select GENERIC_ALLOCATOR | ||
50 | default n | ||
51 | ---help--- | ||
52 | The caif driver for CAIF over Virtio. | ||
53 | |||
54 | if CAIF_VIRTIO | ||
55 | source "drivers/vhost/Kconfig" | ||
56 | endif | ||
diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile index 15a9d2fc753d..9bbd45391f6c 100644 --- a/drivers/net/caif/Makefile +++ b/drivers/net/caif/Makefile | |||
@@ -9,3 +9,6 @@ obj-$(CONFIG_CAIF_SPI_SLAVE) += cfspi_slave.o | |||
9 | 9 | ||
10 | # HSI interface | 10 | # HSI interface |
11 | obj-$(CONFIG_CAIF_HSI) += caif_hsi.o | 11 | obj-$(CONFIG_CAIF_HSI) += caif_hsi.o |
12 | |||
13 | # Virtio interface | ||
14 | obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o | ||
diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c new file mode 100644 index 000000000000..b9ed1288ce2d --- /dev/null +++ b/drivers/net/caif/caif_virtio.c | |||
@@ -0,0 +1,790 @@ | |||
1 | /* | ||
2 | * Copyright (C) ST-Ericsson AB 2013 | ||
3 | * Authors: Vicram Arv | ||
4 | * Dmitry Tarnyagin <dmitry.tarnyagin@lockless.no> | ||
5 | * Sjur Brendeland | ||
6 | * License terms: GNU General Public License (GPL) version 2 | ||
7 | */ | ||
8 | #include <linux/module.h> | ||
9 | #include <linux/if_arp.h> | ||
10 | #include <linux/virtio.h> | ||
11 | #include <linux/vringh.h> | ||
12 | #include <linux/debugfs.h> | ||
13 | #include <linux/spinlock.h> | ||
14 | #include <linux/genalloc.h> | ||
15 | #include <linux/interrupt.h> | ||
16 | #include <linux/netdevice.h> | ||
17 | #include <linux/rtnetlink.h> | ||
18 | #include <linux/virtio_ids.h> | ||
19 | #include <linux/virtio_caif.h> | ||
20 | #include <linux/virtio_ring.h> | ||
21 | #include <linux/dma-mapping.h> | ||
22 | #include <net/caif/caif_dev.h> | ||
23 | #include <linux/virtio_config.h> | ||
24 | |||
25 | MODULE_LICENSE("GPL v2"); | ||
26 | MODULE_AUTHOR("Vicram Arv"); | ||
27 | MODULE_AUTHOR("Sjur Brendeland"); | ||
28 | MODULE_DESCRIPTION("Virtio CAIF Driver"); | ||
29 | |||
30 | /* NAPI schedule quota */ | ||
31 | #define CFV_DEFAULT_QUOTA 32 | ||
32 | |||
33 | /* Defaults used if virtio config space is unavailable */ | ||
34 | #define CFV_DEF_MTU_SIZE 4096 | ||
35 | #define CFV_DEF_HEADROOM 32 | ||
36 | #define CFV_DEF_TAILROOM 32 | ||
37 | |||
38 | /* Required IP header alignment */ | ||
39 | #define IP_HDR_ALIGN 4 | ||
40 | |||
41 | /* struct cfv_napi_contxt - NAPI context info | ||
42 | * @riov: IOV holding data read from the ring. Note that riov may | ||
43 | * still hold data when cfv_rx_poll() returns. | ||
44 | * @head: Last descriptor ID we received from vringh_getdesc_kern. | ||
45 | * We use this to put descriptor back on the used ring. USHRT_MAX is | ||
46 | * used to indicate invalid head-id. | ||
47 | */ | ||
48 | struct cfv_napi_context { | ||
49 | struct vringh_kiov riov; | ||
50 | unsigned short head; | ||
51 | }; | ||
52 | |||
53 | /* struct cfv_stats - statistics for debugfs | ||
54 | * @rx_napi_complete: Number of NAPI completions (RX) | ||
55 | * @rx_napi_resched: Number of calls where the full quota was used (RX) | ||
56 | * @rx_nomem: Number of SKB alloc failures (RX) | ||
57 | * @rx_kicks: Number of RX kicks | ||
58 | * @tx_full_ring: Number times TX ring was full | ||
59 | * @tx_no_mem: Number of times TX went out of memory | ||
60 | * @tx_flow_on: Number of flow on (TX) | ||
61 | * @tx_kicks: Number of TX kicks | ||
62 | */ | ||
63 | struct cfv_stats { | ||
64 | u32 rx_napi_complete; | ||
65 | u32 rx_napi_resched; | ||
66 | u32 rx_nomem; | ||
67 | u32 rx_kicks; | ||
68 | u32 tx_full_ring; | ||
69 | u32 tx_no_mem; | ||
70 | u32 tx_flow_on; | ||
71 | u32 tx_kicks; | ||
72 | }; | ||
73 | |||
74 | /* struct cfv_info - Caif Virtio control structure | ||
75 | * @cfdev: caif common header | ||
76 | * @vdev: Associated virtio device | ||
77 | * @vr_rx: rx/downlink host vring | ||
78 | * @vq_tx: tx/uplink virtqueue | ||
79 | * @ndev: CAIF link layer device | ||
80 | * @watermark_tx: number of free descriptors we need | ||
81 | * to reopen the tx queues after overload. | ||
82 | * @tx_lock: protects vq_tx from concurrent use | ||
83 | * @tx_release_tasklet: Tasklet for freeing consumed TX buffers | ||
84 | * @napi: Napi context used in cfv_rx_poll() | ||
85 | * @ctx: Context data used in cfv_rx_poll() | ||
86 | * @tx_hr: transmit headroom | ||
87 | * @rx_hr: receive headroom | ||
88 | * @tx_tr: transmit tail room | ||
89 | * @rx_tr: receive tail room | ||
90 | * @mtu: transmit max size | ||
91 | * @mru: receive max size | ||
92 | * @allocsz: size of dma memory reserved for TX buffers | ||
93 | * @alloc_addr: virtual address to dma memory for TX buffers | ||
94 | * @alloc_dma: dma address to dma memory for TX buffers | ||
95 | * @genpool: Gen Pool used for allocating TX buffers | ||
96 | * @reserved_mem: Address of memory reserve allocated from genpool | ||
97 | * @reserved_size: Size of memory reserve allocated from genpool | ||
98 | * @stats: Statistics exposed in debugfs | ||
99 | * @debugfs: Debugfs dentry for statistic counters | ||
100 | */ | ||
101 | struct cfv_info { | ||
102 | struct caif_dev_common cfdev; | ||
103 | struct virtio_device *vdev; | ||
104 | struct vringh *vr_rx; | ||
105 | struct virtqueue *vq_tx; | ||
106 | struct net_device *ndev; | ||
107 | unsigned int watermark_tx; | ||
108 | /* Protect access to vq_tx */ | ||
109 | spinlock_t tx_lock; | ||
110 | struct tasklet_struct tx_release_tasklet; | ||
111 | struct napi_struct napi; | ||
112 | struct cfv_napi_context ctx; | ||
113 | u16 tx_hr; | ||
114 | u16 rx_hr; | ||
115 | u16 tx_tr; | ||
116 | u16 rx_tr; | ||
117 | u32 mtu; | ||
118 | u32 mru; | ||
119 | size_t allocsz; | ||
120 | void *alloc_addr; | ||
121 | dma_addr_t alloc_dma; | ||
122 | struct gen_pool *genpool; | ||
123 | unsigned long reserved_mem; | ||
124 | size_t reserved_size; | ||
125 | struct cfv_stats stats; | ||
126 | struct dentry *debugfs; | ||
127 | }; | ||
128 | |||
129 | /* struct buf_info - maintains transmit buffer data handle | ||
130 | * @size: size of transmit buffer | ||
132 | * @vaddr: virtual address mapping to allocated memory area | ||
133 | */ | ||
134 | struct buf_info { | ||
135 | size_t size; | ||
136 | u8 *vaddr; | ||
137 | }; | ||
138 | |||
139 | /* Called from virtio device, in IRQ context */ | ||
140 | static void cfv_release_cb(struct virtqueue *vq_tx) | ||
141 | { | ||
142 | struct cfv_info *cfv = vq_tx->vdev->priv; | ||
143 | |||
144 | ++cfv->stats.tx_kicks; | ||
145 | tasklet_schedule(&cfv->tx_release_tasklet); | ||
146 | } | ||
147 | |||
148 | static void free_buf_info(struct cfv_info *cfv, struct buf_info *buf_info) | ||
149 | { | ||
150 | if (!buf_info) | ||
151 | return; | ||
152 | gen_pool_free(cfv->genpool, (unsigned long) buf_info->vaddr, | ||
153 | buf_info->size); | ||
154 | kfree(buf_info); | ||
155 | } | ||
156 | |||
157 | /* This is invoked whenever the remote processor has completed processing | ||
158 | * a TX msg we just sent, and the buffer is put back to the used ring. | ||
159 | */ | ||
160 | static void cfv_release_used_buf(struct virtqueue *vq_tx) | ||
161 | { | ||
162 | struct cfv_info *cfv = vq_tx->vdev->priv; | ||
163 | unsigned long flags; | ||
164 | |||
165 | BUG_ON(vq_tx != cfv->vq_tx); | ||
166 | |||
167 | for (;;) { | ||
168 | unsigned int len; | ||
169 | struct buf_info *buf_info; | ||
170 | |||
171 | /* Get used buffer from used ring to recycle used descriptors */ | ||
172 | spin_lock_irqsave(&cfv->tx_lock, flags); | ||
173 | buf_info = virtqueue_get_buf(vq_tx, &len); | ||
174 | spin_unlock_irqrestore(&cfv->tx_lock, flags); | ||
175 | |||
176 | /* Stop looping if there are no more buffers to free */ | ||
177 | if (!buf_info) | ||
178 | break; | ||
179 | |||
180 | free_buf_info(cfv, buf_info); | ||
181 | |||
182 | /* watermark_tx indicates if we previously stopped the tx | ||
183 | * queues. If we have enough free slots in the virtio ring, | ||
184 | * re-establish the memory reserve and open up the tx queues. | ||
185 | */ | ||
186 | if (cfv->vq_tx->num_free <= cfv->watermark_tx) | ||
187 | continue; | ||
188 | |||
189 | /* Re-establish memory reserve */ | ||
190 | if (cfv->reserved_mem == 0 && cfv->genpool) | ||
191 | cfv->reserved_mem = | ||
192 | gen_pool_alloc(cfv->genpool, | ||
193 | cfv->reserved_size); | ||
194 | |||
195 | /* Open up the tx queues */ | ||
196 | if (cfv->reserved_mem) { | ||
197 | cfv->watermark_tx = | ||
198 | virtqueue_get_vring_size(cfv->vq_tx); | ||
199 | netif_tx_wake_all_queues(cfv->ndev); | ||
200 | /* Buffers are recycled in cfv_netdev_tx, so | ||
201 | * disable notifications when queues are opened. | ||
202 | */ | ||
203 | virtqueue_disable_cb(cfv->vq_tx); | ||
204 | ++cfv->stats.tx_flow_on; | ||
205 | } else { | ||
206 | /* if no memory reserve, wait for more free slots */ | ||
207 | WARN_ON(cfv->watermark_tx > | ||
208 | virtqueue_get_vring_size(cfv->vq_tx)); | ||
209 | cfv->watermark_tx += | ||
210 | virtqueue_get_vring_size(cfv->vq_tx) / 4; | ||
211 | } | ||
212 | } | ||
213 | } | ||
214 | |||
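Condensed, the TX flow control that this function and cfv_netdev_tx() below implement is a two-state machine driven by watermark_tx; a sketch of the two transitions:

	/* Flow OFF (in cfv_netdev_tx): ring nearly full or allocation failed. */
	cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx) / 4;
	virtqueue_enable_cb(cfv->vq_tx);	/* tell us when buffers free up */
	netif_tx_stop_all_queues(netdev);

	/* Flow ON (here): enough free slots and the memory reserve is back. */
	cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx);
	netif_tx_wake_all_queues(cfv->ndev);
	virtqueue_disable_cb(cfv->vq_tx);	/* recycle lazily in the TX path */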
215 | /* Allocate a SKB and copy packet data to it */ | ||
216 | static struct sk_buff *cfv_alloc_and_copy_skb(int *err, | ||
217 | struct cfv_info *cfv, | ||
218 | u8 *frm, u32 frm_len) | ||
219 | { | ||
220 | struct sk_buff *skb; | ||
221 | u32 cfpkt_len, pad_len; | ||
222 | |||
223 | *err = 0; | ||
224 | /* Verify the packet size against the MRU and the down-link head/tail room */ | ||
225 | if (frm_len > cfv->mru || frm_len <= cfv->rx_hr + cfv->rx_tr) { | ||
226 | netdev_err(cfv->ndev, | ||
227 | "Invalid frmlen:%u mtu:%u hr:%d tr:%d\n", | ||
228 | frm_len, cfv->mru, cfv->rx_hr, | ||
229 | cfv->rx_tr); | ||
230 | *err = -EPROTO; | ||
231 | return NULL; | ||
232 | } | ||
233 | |||
234 | cfpkt_len = frm_len - (cfv->rx_hr + cfv->rx_tr); | ||
235 | pad_len = (unsigned long)(frm + cfv->rx_hr) & (IP_HDR_ALIGN - 1); | ||
236 | |||
237 | skb = netdev_alloc_skb(cfv->ndev, frm_len + pad_len); | ||
238 | if (!skb) { | ||
239 | *err = -ENOMEM; | ||
240 | return NULL; | ||
241 | } | ||
242 | |||
243 | skb_reserve(skb, cfv->rx_hr + pad_len); | ||
244 | |||
245 | memcpy(skb_put(skb, cfpkt_len), frm + cfv->rx_hr, cfpkt_len); | ||
246 | return skb; | ||
247 | } | ||
248 | |||
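A worked example of the padding arithmetic (addresses hypothetical): with IP_HDR_ALIGN == 4 and rx_hr == 32, a frame at frm == 0x1002 gives

	pad_len = (0x1002 + 32) & 3 = 0x1022 & 3 = 2

so skb_reserve(skb, 32 + 2) keeps the copy destination at the same 4-byte phase as the source payload (assuming the freshly allocated skb head is 4-byte aligned), and an IP header that was aligned in the source frame stays aligned in the skb.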
249 | /* Get packets from the host vring */ | ||
250 | static int cfv_rx_poll(struct napi_struct *napi, int quota) | ||
251 | { | ||
252 | struct cfv_info *cfv = container_of(napi, struct cfv_info, napi); | ||
253 | int rxcnt = 0; | ||
254 | int err = 0; | ||
255 | void *buf; | ||
256 | struct sk_buff *skb; | ||
257 | struct vringh_kiov *riov = &cfv->ctx.riov; | ||
258 | unsigned int skb_len; | ||
259 | |||
260 | again: | ||
261 | do { | ||
262 | skb = NULL; | ||
263 | |||
264 | /* Put the previous iovec back on the used ring and | ||
265 | * fetch a new iovec if we have processed all elements. | ||
266 | */ | ||
267 | if (riov->i == riov->used) { | ||
268 | if (cfv->ctx.head != USHRT_MAX) { | ||
269 | vringh_complete_kern(cfv->vr_rx, | ||
270 | cfv->ctx.head, | ||
271 | 0); | ||
272 | cfv->ctx.head = USHRT_MAX; | ||
273 | } | ||
274 | |||
275 | err = vringh_getdesc_kern( | ||
276 | cfv->vr_rx, | ||
277 | riov, | ||
278 | NULL, | ||
279 | &cfv->ctx.head, | ||
280 | GFP_ATOMIC); | ||
281 | |||
282 | if (err <= 0) | ||
283 | goto exit; | ||
284 | } | ||
285 | |||
286 | buf = phys_to_virt((unsigned long) riov->iov[riov->i].iov_base); | ||
287 | /* TODO: Add check on valid buffer address */ | ||
288 | |||
289 | skb = cfv_alloc_and_copy_skb(&err, cfv, buf, | ||
290 | riov->iov[riov->i].iov_len); | ||
291 | if (unlikely(err)) | ||
292 | goto exit; | ||
293 | |||
294 | /* Push received packet up the stack. */ | ||
295 | skb_len = skb->len; | ||
296 | skb->protocol = htons(ETH_P_CAIF); | ||
297 | skb_reset_mac_header(skb); | ||
298 | skb->dev = cfv->ndev; | ||
299 | err = netif_receive_skb(skb); | ||
300 | if (unlikely(err)) { | ||
301 | ++cfv->ndev->stats.rx_dropped; | ||
302 | } else { | ||
303 | ++cfv->ndev->stats.rx_packets; | ||
304 | cfv->ndev->stats.rx_bytes += skb_len; | ||
305 | } | ||
306 | |||
307 | ++riov->i; | ||
308 | ++rxcnt; | ||
309 | } while (rxcnt < quota); | ||
310 | |||
311 | ++cfv->stats.rx_napi_resched; | ||
312 | goto out; | ||
313 | |||
314 | exit: | ||
315 | switch (err) { | ||
316 | case 0: | ||
317 | ++cfv->stats.rx_napi_complete; | ||
318 | |||
319 | /* Really out of packets? (stolen from virtio_net) */ | ||
320 | napi_complete(napi); | ||
321 | if (unlikely(!vringh_notify_enable_kern(cfv->vr_rx)) && | ||
322 | napi_schedule_prep(napi)) { | ||
323 | vringh_notify_disable_kern(cfv->vr_rx); | ||
324 | __napi_schedule(napi); | ||
325 | goto again; | ||
326 | } | ||
327 | break; | ||
328 | |||
329 | case -ENOMEM: | ||
330 | ++cfv->stats.rx_nomem; | ||
331 | dev_kfree_skb(skb); | ||
332 | /* Stop NAPI poll on OOM, we hope to be polled later */ | ||
333 | napi_complete(napi); | ||
334 | vringh_notify_enable_kern(cfv->vr_rx); | ||
335 | break; | ||
336 | |||
337 | default: | ||
338 | /* We're doomed, any modem fault is fatal */ | ||
339 | netdev_warn(cfv->ndev, "Bad ring, disable device\n"); | ||
340 | cfv->ndev->stats.rx_dropped = riov->used - riov->i; | ||
341 | napi_complete(napi); | ||
342 | vringh_notify_disable_kern(cfv->vr_rx); | ||
343 | netif_carrier_off(cfv->ndev); | ||
344 | break; | ||
345 | } | ||
346 | out: | ||
347 | if (rxcnt && vringh_need_notify_kern(cfv->vr_rx) > 0) | ||
348 | vringh_notify(cfv->vr_rx); | ||
349 | return rxcnt; | ||
350 | } | ||
351 | |||
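The err == 0 arm above is the classic NAPI race-avoidance pattern the comment credits to virtio_net: after napi_complete(), vringh_notify_enable_kern() returns false if descriptors arrived in the unguarded window, and the poller then re-arms itself rather than risk a lost wakeup. In outline:

	/*
	 * napi_complete(napi);
	 * if (!vringh_notify_enable_kern(vr))	// work slipped in meanwhile
	 *	if (napi_schedule_prep(napi)) {
	 *		vringh_notify_disable_kern(vr);
	 *		__napi_schedule(napi);	// poll again instead of sleeping
	 *	}
	 */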
352 | static void cfv_recv(struct virtio_device *vdev, struct vringh *vr_rx) | ||
353 | { | ||
354 | struct cfv_info *cfv = vdev->priv; | ||
355 | |||
356 | ++cfv->stats.rx_kicks; | ||
357 | vringh_notify_disable_kern(cfv->vr_rx); | ||
358 | napi_schedule(&cfv->napi); | ||
359 | } | ||
360 | |||
361 | static void cfv_destroy_genpool(struct cfv_info *cfv) | ||
362 | { | ||
363 | if (cfv->alloc_addr) | ||
364 | dma_free_coherent(cfv->vdev->dev.parent->parent, | ||
365 | cfv->allocsz, cfv->alloc_addr, | ||
366 | cfv->alloc_dma); | ||
367 | |||
368 | if (!cfv->genpool) | ||
369 | return; | ||
370 | gen_pool_free(cfv->genpool, cfv->reserved_mem, | ||
371 | cfv->reserved_size); | ||
372 | gen_pool_destroy(cfv->genpool); | ||
373 | cfv->genpool = NULL; | ||
374 | } | ||
375 | |||
376 | static int cfv_create_genpool(struct cfv_info *cfv) | ||
377 | { | ||
378 | int err; | ||
379 | |||
380 | /* dma_alloc can only allocate whole pages, and we need a more | ||
381 | * fine-grained allocation, so we use genpool. We ask for space needed | ||
382 | * by IP and a full ring. If the dma allocation fails we retry with a | ||
383 | * smaller allocation size. | ||
384 | */ | ||
385 | err = -ENOMEM; | ||
386 | cfv->allocsz = (virtqueue_get_vring_size(cfv->vq_tx) * | ||
387 | (ETH_DATA_LEN + cfv->tx_hr + cfv->tx_tr) * 11)/10; | ||
388 | if (cfv->allocsz <= (num_possible_cpus() + 1) * cfv->ndev->mtu) | ||
389 | return -EINVAL; | ||
390 | |||
391 | for (;;) { | ||
392 | if (cfv->allocsz <= num_possible_cpus() * cfv->ndev->mtu) { | ||
393 | netdev_info(cfv->ndev, "Not enough device memory\n"); | ||
394 | return -ENOMEM; | ||
395 | } | ||
396 | |||
397 | cfv->alloc_addr = dma_alloc_coherent( | ||
398 | cfv->vdev->dev.parent->parent, | ||
399 | cfv->allocsz, &cfv->alloc_dma, | ||
400 | GFP_ATOMIC); | ||
401 | if (cfv->alloc_addr) | ||
402 | break; | ||
403 | |||
404 | cfv->allocsz = (cfv->allocsz * 3) >> 2; | ||
405 | } | ||
406 | |||
407 | netdev_dbg(cfv->ndev, "Allocated %zd bytes from dma-memory\n", | ||
408 | cfv->allocsz); | ||
409 | |||
410 | /* Allocate on 128-byte boundaries (1 << 7) */ | ||
411 | cfv->genpool = gen_pool_create(7, -1); | ||
412 | if (!cfv->genpool) | ||
413 | goto err; | ||
414 | |||
415 | err = gen_pool_add_virt(cfv->genpool, (unsigned long)cfv->alloc_addr, | ||
416 | (phys_addr_t)virt_to_phys(cfv->alloc_addr), | ||
417 | cfv->allocsz, -1); | ||
418 | if (err) | ||
419 | goto err; | ||
420 | |||
421 | /* Reserve some memory for low memory situations. If we hit the roof | ||
422 | * in the memory pool, we stop TX flow and release the reserve. | ||
423 | */ | ||
424 | cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu; | ||
425 | cfv->reserved_mem = gen_pool_alloc(cfv->genpool, | ||
426 | cfv->reserved_size); | ||
427 | if (!cfv->reserved_mem) { | ||
428 | err = -ENOMEM; | ||
429 | goto err; | ||
430 | } | ||
431 | |||
432 | cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx); | ||
433 | return 0; | ||
434 | err: | ||
435 | cfv_destroy_genpool(cfv); | ||
436 | return err; | ||
437 | } | ||
438 | |||
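A worked example of the sizing above, assuming a hypothetical ring of 256 descriptors and tx_hr == tx_tr == 32 (ETH_DATA_LEN is 1500):

	allocsz = 256 * (1500 + 32 + 32) * 11 / 10
	        = 440422 bytes (about 430KiB)

Each failed dma_alloc_coherent() shrinks the request to three quarters (440422 -> 330316 -> 247737 -> ...) until it would no longer cover num_possible_cpus() MTU-sized buffers, at which point we give up with -ENOMEM.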
439 | /* Enable the CAIF interface and allocate the memory-pool */ | ||
440 | static int cfv_netdev_open(struct net_device *netdev) | ||
441 | { | ||
442 | struct cfv_info *cfv = netdev_priv(netdev); | ||
443 | |||
444 | if (cfv_create_genpool(cfv)) | ||
445 | return -ENOMEM; | ||
446 | |||
447 | netif_carrier_on(netdev); | ||
448 | napi_enable(&cfv->napi); | ||
449 | |||
450 | /* Schedule NAPI to read any pending packets */ | ||
451 | napi_schedule(&cfv->napi); | ||
452 | return 0; | ||
453 | } | ||
454 | |||
455 | /* Disable the CAIF interface and free the memory-pool */ | ||
456 | static int cfv_netdev_close(struct net_device *netdev) | ||
457 | { | ||
458 | struct cfv_info *cfv = netdev_priv(netdev); | ||
459 | unsigned long flags; | ||
460 | struct buf_info *buf_info; | ||
461 | |||
462 | /* Disable interrupts, queues and NAPI polling */ | ||
463 | netif_carrier_off(netdev); | ||
464 | virtqueue_disable_cb(cfv->vq_tx); | ||
465 | vringh_notify_disable_kern(cfv->vr_rx); | ||
466 | napi_disable(&cfv->napi); | ||
467 | |||
468 | /* Release any TX buffers on both used and available rings */ | ||
469 | cfv_release_used_buf(cfv->vq_tx); | ||
470 | spin_lock_irqsave(&cfv->tx_lock, flags); | ||
471 | while ((buf_info = virtqueue_detach_unused_buf(cfv->vq_tx))) | ||
472 | free_buf_info(cfv, buf_info); | ||
473 | spin_unlock_irqrestore(&cfv->tx_lock, flags); | ||
474 | |||
475 | /* Release all dma allocated memory and destroy the pool */ | ||
476 | cfv_destroy_genpool(cfv); | ||
477 | return 0; | ||
478 | } | ||
479 | |||
480 | /* Allocate a buffer in dma-memory and copy skb to it */ | ||
481 | static struct buf_info *cfv_alloc_and_copy_to_shm(struct cfv_info *cfv, | ||
482 | struct sk_buff *skb, | ||
483 | struct scatterlist *sg) | ||
484 | { | ||
485 | struct caif_payload_info *info = (void *)&skb->cb; | ||
486 | struct buf_info *buf_info = NULL; | ||
487 | u8 pad_len, hdr_ofs; | ||
488 | |||
489 | if (!cfv->genpool) | ||
490 | goto err; | ||
491 | |||
492 | if (unlikely(cfv->tx_hr + skb->len + cfv->tx_tr > cfv->mtu)) { | ||
493 | netdev_warn(cfv->ndev, "Invalid packet len (%d > %d)\n", | ||
494 | cfv->tx_hr + skb->len + cfv->tx_tr, cfv->mtu); | ||
495 | goto err; | ||
496 | } | ||
497 | |||
498 | buf_info = kmalloc(sizeof(struct buf_info), GFP_ATOMIC); | ||
499 | if (unlikely(!buf_info)) | ||
500 | goto err; | ||
501 | |||
502 | /* Make the IP header aligned in the buffer */ | ||
503 | hdr_ofs = cfv->tx_hr + info->hdr_len; | ||
504 | pad_len = hdr_ofs & (IP_HDR_ALIGN - 1); | ||
505 | buf_info->size = cfv->tx_hr + skb->len + cfv->tx_tr + pad_len; | ||
506 | |||
507 | /* allocate dma memory buffer */ | ||
508 | buf_info->vaddr = (void *)gen_pool_alloc(cfv->genpool, buf_info->size); | ||
509 | if (unlikely(!buf_info->vaddr)) | ||
510 | goto err; | ||
511 | |||
512 | /* copy skbuf contents to send buffer */ | ||
513 | skb_copy_bits(skb, 0, buf_info->vaddr + cfv->tx_hr + pad_len, skb->len); | ||
514 | sg_init_one(sg, buf_info->vaddr + pad_len, | ||
515 | skb->len + cfv->tx_hr + cfv->tx_tr); | ||
516 | |||
517 | return buf_info; | ||
518 | err: | ||
519 | kfree(buf_info); | ||
520 | return NULL; | ||
521 | } | ||
522 | |||
523 | /* Put the CAIF packet on the virtio ring and kick the receiver */ | ||
524 | static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev) | ||
525 | { | ||
526 | struct cfv_info *cfv = netdev_priv(netdev); | ||
527 | struct buf_info *buf_info; | ||
528 | struct scatterlist sg; | ||
529 | unsigned long flags; | ||
530 | bool flow_off = false; | ||
531 | int ret; | ||
532 | |||
533 | /* garbage collect released buffers */ | ||
534 | cfv_release_used_buf(cfv->vq_tx); | ||
535 | spin_lock_irqsave(&cfv->tx_lock, flags); | ||
536 | |||
537 | /* The flow-off check takes the number of cpus into account to make sure | ||
538 | * the virtqueue will not be overfilled under any possible smp conditions. | ||
539 | * | ||
540 | * Flow-on is triggered when sufficient buffers are freed. | ||
541 | */ | ||
542 | if (unlikely(cfv->vq_tx->num_free <= num_present_cpus())) { | ||
543 | flow_off = true; | ||
544 | cfv->stats.tx_full_ring++; | ||
545 | } | ||
546 | |||
547 | /* If we run out of memory, we release the memory reserve and retry | ||
548 | * allocation. | ||
549 | */ | ||
550 | buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg); | ||
551 | if (unlikely(!buf_info)) { | ||
552 | cfv->stats.tx_no_mem++; | ||
553 | flow_off = true; | ||
554 | |||
555 | if (cfv->reserved_mem && cfv->genpool) { | ||
556 | gen_pool_free(cfv->genpool, cfv->reserved_mem, | ||
557 | cfv->reserved_size); | ||
558 | cfv->reserved_mem = 0; | ||
559 | buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg); | ||
560 | } | ||
561 | } | ||
562 | |||
563 | if (unlikely(flow_off)) { | ||
564 | /* Turn flow on when 1/4 of the descriptors are released */ | ||
565 | cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx) / 4; | ||
566 | /* Enable notifications of recycled TX buffers */ | ||
567 | virtqueue_enable_cb(cfv->vq_tx); | ||
568 | netif_tx_stop_all_queues(netdev); | ||
569 | } | ||
570 | |||
571 | if (unlikely(!buf_info)) { | ||
572 | /* If the memory reserve does its job, this shouldn't happen */ | ||
573 | netdev_warn(cfv->ndev, "Out of gen_pool memory\n"); | ||
574 | goto err; | ||
575 | } | ||
576 | |||
577 | ret = virtqueue_add_outbuf(cfv->vq_tx, &sg, 1, buf_info, GFP_ATOMIC); | ||
578 | if (unlikely((ret < 0))) { | ||
579 | /* If flow control works, this shouldn't happen */ | ||
580 | netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n", | ||
581 | ret); | ||
582 | goto err; | ||
583 | } | ||
584 | |||
585 | /* update netdev statistics */ | ||
586 | cfv->ndev->stats.tx_packets++; | ||
587 | cfv->ndev->stats.tx_bytes += skb->len; | ||
588 | spin_unlock_irqrestore(&cfv->tx_lock, flags); | ||
589 | |||
590 | /* tell the remote processor it has a pending message to read */ | ||
591 | virtqueue_kick(cfv->vq_tx); | ||
592 | |||
593 | dev_kfree_skb(skb); | ||
594 | return NETDEV_TX_OK; | ||
595 | err: | ||
596 | spin_unlock_irqrestore(&cfv->tx_lock, flags); | ||
597 | cfv->ndev->stats.tx_dropped++; | ||
598 | free_buf_info(cfv, buf_info); | ||
599 | dev_kfree_skb(skb); | ||
600 | return NETDEV_TX_OK; | ||
601 | } | ||
602 | |||
603 | static void cfv_tx_release_tasklet(unsigned long drv) | ||
604 | { | ||
605 | struct cfv_info *cfv = (struct cfv_info *)drv; | ||
606 | cfv_release_used_buf(cfv->vq_tx); | ||
607 | } | ||
608 | |||
609 | static const struct net_device_ops cfv_netdev_ops = { | ||
610 | .ndo_open = cfv_netdev_open, | ||
611 | .ndo_stop = cfv_netdev_close, | ||
612 | .ndo_start_xmit = cfv_netdev_tx, | ||
613 | }; | ||
614 | |||
615 | static void cfv_netdev_setup(struct net_device *netdev) | ||
616 | { | ||
617 | netdev->netdev_ops = &cfv_netdev_ops; | ||
618 | netdev->type = ARPHRD_CAIF; | ||
619 | netdev->tx_queue_len = 100; | ||
620 | netdev->flags = IFF_POINTOPOINT | IFF_NOARP; | ||
621 | netdev->mtu = CFV_DEF_MTU_SIZE; | ||
622 | netdev->destructor = free_netdev; | ||
623 | } | ||
624 | |||
625 | /* Create debugfs counters for the device */ | ||
626 | static inline void debugfs_init(struct cfv_info *cfv) | ||
627 | { | ||
628 | cfv->debugfs = | ||
629 | debugfs_create_dir(netdev_name(cfv->ndev), NULL); | ||
630 | |||
631 | if (IS_ERR(cfv->debugfs)) | ||
632 | return; | ||
633 | |||
634 | debugfs_create_u32("rx-napi-complete", S_IRUSR, cfv->debugfs, | ||
635 | &cfv->stats.rx_napi_complete); | ||
636 | debugfs_create_u32("rx-napi-resched", S_IRUSR, cfv->debugfs, | ||
637 | &cfv->stats.rx_napi_resched); | ||
638 | debugfs_create_u32("rx-nomem", S_IRUSR, cfv->debugfs, | ||
639 | &cfv->stats.rx_nomem); | ||
640 | debugfs_create_u32("rx-kicks", S_IRUSR, cfv->debugfs, | ||
641 | &cfv->stats.rx_kicks); | ||
642 | debugfs_create_u32("tx-full-ring", S_IRUSR, cfv->debugfs, | ||
643 | &cfv->stats.tx_full_ring); | ||
644 | debugfs_create_u32("tx-no-mem", S_IRUSR, cfv->debugfs, | ||
645 | &cfv->stats.tx_no_mem); | ||
646 | debugfs_create_u32("tx-kicks", S_IRUSR, cfv->debugfs, | ||
647 | &cfv->stats.tx_kicks); | ||
648 | debugfs_create_u32("tx-flow-on", S_IRUSR, cfv->debugfs, | ||
649 | &cfv->stats.tx_flow_on); | ||
650 | } | ||
651 | |||
652 | /* Set up CAIF for a virtio device */ | ||
653 | static int cfv_probe(struct virtio_device *vdev) | ||
654 | { | ||
655 | vq_callback_t *vq_cbs = cfv_release_cb; | ||
656 | vrh_callback_t *vrh_cbs = cfv_recv; | ||
657 | const char *names = "output"; | ||
658 | const char *cfv_netdev_name = "cfvrt"; | ||
659 | struct net_device *netdev; | ||
660 | struct cfv_info *cfv; | ||
661 | int err = -EINVAL; | ||
662 | |||
663 | netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name, | ||
664 | cfv_netdev_setup); | ||
665 | if (!netdev) | ||
666 | return -ENOMEM; | ||
667 | |||
668 | cfv = netdev_priv(netdev); | ||
669 | cfv->vdev = vdev; | ||
670 | cfv->ndev = netdev; | ||
671 | |||
672 | spin_lock_init(&cfv->tx_lock); | ||
673 | |||
674 | /* Get the RX virtio ring. This is a "host side vring". */ | ||
675 | err = -ENODEV; | ||
676 | if (!vdev->vringh_config || !vdev->vringh_config->find_vrhs) | ||
677 | goto err; | ||
678 | |||
679 | err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs); | ||
680 | if (err) | ||
681 | goto err; | ||
682 | |||
683 | /* Get the TX virtio ring. This is a "guest side vring". */ | ||
684 | err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names); | ||
685 | if (err) | ||
686 | goto err; | ||
687 | |||
688 | /* Get the CAIF configuration from virtio config space, if available */ | ||
689 | #define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \ | ||
690 | ((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \ | ||
691 | &_var, \ | ||
692 | FIELD_SIZEOF(struct virtio_caif_transf_config, _f))) | ||
693 | |||
694 | if (vdev->config->get) { | ||
695 | GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom); | ||
696 | GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom); | ||
697 | GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom); | ||
698 | GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom); | ||
699 | GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu); | ||
700 | GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu); | ||
701 | } else { | ||
702 | cfv->tx_hr = CFV_DEF_HEADROOM; | ||
703 | cfv->rx_hr = CFV_DEF_HEADROOM; | ||
704 | cfv->tx_tr = CFV_DEF_TAILROOM; | ||
705 | cfv->rx_tr = CFV_DEF_TAILROOM; | ||
706 | cfv->mtu = CFV_DEF_MTU_SIZE; | ||
707 | cfv->mru = CFV_DEF_MTU_SIZE; | ||
708 | } | ||
709 | |||
710 | netdev->needed_headroom = cfv->tx_hr; | ||
711 | netdev->needed_tailroom = cfv->tx_tr; | ||
712 | |||
713 | /* Disable buffer release interrupts unless we have stopped TX queues */ | ||
714 | virtqueue_disable_cb(cfv->vq_tx); | ||
715 | |||
716 | netdev->mtu = cfv->mtu - cfv->tx_tr; | ||
717 | vdev->priv = cfv; | ||
718 | |||
719 | /* Initialize NAPI poll context data */ | ||
720 | vringh_kiov_init(&cfv->ctx.riov, NULL, 0); | ||
721 | cfv->ctx.head = USHRT_MAX; | ||
722 | netif_napi_add(netdev, &cfv->napi, cfv_rx_poll, CFV_DEFAULT_QUOTA); | ||
723 | |||
724 | tasklet_init(&cfv->tx_release_tasklet, | ||
725 | cfv_tx_release_tasklet, | ||
726 | (unsigned long)cfv); | ||
727 | |||
728 | /* Carrier is off until netdevice is opened */ | ||
729 | netif_carrier_off(netdev); | ||
730 | |||
731 | /* register Netdev */ | ||
732 | err = register_netdev(netdev); | ||
733 | if (err) { | ||
734 | dev_err(&vdev->dev, "Unable to register netdev (%d)\n", err); | ||
735 | goto err; | ||
736 | } | ||
737 | |||
738 | debugfs_init(cfv); | ||
739 | |||
740 | return 0; | ||
741 | err: | ||
742 | netdev_warn(cfv->ndev, "CAIF Virtio probe failed:%d\n", err); | ||
743 | |||
744 | if (cfv->vr_rx) | ||
745 | vdev->vringh_config->del_vrhs(cfv->vdev); | ||
746 | if (cfv->vdev) | ||
747 | vdev->config->del_vqs(cfv->vdev); | ||
748 | free_netdev(netdev); | ||
749 | return err; | ||
750 | } | ||
751 | |||
752 | static void cfv_remove(struct virtio_device *vdev) | ||
753 | { | ||
754 | struct cfv_info *cfv = vdev->priv; | ||
755 | |||
756 | rtnl_lock(); | ||
757 | dev_close(cfv->ndev); | ||
758 | rtnl_unlock(); | ||
759 | |||
760 | tasklet_kill(&cfv->tx_release_tasklet); | ||
761 | debugfs_remove_recursive(cfv->debugfs); | ||
762 | |||
763 | vringh_kiov_cleanup(&cfv->ctx.riov); | ||
764 | vdev->config->reset(vdev); | ||
765 | vdev->vringh_config->del_vrhs(cfv->vdev); | ||
766 | cfv->vr_rx = NULL; | ||
767 | vdev->config->del_vqs(cfv->vdev); | ||
768 | unregister_netdev(cfv->ndev); | ||
769 | } | ||
770 | |||
771 | static struct virtio_device_id id_table[] = { | ||
772 | { VIRTIO_ID_CAIF, VIRTIO_DEV_ANY_ID }, | ||
773 | { 0 }, | ||
774 | }; | ||
775 | |||
776 | static unsigned int features[] = { | ||
777 | }; | ||
778 | |||
779 | static struct virtio_driver caif_virtio_driver = { | ||
780 | .feature_table = features, | ||
781 | .feature_table_size = ARRAY_SIZE(features), | ||
782 | .driver.name = KBUILD_MODNAME, | ||
783 | .driver.owner = THIS_MODULE, | ||
784 | .id_table = id_table, | ||
785 | .probe = cfv_probe, | ||
786 | .remove = cfv_remove, | ||
787 | }; | ||
788 | |||
789 | module_virtio_driver(caif_virtio_driver); | ||
790 | MODULE_DEVICE_TABLE(virtio, id_table); | ||
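The TX path in cfv_netdev_tx() above combines two flow-off triggers: the vring is left with fewer free descriptors than there are present CPUs (so racing senders cannot overfill it), and shared-memory allocation fails even after the emergency reserve is handed back to the pool. A minimal sketch of that stop/wake shape, reduced to its skeleton (identifiers such as my_priv and watermark are illustrative, not part of the driver):

	/* Sketch of the flow-control pattern used above; not the actual driver. */
	static netdev_tx_t sketch_xmit(struct sk_buff *skb, struct net_device *dev)
	{
		struct my_priv *p = netdev_priv(dev);	/* hypothetical priv struct */
		bool flow_off = false;

		/* Up to one packet per CPU can be in flight past this check,
		 * so demand at least num_present_cpus() free descriptors. */
		if (unlikely(p->vq_tx->num_free <= num_present_cpus()))
			flow_off = true;

		if (unlikely(flow_off)) {
			/* Re-open the queue once 1/4 of the ring is recycled. */
			p->watermark = virtqueue_get_vring_size(p->vq_tx) / 4;
			virtqueue_enable_cb(p->vq_tx);	/* want "buf released" IRQs */
			netif_tx_stop_all_queues(dev);
		}

		/* ...add the buffer, kick, free the skb, as in the code above... */
		return NETDEV_TX_OK;
	}

The release side (cfv_release_used_buf(), via the tasklet above) then wakes the queues and disables the callback again once enough descriptors have been recycled, which is what the tx-flow-on debugfs counter records.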
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 50077753a0e5..3c23fdc27bf0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c | |||
@@ -39,7 +39,6 @@ module_param(gso, bool, 0444); | |||
39 | #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) | 39 | #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) |
40 | #define GOOD_COPY_LEN 128 | 40 | #define GOOD_COPY_LEN 128 |
41 | 41 | ||
42 | #define VIRTNET_SEND_COMMAND_SG_MAX 2 | ||
43 | #define VIRTNET_DRIVER_VERSION "1.0.0" | 42 | #define VIRTNET_DRIVER_VERSION "1.0.0" |
44 | 43 | ||
45 | struct virtnet_stats { | 44 | struct virtnet_stats { |
@@ -444,7 +443,7 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp) | |||
444 | 443 | ||
445 | skb_to_sgvec(skb, rq->sg + 1, 0, skb->len); | 444 | skb_to_sgvec(skb, rq->sg + 1, 0, skb->len); |
446 | 445 | ||
447 | err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp); | 446 | err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp); |
448 | if (err < 0) | 447 | if (err < 0) |
449 | dev_kfree_skb(skb); | 448 | dev_kfree_skb(skb); |
450 | 449 | ||
@@ -489,8 +488,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp) | |||
489 | 488 | ||
490 | /* chain first in list head */ | 489 | /* chain first in list head */ |
491 | first->private = (unsigned long)list; | 490 | first->private = (unsigned long)list; |
492 | err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2, | 491 | err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2, |
493 | first, gfp); | 492 | first, gfp); |
494 | if (err < 0) | 493 | if (err < 0) |
495 | give_pages(rq, first); | 494 | give_pages(rq, first); |
496 | 495 | ||
@@ -508,7 +507,7 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp) | |||
508 | 507 | ||
509 | sg_init_one(rq->sg, page_address(page), PAGE_SIZE); | 508 | sg_init_one(rq->sg, page_address(page), PAGE_SIZE); |
510 | 509 | ||
511 | err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp); | 510 | err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp); |
512 | if (err < 0) | 511 | if (err < 0) |
513 | give_pages(rq, page); | 512 | give_pages(rq, page); |
514 | 513 | ||
@@ -582,7 +581,7 @@ static void refill_work(struct work_struct *work) | |||
582 | bool still_empty; | 581 | bool still_empty; |
583 | int i; | 582 | int i; |
584 | 583 | ||
585 | for (i = 0; i < vi->max_queue_pairs; i++) { | 584 | for (i = 0; i < vi->curr_queue_pairs; i++) { |
586 | struct receive_queue *rq = &vi->rq[i]; | 585 | struct receive_queue *rq = &vi->rq[i]; |
587 | 586 | ||
588 | napi_disable(&rq->napi); | 587 | napi_disable(&rq->napi); |
@@ -637,7 +636,7 @@ static int virtnet_open(struct net_device *dev) | |||
637 | struct virtnet_info *vi = netdev_priv(dev); | 636 | struct virtnet_info *vi = netdev_priv(dev); |
638 | int i; | 637 | int i; |
639 | 638 | ||
640 | for (i = 0; i < vi->max_queue_pairs; i++) { | 639 | for (i = 0; i < vi->curr_queue_pairs; i++) { |
641 | /* Make sure we have some buffers: if oom use wq. */ | 640 | /* Make sure we have some buffers: if oom use wq. */ |
642 | if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) | 641 | if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) |
643 | schedule_delayed_work(&vi->refill, 0); | 642 | schedule_delayed_work(&vi->refill, 0); |
@@ -711,8 +710,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) | |||
711 | sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr); | 710 | sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr); |
712 | 711 | ||
713 | num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1; | 712 | num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1; |
714 | return virtqueue_add_buf(sq->vq, sq->sg, num_sg, | 713 | return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC); |
715 | 0, skb, GFP_ATOMIC); | ||
716 | } | 714 | } |
717 | 715 | ||
718 | static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) | 716 | static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) |
@@ -767,32 +765,35 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) | |||
767 | * never fail unless improperly formatted. | 765 | * never fail unless improperly formatted. |
768 | */ | 766 | */ |
769 | static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, | 767 | static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, |
770 | struct scatterlist *data, int out, int in) | 768 | struct scatterlist *out, |
769 | struct scatterlist *in) | ||
771 | { | 770 | { |
772 | struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; | 771 | struct scatterlist *sgs[4], hdr, stat; |
773 | struct virtio_net_ctrl_hdr ctrl; | 772 | struct virtio_net_ctrl_hdr ctrl; |
774 | virtio_net_ctrl_ack status = ~0; | 773 | virtio_net_ctrl_ack status = ~0; |
775 | unsigned int tmp; | 774 | unsigned out_num = 0, in_num = 0, tmp; |
776 | int i; | ||
777 | 775 | ||
778 | /* Caller should know better */ | 776 | /* Caller should know better */ |
779 | BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) || | 777 | BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)); |
780 | (out + in > VIRTNET_SEND_COMMAND_SG_MAX)); | ||
781 | |||
782 | out++; /* Add header */ | ||
783 | in++; /* Add return status */ | ||
784 | 778 | ||
785 | ctrl.class = class; | 779 | ctrl.class = class; |
786 | ctrl.cmd = cmd; | 780 | ctrl.cmd = cmd; |
781 | /* Add header */ | ||
782 | sg_init_one(&hdr, &ctrl, sizeof(ctrl)); | ||
783 | sgs[out_num++] = &hdr; | ||
787 | 784 | ||
788 | sg_init_table(sg, out + in); | 785 | if (out) |
786 | sgs[out_num++] = out; | ||
787 | if (in) | ||
788 | sgs[out_num + in_num++] = in; | ||
789 | 789 | ||
790 | sg_set_buf(&sg[0], &ctrl, sizeof(ctrl)); | 790 | /* Add return status. */ |
791 | for_each_sg(data, s, out + in - 2, i) | 791 | sg_init_one(&stat, &status, sizeof(status)); |
792 | sg_set_buf(&sg[i + 1], sg_virt(s), s->length); | 792 | sgs[out_num + in_num++] = &stat; |
793 | sg_set_buf(&sg[out + in - 1], &status, sizeof(status)); | ||
794 | 793 | ||
795 | BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0); | 794 | BUG_ON(out_num + in_num > ARRAY_SIZE(sgs)); |
795 | BUG_ON(virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC) | ||
796 | < 0); | ||
796 | 797 | ||
797 | virtqueue_kick(vi->cvq); | 798 | virtqueue_kick(vi->cvq); |
798 | 799 | ||
@@ -821,7 +822,7 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) | |||
821 | sg_init_one(&sg, addr->sa_data, dev->addr_len); | 822 | sg_init_one(&sg, addr->sa_data, dev->addr_len); |
822 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, | 823 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, |
823 | VIRTIO_NET_CTRL_MAC_ADDR_SET, | 824 | VIRTIO_NET_CTRL_MAC_ADDR_SET, |
824 | &sg, 1, 0)) { | 825 | &sg, NULL)) { |
825 | dev_warn(&vdev->dev, | 826 | dev_warn(&vdev->dev, |
826 | "Failed to set mac address by vq command.\n"); | 827 | "Failed to set mac address by vq command.\n"); |
827 | return -EINVAL; | 828 | return -EINVAL; |
@@ -889,8 +890,7 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi) | |||
889 | { | 890 | { |
890 | rtnl_lock(); | 891 | rtnl_lock(); |
891 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE, | 892 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE, |
892 | VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, | 893 | VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, NULL)) |
893 | 0, 0)) | ||
894 | dev_warn(&vi->dev->dev, "Failed to ack link announce.\n"); | 894 | dev_warn(&vi->dev->dev, "Failed to ack link announce.\n"); |
895 | rtnl_unlock(); | 895 | rtnl_unlock(); |
896 | } | 896 | } |
@@ -900,6 +900,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) | |||
900 | struct scatterlist sg; | 900 | struct scatterlist sg; |
901 | struct virtio_net_ctrl_mq s; | 901 | struct virtio_net_ctrl_mq s; |
902 | struct net_device *dev = vi->dev; | 902 | struct net_device *dev = vi->dev; |
903 | int i; | ||
903 | 904 | ||
904 | if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) | 905 | if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) |
905 | return 0; | 906 | return 0; |
@@ -908,12 +909,16 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) | |||
908 | sg_init_one(&sg, &s, sizeof(s)); | 909 | sg_init_one(&sg, &s, sizeof(s)); |
909 | 910 | ||
910 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, | 911 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, |
911 | VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)){ | 912 | VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, NULL)) { |
912 | dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", | 913 | dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", |
913 | queue_pairs); | 914 | queue_pairs); |
914 | return -EINVAL; | 915 | return -EINVAL; |
915 | } else | 916 | } else { |
917 | for (i = vi->curr_queue_pairs; i < queue_pairs; i++) | ||
918 | if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) | ||
919 | schedule_delayed_work(&vi->refill, 0); | ||
916 | vi->curr_queue_pairs = queue_pairs; | 920 | vi->curr_queue_pairs = queue_pairs; |
921 | } | ||
917 | 922 | ||
918 | return 0; | 923 | return 0; |
919 | } | 924 | } |
@@ -955,7 +960,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) | |||
955 | 960 | ||
956 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, | 961 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, |
957 | VIRTIO_NET_CTRL_RX_PROMISC, | 962 | VIRTIO_NET_CTRL_RX_PROMISC, |
958 | sg, 1, 0)) | 963 | sg, NULL)) |
959 | dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", | 964 | dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", |
960 | promisc ? "en" : "dis"); | 965 | promisc ? "en" : "dis"); |
961 | 966 | ||
@@ -963,7 +968,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) | |||
963 | 968 | ||
964 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, | 969 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, |
965 | VIRTIO_NET_CTRL_RX_ALLMULTI, | 970 | VIRTIO_NET_CTRL_RX_ALLMULTI, |
966 | sg, 1, 0)) | 971 | sg, NULL)) |
967 | dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", | 972 | dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", |
968 | allmulti ? "en" : "dis"); | 973 | allmulti ? "en" : "dis"); |
969 | 974 | ||
@@ -1000,7 +1005,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) | |||
1000 | 1005 | ||
1001 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, | 1006 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, |
1002 | VIRTIO_NET_CTRL_MAC_TABLE_SET, | 1007 | VIRTIO_NET_CTRL_MAC_TABLE_SET, |
1003 | sg, 2, 0)) | 1008 | sg, NULL)) |
1004 | dev_warn(&dev->dev, "Failed to set MAC fitler table.\n"); | 1009 | dev_warn(&dev->dev, "Failed to set MAC fitler table.\n"); |
1005 | 1010 | ||
1006 | kfree(buf); | 1011 | kfree(buf); |
@@ -1015,7 +1020,7 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev, | |||
1015 | sg_init_one(&sg, &vid, sizeof(vid)); | 1020 | sg_init_one(&sg, &vid, sizeof(vid)); |
1016 | 1021 | ||
1017 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, | 1022 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, |
1018 | VIRTIO_NET_CTRL_VLAN_ADD, &sg, 1, 0)) | 1023 | VIRTIO_NET_CTRL_VLAN_ADD, &sg, NULL)) |
1019 | dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid); | 1024 | dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid); |
1020 | return 0; | 1025 | return 0; |
1021 | } | 1026 | } |
@@ -1029,7 +1034,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev, | |||
1029 | sg_init_one(&sg, &vid, sizeof(vid)); | 1034 | sg_init_one(&sg, &vid, sizeof(vid)); |
1030 | 1035 | ||
1031 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, | 1036 | if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, |
1032 | VIRTIO_NET_CTRL_VLAN_DEL, &sg, 1, 0)) | 1037 | VIRTIO_NET_CTRL_VLAN_DEL, &sg, NULL)) |
1033 | dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); | 1038 | dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); |
1034 | return 0; | 1039 | return 0; |
1035 | } | 1040 | } |
@@ -1570,7 +1575,7 @@ static int virtnet_probe(struct virtio_device *vdev) | |||
1570 | } | 1575 | } |
1571 | 1576 | ||
1572 | /* Last of all, set up some receive buffers. */ | 1577 | /* Last of all, set up some receive buffers. */ |
1573 | for (i = 0; i < vi->max_queue_pairs; i++) { | 1578 | for (i = 0; i < vi->curr_queue_pairs; i++) { |
1574 | try_fill_recv(&vi->rq[i], GFP_KERNEL); | 1579 | try_fill_recv(&vi->rq[i], GFP_KERNEL); |
1575 | 1580 | ||
1576 | /* If we didn't even get one input buffer, we're useless. */ | 1581 | /* If we didn't even get one input buffer, we're useless. */ |
@@ -1694,7 +1699,7 @@ static int virtnet_restore(struct virtio_device *vdev) | |||
1694 | 1699 | ||
1695 | netif_device_attach(vi->dev); | 1700 | netif_device_attach(vi->dev); |
1696 | 1701 | ||
1697 | for (i = 0; i < vi->max_queue_pairs; i++) | 1702 | for (i = 0; i < vi->curr_queue_pairs; i++) |
1698 | if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) | 1703 | if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) |
1699 | schedule_delayed_work(&vi->refill, 0); | 1704 | schedule_delayed_work(&vi->refill, 0); |
1700 | 1705 | ||
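The virtio_net changes above are representative of the new virtqueue API used throughout this series: instead of one scatterlist array carrying the out entries followed by the in entries for virtqueue_add_buf(), callers pass an array of scatterlist pointers to virtqueue_add_sgs() with separate device-readable and device-writable counts, while virtqueue_add_outbuf()/virtqueue_add_inbuf() cover the common single-direction cases (as in the rpmsg conversion below). A minimal sketch of a command built with the new calls (demo_hdr, demo_send and the buffer names are illustrative, not part of any driver):

	#include <linux/scatterlist.h>
	#include <linux/virtio.h>

	struct demo_hdr { u8 class; u8 cmd; };	/* hypothetical request header */

	static int demo_send(struct virtqueue *vq, struct demo_hdr *h,
			     void *data, size_t len, u8 *ack)
	{
		struct scatterlist hdr, body, status;
		struct scatterlist *sgs[3];
		unsigned int out_num = 0, in_num = 0;

		sg_init_one(&hdr, h, sizeof(*h));
		sgs[out_num++] = &hdr;			/* device-readable */
		sg_init_one(&body, data, len);
		sgs[out_num++] = &body;			/* device-readable */
		sg_init_one(&status, ack, sizeof(*ack));
		sgs[out_num + in_num++] = &status;	/* device-writable */

		/* Readable sgs must precede writable ones in the array. */
		return virtqueue_add_sgs(vq, sgs, out_num, in_num, h, GFP_ATOMIC);
	}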
diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index 7861f1119b7d..56fceafec9ec 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c | |||
@@ -757,14 +757,14 @@ int rpmsg_send_offchannel_raw(struct rpmsg_channel *rpdev, u32 src, u32 dst, | |||
757 | mutex_lock(&vrp->tx_lock); | 757 | mutex_lock(&vrp->tx_lock); |
758 | 758 | ||
759 | /* add message to the remote processor's virtqueue */ | 759 | /* add message to the remote processor's virtqueue */ |
760 | err = virtqueue_add_buf(vrp->svq, &sg, 1, 0, msg, GFP_KERNEL); | 760 | err = virtqueue_add_outbuf(vrp->svq, &sg, 1, msg, GFP_KERNEL); |
761 | if (err) { | 761 | if (err) { |
762 | /* | 762 | /* |
763 | * need to reclaim the buffer here, otherwise it's lost | 763 | * need to reclaim the buffer here, otherwise it's lost |
764 | * (memory won't leak, but rpmsg won't use it again for TX). | 764 | * (memory won't leak, but rpmsg won't use it again for TX). |
765 | * this will wait for a buffer management overhaul. | 765 | * this will wait for a buffer management overhaul. |
766 | */ | 766 | */ |
767 | dev_err(dev, "virtqueue_add_buf failed: %d\n", err); | 767 | dev_err(dev, "virtqueue_add_outbuf failed: %d\n", err); |
768 | goto out; | 768 | goto out; |
769 | } | 769 | } |
770 | 770 | ||
@@ -839,7 +839,7 @@ static void rpmsg_recv_done(struct virtqueue *rvq) | |||
839 | sg_init_one(&sg, msg, RPMSG_BUF_SIZE); | 839 | sg_init_one(&sg, msg, RPMSG_BUF_SIZE); |
840 | 840 | ||
841 | /* add the buffer back to the remote processor's virtqueue */ | 841 | /* add the buffer back to the remote processor's virtqueue */ |
842 | err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, msg, GFP_KERNEL); | 842 | err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, msg, GFP_KERNEL); |
843 | if (err < 0) { | 843 | if (err < 0) { |
844 | dev_err(dev, "failed to add a virtqueue buffer: %d\n", err); | 844 | dev_err(dev, "failed to add a virtqueue buffer: %d\n", err); |
845 | return; | 845 | return; |
@@ -972,7 +972,7 @@ static int rpmsg_probe(struct virtio_device *vdev) | |||
972 | 972 | ||
973 | sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE); | 973 | sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE); |
974 | 974 | ||
975 | err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, cpu_addr, | 975 | err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, cpu_addr, |
976 | GFP_KERNEL); | 976 | GFP_KERNEL); |
977 | WARN_ON(err); /* sanity check; this can't really happen */ | 977 | WARN_ON(err); /* sanity check; this can't really happen */ |
978 | } | 978 | } |
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 3449a1f8c656..2168258fb2c3 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c | |||
@@ -13,6 +13,8 @@ | |||
13 | * | 13 | * |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
17 | |||
16 | #include <linux/module.h> | 18 | #include <linux/module.h> |
17 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
18 | #include <linux/mempool.h> | 20 | #include <linux/mempool.h> |
@@ -20,12 +22,14 @@ | |||
20 | #include <linux/virtio_ids.h> | 22 | #include <linux/virtio_ids.h> |
21 | #include <linux/virtio_config.h> | 23 | #include <linux/virtio_config.h> |
22 | #include <linux/virtio_scsi.h> | 24 | #include <linux/virtio_scsi.h> |
25 | #include <linux/cpu.h> | ||
23 | #include <scsi/scsi_host.h> | 26 | #include <scsi/scsi_host.h> |
24 | #include <scsi/scsi_device.h> | 27 | #include <scsi/scsi_device.h> |
25 | #include <scsi/scsi_cmnd.h> | 28 | #include <scsi/scsi_cmnd.h> |
26 | 29 | ||
27 | #define VIRTIO_SCSI_MEMPOOL_SZ 64 | 30 | #define VIRTIO_SCSI_MEMPOOL_SZ 64 |
28 | #define VIRTIO_SCSI_EVENT_LEN 8 | 31 | #define VIRTIO_SCSI_EVENT_LEN 8 |
32 | #define VIRTIO_SCSI_VQ_BASE 2 | ||
29 | 33 | ||
30 | /* Command queue element */ | 34 | /* Command queue element */ |
31 | struct virtio_scsi_cmd { | 35 | struct virtio_scsi_cmd { |
@@ -57,27 +61,61 @@ struct virtio_scsi_vq { | |||
57 | struct virtqueue *vq; | 61 | struct virtqueue *vq; |
58 | }; | 62 | }; |
59 | 63 | ||
60 | /* Per-target queue state */ | 64 | /* |
65 | * Per-target queue state. | ||
66 | * | ||
67 | * This struct holds the data needed by the queue steering policy. When a | ||
68 | * target is sent multiple requests, we need to drive them to the same queue so | ||
69 | * that FIFO processing order is kept. However, if a target was idle, we can | ||
70 | * choose a queue arbitrarily. In this case the queue is chosen according to | ||
71 | * the current VCPU, so the driver expects the number of request queues to be | ||
72 | * equal to the number of VCPUs. This makes it easy and fast to select the | ||
73 | * queue, and also lets the driver optimize the IRQ affinity for the virtqueues | ||
74 | * (each virtqueue's affinity is set to the CPU that "owns" the queue). | ||
75 | * | ||
76 | * An interesting effect of this policy is that only writes to req_vq need to | ||
77 | * take the tgt_lock. Reads can be done outside the lock because: | ||
78 | * | ||
79 | * - writes of req_vq only occur when atomic_inc_return(&tgt->reqs) returns 1. | ||
80 | * In that case, no other CPU is reading req_vq: even if they were in | ||
81 | * virtscsi_queuecommand_multi, they would be spinning on tgt_lock. | ||
82 | * | ||
83 | * - reads of req_vq only occur when the target is not idle (reqs != 0). | ||
84 | * A CPU that enters virtscsi_queuecommand_multi will not modify req_vq. | ||
85 | * | ||
86 | * Similarly, decrements of reqs are never concurrent with writes of req_vq. | ||
87 | * Thus they can happen outside the tgt_lock, provided of course we make reqs | ||
88 | * an atomic_t. | ||
89 | */ | ||
61 | struct virtio_scsi_target_state { | 90 | struct virtio_scsi_target_state { |
62 | /* Protects sg. Lock hierarchy is tgt_lock -> vq_lock. */ | 91 | /* This spinlock never held at the same time as vq_lock. */ |
63 | spinlock_t tgt_lock; | 92 | spinlock_t tgt_lock; |
64 | 93 | ||
65 | /* For sglist construction when adding commands to the virtqueue. */ | 94 | /* Count of outstanding requests. */ |
66 | struct scatterlist sg[]; | 95 | atomic_t reqs; |
96 | |||
97 | /* Currently active virtqueue for requests sent to this target. */ | ||
98 | struct virtio_scsi_vq *req_vq; | ||
67 | }; | 99 | }; |
68 | 100 | ||
69 | /* Driver instance state */ | 101 | /* Driver instance state */ |
70 | struct virtio_scsi { | 102 | struct virtio_scsi { |
71 | struct virtio_device *vdev; | 103 | struct virtio_device *vdev; |
72 | 104 | ||
73 | struct virtio_scsi_vq ctrl_vq; | ||
74 | struct virtio_scsi_vq event_vq; | ||
75 | struct virtio_scsi_vq req_vq; | ||
76 | |||
77 | /* Get some buffers ready for event vq */ | 105 | /* Get some buffers ready for event vq */ |
78 | struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN]; | 106 | struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN]; |
79 | 107 | ||
80 | struct virtio_scsi_target_state *tgt[]; | 108 | u32 num_queues; |
109 | |||
110 | /* If the affinity hint is set for virtqueues */ | ||
111 | bool affinity_hint_set; | ||
112 | |||
113 | /* CPU hotplug notifier */ | ||
114 | struct notifier_block nb; | ||
115 | |||
116 | struct virtio_scsi_vq ctrl_vq; | ||
117 | struct virtio_scsi_vq event_vq; | ||
118 | struct virtio_scsi_vq req_vqs[]; | ||
81 | }; | 119 | }; |
82 | 120 | ||
83 | static struct kmem_cache *virtscsi_cmd_cache; | 121 | static struct kmem_cache *virtscsi_cmd_cache; |
@@ -107,11 +145,13 @@ static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid) | |||
107 | * | 145 | * |
108 | * Called with vq_lock held. | 146 | * Called with vq_lock held. |
109 | */ | 147 | */ |
110 | static void virtscsi_complete_cmd(void *buf) | 148 | static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf) |
111 | { | 149 | { |
112 | struct virtio_scsi_cmd *cmd = buf; | 150 | struct virtio_scsi_cmd *cmd = buf; |
113 | struct scsi_cmnd *sc = cmd->sc; | 151 | struct scsi_cmnd *sc = cmd->sc; |
114 | struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd; | 152 | struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd; |
153 | struct virtio_scsi_target_state *tgt = | ||
154 | scsi_target(sc->device)->hostdata; | ||
115 | 155 | ||
116 | dev_dbg(&sc->device->sdev_gendev, | 156 | dev_dbg(&sc->device->sdev_gendev, |
117 | "cmd %p response %u status %#02x sense_len %u\n", | 157 | "cmd %p response %u status %#02x sense_len %u\n", |
@@ -166,32 +206,71 @@ static void virtscsi_complete_cmd(void *buf) | |||
166 | 206 | ||
167 | mempool_free(cmd, virtscsi_cmd_pool); | 207 | mempool_free(cmd, virtscsi_cmd_pool); |
168 | sc->scsi_done(sc); | 208 | sc->scsi_done(sc); |
209 | |||
210 | atomic_dec(&tgt->reqs); | ||
169 | } | 211 | } |
170 | 212 | ||
171 | static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf)) | 213 | static void virtscsi_vq_done(struct virtio_scsi *vscsi, |
214 | struct virtio_scsi_vq *virtscsi_vq, | ||
215 | void (*fn)(struct virtio_scsi *vscsi, void *buf)) | ||
172 | { | 216 | { |
173 | void *buf; | 217 | void *buf; |
174 | unsigned int len; | 218 | unsigned int len; |
219 | unsigned long flags; | ||
220 | struct virtqueue *vq = virtscsi_vq->vq; | ||
175 | 221 | ||
222 | spin_lock_irqsave(&virtscsi_vq->vq_lock, flags); | ||
176 | do { | 223 | do { |
177 | virtqueue_disable_cb(vq); | 224 | virtqueue_disable_cb(vq); |
178 | while ((buf = virtqueue_get_buf(vq, &len)) != NULL) | 225 | while ((buf = virtqueue_get_buf(vq, &len)) != NULL) |
179 | fn(buf); | 226 | fn(vscsi, buf); |
180 | } while (!virtqueue_enable_cb(vq)); | 227 | } while (!virtqueue_enable_cb(vq)); |
228 | spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags); | ||
181 | } | 229 | } |
182 | 230 | ||
183 | static void virtscsi_req_done(struct virtqueue *vq) | 231 | static void virtscsi_req_done(struct virtqueue *vq) |
184 | { | 232 | { |
185 | struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); | 233 | struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); |
186 | struct virtio_scsi *vscsi = shost_priv(sh); | 234 | struct virtio_scsi *vscsi = shost_priv(sh); |
187 | unsigned long flags; | 235 | int index = vq->index - VIRTIO_SCSI_VQ_BASE; |
236 | struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index]; | ||
188 | 237 | ||
189 | spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags); | 238 | /* |
190 | virtscsi_vq_done(vq, virtscsi_complete_cmd); | 239 | * Read req_vq before decrementing the reqs field in |
191 | spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags); | 240 | * virtscsi_complete_cmd. |
241 | * | ||
242 | * With barriers: | ||
243 | * | ||
244 | * CPU #0 virtscsi_queuecommand_multi (CPU #1) | ||
245 | * ------------------------------------------------------------ | ||
246 | * lock vq_lock | ||
247 | * read req_vq | ||
248 | * read reqs (reqs = 1) | ||
249 | * write reqs (reqs = 0) | ||
250 | * increment reqs (reqs = 1) | ||
251 | * write req_vq | ||
252 | * | ||
253 | * Possible reordering without barriers: | ||
254 | * | ||
255 | * CPU #0 virtscsi_queuecommand_multi (CPU #1) | ||
256 | * ------------------------------------------------------------ | ||
257 | * lock vq_lock | ||
258 | * read reqs (reqs = 1) | ||
259 | * write reqs (reqs = 0) | ||
260 | * increment reqs (reqs = 1) | ||
261 | * write req_vq | ||
262 | * read (wrong) req_vq | ||
263 | * | ||
264 | * We do not need a full smp_rmb, because req_vq is required to get | ||
265 | * to tgt->reqs: tgt is &vscsi->tgt[sc->device->id], where sc is stored | ||
266 | * in the virtqueue as the user token. | ||
267 | */ | ||
268 | smp_read_barrier_depends(); | ||
269 | |||
270 | virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd); | ||
192 | }; | 271 | }; |
193 | 272 | ||
194 | static void virtscsi_complete_free(void *buf) | 273 | static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf) |
195 | { | 274 | { |
196 | struct virtio_scsi_cmd *cmd = buf; | 275 | struct virtio_scsi_cmd *cmd = buf; |
197 | 276 | ||
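Taken together, the per-target comment and the barrier diagram above describe a small ownership protocol around tgt->reqs and tgt->req_vq: only the CPU that takes the target from idle (atomic_inc_return() returning 1) may write req_vq, and a completion must read req_vq before its atomic_dec() can let another CPU rewrite it. Condensed into one place, as a sketch of the logic in virtscsi_pick_vq() and virtscsi_req_done() below (pick_queue_for_this_cpu is a hypothetical helper, not a real function):

	/* Submission side: runs under tgt->tgt_lock. */
	if (atomic_inc_return(&tgt->reqs) > 1)		/* target busy:     */
		vq = ACCESS_ONCE(tgt->req_vq);		/*  keep its queue  */
	else						/* target was idle: */
		tgt->req_vq = vq = pick_queue_for_this_cpu(vscsi);

	/* Completion side: req_vq is reached through the command token
	 * (a data dependency), so a full smp_rmb() is not needed;
	 * smp_read_barrier_depends() orders the read before the decrement. */
	smp_read_barrier_depends();
	/* ... complete the SCSI command ... */
	atomic_dec(&tgt->reqs);				/* may re-idle the target */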
@@ -205,11 +284,8 @@ static void virtscsi_ctrl_done(struct virtqueue *vq) | |||
205 | { | 284 | { |
206 | struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); | 285 | struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); |
207 | struct virtio_scsi *vscsi = shost_priv(sh); | 286 | struct virtio_scsi *vscsi = shost_priv(sh); |
208 | unsigned long flags; | ||
209 | 287 | ||
210 | spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags); | 288 | virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free); |
211 | virtscsi_vq_done(vq, virtscsi_complete_free); | ||
212 | spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags); | ||
213 | }; | 289 | }; |
214 | 290 | ||
215 | static int virtscsi_kick_event(struct virtio_scsi *vscsi, | 291 | static int virtscsi_kick_event(struct virtio_scsi *vscsi, |
@@ -223,8 +299,8 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi, | |||
223 | 299 | ||
224 | spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); | 300 | spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); |
225 | 301 | ||
226 | err = virtqueue_add_buf(vscsi->event_vq.vq, &sg, 0, 1, event_node, | 302 | err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node, |
227 | GFP_ATOMIC); | 303 | GFP_ATOMIC); |
228 | if (!err) | 304 | if (!err) |
229 | virtqueue_kick(vscsi->event_vq.vq); | 305 | virtqueue_kick(vscsi->event_vq.vq); |
230 | 306 | ||
@@ -254,7 +330,7 @@ static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi) | |||
254 | } | 330 | } |
255 | 331 | ||
256 | static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi, | 332 | static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi, |
257 | struct virtio_scsi_event *event) | 333 | struct virtio_scsi_event *event) |
258 | { | 334 | { |
259 | struct scsi_device *sdev; | 335 | struct scsi_device *sdev; |
260 | struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); | 336 | struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); |
@@ -332,7 +408,7 @@ static void virtscsi_handle_event(struct work_struct *work) | |||
332 | virtscsi_kick_event(vscsi, event_node); | 408 | virtscsi_kick_event(vscsi, event_node); |
333 | } | 409 | } |
334 | 410 | ||
335 | static void virtscsi_complete_event(void *buf) | 411 | static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf) |
336 | { | 412 | { |
337 | struct virtio_scsi_event_node *event_node = buf; | 413 | struct virtio_scsi_event_node *event_node = buf; |
338 | 414 | ||
@@ -344,82 +420,65 @@ static void virtscsi_event_done(struct virtqueue *vq) | |||
344 | { | 420 | { |
345 | struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); | 421 | struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); |
346 | struct virtio_scsi *vscsi = shost_priv(sh); | 422 | struct virtio_scsi *vscsi = shost_priv(sh); |
347 | unsigned long flags; | ||
348 | 423 | ||
349 | spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); | 424 | virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event); |
350 | virtscsi_vq_done(vq, virtscsi_complete_event); | ||
351 | spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags); | ||
352 | }; | 425 | }; |
353 | 426 | ||
354 | static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx, | ||
355 | struct scsi_data_buffer *sdb) | ||
356 | { | ||
357 | struct sg_table *table = &sdb->table; | ||
358 | struct scatterlist *sg_elem; | ||
359 | unsigned int idx = *p_idx; | ||
360 | int i; | ||
361 | |||
362 | for_each_sg(table->sgl, sg_elem, table->nents, i) | ||
363 | sg[idx++] = *sg_elem; | ||
364 | |||
365 | *p_idx = idx; | ||
366 | } | ||
367 | |||
368 | /** | 427 | /** |
369 | * virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist | 428 | * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue |
370 | * @vscsi : virtio_scsi state | 429 | * @vq : the struct virtqueue we're talking about |
371 | * @cmd : command structure | 430 | * @cmd : command structure |
372 | * @out_num : number of read-only elements | ||
373 | * @in_num : number of write-only elements | ||
374 | * @req_size : size of the request buffer | 431 | * @req_size : size of the request buffer |
375 | * @resp_size : size of the response buffer | 432 | * @resp_size : size of the response buffer |
376 | * | 433 | * @gfp : flags to use for memory allocations |
377 | * Called with tgt_lock held. | ||
378 | */ | 434 | */ |
379 | static void virtscsi_map_cmd(struct virtio_scsi_target_state *tgt, | 435 | static int virtscsi_add_cmd(struct virtqueue *vq, |
380 | struct virtio_scsi_cmd *cmd, | 436 | struct virtio_scsi_cmd *cmd, |
381 | unsigned *out_num, unsigned *in_num, | 437 | size_t req_size, size_t resp_size, gfp_t gfp) |
382 | size_t req_size, size_t resp_size) | ||
383 | { | 438 | { |
384 | struct scsi_cmnd *sc = cmd->sc; | 439 | struct scsi_cmnd *sc = cmd->sc; |
385 | struct scatterlist *sg = tgt->sg; | 440 | struct scatterlist *sgs[4], req, resp; |
386 | unsigned int idx = 0; | 441 | struct sg_table *out, *in; |
442 | unsigned out_num = 0, in_num = 0; | ||
443 | |||
444 | out = in = NULL; | ||
445 | |||
446 | if (sc && sc->sc_data_direction != DMA_NONE) { | ||
447 | if (sc->sc_data_direction != DMA_FROM_DEVICE) | ||
448 | out = &scsi_out(sc)->table; | ||
449 | if (sc->sc_data_direction != DMA_TO_DEVICE) | ||
450 | in = &scsi_in(sc)->table; | ||
451 | } | ||
387 | 452 | ||
388 | /* Request header. */ | 453 | /* Request header. */ |
389 | sg_set_buf(&sg[idx++], &cmd->req, req_size); | 454 | sg_init_one(&req, &cmd->req, req_size); |
455 | sgs[out_num++] = &req; | ||
390 | 456 | ||
391 | /* Data-out buffer. */ | 457 | /* Data-out buffer. */ |
392 | if (sc && sc->sc_data_direction != DMA_FROM_DEVICE) | 458 | if (out) |
393 | virtscsi_map_sgl(sg, &idx, scsi_out(sc)); | 459 | sgs[out_num++] = out->sgl; |
394 | |||
395 | *out_num = idx; | ||
396 | 460 | ||
397 | /* Response header. */ | 461 | /* Response header. */ |
398 | sg_set_buf(&sg[idx++], &cmd->resp, resp_size); | 462 | sg_init_one(&resp, &cmd->resp, resp_size); |
463 | sgs[out_num + in_num++] = &resp; | ||
399 | 464 | ||
400 | /* Data-in buffer */ | 465 | /* Data-in buffer */ |
401 | if (sc && sc->sc_data_direction != DMA_TO_DEVICE) | 466 | if (in) |
402 | virtscsi_map_sgl(sg, &idx, scsi_in(sc)); | 467 | sgs[out_num + in_num++] = in->sgl; |
403 | 468 | ||
404 | *in_num = idx - *out_num; | 469 | return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, gfp); |
405 | } | 470 | } |
406 | 471 | ||
407 | static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt, | 472 | static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq, |
408 | struct virtio_scsi_vq *vq, | ||
409 | struct virtio_scsi_cmd *cmd, | 473 | struct virtio_scsi_cmd *cmd, |
410 | size_t req_size, size_t resp_size, gfp_t gfp) | 474 | size_t req_size, size_t resp_size, gfp_t gfp) |
411 | { | 475 | { |
412 | unsigned int out_num, in_num; | ||
413 | unsigned long flags; | 476 | unsigned long flags; |
414 | int err; | 477 | int err; |
415 | bool needs_kick = false; | 478 | bool needs_kick = false; |
416 | 479 | ||
417 | spin_lock_irqsave(&tgt->tgt_lock, flags); | 480 | spin_lock_irqsave(&vq->vq_lock, flags); |
418 | virtscsi_map_cmd(tgt, cmd, &out_num, &in_num, req_size, resp_size); | 481 | err = virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size, gfp); |
419 | |||
420 | spin_lock(&vq->vq_lock); | ||
421 | err = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd, gfp); | ||
422 | spin_unlock(&tgt->tgt_lock); | ||
423 | if (!err) | 482 | if (!err) |
424 | needs_kick = virtqueue_kick_prepare(vq->vq); | 483 | needs_kick = virtqueue_kick_prepare(vq->vq); |
425 | 484 | ||
@@ -430,10 +489,10 @@ static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt, | |||
430 | return err; | 489 | return err; |
431 | } | 490 | } |
432 | 491 | ||
433 | static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) | 492 | static int virtscsi_queuecommand(struct virtio_scsi *vscsi, |
493 | struct virtio_scsi_vq *req_vq, | ||
494 | struct scsi_cmnd *sc) | ||
434 | { | 495 | { |
435 | struct virtio_scsi *vscsi = shost_priv(sh); | ||
436 | struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id]; | ||
437 | struct virtio_scsi_cmd *cmd; | 496 | struct virtio_scsi_cmd *cmd; |
438 | int ret; | 497 | int ret; |
439 | 498 | ||
@@ -467,7 +526,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) | |||
467 | BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE); | 526 | BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE); |
468 | memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len); | 527 | memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len); |
469 | 528 | ||
470 | if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd, | 529 | if (virtscsi_kick_cmd(req_vq, cmd, |
471 | sizeof cmd->req.cmd, sizeof cmd->resp.cmd, | 530 | sizeof cmd->req.cmd, sizeof cmd->resp.cmd, |
472 | GFP_ATOMIC) == 0) | 531 | GFP_ATOMIC) == 0) |
473 | ret = 0; | 532 | ret = 0; |
@@ -478,14 +537,62 @@ out: | |||
478 | return ret; | 537 | return ret; |
479 | } | 538 | } |
480 | 539 | ||
540 | static int virtscsi_queuecommand_single(struct Scsi_Host *sh, | ||
541 | struct scsi_cmnd *sc) | ||
542 | { | ||
543 | struct virtio_scsi *vscsi = shost_priv(sh); | ||
544 | struct virtio_scsi_target_state *tgt = | ||
545 | scsi_target(sc->device)->hostdata; | ||
546 | |||
547 | atomic_inc(&tgt->reqs); | ||
548 | return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc); | ||
549 | } | ||
550 | |||
551 | static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi, | ||
552 | struct virtio_scsi_target_state *tgt) | ||
553 | { | ||
554 | struct virtio_scsi_vq *vq; | ||
555 | unsigned long flags; | ||
556 | u32 queue_num; | ||
557 | |||
558 | spin_lock_irqsave(&tgt->tgt_lock, flags); | ||
559 | |||
560 | /* | ||
561 | * The memory barrier after atomic_inc_return matches | ||
562 | * the smp_read_barrier_depends() in virtscsi_req_done. | ||
563 | */ | ||
564 | if (atomic_inc_return(&tgt->reqs) > 1) | ||
565 | vq = ACCESS_ONCE(tgt->req_vq); | ||
566 | else { | ||
567 | queue_num = smp_processor_id(); | ||
568 | while (unlikely(queue_num >= vscsi->num_queues)) | ||
569 | queue_num -= vscsi->num_queues; | ||
570 | |||
571 | tgt->req_vq = vq = &vscsi->req_vqs[queue_num]; | ||
572 | } | ||
573 | |||
574 | spin_unlock_irqrestore(&tgt->tgt_lock, flags); | ||
575 | return vq; | ||
576 | } | ||
577 | |||
578 | static int virtscsi_queuecommand_multi(struct Scsi_Host *sh, | ||
579 | struct scsi_cmnd *sc) | ||
580 | { | ||
581 | struct virtio_scsi *vscsi = shost_priv(sh); | ||
582 | struct virtio_scsi_target_state *tgt = | ||
583 | scsi_target(sc->device)->hostdata; | ||
584 | struct virtio_scsi_vq *req_vq = virtscsi_pick_vq(vscsi, tgt); | ||
585 | |||
586 | return virtscsi_queuecommand(vscsi, req_vq, sc); | ||
587 | } | ||
588 | |||
481 | static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd) | 589 | static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd) |
482 | { | 590 | { |
483 | DECLARE_COMPLETION_ONSTACK(comp); | 591 | DECLARE_COMPLETION_ONSTACK(comp); |
484 | struct virtio_scsi_target_state *tgt = vscsi->tgt[cmd->sc->device->id]; | ||
485 | int ret = FAILED; | 592 | int ret = FAILED; |
486 | 593 | ||
487 | cmd->comp = ∁ | 594 | cmd->comp = ∁ |
488 | if (virtscsi_kick_cmd(tgt, &vscsi->ctrl_vq, cmd, | 595 | if (virtscsi_kick_cmd(&vscsi->ctrl_vq, cmd, |
489 | sizeof cmd->req.tmf, sizeof cmd->resp.tmf, | 596 | sizeof cmd->req.tmf, sizeof cmd->resp.tmf, |
490 | GFP_NOIO) < 0) | 597 | GFP_NOIO) < 0) |
491 | goto out; | 598 | goto out; |
@@ -547,18 +654,57 @@ static int virtscsi_abort(struct scsi_cmnd *sc) | |||
547 | return virtscsi_tmf(vscsi, cmd); | 654 | return virtscsi_tmf(vscsi, cmd); |
548 | } | 655 | } |
549 | 656 | ||
550 | static struct scsi_host_template virtscsi_host_template = { | 657 | static int virtscsi_target_alloc(struct scsi_target *starget) |
658 | { | ||
659 | struct virtio_scsi_target_state *tgt = | ||
660 | kmalloc(sizeof(*tgt), GFP_KERNEL); | ||
661 | if (!tgt) | ||
662 | return -ENOMEM; | ||
663 | |||
664 | spin_lock_init(&tgt->tgt_lock); | ||
665 | atomic_set(&tgt->reqs, 0); | ||
666 | tgt->req_vq = NULL; | ||
667 | |||
668 | starget->hostdata = tgt; | ||
669 | return 0; | ||
670 | } | ||
671 | |||
672 | static void virtscsi_target_destroy(struct scsi_target *starget) | ||
673 | { | ||
674 | struct virtio_scsi_target_state *tgt = starget->hostdata; | ||
675 | kfree(tgt); | ||
676 | } | ||
677 | |||
678 | static struct scsi_host_template virtscsi_host_template_single = { | ||
679 | .module = THIS_MODULE, | ||
680 | .name = "Virtio SCSI HBA", | ||
681 | .proc_name = "virtio_scsi", | ||
682 | .this_id = -1, | ||
683 | .queuecommand = virtscsi_queuecommand_single, | ||
684 | .eh_abort_handler = virtscsi_abort, | ||
685 | .eh_device_reset_handler = virtscsi_device_reset, | ||
686 | |||
687 | .can_queue = 1024, | ||
688 | .dma_boundary = UINT_MAX, | ||
689 | .use_clustering = ENABLE_CLUSTERING, | ||
690 | .target_alloc = virtscsi_target_alloc, | ||
691 | .target_destroy = virtscsi_target_destroy, | ||
692 | }; | ||
693 | |||
694 | static struct scsi_host_template virtscsi_host_template_multi = { | ||
551 | .module = THIS_MODULE, | 695 | .module = THIS_MODULE, |
552 | .name = "Virtio SCSI HBA", | 696 | .name = "Virtio SCSI HBA", |
553 | .proc_name = "virtio_scsi", | 697 | .proc_name = "virtio_scsi", |
554 | .queuecommand = virtscsi_queuecommand, | ||
555 | .this_id = -1, | 698 | .this_id = -1, |
699 | .queuecommand = virtscsi_queuecommand_multi, | ||
556 | .eh_abort_handler = virtscsi_abort, | 700 | .eh_abort_handler = virtscsi_abort, |
557 | .eh_device_reset_handler = virtscsi_device_reset, | 701 | .eh_device_reset_handler = virtscsi_device_reset, |
558 | 702 | ||
559 | .can_queue = 1024, | 703 | .can_queue = 1024, |
560 | .dma_boundary = UINT_MAX, | 704 | .dma_boundary = UINT_MAX, |
561 | .use_clustering = ENABLE_CLUSTERING, | 705 | .use_clustering = ENABLE_CLUSTERING, |
706 | .target_alloc = virtscsi_target_alloc, | ||
707 | .target_destroy = virtscsi_target_destroy, | ||
562 | }; | 708 | }; |
563 | 709 | ||
564 | #define virtscsi_config_get(vdev, fld) \ | 710 | #define virtscsi_config_get(vdev, fld) \ |
@@ -578,29 +724,69 @@ static struct scsi_host_template virtscsi_host_template = { | |||
578 | &__val, sizeof(__val)); \ | 724 | &__val, sizeof(__val)); \ |
579 | }) | 725 | }) |
580 | 726 | ||
581 | static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, | 727 | static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity) |
582 | struct virtqueue *vq) | ||
583 | { | 728 | { |
584 | spin_lock_init(&virtscsi_vq->vq_lock); | 729 | int i; |
585 | virtscsi_vq->vq = vq; | 730 | int cpu; |
731 | |||
732 | /* In multiqueue mode, when the number of CPUs is equal | ||
733 | * to the number of request queues, we make each queue | ||
734 | * private to one CPU by setting the affinity hint, | ||
735 | * which eliminates contention. | ||
736 | */ | ||
737 | if ((vscsi->num_queues == 1 || | ||
738 | vscsi->num_queues != num_online_cpus()) && affinity) { | ||
739 | if (vscsi->affinity_hint_set) | ||
740 | affinity = false; | ||
741 | else | ||
742 | return; | ||
743 | } | ||
744 | |||
745 | if (affinity) { | ||
746 | i = 0; | ||
747 | for_each_online_cpu(cpu) { | ||
748 | virtqueue_set_affinity(vscsi->req_vqs[i].vq, cpu); | ||
749 | i++; | ||
750 | } | ||
751 | |||
752 | vscsi->affinity_hint_set = true; | ||
753 | } else { | ||
754 | for (i = 0; i < vscsi->num_queues - VIRTIO_SCSI_VQ_BASE; i++) | ||
755 | virtqueue_set_affinity(vscsi->req_vqs[i].vq, -1); | ||
756 | |||
757 | vscsi->affinity_hint_set = false; | ||
758 | } | ||
586 | } | 759 | } |
587 | 760 | ||
588 | static struct virtio_scsi_target_state *virtscsi_alloc_tgt( | 761 | static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity) |
589 | struct virtio_device *vdev, int sg_elems) | ||
590 | { | 762 | { |
591 | struct virtio_scsi_target_state *tgt; | 763 | get_online_cpus(); |
592 | gfp_t gfp_mask = GFP_KERNEL; | 764 | __virtscsi_set_affinity(vscsi, affinity); |
593 | 765 | put_online_cpus(); | |
594 | /* We need extra sg elements at head and tail. */ | 766 | } |
595 | tgt = kmalloc(sizeof(*tgt) + sizeof(tgt->sg[0]) * (sg_elems + 2), | ||
596 | gfp_mask); | ||
597 | 767 | ||
598 | if (!tgt) | 768 | static int virtscsi_cpu_callback(struct notifier_block *nfb, |
599 | return NULL; | 769 | unsigned long action, void *hcpu) |
770 | { | ||
771 | struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb); | ||
772 | switch (action) { | ||
773 | case CPU_ONLINE: | ||
774 | case CPU_ONLINE_FROZEN: | ||
775 | case CPU_DEAD: | ||
776 | case CPU_DEAD_FROZEN: | ||
777 | __virtscsi_set_affinity(vscsi, true); | ||
778 | break; | ||
779 | default: | ||
780 | break; | ||
781 | } | ||
782 | return NOTIFY_OK; | ||
783 | } | ||
600 | 784 | ||
601 | spin_lock_init(&tgt->tgt_lock); | 785 | static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, |
602 | sg_init_table(tgt->sg, sg_elems + 2); | 786 | struct virtqueue *vq) |
603 | return tgt; | 787 | { |
788 | spin_lock_init(&virtscsi_vq->vq_lock); | ||
789 | virtscsi_vq->vq = vq; | ||
604 | } | 790 | } |
605 | 791 | ||
606 | static void virtscsi_scan(struct virtio_device *vdev) | 792 | static void virtscsi_scan(struct virtio_device *vdev) |
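The affinity code above is one half of the per-CPU queue design; virtscsi_pick_vq() earlier in this diff is the other. When the number of request queues matches num_online_cpus(), each queue's interrupt is pinned to one CPU and an idle target is steered to the queue of the submitting CPU, so a request can be queued, serviced and completed without crossing CPUs. Side by side, as an illustrative condensation of the code above (not a drop-in replacement):

	/* 1) Pin each request virtqueue's IRQ to "its" CPU. */
	i = 0;
	for_each_online_cpu(cpu)
		virtqueue_set_affinity(vscsi->req_vqs[i++].vq, cpu);

	/* 2) For an idle target, pick the queue owned by this CPU
	 * (folded into range in case a CPU id exceeds num_queues). */
	queue_num = smp_processor_id();
	while (queue_num >= vscsi->num_queues)
		queue_num -= vscsi->num_queues;
	tgt->req_vq = &vscsi->req_vqs[queue_num];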
@@ -614,46 +800,56 @@ static void virtscsi_remove_vqs(struct virtio_device *vdev) | |||
614 | { | 800 | { |
615 | struct Scsi_Host *sh = virtio_scsi_host(vdev); | 801 | struct Scsi_Host *sh = virtio_scsi_host(vdev); |
616 | struct virtio_scsi *vscsi = shost_priv(sh); | 802 | struct virtio_scsi *vscsi = shost_priv(sh); |
617 | u32 i, num_targets; | 803 | |
804 | virtscsi_set_affinity(vscsi, false); | ||
618 | 805 | ||
619 | /* Stop all the virtqueues. */ | 806 | /* Stop all the virtqueues. */ |
620 | vdev->config->reset(vdev); | 807 | vdev->config->reset(vdev); |
621 | 808 | ||
622 | num_targets = sh->max_id; | ||
623 | for (i = 0; i < num_targets; i++) { | ||
624 | kfree(vscsi->tgt[i]); | ||
625 | vscsi->tgt[i] = NULL; | ||
626 | } | ||
627 | |||
628 | vdev->config->del_vqs(vdev); | 809 | vdev->config->del_vqs(vdev); |
629 | } | 810 | } |
630 | 811 | ||
631 | static int virtscsi_init(struct virtio_device *vdev, | 812 | static int virtscsi_init(struct virtio_device *vdev, |
632 | struct virtio_scsi *vscsi, int num_targets) | 813 | struct virtio_scsi *vscsi) |
633 | { | 814 | { |
634 | int err; | 815 | int err; |
635 | struct virtqueue *vqs[3]; | 816 | u32 i; |
636 | u32 i, sg_elems; | 817 | u32 num_vqs; |
818 | vq_callback_t **callbacks; | ||
819 | const char **names; | ||
820 | struct virtqueue **vqs; | ||
821 | |||
822 | num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE; | ||
823 | vqs = kmalloc(num_vqs * sizeof(struct virtqueue *), GFP_KERNEL); | ||
824 | callbacks = kmalloc(num_vqs * sizeof(vq_callback_t *), GFP_KERNEL); | ||
825 | names = kmalloc(num_vqs * sizeof(char *), GFP_KERNEL); | ||
826 | |||
827 | if (!callbacks || !vqs || !names) { | ||
828 | err = -ENOMEM; | ||
829 | goto out; | ||
830 | } | ||
637 | 831 | ||
638 | vq_callback_t *callbacks[] = { | 832 | callbacks[0] = virtscsi_ctrl_done; |
639 | virtscsi_ctrl_done, | 833 | callbacks[1] = virtscsi_event_done; |
640 | virtscsi_event_done, | 834 | names[0] = "control"; |
641 | virtscsi_req_done | 835 | names[1] = "event"; |
642 | }; | 836 | for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) { |
643 | const char *names[] = { | 837 | callbacks[i] = virtscsi_req_done; |
644 | "control", | 838 | names[i] = "request"; |
645 | "event", | 839 | } |
646 | "request" | ||
647 | }; | ||
648 | 840 | ||
649 | /* Discover virtqueues and write information to configuration. */ | 841 | /* Discover virtqueues and write information to configuration. */ |
650 | err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names); | 842 | err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names); |
651 | if (err) | 843 | if (err) |
652 | return err; | 844 | goto out; |
653 | 845 | ||
654 | virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]); | 846 | virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]); |
655 | virtscsi_init_vq(&vscsi->event_vq, vqs[1]); | 847 | virtscsi_init_vq(&vscsi->event_vq, vqs[1]); |
656 | virtscsi_init_vq(&vscsi->req_vq, vqs[2]); | 848 | for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) |
849 | virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE], | ||
850 | vqs[i]); | ||
851 | |||
852 | virtscsi_set_affinity(vscsi, true); | ||
657 | 853 | ||
658 | virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE); | 854 | virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE); |
659 | virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE); | 855 | virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE); |
@@ -661,19 +857,12 @@ static int virtscsi_init(struct virtio_device *vdev, | |||
661 | if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) | 857 | if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) |
662 | virtscsi_kick_event_all(vscsi); | 858 | virtscsi_kick_event_all(vscsi); |
663 | 859 | ||
664 | /* We need to know how many segments before we allocate. */ | ||
665 | sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1; | ||
666 | |||
667 | for (i = 0; i < num_targets; i++) { | ||
668 | vscsi->tgt[i] = virtscsi_alloc_tgt(vdev, sg_elems); | ||
669 | if (!vscsi->tgt[i]) { | ||
670 | err = -ENOMEM; | ||
671 | goto out; | ||
672 | } | ||
673 | } | ||
674 | err = 0; | 860 | err = 0; |
675 | 861 | ||
676 | out: | 862 | out: |
863 | kfree(names); | ||
864 | kfree(callbacks); | ||
865 | kfree(vqs); | ||
677 | if (err) | 866 | if (err) |
678 | virtscsi_remove_vqs(vdev); | 867 | virtscsi_remove_vqs(vdev); |
679 | return err; | 868 | return err; |
@@ -686,13 +875,21 @@ static int virtscsi_probe(struct virtio_device *vdev) | |||
686 | int err; | 875 | int err; |
687 | u32 sg_elems, num_targets; | 876 | u32 sg_elems, num_targets; |
688 | u32 cmd_per_lun; | 877 | u32 cmd_per_lun; |
878 | u32 num_queues; | ||
879 | struct scsi_host_template *hostt; | ||
880 | |||
881 | /* We need to know how many queues before we allocate. */ | ||
882 | num_queues = virtscsi_config_get(vdev, num_queues) ? : 1; | ||
689 | 883 | ||
690 | /* Allocate memory and link the structs together. */ | ||
691 | num_targets = virtscsi_config_get(vdev, max_target) + 1; | 884 | num_targets = virtscsi_config_get(vdev, max_target) + 1; |
692 | shost = scsi_host_alloc(&virtscsi_host_template, | ||
693 | sizeof(*vscsi) | ||
694 | + num_targets * sizeof(struct virtio_scsi_target_state)); | ||
695 | 885 | ||
886 | if (num_queues == 1) | ||
887 | hostt = &virtscsi_host_template_single; | ||
888 | else | ||
889 | hostt = &virtscsi_host_template_multi; | ||
890 | |||
891 | shost = scsi_host_alloc(hostt, | ||
892 | sizeof(*vscsi) + sizeof(vscsi->req_vqs[0]) * num_queues); | ||
696 | if (!shost) | 893 | if (!shost) |
697 | return -ENOMEM; | 894 | return -ENOMEM; |
698 | 895 | ||
@@ -700,12 +897,20 @@ static int virtscsi_probe(struct virtio_device *vdev) | |||
700 | shost->sg_tablesize = sg_elems; | 897 | shost->sg_tablesize = sg_elems; |
701 | vscsi = shost_priv(shost); | 898 | vscsi = shost_priv(shost); |
702 | vscsi->vdev = vdev; | 899 | vscsi->vdev = vdev; |
900 | vscsi->num_queues = num_queues; | ||
703 | vdev->priv = shost; | 901 | vdev->priv = shost; |
704 | 902 | ||
705 | err = virtscsi_init(vdev, vscsi, num_targets); | 903 | err = virtscsi_init(vdev, vscsi); |
706 | if (err) | 904 | if (err) |
707 | goto virtscsi_init_failed; | 905 | goto virtscsi_init_failed; |
708 | 906 | ||
907 | vscsi->nb.notifier_call = &virtscsi_cpu_callback; | ||
908 | err = register_hotcpu_notifier(&vscsi->nb); | ||
909 | if (err) { | ||
910 | pr_err("registering cpu notifier failed\n"); | ||
911 | goto scsi_add_host_failed; | ||
912 | } | ||
913 | |||
709 | cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1; | 914 | cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1; |
710 | shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue); | 915 | shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue); |
711 | shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF; | 916 | shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF; |
@@ -743,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev) | |||
743 | 948 | ||
744 | scsi_remove_host(shost); | 949 | scsi_remove_host(shost); |
745 | 950 | ||
951 | unregister_hotcpu_notifier(&vscsi->nb); | ||
952 | |||
746 | virtscsi_remove_vqs(vdev); | 953 | virtscsi_remove_vqs(vdev); |
747 | scsi_host_put(shost); | 954 | scsi_host_put(shost); |
748 | } | 955 | } |
@@ -759,7 +966,7 @@ static int virtscsi_restore(struct virtio_device *vdev) | |||
759 | struct Scsi_Host *sh = virtio_scsi_host(vdev); | 966 | struct Scsi_Host *sh = virtio_scsi_host(vdev); |
760 | struct virtio_scsi *vscsi = shost_priv(sh); | 967 | struct virtio_scsi *vscsi = shost_priv(sh); |
761 | 968 | ||
762 | return virtscsi_init(vdev, vscsi, sh->max_id); | 969 | return virtscsi_init(vdev, vscsi); |
763 | } | 970 | } |
764 | #endif | 971 | #endif |
765 | 972 | ||
@@ -794,8 +1001,7 @@ static int __init init(void) | |||
794 | 1001 | ||
795 | virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0); | 1002 | virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0); |
796 | if (!virtscsi_cmd_cache) { | 1003 | if (!virtscsi_cmd_cache) { |
797 | printk(KERN_ERR "kmem_cache_create() for " | 1004 | pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n"); |
798 | "virtscsi_cmd_cache failed\n"); | ||
799 | goto error; | 1005 | goto error; |
800 | } | 1006 | } |
801 | 1007 | ||
@@ -804,8 +1010,7 @@ static int __init init(void) | |||
804 | mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ, | 1010 | mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ, |
805 | virtscsi_cmd_cache); | 1011 | virtscsi_cmd_cache); |
806 | if (!virtscsi_cmd_pool) { | 1012 | if (!virtscsi_cmd_pool) { |
807 | printk(KERN_ERR "mempool_create() for" | 1013 | pr_err("mempool_create() for virtscsi_cmd_pool failed\n"); |
808 | "virtscsi_cmd_pool failed\n"); | ||
809 | goto error; | 1014 | goto error; |
810 | } | 1015 | } |
811 | ret = register_virtio_driver(&virtio_scsi_driver); | 1016 | ret = register_virtio_driver(&virtio_scsi_driver); |
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 26a64e5b8a58..8b9226da3f54 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig | |||
@@ -1,6 +1,7 @@ | |||
1 | config VHOST_NET | 1 | config VHOST_NET |
2 | tristate "Host kernel accelerator for virtio net" | 2 | tristate "Host kernel accelerator for virtio net" |
3 | depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP) | 3 | depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP) |
4 | select VHOST_RING | ||
4 | ---help--- | 5 | ---help--- |
5 | This kernel module can be loaded in host kernel to accelerate | 6 | This kernel module can be loaded in host kernel to accelerate |
6 | guest networking with virtio_net. Not to be confused with virtio_net | 7 | guest networking with virtio_net. Not to be confused with virtio_net |
@@ -12,7 +13,14 @@ config VHOST_NET | |||
12 | config VHOST_SCSI | 13 | config VHOST_SCSI |
13 | tristate "VHOST_SCSI TCM fabric driver" | 14 | tristate "VHOST_SCSI TCM fabric driver" |
14 | depends on TARGET_CORE && EVENTFD && m | 15 | depends on TARGET_CORE && EVENTFD && m |
16 | select VHOST_RING | ||
15 | default n | 17 | default n |
16 | ---help--- | 18 | ---help--- |
17 | Say M here to enable the vhost_scsi TCM fabric module | 19 | Say M here to enable the vhost_scsi TCM fabric module |
18 | for use with virtio-scsi guests | 20 | for use with virtio-scsi guests |
21 | |||
22 | config VHOST_RING | ||
23 | tristate | ||
24 | ---help--- | ||
25 | This option is selected by any driver which needs to access | ||
26 | the host side of a virtio ring. | ||
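VHOST_RING has no prompt, so it never appears in menuconfig; consumers pull it in with select, exactly as the VHOST_NET and VHOST_SCSI entries above now do. A hypothetical third consumer would follow the same pattern in its own Kconfig:

	config MY_VRINGH_USER
		tristate "Example vringh consumer (illustration only)"
		select VHOST_RING
		---help---
		  Hypothetical entry: any driver calling the vringh_* API
		  selects VHOST_RING so drivers/vhost/vringh.o gets built.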
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index ef21d5fdfa7d..654e9afb11f5 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile | |||
@@ -3,3 +3,5 @@ vhost_net-y := vhost.o net.o | |||
3 | 3 | ||
4 | obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o | 4 | obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o |
5 | vhost_scsi-y := scsi.o | 5 | vhost_scsi-y := scsi.o |
6 | |||
7 | obj-$(CONFIG_VHOST_RING) += vringh.o | ||
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index be65414d5bb1..1ee45bc85f67 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c | |||
@@ -282,7 +282,9 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl, | |||
282 | return vhost_test_reset_owner(n); | 282 | return vhost_test_reset_owner(n); |
283 | default: | 283 | default: |
284 | mutex_lock(&n->dev.mutex); | 284 | mutex_lock(&n->dev.mutex); |
285 | r = vhost_dev_ioctl(&n->dev, ioctl, arg); | 285 | r = vhost_dev_ioctl(&n->dev, ioctl, argp); |
286 | if (r == -ENOIOCTLCMD) | ||
287 | r = vhost_vring_ioctl(&n->dev, ioctl, argp); | ||
286 | vhost_test_flush(n); | 288 | vhost_test_flush(n); |
287 | mutex_unlock(&n->dev.mutex); | 289 | mutex_unlock(&n->dev.mutex); |
288 | return r; | 290 | return r; |
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c new file mode 100644 index 000000000000..bff0775e258c --- /dev/null +++ b/drivers/vhost/vringh.c | |||
@@ -0,0 +1,1007 @@ | |||
1 | /* | ||
2 | * Helpers for the host side of a virtio ring. | ||
3 | * | ||
4 | * Since these may be in userspace, we use (inline) accessors. | ||
5 | */ | ||
6 | #include <linux/vringh.h> | ||
7 | #include <linux/virtio_ring.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/ratelimit.h> | ||
10 | #include <linux/uaccess.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/export.h> | ||
13 | |||
14 | static __printf(1,2) __cold void vringh_bad(const char *fmt, ...) | ||
15 | { | ||
16 | static DEFINE_RATELIMIT_STATE(vringh_rs, | ||
17 | DEFAULT_RATELIMIT_INTERVAL, | ||
18 | DEFAULT_RATELIMIT_BURST); | ||
19 | if (__ratelimit(&vringh_rs)) { | ||
20 | va_list ap; | ||
21 | va_start(ap, fmt); | ||
22 | printk(KERN_NOTICE "vringh: "); | ||
23 | vprintk(fmt, ap); | ||
24 | va_end(ap); | ||
25 | } | ||
26 | } | ||
27 | |||
28 | /* Returns vring->num if empty, -ve on error. */ | ||
29 | static inline int __vringh_get_head(const struct vringh *vrh, | ||
30 | int (*getu16)(u16 *val, const u16 *p), | ||
31 | u16 *last_avail_idx) | ||
32 | { | ||
33 | u16 avail_idx, i, head; | ||
34 | int err; | ||
35 | |||
36 | err = getu16(&avail_idx, &vrh->vring.avail->idx); | ||
37 | if (err) { | ||
38 | vringh_bad("Failed to access avail idx at %p", | ||
39 | &vrh->vring.avail->idx); | ||
40 | return err; | ||
41 | } | ||
42 | |||
43 | if (*last_avail_idx == avail_idx) | ||
44 | return vrh->vring.num; | ||
45 | |||
46 | /* Only get avail ring entries after they have been exposed by guest. */ | ||
47 | virtio_rmb(vrh->weak_barriers); | ||
48 | |||
49 | i = *last_avail_idx & (vrh->vring.num - 1); | ||
50 | |||
51 | err = getu16(&head, &vrh->vring.avail->ring[i]); | ||
52 | if (err) { | ||
53 | vringh_bad("Failed to read head: idx %d address %p", | ||
54 | *last_avail_idx, &vrh->vring.avail->ring[i]); | ||
55 | return err; | ||
56 | } | ||
57 | |||
58 | if (head >= vrh->vring.num) { | ||
59 | vringh_bad("Guest says index %u > %u is available", | ||
60 | head, vrh->vring.num); | ||
61 | return -EINVAL; | ||
62 | } | ||
63 | |||
64 | (*last_avail_idx)++; | ||
65 | return head; | ||
66 | } | ||
67 | |||
68 | /* Copy some bytes to/from the iovec. Returns num copied. */ | ||
69 | static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov, | ||
70 | void *ptr, size_t len, | ||
71 | int (*xfer)(void *addr, void *ptr, | ||
72 | size_t len)) | ||
73 | { | ||
74 | int err, done = 0; | ||
75 | |||
76 | while (len && iov->i < iov->used) { | ||
77 | size_t partlen; | ||
78 | |||
79 | partlen = min(iov->iov[iov->i].iov_len, len); | ||
80 | err = xfer(iov->iov[iov->i].iov_base, ptr, partlen); | ||
81 | if (err) | ||
82 | return err; | ||
83 | done += partlen; | ||
84 | len -= partlen; | ||
85 | ptr += partlen; | ||
86 | iov->consumed += partlen; | ||
87 | iov->iov[iov->i].iov_len -= partlen; | ||
88 | iov->iov[iov->i].iov_base += partlen; | ||
89 | |||
90 | if (!iov->iov[iov->i].iov_len) { | ||
91 | /* Fix up old iov element then increment. */ | ||
92 | iov->iov[iov->i].iov_len = iov->consumed; | ||
93 | iov->iov[iov->i].iov_base -= iov->consumed; | ||
94 | |||
95 | iov->consumed = 0; | ||
96 | iov->i++; | ||
97 | } | ||
98 | } | ||
99 | return done; | ||
100 | } | ||
101 | |||
102 | /* May reduce *len if range is shorter. */ | ||
103 | static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len, | ||
104 | struct vringh_range *range, | ||
105 | bool (*getrange)(struct vringh *, | ||
106 | u64, struct vringh_range *)) | ||
107 | { | ||
108 | if (addr < range->start || addr > range->end_incl) { | ||
109 | if (!getrange(vrh, addr, range)) | ||
110 | return false; | ||
111 | } | ||
112 | BUG_ON(addr < range->start || addr > range->end_incl); | ||
113 | |||
114 | /* To end of memory? */ | ||
115 | if (unlikely(addr + *len == 0)) { | ||
116 | if (range->end_incl == -1ULL) | ||
117 | return true; | ||
118 | goto truncate; | ||
119 | } | ||
120 | |||
121 | /* Otherwise, don't wrap. */ | ||
122 | if (addr + *len < addr) { | ||
123 | vringh_bad("Wrapping descriptor %zu@0x%llx", | ||
124 | *len, (unsigned long long)addr); | ||
125 | return false; | ||
126 | } | ||
127 | |||
128 | if (unlikely(addr + *len - 1 > range->end_incl)) | ||
129 | goto truncate; | ||
130 | return true; | ||
131 | |||
132 | truncate: | ||
133 | *len = range->end_incl + 1 - addr; | ||
134 | return true; | ||
135 | } | ||
136 | |||
137 | static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len, | ||
138 | struct vringh_range *range, | ||
139 | bool (*getrange)(struct vringh *, | ||
140 | u64, struct vringh_range *)) | ||
141 | { | ||
142 | return true; | ||
143 | } | ||
144 | |||
145 | /* No reason for this code to be inline. */ | ||
146 | static int move_to_indirect(int *up_next, u16 *i, void *addr, | ||
147 | const struct vring_desc *desc, | ||
148 | struct vring_desc **descs, int *desc_max) | ||
149 | { | ||
150 | /* Indirect tables can't have indirect. */ | ||
151 | if (*up_next != -1) { | ||
152 | vringh_bad("Multilevel indirect %u->%u", *up_next, *i); | ||
153 | return -EINVAL; | ||
154 | } | ||
155 | |||
156 | if (unlikely(desc->len % sizeof(struct vring_desc))) { | ||
157 | vringh_bad("Strange indirect len %u", desc->len); | ||
158 | return -EINVAL; | ||
159 | } | ||
160 | |||
161 | /* We will check this when we follow it! */ | ||
162 | if (desc->flags & VRING_DESC_F_NEXT) | ||
163 | *up_next = desc->next; | ||
164 | else | ||
165 | *up_next = -2; | ||
166 | *descs = addr; | ||
167 | *desc_max = desc->len / sizeof(struct vring_desc); | ||
168 | |||
169 | /* Now, start at the first indirect. */ | ||
170 | *i = 0; | ||
171 | return 0; | ||
172 | } | ||
173 | |||
174 | static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp) | ||
175 | { | ||
176 | struct kvec *new; | ||
177 | unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2; | ||
178 | |||
179 | if (new_num < 8) | ||
180 | new_num = 8; | ||
181 | |||
182 | flag = (iov->max_num & VRINGH_IOV_ALLOCATED); | ||
183 | if (flag) | ||
184 | new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp); | ||
185 | else { | ||
186 | new = kmalloc(new_num * sizeof(struct iovec), gfp); | ||
187 | if (new) { | ||
188 | memcpy(new, iov->iov, | ||
189 | iov->max_num * sizeof(struct iovec)); | ||
190 | flag = VRINGH_IOV_ALLOCATED; | ||
191 | } | ||
192 | } | ||
193 | if (!new) | ||
194 | return -ENOMEM; | ||
195 | iov->iov = new; | ||
196 | iov->max_num = (new_num | flag); | ||
197 | return 0; | ||
198 | } | ||
199 | |||
200 | static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next, | ||
201 | struct vring_desc **descs, int *desc_max) | ||
202 | { | ||
203 | u16 i = *up_next; | ||
204 | |||
205 | *up_next = -1; | ||
206 | *descs = vrh->vring.desc; | ||
207 | *desc_max = vrh->vring.num; | ||
208 | return i; | ||
209 | } | ||
210 | |||
211 | static int slow_copy(struct vringh *vrh, void *dst, const void *src, | ||
212 | bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, | ||
213 | struct vringh_range *range, | ||
214 | bool (*getrange)(struct vringh *vrh, | ||
215 | u64, | ||
216 | struct vringh_range *)), | ||
217 | bool (*getrange)(struct vringh *vrh, | ||
218 | u64 addr, | ||
219 | struct vringh_range *r), | ||
220 | struct vringh_range *range, | ||
221 | int (*copy)(void *dst, const void *src, size_t len)) | ||
222 | { | ||
223 | size_t part, len = sizeof(struct vring_desc); | ||
224 | |||
225 | do { | ||
226 | u64 addr; | ||
227 | int err; | ||
228 | |||
229 | part = len; | ||
230 | addr = (u64)(unsigned long)src - range->offset; | ||
231 | |||
232 | if (!rcheck(vrh, addr, &part, range, getrange)) | ||
233 | return -EINVAL; | ||
234 | |||
235 | err = copy(dst, src, part); | ||
236 | if (err) | ||
237 | return err; | ||
238 | |||
239 | dst += part; | ||
240 | src += part; | ||
241 | len -= part; | ||
242 | } while (len); | ||
243 | return 0; | ||
244 | } | ||
245 | |||
246 | static inline int | ||
247 | __vringh_iov(struct vringh *vrh, u16 i, | ||
248 | struct vringh_kiov *riov, | ||
249 | struct vringh_kiov *wiov, | ||
250 | bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, | ||
251 | struct vringh_range *range, | ||
252 | bool (*getrange)(struct vringh *, u64, | ||
253 | struct vringh_range *)), | ||
254 | bool (*getrange)(struct vringh *, u64, struct vringh_range *), | ||
255 | gfp_t gfp, | ||
256 | int (*copy)(void *dst, const void *src, size_t len)) | ||
257 | { | ||
258 | int err, count = 0, up_next, desc_max; | ||
259 | struct vring_desc desc, *descs; | ||
260 | struct vringh_range range = { -1ULL, 0 }, slowrange; | ||
261 | bool slow = false; | ||
262 | |||
263 | /* We start traversing vring's descriptor table. */ | ||
264 | descs = vrh->vring.desc; | ||
265 | desc_max = vrh->vring.num; | ||
266 | up_next = -1; | ||
267 | |||
268 | if (riov) | ||
269 | riov->i = riov->used = 0; | ||
270 | else if (wiov) | ||
271 | wiov->i = wiov->used = 0; | ||
272 | else | ||
273 | /* You must want something! */ | ||
274 | BUG(); | ||
275 | |||
276 | for (;;) { | ||
277 | void *addr; | ||
278 | struct vringh_kiov *iov; | ||
279 | size_t len; | ||
280 | |||
281 | if (unlikely(slow)) | ||
282 | err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange, | ||
283 | &slowrange, copy); | ||
284 | else | ||
285 | err = copy(&desc, &descs[i], sizeof(desc)); | ||
286 | if (unlikely(err)) | ||
287 | goto fail; | ||
288 | |||
289 | if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) { | ||
290 | /* Make sure it's OK, and get offset. */ | ||
291 | len = desc.len; | ||
292 | if (!rcheck(vrh, desc.addr, &len, &range, getrange)) { | ||
293 | err = -EINVAL; | ||
294 | goto fail; | ||
295 | } | ||
296 | |||
297 | if (unlikely(len != desc.len)) { | ||
298 | slow = true; | ||
299 | /* We need to save this range to use offset */ | ||
300 | slowrange = range; | ||
301 | } | ||
302 | |||
303 | addr = (void *)(long)(desc.addr + range.offset); | ||
304 | err = move_to_indirect(&up_next, &i, addr, &desc, | ||
305 | &descs, &desc_max); | ||
306 | if (err) | ||
307 | goto fail; | ||
308 | continue; | ||
309 | } | ||
310 | |||
311 | if (count++ == vrh->vring.num) { | ||
312 | vringh_bad("Descriptor loop in %p", descs); | ||
313 | err = -ELOOP; | ||
314 | goto fail; | ||
315 | } | ||
316 | |||
317 | if (desc.flags & VRING_DESC_F_WRITE) | ||
318 | iov = wiov; | ||
319 | else { | ||
320 | iov = riov; | ||
321 | if (unlikely(wiov && wiov->i)) { | ||
322 | vringh_bad("Readable desc %p after writable", | ||
323 | &descs[i]); | ||
324 | err = -EINVAL; | ||
325 | goto fail; | ||
326 | } | ||
327 | } | ||
328 | |||
329 | if (!iov) { | ||
330 | vringh_bad("Unexpected %s desc", | ||
331 | !wiov ? "writable" : "readable"); | ||
332 | err = -EPROTO; | ||
333 | goto fail; | ||
334 | } | ||
335 | |||
336 | again: | ||
337 | /* Make sure it's OK, and get offset. */ | ||
338 | len = desc.len; | ||
339 | if (!rcheck(vrh, desc.addr, &len, &range, getrange)) { | ||
340 | err = -EINVAL; | ||
341 | goto fail; | ||
342 | } | ||
343 | addr = (void *)(unsigned long)(desc.addr + range.offset); | ||
344 | |||
345 | if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) { | ||
346 | err = resize_iovec(iov, gfp); | ||
347 | if (err) | ||
348 | goto fail; | ||
349 | } | ||
350 | |||
351 | iov->iov[iov->used].iov_base = addr; | ||
352 | iov->iov[iov->used].iov_len = len; | ||
353 | iov->used++; | ||
354 | |||
355 | if (unlikely(len != desc.len)) { | ||
356 | desc.len -= len; | ||
357 | desc.addr += len; | ||
358 | goto again; | ||
359 | } | ||
360 | |||
361 | if (desc.flags & VRING_DESC_F_NEXT) { | ||
362 | i = desc.next; | ||
363 | } else { | ||
364 | /* Just in case we need to finish traversing above. */ | ||
365 | if (unlikely(up_next > 0)) { | ||
366 | i = return_from_indirect(vrh, &up_next, | ||
367 | &descs, &desc_max); | ||
368 | slow = false; | ||
369 | } else | ||
370 | break; | ||
371 | } | ||
372 | |||
373 | if (i >= desc_max) { | ||
374 | vringh_bad("Chained index %u > %u", i, desc_max); | ||
375 | err = -EINVAL; | ||
376 | goto fail; | ||
377 | } | ||
378 | } | ||
379 | |||
380 | return 0; | ||
381 | |||
382 | fail: | ||
383 | return err; | ||
384 | } | ||
385 | |||
386 | static inline int __vringh_complete(struct vringh *vrh, | ||
387 | const struct vring_used_elem *used, | ||
388 | unsigned int num_used, | ||
389 | int (*putu16)(u16 *p, u16 val), | ||
390 | int (*putused)(struct vring_used_elem *dst, | ||
391 | const struct vring_used_elem | ||
392 | *src, unsigned num)) | ||
393 | { | ||
394 | struct vring_used *used_ring; | ||
395 | int err; | ||
396 | u16 used_idx, off; | ||
397 | |||
398 | used_ring = vrh->vring.used; | ||
399 | used_idx = vrh->last_used_idx + vrh->completed; | ||
400 | |||
401 | off = used_idx % vrh->vring.num; | ||
402 | |||
403 | /* Compiler knows num_used == 1 sometimes, hence extra check */ | ||
404 | if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) { | ||
405 | u16 part = vrh->vring.num - off; | ||
406 | err = putused(&used_ring->ring[off], used, part); | ||
407 | if (!err) | ||
408 | err = putused(&used_ring->ring[0], used + part, | ||
409 | num_used - part); | ||
410 | } else | ||
411 | err = putused(&used_ring->ring[off], used, num_used); | ||
412 | |||
413 | if (err) { | ||
414 | vringh_bad("Failed to write %u used entries %u at %p", | ||
415 | num_used, off, &used_ring->ring[off]); | ||
416 | return err; | ||
417 | } | ||
418 | |||
419 | /* Make sure buffer is written before we update index. */ | ||
420 | virtio_wmb(vrh->weak_barriers); | ||
421 | |||
422 | err = putu16(&vrh->vring.used->idx, used_idx + num_used); | ||
423 | if (err) { | ||
424 | vringh_bad("Failed to update used index at %p", | ||
425 | &vrh->vring.used->idx); | ||
426 | return err; | ||
427 | } | ||
428 | |||
429 | vrh->completed += num_used; | ||
430 | return 0; | ||
431 | } | ||
432 | |||
433 | |||
434 | static inline int __vringh_need_notify(struct vringh *vrh, | ||
435 | int (*getu16)(u16 *val, const u16 *p)) | ||
436 | { | ||
437 | bool notify; | ||
438 | u16 used_event; | ||
439 | int err; | ||
440 | |||
441 | /* Flush out used index update. This is paired with the | ||
442 | * barrier that the Guest executes when enabling | ||
443 | * interrupts. */ | ||
444 | virtio_mb(vrh->weak_barriers); | ||
445 | |||
446 | /* Old-style, without event indices. */ | ||
447 | if (!vrh->event_indices) { | ||
448 | u16 flags; | ||
449 | err = getu16(&flags, &vrh->vring.avail->flags); | ||
450 | if (err) { | ||
451 | vringh_bad("Failed to get flags at %p", | ||
452 | &vrh->vring.avail->flags); | ||
453 | return err; | ||
454 | } | ||
455 | return (!(flags & VRING_AVAIL_F_NO_INTERRUPT)); | ||
456 | } | ||
457 | |||
458 | /* Modern: we know when other side wants to know. */ | ||
459 | err = getu16(&used_event, &vring_used_event(&vrh->vring)); | ||
460 | if (err) { | ||
461 | vringh_bad("Failed to get used event idx at %p", | ||
462 | &vring_used_event(&vrh->vring)); | ||
463 | return err; | ||
464 | } | ||
465 | |||
466 | /* Just in case we added so many that we wrap. */ | ||
467 | if (unlikely(vrh->completed > 0xffff)) | ||
468 | notify = true; | ||
469 | else | ||
470 | notify = vring_need_event(used_event, | ||
471 | vrh->last_used_idx + vrh->completed, | ||
472 | vrh->last_used_idx); | ||
473 | |||
474 | vrh->last_used_idx += vrh->completed; | ||
475 | vrh->completed = 0; | ||
476 | return notify; | ||
477 | } | ||
478 | |||
479 | static inline bool __vringh_notify_enable(struct vringh *vrh, | ||
480 | int (*getu16)(u16 *val, const u16 *p), | ||
481 | int (*putu16)(u16 *p, u16 val)) | ||
482 | { | ||
483 | u16 avail; | ||
484 | |||
485 | if (!vrh->event_indices) { | ||
486 | /* Old-school; update flags. */ | ||
487 | if (putu16(&vrh->vring.used->flags, 0) != 0) { | ||
488 | vringh_bad("Clearing used flags %p", | ||
489 | &vrh->vring.used->flags); | ||
490 | return true; | ||
491 | } | ||
492 | } else { | ||
493 | if (putu16(&vring_avail_event(&vrh->vring), | ||
494 | vrh->last_avail_idx) != 0) { | ||
495 | vringh_bad("Updating avail event index %p", | ||
496 | &vring_avail_event(&vrh->vring)); | ||
497 | return true; | ||
498 | } | ||
499 | } | ||
500 | |||
501 | /* They could have slipped one in as we were doing that: make | ||
502 | * sure it's written, then check again. */ | ||
503 | virtio_mb(vrh->weak_barriers); | ||
504 | |||
505 | if (getu16(&avail, &vrh->vring.avail->idx) != 0) { | ||
506 | vringh_bad("Failed to check avail idx at %p", | ||
507 | &vrh->vring.avail->idx); | ||
508 | return true; | ||
509 | } | ||
510 | |||
511 | /* This is unlikely, so we just leave notifications enabled | ||
512 | * (if we're using event_indices, we'll only get one | ||
513 | * notification anyway). */ | ||
514 | return avail == vrh->last_avail_idx; | ||
515 | } | ||
516 | |||
517 | static inline void __vringh_notify_disable(struct vringh *vrh, | ||
518 | int (*putu16)(u16 *p, u16 val)) | ||
519 | { | ||
520 | if (!vrh->event_indices) { | ||
521 | /* Old-school; update flags. */ | ||
522 | if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY)) { | ||
523 | vringh_bad("Setting used flags %p", | ||
524 | &vrh->vring.used->flags); | ||
525 | } | ||
526 | } | ||
527 | } | ||
528 | |||
529 | /* Userspace access helpers: in this case, addresses are really userspace. */ | ||
530 | static inline int getu16_user(u16 *val, const u16 *p) | ||
531 | { | ||
532 | return get_user(*val, (__force u16 __user *)p); | ||
533 | } | ||
534 | |||
535 | static inline int putu16_user(u16 *p, u16 val) | ||
536 | { | ||
537 | return put_user(val, (__force u16 __user *)p); | ||
538 | } | ||
539 | |||
540 | static inline int copydesc_user(void *dst, const void *src, size_t len) | ||
541 | { | ||
542 | return copy_from_user(dst, (__force void __user *)src, len) ? | ||
543 | -EFAULT : 0; | ||
544 | } | ||
545 | |||
546 | static inline int putused_user(struct vring_used_elem *dst, | ||
547 | const struct vring_used_elem *src, | ||
548 | unsigned int num) | ||
549 | { | ||
550 | return copy_to_user((__force void __user *)dst, src, | ||
551 | sizeof(*dst) * num) ? -EFAULT : 0; | ||
552 | } | ||
553 | |||
554 | static inline int xfer_from_user(void *src, void *dst, size_t len) | ||
555 | { | ||
556 | return copy_from_user(dst, (__force void __user *)src, len) ? | ||
557 | -EFAULT : 0; | ||
558 | } | ||
559 | |||
560 | static inline int xfer_to_user(void *dst, void *src, size_t len) | ||
561 | { | ||
562 | return copy_to_user((__force void __user *)dst, src, len) ? | ||
563 | -EFAULT : 0; | ||
564 | } | ||
565 | |||
566 | /** | ||
567 | * vringh_init_user - initialize a vringh for a userspace vring. | ||
568 | * @vrh: the vringh to initialize. | ||
569 | * @features: the feature bits for this ring. | ||
570 | * @num: the number of elements. | ||
571 | * @weak_barriers: true if we only need memory barriers, not I/O. | ||
571 | * @desc: the userspace descriptor pointer. | ||
572 | * @avail: the userspace avail pointer. | ||
573 | * @used: the userspace used pointer. | ||
575 | * | ||
576 | * Returns an error if num is invalid: you should check pointers | ||
577 | * yourself! | ||
578 | */ | ||
579 | int vringh_init_user(struct vringh *vrh, u32 features, | ||
580 | unsigned int num, bool weak_barriers, | ||
581 | struct vring_desc __user *desc, | ||
582 | struct vring_avail __user *avail, | ||
583 | struct vring_used __user *used) | ||
584 | { | ||
585 | /* Sane power of 2 please! */ | ||
586 | if (!num || num > 0xffff || (num & (num - 1))) { | ||
587 | vringh_bad("Bad ring size %u", num); | ||
588 | return -EINVAL; | ||
589 | } | ||
590 | |||
591 | vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); | ||
592 | vrh->weak_barriers = weak_barriers; | ||
593 | vrh->completed = 0; | ||
594 | vrh->last_avail_idx = 0; | ||
595 | vrh->last_used_idx = 0; | ||
596 | vrh->vring.num = num; | ||
597 | /* vring expects kernel addresses, but only used via accessors. */ | ||
598 | vrh->vring.desc = (__force struct vring_desc *)desc; | ||
599 | vrh->vring.avail = (__force struct vring_avail *)avail; | ||
600 | vrh->vring.used = (__force struct vring_used *)used; | ||
601 | return 0; | ||
602 | } | ||
603 | EXPORT_SYMBOL(vringh_init_user); | ||
604 | |||
605 | /** | ||
606 | * vringh_getdesc_user - get next available descriptor from userspace ring. | ||
607 | * @vrh: the userspace vring. | ||
608 | * @riov: where to put the readable descriptors (or NULL) | ||
609 | * @wiov: where to put the writable descriptors (or NULL) | ||
610 | * @getrange: function to call to check ranges. | ||
611 | * @head: head index we received, for passing to vringh_complete_user(). | ||
612 | * | ||
613 | * Returns 0 if there was no descriptor, 1 if there was, or -errno. | ||
614 | * | ||
615 | * Note that on error return, you can tell the difference between an | ||
616 | * invalid ring and a single invalid descriptor: in the former case, | ||
617 | * *head will be vrh->vring.num. You may be able to ignore an invalid | ||
618 | * descriptor, but there's not much you can do with an invalid ring. | ||
619 | * | ||
620 | * Note that you may need to clean up riov and wiov, even on error! | ||
621 | */ | ||
622 | int vringh_getdesc_user(struct vringh *vrh, | ||
623 | struct vringh_iov *riov, | ||
624 | struct vringh_iov *wiov, | ||
625 | bool (*getrange)(struct vringh *vrh, | ||
626 | u64 addr, struct vringh_range *r), | ||
627 | u16 *head) | ||
628 | { | ||
629 | int err; | ||
630 | |||
631 | *head = vrh->vring.num; | ||
632 | err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx); | ||
633 | if (err < 0) | ||
634 | return err; | ||
635 | |||
636 | /* Empty... */ | ||
637 | if (err == vrh->vring.num) | ||
638 | return 0; | ||
639 | |||
640 | /* We need the layouts to be identical for this to work */ | ||
641 | BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov)); | ||
642 | BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) != | ||
643 | offsetof(struct vringh_iov, iov)); | ||
644 | BUILD_BUG_ON(offsetof(struct vringh_kiov, i) != | ||
645 | offsetof(struct vringh_iov, i)); | ||
646 | BUILD_BUG_ON(offsetof(struct vringh_kiov, used) != | ||
647 | offsetof(struct vringh_iov, used)); | ||
648 | BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) != | ||
649 | offsetof(struct vringh_iov, max_num)); | ||
650 | BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); | ||
651 | BUILD_BUG_ON(offsetof(struct iovec, iov_base) != | ||
652 | offsetof(struct kvec, iov_base)); | ||
653 | BUILD_BUG_ON(offsetof(struct iovec, iov_len) != | ||
654 | offsetof(struct kvec, iov_len)); | ||
655 | BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base) | ||
656 | != sizeof(((struct kvec *)NULL)->iov_base)); | ||
657 | BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len) | ||
658 | != sizeof(((struct kvec *)NULL)->iov_len)); | ||
659 | |||
660 | *head = err; | ||
661 | err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov, | ||
662 | (struct vringh_kiov *)wiov, | ||
663 | range_check, getrange, GFP_KERNEL, copydesc_user); | ||
664 | if (err) | ||
665 | return err; | ||
666 | |||
667 | return 1; | ||
668 | } | ||
669 | EXPORT_SYMBOL(vringh_getdesc_user); | ||
670 | |||
671 | /** | ||
672 | * vringh_iov_pull_user - copy bytes from vringh_iov. | ||
673 | * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume) | ||
674 | * @dst: the place to copy. | ||
675 | * @len: the maximum length to copy. | ||
676 | * | ||
677 | * Returns the bytes copied <= len or a negative errno. | ||
678 | */ | ||
679 | ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len) | ||
680 | { | ||
681 | return vringh_iov_xfer((struct vringh_kiov *)riov, | ||
682 | dst, len, xfer_from_user); | ||
683 | } | ||
684 | EXPORT_SYMBOL(vringh_iov_pull_user); | ||
685 | |||
686 | /** | ||
687 | * vringh_iov_push_user - copy bytes into vringh_iov. | ||
688 | * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume) | ||
689 | * @src: the place to copy from. | ||
690 | * @len: the maximum length to copy. | ||
691 | * | ||
692 | * Returns the bytes copied <= len or a negative errno. | ||
693 | */ | ||
694 | ssize_t vringh_iov_push_user(struct vringh_iov *wiov, | ||
695 | const void *src, size_t len) | ||
696 | { | ||
697 | return vringh_iov_xfer((struct vringh_kiov *)wiov, | ||
698 | (void *)src, len, xfer_to_user); | ||
699 | } | ||
700 | EXPORT_SYMBOL(vringh_iov_push_user); | ||
701 | |||
702 | /** | ||
703 | * vringh_abandon_user - we've decided not to handle the descriptor(s). | ||
704 | * @vrh: the vring. | ||
705 | * @num: the number of descriptors to put back (i.e. how many | ||
706 | * vringh_getdesc_user() calls to undo). | ||
707 | * | ||
708 | * The next vringh_getdesc_user() will return the old descriptor(s) again. | ||
709 | */ | ||
710 | void vringh_abandon_user(struct vringh *vrh, unsigned int num) | ||
711 | { | ||
712 | /* We only update vring_avail_event(vr) when we want to be notified, | ||
713 | * so we haven't changed that yet. */ | ||
714 | vrh->last_avail_idx -= num; | ||
715 | } | ||
716 | EXPORT_SYMBOL(vringh_abandon_user); | ||
717 | |||
718 | /** | ||
719 | * vringh_complete_user - we've finished with descriptor, publish it. | ||
720 | * @vrh: the vring. | ||
721 | * @head: the head as filled in by vringh_getdesc_user. | ||
722 | * @len: the length of data we have written. | ||
723 | * | ||
724 | * You should check vringh_need_notify_user() after one or more calls | ||
725 | * to this function. | ||
726 | */ | ||
727 | int vringh_complete_user(struct vringh *vrh, u16 head, u32 len) | ||
728 | { | ||
729 | struct vring_used_elem used; | ||
730 | |||
731 | used.id = head; | ||
732 | used.len = len; | ||
733 | return __vringh_complete(vrh, &used, 1, putu16_user, putused_user); | ||
734 | } | ||
735 | EXPORT_SYMBOL(vringh_complete_user); | ||
736 | |||
737 | /** | ||
738 | * vringh_complete_multi_user - we've finished with many descriptors. | ||
739 | * @vrh: the vring. | ||
740 | * @used: the head, length pairs. | ||
741 | * @num_used: the number of used elements. | ||
742 | * | ||
743 | * You should check vringh_need_notify_user() after one or more calls | ||
744 | * to this function. | ||
745 | */ | ||
746 | int vringh_complete_multi_user(struct vringh *vrh, | ||
747 | const struct vring_used_elem used[], | ||
748 | unsigned num_used) | ||
749 | { | ||
750 | return __vringh_complete(vrh, used, num_used, | ||
751 | putu16_user, putused_user); | ||
752 | } | ||
753 | EXPORT_SYMBOL(vringh_complete_multi_user); | ||
754 | |||
755 | /** | ||
756 | * vringh_notify_enable_user - we want to know if something changes. | ||
757 | * @vrh: the vring. | ||
758 | * | ||
759 | * This always enables notifications, but returns false if there are | ||
760 | * now more buffers available in the vring. | ||
761 | */ | ||
762 | bool vringh_notify_enable_user(struct vringh *vrh) | ||
763 | { | ||
764 | return __vringh_notify_enable(vrh, getu16_user, putu16_user); | ||
765 | } | ||
766 | EXPORT_SYMBOL(vringh_notify_enable_user); | ||
767 | |||
768 | /** | ||
769 | * vringh_notify_disable_user - don't tell us if something changes. | ||
770 | * @vrh: the vring. | ||
771 | * | ||
772 | * This is our normal running state: we disable and then only enable when | ||
773 | * we're going to sleep. | ||
774 | */ | ||
775 | void vringh_notify_disable_user(struct vringh *vrh) | ||
776 | { | ||
777 | __vringh_notify_disable(vrh, putu16_user); | ||
778 | } | ||
779 | EXPORT_SYMBOL(vringh_notify_disable_user); | ||
780 | |||
781 | /** | ||
782 | * vringh_need_notify_user - must we tell the other side about used buffers? | ||
783 | * @vrh: the vring we've called vringh_complete_user() on. | ||
784 | * | ||
785 | * Returns -errno on error, 0 if we don't need to tell the other side, 1 if we do. | ||
786 | */ | ||
787 | int vringh_need_notify_user(struct vringh *vrh) | ||
788 | { | ||
789 | return __vringh_need_notify(vrh, getu16_user); | ||
790 | } | ||
791 | EXPORT_SYMBOL(vringh_need_notify_user); | ||
792 | |||
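A hedged sketch of how a host-side consumer drives the userspace API above, before the kernelspace mirror that follows (the getrange body is a stub, the vringh_iov_init/vringh_iov_cleanup helpers from linux/vringh.h are assumptions, and error handling is abbreviated):

	static bool example_getrange(struct vringh *vrh, u64 addr,
				     struct vringh_range *r)
	{
		/* fill r->start, r->end_incl and r->offset from the
		 * guest memory map; returning false rejects addr */
		return false;
	}

	static void example_service(struct vringh *vrh)
	{
		struct vringh_iov riov, wiov;
		char buf[64];
		u16 head;

		vringh_iov_init(&riov, NULL, 0);
		vringh_iov_init(&wiov, NULL, 0);

		while (vringh_getdesc_user(vrh, &riov, &wiov,
					   example_getrange, &head) == 1) {
			if (vringh_iov_pull_user(&riov, buf, sizeof(buf)) >= 0)
				vringh_complete_user(vrh, head, 0);
		}
		if (vringh_need_notify_user(vrh) > 0)
			;	/* signal the guest, e.g. via an eventfd */
		/* vringh_iov_cleanup() would free any grown iov here */
	}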
793 | /* Kernelspace access helpers. */ | ||
794 | static inline int getu16_kern(u16 *val, const u16 *p) | ||
795 | { | ||
796 | *val = ACCESS_ONCE(*p); | ||
797 | return 0; | ||
798 | } | ||
799 | |||
800 | static inline int putu16_kern(u16 *p, u16 val) | ||
801 | { | ||
802 | ACCESS_ONCE(*p) = val; | ||
803 | return 0; | ||
804 | } | ||
805 | |||
806 | static inline int copydesc_kern(void *dst, const void *src, size_t len) | ||
807 | { | ||
808 | memcpy(dst, src, len); | ||
809 | return 0; | ||
810 | } | ||
811 | |||
812 | static inline int putused_kern(struct vring_used_elem *dst, | ||
813 | const struct vring_used_elem *src, | ||
814 | unsigned int num) | ||
815 | { | ||
816 | memcpy(dst, src, num * sizeof(*dst)); | ||
817 | return 0; | ||
818 | } | ||
819 | |||
820 | static inline int xfer_kern(void *src, void *dst, size_t len) | ||
821 | { | ||
822 | memcpy(dst, src, len); | ||
823 | return 0; | ||
824 | } | ||
825 | |||
826 | /** | ||
827 | * vringh_init_kern - initialize a vringh for a kernelspace vring. | ||
828 | * @vrh: the vringh to initialize. | ||
829 | * @features: the feature bits for this ring. | ||
830 | * @num: the number of elements. | ||
831 | * @weak_barriers: true if we only need memory barriers, not I/O. | ||
832 | * @desc: the kernelspace descriptor pointer. | ||
833 | * @avail: the kernelspace avail pointer. | ||
834 | * @used: the kernelspace used pointer. | ||
835 | * | ||
836 | * Returns an error if num is invalid. | ||
837 | */ | ||
838 | int vringh_init_kern(struct vringh *vrh, u32 features, | ||
839 | unsigned int num, bool weak_barriers, | ||
840 | struct vring_desc *desc, | ||
841 | struct vring_avail *avail, | ||
842 | struct vring_used *used) | ||
843 | { | ||
844 | /* Sane power of 2 please! */ | ||
845 | if (!num || num > 0xffff || (num & (num - 1))) { | ||
846 | vringh_bad("Bad ring size %u", num); | ||
847 | return -EINVAL; | ||
848 | } | ||
849 | |||
850 | vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); | ||
851 | vrh->weak_barriers = weak_barriers; | ||
852 | vrh->completed = 0; | ||
853 | vrh->last_avail_idx = 0; | ||
854 | vrh->last_used_idx = 0; | ||
855 | vrh->vring.num = num; | ||
856 | vrh->vring.desc = desc; | ||
857 | vrh->vring.avail = avail; | ||
858 | vrh->vring.used = used; | ||
859 | return 0; | ||
860 | } | ||
861 | EXPORT_SYMBOL(vringh_init_kern); | ||
862 | |||
863 | /** | ||
864 | * vringh_getdesc_kern - get next available descriptor from kernelspace ring. | ||
865 | * @vrh: the kernelspace vring. | ||
866 | * @riov: where to put the readable descriptors (or NULL) | ||
867 | * @wiov: where to put the writable descriptors (or NULL) | ||
868 | * @head: head index we received, for passing to vringh_complete_kern(). | ||
869 | * @gfp: flags for allocating larger riov/wiov. | ||
870 | * | ||
871 | * Returns 0 if there was no descriptor, 1 if there was, or -errno. | ||
872 | * | ||
873 | * Note that on error return, you can tell the difference between an | ||
874 | * invalid ring and a single invalid descriptor: in the former case, | ||
875 | * *head will be vrh->vring.num. You may be able to ignore an invalid | ||
876 | * descriptor, but there's not much you can do with an invalid ring. | ||
877 | * | ||
878 | * Note that you may need to clean up riov and wiov, even on error! | ||
879 | */ | ||
880 | int vringh_getdesc_kern(struct vringh *vrh, | ||
881 | struct vringh_kiov *riov, | ||
882 | struct vringh_kiov *wiov, | ||
883 | u16 *head, | ||
884 | gfp_t gfp) | ||
885 | { | ||
886 | int err; | ||
887 | |||
888 | err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx); | ||
889 | if (err < 0) | ||
890 | return err; | ||
891 | |||
892 | /* Empty... */ | ||
893 | if (err == vrh->vring.num) | ||
894 | return 0; | ||
895 | |||
896 | *head = err; | ||
897 | err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL, | ||
898 | gfp, copydesc_kern); | ||
899 | if (err) | ||
900 | return err; | ||
901 | |||
902 | return 1; | ||
903 | } | ||
904 | EXPORT_SYMBOL(vringh_getdesc_kern); | ||
905 | |||
906 | /** | ||
907 | * vringh_iov_pull_kern - copy bytes from vringh_kiov. | ||
908 | * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume) | ||
909 | * @dst: the place to copy. | ||
910 | * @len: the maximum length to copy. | ||
911 | * | ||
912 | * Returns the bytes copied <= len or a negative errno. | ||
913 | */ | ||
914 | ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len) | ||
915 | { | ||
916 | return vringh_iov_xfer(riov, dst, len, xfer_kern); | ||
917 | } | ||
918 | EXPORT_SYMBOL(vringh_iov_pull_kern); | ||
919 | |||
920 | /** | ||
921 | * vringh_iov_push_kern - copy bytes into vringh_kiov. | ||
922 | * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume) | ||
923 | * @src: the place to copy from. | ||
924 | * @len: the maximum length to copy. | ||
925 | * | ||
926 | * Returns the bytes copied <= len or a negative errno. | ||
927 | */ | ||
928 | ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, | ||
929 | const void *src, size_t len) | ||
930 | { | ||
931 | return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern); | ||
932 | } | ||
933 | EXPORT_SYMBOL(vringh_iov_push_kern); | ||
934 | |||
935 | /** | ||
936 | * vringh_abandon_kern - we've decided not to handle the descriptor(s). | ||
937 | * @vrh: the vring. | ||
938 | * @num: the number of descriptors to put back (i.e. how many | ||
939 | * vringh_getdesc_kern() calls to undo). | ||
940 | * | ||
941 | * The next vringh_getdesc_kern() will return the old descriptor(s) again. | ||
942 | */ | ||
943 | void vringh_abandon_kern(struct vringh *vrh, unsigned int num) | ||
944 | { | ||
945 | /* We only update vring_avail_event(vr) when we want to be notified, | ||
946 | * so we haven't changed that yet. */ | ||
947 | vrh->last_avail_idx -= num; | ||
948 | } | ||
949 | EXPORT_SYMBOL(vringh_abandon_kern); | ||
950 | |||
951 | /** | ||
952 | * vringh_complete_kern - we've finished with descriptor, publish it. | ||
953 | * @vrh: the vring. | ||
954 | * @head: the head as filled in by vringh_getdesc_kern. | ||
955 | * @len: the length of data we have written. | ||
956 | * | ||
957 | * You should check vringh_need_notify_kern() after one or more calls | ||
958 | * to this function. | ||
959 | */ | ||
960 | int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len) | ||
961 | { | ||
962 | struct vring_used_elem used; | ||
963 | |||
964 | used.id = head; | ||
965 | used.len = len; | ||
966 | |||
967 | return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern); | ||
968 | } | ||
969 | EXPORT_SYMBOL(vringh_complete_kern); | ||
970 | |||
971 | /** | ||
972 | * vringh_notify_enable_kern - we want to know if something changes. | ||
973 | * @vrh: the vring. | ||
974 | * | ||
975 | * This always enables notifications, but returns false if there are | ||
976 | * now more buffers available in the vring. | ||
977 | */ | ||
978 | bool vringh_notify_enable_kern(struct vringh *vrh) | ||
979 | { | ||
980 | return __vringh_notify_enable(vrh, getu16_kern, putu16_kern); | ||
981 | } | ||
982 | EXPORT_SYMBOL(vringh_notify_enable_kern); | ||
983 | |||
984 | /** | ||
985 | * vringh_notify_disable_kern - don't tell us if something changes. | ||
986 | * @vrh: the vring. | ||
987 | * | ||
988 | * This is our normal running state: we disable and then only enable when | ||
989 | * we're going to sleep. | ||
990 | */ | ||
991 | void vringh_notify_disable_kern(struct vringh *vrh) | ||
992 | { | ||
993 | __vringh_notify_disable(vrh, putu16_kern); | ||
994 | } | ||
995 | EXPORT_SYMBOL(vringh_notify_disable_kern); | ||
996 | |||
997 | /** | ||
998 | * vringh_need_notify_kern - must we tell the other side about used buffers? | ||
999 | * @vrh: the vring we've called vringh_complete_kern() on. | ||
1000 | * | ||
1001 | * Returns -errno on error, 0 if we don't need to tell the other side, 1 if we do. | ||
1002 | */ | ||
1003 | int vringh_need_notify_kern(struct vringh *vrh) | ||
1004 | { | ||
1005 | return __vringh_need_notify(vrh, getu16_kern); | ||
1006 | } | ||
1007 | EXPORT_SYMBOL(vringh_need_notify_kern); | ||
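The kernelspace half mirrors the userspace API one-for-one, swapping uaccess copies for memcpy() and skipping range checks. A hedged sketch of the intended sequence for consuming a single buffer (the vringh_kiov_init helper from linux/vringh.h is assumed; how the other side is kicked is transport-specific):

	#include <linux/vringh.h>

	static int example_consume(struct vringh *vrh)
	{
		struct vringh_kiov riov;
		struct kvec kv[4];
		char buf[64];
		u16 head;
		int err;

		vringh_kiov_init(&riov, kv, ARRAY_SIZE(kv));

		err = vringh_getdesc_kern(vrh, &riov, NULL, &head, GFP_KERNEL);
		if (err <= 0)
			return err;	/* 0: ring empty, <0: bad descriptor */

		vringh_iov_pull_kern(&riov, buf, sizeof(buf));
		err = vringh_complete_kern(vrh, head, 0);
		if (!err && vringh_need_notify_kern(vrh) > 0)
			;	/* kick the other side here */
		return err;
	}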
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 8dab163c5ef0..bd3ae324a1a2 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c | |||
@@ -108,7 +108,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) | |||
108 | sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); | 108 | sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); |
109 | 109 | ||
110 | /* We should always be able to add one buffer to an empty queue. */ | 110 | /* We should always be able to add one buffer to an empty queue. */ |
111 | if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) | 111 | if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0) |
112 | BUG(); | 112 | BUG(); |
113 | virtqueue_kick(vq); | 113 | virtqueue_kick(vq); |
114 | 114 | ||
@@ -256,7 +256,7 @@ static void stats_handle_request(struct virtio_balloon *vb) | |||
256 | if (!virtqueue_get_buf(vq, &len)) | 256 | if (!virtqueue_get_buf(vq, &len)) |
257 | return; | 257 | return; |
258 | sg_init_one(&sg, vb->stats, sizeof(vb->stats)); | 258 | sg_init_one(&sg, vb->stats, sizeof(vb->stats)); |
259 | if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) | 259 | if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0) |
260 | BUG(); | 260 | BUG(); |
261 | virtqueue_kick(vq); | 261 | virtqueue_kick(vq); |
262 | } | 262 | } |
@@ -341,7 +341,7 @@ static int init_vqs(struct virtio_balloon *vb) | |||
341 | * use it to signal us later. | 341 | * use it to signal us later. |
342 | */ | 342 | */ |
343 | sg_init_one(&sg, vb->stats, sizeof vb->stats); | 343 | sg_init_one(&sg, vb->stats, sizeof vb->stats); |
344 | if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL) | 344 | if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL) |
345 | < 0) | 345 | < 0) |
346 | BUG(); | 346 | BUG(); |
347 | virtqueue_kick(vb->stats_vq); | 347 | virtqueue_kick(vb->stats_vq); |
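Each balloon conversion is mechanical: a virtqueue_add_buf(vq, &sg, 1, 0, ...) call, one readable sg and nothing writable, becomes virtqueue_add_outbuf(vq, &sg, 1, ...). The symmetric input case looks like this hedged sketch (buffer and token names hypothetical):

	struct scatterlist sg;

	sg_init_one(&sg, buf, buf_len);
	/* old form: virtqueue_add_buf(vq, &sg, 0, 1, token, GFP_KERNEL) */
	if (virtqueue_add_inbuf(vq, &sg, 1, token, GFP_KERNEL) < 0)
		/* ring full: retry after virtqueue_get_buf() reclaims */;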
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index ffd7e7da5d3b..5217baf5528c 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c | |||
@@ -24,27 +24,6 @@ | |||
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/hrtimer.h> | 25 | #include <linux/hrtimer.h> |
26 | 26 | ||
27 | /* virtio guest is communicating with a virtual "device" that actually runs on | ||
28 | * a host processor. Memory barriers are used to control SMP effects. */ | ||
29 | #ifdef CONFIG_SMP | ||
30 | /* Where possible, use SMP barriers which are more lightweight than mandatory | ||
31 | * barriers, because mandatory barriers control MMIO effects on accesses | ||
32 | * through relaxed memory I/O windows (which virtio-pci does not use). */ | ||
33 | #define virtio_mb(vq) \ | ||
34 | do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0) | ||
35 | #define virtio_rmb(vq) \ | ||
36 | do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0) | ||
37 | #define virtio_wmb(vq) \ | ||
38 | do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0) | ||
39 | #else | ||
40 | /* We must force memory ordering even if guest is UP since host could be | ||
41 | * running on another CPU, but SMP barriers are defined to barrier() in that | ||
42 | * configuration. So fall back to mandatory barriers instead. */ | ||
43 | #define virtio_mb(vq) mb() | ||
44 | #define virtio_rmb(vq) rmb() | ||
45 | #define virtio_wmb(vq) wmb() | ||
46 | #endif | ||
47 | |||
48 | #ifdef DEBUG | 27 | #ifdef DEBUG |
49 | /* For development, we want to crash whenever the ring is screwed. */ | 28 | /* For development, we want to crash whenever the ring is screwed. */ |
50 | #define BAD_RING(_vq, fmt, args...) \ | 29 | #define BAD_RING(_vq, fmt, args...) \ |
@@ -119,16 +98,36 @@ struct vring_virtqueue | |||
119 | 98 | ||
120 | #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) | 99 | #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) |
121 | 100 | ||
101 | static inline struct scatterlist *sg_next_chained(struct scatterlist *sg, | ||
102 | unsigned int *count) | ||
103 | { | ||
104 | return sg_next(sg); | ||
105 | } | ||
106 | |||
107 | static inline struct scatterlist *sg_next_arr(struct scatterlist *sg, | ||
108 | unsigned int *count) | ||
109 | { | ||
110 | if (--(*count) == 0) | ||
111 | return NULL; | ||
112 | return sg + 1; | ||
113 | } | ||
114 | |||
122 | /* Set up an indirect table of descriptors and add it to the queue. */ | 115 | /* Set up an indirect table of descriptors and add it to the queue. */ |
123 | static int vring_add_indirect(struct vring_virtqueue *vq, | 116 | static inline int vring_add_indirect(struct vring_virtqueue *vq, |
124 | struct scatterlist sg[], | 117 | struct scatterlist *sgs[], |
125 | unsigned int out, | 118 | struct scatterlist *(*next) |
126 | unsigned int in, | 119 | (struct scatterlist *, unsigned int *), |
127 | gfp_t gfp) | 120 | unsigned int total_sg, |
121 | unsigned int total_out, | ||
122 | unsigned int total_in, | ||
123 | unsigned int out_sgs, | ||
124 | unsigned int in_sgs, | ||
125 | gfp_t gfp) | ||
128 | { | 126 | { |
129 | struct vring_desc *desc; | 127 | struct vring_desc *desc; |
130 | unsigned head; | 128 | unsigned head; |
131 | int i; | 129 | struct scatterlist *sg; |
130 | int i, n; | ||
132 | 131 | ||
133 | /* | 132 | /* |
134 | * We require lowmem mappings for the descriptors because | 133 | * We require lowmem mappings for the descriptors because |
@@ -137,25 +136,31 @@ static int vring_add_indirect(struct vring_virtqueue *vq, | |||
137 | */ | 136 | */ |
138 | gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH); | 137 | gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH); |
139 | 138 | ||
140 | desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp); | 139 | desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp); |
141 | if (!desc) | 140 | if (!desc) |
142 | return -ENOMEM; | 141 | return -ENOMEM; |
143 | 142 | ||
144 | /* Transfer entries from the sg list into the indirect page */ | 143 | /* Transfer entries from the sg lists into the indirect page */ |
145 | for (i = 0; i < out; i++) { | 144 | i = 0; |
146 | desc[i].flags = VRING_DESC_F_NEXT; | 145 | for (n = 0; n < out_sgs; n++) { |
147 | desc[i].addr = sg_phys(sg); | 146 | for (sg = sgs[n]; sg; sg = next(sg, &total_out)) { |
148 | desc[i].len = sg->length; | 147 | desc[i].flags = VRING_DESC_F_NEXT; |
149 | desc[i].next = i+1; | 148 | desc[i].addr = sg_phys(sg); |
150 | sg++; | 149 | desc[i].len = sg->length; |
150 | desc[i].next = i+1; | ||
151 | i++; | ||
152 | } | ||
151 | } | 153 | } |
152 | for (; i < (out + in); i++) { | 154 | for (; n < (out_sgs + in_sgs); n++) { |
153 | desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; | 155 | for (sg = sgs[n]; sg; sg = next(sg, &total_in)) { |
154 | desc[i].addr = sg_phys(sg); | 156 | desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; |
155 | desc[i].len = sg->length; | 157 | desc[i].addr = sg_phys(sg); |
156 | desc[i].next = i+1; | 158 | desc[i].len = sg->length; |
157 | sg++; | 159 | desc[i].next = i+1; |
160 | i++; | ||
161 | } | ||
158 | } | 162 | } |
163 | BUG_ON(i != total_sg); | ||
159 | 164 | ||
160 | /* Last one doesn't continue. */ | 165 | /* Last one doesn't continue. */ |
161 | desc[i-1].flags &= ~VRING_DESC_F_NEXT; | 166 | desc[i-1].flags &= ~VRING_DESC_F_NEXT; |
@@ -176,29 +181,20 @@ static int vring_add_indirect(struct vring_virtqueue *vq, | |||
176 | return head; | 181 | return head; |
177 | } | 182 | } |
178 | 183 | ||
179 | /** | 184 | static inline int virtqueue_add(struct virtqueue *_vq, |
180 | * virtqueue_add_buf - expose buffer to other end | 185 | struct scatterlist *sgs[], |
181 | * @vq: the struct virtqueue we're talking about. | 186 | struct scatterlist *(*next) |
182 | * @sg: the description of the buffer(s). | 187 | (struct scatterlist *, unsigned int *), |
183 | * @out_num: the number of sg readable by other side | 188 | unsigned int total_out, |
184 | * @in_num: the number of sg which are writable (after readable ones) | 189 | unsigned int total_in, |
185 | * @data: the token identifying the buffer. | 190 | unsigned int out_sgs, |
186 | * @gfp: how to do memory allocations (if necessary). | 191 | unsigned int in_sgs, |
187 | * | 192 | void *data, |
188 | * Caller must ensure we don't call this with other virtqueue operations | 193 | gfp_t gfp) |
189 | * at the same time (except where noted). | ||
190 | * | ||
191 | * Returns zero or a negative error (ie. ENOSPC, ENOMEM). | ||
192 | */ | ||
193 | int virtqueue_add_buf(struct virtqueue *_vq, | ||
194 | struct scatterlist sg[], | ||
195 | unsigned int out, | ||
196 | unsigned int in, | ||
197 | void *data, | ||
198 | gfp_t gfp) | ||
199 | { | 194 | { |
200 | struct vring_virtqueue *vq = to_vvq(_vq); | 195 | struct vring_virtqueue *vq = to_vvq(_vq); |
201 | unsigned int i, avail, uninitialized_var(prev); | 196 | struct scatterlist *sg; |
197 | unsigned int i, n, avail, uninitialized_var(prev), total_sg; | ||
202 | int head; | 198 | int head; |
203 | 199 | ||
204 | START_USE(vq); | 200 | START_USE(vq); |
@@ -218,46 +214,54 @@ int virtqueue_add_buf(struct virtqueue *_vq, | |||
218 | } | 214 | } |
219 | #endif | 215 | #endif |
220 | 216 | ||
217 | total_sg = total_in + total_out; | ||
218 | |||
221 | /* If the host supports indirect descriptor tables, and we have multiple | 219 | /* If the host supports indirect descriptor tables, and we have multiple |
222 | * buffers, then go indirect. FIXME: tune this threshold */ | 220 | * buffers, then go indirect. FIXME: tune this threshold */ |
223 | if (vq->indirect && (out + in) > 1 && vq->vq.num_free) { | 221 | if (vq->indirect && total_sg > 1 && vq->vq.num_free) { |
224 | head = vring_add_indirect(vq, sg, out, in, gfp); | 222 | head = vring_add_indirect(vq, sgs, next, total_sg, total_out, |
223 | total_in, | ||
224 | out_sgs, in_sgs, gfp); | ||
225 | if (likely(head >= 0)) | 225 | if (likely(head >= 0)) |
226 | goto add_head; | 226 | goto add_head; |
227 | } | 227 | } |
228 | 228 | ||
229 | BUG_ON(out + in > vq->vring.num); | 229 | BUG_ON(total_sg > vq->vring.num); |
230 | BUG_ON(out + in == 0); | 230 | BUG_ON(total_sg == 0); |
231 | 231 | ||
232 | if (vq->vq.num_free < out + in) { | 232 | if (vq->vq.num_free < total_sg) { |
233 | pr_debug("Can't add buf len %i - avail = %i\n", | 233 | pr_debug("Can't add buf len %i - avail = %i\n", |
234 | out + in, vq->vq.num_free); | 234 | total_sg, vq->vq.num_free); |
235 | /* FIXME: for historical reasons, we force a notify here if | 235 | /* FIXME: for historical reasons, we force a notify here if |
236 | * there are outgoing parts to the buffer. Presumably the | 236 | * there are outgoing parts to the buffer. Presumably the |
237 | * host should service the ring ASAP. */ | 237 | * host should service the ring ASAP. */ |
238 | if (out) | 238 | if (out_sgs) |
239 | vq->notify(&vq->vq); | 239 | vq->notify(&vq->vq); |
240 | END_USE(vq); | 240 | END_USE(vq); |
241 | return -ENOSPC; | 241 | return -ENOSPC; |
242 | } | 242 | } |
243 | 243 | ||
244 | /* We're about to use some buffers from the free list. */ | 244 | /* We're about to use some buffers from the free list. */ |
245 | vq->vq.num_free -= out + in; | 245 | vq->vq.num_free -= total_sg; |
246 | 246 | ||
247 | head = vq->free_head; | 247 | head = i = vq->free_head; |
248 | for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { | 248 | for (n = 0; n < out_sgs; n++) { |
249 | vq->vring.desc[i].flags = VRING_DESC_F_NEXT; | 249 | for (sg = sgs[n]; sg; sg = next(sg, &total_out)) { |
250 | vq->vring.desc[i].addr = sg_phys(sg); | 250 | vq->vring.desc[i].flags = VRING_DESC_F_NEXT; |
251 | vq->vring.desc[i].len = sg->length; | 251 | vq->vring.desc[i].addr = sg_phys(sg); |
252 | prev = i; | 252 | vq->vring.desc[i].len = sg->length; |
253 | sg++; | 253 | prev = i; |
254 | i = vq->vring.desc[i].next; | ||
255 | } | ||
254 | } | 256 | } |
255 | for (; in; i = vq->vring.desc[i].next, in--) { | 257 | for (; n < (out_sgs + in_sgs); n++) { |
256 | vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; | 258 | for (sg = sgs[n]; sg; sg = next(sg, &total_in)) { |
257 | vq->vring.desc[i].addr = sg_phys(sg); | 259 | vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; |
258 | vq->vring.desc[i].len = sg->length; | 260 | vq->vring.desc[i].addr = sg_phys(sg); |
259 | prev = i; | 261 | vq->vring.desc[i].len = sg->length; |
260 | sg++; | 262 | prev = i; |
263 | i = vq->vring.desc[i].next; | ||
264 | } | ||
261 | } | 265 | } |
262 | /* Last one doesn't continue. */ | 266 | /* Last one doesn't continue. */ |
263 | vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT; | 267 | vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT; |
@@ -276,7 +280,7 @@ add_head: | |||
276 | 280 | ||
277 | /* Descriptors and available array need to be set before we expose the | 281 | /* Descriptors and available array need to be set before we expose the |
278 | * new available array entries. */ | 282 | * new available array entries. */ |
279 | virtio_wmb(vq); | 283 | virtio_wmb(vq->weak_barriers); |
280 | vq->vring.avail->idx++; | 284 | vq->vring.avail->idx++; |
281 | vq->num_added++; | 285 | vq->num_added++; |
282 | 286 | ||
@@ -290,9 +294,122 @@ add_head: | |||
290 | 294 | ||
291 | return 0; | 295 | return 0; |
292 | } | 296 | } |
297 | |||
298 | /** | ||
299 | * virtqueue_add_buf - expose buffer to other end | ||
300 | * @vq: the struct virtqueue we're talking about. | ||
301 | * @sg: the description of the buffer(s). | ||
302 | * @out: the number of sg entries readable by other side | ||
303 | * @in: the number of sg entries which are writable (after readable ones) | ||
304 | * @data: the token identifying the buffer. | ||
305 | * @gfp: how to do memory allocations (if necessary). | ||
306 | * | ||
307 | * Caller must ensure we don't call this with other virtqueue operations | ||
308 | * at the same time (except where noted). | ||
309 | * | ||
310 | * Returns zero or a negative error (ie. ENOSPC, ENOMEM). | ||
311 | */ | ||
312 | int virtqueue_add_buf(struct virtqueue *_vq, | ||
313 | struct scatterlist sg[], | ||
314 | unsigned int out, | ||
315 | unsigned int in, | ||
316 | void *data, | ||
317 | gfp_t gfp) | ||
318 | { | ||
319 | struct scatterlist *sgs[2]; | ||
320 | |||
321 | sgs[0] = sg; | ||
322 | sgs[1] = sg + out; | ||
323 | |||
324 | return virtqueue_add(_vq, sgs, sg_next_arr, | ||
325 | out, in, out ? 1 : 0, in ? 1 : 0, data, gfp); | ||
326 | } | ||
293 | EXPORT_SYMBOL_GPL(virtqueue_add_buf); | 327 | EXPORT_SYMBOL_GPL(virtqueue_add_buf); |
294 | 328 | ||
295 | /** | 329 | /** |
330 | * virtqueue_add_sgs - expose buffers to other end | ||
331 | * @vq: the struct virtqueue we're talking about. | ||
332 | * @sgs: array of terminated scatterlists. | ||
333 | * @out_sgs: the number of scatterlists readable by other side | ||
334 | * @in_sgs: the number of scatterlists which are writable (after readable ones) | ||
335 | * @data: the token identifying the buffer. | ||
336 | * @gfp: how to do memory allocations (if necessary). | ||
337 | * | ||
338 | * Caller must ensure we don't call this with other virtqueue operations | ||
339 | * at the same time (except where noted). | ||
340 | * | ||
341 | * Returns zero or a negative error (ie. ENOSPC, ENOMEM). | ||
342 | */ | ||
343 | int virtqueue_add_sgs(struct virtqueue *_vq, | ||
344 | struct scatterlist *sgs[], | ||
345 | unsigned int out_sgs, | ||
346 | unsigned int in_sgs, | ||
347 | void *data, | ||
348 | gfp_t gfp) | ||
349 | { | ||
350 | unsigned int i, total_out, total_in; | ||
351 | |||
352 | /* Count them first. */ | ||
353 | for (i = total_out = total_in = 0; i < out_sgs; i++) { | ||
354 | struct scatterlist *sg; | ||
355 | for (sg = sgs[i]; sg; sg = sg_next(sg)) | ||
356 | total_out++; | ||
357 | } | ||
358 | for (; i < out_sgs + in_sgs; i++) { | ||
359 | struct scatterlist *sg; | ||
360 | for (sg = sgs[i]; sg; sg = sg_next(sg)) | ||
361 | total_in++; | ||
362 | } | ||
363 | return virtqueue_add(_vq, sgs, sg_next_chained, | ||
364 | total_out, total_in, out_sgs, in_sgs, data, gfp); | ||
365 | } | ||
366 | EXPORT_SYMBOL_GPL(virtqueue_add_sgs); | ||
367 | |||
368 | /** | ||
369 | * virtqueue_add_outbuf - expose output buffers to other end | ||
370 | * @vq: the struct virtqueue we're talking about. | ||
371 | * @sg: array of scatterlist entries (need not be terminated!) | ||
372 | * @num: the number of entries in @sg readable by other side | ||
373 | * @data: the token identifying the buffer. | ||
374 | * @gfp: how to do memory allocations (if necessary). | ||
375 | * | ||
376 | * Caller must ensure we don't call this with other virtqueue operations | ||
377 | * at the same time (except where noted). | ||
378 | * | ||
379 | * Returns zero or a negative error (ie. ENOSPC, ENOMEM). | ||
380 | */ | ||
381 | int virtqueue_add_outbuf(struct virtqueue *vq, | ||
382 | struct scatterlist sg[], unsigned int num, | ||
383 | void *data, | ||
384 | gfp_t gfp) | ||
385 | { | ||
386 | return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp); | ||
387 | } | ||
388 | EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); | ||
389 | |||
390 | /** | ||
391 | * virtqueue_add_inbuf - expose input buffers to other end | ||
392 | * @vq: the struct virtqueue we're talking about. | ||
393 | * @sg: array of scatterlist entries (need not be terminated!) | ||
394 | * @num: the number of entries in @sg writable by other side | ||
395 | * @data: the token identifying the buffer. | ||
396 | * @gfp: how to do memory allocations (if necessary). | ||
397 | * | ||
398 | * Caller must ensure we don't call this with other virtqueue operations | ||
399 | * at the same time (except where noted). | ||
400 | * | ||
401 | * Returns zero or a negative error (ie. ENOSPC, ENOMEM). | ||
402 | */ | ||
403 | int virtqueue_add_inbuf(struct virtqueue *vq, | ||
404 | struct scatterlist sg[], unsigned int num, | ||
405 | void *data, | ||
406 | gfp_t gfp) | ||
407 | { | ||
408 | return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp); | ||
409 | } | ||
410 | EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); | ||
411 | |||
412 | /** | ||
296 | * virtqueue_kick_prepare - first half of split virtqueue_kick call. | 413 | * virtqueue_kick_prepare - first half of split virtqueue_kick call. |
297 | * @vq: the struct virtqueue | 414 | * @vq: the struct virtqueue |
298 | * | 415 | * |
@@ -312,7 +429,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq) | |||
312 | START_USE(vq); | 429 | START_USE(vq); |
313 | /* We need to expose available array entries before checking avail | 430 | /* We need to expose available array entries before checking avail |
314 | * event. */ | 431 | * event. */ |
315 | virtio_mb(vq); | 432 | virtio_mb(vq->weak_barriers); |
316 | 433 | ||
317 | old = vq->vring.avail->idx - vq->num_added; | 434 | old = vq->vring.avail->idx - vq->num_added; |
318 | new = vq->vring.avail->idx; | 435 | new = vq->vring.avail->idx; |
@@ -436,7 +553,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) | |||
436 | } | 553 | } |
437 | 554 | ||
438 | /* Only get used array entries after they have been exposed by host. */ | 555 | /* Only get used array entries after they have been exposed by host. */ |
439 | virtio_rmb(vq); | 556 | virtio_rmb(vq->weak_barriers); |
440 | 557 | ||
441 | last_used = (vq->last_used_idx & (vq->vring.num - 1)); | 558 | last_used = (vq->last_used_idx & (vq->vring.num - 1)); |
442 | i = vq->vring.used->ring[last_used].id; | 559 | i = vq->vring.used->ring[last_used].id; |
@@ -460,7 +577,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) | |||
460 | * the read in the next get_buf call. */ | 577 | * the read in the next get_buf call. */ |
461 | if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { | 578 | if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { |
462 | vring_used_event(&vq->vring) = vq->last_used_idx; | 579 | vring_used_event(&vq->vring) = vq->last_used_idx; |
463 | virtio_mb(vq); | 580 | virtio_mb(vq->weak_barriers); |
464 | } | 581 | } |
465 | 582 | ||
466 | #ifdef DEBUG | 583 | #ifdef DEBUG |
@@ -513,7 +630,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq) | |||
513 | * entry. Always do both to keep code simple. */ | 630 | * entry. Always do both to keep code simple. */ |
514 | vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; | 631 | vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; |
515 | vring_used_event(&vq->vring) = vq->last_used_idx; | 632 | vring_used_event(&vq->vring) = vq->last_used_idx; |
516 | virtio_mb(vq); | 633 | virtio_mb(vq->weak_barriers); |
517 | if (unlikely(more_used(vq))) { | 634 | if (unlikely(more_used(vq))) { |
518 | END_USE(vq); | 635 | END_USE(vq); |
519 | return false; | 636 | return false; |
@@ -553,7 +670,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) | |||
553 | /* TODO: tune this threshold */ | 670 | /* TODO: tune this threshold */ |
554 | bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4; | 671 | bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4; |
555 | vring_used_event(&vq->vring) = vq->last_used_idx + bufs; | 672 | vring_used_event(&vq->vring) = vq->last_used_idx + bufs; |
556 | virtio_mb(vq); | 673 | virtio_mb(vq->weak_barriers); |
557 | if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) { | 674 | if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) { |
558 | END_USE(vq); | 675 | END_USE(vq); |
559 | return false; | 676 | return false; |
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 2d8bdaef9611..bfc47e0de81c 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h | |||
@@ -172,6 +172,22 @@ static inline void sg_mark_end(struct scatterlist *sg) | |||
172 | } | 172 | } |
173 | 173 | ||
174 | /** | 174 | /** |
175 | * sg_unmark_end - Undo setting the end of the scatterlist | ||
176 | * @sg: SG entry | ||
177 | * | ||
178 | * Description: | ||
179 | * Removes the termination marker from the given entry of the scatterlist. | ||
180 | * | ||
181 | **/ | ||
182 | static inline void sg_unmark_end(struct scatterlist *sg) | ||
183 | { | ||
184 | #ifdef CONFIG_DEBUG_SG | ||
185 | BUG_ON(sg->sg_magic != SG_MAGIC); | ||
186 | #endif | ||
187 | sg->page_link &= ~0x02; | ||
188 | } | ||
189 | |||
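sg_unmark_end exists because callers such as the 9p transport below reuse one long-lived sg array for requests of varying length, so a previous, shorter request can leave a stale termination bit mid-array. A hedged sketch of the reuse pattern (loop bounds and buffers are placeholders):

	/* Sketch: rebuild a shorter list in a reused sg array. */
	for (i = 0; i < n; i++) {
		sg_unmark_end(&sg[i]);		/* clear any stale end marker */
		sg_set_buf(&sg[i], buf[i], len[i]);
	}
	if (n)
		sg_mark_end(&sg[n - 1]);	/* terminate the new list */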
190 | /** | ||
175 | * sg_phys - Return physical address of an sg entry | 191 | * sg_phys - Return physical address of an sg entry |
176 | * @sg: SG entry | 192 | * @sg: SG entry |
177 | * | 193 | * |
diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 2d7a5e045908..9ff8645b7e0b 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/device.h> | 8 | #include <linux/device.h> |
9 | #include <linux/mod_devicetable.h> | 9 | #include <linux/mod_devicetable.h> |
10 | #include <linux/gfp.h> | 10 | #include <linux/gfp.h> |
11 | #include <linux/vringh.h> | ||
11 | 12 | ||
12 | /** | 13 | /** |
13 | * virtqueue - a queue to register buffers for sending or receiving. | 14 | * virtqueue - a queue to register buffers for sending or receiving. |
@@ -40,6 +41,23 @@ int virtqueue_add_buf(struct virtqueue *vq, | |||
40 | void *data, | 41 | void *data, |
41 | gfp_t gfp); | 42 | gfp_t gfp); |
42 | 43 | ||
44 | int virtqueue_add_outbuf(struct virtqueue *vq, | ||
45 | struct scatterlist sg[], unsigned int num, | ||
46 | void *data, | ||
47 | gfp_t gfp); | ||
48 | |||
49 | int virtqueue_add_inbuf(struct virtqueue *vq, | ||
50 | struct scatterlist sg[], unsigned int num, | ||
51 | void *data, | ||
52 | gfp_t gfp); | ||
53 | |||
54 | int virtqueue_add_sgs(struct virtqueue *vq, | ||
55 | struct scatterlist *sgs[], | ||
56 | unsigned int out_sgs, | ||
57 | unsigned int in_sgs, | ||
58 | void *data, | ||
59 | gfp_t gfp); | ||
60 | |||
43 | void virtqueue_kick(struct virtqueue *vq); | 61 | void virtqueue_kick(struct virtqueue *vq); |
44 | 62 | ||
45 | bool virtqueue_kick_prepare(struct virtqueue *vq); | 63 | bool virtqueue_kick_prepare(struct virtqueue *vq); |
@@ -64,6 +82,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *vq); | |||
64 | * @dev: underlying device. | 82 | * @dev: underlying device. |
65 | * @id: the device type identification (used to match it with a driver). | 83 | * @id: the device type identification (used to match it with a driver). |
66 | * @config: the configuration ops for this device. | 84 | * @config: the configuration ops for this device. |
85 | * @vringh_config: configuration ops for host vrings. | ||
67 | * @vqs: the list of virtqueues for this device. | 86 | * @vqs: the list of virtqueues for this device. |
68 | * @features: the features supported by both driver and device. | 87 | * @features: the features supported by both driver and device. |
69 | * @priv: private pointer for the driver's use. | 88 | * @priv: private pointer for the driver's use. |
@@ -73,6 +92,7 @@ struct virtio_device { | |||
73 | struct device dev; | 92 | struct device dev; |
74 | struct virtio_device_id id; | 93 | struct virtio_device_id id; |
75 | const struct virtio_config_ops *config; | 94 | const struct virtio_config_ops *config; |
95 | const struct vringh_config_ops *vringh_config; | ||
76 | struct list_head vqs; | 96 | struct list_head vqs; |
77 | /* Note that this is a Linux set_bit-style bitmap. */ | 97 | /* Note that this is a Linux set_bit-style bitmap. */ |
78 | unsigned long features[1]; | 98 | unsigned long features[1]; |
diff --git a/include/linux/virtio_caif.h b/include/linux/virtio_caif.h new file mode 100644 index 000000000000..5d2d3124ca3d --- /dev/null +++ b/include/linux/virtio_caif.h | |||
@@ -0,0 +1,24 @@ | |||
1 | /* | ||
2 | * Copyright (C) ST-Ericsson AB 2012 | ||
3 | * Author: Sjur Brændeland <sjur.brandeland@stericsson.com> | ||
4 | * | ||
5 | * This header is BSD licensed so | ||
6 | * anyone can use the definitions to implement compatible remote processors | ||
7 | */ | ||
8 | |||
9 | #ifndef VIRTIO_CAIF_H | ||
10 | #define VIRTIO_CAIF_H | ||
11 | |||
12 | #include <linux/types.h> | ||
13 | struct virtio_caif_transf_config { | ||
14 | u16 headroom; | ||
15 | u16 tailroom; | ||
16 | u32 mtu; | ||
17 | u8 reserved[4]; | ||
18 | }; | ||
19 | |||
20 | struct virtio_caif_config { | ||
21 | struct virtio_caif_transf_config uplink, downlink; | ||
22 | u8 reserved[8]; | ||
23 | }; | ||
24 | #endif | ||
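A driver would read this layout out of virtio config space; a rough sketch using the config->get op of that era (the vdev variable and error handling are assumptions, not part of this patch):

	/* Sketch: fetch the CAIF link parameters from config space. */
	struct virtio_caif_config conf;

	vdev->config->get(vdev, 0, &conf, sizeof(conf));
	pr_debug("caif: uplink mtu %u, downlink mtu %u\n",
		 conf.uplink.mtu, conf.downlink.mtu);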
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 63c6ea199519..ca3ad41c2c82 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h | |||
@@ -4,6 +4,63 @@ | |||
4 | #include <linux/irqreturn.h> | 4 | #include <linux/irqreturn.h> |
5 | #include <uapi/linux/virtio_ring.h> | 5 | #include <uapi/linux/virtio_ring.h> |
6 | 6 | ||
7 | /* | ||
8 | * Barriers in virtio are tricky. Non-SMP virtio guests can't assume | ||
9 | * they're not on an SMP host system, so they need to assume real | ||
10 | * barriers. Non-SMP virtio hosts could skip the barriers, but does | ||
11 | * anyone care? | ||
12 | * | ||
13 | * For virtio_pci on SMP, we don't need to order with respect to MMIO | ||
14 | * accesses through relaxed memory I/O windows, so smp_mb() et al are | ||
15 | * sufficient. | ||
16 | * | ||
17 | * For using virtio to talk to real devices (e.g. other heterogeneous | ||
18 | * CPUs) we do need real barriers. In theory, we could be using both | ||
19 | * kinds of virtio, so it's a runtime decision, and the branch is | ||
20 | * actually quite cheap. | ||
21 | */ | ||
22 | |||
23 | #ifdef CONFIG_SMP | ||
24 | static inline void virtio_mb(bool weak_barriers) | ||
25 | { | ||
26 | if (weak_barriers) | ||
27 | smp_mb(); | ||
28 | else | ||
29 | mb(); | ||
30 | } | ||
31 | |||
32 | static inline void virtio_rmb(bool weak_barriers) | ||
33 | { | ||
34 | if (weak_barriers) | ||
35 | smp_rmb(); | ||
36 | else | ||
37 | rmb(); | ||
38 | } | ||
39 | |||
40 | static inline void virtio_wmb(bool weak_barriers) | ||
41 | { | ||
42 | if (weak_barriers) | ||
43 | smp_wmb(); | ||
44 | else | ||
45 | wmb(); | ||
46 | } | ||
47 | #else | ||
48 | static inline void virtio_mb(bool weak_barriers) | ||
49 | { | ||
50 | mb(); | ||
51 | } | ||
52 | |||
53 | static inline void virtio_rmb(bool weak_barriers) | ||
54 | { | ||
55 | rmb(); | ||
56 | } | ||
57 | |||
58 | static inline void virtio_wmb(bool weak_barriers) | ||
59 | { | ||
60 | wmb(); | ||
61 | } | ||
62 | #endif | ||
63 | |||
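This runtime flag is what the virtio_mb(vq->weak_barriers) call sites in the virtio_ring.c hunks above thread through. A minimal sketch of the publish pattern these helpers protect, assuming the private vring_virtqueue layout from virtio_ring.c:

	/* Sketch: order descriptor writes before publishing the index. */
	vq->vring.desc[head] = desc;		/* fill in the descriptor */
	virtio_wmb(vq->weak_barriers);		/* desc visible before idx */
	vq->vring.avail->idx++;			/* device may now consume */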
7 | struct virtio_device; | 64 | struct virtio_device; |
8 | struct virtqueue; | 65 | struct virtqueue; |
9 | 66 | ||
diff --git a/include/linux/vringh.h b/include/linux/vringh.h new file mode 100644 index 000000000000..749cde28728b --- /dev/null +++ b/include/linux/vringh.h | |||
@@ -0,0 +1,225 @@ | |||
1 | /* | ||
2 | * Linux host-side vring helpers; for when the kernel needs to access | ||
3 | * someone else's vring. | ||
4 | * | ||
5 | * Copyright IBM Corporation, 2013. | ||
6 | * Parts taken from drivers/vhost/vhost.c Copyright 2009 Red Hat, Inc. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2 of the License, or | ||
11 | * (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, write to the Free Software | ||
20 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
21 | * | ||
22 | * Written by: Rusty Russell <rusty@rustcorp.com.au> | ||
23 | */ | ||
24 | #ifndef _LINUX_VRINGH_H | ||
25 | #define _LINUX_VRINGH_H | ||
26 | #include <uapi/linux/virtio_ring.h> | ||
27 | #include <linux/uio.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <asm/barrier.h> | ||
30 | |||
31 | /* virtio_ring with information needed for host access. */ | ||
32 | struct vringh { | ||
33 | /* Guest publishes used event idx (note: we always do). */ | ||
34 | bool event_indices; | ||
35 | |||
36 | /* Can we get away with weak barriers? */ | ||
37 | bool weak_barriers; | ||
38 | |||
39 | /* Last available index we saw (ie. where we're up to). */ | ||
40 | u16 last_avail_idx; | ||
41 | |||
42 | /* Last index we used. */ | ||
43 | u16 last_used_idx; | ||
44 | |||
45 | /* How many descriptors we've completed since last need_notify(). */ | ||
46 | u32 completed; | ||
47 | |||
48 | /* The vring (note: it may contain user pointers!) */ | ||
49 | struct vring vring; | ||
50 | |||
51 | /* The function to call to notify the guest about added buffers */ | ||
52 | void (*notify)(struct vringh *); | ||
53 | }; | ||
54 | |||
55 | /** | ||
56 | * struct vringh_config_ops - ops for creating a host vring from a virtio driver | ||
57 | * @find_vrhs: find the host vrings and instantiate them | ||
58 | * vdev: the virtio_device | ||
59 | * nhvrs: the number of host vrings to find | ||
60 | * hvrs: on success, filled in with the new host vrings | ||
61 | * callbacks: array of driver callbacks, one for each host vring; | ||
62 | * include a NULL entry for vrings that do not need a callback | ||
63 | * Returns 0 on success or error status | ||
64 | * @del_vrhs: free the host vrings found by find_vrhs(). | ||
65 | */ | ||
66 | struct virtio_device; | ||
67 | typedef void vrh_callback_t(struct virtio_device *, struct vringh *); | ||
68 | struct vringh_config_ops { | ||
69 | int (*find_vrhs)(struct virtio_device *vdev, unsigned nhvrs, | ||
70 | struct vringh *vrhs[], vrh_callback_t *callbacks[]); | ||
71 | void (*del_vrhs)(struct virtio_device *vdev); | ||
72 | }; | ||
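A hedged sketch of a driver asking its transport for host vrings through these ops; the callback, counts and variable names are illustrative:

	/* Sketch: request two host vrings, only the first with a callback. */
	static vrh_callback_t my_recv_done;		/* illustrative */
	struct vringh *vrhs[2];
	vrh_callback_t *cbs[2] = { my_recv_done, NULL };
	int err;

	err = vdev->vringh_config->find_vrhs(vdev, 2, vrhs, cbs);
	if (err)
		return err;
	/* ... later, on teardown: vdev->vringh_config->del_vrhs(vdev); */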
73 | |||
74 | /* The memory the vring can access, and what offset to apply. */ | ||
75 | struct vringh_range { | ||
76 | u64 start, end_incl; | ||
77 | u64 offset; | ||
78 | }; | ||
79 | |||
80 | /** | ||
81 | * struct vringh_iov - iovec mangler. | ||
82 | * | ||
83 | * Mangles iovec in place, and restores it. | ||
84 | * Remaining data is iov + i, of used - i elements. | ||
85 | */ | ||
86 | struct vringh_iov { | ||
87 | struct iovec *iov; | ||
88 | size_t consumed; /* Within iov[i] */ | ||
89 | unsigned i, used, max_num; | ||
90 | }; | ||
91 | |||
92 | /** | ||
93 | * struct vringh_kiov - kvec mangler. | ||
94 | * | ||
95 | * Mangles kvec in place, and restores it. | ||
96 | * Remaining data is iov + i, of used - i elements. | ||
97 | */ | ||
98 | struct vringh_kiov { | ||
99 | struct kvec *iov; | ||
100 | size_t consumed; /* Within iov[i] */ | ||
101 | unsigned i, used, max_num; | ||
102 | }; | ||
103 | |||
104 | /* Flag on max_num to indicate we're kmalloced. */ | ||
105 | #define VRINGH_IOV_ALLOCATED 0x8000000 | ||
106 | |||
107 | /* Helpers for userspace vrings. */ | ||
108 | int vringh_init_user(struct vringh *vrh, u32 features, | ||
109 | unsigned int num, bool weak_barriers, | ||
110 | struct vring_desc __user *desc, | ||
111 | struct vring_avail __user *avail, | ||
112 | struct vring_used __user *used); | ||
113 | |||
114 | static inline void vringh_iov_init(struct vringh_iov *iov, | ||
115 | struct iovec *iovec, unsigned num) | ||
116 | { | ||
117 | iov->used = iov->i = 0; | ||
118 | iov->consumed = 0; | ||
119 | iov->max_num = num; | ||
120 | iov->iov = iovec; | ||
121 | } | ||
122 | |||
123 | static inline void vringh_iov_reset(struct vringh_iov *iov) | ||
124 | { | ||
125 | iov->iov[iov->i].iov_len += iov->consumed; | ||
126 | iov->iov[iov->i].iov_base -= iov->consumed; | ||
127 | iov->consumed = 0; | ||
128 | iov->i = 0; | ||
129 | } | ||
130 | |||
131 | static inline void vringh_iov_cleanup(struct vringh_iov *iov) | ||
132 | { | ||
133 | if (iov->max_num & VRINGH_IOV_ALLOCATED) | ||
134 | kfree(iov->iov); | ||
135 | iov->max_num = iov->used = iov->i = iov->consumed = 0; | ||
136 | iov->iov = NULL; | ||
137 | } | ||
138 | |||
139 | /* Convert a descriptor into iovecs. */ | ||
140 | int vringh_getdesc_user(struct vringh *vrh, | ||
141 | struct vringh_iov *riov, | ||
142 | struct vringh_iov *wiov, | ||
143 | bool (*getrange)(struct vringh *vrh, | ||
144 | u64 addr, struct vringh_range *r), | ||
145 | u16 *head); | ||
146 | |||
147 | /* Copy bytes from readable vsg, consuming it (and incrementing riov->i). */ | ||
148 | ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len); | ||
149 | |||
150 | /* Copy bytes into writable vsg, consuming it (and incrementing wiov->i). */ | ||
151 | ssize_t vringh_iov_push_user(struct vringh_iov *wiov, | ||
152 | const void *src, size_t len); | ||
153 | |||
154 | /* Mark a descriptor as used. */ | ||
155 | int vringh_complete_user(struct vringh *vrh, u16 head, u32 len); | ||
156 | int vringh_complete_multi_user(struct vringh *vrh, | ||
157 | const struct vring_used_elem used[], | ||
158 | unsigned num_used); | ||
159 | |||
160 | /* Pretend we've never seen descriptor (for easy error handling). */ | ||
161 | void vringh_abandon_user(struct vringh *vrh, unsigned int num); | ||
162 | |||
163 | /* Do we need to fire the eventfd to notify the other side? */ | ||
164 | int vringh_need_notify_user(struct vringh *vrh); | ||
165 | |||
166 | bool vringh_notify_enable_user(struct vringh *vrh); | ||
167 | void vringh_notify_disable_user(struct vringh *vrh); | ||
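Taken together, the userspace helpers support a simple host-side service loop. A rough sketch under stated assumptions (my_getrange, the iovec arrays and the req/resp handling are placeholders; return values of pull/push elided for brevity):

	/* Sketch: serve one pending descriptor from a userspace vring. */
	struct vringh_iov riov, wiov;
	u16 head;
	int err;

	vringh_iov_init(&riov, rvec, ARRAY_SIZE(rvec));
	vringh_iov_init(&wiov, wvec, ARRAY_SIZE(wvec));

	err = vringh_getdesc_user(vrh, &riov, &wiov, my_getrange, &head);
	if (err <= 0)
		return err;			/* 0: ring empty, <0: error */

	vringh_iov_pull_user(&riov, &req, sizeof(req));	/* read request */
	vringh_iov_push_user(&wiov, &resp, resp_len);	/* write reply */

	err = vringh_complete_user(vrh, head, resp_len);
	if (vringh_need_notify_user(vrh) > 0)
		vringh_notify(vrh);		/* kick the guest if needed */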
168 | |||
169 | /* Helpers for kernelspace vrings. */ | ||
170 | int vringh_init_kern(struct vringh *vrh, u32 features, | ||
171 | unsigned int num, bool weak_barriers, | ||
172 | struct vring_desc *desc, | ||
173 | struct vring_avail *avail, | ||
174 | struct vring_used *used); | ||
175 | |||
176 | static inline void vringh_kiov_init(struct vringh_kiov *kiov, | ||
177 | struct kvec *kvec, unsigned num) | ||
178 | { | ||
179 | kiov->used = kiov->i = 0; | ||
180 | kiov->consumed = 0; | ||
181 | kiov->max_num = num; | ||
182 | kiov->iov = kvec; | ||
183 | } | ||
184 | |||
185 | static inline void vringh_kiov_reset(struct vringh_kiov *kiov) | ||
186 | { | ||
187 | kiov->iov[kiov->i].iov_len += kiov->consumed; | ||
188 | kiov->iov[kiov->i].iov_base -= kiov->consumed; | ||
189 | kiov->consumed = 0; | ||
190 | kiov->i = 0; | ||
191 | } | ||
192 | |||
193 | static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov) | ||
194 | { | ||
195 | if (kiov->max_num & VRINGH_IOV_ALLOCATED) | ||
196 | kfree(kiov->iov); | ||
197 | kiov->max_num = kiov->used = kiov->i = kiov->consumed = 0; | ||
198 | kiov->iov = NULL; | ||
199 | } | ||
200 | |||
201 | int vringh_getdesc_kern(struct vringh *vrh, | ||
202 | struct vringh_kiov *riov, | ||
203 | struct vringh_kiov *wiov, | ||
204 | u16 *head, | ||
205 | gfp_t gfp); | ||
206 | |||
207 | ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len); | ||
208 | ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, | ||
209 | const void *src, size_t len); | ||
210 | void vringh_abandon_kern(struct vringh *vrh, unsigned int num); | ||
211 | int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len); | ||
212 | |||
213 | bool vringh_notify_enable_kern(struct vringh *vrh); | ||
214 | void vringh_notify_disable_kern(struct vringh *vrh); | ||
215 | |||
216 | int vringh_need_notify_kern(struct vringh *vrh); | ||
217 | |||
218 | /* Notify the guest about buffers added to the used ring */ | ||
219 | static inline void vringh_notify(struct vringh *vrh) | ||
220 | { | ||
221 | if (vrh->notify) | ||
222 | vrh->notify(vrh); | ||
223 | } | ||
224 | |||
225 | #endif /* _LINUX_VRINGH_H */ | ||
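The kernel-space variants mirror the userspace loop sketched above, substituting kvecs and taking a gfp_t for indirect-descriptor allocation; roughly (placeholder names again):

	/* Sketch: the same service pattern against an in-kernel vring. */
	struct vringh_kiov riov, wiov;
	u16 head;
	int err;

	vringh_kiov_init(&riov, rkvec, ARRAY_SIZE(rkvec));
	vringh_kiov_init(&wiov, wkvec, ARRAY_SIZE(wkvec));
	err = vringh_getdesc_kern(vrh, &riov, &wiov, &head, GFP_KERNEL);
	if (err <= 0)
		return err;
	/* ... transfer with vringh_iov_pull_kern()/vringh_iov_push_kern() ... */
	vringh_complete_kern(vrh, head, used_len);
	if (vringh_need_notify_kern(vrh) > 0)
		vringh_notify(vrh);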
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 652dc8bea921..5e26f61b5df5 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h | |||
@@ -52,8 +52,8 @@ struct virtio_balloon_config | |||
52 | #define VIRTIO_BALLOON_S_NR 6 | 52 | #define VIRTIO_BALLOON_S_NR 6 |
53 | 53 | ||
54 | struct virtio_balloon_stat { | 54 | struct virtio_balloon_stat { |
55 | u16 tag; | 55 | __u16 tag; |
56 | u64 val; | 56 | __u64 val; |
57 | } __attribute__((packed)); | 57 | } __attribute__((packed)); |
58 | 58 | ||
59 | #endif /* _LINUX_VIRTIO_BALLOON_H */ | 59 | #endif /* _LINUX_VIRTIO_BALLOON_H */ |
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index a7630d04029f..284fc3a05f7b 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h | |||
@@ -38,5 +38,6 @@ | |||
38 | #define VIRTIO_ID_SCSI 8 /* virtio scsi */ | 38 | #define VIRTIO_ID_SCSI 8 /* virtio scsi */ |
39 | #define VIRTIO_ID_9P 9 /* 9p virtio console */ | 39 | #define VIRTIO_ID_9P 9 /* 9p virtio console */ |
40 | #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ | 40 | #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ |
41 | #define VIRTIO_ID_CAIF 12 /* Virtio caif */ | ||
41 | 42 | ||
42 | #endif /* _LINUX_VIRTIO_IDS_H */ | 43 | #endif /* _LINUX_VIRTIO_IDS_H */ |
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index de2e950a0a7a..e1c26b101830 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c | |||
@@ -194,11 +194,14 @@ static int pack_sg_list(struct scatterlist *sg, int start, | |||
194 | if (s > count) | 194 | if (s > count) |
195 | s = count; | 195 | s = count; |
196 | BUG_ON(index > limit); | 196 | BUG_ON(index > limit); |
197 | /* Make sure we don't terminate early. */ | ||
198 | sg_unmark_end(&sg[index]); | ||
197 | sg_set_buf(&sg[index++], data, s); | 199 | sg_set_buf(&sg[index++], data, s); |
198 | count -= s; | 200 | count -= s; |
199 | data += s; | 201 | data += s; |
200 | } | 202 | } |
201 | 203 | if (index-start) | |
204 | sg_mark_end(&sg[index - 1]); | ||
202 | return index-start; | 205 | return index-start; |
203 | } | 206 | } |
204 | 207 | ||
@@ -236,12 +239,17 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit, | |||
236 | s = rest_of_page(data); | 239 | s = rest_of_page(data); |
237 | if (s > count) | 240 | if (s > count) |
238 | s = count; | 241 | s = count; |
242 | /* Make sure we don't terminate early. */ | ||
243 | sg_unmark_end(&sg[index]); | ||
239 | sg_set_page(&sg[index++], pdata[i++], s, data_off); | 244 | sg_set_page(&sg[index++], pdata[i++], s, data_off); |
240 | data_off = 0; | 245 | data_off = 0; |
241 | data += s; | 246 | data += s; |
242 | count -= s; | 247 | count -= s; |
243 | nr_pages--; | 248 | nr_pages--; |
244 | } | 249 | } |
250 | |||
251 | if (index-start) | ||
252 | sg_mark_end(&sg[index - 1]); | ||
245 | return index - start; | 253 | return index - start; |
246 | } | 254 | } |
247 | 255 | ||
@@ -256,9 +264,10 @@ static int | |||
256 | p9_virtio_request(struct p9_client *client, struct p9_req_t *req) | 264 | p9_virtio_request(struct p9_client *client, struct p9_req_t *req) |
257 | { | 265 | { |
258 | int err; | 266 | int err; |
259 | int in, out; | 267 | int in, out, out_sgs, in_sgs; |
260 | unsigned long flags; | 268 | unsigned long flags; |
261 | struct virtio_chan *chan = client->trans; | 269 | struct virtio_chan *chan = client->trans; |
270 | struct scatterlist *sgs[2]; | ||
262 | 271 | ||
263 | p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); | 272 | p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); |
264 | 273 | ||
@@ -266,14 +275,19 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) | |||
266 | req_retry: | 275 | req_retry: |
267 | spin_lock_irqsave(&chan->lock, flags); | 276 | spin_lock_irqsave(&chan->lock, flags); |
268 | 277 | ||
278 | out_sgs = in_sgs = 0; | ||
269 | /* Handle out VirtIO ring buffers */ | 279 | /* Handle out VirtIO ring buffers */ |
270 | out = pack_sg_list(chan->sg, 0, | 280 | out = pack_sg_list(chan->sg, 0, |
271 | VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); | 281 | VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); |
282 | if (out) | ||
283 | sgs[out_sgs++] = chan->sg; | ||
272 | 284 | ||
273 | in = pack_sg_list(chan->sg, out, | 285 | in = pack_sg_list(chan->sg, out, |
274 | VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); | 286 | VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); |
287 | if (in) | ||
288 | sgs[out_sgs + in_sgs++] = chan->sg + out; | ||
275 | 289 | ||
276 | err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, | 290 | err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc, |
277 | GFP_ATOMIC); | 291 | GFP_ATOMIC); |
278 | if (err < 0) { | 292 | if (err < 0) { |
279 | if (err == -ENOSPC) { | 293 | if (err == -ENOSPC) { |
@@ -289,7 +303,7 @@ req_retry: | |||
289 | } else { | 303 | } else { |
290 | spin_unlock_irqrestore(&chan->lock, flags); | 304 | spin_unlock_irqrestore(&chan->lock, flags); |
291 | p9_debug(P9_DEBUG_TRANS, | 305 | p9_debug(P9_DEBUG_TRANS, |
292 | "virtio rpc add_buf returned failure\n"); | 306 | "virtio rpc add_sgs returned failure\n"); |
293 | return -EIO; | 307 | return -EIO; |
294 | } | 308 | } |
295 | } | 309 | } |
@@ -351,11 +365,12 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, | |||
351 | char *uidata, char *uodata, int inlen, | 365 | char *uidata, char *uodata, int inlen, |
352 | int outlen, int in_hdr_len, int kern_buf) | 366 | int outlen, int in_hdr_len, int kern_buf) |
353 | { | 367 | { |
354 | int in, out, err; | 368 | int in, out, err, out_sgs, in_sgs; |
355 | unsigned long flags; | 369 | unsigned long flags; |
356 | int in_nr_pages = 0, out_nr_pages = 0; | 370 | int in_nr_pages = 0, out_nr_pages = 0; |
357 | struct page **in_pages = NULL, **out_pages = NULL; | 371 | struct page **in_pages = NULL, **out_pages = NULL; |
358 | struct virtio_chan *chan = client->trans; | 372 | struct virtio_chan *chan = client->trans; |
373 | struct scatterlist *sgs[4]; | ||
359 | 374 | ||
360 | p9_debug(P9_DEBUG_TRANS, "virtio request\n"); | 375 | p9_debug(P9_DEBUG_TRANS, "virtio request\n"); |
361 | 376 | ||
@@ -396,13 +411,22 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, | |||
396 | req->status = REQ_STATUS_SENT; | 411 | req->status = REQ_STATUS_SENT; |
397 | req_retry_pinned: | 412 | req_retry_pinned: |
398 | spin_lock_irqsave(&chan->lock, flags); | 413 | spin_lock_irqsave(&chan->lock, flags); |
414 | |||
415 | out_sgs = in_sgs = 0; | ||
416 | |||
399 | /* out data */ | 417 | /* out data */ |
400 | out = pack_sg_list(chan->sg, 0, | 418 | out = pack_sg_list(chan->sg, 0, |
401 | VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); | 419 | VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); |
402 | 420 | ||
403 | if (out_pages) | 421 | if (out) |
422 | sgs[out_sgs++] = chan->sg; | ||
423 | |||
424 | if (out_pages) { | ||
425 | sgs[out_sgs++] = chan->sg + out; | ||
404 | out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, | 426 | out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, |
405 | out_pages, out_nr_pages, uodata, outlen); | 427 | out_pages, out_nr_pages, uodata, outlen); |
428 | } | ||
429 | |||
406 | /* | 430 | /* |
407 | * Take care of in data | 431 | * Take care of in data |
408 | * For example TREAD have 11. | 432 | * For example TREAD have 11. |
@@ -412,11 +436,17 @@ req_retry_pinned: | |||
412 | */ | 436 | */ |
413 | in = pack_sg_list(chan->sg, out, | 437 | in = pack_sg_list(chan->sg, out, |
414 | VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); | 438 | VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); |
415 | if (in_pages) | 439 | if (in) |
440 | sgs[out_sgs + in_sgs++] = chan->sg + out; | ||
441 | |||
442 | if (in_pages) { | ||
443 | sgs[out_sgs + in_sgs++] = chan->sg + out + in; | ||
416 | in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, | 444 | in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, |
417 | in_pages, in_nr_pages, uidata, inlen); | 445 | in_pages, in_nr_pages, uidata, inlen); |
446 | } | ||
418 | 447 | ||
419 | err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, | 448 | BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs)); |
449 | err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc, | ||
420 | GFP_ATOMIC); | 450 | GFP_ATOMIC); |
421 | if (err < 0) { | 451 | if (err < 0) { |
422 | if (err == -ENOSPC) { | 452 | if (err == -ENOSPC) { |
@@ -432,7 +462,7 @@ req_retry_pinned: | |||
432 | } else { | 462 | } else { |
433 | spin_unlock_irqrestore(&chan->lock, flags); | 463 | spin_unlock_irqrestore(&chan->lock, flags); |
434 | p9_debug(P9_DEBUG_TRANS, | 464 | p9_debug(P9_DEBUG_TRANS, |
435 | "virtio rpc add_buf returned failure\n"); | 465 | "virtio rpc add_sgs returned failure\n"); |
436 | err = -EIO; | 466 | err = -EIO; |
437 | goto err_out; | 467 | goto err_out; |
438 | } | 468 | } |
diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt index 7203ace65e83..06e1f4649511 100644 --- a/tools/lguest/lguest.txt +++ b/tools/lguest/lguest.txt | |||
@@ -70,7 +70,7 @@ Running Lguest: | |||
70 | 70 | ||
71 | - Run an lguest as root: | 71 | - Run an lguest as root: |
72 | 72 | ||
73 | Documentation/virtual/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ | 73 | tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ |
74 | --block=rootfile root=/dev/vda | 74 | --block=rootfile root=/dev/vda |
75 | 75 | ||
76 | Explanation: | 76 | Explanation: |
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile index d1d442ed106a..3187c62d9814 100644 --- a/tools/virtio/Makefile +++ b/tools/virtio/Makefile | |||
@@ -1,12 +1,14 @@ | |||
1 | all: test mod | 1 | all: test mod |
2 | test: virtio_test | 2 | test: virtio_test vringh_test |
3 | virtio_test: virtio_ring.o virtio_test.o | 3 | virtio_test: virtio_ring.o virtio_test.o |
4 | CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -MMD | 4 | vringh_test: vringh_test.o vringh.o virtio_ring.o |
5 | vpath %.c ../../drivers/virtio | 5 | |
6 | CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE | ||
7 | vpath %.c ../../drivers/virtio ../../drivers/vhost | ||
6 | mod: | 8 | mod: |
7 | ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test | 9 | ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test |
8 | .PHONY: all test mod clean | 10 | .PHONY: all test mod clean |
9 | clean: | 11 | clean: |
10 | ${RM} *.o vhost_test/*.o vhost_test/.*.cmd \ | 12 | ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \ |
11 | vhost_test/Module.symvers vhost_test/modules.order *.d | 13 | vhost_test/Module.symvers vhost_test/modules.order *.d |
12 | -include *.d | 14 | -include *.d |
diff --git a/tools/virtio/asm/barrier.h b/tools/virtio/asm/barrier.h new file mode 100644 index 000000000000..aff61e13306c --- /dev/null +++ b/tools/virtio/asm/barrier.h | |||
@@ -0,0 +1,14 @@ | |||
1 | #if defined(__i386__) || defined(__x86_64__) | ||
2 | #define barrier() asm volatile("" ::: "memory") | ||
3 | #define mb() __sync_synchronize() | ||
4 | |||
5 | #define smp_mb() mb() | ||
6 | # define smp_rmb() barrier() | ||
7 | # define smp_wmb() barrier() | ||
8 | /* Weak barriers should be used. If not, it's a bug. */ | ||
9 | # define rmb() abort() | ||
10 | # define wmb() abort() | ||
11 | #else | ||
12 | #error Please fill in barrier macros | ||
13 | #endif | ||
14 | |||
diff --git a/tools/virtio/linux/bug.h b/tools/virtio/linux/bug.h new file mode 100644 index 000000000000..fb94f0787c47 --- /dev/null +++ b/tools/virtio/linux/bug.h | |||
@@ -0,0 +1,10 @@ | |||
1 | #ifndef BUG_H | ||
2 | #define BUG_H | ||
3 | |||
4 | #define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) | ||
5 | |||
6 | #define BUILD_BUG_ON(x) | ||
7 | |||
8 | #define BUG() abort() | ||
9 | |||
10 | #endif /* BUG_H */ | ||
diff --git a/tools/virtio/linux/err.h b/tools/virtio/linux/err.h new file mode 100644 index 000000000000..e32eff8b2a14 --- /dev/null +++ b/tools/virtio/linux/err.h | |||
@@ -0,0 +1,26 @@ | |||
1 | #ifndef ERR_H | ||
2 | #define ERR_H | ||
3 | #define MAX_ERRNO 4095 | ||
4 | |||
5 | #define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) | ||
6 | |||
7 | static inline void * __must_check ERR_PTR(long error) | ||
8 | { | ||
9 | return (void *) error; | ||
10 | } | ||
11 | |||
12 | static inline long __must_check PTR_ERR(const void *ptr) | ||
13 | { | ||
14 | return (long) ptr; | ||
15 | } | ||
16 | |||
17 | static inline long __must_check IS_ERR(const void *ptr) | ||
18 | { | ||
19 | return IS_ERR_VALUE((unsigned long)ptr); | ||
20 | } | ||
21 | |||
22 | static inline long __must_check IS_ERR_OR_NULL(const void *ptr) | ||
23 | { | ||
24 | return !ptr || IS_ERR_VALUE((unsigned long)ptr); | ||
25 | } | ||
26 | #endif /* ERR_H */ | ||
diff --git a/tools/virtio/linux/export.h b/tools/virtio/linux/export.h new file mode 100644 index 000000000000..7311d326894a --- /dev/null +++ b/tools/virtio/linux/export.h | |||
@@ -0,0 +1,5 @@ | |||
1 | #define EXPORT_SYMBOL(sym) | ||
2 | #define EXPORT_SYMBOL_GPL(sym) | ||
3 | #define EXPORT_SYMBOL_GPL_FUTURE(sym) | ||
4 | #define EXPORT_UNUSED_SYMBOL(sym) | ||
5 | #define EXPORT_UNUSED_SYMBOL_GPL(sym) | ||
diff --git a/tools/virtio/linux/irqreturn.h b/tools/virtio/linux/irqreturn.h new file mode 100644 index 000000000000..a3c4e7be7089 --- /dev/null +++ b/tools/virtio/linux/irqreturn.h | |||
@@ -0,0 +1 @@ | |||
#include "../../../include/linux/irqreturn.h" | |||
diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h new file mode 100644 index 000000000000..fba705963968 --- /dev/null +++ b/tools/virtio/linux/kernel.h | |||
@@ -0,0 +1,112 @@ | |||
1 | #ifndef KERNEL_H | ||
2 | #define KERNEL_H | ||
3 | #include <stdbool.h> | ||
4 | #include <stdlib.h> | ||
5 | #include <stddef.h> | ||
6 | #include <stdio.h> | ||
7 | #include <string.h> | ||
8 | #include <assert.h> | ||
9 | #include <stdarg.h> | ||
10 | |||
11 | #include <linux/types.h> | ||
12 | #include <linux/printk.h> | ||
13 | #include <linux/bug.h> | ||
14 | #include <errno.h> | ||
15 | #include <unistd.h> | ||
16 | #include <asm/barrier.h> | ||
17 | |||
18 | #define CONFIG_SMP | ||
19 | |||
20 | #define PAGE_SIZE getpagesize() | ||
21 | #define PAGE_MASK (~(PAGE_SIZE-1)) | ||
22 | |||
23 | typedef unsigned long long dma_addr_t; | ||
24 | typedef size_t __kernel_size_t; | ||
25 | |||
26 | struct page { | ||
27 | unsigned long long dummy; | ||
28 | }; | ||
29 | |||
30 | /* Physical == Virtual */ | ||
31 | #define virt_to_phys(p) ((unsigned long)p) | ||
32 | #define phys_to_virt(a) ((void *)(unsigned long)(a)) | ||
33 | /* Page address: Virtual / 4K */ | ||
34 | #define page_to_phys(p) ((dma_addr_t)(unsigned long)(p)) | ||
35 | #define virt_to_page(p) ((struct page *)((unsigned long)p & PAGE_MASK)) | ||
36 | |||
37 | #define offset_in_page(p) (((unsigned long)p) % PAGE_SIZE) | ||
38 | |||
39 | #define __printf(a,b) __attribute__((format(printf,a,b))) | ||
40 | |||
41 | typedef enum { | ||
42 | GFP_KERNEL, | ||
43 | GFP_ATOMIC, | ||
44 | __GFP_HIGHMEM, | ||
45 | __GFP_HIGH | ||
46 | } gfp_t; | ||
47 | |||
48 | #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) | ||
49 | |||
50 | extern void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; | ||
51 | static inline void *kmalloc(size_t s, gfp_t gfp) | ||
52 | { | ||
53 | if (__kmalloc_fake) | ||
54 | return __kmalloc_fake; | ||
55 | return malloc(s); | ||
56 | } | ||
57 | |||
58 | static inline void kfree(void *p) | ||
59 | { | ||
60 | if (p >= __kfree_ignore_start && p < __kfree_ignore_end) | ||
61 | return; | ||
62 | free(p); | ||
63 | } | ||
64 | |||
65 | static inline void *krealloc(void *p, size_t s, gfp_t gfp) | ||
66 | { | ||
67 | return realloc(p, s); | ||
68 | } | ||
69 | |||
70 | |||
71 | static inline unsigned long __get_free_page(gfp_t gfp) | ||
72 | { | ||
73 | void *p; | ||
74 | |||
75 | posix_memalign(&p, PAGE_SIZE, PAGE_SIZE); | ||
76 | return (unsigned long)p; | ||
77 | } | ||
78 | |||
79 | static inline void free_page(unsigned long addr) | ||
80 | { | ||
81 | free((void *)addr); | ||
82 | } | ||
83 | |||
84 | #define container_of(ptr, type, member) ({ \ | ||
85 | const typeof( ((type *)0)->member ) *__mptr = (ptr); \ | ||
86 | (type *)( (char *)__mptr - offsetof(type,member) );}) | ||
87 | |||
88 | #define uninitialized_var(x) x = x | ||
89 | |||
90 | # ifndef likely | ||
91 | # define likely(x) (__builtin_expect(!!(x), 1)) | ||
92 | # endif | ||
93 | # ifndef unlikely | ||
94 | # define unlikely(x) (__builtin_expect(!!(x), 0)) | ||
95 | # endif | ||
96 | |||
97 | #define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) | ||
98 | #ifdef DEBUG | ||
99 | #define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__) | ||
100 | #else | ||
101 | #define pr_debug(format, ...) do {} while (0) | ||
102 | #endif | ||
103 | #define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) | ||
104 | #define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) | ||
105 | |||
106 | #define min(x, y) ({ \ | ||
107 | typeof(x) _min1 = (x); \ | ||
108 | typeof(y) _min2 = (y); \ | ||
109 | (void) (&_min1 == &_min2); \ | ||
110 | _min1 < _min2 ? _min1 : _min2; }) | ||
111 | |||
112 | #endif /* KERNEL_H */ | ||
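The __kmalloc_fake and __kfree_ignore_* hooks let a test steer allocations made inside the code under test; a hypothetical use (the function under test is a placeholder):

	/* Sketch: make the next kmalloc() return a caller-owned buffer
	 * and swallow the matching kfree() (test-only, hypothetical). */
	static char fake[4096];

	__kmalloc_fake = fake;
	__kfree_ignore_start = fake;
	__kfree_ignore_end = fake + sizeof(fake);
	exercise_code_under_test();		/* placeholder */
	__kmalloc_fake = NULL;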
diff --git a/tools/virtio/linux/module.h b/tools/virtio/linux/module.h index e69de29bb2d1..3039a7e972b6 100644 --- a/tools/virtio/linux/module.h +++ b/tools/virtio/linux/module.h | |||
@@ -0,0 +1 @@ | |||
#include <linux/export.h> | |||
diff --git a/tools/virtio/linux/printk.h b/tools/virtio/linux/printk.h new file mode 100644 index 000000000000..9f2423bd89c2 --- /dev/null +++ b/tools/virtio/linux/printk.h | |||
@@ -0,0 +1,4 @@ | |||
1 | #include "../../../include/linux/kern_levels.h" | ||
2 | |||
3 | #define printk printf | ||
4 | #define vprintk vprintf | ||
diff --git a/tools/virtio/linux/ratelimit.h b/tools/virtio/linux/ratelimit.h new file mode 100644 index 000000000000..dcce1725f90d --- /dev/null +++ b/tools/virtio/linux/ratelimit.h | |||
@@ -0,0 +1,4 @@ | |||
1 | #define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) int name = 0 | ||
2 | |||
3 | #define __ratelimit(x) (*(x)) | ||
4 | |||
diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h new file mode 100644 index 000000000000..68c9e2adc996 --- /dev/null +++ b/tools/virtio/linux/scatterlist.h | |||
@@ -0,0 +1,189 @@ | |||
1 | #ifndef SCATTERLIST_H | ||
2 | #define SCATTERLIST_H | ||
3 | #include <linux/kernel.h> | ||
4 | |||
5 | struct scatterlist { | ||
6 | unsigned long page_link; | ||
7 | unsigned int offset; | ||
8 | unsigned int length; | ||
9 | dma_addr_t dma_address; | ||
10 | }; | ||
11 | |||
12 | /* Scatterlist helpers, stolen from linux/scatterlist.h */ | ||
13 | #define sg_is_chain(sg) ((sg)->page_link & 0x01) | ||
14 | #define sg_is_last(sg) ((sg)->page_link & 0x02) | ||
15 | #define sg_chain_ptr(sg) \ | ||
16 | ((struct scatterlist *) ((sg)->page_link & ~0x03)) | ||
17 | |||
18 | /** | ||
19 | * sg_assign_page - Assign a given page to an SG entry | ||
20 | * @sg: SG entry | ||
21 | * @page: The page | ||
22 | * | ||
23 | * Description: | ||
24 | * Assign page to sg entry. Also see sg_set_page(), the most commonly used | ||
25 | * variant. | ||
26 | * | ||
27 | **/ | ||
28 | static inline void sg_assign_page(struct scatterlist *sg, struct page *page) | ||
29 | { | ||
30 | unsigned long page_link = sg->page_link & 0x3; | ||
31 | |||
32 | /* | ||
33 | * In order for the low bit stealing approach to work, pages | ||
34 | * must be aligned at a 32-bit boundary as a minimum. | ||
35 | */ | ||
36 | BUG_ON((unsigned long) page & 0x03); | ||
37 | #ifdef CONFIG_DEBUG_SG | ||
38 | BUG_ON(sg->sg_magic != SG_MAGIC); | ||
39 | BUG_ON(sg_is_chain(sg)); | ||
40 | #endif | ||
41 | sg->page_link = page_link | (unsigned long) page; | ||
42 | } | ||
43 | |||
44 | /** | ||
45 | * sg_set_page - Set sg entry to point at given page | ||
46 | * @sg: SG entry | ||
47 | * @page: The page | ||
48 | * @len: Length of data | ||
49 | * @offset: Offset into page | ||
50 | * | ||
51 | * Description: | ||
52 | * Use this function to set an sg entry pointing at a page, never assign | ||
53 | * the page directly. We encode sg table information in the lower bits | ||
54 | * of the page pointer. See sg_page() for looking up the page belonging | ||
55 | * to an sg entry. | ||
56 | * | ||
57 | **/ | ||
58 | static inline void sg_set_page(struct scatterlist *sg, struct page *page, | ||
59 | unsigned int len, unsigned int offset) | ||
60 | { | ||
61 | sg_assign_page(sg, page); | ||
62 | sg->offset = offset; | ||
63 | sg->length = len; | ||
64 | } | ||
65 | |||
66 | static inline struct page *sg_page(struct scatterlist *sg) | ||
67 | { | ||
68 | #ifdef CONFIG_DEBUG_SG | ||
69 | BUG_ON(sg->sg_magic != SG_MAGIC); | ||
70 | BUG_ON(sg_is_chain(sg)); | ||
71 | #endif | ||
72 | return (struct page *)((sg)->page_link & ~0x3); | ||
73 | } | ||
74 | |||
75 | /* | ||
76 | * Loop over each sg element, following the pointer to a new list if necessary | ||
77 | */ | ||
78 | #define for_each_sg(sglist, sg, nr, __i) \ | ||
79 | for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) | ||
80 | |||
81 | /** | ||
82 | * sg_chain - Chain two sglists together | ||
83 | * @prv: First scatterlist | ||
84 | * @prv_nents: Number of entries in prv | ||
85 | * @sgl: Second scatterlist | ||
86 | * | ||
87 | * Description: | ||
88 | * Links @prv and @sgl together, to form a longer scatterlist. | ||
89 | * | ||
90 | **/ | ||
91 | static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, | ||
92 | struct scatterlist *sgl) | ||
93 | { | ||
94 | /* | ||
95 | * offset and length are unused for chain entry. Clear them. | ||
96 | */ | ||
97 | prv[prv_nents - 1].offset = 0; | ||
98 | prv[prv_nents - 1].length = 0; | ||
99 | |||
100 | /* | ||
101 | * Set lowest bit to indicate a link pointer, and make sure to clear | ||
102 | * the termination bit if it happens to be set. | ||
103 | */ | ||
104 | prv[prv_nents - 1].page_link = ((unsigned long) sgl | 0x01) & ~0x02; | ||
105 | } | ||
106 | |||
107 | /** | ||
108 | * sg_mark_end - Mark the end of the scatterlist | ||
109 | * @sg: SG entry | ||
110 | * | ||
111 | * Description: | ||
112 | * Marks the passed in sg entry as the termination point for the sg | ||
113 | * table. A call to sg_next() on this entry will return NULL. | ||
114 | * | ||
115 | **/ | ||
116 | static inline void sg_mark_end(struct scatterlist *sg) | ||
117 | { | ||
118 | #ifdef CONFIG_DEBUG_SG | ||
119 | BUG_ON(sg->sg_magic != SG_MAGIC); | ||
120 | #endif | ||
121 | /* | ||
122 | * Set termination bit, clear potential chain bit | ||
123 | */ | ||
124 | sg->page_link |= 0x02; | ||
125 | sg->page_link &= ~0x01; | ||
126 | } | ||
127 | |||
128 | /** | ||
129 | * sg_unmark_end - Undo setting the end of the scatterlist | ||
130 | * @sg: SG entry | ||
131 | * | ||
132 | * Description: | ||
133 | * Removes the termination marker from the given entry of the scatterlist. | ||
134 | * | ||
135 | **/ | ||
136 | static inline void sg_unmark_end(struct scatterlist *sg) | ||
137 | { | ||
138 | #ifdef CONFIG_DEBUG_SG | ||
139 | BUG_ON(sg->sg_magic != SG_MAGIC); | ||
140 | #endif | ||
141 | sg->page_link &= ~0x02; | ||
142 | } | ||
143 | |||
144 | static inline struct scatterlist *sg_next(struct scatterlist *sg) | ||
145 | { | ||
146 | #ifdef CONFIG_DEBUG_SG | ||
147 | BUG_ON(sg->sg_magic != SG_MAGIC); | ||
148 | #endif | ||
149 | if (sg_is_last(sg)) | ||
150 | return NULL; | ||
151 | |||
152 | sg++; | ||
153 | if (unlikely(sg_is_chain(sg))) | ||
154 | sg = sg_chain_ptr(sg); | ||
155 | |||
156 | return sg; | ||
157 | } | ||
158 | |||
159 | static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents) | ||
160 | { | ||
161 | memset(sgl, 0, sizeof(*sgl) * nents); | ||
162 | #ifdef CONFIG_DEBUG_SG | ||
163 | { | ||
164 | unsigned int i; | ||
165 | for (i = 0; i < nents; i++) | ||
166 | sgl[i].sg_magic = SG_MAGIC; | ||
167 | } | ||
168 | #endif | ||
169 | sg_mark_end(&sgl[nents - 1]); | ||
170 | } | ||
171 | |||
172 | static inline dma_addr_t sg_phys(struct scatterlist *sg) | ||
173 | { | ||
174 | return page_to_phys(sg_page(sg)) + sg->offset; | ||
175 | } | ||
176 | |||
177 | static inline void sg_set_buf(struct scatterlist *sg, const void *buf, | ||
178 | unsigned int buflen) | ||
179 | { | ||
180 | sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); | ||
181 | } | ||
182 | |||
183 | static inline void sg_init_one(struct scatterlist *sg, | ||
184 | const void *buf, unsigned int buflen) | ||
185 | { | ||
186 | sg_init_table(sg, 1); | ||
187 | sg_set_buf(sg, buf, buflen); | ||
188 | } | ||
189 | #endif /* SCATTERLIST_H */ | ||
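Chaining and iteration behave the same as in the kernel proper; a small sketch (the buffer and length arrays are illustrative):

	/* Sketch: chain two tables and walk them as one list. */
	struct scatterlist a[2], b[2], *sg;
	unsigned int i;

	sg_init_table(a, 2);
	sg_init_table(b, 2);
	sg_chain(a, 2, b);		/* a[1] becomes a link entry */

	for_each_sg(a, sg, 3, i)	/* visits a[0], b[0], b[1] */
		sg_set_buf(sg, bufs[i], lens[i]);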
diff --git a/tools/virtio/linux/types.h b/tools/virtio/linux/types.h new file mode 100644 index 000000000000..f8ebb9a2b3d6 --- /dev/null +++ b/tools/virtio/linux/types.h | |||
@@ -0,0 +1,28 @@ | |||
1 | #ifndef TYPES_H | ||
2 | #define TYPES_H | ||
3 | #include <stdint.h> | ||
4 | |||
5 | #define __force | ||
6 | #define __user | ||
7 | #define __must_check | ||
8 | #define __cold | ||
9 | |||
10 | typedef uint64_t u64; | ||
11 | typedef int64_t s64; | ||
12 | typedef uint32_t u32; | ||
13 | typedef int32_t s32; | ||
14 | typedef uint16_t u16; | ||
15 | typedef int16_t s16; | ||
16 | typedef uint8_t u8; | ||
17 | typedef int8_t s8; | ||
18 | |||
19 | typedef uint64_t __u64; | ||
20 | typedef int64_t __s64; | ||
21 | typedef uint32_t __u32; | ||
22 | typedef int32_t __s32; | ||
23 | typedef uint16_t __u16; | ||
24 | typedef int16_t __s16; | ||
25 | typedef uint8_t __u8; | ||
26 | typedef int8_t __s8; | ||
27 | |||
28 | #endif /* TYPES_H */ | ||
diff --git a/tools/virtio/linux/uaccess.h b/tools/virtio/linux/uaccess.h new file mode 100644 index 000000000000..0a578fe18653 --- /dev/null +++ b/tools/virtio/linux/uaccess.h | |||
@@ -0,0 +1,50 @@ | |||
1 | #ifndef UACCESS_H | ||
2 | #define UACCESS_H | ||
3 | extern void *__user_addr_min, *__user_addr_max; | ||
4 | |||
5 | #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) | ||
6 | |||
7 | static inline void __chk_user_ptr(const volatile void *p, size_t size) | ||
8 | { | ||
9 | assert(p >= __user_addr_min && p + size <= __user_addr_max); | ||
10 | } | ||
11 | |||
12 | #define put_user(x, ptr) \ | ||
13 | ({ \ | ||
14 | typeof(ptr) __pu_ptr = (ptr); \ | ||
15 | __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \ | ||
16 | ACCESS_ONCE(*(__pu_ptr)) = x; \ | ||
17 | 0; \ | ||
18 | }) | ||
19 | |||
20 | #define get_user(x, ptr) \ | ||
21 | ({ \ | ||
22 | typeof(ptr) __pu_ptr = (ptr); \ | ||
23 | __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \ | ||
24 | x = ACCESS_ONCE(*(__pu_ptr)); \ | ||
25 | 0; \ | ||
26 | }) | ||
27 | |||
28 | static void volatile_memcpy(volatile char *to, const volatile char *from, | ||
29 | unsigned long n) | ||
30 | { | ||
31 | while (n--) | ||
32 | *(to++) = *(from++); | ||
33 | } | ||
34 | |||
35 | static inline int copy_from_user(void *to, const void __user volatile *from, | ||
36 | unsigned long n) | ||
37 | { | ||
38 | __chk_user_ptr(from, n); | ||
39 | volatile_memcpy(to, from, n); | ||
40 | return 0; | ||
41 | } | ||
42 | |||
43 | static inline int copy_to_user(void __user volatile *to, const void *from, | ||
44 | unsigned long n) | ||
45 | { | ||
46 | __chk_user_ptr(to, n); | ||
47 | volatile_memcpy(to, from, n); | ||
48 | return 0; | ||
49 | } | ||
50 | #endif /* UACCESS_H */ | ||
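These accessors assert that every pointer falls inside [__user_addr_min, __user_addr_max), which is how the test harness catches stray "guest memory" accesses; a hypothetical snippet (guest_mem, avail and used are assumptions):

	/* Sketch: bound the "guest" region, then use the checked accessors. */
	u16 idx;

	__user_addr_min = guest_mem;
	__user_addr_max = guest_mem + guest_len;
	get_user(idx, &avail->idx);	/* asserts &avail->idx is in range */
	put_user(idx, &used->idx);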
diff --git a/tools/virtio/linux/uio.h b/tools/virtio/linux/uio.h new file mode 100644 index 000000000000..cd20f0ba3081 --- /dev/null +++ b/tools/virtio/linux/uio.h | |||
@@ -0,0 +1,3 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | |||
3 | #include "../../../include/linux/uio.h" | ||
diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index 81847dd08bd0..cd801838156f 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h | |||
@@ -1,127 +1,7 @@ | |||
1 | #ifndef LINUX_VIRTIO_H | 1 | #ifndef LINUX_VIRTIO_H |
2 | #define LINUX_VIRTIO_H | 2 | #define LINUX_VIRTIO_H |
3 | 3 | #include <linux/scatterlist.h> | |
4 | #include <stdbool.h> | 4 | #include <linux/kernel.h> |
5 | #include <stdlib.h> | ||
6 | #include <stddef.h> | ||
7 | #include <stdio.h> | ||
8 | #include <string.h> | ||
9 | #include <assert.h> | ||
10 | |||
11 | #include <linux/types.h> | ||
12 | #include <errno.h> | ||
13 | |||
14 | typedef unsigned long long dma_addr_t; | ||
15 | |||
16 | struct scatterlist { | ||
17 | unsigned long page_link; | ||
18 | unsigned int offset; | ||
19 | unsigned int length; | ||
20 | dma_addr_t dma_address; | ||
21 | }; | ||
22 | |||
23 | struct page { | ||
24 | unsigned long long dummy; | ||
25 | }; | ||
26 | |||
27 | #define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) | ||
28 | |||
29 | /* Physical == Virtual */ | ||
30 | #define virt_to_phys(p) ((unsigned long)p) | ||
31 | #define phys_to_virt(a) ((void *)(unsigned long)(a)) | ||
32 | /* Page address: Virtual / 4K */ | ||
33 | #define virt_to_page(p) ((struct page*)((virt_to_phys(p) / 4096) * \ | ||
34 | sizeof(struct page))) | ||
35 | #define offset_in_page(p) (((unsigned long)p) % 4096) | ||
36 | #define sg_phys(sg) ((sg->page_link & ~0x3) / sizeof(struct page) * 4096 + \ | ||
37 | sg->offset) | ||
38 | static inline void sg_mark_end(struct scatterlist *sg) | ||
39 | { | ||
40 | /* | ||
41 | * Set termination bit, clear potential chain bit | ||
42 | */ | ||
43 | sg->page_link |= 0x02; | ||
44 | sg->page_link &= ~0x01; | ||
45 | } | ||
46 | static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents) | ||
47 | { | ||
48 | memset(sgl, 0, sizeof(*sgl) * nents); | ||
49 | sg_mark_end(&sgl[nents - 1]); | ||
50 | } | ||
51 | static inline void sg_assign_page(struct scatterlist *sg, struct page *page) | ||
52 | { | ||
53 | unsigned long page_link = sg->page_link & 0x3; | ||
54 | |||
55 | /* | ||
56 | * In order for the low bit stealing approach to work, pages | ||
57 | * must be aligned at a 32-bit boundary as a minimum. | ||
58 | */ | ||
59 | BUG_ON((unsigned long) page & 0x03); | ||
60 | sg->page_link = page_link | (unsigned long) page; | ||
61 | } | ||
62 | |||
63 | static inline void sg_set_page(struct scatterlist *sg, struct page *page, | ||
64 | unsigned int len, unsigned int offset) | ||
65 | { | ||
66 | sg_assign_page(sg, page); | ||
67 | sg->offset = offset; | ||
68 | sg->length = len; | ||
69 | } | ||
70 | |||
71 | static inline void sg_set_buf(struct scatterlist *sg, const void *buf, | ||
72 | unsigned int buflen) | ||
73 | { | ||
74 | sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); | ||
75 | } | ||
76 | |||
77 | static inline void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen) | ||
78 | { | ||
79 | sg_init_table(sg, 1); | ||
80 | sg_set_buf(sg, buf, buflen); | ||
81 | } | ||
82 | |||
83 | typedef __u16 u16; | ||
84 | |||
85 | typedef enum { | ||
86 | GFP_KERNEL, | ||
87 | GFP_ATOMIC, | ||
88 | } gfp_t; | ||
89 | typedef enum { | ||
90 | IRQ_NONE, | ||
91 | IRQ_HANDLED | ||
92 | } irqreturn_t; | ||
93 | |||
94 | static inline void *kmalloc(size_t s, gfp_t gfp) | ||
95 | { | ||
96 | return malloc(s); | ||
97 | } | ||
98 | |||
99 | static inline void kfree(void *p) | ||
100 | { | ||
101 | free(p); | ||
102 | } | ||
103 | |||
104 | #define container_of(ptr, type, member) ({ \ | ||
105 | const typeof( ((type *)0)->member ) *__mptr = (ptr); \ | ||
106 | (type *)( (char *)__mptr - offsetof(type,member) );}) | ||
107 | |||
108 | #define uninitialized_var(x) x = x | ||
109 | |||
110 | # ifndef likely | ||
111 | # define likely(x) (__builtin_expect(!!(x), 1)) | ||
112 | # endif | ||
113 | # ifndef unlikely | ||
114 | # define unlikely(x) (__builtin_expect(!!(x), 0)) | ||
115 | # endif | ||
116 | |||
117 | #define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) | ||
118 | #ifdef DEBUG | ||
119 | #define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__) | ||
120 | #else | ||
121 | #define pr_debug(format, ...) do {} while (0) | ||
122 | #endif | ||
123 | #define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) | ||
124 | #define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) | ||
125 | 5 | ||
126 | /* TODO: empty stubs for now. Broken but enough for virtio_ring.c */ | 6 | /* TODO: empty stubs for now. Broken but enough for virtio_ring.c */ |
127 | #define list_add_tail(a, b) do {} while (0) | 7 | #define list_add_tail(a, b) do {} while (0) |
@@ -131,6 +11,7 @@ static inline void kfree(void *p) | |||
131 | #define BITS_PER_BYTE 8 | 11 | #define BITS_PER_BYTE 8 |
132 | #define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE) | 12 | #define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE) |
133 | #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) | 13 | #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) |
14 | |||
134 | /* TODO: Not atomic as it should be: | 15 | /* TODO: Not atomic as it should be: |
135 | * we don't use this for anything important. */ | 16 | * we don't use this for anything important. */ |
136 | static inline void clear_bit(int nr, volatile unsigned long *addr) | 17 | static inline void clear_bit(int nr, volatile unsigned long *addr) |
@@ -145,10 +26,6 @@ static inline int test_bit(int nr, const volatile unsigned long *addr) | |||
145 | { | 26 | { |
146 | return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); | 27 | return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); |
147 | } | 28 | } |
148 | |||
149 | /* The only feature we care to support */ | ||
150 | #define virtio_has_feature(dev, feature) \ | ||
151 | test_bit((feature), (dev)->features) | ||
152 | /* end of stubs */ | 29 | /* end of stubs */ |
153 | 30 | ||
154 | struct virtio_device { | 31 | struct virtio_device { |
@@ -163,39 +40,32 @@ struct virtqueue { | |||
163 | void (*callback)(struct virtqueue *vq); | 40 | void (*callback)(struct virtqueue *vq); |
164 | const char *name; | 41 | const char *name; |
165 | struct virtio_device *vdev; | 42 | struct virtio_device *vdev; |
43 | unsigned int index; | ||
44 | unsigned int num_free; | ||
166 | void *priv; | 45 | void *priv; |
167 | }; | 46 | }; |
168 | 47 | ||
169 | #define EXPORT_SYMBOL_GPL(__EXPORT_SYMBOL_GPL_name) \ | ||
170 | void __EXPORT_SYMBOL_GPL##__EXPORT_SYMBOL_GPL_name() { \ | ||
171 | } | ||
172 | #define MODULE_LICENSE(__MODULE_LICENSE_value) \ | 48 | #define MODULE_LICENSE(__MODULE_LICENSE_value) \ |
173 | const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value | 49 | const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value |
174 | 50 | ||
175 | #define CONFIG_SMP | ||
176 | |||
177 | #if defined(__i386__) || defined(__x86_64__) | ||
178 | #define barrier() asm volatile("" ::: "memory") | ||
179 | #define mb() __sync_synchronize() | ||
180 | |||
181 | #define smp_mb() mb() | ||
182 | # define smp_rmb() barrier() | ||
183 | # define smp_wmb() barrier() | ||
184 | /* Weak barriers should be used. If not - it's a bug */ | ||
185 | # define rmb() abort() | ||
186 | # define wmb() abort() | ||
187 | #else | ||
188 | #error Please fill in barrier macros | ||
189 | #endif | ||
190 | |||
191 | /* Interfaces exported by virtio_ring. */ | 51 | /* Interfaces exported by virtio_ring. */ |
192 | int virtqueue_add_buf(struct virtqueue *vq, | 52 | int virtqueue_add_sgs(struct virtqueue *vq, |
193 | struct scatterlist sg[], | 53 | struct scatterlist *sgs[], |
194 | unsigned int out_num, | 54 | unsigned int out_sgs, |
195 | unsigned int in_num, | 55 | unsigned int in_sgs, |
196 | void *data, | 56 | void *data, |
197 | gfp_t gfp); | 57 | gfp_t gfp); |
198 | 58 | ||
59 | int virtqueue_add_outbuf(struct virtqueue *vq, | ||
60 | struct scatterlist sg[], unsigned int num, | ||
61 | void *data, | ||
62 | gfp_t gfp); | ||
63 | |||
64 | int virtqueue_add_inbuf(struct virtqueue *vq, | ||
65 | struct scatterlist sg[], unsigned int num, | ||
66 | void *data, | ||
67 | gfp_t gfp); | ||
68 | |||
199 | void virtqueue_kick(struct virtqueue *vq); | 69 | void virtqueue_kick(struct virtqueue *vq); |
200 | 70 | ||
201 | void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); | 71 | void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); |
@@ -206,7 +76,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq); | |||
206 | bool virtqueue_enable_cb_delayed(struct virtqueue *vq); | 76 | bool virtqueue_enable_cb_delayed(struct virtqueue *vq); |
207 | 77 | ||
208 | void *virtqueue_detach_unused_buf(struct virtqueue *vq); | 78 | void *virtqueue_detach_unused_buf(struct virtqueue *vq); |
209 | struct virtqueue *vring_new_virtqueue(unsigned int num, | 79 | struct virtqueue *vring_new_virtqueue(unsigned int index, |
80 | unsigned int num, | ||
210 | unsigned int vring_align, | 81 | unsigned int vring_align, |
211 | struct virtio_device *vdev, | 82 | struct virtio_device *vdev, |
212 | bool weak_barriers, | 83 | bool weak_barriers, |
diff --git a/tools/virtio/linux/virtio_config.h b/tools/virtio/linux/virtio_config.h new file mode 100644 index 000000000000..5049967f99f7 --- /dev/null +++ b/tools/virtio/linux/virtio_config.h | |||
@@ -0,0 +1,6 @@ | |||
1 | #define VIRTIO_TRANSPORT_F_START 28 | ||
2 | #define VIRTIO_TRANSPORT_F_END 32 | ||
3 | |||
4 | #define virtio_has_feature(dev, feature) \ | ||
5 | test_bit((feature), (dev)->features) | ||
6 | |||
diff --git a/tools/virtio/linux/virtio_ring.h b/tools/virtio/linux/virtio_ring.h new file mode 100644 index 000000000000..8949c4e2772c --- /dev/null +++ b/tools/virtio/linux/virtio_ring.h | |||
@@ -0,0 +1 @@ | |||
#include "../../../include/linux/virtio_ring.h" | |||
diff --git a/tools/virtio/linux/vringh.h b/tools/virtio/linux/vringh.h new file mode 100644 index 000000000000..9348957be56e --- /dev/null +++ b/tools/virtio/linux/vringh.h | |||
@@ -0,0 +1 @@ | |||
#include "../../../include/linux/vringh.h" | |||
diff --git a/tools/virtio/uapi/linux/uio.h b/tools/virtio/uapi/linux/uio.h new file mode 100644 index 000000000000..7230e9002207 --- /dev/null +++ b/tools/virtio/uapi/linux/uio.h | |||
@@ -0,0 +1 @@ | |||
#include <sys/uio.h> | |||
diff --git a/tools/virtio/uapi/linux/virtio_config.h b/tools/virtio/uapi/linux/virtio_config.h new file mode 100644 index 000000000000..4c86675f0159 --- /dev/null +++ b/tools/virtio/uapi/linux/virtio_config.h | |||
@@ -0,0 +1 @@ | |||
#include "../../../../include/uapi/linux/virtio_config.h" | |||
diff --git a/tools/virtio/uapi/linux/virtio_ring.h b/tools/virtio/uapi/linux/virtio_ring.h new file mode 100644 index 000000000000..4d99c78234d3 --- /dev/null +++ b/tools/virtio/uapi/linux/virtio_ring.h | |||
@@ -0,0 +1,4 @@ | |||
1 | #ifndef VIRTIO_RING_H | ||
2 | #define VIRTIO_RING_H | ||
3 | #include "../../../../include/uapi/linux/virtio_ring.h" | ||
4 | #endif /* VIRTIO_RING_H */ | ||
diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index fcc9aa25fd08..da7a19558281 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c | |||
@@ -10,11 +10,15 @@ | |||
10 | #include <sys/stat.h> | 10 | #include <sys/stat.h> |
11 | #include <sys/types.h> | 11 | #include <sys/types.h> |
12 | #include <fcntl.h> | 12 | #include <fcntl.h> |
13 | #include <stdbool.h> | ||
13 | #include <linux/vhost.h> | 14 | #include <linux/vhost.h> |
14 | #include <linux/virtio.h> | 15 | #include <linux/virtio.h> |
15 | #include <linux/virtio_ring.h> | 16 | #include <linux/virtio_ring.h> |
16 | #include "../../drivers/vhost/test.h" | 17 | #include "../../drivers/vhost/test.h" |
17 | 18 | ||
19 | /* Unused */ | ||
20 | void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; | ||
21 | |||
18 | struct vq_info { | 22 | struct vq_info { |
19 | int kick; | 23 | int kick; |
20 | int call; | 24 | int call; |
@@ -92,7 +96,8 @@ static void vq_info_add(struct vdev_info *dev, int num) | |||
92 | assert(r >= 0); | 96 | assert(r >= 0); |
93 | memset(info->ring, 0, vring_size(num, 4096)); | 97 | memset(info->ring, 0, vring_size(num, 4096)); |
94 | vring_init(&info->vring, num, info->ring, 4096); | 98 | vring_init(&info->vring, num, info->ring, 4096); |
95 | info->vq = vring_new_virtqueue(info->vring.num, 4096, &dev->vdev, | 99 | info->vq = vring_new_virtqueue(info->idx, |
100 | info->vring.num, 4096, &dev->vdev, | ||
96 | true, info->ring, | 101 | true, info->ring, |
97 | vq_notify, vq_callback, "test"); | 102 | vq_notify, vq_callback, "test"); |
98 | assert(info->vq); | 103 | assert(info->vq); |
@@ -161,9 +166,9 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq, | |||
161 | do { | 166 | do { |
162 | if (started < bufs) { | 167 | if (started < bufs) { |
163 | sg_init_one(&sl, dev->buf, dev->buf_size); | 168 | sg_init_one(&sl, dev->buf, dev->buf_size); |
164 | r = virtqueue_add_buf(vq->vq, &sl, 1, 0, | 169 | r = virtqueue_add_outbuf(vq->vq, &sl, 1, |
165 | dev->buf + started, | 170 | dev->buf + started, |
166 | GFP_ATOMIC); | 171 | GFP_ATOMIC); |
167 | if (likely(r == 0)) { | 172 | if (likely(r == 0)) { |
168 | ++started; | 173 | ++started; |
169 | virtqueue_kick(vq->vq); | 174 | virtqueue_kick(vq->vq); |
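The conversion this hunk applies is mechanical: the test only ever queued device-readable buffers (out_num = 1, in_num = 0), so the call collapses to the outbuf helper. The general rule, as a sketch rather than anything stated in the patch:

	/*
	 * virtqueue_add_buf(vq, sg, n, 0, tok, gfp) -> virtqueue_add_outbuf(vq, sg, n, tok, gfp)
	 * virtqueue_add_buf(vq, sg, 0, n, tok, gfp) -> virtqueue_add_inbuf(vq, sg, n, tok, gfp)
	 * mixed out+in entries                      -> virtqueue_add_sgs()
	 */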
diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c new file mode 100644 index 000000000000..d053ea40c001 --- /dev/null +++ b/tools/virtio/vringh_test.c | |||
@@ -0,0 +1,741 @@ | |||
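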
1 | /* Simple test of virtio code, entirely in userspace. */ | ||
2 | #define _GNU_SOURCE | ||
3 | #include <sched.h> | ||
4 | #include <err.h> | ||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/err.h> | ||
7 | #include <linux/virtio.h> | ||
8 | #include <linux/vringh.h> | ||
9 | #include <linux/virtio_ring.h> | ||
10 | #include <linux/uaccess.h> | ||
11 | #include <sys/types.h> | ||
12 | #include <sys/stat.h> | ||
13 | #include <sys/mman.h> | ||
14 | #include <sys/wait.h> | ||
15 | #include <fcntl.h> | ||
16 | |||
17 | #define USER_MEM (1024*1024) | ||
18 | void *__user_addr_min, *__user_addr_max; | ||
19 | void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; | ||
20 | static u64 user_addr_offset; | ||
21 | |||
22 | #define RINGSIZE 256 | ||
23 | #define ALIGN 4096 | ||
24 | |||
25 | static void never_notify_host(struct virtqueue *vq) | ||
26 | { | ||
27 | abort(); | ||
28 | } | ||
29 | |||
30 | static void never_callback_guest(struct virtqueue *vq) | ||
31 | { | ||
32 | abort(); | ||
33 | } | ||
34 | |||
35 | static bool getrange_iov(struct vringh *vrh, u64 addr, struct vringh_range *r) | ||
36 | { | ||
37 | if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset) | ||
38 | return false; | ||
39 | if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset) | ||
40 | return false; | ||
41 | |||
42 | r->start = (u64)(unsigned long)__user_addr_min - user_addr_offset; | ||
43 | r->end_incl = (u64)(unsigned long)__user_addr_max - 1 - user_addr_offset; | ||
44 | r->offset = user_addr_offset; | ||
45 | return true; | ||
46 | } | ||
47 | |||
48 | /* We return single byte ranges. */ | ||
49 | static bool getrange_slow(struct vringh *vrh, u64 addr, struct vringh_range *r) | ||
50 | { | ||
51 | if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset) | ||
52 | return false; | ||
53 | if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset) | ||
54 | return false; | ||
55 | |||
56 | r->start = addr; | ||
57 | r->end_incl = r->start; | ||
58 | r->offset = user_addr_offset; | ||
59 | return true; | ||
60 | } | ||
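	/* Note on the two callbacks above: getrange tells vringh the
	 * contiguous translated range containing addr.  getrange_iov answers
	 * with the whole mapped region, so each descriptor needs one lookup;
	 * getrange_slow answers one byte at a time, so vringh must split
	 * every descriptor into per-byte iovecs.  The wiov.used/riov.used
	 * assertions in main() below depend on exactly that split. */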
61 | |||
62 | struct guest_virtio_device { | ||
63 | struct virtio_device vdev; | ||
64 | int to_host_fd; | ||
65 | unsigned long notifies; | ||
66 | }; | ||
67 | |||
68 | static void parallel_notify_host(struct virtqueue *vq) | ||
69 | { | ||
70 | struct guest_virtio_device *gvdev; | ||
71 | |||
72 | gvdev = container_of(vq->vdev, struct guest_virtio_device, vdev); | ||
73 | write(gvdev->to_host_fd, "", 1); | ||
74 | gvdev->notifies++; | ||
75 | } | ||
76 | |||
77 | static void no_notify_host(struct virtqueue *vq) | ||
78 | { | ||
79 | } | ||
80 | |||
81 | #define NUM_XFERS (10000000) | ||
82 | |||
83 | /* We aim for two "distant" cpus. */ | ||
84 | static void find_cpus(unsigned int *first, unsigned int *last) | ||
85 | { | ||
86 | unsigned int i; | ||
87 | |||
88 | *first = -1U; | ||
89 | *last = 0; | ||
90 | for (i = 0; i < 4096; i++) { | ||
91 | cpu_set_t set; | ||
92 | CPU_ZERO(&set); | ||
93 | CPU_SET(i, &set); | ||
94 | if (sched_setaffinity(getpid(), sizeof(set), &set) == 0) { | ||
95 | if (i < *first) | ||
96 | *first = i; | ||
97 | if (i > *last) | ||
98 | *last = i; | ||
99 | } | ||
100 | } | ||
101 | } | ||
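	/* find_cpus() probes ids 0..4095 with sched_setaffinity() and records
	 * the lowest and highest CPUs this process may run on; host and guest
	 * then pin themselves to those two, hopefully "distant", CPUs. */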
102 | |||
103 | /* Opencoded version for fast mode */ | ||
104 | static inline int vringh_get_head(struct vringh *vrh, u16 *head) | ||
105 | { | ||
106 | u16 avail_idx, i; | ||
107 | int err; | ||
108 | |||
109 | err = get_user(avail_idx, &vrh->vring.avail->idx); | ||
110 | if (err) | ||
111 | return err; | ||
112 | |||
113 | if (vrh->last_avail_idx == avail_idx) | ||
114 | return 0; | ||
115 | |||
116 | /* Only get avail ring entries after they have been exposed by guest. */ | ||
117 | virtio_rmb(vrh->weak_barriers); | ||
118 | |||
119 | i = vrh->last_avail_idx & (vrh->vring.num - 1); | ||
120 | |||
121 | err = get_user(*head, &vrh->vring.avail->ring[i]); | ||
122 | if (err) | ||
123 | return err; | ||
124 | |||
125 | vrh->last_avail_idx++; | ||
126 | return 1; | ||
127 | } | ||
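	/* The virtio_rmb() above orders the avail->idx load before the
	 * avail->ring[] load; it pairs with the write barrier the guest side
	 * issues when publishing a new entry.  Returns 0 if the ring is
	 * empty, 1 with *head filled in, or a negative error from
	 * get_user(). */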
128 | |||
129 | static int parallel_test(unsigned long features, | ||
130 | bool (*getrange)(struct vringh *vrh, | ||
131 | u64 addr, struct vringh_range *r), | ||
132 | bool fast_vringh) | ||
133 | { | ||
134 | void *host_map, *guest_map; | ||
135 | int fd, mapsize, to_guest[2], to_host[2]; | ||
136 | unsigned long xfers = 0, notifies = 0, receives = 0; | ||
137 | unsigned int first_cpu, last_cpu; | ||
138 | cpu_set_t cpu_set; | ||
139 | char buf[128]; | ||
140 | |||
141 | /* Create real file to mmap. */ | ||
142 | fd = open("/tmp/vringh_test-file", O_RDWR|O_CREAT|O_TRUNC, 0600); | ||
143 | if (fd < 0) | ||
144 | err(1, "Opening /tmp/vringh_test-file"); | ||
145 | |||
146 | /* Extra room at the end for some data, and indirects */ | ||
147 | mapsize = vring_size(RINGSIZE, ALIGN) | ||
148 | + RINGSIZE * 2 * sizeof(int) | ||
149 | + RINGSIZE * 6 * sizeof(struct vring_desc); | ||
150 | mapsize = (mapsize + getpagesize() - 1) & ~(getpagesize() - 1); | ||
151 | ftruncate(fd, mapsize); | ||
152 | |||
153 | /* Parent and child use separate addresses, to check our mapping logic! */ | ||
154 | host_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); | ||
155 | guest_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); | ||
156 | |||
157 | pipe(to_guest); | ||
158 | pipe(to_host); | ||
159 | |||
160 | CPU_ZERO(&cpu_set); | ||
161 | find_cpus(&first_cpu, &last_cpu); | ||
162 | printf("Using CPUS %u and %u\n", first_cpu, last_cpu); | ||
163 | fflush(stdout); | ||
164 | |||
165 | if (fork() != 0) { | ||
166 | struct vringh vrh; | ||
167 | int status, err, rlen = 0; | ||
168 | char rbuf[5]; | ||
169 | |||
170 | /* We are the host: never access guest addresses! */ | ||
171 | munmap(guest_map, mapsize); | ||
172 | |||
173 | __user_addr_min = host_map; | ||
174 | __user_addr_max = __user_addr_min + mapsize; | ||
175 | user_addr_offset = host_map - guest_map; | ||
176 | assert(user_addr_offset); | ||
177 | |||
178 | close(to_guest[0]); | ||
179 | close(to_host[1]); | ||
180 | |||
181 | vring_init(&vrh.vring, RINGSIZE, host_map, ALIGN); | ||
182 | vringh_init_user(&vrh, features, RINGSIZE, true, | ||
183 | vrh.vring.desc, vrh.vring.avail, vrh.vring.used); | ||
184 | CPU_SET(first_cpu, &cpu_set); | ||
185 | if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set)) | ||
186 | errx(1, "Could not set affinity to cpu %u", first_cpu); | ||
187 | |||
188 | while (xfers < NUM_XFERS) { | ||
189 | struct iovec host_riov[2], host_wiov[2]; | ||
190 | struct vringh_iov riov, wiov; | ||
191 | u16 head, written; | ||
192 | |||
193 | if (fast_vringh) { | ||
194 | for (;;) { | ||
195 | err = vringh_get_head(&vrh, &head); | ||
196 | if (err != 0) | ||
197 | break; | ||
198 | err = vringh_need_notify_user(&vrh); | ||
199 | if (err < 0) | ||
200 | errx(1, "vringh_need_notify_user: %i", | ||
201 | err); | ||
202 | if (err) { | ||
203 | write(to_guest[1], "", 1); | ||
204 | notifies++; | ||
205 | } | ||
206 | } | ||
207 | if (err != 1) | ||
208 | errx(1, "vringh_get_head"); | ||
209 | written = 0; | ||
210 | goto complete; | ||
211 | } else { | ||
212 | vringh_iov_init(&riov, | ||
213 | host_riov, | ||
214 | ARRAY_SIZE(host_riov)); | ||
215 | vringh_iov_init(&wiov, | ||
216 | host_wiov, | ||
217 | ARRAY_SIZE(host_wiov)); | ||
218 | |||
219 | err = vringh_getdesc_user(&vrh, &riov, &wiov, | ||
220 | getrange, &head); | ||
221 | } | ||
222 | if (err == 0) { | ||
223 | err = vringh_need_notify_user(&vrh); | ||
224 | if (err < 0) | ||
225 | errx(1, "vringh_need_notify_user: %i", | ||
226 | err); | ||
227 | if (err) { | ||
228 | write(to_guest[1], "", 1); | ||
229 | notifies++; | ||
230 | } | ||
231 | |||
232 | if (!vringh_notify_enable_user(&vrh)) | ||
233 | continue; | ||
234 | |||
235 | /* Swallow all notifies at once. */ | ||
236 | if (read(to_host[0], buf, sizeof(buf)) < 1) | ||
237 | break; | ||
238 | |||
239 | vringh_notify_disable_user(&vrh); | ||
240 | receives++; | ||
241 | continue; | ||
242 | } | ||
243 | if (err != 1) | ||
244 | errx(1, "vringh_getdesc_user: %i", err); | ||
245 | |||
246 | /* We simply copy bytes. */ | ||
247 | if (riov.used) { | ||
248 | rlen = vringh_iov_pull_user(&riov, rbuf, | ||
249 | sizeof(rbuf)); | ||
250 | if (rlen != 4) | ||
251 | errx(1, "vringh_iov_pull_user: %i", | ||
252 | rlen); | ||
253 | assert(riov.i == riov.used); | ||
254 | written = 0; | ||
255 | } else { | ||
256 | err = vringh_iov_push_user(&wiov, rbuf, rlen); | ||
257 | if (err != rlen) | ||
258 | errx(1, "vringh_iov_push_user: %i", | ||
259 | err); | ||
260 | assert(wiov.i == wiov.used); | ||
261 | written = err; | ||
262 | } | ||
263 | complete: | ||
264 | xfers++; | ||
265 | |||
266 | err = vringh_complete_user(&vrh, head, written); | ||
267 | if (err != 0) | ||
268 | errx(1, "vringh_complete_user: %i", err); | ||
269 | } | ||
270 | |||
271 | err = vringh_need_notify_user(&vrh); | ||
272 | if (err < 0) | ||
273 | errx(1, "vringh_need_notify_user: %i", err); | ||
274 | if (err) { | ||
275 | write(to_guest[1], "", 1); | ||
276 | notifies++; | ||
277 | } | ||
278 | wait(&status); | ||
279 | if (!WIFEXITED(status)) | ||
280 | errx(1, "Child died with signal %i?", WTERMSIG(status)); | ||
281 | if (WEXITSTATUS(status) != 0) | ||
282 | errx(1, "Child exited %i?", WEXITSTATUS(status)); | ||
283 | printf("Host: notified %lu, pinged %lu\n", notifies, receives); | ||
284 | return 0; | ||
285 | } else { | ||
286 | struct guest_virtio_device gvdev; | ||
287 | struct virtqueue *vq; | ||
288 | unsigned int *data; | ||
289 | struct vring_desc *indirects; | ||
290 | unsigned int finished = 0; | ||
291 | |||
292 | /* We pass sg[]s pointing into here, but we need RINGSIZE+1 */ | ||
293 | data = guest_map + vring_size(RINGSIZE, ALIGN); | ||
294 | indirects = (void *)data + (RINGSIZE + 1) * 2 * sizeof(int); | ||
295 | |||
296 | /* We are the guest. */ | ||
297 | munmap(host_map, mapsize); | ||
298 | |||
299 | close(to_guest[1]); | ||
300 | close(to_host[0]); | ||
301 | |||
302 | gvdev.vdev.features[0] = features; | ||
303 | gvdev.to_host_fd = to_host[1]; | ||
304 | gvdev.notifies = 0; | ||
305 | |||
306 | CPU_SET(last_cpu, &cpu_set); /* guest on the "distant" cpu */ | ||
307 | if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set)) | ||
308 | err(1, "Could not set affinity to cpu %u", last_cpu); | ||
309 | |||
310 | vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &gvdev.vdev, true, | ||
311 | guest_map, fast_vringh ? no_notify_host | ||
312 | : parallel_notify_host, | ||
313 | never_callback_guest, "guest vq"); | ||
314 | |||
315 | /* Don't kfree indirects. */ | ||
316 | __kfree_ignore_start = indirects; | ||
317 | __kfree_ignore_end = indirects + RINGSIZE * 6; | ||
318 | |||
319 | while (xfers < NUM_XFERS) { | ||
320 | struct scatterlist sg[4]; | ||
321 | unsigned int num_sg, len; | ||
322 | int *dbuf, err; | ||
323 | bool output = !(xfers % 2); | ||
324 | |||
325 | /* Consume bufs. */ | ||
326 | while ((dbuf = virtqueue_get_buf(vq, &len)) != NULL) { | ||
327 | if (len == 4) | ||
328 | assert(*dbuf == finished - 1); | ||
329 | else if (!fast_vringh) | ||
330 | assert(*dbuf == finished); | ||
331 | finished++; | ||
332 | } | ||
333 | |||
334 | /* Produce a buffer. */ | ||
335 | dbuf = data + (xfers % (RINGSIZE + 1)); | ||
336 | |||
337 | if (output) | ||
338 | *dbuf = xfers; | ||
339 | else | ||
340 | *dbuf = -1; | ||
341 | |||
342 | switch ((xfers / sizeof(*dbuf)) % 4) { | ||
343 | case 0: | ||
344 | /* Nasty three-element sg list. */ | ||
345 | sg_init_table(sg, num_sg = 3); | ||
346 | sg_set_buf(&sg[0], (void *)dbuf, 1); | ||
347 | sg_set_buf(&sg[1], (void *)dbuf + 1, 2); | ||
348 | sg_set_buf(&sg[2], (void *)dbuf + 3, 1); | ||
349 | break; | ||
350 | case 1: | ||
351 | sg_init_table(sg, num_sg = 2); | ||
352 | sg_set_buf(&sg[0], (void *)dbuf, 1); | ||
353 | sg_set_buf(&sg[1], (void *)dbuf + 1, 3); | ||
354 | break; | ||
355 | case 2: | ||
356 | sg_init_table(sg, num_sg = 1); | ||
357 | sg_set_buf(&sg[0], (void *)dbuf, 4); | ||
358 | break; | ||
359 | case 3: | ||
360 | sg_init_table(sg, num_sg = 4); | ||
361 | sg_set_buf(&sg[0], (void *)dbuf, 1); | ||
362 | sg_set_buf(&sg[1], (void *)dbuf + 1, 1); | ||
363 | sg_set_buf(&sg[2], (void *)dbuf + 2, 1); | ||
364 | sg_set_buf(&sg[3], (void *)dbuf + 3, 1); | ||
365 | break; | ||
366 | } | ||
367 | |||
368 | /* May allocate an indirect, so force it to allocate | ||
369 | * user addr */ | ||
370 | __kmalloc_fake = indirects + (xfers % RINGSIZE) * 4; | ||
371 | if (output) | ||
372 | err = virtqueue_add_outbuf(vq, sg, num_sg, dbuf, | ||
373 | GFP_KERNEL); | ||
374 | else | ||
375 | err = virtqueue_add_inbuf(vq, sg, num_sg, | ||
376 | dbuf, GFP_KERNEL); | ||
377 | |||
378 | if (err == -ENOSPC) { | ||
379 | if (!virtqueue_enable_cb_delayed(vq)) | ||
380 | continue; | ||
381 | /* Swallow all notifies at once. */ | ||
382 | if (read(to_guest[0], buf, sizeof(buf)) < 1) | ||
383 | break; | ||
384 | |||
385 | receives++; | ||
386 | virtqueue_disable_cb(vq); | ||
387 | continue; | ||
388 | } | ||
389 | |||
390 | if (err) | ||
391 | errx(1, "virtqueue_add_in/outbuf: %i", err); | ||
392 | |||
393 | xfers++; | ||
394 | virtqueue_kick(vq); | ||
395 | } | ||
396 | |||
397 | /* Any extra? */ | ||
398 | while (finished != xfers) { | ||
399 | int *dbuf; | ||
400 | unsigned int len; | ||
401 | |||
402 | /* Consume bufs. */ | ||
403 | dbuf = virtqueue_get_buf(vq, &len); | ||
404 | if (dbuf) { | ||
405 | if (len == 4) | ||
406 | assert(*dbuf == finished - 1); | ||
407 | else | ||
408 | assert(len == 0); | ||
409 | finished++; | ||
410 | continue; | ||
411 | } | ||
412 | |||
413 | if (!virtqueue_enable_cb_delayed(vq)) | ||
414 | continue; | ||
415 | if (read(to_guest[0], buf, sizeof(buf)) < 1) | ||
416 | break; | ||
417 | |||
418 | receives++; | ||
419 | virtqueue_disable_cb(vq); | ||
420 | } | ||
421 | |||
422 | printf("Guest: notified %lu, pinged %lu\n", | ||
423 | gvdev.notifies, receives); | ||
424 | vring_del_virtqueue(vq); | ||
425 | return 0; | ||
426 | } | ||
427 | } | ||
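	/* In parallel_test() the two pipes stand in for the transport's kick
	 * and interrupt lines: one byte written to to_host[] is the guest's
	 * "kick", one written to to_guest[] is the host's "interrupt", and
	 * each read() drains a whole batch of pending notifications (hence
	 * the sizeof(buf) reads above). */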
428 | |||
429 | int main(int argc, char *argv[]) | ||
430 | { | ||
431 | struct virtio_device vdev; | ||
432 | struct virtqueue *vq; | ||
433 | struct vringh vrh; | ||
434 | struct scatterlist guest_sg[RINGSIZE], *sgs[2]; | ||
435 | struct iovec host_riov[2], host_wiov[2]; | ||
436 | struct vringh_iov riov, wiov; | ||
437 | struct vring_used_elem used[RINGSIZE]; | ||
438 | char buf[28]; | ||
439 | u16 head; | ||
440 | int err; | ||
441 | unsigned i; | ||
442 | void *ret; | ||
443 | bool (*getrange)(struct vringh *vrh, u64 addr, struct vringh_range *r); | ||
444 | bool fast_vringh = false, parallel = false; | ||
445 | |||
446 | getrange = getrange_iov; | ||
447 | vdev.features[0] = 0; | ||
448 | |||
449 | while (argv[1]) { | ||
450 | if (strcmp(argv[1], "--indirect") == 0) | ||
451 | vdev.features[0] |= (1 << VIRTIO_RING_F_INDIRECT_DESC); | ||
452 | else if (strcmp(argv[1], "--eventidx") == 0) | ||
453 | vdev.features[0] |= (1 << VIRTIO_RING_F_EVENT_IDX); | ||
454 | else if (strcmp(argv[1], "--slow-range") == 0) | ||
455 | getrange = getrange_slow; | ||
456 | else if (strcmp(argv[1], "--fast-vringh") == 0) | ||
457 | fast_vringh = true; | ||
458 | else if (strcmp(argv[1], "--parallel") == 0) | ||
459 | parallel = true; | ||
460 | else | ||
461 | errx(1, "Unknown arg %s", argv[1]); | ||
462 | argv++; | ||
463 | } | ||
464 | |||
465 | if (parallel) | ||
466 | return parallel_test(vdev.features[0], getrange, fast_vringh); | ||
467 | |||
468 | if (posix_memalign(&__user_addr_min, PAGE_SIZE, USER_MEM) != 0) | ||
469 | abort(); | ||
470 | __user_addr_max = __user_addr_min + USER_MEM; | ||
471 | memset(__user_addr_min, 0, vring_size(RINGSIZE, ALIGN)); | ||
472 | |||
473 | /* Set up guest side. */ | ||
474 | vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true, | ||
475 | __user_addr_min, | ||
476 | never_notify_host, never_callback_guest, | ||
477 | "guest vq"); | ||
478 | |||
479 | /* Set up host side. */ | ||
480 | vring_init(&vrh.vring, RINGSIZE, __user_addr_min, ALIGN); | ||
481 | vringh_init_user(&vrh, vdev.features[0], RINGSIZE, true, | ||
482 | vrh.vring.desc, vrh.vring.avail, vrh.vring.used); | ||
483 | |||
484 | /* No descriptor to get yet... */ | ||
485 | err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); | ||
486 | if (err != 0) | ||
487 | errx(1, "vringh_getdesc_user: %i", err); | ||
488 | |||
489 | /* Guest puts in a descriptor. */ | ||
490 | memcpy(__user_addr_max - 1, "a", 1); | ||
491 | sg_init_table(guest_sg, 1); | ||
492 | sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1); | ||
493 | sg_init_table(guest_sg+1, 1); | ||
494 | sg_set_buf(&guest_sg[1], __user_addr_max - 3, 2); | ||
495 | sgs[0] = &guest_sg[0]; | ||
496 | sgs[1] = &guest_sg[1]; | ||
497 | |||
498 | /* May allocate an indirect, so force it to allocate user addr */ | ||
499 | __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN); | ||
500 | err = virtqueue_add_sgs(vq, sgs, 1, 1, &err, GFP_KERNEL); | ||
501 | if (err) | ||
502 | errx(1, "virtqueue_add_sgs: %i", err); | ||
503 | __kmalloc_fake = NULL; | ||
504 | |||
505 | /* Host retrieves it. */ | ||
506 | vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); | ||
507 | vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); | ||
508 | |||
509 | err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); | ||
510 | if (err != 1) | ||
511 | errx(1, "vringh_getdesc_user: %i", err); | ||
512 | |||
513 | assert(riov.used == 1); | ||
514 | assert(riov.iov[0].iov_base == __user_addr_max - 1); | ||
515 | assert(riov.iov[0].iov_len == 1); | ||
516 | if (getrange != getrange_slow) { | ||
517 | assert(wiov.used == 1); | ||
518 | assert(wiov.iov[0].iov_base == __user_addr_max - 3); | ||
519 | assert(wiov.iov[0].iov_len == 2); | ||
520 | } else { | ||
521 | assert(wiov.used == 2); | ||
522 | assert(wiov.iov[0].iov_base == __user_addr_max - 3); | ||
523 | assert(wiov.iov[0].iov_len == 1); | ||
524 | assert(wiov.iov[1].iov_base == __user_addr_max - 2); | ||
525 | assert(wiov.iov[1].iov_len == 1); | ||
526 | } | ||
527 | |||
528 | err = vringh_iov_pull_user(&riov, buf, 5); | ||
529 | if (err != 1) | ||
530 | errx(1, "vringh_iov_pull_user: %i", err); | ||
531 | assert(buf[0] == 'a'); | ||
532 | assert(riov.i == 1); | ||
533 | assert(vringh_iov_pull_user(&riov, buf, 5) == 0); | ||
534 | |||
535 | memcpy(buf, "bcdef", 5); | ||
536 | err = vringh_iov_push_user(&wiov, buf, 5); | ||
537 | if (err != 2) | ||
538 | errx(1, "vringh_iov_push_user: %i", err); | ||
539 | assert(memcmp(__user_addr_max - 3, "bc", 2) == 0); | ||
540 | assert(wiov.i == wiov.used); | ||
541 | assert(vringh_iov_push_user(&wiov, buf, 5) == 0); | ||
542 | |||
543 | /* Host is done. */ | ||
544 | err = vringh_complete_user(&vrh, head, err); | ||
545 | if (err != 0) | ||
546 | errx(1, "vringh_complete_user: %i", err); | ||
547 | |||
548 | /* Guest should see used token now. */ | ||
549 | __kfree_ignore_start = __user_addr_min + vring_size(RINGSIZE, ALIGN); | ||
550 | __kfree_ignore_end = __kfree_ignore_start + 1; | ||
551 | ret = virtqueue_get_buf(vq, &i); | ||
552 | if (ret != &err) | ||
553 | errx(1, "virtqueue_get_buf: %p", ret); | ||
554 | assert(i == 2); | ||
555 | |||
556 | /* Guest puts in a huge descriptor. */ | ||
557 | sg_init_table(guest_sg, RINGSIZE); | ||
558 | for (i = 0; i < RINGSIZE; i++) { | ||
559 | sg_set_buf(&guest_sg[i], | ||
560 | __user_addr_max - USER_MEM/4, USER_MEM/4); | ||
561 | } | ||
562 | |||
563 | /* Fill contents with recognisable garbage. */ | ||
564 | for (i = 0; i < USER_MEM/4; i++) | ||
565 | ((char *)__user_addr_max - USER_MEM/4)[i] = i; | ||
566 | |||
567 | /* This will allocate an indirect, so force it to allocate user addr */ | ||
568 | __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN); | ||
569 | err = virtqueue_add_outbuf(vq, guest_sg, RINGSIZE, &err, GFP_KERNEL); | ||
570 | if (err) | ||
571 | errx(1, "virtqueue_add_outbuf (large): %i", err); | ||
572 | __kmalloc_fake = NULL; | ||
573 | |||
574 | /* Host picks it up (allocates new iov). */ | ||
575 | vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); | ||
576 | vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); | ||
577 | |||
578 | err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); | ||
579 | if (err != 1) | ||
580 | errx(1, "vringh_getdesc_user: %i", err); | ||
581 | |||
582 | assert(riov.max_num & VRINGH_IOV_ALLOCATED); | ||
583 | assert(riov.iov != host_riov); | ||
584 | if (getrange != getrange_slow) | ||
585 | assert(riov.used == RINGSIZE); | ||
586 | else | ||
587 | assert(riov.used == RINGSIZE * USER_MEM/4); | ||
588 | |||
589 | assert(!(wiov.max_num & VRINGH_IOV_ALLOCATED)); | ||
590 | assert(wiov.used == 0); | ||
591 | |||
592 | /* Pull data back out (in odd chunks), should be as expected. */ | ||
593 | for (i = 0; i < RINGSIZE * USER_MEM/4; i += 3) { | ||
594 | err = vringh_iov_pull_user(&riov, buf, 3); | ||
595 | if (err != 3 && i + err != RINGSIZE * USER_MEM/4) | ||
596 | errx(1, "vringh_iov_pull_user large: %i", err); | ||
597 | assert(buf[0] == (char)i); | ||
598 | assert(err < 2 || buf[1] == (char)(i + 1)); | ||
599 | assert(err < 3 || buf[2] == (char)(i + 2)); | ||
600 | } | ||
601 | assert(riov.i == riov.used); | ||
602 | vringh_iov_cleanup(&riov); | ||
603 | vringh_iov_cleanup(&wiov); | ||
604 | |||
605 | /* Complete using multi interface, just because we can. */ | ||
606 | used[0].id = head; | ||
607 | used[0].len = 0; | ||
608 | err = vringh_complete_multi_user(&vrh, used, 1); | ||
609 | if (err) | ||
610 | errx(1, "vringh_complete_multi_user(1): %i", err); | ||
611 | |||
612 | /* Free up those descriptors. */ | ||
613 | ret = virtqueue_get_buf(vq, &i); | ||
614 | if (ret != &err) | ||
615 | errx(1, "virtqueue_get_buf: %p", ret); | ||
616 | |||
617 | /* Add lots of descriptors. */ | ||
618 | sg_init_table(guest_sg, 1); | ||
619 | sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1); | ||
620 | for (i = 0; i < RINGSIZE; i++) { | ||
621 | err = virtqueue_add_outbuf(vq, guest_sg, 1, &err, GFP_KERNEL); | ||
622 | if (err) | ||
623 | errx(1, "virtqueue_add_outbuf (multiple): %i", err); | ||
624 | } | ||
625 | |||
626 | /* Now get many, and consume them all at once. */ | ||
627 | vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); | ||
628 | vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); | ||
629 | |||
630 | for (i = 0; i < RINGSIZE; i++) { | ||
631 | err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); | ||
632 | if (err != 1) | ||
633 | errx(1, "vringh_getdesc_user: %i", err); | ||
634 | used[i].id = head; | ||
635 | used[i].len = 0; | ||
636 | } | ||
637 | /* Make sure it wraps around ring, to test! */ | ||
638 | assert(vrh.vring.used->idx % RINGSIZE != 0); | ||
639 | err = vringh_complete_multi_user(&vrh, used, RINGSIZE); | ||
640 | if (err) | ||
641 | errx(1, "vringh_complete_multi_user: %i", err); | ||
642 | |||
643 | /* Free those buffers. */ | ||
644 | for (i = 0; i < RINGSIZE; i++) { | ||
645 | unsigned len; | ||
646 | assert(virtqueue_get_buf(vq, &len) != NULL); | ||
647 | } | ||
648 | |||
649 | /* Test weird (but legal!) indirect. */ | ||
650 | if (vdev.features[0] & (1 << VIRTIO_RING_F_INDIRECT_DESC)) { | ||
651 | char *data = __user_addr_max - USER_MEM/4; | ||
652 | struct vring_desc *d = __user_addr_max - USER_MEM/2; | ||
653 | struct vring vring; | ||
654 | |||
655 | /* Force creation of direct, which we modify. */ | ||
656 | vdev.features[0] &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC); | ||
657 | vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true, | ||
658 | __user_addr_min, | ||
659 | never_notify_host, | ||
660 | never_callback_guest, | ||
661 | "guest vq"); | ||
662 | |||
663 | sg_init_table(guest_sg, 4); | ||
664 | sg_set_buf(&guest_sg[0], d, sizeof(*d)*2); | ||
665 | sg_set_buf(&guest_sg[1], d + 2, sizeof(*d)*1); | ||
666 | sg_set_buf(&guest_sg[2], data + 6, 4); | ||
667 | sg_set_buf(&guest_sg[3], d + 3, sizeof(*d)*3); | ||
668 | |||
669 | err = virtqueue_add_outbuf(vq, guest_sg, 4, &err, GFP_KERNEL); | ||
670 | if (err) | ||
671 | errx(1, "virtqueue_add_outbuf (indirect): %i", err); | ||
672 | |||
673 | vring_init(&vring, RINGSIZE, __user_addr_min, ALIGN); | ||
674 | |||
675 | /* They're used in order, but double-check... */ | ||
676 | assert(vring.desc[0].addr == (unsigned long)d); | ||
677 | assert(vring.desc[1].addr == (unsigned long)(d+2)); | ||
678 | assert(vring.desc[2].addr == (unsigned long)data + 6); | ||
679 | assert(vring.desc[3].addr == (unsigned long)(d+3)); | ||
680 | vring.desc[0].flags |= VRING_DESC_F_INDIRECT; | ||
681 | vring.desc[1].flags |= VRING_DESC_F_INDIRECT; | ||
682 | vring.desc[3].flags |= VRING_DESC_F_INDIRECT; | ||
683 | |||
684 | /* First indirect */ | ||
685 | d[0].addr = (unsigned long)data; | ||
686 | d[0].len = 1; | ||
687 | d[0].flags = VRING_DESC_F_NEXT; | ||
688 | d[0].next = 1; | ||
689 | d[1].addr = (unsigned long)data + 1; | ||
690 | d[1].len = 2; | ||
691 | d[1].flags = 0; | ||
692 | |||
693 | /* Second indirect */ | ||
694 | d[2].addr = (unsigned long)data + 3; | ||
695 | d[2].len = 3; | ||
696 | d[2].flags = 0; | ||
697 | |||
698 | /* Third indirect */ | ||
699 | d[3].addr = (unsigned long)data + 10; | ||
700 | d[3].len = 5; | ||
701 | d[3].flags = VRING_DESC_F_NEXT; | ||
702 | d[3].next = 1; | ||
703 | d[4].addr = (unsigned long)data + 15; | ||
704 | d[4].len = 6; | ||
705 | d[4].flags = VRING_DESC_F_NEXT; | ||
706 | d[4].next = 2; | ||
707 | d[5].addr = (unsigned long)data + 21; | ||
708 | d[5].len = 7; | ||
709 | d[5].flags = 0; | ||
710 | |||
711 | /* Host picks it up (allocates new iov). */ | ||
712 | vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); | ||
713 | vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); | ||
714 | |||
715 | err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); | ||
716 | if (err != 1) | ||
717 | errx(1, "vringh_getdesc_user: %i", err); | ||
718 | |||
719 | if (head != 0) | ||
720 | errx(1, "vringh_getdesc_user: head %i not 0", head); | ||
721 | |||
722 | assert(riov.max_num & VRINGH_IOV_ALLOCATED); | ||
723 | if (getrange != getrange_slow) | ||
724 | assert(riov.used == 7); | ||
725 | else | ||
726 | assert(riov.used == 28); | ||
727 | err = vringh_iov_pull_user(&riov, buf, 29); | ||
728 | assert(err == 28); | ||
729 | |||
730 | /* Data should be linear. */ | ||
731 | for (i = 0; i < err; i++) | ||
732 | assert(buf[i] == i); | ||
733 | vringh_iov_cleanup(&riov); | ||
734 | } | ||
735 | |||
736 | /* Don't leak memory... */ | ||
737 | vring_del_virtqueue(vq); | ||
738 | free(__user_addr_min); | ||
739 | |||
740 | return 0; | ||
741 | } | ||
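As main() above shows, the harness is driven by flags: --indirect and --eventidx turn on the corresponding ring features, --slow-range switches to the one-byte getrange callback, --fast-vringh takes the open-coded head-fetch path, and --parallel forks the host/guest pair onto separate CPUs; with no flags it runs the single-process sanity checks.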