author    Rusty Russell <rusty@rustcorp.com.au>  2013-03-19 23:20:14 -0400
committer Rusty Russell <rusty@rustcorp.com.au>  2013-03-19 23:35:33 -0400
commit    f87d0fbb579818fed3eeb0923cc253163ab93039 (patch)
tree      f33e06a6cb4eb4656e710f8ad70100e2130a32a5
parent    61d0b5a4b2777dcf5daef245e212b3c1fa8091ca (diff)
vringh: host-side implementation of virtio rings.
Getting use of virtio rings correct is tricky, and a recent patch saw
an implementation of in-kernel rings (as separate from userspace).

This abstracts the business of dealing with the virtio ring layout
from the access (userspace or direct); to do this, we use function
pointers, which gcc inlines correctly.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
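As a rough illustration of how these helpers are meant to be driven (a sketch only, not part of the patch; mydev_service_ring() and its fixed 64-byte buffer are hypothetical), a kernel-side user would loop over vringh_getdesc_kern(), pull data through the kiov, publish the head, and finally ask whether the other side needs a notification:

/*
 * Illustrative sketch only: a hypothetical in-kernel consumer draining
 * a ring with the vringh_*_kern helpers added by this patch.
 */
static int mydev_service_ring(struct vringh *vrh)
{
	struct kvec kvec[8];
	struct vringh_kiov riov;
	char buf[64];
	u16 head;
	int err;

	vringh_kiov_init(&riov, kvec, ARRAY_SIZE(kvec));

	for (;;) {
		ssize_t len;

		/* 0 = ring empty, 1 = got a chain, -errno = bad ring/descriptor. */
		err = vringh_getdesc_kern(vrh, &riov, NULL, &head, GFP_KERNEL);
		if (err <= 0)
			break;

		/* Copy the readable part of the chain into our buffer. */
		len = vringh_iov_pull_kern(&riov, buf, sizeof(buf));
		if (len < 0) {
			err = len;
			break;
		}

		/* Publish the used entry; we wrote no data back. */
		err = vringh_complete_kern(vrh, head, 0);
		if (err)
			break;
	}

	vringh_kiov_cleanup(&riov);	/* frees iov[] if it was resized */

	/* Caller should kick the other side (e.g. signal an eventfd) if > 0. */
	return err < 0 ? err : vringh_need_notify_kern(vrh);
}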
-rw-r--r--  drivers/Makefile          |    2
-rw-r--r--  drivers/vhost/Kconfig     |    8
-rw-r--r--  drivers/vhost/Kconfig.tcm |    1
-rw-r--r--  drivers/vhost/Makefile    |    2
-rw-r--r--  drivers/vhost/vringh.c    | 1007
-rw-r--r--  include/linux/vringh.h    |  196
6 files changed, 1215 insertions(+), 1 deletion(-)
diff --git a/drivers/Makefile b/drivers/Makefile
index dce39a95fa71..72d28d34ee24 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -123,7 +123,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/
 obj-$(CONFIG_OF) += of/
 obj-$(CONFIG_SSB) += ssb/
 obj-$(CONFIG_BCMA) += bcma/
-obj-$(CONFIG_VHOST_NET) += vhost/
+obj-$(CONFIG_VHOST_RING) += vhost/
 obj-$(CONFIG_VLYNQ) += vlynq/
 obj-$(CONFIG_STAGING) += staging/
 obj-y += platform/
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index bf243177ffe1..85b773a93a5d 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -1,6 +1,7 @@
 config VHOST_NET
 	tristate "Host kernel accelerator for virtio net"
 	depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
+	select VHOST_RING
 	---help---
 	  This kernel module can be loaded in host kernel to accelerate
 	  guest networking with virtio_net. Not to be confused with virtio_net
@@ -12,3 +13,10 @@ config VHOST_NET
 if STAGING
 source "drivers/vhost/Kconfig.tcm"
 endif
+
+config VHOST_RING
+	tristate
+	---help---
+	  This option is selected by any driver which needs to access
+	  the host side of a virtio ring.
+
diff --git a/drivers/vhost/Kconfig.tcm b/drivers/vhost/Kconfig.tcm
index 7e3aa28d999e..c3a8cfa1de72 100644
--- a/drivers/vhost/Kconfig.tcm
+++ b/drivers/vhost/Kconfig.tcm
@@ -1,6 +1,7 @@
 config TCM_VHOST
 	tristate "TCM_VHOST fabric module"
 	depends on TARGET_CORE && EVENTFD && m
+	select VHOST_RING
 	default n
 	---help---
 	Say M here to enable the TCM_VHOST fabric module for use with virtio-scsi guests
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index a27b053bc9ab..1d37f5e12be6 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
 
 obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
+
+obj-$(CONFIG_VHOST_RING) += vringh.o
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
new file mode 100644
index 000000000000..bff0775e258c
--- /dev/null
+++ b/drivers/vhost/vringh.c
@@ -0,0 +1,1007 @@
1/*
2 * Helpers for the host side of a virtio ring.
3 *
4 * Since these may be in userspace, we use (inline) accessors.
5 */
6#include <linux/vringh.h>
7#include <linux/virtio_ring.h>
8#include <linux/kernel.h>
9#include <linux/ratelimit.h>
10#include <linux/uaccess.h>
11#include <linux/slab.h>
12#include <linux/export.h>
13
14static __printf(1,2) __cold void vringh_bad(const char *fmt, ...)
15{
16 static DEFINE_RATELIMIT_STATE(vringh_rs,
17 DEFAULT_RATELIMIT_INTERVAL,
18 DEFAULT_RATELIMIT_BURST);
19 if (__ratelimit(&vringh_rs)) {
20 va_list ap;
21 va_start(ap, fmt);
22 printk(KERN_NOTICE "vringh:");
23 vprintk(fmt, ap);
24 va_end(ap);
25 }
26}
27
28/* Returns vring->num if empty, -ve on error. */
29static inline int __vringh_get_head(const struct vringh *vrh,
30 int (*getu16)(u16 *val, const u16 *p),
31 u16 *last_avail_idx)
32{
33 u16 avail_idx, i, head;
34 int err;
35
36 err = getu16(&avail_idx, &vrh->vring.avail->idx);
37 if (err) {
38 vringh_bad("Failed to access avail idx at %p",
39 &vrh->vring.avail->idx);
40 return err;
41 }
42
43 if (*last_avail_idx == avail_idx)
44 return vrh->vring.num;
45
46 /* Only get avail ring entries after they have been exposed by guest. */
47 virtio_rmb(vrh->weak_barriers);
48
49 i = *last_avail_idx & (vrh->vring.num - 1);
50
51 err = getu16(&head, &vrh->vring.avail->ring[i]);
52 if (err) {
53 vringh_bad("Failed to read head: idx %d address %p",
54 *last_avail_idx, &vrh->vring.avail->ring[i]);
55 return err;
56 }
57
58 if (head >= vrh->vring.num) {
59 vringh_bad("Guest says index %u > %u is available",
60 head, vrh->vring.num);
61 return -EINVAL;
62 }
63
64 (*last_avail_idx)++;
65 return head;
66}
67
68/* Copy some bytes to/from the iovec. Returns num copied. */
69static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
70 void *ptr, size_t len,
71 int (*xfer)(void *addr, void *ptr,
72 size_t len))
73{
74 int err, done = 0;
75
76 while (len && iov->i < iov->used) {
77 size_t partlen;
78
79 partlen = min(iov->iov[iov->i].iov_len, len);
80 err = xfer(iov->iov[iov->i].iov_base, ptr, partlen);
81 if (err)
82 return err;
83 done += partlen;
84 len -= partlen;
85 ptr += partlen;
86 iov->consumed += partlen;
87 iov->iov[iov->i].iov_len -= partlen;
88 iov->iov[iov->i].iov_base += partlen;
89
90 if (!iov->iov[iov->i].iov_len) {
91 /* Fix up old iov element then increment. */
92 iov->iov[iov->i].iov_len = iov->consumed;
93 iov->iov[iov->i].iov_base -= iov->consumed;
94
95 iov->consumed = 0;
96 iov->i++;
97 }
98 }
99 return done;
100}
101
102/* May reduce *len if range is shorter. */
103static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len,
104 struct vringh_range *range,
105 bool (*getrange)(struct vringh *,
106 u64, struct vringh_range *))
107{
108 if (addr < range->start || addr > range->end_incl) {
109 if (!getrange(vrh, addr, range))
110 return false;
111 }
112 BUG_ON(addr < range->start || addr > range->end_incl);
113
114 /* To end of memory? */
115 if (unlikely(addr + *len == 0)) {
116 if (range->end_incl == -1ULL)
117 return true;
118 goto truncate;
119 }
120
121 /* Otherwise, don't wrap. */
122 if (addr + *len < addr) {
123 vringh_bad("Wrapping descriptor %zu@0x%llx",
124 *len, (unsigned long long)addr);
125 return false;
126 }
127
128 if (unlikely(addr + *len - 1 > range->end_incl))
129 goto truncate;
130 return true;
131
132truncate:
133 *len = range->end_incl + 1 - addr;
134 return true;
135}
136
137static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len,
138 struct vringh_range *range,
139 bool (*getrange)(struct vringh *,
140 u64, struct vringh_range *))
141{
142 return true;
143}
144
145/* No reason for this code to be inline. */
146static int move_to_indirect(int *up_next, u16 *i, void *addr,
147 const struct vring_desc *desc,
148 struct vring_desc **descs, int *desc_max)
149{
150 /* Indirect tables can't have indirect. */
151 if (*up_next != -1) {
152 vringh_bad("Multilevel indirect %u->%u", *up_next, *i);
153 return -EINVAL;
154 }
155
156 if (unlikely(desc->len % sizeof(struct vring_desc))) {
157 vringh_bad("Strange indirect len %u", desc->len);
158 return -EINVAL;
159 }
160
161 /* We will check this when we follow it! */
162 if (desc->flags & VRING_DESC_F_NEXT)
163 *up_next = desc->next;
164 else
165 *up_next = -2;
166 *descs = addr;
167 *desc_max = desc->len / sizeof(struct vring_desc);
168
169 /* Now, start at the first indirect. */
170 *i = 0;
171 return 0;
172}
173
174static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp)
175{
176 struct kvec *new;
177 unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2;
178
179 if (new_num < 8)
180 new_num = 8;
181
182 flag = (iov->max_num & VRINGH_IOV_ALLOCATED);
183 if (flag)
184 new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp);
185 else {
186 new = kmalloc(new_num * sizeof(struct iovec), gfp);
187 if (new) {
188 memcpy(new, iov->iov,
189 iov->max_num * sizeof(struct iovec));
190 flag = VRINGH_IOV_ALLOCATED;
191 }
192 }
193 if (!new)
194 return -ENOMEM;
195 iov->iov = new;
196 iov->max_num = (new_num | flag);
197 return 0;
198}
199
200static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next,
201 struct vring_desc **descs, int *desc_max)
202{
203 u16 i = *up_next;
204
205 *up_next = -1;
206 *descs = vrh->vring.desc;
207 *desc_max = vrh->vring.num;
208 return i;
209}
210
211static int slow_copy(struct vringh *vrh, void *dst, const void *src,
212 bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
213 struct vringh_range *range,
214 bool (*getrange)(struct vringh *vrh,
215 u64,
216 struct vringh_range *)),
217 bool (*getrange)(struct vringh *vrh,
218 u64 addr,
219 struct vringh_range *r),
220 struct vringh_range *range,
221 int (*copy)(void *dst, const void *src, size_t len))
222{
223 size_t part, len = sizeof(struct vring_desc);
224
225 do {
226 u64 addr;
227 int err;
228
229 part = len;
230 addr = (u64)(unsigned long)src - range->offset;
231
232 if (!rcheck(vrh, addr, &part, range, getrange))
233 return -EINVAL;
234
235 err = copy(dst, src, part);
236 if (err)
237 return err;
238
239 dst += part;
240 src += part;
241 len -= part;
242 } while (len);
243 return 0;
244}
245
246static inline int
247__vringh_iov(struct vringh *vrh, u16 i,
248 struct vringh_kiov *riov,
249 struct vringh_kiov *wiov,
250 bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
251 struct vringh_range *range,
252 bool (*getrange)(struct vringh *, u64,
253 struct vringh_range *)),
254 bool (*getrange)(struct vringh *, u64, struct vringh_range *),
255 gfp_t gfp,
256 int (*copy)(void *dst, const void *src, size_t len))
257{
258 int err, count = 0, up_next, desc_max;
259 struct vring_desc desc, *descs;
260 struct vringh_range range = { -1ULL, 0 }, slowrange;
261 bool slow = false;
262
263 /* We start traversing vring's descriptor table. */
264 descs = vrh->vring.desc;
265 desc_max = vrh->vring.num;
266 up_next = -1;
267
268 if (riov)
269 riov->i = riov->used = 0;
270 else if (wiov)
271 wiov->i = wiov->used = 0;
272 else
273 /* You must want something! */
274 BUG();
275
276 for (;;) {
277 void *addr;
278 struct vringh_kiov *iov;
279 size_t len;
280
281 if (unlikely(slow))
282 err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange,
283 &slowrange, copy);
284 else
285 err = copy(&desc, &descs[i], sizeof(desc));
286 if (unlikely(err))
287 goto fail;
288
289 if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
290 /* Make sure it's OK, and get offset. */
291 len = desc.len;
292 if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
293 err = -EINVAL;
294 goto fail;
295 }
296
297 if (unlikely(len != desc.len)) {
298 slow = true;
299 /* We need to save this range to use offset */
300 slowrange = range;
301 }
302
303 addr = (void *)(long)(desc.addr + range.offset);
304 err = move_to_indirect(&up_next, &i, addr, &desc,
305 &descs, &desc_max);
306 if (err)
307 goto fail;
308 continue;
309 }
310
311 if (count++ == vrh->vring.num) {
312 vringh_bad("Descriptor loop in %p", descs);
313 err = -ELOOP;
314 goto fail;
315 }
316
317 if (desc.flags & VRING_DESC_F_WRITE)
318 iov = wiov;
319 else {
320 iov = riov;
321 if (unlikely(wiov && wiov->i)) {
322 vringh_bad("Readable desc %p after writable",
323 &descs[i]);
324 err = -EINVAL;
325 goto fail;
326 }
327 }
328
329 if (!iov) {
330 vringh_bad("Unexpected %s desc",
331 !wiov ? "writable" : "readable");
332 err = -EPROTO;
333 goto fail;
334 }
335
336 again:
337 /* Make sure it's OK, and get offset. */
338 len = desc.len;
339 if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
340 err = -EINVAL;
341 goto fail;
342 }
343 addr = (void *)(unsigned long)(desc.addr + range.offset);
344
345 if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) {
346 err = resize_iovec(iov, gfp);
347 if (err)
348 goto fail;
349 }
350
351 iov->iov[iov->used].iov_base = addr;
352 iov->iov[iov->used].iov_len = len;
353 iov->used++;
354
355 if (unlikely(len != desc.len)) {
356 desc.len -= len;
357 desc.addr += len;
358 goto again;
359 }
360
361 if (desc.flags & VRING_DESC_F_NEXT) {
362 i = desc.next;
363 } else {
364 /* Just in case we need to finish traversing above. */
365 if (unlikely(up_next > 0)) {
366 i = return_from_indirect(vrh, &up_next,
367 &descs, &desc_max);
368 slow = false;
369 } else
370 break;
371 }
372
373 if (i >= desc_max) {
374 vringh_bad("Chained index %u > %u", i, desc_max);
375 err = -EINVAL;
376 goto fail;
377 }
378 }
379
380 return 0;
381
382fail:
383 return err;
384}
385
386static inline int __vringh_complete(struct vringh *vrh,
387 const struct vring_used_elem *used,
388 unsigned int num_used,
389 int (*putu16)(u16 *p, u16 val),
390 int (*putused)(struct vring_used_elem *dst,
391 const struct vring_used_elem
392 *src, unsigned num))
393{
394 struct vring_used *used_ring;
395 int err;
396 u16 used_idx, off;
397
398 used_ring = vrh->vring.used;
399 used_idx = vrh->last_used_idx + vrh->completed;
400
401 off = used_idx % vrh->vring.num;
402
403 /* Compiler knows num_used == 1 sometimes, hence extra check */
404 if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) {
405 u16 part = vrh->vring.num - off;
406 err = putused(&used_ring->ring[off], used, part);
407 if (!err)
408 err = putused(&used_ring->ring[0], used + part,
409 num_used - part);
410 } else
411 err = putused(&used_ring->ring[off], used, num_used);
412
413 if (err) {
414 vringh_bad("Failed to write %u used entries %u at %p",
415 num_used, off, &used_ring->ring[off]);
416 return err;
417 }
418
419 /* Make sure buffer is written before we update index. */
420 virtio_wmb(vrh->weak_barriers);
421
422 err = putu16(&vrh->vring.used->idx, used_idx + num_used);
423 if (err) {
424 vringh_bad("Failed to update used index at %p",
425 &vrh->vring.used->idx);
426 return err;
427 }
428
429 vrh->completed += num_used;
430 return 0;
431}
432
433
434static inline int __vringh_need_notify(struct vringh *vrh,
435 int (*getu16)(u16 *val, const u16 *p))
436{
437 bool notify;
438 u16 used_event;
439 int err;
440
441 /* Flush out used index update. This is paired with the
442 * barrier that the Guest executes when enabling
443 * interrupts. */
444 virtio_mb(vrh->weak_barriers);
445
446 /* Old-style, without event indices. */
447 if (!vrh->event_indices) {
448 u16 flags;
449 err = getu16(&flags, &vrh->vring.avail->flags);
450 if (err) {
451 vringh_bad("Failed to get flags at %p",
452 &vrh->vring.avail->flags);
453 return err;
454 }
455 return (!(flags & VRING_AVAIL_F_NO_INTERRUPT));
456 }
457
458 /* Modern: we know when other side wants to know. */
459 err = getu16(&used_event, &vring_used_event(&vrh->vring));
460 if (err) {
461 vringh_bad("Failed to get used event idx at %p",
462 &vring_used_event(&vrh->vring));
463 return err;
464 }
465
466 /* Just in case we added so many that we wrap. */
467 if (unlikely(vrh->completed > 0xffff))
468 notify = true;
469 else
470 notify = vring_need_event(used_event,
471 vrh->last_used_idx + vrh->completed,
472 vrh->last_used_idx);
473
474 vrh->last_used_idx += vrh->completed;
475 vrh->completed = 0;
476 return notify;
477}
478
479static inline bool __vringh_notify_enable(struct vringh *vrh,
480 int (*getu16)(u16 *val, const u16 *p),
481 int (*putu16)(u16 *p, u16 val))
482{
483 u16 avail;
484
485 if (!vrh->event_indices) {
486 /* Old-school; update flags. */
487 if (putu16(&vrh->vring.used->flags, 0) != 0) {
488 vringh_bad("Clearing used flags %p",
489 &vrh->vring.used->flags);
490 return true;
491 }
492 } else {
493 if (putu16(&vring_avail_event(&vrh->vring),
494 vrh->last_avail_idx) != 0) {
495 vringh_bad("Updating avail event index %p",
496 &vring_avail_event(&vrh->vring));
497 return true;
498 }
499 }
500
501 /* They could have slipped one in as we were doing that: make
502 * sure it's written, then check again. */
503 virtio_mb(vrh->weak_barriers);
504
505 if (getu16(&avail, &vrh->vring.avail->idx) != 0) {
506 vringh_bad("Failed to check avail idx at %p",
507 &vrh->vring.avail->idx);
508 return true;
509 }
510
511 /* This is unlikely, so we just leave notifications enabled
512 * (if we're using event_indices, we'll only get one
513 * notification anyway). */
514 return avail == vrh->last_avail_idx;
515}
516
517static inline void __vringh_notify_disable(struct vringh *vrh,
518 int (*putu16)(u16 *p, u16 val))
519{
520 if (!vrh->event_indices) {
521 /* Old-school; update flags. */
522 if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY)) {
523 vringh_bad("Setting used flags %p",
524 &vrh->vring.used->flags);
525 }
526 }
527}
528
529/* Userspace access helpers: in this case, addresses are really userspace. */
530static inline int getu16_user(u16 *val, const u16 *p)
531{
532 return get_user(*val, (__force u16 __user *)p);
533}
534
535static inline int putu16_user(u16 *p, u16 val)
536{
537 return put_user(val, (__force u16 __user *)p);
538}
539
540static inline int copydesc_user(void *dst, const void *src, size_t len)
541{
542 return copy_from_user(dst, (__force void __user *)src, len) ?
543 -EFAULT : 0;
544}
545
546static inline int putused_user(struct vring_used_elem *dst,
547 const struct vring_used_elem *src,
548 unsigned int num)
549{
550 return copy_to_user((__force void __user *)dst, src,
551 sizeof(*dst) * num) ? -EFAULT : 0;
552}
553
554static inline int xfer_from_user(void *src, void *dst, size_t len)
555{
556 return copy_from_user(dst, (__force void __user *)src, len) ?
557 -EFAULT : 0;
558}
559
560static inline int xfer_to_user(void *dst, void *src, size_t len)
561{
562 return copy_to_user((__force void __user *)dst, src, len) ?
563 -EFAULT : 0;
564}
565
566/**
567 * vringh_init_user - initialize a vringh for a userspace vring.
568 * @vrh: the vringh to initialize.
569 * @features: the feature bits for this ring.
570 * @num: the number of elements.
571 * @weak_barriers: true if we only need memory barriers, not I/O.
572 * @desc: the userspace descriptor pointer.
573 * @avail: the userspace avail pointer.
574 * @used: the userspace used pointer.
575 *
576 * Returns an error if num is invalid: you should check pointers
577 * yourself!
578 */
579int vringh_init_user(struct vringh *vrh, u32 features,
580 unsigned int num, bool weak_barriers,
581 struct vring_desc __user *desc,
582 struct vring_avail __user *avail,
583 struct vring_used __user *used)
584{
585 /* Sane power of 2 please! */
586 if (!num || num > 0xffff || (num & (num - 1))) {
587 vringh_bad("Bad ring size %u", num);
588 return -EINVAL;
589 }
590
591 vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
592 vrh->weak_barriers = weak_barriers;
593 vrh->completed = 0;
594 vrh->last_avail_idx = 0;
595 vrh->last_used_idx = 0;
596 vrh->vring.num = num;
597 /* vring expects kernel addresses, but only used via accessors. */
598 vrh->vring.desc = (__force struct vring_desc *)desc;
599 vrh->vring.avail = (__force struct vring_avail *)avail;
600 vrh->vring.used = (__force struct vring_used *)used;
601 return 0;
602}
603EXPORT_SYMBOL(vringh_init_user);
604
605/**
606 * vringh_getdesc_user - get next available descriptor from userspace ring.
607 * @vrh: the userspace vring.
608 * @riov: where to put the readable descriptors (or NULL)
609 * @wiov: where to put the writable descriptors (or NULL)
610 * @getrange: function to call to check ranges.
611 * @head: head index we received, for passing to vringh_complete_user().
612 *
613 * Returns 0 if there was no descriptor, 1 if there was, or -errno.
614 *
615 * Note that on error return, you can tell the difference between an
616 * invalid ring and a single invalid descriptor: in the former case,
617 * *head will be vrh->vring.num. You may be able to ignore an invalid
618 * descriptor, but there's not much you can do with an invalid ring.
619 *
620 * Note that you may need to clean up riov and wiov, even on error!
621 */
622int vringh_getdesc_user(struct vringh *vrh,
623 struct vringh_iov *riov,
624 struct vringh_iov *wiov,
625 bool (*getrange)(struct vringh *vrh,
626 u64 addr, struct vringh_range *r),
627 u16 *head)
628{
629 int err;
630
631 *head = vrh->vring.num;
632 err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx);
633 if (err < 0)
634 return err;
635
636 /* Empty... */
637 if (err == vrh->vring.num)
638 return 0;
639
640 /* We need the layouts to be identical for this to work */
641 BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov));
642 BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) !=
643 offsetof(struct vringh_iov, iov));
644 BUILD_BUG_ON(offsetof(struct vringh_kiov, i) !=
645 offsetof(struct vringh_iov, i));
646 BUILD_BUG_ON(offsetof(struct vringh_kiov, used) !=
647 offsetof(struct vringh_iov, used));
648 BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) !=
649 offsetof(struct vringh_iov, max_num));
650 BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
651 BUILD_BUG_ON(offsetof(struct iovec, iov_base) !=
652 offsetof(struct kvec, iov_base));
653 BUILD_BUG_ON(offsetof(struct iovec, iov_len) !=
654 offsetof(struct kvec, iov_len));
655 BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base)
656 != sizeof(((struct kvec *)NULL)->iov_base));
657 BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len)
658 != sizeof(((struct kvec *)NULL)->iov_len));
659
660 *head = err;
661 err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov,
662 (struct vringh_kiov *)wiov,
663 range_check, getrange, GFP_KERNEL, copydesc_user);
664 if (err)
665 return err;
666
667 return 1;
668}
669EXPORT_SYMBOL(vringh_getdesc_user);
670
671/**
672 * vringh_iov_pull_user - copy bytes from vring_iov.
673 * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume)
674 * @dst: the place to copy.
675 * @len: the maximum length to copy.
676 *
677 * Returns the bytes copied <= len or a negative errno.
678 */
679ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len)
680{
681 return vringh_iov_xfer((struct vringh_kiov *)riov,
682 dst, len, xfer_from_user);
683}
684EXPORT_SYMBOL(vringh_iov_pull_user);
685
686/**
687 * vringh_iov_push_user - copy bytes into vring_iov.
688 * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume)
690 * @src: the place to copy from.
690 * @len: the maximum length to copy.
691 *
692 * Returns the bytes copied <= len or a negative errno.
693 */
694ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
695 const void *src, size_t len)
696{
697 return vringh_iov_xfer((struct vringh_kiov *)wiov,
698 (void *)src, len, xfer_to_user);
699}
700EXPORT_SYMBOL(vringh_iov_push_user);
701
702/**
703 * vringh_abandon_user - we've decided not to handle the descriptor(s).
704 * @vrh: the vring.
705 * @num: the number of descriptors to put back (ie. num
706 * vringh_get_user() to undo).
707 *
708 * The next vringh_get_user() will return the old descriptor(s) again.
709 */
710void vringh_abandon_user(struct vringh *vrh, unsigned int num)
711{
712 /* We only update vring_avail_event(vr) when we want to be notified,
713 * so we haven't changed that yet. */
714 vrh->last_avail_idx -= num;
715}
716EXPORT_SYMBOL(vringh_abandon_user);
717
718/**
719 * vringh_complete_user - we've finished with descriptor, publish it.
720 * @vrh: the vring.
721 * @head: the head as filled in by vringh_getdesc_user.
722 * @len: the length of data we have written.
723 *
724 * You should check vringh_need_notify_user() after one or more calls
725 * to this function.
726 */
727int vringh_complete_user(struct vringh *vrh, u16 head, u32 len)
728{
729 struct vring_used_elem used;
730
731 used.id = head;
732 used.len = len;
733 return __vringh_complete(vrh, &used, 1, putu16_user, putused_user);
734}
735EXPORT_SYMBOL(vringh_complete_user);
736
737/**
738 * vringh_complete_multi_user - we've finished with many descriptors.
739 * @vrh: the vring.
740 * @used: the head, length pairs.
741 * @num_used: the number of used elements.
742 *
743 * You should check vringh_need_notify_user() after one or more calls
744 * to this function.
745 */
746int vringh_complete_multi_user(struct vringh *vrh,
747 const struct vring_used_elem used[],
748 unsigned num_used)
749{
750 return __vringh_complete(vrh, used, num_used,
751 putu16_user, putused_user);
752}
753EXPORT_SYMBOL(vringh_complete_multi_user);
754
755/**
756 * vringh_notify_enable_user - we want to know if something changes.
757 * @vrh: the vring.
758 *
759 * This always enables notifications, but returns false if there are
760 * now more buffers available in the vring.
761 */
762bool vringh_notify_enable_user(struct vringh *vrh)
763{
764 return __vringh_notify_enable(vrh, getu16_user, putu16_user);
765}
766EXPORT_SYMBOL(vringh_notify_enable_user);
767
768/**
769 * vringh_notify_disable_user - don't tell us if something changes.
770 * @vrh: the vring.
771 *
772 * This is our normal running state: we disable and then only enable when
773 * we're going to sleep.
774 */
775void vringh_notify_disable_user(struct vringh *vrh)
776{
777 __vringh_notify_disable(vrh, putu16_user);
778}
779EXPORT_SYMBOL(vringh_notify_disable_user);
780
781/**
782 * vringh_need_notify_user - must we tell the other side about used buffers?
783 * @vrh: the vring we've called vringh_complete_user() on.
784 *
785 * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
786 */
787int vringh_need_notify_user(struct vringh *vrh)
788{
789 return __vringh_need_notify(vrh, getu16_user);
790}
791EXPORT_SYMBOL(vringh_need_notify_user);
792
793/* Kernelspace access helpers. */
794static inline int getu16_kern(u16 *val, const u16 *p)
795{
796 *val = ACCESS_ONCE(*p);
797 return 0;
798}
799
800static inline int putu16_kern(u16 *p, u16 val)
801{
802 ACCESS_ONCE(*p) = val;
803 return 0;
804}
805
806static inline int copydesc_kern(void *dst, const void *src, size_t len)
807{
808 memcpy(dst, src, len);
809 return 0;
810}
811
812static inline int putused_kern(struct vring_used_elem *dst,
813 const struct vring_used_elem *src,
814 unsigned int num)
815{
816 memcpy(dst, src, num * sizeof(*dst));
817 return 0;
818}
819
820static inline int xfer_kern(void *src, void *dst, size_t len)
821{
822 memcpy(dst, src, len);
823 return 0;
824}
825
826/**
827 * vringh_init_kern - initialize a vringh for a kernelspace vring.
828 * @vrh: the vringh to initialize.
829 * @features: the feature bits for this ring.
830 * @num: the number of elements.
831 * @weak_barriers: true if we only need memory barriers, not I/O.
832 * @desc: the kernelspace descriptor pointer.
833 * @avail: the kernelspace avail pointer.
834 * @used: the kernelspace used pointer.
835 *
836 * Returns an error if num is invalid.
837 */
838int vringh_init_kern(struct vringh *vrh, u32 features,
839 unsigned int num, bool weak_barriers,
840 struct vring_desc *desc,
841 struct vring_avail *avail,
842 struct vring_used *used)
843{
844 /* Sane power of 2 please! */
845 if (!num || num > 0xffff || (num & (num - 1))) {
846 vringh_bad("Bad ring size %u", num);
847 return -EINVAL;
848 }
849
850 vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
851 vrh->weak_barriers = weak_barriers;
852 vrh->completed = 0;
853 vrh->last_avail_idx = 0;
854 vrh->last_used_idx = 0;
855 vrh->vring.num = num;
856 vrh->vring.desc = desc;
857 vrh->vring.avail = avail;
858 vrh->vring.used = used;
859 return 0;
860}
861EXPORT_SYMBOL(vringh_init_kern);
862
863/**
864 * vringh_getdesc_kern - get next available descriptor from kernelspace ring.
865 * @vrh: the kernelspace vring.
866 * @riov: where to put the readable descriptors (or NULL)
867 * @wiov: where to put the writable descriptors (or NULL)
868 * @head: head index we received, for passing to vringh_complete_kern().
869 * @gfp: flags for allocating larger riov/wiov.
870 *
871 * Returns 0 if there was no descriptor, 1 if there was, or -errno.
872 *
873 * Note that on error return, you can tell the difference between an
874 * invalid ring and a single invalid descriptor: in the former case,
875 * *head will be vrh->vring.num. You may be able to ignore an invalid
876 * descriptor, but there's not much you can do with an invalid ring.
877 *
878 * Note that you may need to clean up riov and wiov, even on error!
879 */
880int vringh_getdesc_kern(struct vringh *vrh,
881 struct vringh_kiov *riov,
882 struct vringh_kiov *wiov,
883 u16 *head,
884 gfp_t gfp)
885{
886 int err;
887
888 err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx);
889 if (err < 0)
890 return err;
891
892 /* Empty... */
893 if (err == vrh->vring.num)
894 return 0;
895
896 *head = err;
897 err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL,
898 gfp, copydesc_kern);
899 if (err)
900 return err;
901
902 return 1;
903}
904EXPORT_SYMBOL(vringh_getdesc_kern);
905
906/**
907 * vringh_iov_pull_kern - copy bytes from vring_iov.
908 * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume)
909 * @dst: the place to copy.
910 * @len: the maximum length to copy.
911 *
912 * Returns the bytes copied <= len or a negative errno.
913 */
914ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
915{
916 return vringh_iov_xfer(riov, dst, len, xfer_kern);
917}
918EXPORT_SYMBOL(vringh_iov_pull_kern);
919
920/**
921 * vringh_iov_push_kern - copy bytes into vring_iov.
922 * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
923 * @src: the place to copy from.
924 * @len: the maximum length to copy.
925 *
926 * Returns the bytes copied <= len or a negative errno.
927 */
928ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
929 const void *src, size_t len)
930{
931 return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern);
932}
933EXPORT_SYMBOL(vringh_iov_push_kern);
934
935/**
936 * vringh_abandon_kern - we've decided not to handle the descriptor(s).
937 * @vrh: the vring.
938 * @num: the number of descriptors to put back (ie. num
939 * vringh_get_kern() to undo).
940 *
941 * The next vringh_get_kern() will return the old descriptor(s) again.
942 */
943void vringh_abandon_kern(struct vringh *vrh, unsigned int num)
944{
945 /* We only update vring_avail_event(vr) when we want to be notified,
946 * so we haven't changed that yet. */
947 vrh->last_avail_idx -= num;
948}
949EXPORT_SYMBOL(vringh_abandon_kern);
950
951/**
952 * vringh_complete_kern - we've finished with descriptor, publish it.
953 * @vrh: the vring.
954 * @head: the head as filled in by vringh_getdesc_kern.
955 * @len: the length of data we have written.
956 *
957 * You should check vringh_need_notify_kern() after one or more calls
958 * to this function.
959 */
960int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len)
961{
962 struct vring_used_elem used;
963
964 used.id = head;
965 used.len = len;
966
967 return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern);
968}
969EXPORT_SYMBOL(vringh_complete_kern);
970
971/**
972 * vringh_notify_enable_kern - we want to know if something changes.
973 * @vrh: the vring.
974 *
975 * This always enables notifications, but returns false if there are
976 * now more buffers available in the vring.
977 */
978bool vringh_notify_enable_kern(struct vringh *vrh)
979{
980 return __vringh_notify_enable(vrh, getu16_kern, putu16_kern);
981}
982EXPORT_SYMBOL(vringh_notify_enable_kern);
983
984/**
985 * vringh_notify_disable_kern - don't tell us if something changes.
986 * @vrh: the vring.
987 *
988 * This is our normal running state: we disable and then only enable when
989 * we're going to sleep.
990 */
991void vringh_notify_disable_kern(struct vringh *vrh)
992{
993 __vringh_notify_disable(vrh, putu16_kern);
994}
995EXPORT_SYMBOL(vringh_notify_disable_kern);
996
997/**
998 * vringh_need_notify_kern - must we tell the other side about used buffers?
999 * @vrh: the vring we've called vringh_complete_kern() on.
1000 *
1001 * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
1002 */
1003int vringh_need_notify_kern(struct vringh *vrh)
1004{
1005 return __vringh_need_notify(vrh, getu16_kern);
1006}
1007EXPORT_SYMBOL(vringh_need_notify_kern);
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
new file mode 100644
index 000000000000..b8f086625c49
--- /dev/null
+++ b/include/linux/vringh.h
@@ -0,0 +1,196 @@
1/*
2 * Linux host-side vring helpers; for when the kernel needs to access
3 * someone else's vring.
4 *
5 * Copyright IBM Corporation, 2013.
6 * Parts taken from drivers/vhost/vhost.c Copyright 2009 Red Hat, Inc.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 *
22 * Written by: Rusty Russell <rusty@rustcorp.com.au>
23 */
24#ifndef _LINUX_VRINGH_H
25#define _LINUX_VRINGH_H
26#include <uapi/linux/virtio_ring.h>
27#include <linux/uio.h>
28#include <linux/slab.h>
29#include <asm/barrier.h>
30
31/* virtio_ring with information needed for host access. */
32struct vringh {
33 /* Guest publishes used event idx (note: we always do). */
34 bool event_indices;
35
36 /* Can we get away with weak barriers? */
37 bool weak_barriers;
38
39 /* Last available index we saw (ie. where we're up to). */
40 u16 last_avail_idx;
41
42 /* Last index we used. */
43 u16 last_used_idx;
44
45 /* How many descriptors we've completed since last need_notify(). */
46 u32 completed;
47
48 /* The vring (note: it may contain user pointers!) */
49 struct vring vring;
50};
51
52/* The memory the vring can access, and what offset to apply. */
53struct vringh_range {
54 u64 start, end_incl;
55 u64 offset;
56};
57
58/**
59 * struct vringh_iov - iovec mangler.
60 *
61 * Mangles iovec in place, and restores it.
62 * Remaining data is iov + i, of used - i elements.
63 */
64struct vringh_iov {
65 struct iovec *iov;
66 size_t consumed; /* Within iov[i] */
67 unsigned i, used, max_num;
68};
69
70/**
71 * struct vringh_kiov - kvec mangler.
72 *
73 * Mangles kvec in place, and restores it.
74 * Remaining data is iov + i, of used - i elements.
75 */
76struct vringh_kiov {
77 struct kvec *iov;
78 size_t consumed; /* Within iov[i] */
79 unsigned i, used, max_num;
80};
81
82/* Flag on max_num to indicate we're kmalloced. */
83#define VRINGH_IOV_ALLOCATED 0x8000000
84
85/* Helpers for userspace vrings. */
86int vringh_init_user(struct vringh *vrh, u32 features,
87 unsigned int num, bool weak_barriers,
88 struct vring_desc __user *desc,
89 struct vring_avail __user *avail,
90 struct vring_used __user *used);
91
92static inline void vringh_iov_init(struct vringh_iov *iov,
93 struct iovec *iovec, unsigned num)
94{
95 iov->used = iov->i = 0;
96 iov->consumed = 0;
97 iov->max_num = num;
98 iov->iov = iovec;
99}
100
101static inline void vringh_iov_reset(struct vringh_iov *iov)
102{
103 iov->iov[iov->i].iov_len += iov->consumed;
104 iov->iov[iov->i].iov_base -= iov->consumed;
105 iov->consumed = 0;
106 iov->i = 0;
107}
108
109static inline void vringh_iov_cleanup(struct vringh_iov *iov)
110{
111 if (iov->max_num & VRINGH_IOV_ALLOCATED)
112 kfree(iov->iov);
113 iov->max_num = iov->used = iov->i = iov->consumed = 0;
114 iov->iov = NULL;
115}
116
117/* Convert a descriptor into iovecs. */
118int vringh_getdesc_user(struct vringh *vrh,
119 struct vringh_iov *riov,
120 struct vringh_iov *wiov,
121 bool (*getrange)(struct vringh *vrh,
122 u64 addr, struct vringh_range *r),
123 u16 *head);
124
125/* Copy bytes from readable vsg, consuming it (and incrementing riov->i). */
126ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len);
127
128/* Copy bytes into writable vsg, consuming it (and incrementing wiov->i). */
129ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
130 const void *src, size_t len);
131
132/* Mark a descriptor as used. */
133int vringh_complete_user(struct vringh *vrh, u16 head, u32 len);
134int vringh_complete_multi_user(struct vringh *vrh,
135 const struct vring_used_elem used[],
136 unsigned num_used);
137
138/* Pretend we've never seen descriptor (for easy error handling). */
139void vringh_abandon_user(struct vringh *vrh, unsigned int num);
140
141/* Do we need to fire the eventfd to notify the other side? */
142int vringh_need_notify_user(struct vringh *vrh);
143
144bool vringh_notify_enable_user(struct vringh *vrh);
145void vringh_notify_disable_user(struct vringh *vrh);
146
147/* Helpers for kernelspace vrings. */
148int vringh_init_kern(struct vringh *vrh, u32 features,
149 unsigned int num, bool weak_barriers,
150 struct vring_desc *desc,
151 struct vring_avail *avail,
152 struct vring_used *used);
153
154static inline void vringh_kiov_init(struct vringh_kiov *kiov,
155 struct kvec *kvec, unsigned num)
156{
157 kiov->used = kiov->i = 0;
158 kiov->consumed = 0;
159 kiov->max_num = num;
160 kiov->iov = kvec;
161}
162
163static inline void vringh_kiov_reset(struct vringh_kiov *kiov)
164{
165 kiov->iov[kiov->i].iov_len += kiov->consumed;
166 kiov->iov[kiov->i].iov_base -= kiov->consumed;
167 kiov->consumed = 0;
168 kiov->i = 0;
169}
170
171static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov)
172{
173 if (kiov->max_num & VRINGH_IOV_ALLOCATED)
174 kfree(kiov->iov);
175 kiov->max_num = kiov->used = kiov->i = kiov->consumed = 0;
176 kiov->iov = NULL;
177}
178
179int vringh_getdesc_kern(struct vringh *vrh,
180 struct vringh_kiov *riov,
181 struct vringh_kiov *wiov,
182 u16 *head,
183 gfp_t gfp);
184
185ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len);
186ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
187 const void *src, size_t len);
188void vringh_abandon_kern(struct vringh *vrh, unsigned int num);
189int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len);
190
191bool vringh_notify_enable_kern(struct vringh *vrh);
192void vringh_notify_disable_kern(struct vringh *vrh);
193
194int vringh_need_notify_kern(struct vringh *vrh);
195
196#endif /* _LINUX_VRINGH_H */
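
For the userspace-vring variant declared above, the caller supplies a getrange callback that both validates a guest address and reports the offset to add to it to reach the corresponding __user pointer. A minimal sketch under the assumption of a single flat region (my_guest_base, my_size, my_user_base and my_kick() are hypothetical, not part of the patch):

/*
 * Illustrative sketch only: a hypothetical single-region getrange
 * callback plus a drain loop over a userspace vring.
 */
static u64 my_guest_base, my_size, my_user_base;	/* hypothetical mapping */
static void my_kick(void) { /* e.g. signal the guest's irqfd/eventfd */ }

static bool my_getrange(struct vringh *vrh, u64 addr, struct vringh_range *r)
{
	if (addr < my_guest_base || addr >= my_guest_base + my_size)
		return false;

	r->start = my_guest_base;
	r->end_incl = my_guest_base + my_size - 1;
	/* Added to a guest address to obtain the __user pointer. */
	r->offset = my_user_base - my_guest_base;
	return true;
}

static int my_drain(struct vringh *vrh)
{
	struct iovec riovec[8], wiovec[8];
	struct vringh_iov riov, wiov;
	u16 head;
	int err;

	vringh_iov_init(&riov, riovec, ARRAY_SIZE(riovec));
	vringh_iov_init(&wiov, wiovec, ARRAY_SIZE(wiovec));

	while ((err = vringh_getdesc_user(vrh, &riov, &wiov,
					  my_getrange, &head)) == 1) {
		/* Request bytes come from riov via vringh_iov_pull_user();
		 * reply bytes go back through wiov via vringh_iov_push_user(). */
		err = vringh_complete_user(vrh, head, 0);
		if (err)
			break;
	}

	vringh_iov_cleanup(&riov);
	vringh_iov_cleanup(&wiov);

	if (vringh_need_notify_user(vrh) > 0)
		my_kick();

	return err < 0 ? err : 0;
}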