diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2013-03-19 23:20:14 -0400 |
---|---|---|
committer | Rusty Russell <rusty@rustcorp.com.au> | 2013-03-19 23:35:33 -0400 |
commit | f87d0fbb579818fed3eeb0923cc253163ab93039 (patch) | |
tree | f33e06a6cb4eb4656e710f8ad70100e2130a32a5 /drivers/vhost | |
parent | 61d0b5a4b2777dcf5daef245e212b3c1fa8091ca (diff) |
vringh: host-side implementation of virtio rings.
Getting use of virtio rings correct is tricky, and a recent patch saw
an implementation of in-kernel rings (as separate from userspace).
This abstracts the business of dealing with the virtio ring layout
from the access (userspace or direct); to do this, we use function
pointers, which gcc inlines correctly.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Diffstat (limited to 'drivers/vhost')
-rw-r--r-- | drivers/vhost/Kconfig | 8 | ||||
-rw-r--r-- | drivers/vhost/Kconfig.tcm | 1 | ||||
-rw-r--r-- | drivers/vhost/Makefile | 2 | ||||
-rw-r--r-- | drivers/vhost/vringh.c | 1007 |
4 files changed, 1018 insertions, 0 deletions
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index bf243177ffe1..85b773a93a5d 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig | |||
@@ -1,6 +1,7 @@ | |||
1 | config VHOST_NET | 1 | config VHOST_NET |
2 | tristate "Host kernel accelerator for virtio net" | 2 | tristate "Host kernel accelerator for virtio net" |
3 | depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP) | 3 | depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP) |
4 | select VHOST_RING | ||
4 | ---help--- | 5 | ---help--- |
5 | This kernel module can be loaded in host kernel to accelerate | 6 | This kernel module can be loaded in host kernel to accelerate |
6 | guest networking with virtio_net. Not to be confused with virtio_net | 7 | guest networking with virtio_net. Not to be confused with virtio_net |
@@ -12,3 +13,10 @@ config VHOST_NET | |||
12 | if STAGING | 13 | if STAGING |
13 | source "drivers/vhost/Kconfig.tcm" | 14 | source "drivers/vhost/Kconfig.tcm" |
14 | endif | 15 | endif |
16 | |||
17 | config VHOST_RING | ||
18 | tristate | ||
19 | ---help--- | ||
20 | This option is selected by any driver which needs to access | ||
21 | the host side of a virtio ring. | ||
22 | |||
diff --git a/drivers/vhost/Kconfig.tcm b/drivers/vhost/Kconfig.tcm index 7e3aa28d999e..c3a8cfa1de72 100644 --- a/drivers/vhost/Kconfig.tcm +++ b/drivers/vhost/Kconfig.tcm | |||
@@ -1,6 +1,7 @@ | |||
1 | config TCM_VHOST | 1 | config TCM_VHOST |
2 | tristate "TCM_VHOST fabric module" | 2 | tristate "TCM_VHOST fabric module" |
3 | depends on TARGET_CORE && EVENTFD && m | 3 | depends on TARGET_CORE && EVENTFD && m |
4 | select VHOST_RING | ||
4 | default n | 5 | default n |
5 | ---help--- | 6 | ---help--- |
6 | Say M here to enable the TCM_VHOST fabric module for use with virtio-scsi guests | 7 | Say M here to enable the TCM_VHOST fabric module for use with virtio-scsi guests |
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index a27b053bc9ab..1d37f5e12be6 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile | |||
@@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o | |||
2 | vhost_net-y := vhost.o net.o | 2 | vhost_net-y := vhost.o net.o |
3 | 3 | ||
4 | obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o | 4 | obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o |
5 | |||
6 | obj-$(CONFIG_VHOST_RING) += vringh.o | ||
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c new file mode 100644 index 000000000000..bff0775e258c --- /dev/null +++ b/drivers/vhost/vringh.c | |||
@@ -0,0 +1,1007 @@ | |||
1 | /* | ||
2 | * Helpers for the host side of a virtio ring. | ||
3 | * | ||
4 | * Since these may be in userspace, we use (inline) accessors. | ||
5 | */ | ||
6 | #include <linux/vringh.h> | ||
7 | #include <linux/virtio_ring.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/ratelimit.h> | ||
10 | #include <linux/uaccess.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/export.h> | ||
13 | |||
14 | static __printf(1,2) __cold void vringh_bad(const char *fmt, ...) | ||
15 | { | ||
16 | static DEFINE_RATELIMIT_STATE(vringh_rs, | ||
17 | DEFAULT_RATELIMIT_INTERVAL, | ||
18 | DEFAULT_RATELIMIT_BURST); | ||
19 | if (__ratelimit(&vringh_rs)) { | ||
20 | va_list ap; | ||
21 | va_start(ap, fmt); | ||
22 | printk(KERN_NOTICE "vringh:"); | ||
23 | vprintk(fmt, ap); | ||
24 | va_end(ap); | ||
25 | } | ||
26 | } | ||
27 | |||
28 | /* Returns vring->num if empty, -ve on error. */ | ||
29 | static inline int __vringh_get_head(const struct vringh *vrh, | ||
30 | int (*getu16)(u16 *val, const u16 *p), | ||
31 | u16 *last_avail_idx) | ||
32 | { | ||
33 | u16 avail_idx, i, head; | ||
34 | int err; | ||
35 | |||
36 | err = getu16(&avail_idx, &vrh->vring.avail->idx); | ||
37 | if (err) { | ||
38 | vringh_bad("Failed to access avail idx at %p", | ||
39 | &vrh->vring.avail->idx); | ||
40 | return err; | ||
41 | } | ||
42 | |||
43 | if (*last_avail_idx == avail_idx) | ||
44 | return vrh->vring.num; | ||
45 | |||
46 | /* Only get avail ring entries after they have been exposed by guest. */ | ||
47 | virtio_rmb(vrh->weak_barriers); | ||
48 | |||
49 | i = *last_avail_idx & (vrh->vring.num - 1); | ||
50 | |||
51 | err = getu16(&head, &vrh->vring.avail->ring[i]); | ||
52 | if (err) { | ||
53 | vringh_bad("Failed to read head: idx %d address %p", | ||
54 | *last_avail_idx, &vrh->vring.avail->ring[i]); | ||
55 | return err; | ||
56 | } | ||
57 | |||
58 | if (head >= vrh->vring.num) { | ||
59 | vringh_bad("Guest says index %u > %u is available", | ||
60 | head, vrh->vring.num); | ||
61 | return -EINVAL; | ||
62 | } | ||
63 | |||
64 | (*last_avail_idx)++; | ||
65 | return head; | ||
66 | } | ||
67 | |||
68 | /* Copy some bytes to/from the iovec. Returns num copied. */ | ||
69 | static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov, | ||
70 | void *ptr, size_t len, | ||
71 | int (*xfer)(void *addr, void *ptr, | ||
72 | size_t len)) | ||
73 | { | ||
74 | int err, done = 0; | ||
75 | |||
76 | while (len && iov->i < iov->used) { | ||
77 | size_t partlen; | ||
78 | |||
79 | partlen = min(iov->iov[iov->i].iov_len, len); | ||
80 | err = xfer(iov->iov[iov->i].iov_base, ptr, partlen); | ||
81 | if (err) | ||
82 | return err; | ||
83 | done += partlen; | ||
84 | len -= partlen; | ||
85 | ptr += partlen; | ||
86 | iov->consumed += partlen; | ||
87 | iov->iov[iov->i].iov_len -= partlen; | ||
88 | iov->iov[iov->i].iov_base += partlen; | ||
89 | |||
90 | if (!iov->iov[iov->i].iov_len) { | ||
91 | /* Fix up old iov element then increment. */ | ||
92 | iov->iov[iov->i].iov_len = iov->consumed; | ||
93 | iov->iov[iov->i].iov_base -= iov->consumed; | ||
94 | |||
95 | iov->consumed = 0; | ||
96 | iov->i++; | ||
97 | } | ||
98 | } | ||
99 | return done; | ||
100 | } | ||
101 | |||
102 | /* May reduce *len if range is shorter. */ | ||
103 | static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len, | ||
104 | struct vringh_range *range, | ||
105 | bool (*getrange)(struct vringh *, | ||
106 | u64, struct vringh_range *)) | ||
107 | { | ||
108 | if (addr < range->start || addr > range->end_incl) { | ||
109 | if (!getrange(vrh, addr, range)) | ||
110 | return false; | ||
111 | } | ||
112 | BUG_ON(addr < range->start || addr > range->end_incl); | ||
113 | |||
114 | /* To end of memory? */ | ||
115 | if (unlikely(addr + *len == 0)) { | ||
116 | if (range->end_incl == -1ULL) | ||
117 | return true; | ||
118 | goto truncate; | ||
119 | } | ||
120 | |||
121 | /* Otherwise, don't wrap. */ | ||
122 | if (addr + *len < addr) { | ||
123 | vringh_bad("Wrapping descriptor %zu@0x%llx", | ||
124 | *len, (unsigned long long)addr); | ||
125 | return false; | ||
126 | } | ||
127 | |||
128 | if (unlikely(addr + *len - 1 > range->end_incl)) | ||
129 | goto truncate; | ||
130 | return true; | ||
131 | |||
132 | truncate: | ||
133 | *len = range->end_incl + 1 - addr; | ||
134 | return true; | ||
135 | } | ||
136 | |||
137 | static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len, | ||
138 | struct vringh_range *range, | ||
139 | bool (*getrange)(struct vringh *, | ||
140 | u64, struct vringh_range *)) | ||
141 | { | ||
142 | return true; | ||
143 | } | ||
144 | |||
145 | /* No reason for this code to be inline. */ | ||
146 | static int move_to_indirect(int *up_next, u16 *i, void *addr, | ||
147 | const struct vring_desc *desc, | ||
148 | struct vring_desc **descs, int *desc_max) | ||
149 | { | ||
150 | /* Indirect tables can't have indirect. */ | ||
151 | if (*up_next != -1) { | ||
152 | vringh_bad("Multilevel indirect %u->%u", *up_next, *i); | ||
153 | return -EINVAL; | ||
154 | } | ||
155 | |||
156 | if (unlikely(desc->len % sizeof(struct vring_desc))) { | ||
157 | vringh_bad("Strange indirect len %u", desc->len); | ||
158 | return -EINVAL; | ||
159 | } | ||
160 | |||
161 | /* We will check this when we follow it! */ | ||
162 | if (desc->flags & VRING_DESC_F_NEXT) | ||
163 | *up_next = desc->next; | ||
164 | else | ||
165 | *up_next = -2; | ||
166 | *descs = addr; | ||
167 | *desc_max = desc->len / sizeof(struct vring_desc); | ||
168 | |||
169 | /* Now, start at the first indirect. */ | ||
170 | *i = 0; | ||
171 | return 0; | ||
172 | } | ||
173 | |||
174 | static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp) | ||
175 | { | ||
176 | struct kvec *new; | ||
177 | unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2; | ||
178 | |||
179 | if (new_num < 8) | ||
180 | new_num = 8; | ||
181 | |||
182 | flag = (iov->max_num & VRINGH_IOV_ALLOCATED); | ||
183 | if (flag) | ||
184 | new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp); | ||
185 | else { | ||
186 | new = kmalloc(new_num * sizeof(struct iovec), gfp); | ||
187 | if (new) { | ||
188 | memcpy(new, iov->iov, | ||
189 | iov->max_num * sizeof(struct iovec)); | ||
190 | flag = VRINGH_IOV_ALLOCATED; | ||
191 | } | ||
192 | } | ||
193 | if (!new) | ||
194 | return -ENOMEM; | ||
195 | iov->iov = new; | ||
196 | iov->max_num = (new_num | flag); | ||
197 | return 0; | ||
198 | } | ||
199 | |||
200 | static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next, | ||
201 | struct vring_desc **descs, int *desc_max) | ||
202 | { | ||
203 | u16 i = *up_next; | ||
204 | |||
205 | *up_next = -1; | ||
206 | *descs = vrh->vring.desc; | ||
207 | *desc_max = vrh->vring.num; | ||
208 | return i; | ||
209 | } | ||
210 | |||
211 | static int slow_copy(struct vringh *vrh, void *dst, const void *src, | ||
212 | bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, | ||
213 | struct vringh_range *range, | ||
214 | bool (*getrange)(struct vringh *vrh, | ||
215 | u64, | ||
216 | struct vringh_range *)), | ||
217 | bool (*getrange)(struct vringh *vrh, | ||
218 | u64 addr, | ||
219 | struct vringh_range *r), | ||
220 | struct vringh_range *range, | ||
221 | int (*copy)(void *dst, const void *src, size_t len)) | ||
222 | { | ||
223 | size_t part, len = sizeof(struct vring_desc); | ||
224 | |||
225 | do { | ||
226 | u64 addr; | ||
227 | int err; | ||
228 | |||
229 | part = len; | ||
230 | addr = (u64)(unsigned long)src - range->offset; | ||
231 | |||
232 | if (!rcheck(vrh, addr, &part, range, getrange)) | ||
233 | return -EINVAL; | ||
234 | |||
235 | err = copy(dst, src, part); | ||
236 | if (err) | ||
237 | return err; | ||
238 | |||
239 | dst += part; | ||
240 | src += part; | ||
241 | len -= part; | ||
242 | } while (len); | ||
243 | return 0; | ||
244 | } | ||
245 | |||
246 | static inline int | ||
247 | __vringh_iov(struct vringh *vrh, u16 i, | ||
248 | struct vringh_kiov *riov, | ||
249 | struct vringh_kiov *wiov, | ||
250 | bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, | ||
251 | struct vringh_range *range, | ||
252 | bool (*getrange)(struct vringh *, u64, | ||
253 | struct vringh_range *)), | ||
254 | bool (*getrange)(struct vringh *, u64, struct vringh_range *), | ||
255 | gfp_t gfp, | ||
256 | int (*copy)(void *dst, const void *src, size_t len)) | ||
257 | { | ||
258 | int err, count = 0, up_next, desc_max; | ||
259 | struct vring_desc desc, *descs; | ||
260 | struct vringh_range range = { -1ULL, 0 }, slowrange; | ||
261 | bool slow = false; | ||
262 | |||
263 | /* We start traversing vring's descriptor table. */ | ||
264 | descs = vrh->vring.desc; | ||
265 | desc_max = vrh->vring.num; | ||
266 | up_next = -1; | ||
267 | |||
268 | if (riov) | ||
269 | riov->i = riov->used = 0; | ||
270 | else if (wiov) | ||
271 | wiov->i = wiov->used = 0; | ||
272 | else | ||
273 | /* You must want something! */ | ||
274 | BUG(); | ||
275 | |||
276 | for (;;) { | ||
277 | void *addr; | ||
278 | struct vringh_kiov *iov; | ||
279 | size_t len; | ||
280 | |||
281 | if (unlikely(slow)) | ||
282 | err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange, | ||
283 | &slowrange, copy); | ||
284 | else | ||
285 | err = copy(&desc, &descs[i], sizeof(desc)); | ||
286 | if (unlikely(err)) | ||
287 | goto fail; | ||
288 | |||
289 | if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) { | ||
290 | /* Make sure it's OK, and get offset. */ | ||
291 | len = desc.len; | ||
292 | if (!rcheck(vrh, desc.addr, &len, &range, getrange)) { | ||
293 | err = -EINVAL; | ||
294 | goto fail; | ||
295 | } | ||
296 | |||
297 | if (unlikely(len != desc.len)) { | ||
298 | slow = true; | ||
299 | /* We need to save this range to use offset */ | ||
300 | slowrange = range; | ||
301 | } | ||
302 | |||
303 | addr = (void *)(long)(desc.addr + range.offset); | ||
304 | err = move_to_indirect(&up_next, &i, addr, &desc, | ||
305 | &descs, &desc_max); | ||
306 | if (err) | ||
307 | goto fail; | ||
308 | continue; | ||
309 | } | ||
310 | |||
311 | if (count++ == vrh->vring.num) { | ||
312 | vringh_bad("Descriptor loop in %p", descs); | ||
313 | err = -ELOOP; | ||
314 | goto fail; | ||
315 | } | ||
316 | |||
317 | if (desc.flags & VRING_DESC_F_WRITE) | ||
318 | iov = wiov; | ||
319 | else { | ||
320 | iov = riov; | ||
321 | if (unlikely(wiov && wiov->i)) { | ||
322 | vringh_bad("Readable desc %p after writable", | ||
323 | &descs[i]); | ||
324 | err = -EINVAL; | ||
325 | goto fail; | ||
326 | } | ||
327 | } | ||
328 | |||
329 | if (!iov) { | ||
330 | vringh_bad("Unexpected %s desc", | ||
331 | !wiov ? "writable" : "readable"); | ||
332 | err = -EPROTO; | ||
333 | goto fail; | ||
334 | } | ||
335 | |||
336 | again: | ||
337 | /* Make sure it's OK, and get offset. */ | ||
338 | len = desc.len; | ||
339 | if (!rcheck(vrh, desc.addr, &len, &range, getrange)) { | ||
340 | err = -EINVAL; | ||
341 | goto fail; | ||
342 | } | ||
343 | addr = (void *)(unsigned long)(desc.addr + range.offset); | ||
344 | |||
345 | if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) { | ||
346 | err = resize_iovec(iov, gfp); | ||
347 | if (err) | ||
348 | goto fail; | ||
349 | } | ||
350 | |||
351 | iov->iov[iov->used].iov_base = addr; | ||
352 | iov->iov[iov->used].iov_len = len; | ||
353 | iov->used++; | ||
354 | |||
355 | if (unlikely(len != desc.len)) { | ||
356 | desc.len -= len; | ||
357 | desc.addr += len; | ||
358 | goto again; | ||
359 | } | ||
360 | |||
361 | if (desc.flags & VRING_DESC_F_NEXT) { | ||
362 | i = desc.next; | ||
363 | } else { | ||
364 | /* Just in case we need to finish traversing above. */ | ||
365 | if (unlikely(up_next > 0)) { | ||
366 | i = return_from_indirect(vrh, &up_next, | ||
367 | &descs, &desc_max); | ||
368 | slow = false; | ||
369 | } else | ||
370 | break; | ||
371 | } | ||
372 | |||
373 | if (i >= desc_max) { | ||
374 | vringh_bad("Chained index %u > %u", i, desc_max); | ||
375 | err = -EINVAL; | ||
376 | goto fail; | ||
377 | } | ||
378 | } | ||
379 | |||
380 | return 0; | ||
381 | |||
382 | fail: | ||
383 | return err; | ||
384 | } | ||
385 | |||
386 | static inline int __vringh_complete(struct vringh *vrh, | ||
387 | const struct vring_used_elem *used, | ||
388 | unsigned int num_used, | ||
389 | int (*putu16)(u16 *p, u16 val), | ||
390 | int (*putused)(struct vring_used_elem *dst, | ||
391 | const struct vring_used_elem | ||
392 | *src, unsigned num)) | ||
393 | { | ||
394 | struct vring_used *used_ring; | ||
395 | int err; | ||
396 | u16 used_idx, off; | ||
397 | |||
398 | used_ring = vrh->vring.used; | ||
399 | used_idx = vrh->last_used_idx + vrh->completed; | ||
400 | |||
401 | off = used_idx % vrh->vring.num; | ||
402 | |||
403 | /* Compiler knows num_used == 1 sometimes, hence extra check */ | ||
404 | if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) { | ||
405 | u16 part = vrh->vring.num - off; | ||
406 | err = putused(&used_ring->ring[off], used, part); | ||
407 | if (!err) | ||
408 | err = putused(&used_ring->ring[0], used + part, | ||
409 | num_used - part); | ||
410 | } else | ||
411 | err = putused(&used_ring->ring[off], used, num_used); | ||
412 | |||
413 | if (err) { | ||
414 | vringh_bad("Failed to write %u used entries %u at %p", | ||
415 | num_used, off, &used_ring->ring[off]); | ||
416 | return err; | ||
417 | } | ||
418 | |||
419 | /* Make sure buffer is written before we update index. */ | ||
420 | virtio_wmb(vrh->weak_barriers); | ||
421 | |||
422 | err = putu16(&vrh->vring.used->idx, used_idx + num_used); | ||
423 | if (err) { | ||
424 | vringh_bad("Failed to update used index at %p", | ||
425 | &vrh->vring.used->idx); | ||
426 | return err; | ||
427 | } | ||
428 | |||
429 | vrh->completed += num_used; | ||
430 | return 0; | ||
431 | } | ||
432 | |||
433 | |||
434 | static inline int __vringh_need_notify(struct vringh *vrh, | ||
435 | int (*getu16)(u16 *val, const u16 *p)) | ||
436 | { | ||
437 | bool notify; | ||
438 | u16 used_event; | ||
439 | int err; | ||
440 | |||
441 | /* Flush out used index update. This is paired with the | ||
442 | * barrier that the Guest executes when enabling | ||
443 | * interrupts. */ | ||
444 | virtio_mb(vrh->weak_barriers); | ||
445 | |||
446 | /* Old-style, without event indices. */ | ||
447 | if (!vrh->event_indices) { | ||
448 | u16 flags; | ||
449 | err = getu16(&flags, &vrh->vring.avail->flags); | ||
450 | if (err) { | ||
451 | vringh_bad("Failed to get flags at %p", | ||
452 | &vrh->vring.avail->flags); | ||
453 | return err; | ||
454 | } | ||
455 | return (!(flags & VRING_AVAIL_F_NO_INTERRUPT)); | ||
456 | } | ||
457 | |||
458 | /* Modern: we know when other side wants to know. */ | ||
459 | err = getu16(&used_event, &vring_used_event(&vrh->vring)); | ||
460 | if (err) { | ||
461 | vringh_bad("Failed to get used event idx at %p", | ||
462 | &vring_used_event(&vrh->vring)); | ||
463 | return err; | ||
464 | } | ||
465 | |||
466 | /* Just in case we added so many that we wrap. */ | ||
467 | if (unlikely(vrh->completed > 0xffff)) | ||
468 | notify = true; | ||
469 | else | ||
470 | notify = vring_need_event(used_event, | ||
471 | vrh->last_used_idx + vrh->completed, | ||
472 | vrh->last_used_idx); | ||
473 | |||
474 | vrh->last_used_idx += vrh->completed; | ||
475 | vrh->completed = 0; | ||
476 | return notify; | ||
477 | } | ||
478 | |||
479 | static inline bool __vringh_notify_enable(struct vringh *vrh, | ||
480 | int (*getu16)(u16 *val, const u16 *p), | ||
481 | int (*putu16)(u16 *p, u16 val)) | ||
482 | { | ||
483 | u16 avail; | ||
484 | |||
485 | if (!vrh->event_indices) { | ||
486 | /* Old-school; update flags. */ | ||
487 | if (putu16(&vrh->vring.used->flags, 0) != 0) { | ||
488 | vringh_bad("Clearing used flags %p", | ||
489 | &vrh->vring.used->flags); | ||
490 | return true; | ||
491 | } | ||
492 | } else { | ||
493 | if (putu16(&vring_avail_event(&vrh->vring), | ||
494 | vrh->last_avail_idx) != 0) { | ||
495 | vringh_bad("Updating avail event index %p", | ||
496 | &vring_avail_event(&vrh->vring)); | ||
497 | return true; | ||
498 | } | ||
499 | } | ||
500 | |||
501 | /* They could have slipped one in as we were doing that: make | ||
502 | * sure it's written, then check again. */ | ||
503 | virtio_mb(vrh->weak_barriers); | ||
504 | |||
505 | if (getu16(&avail, &vrh->vring.avail->idx) != 0) { | ||
506 | vringh_bad("Failed to check avail idx at %p", | ||
507 | &vrh->vring.avail->idx); | ||
508 | return true; | ||
509 | } | ||
510 | |||
511 | /* This is unlikely, so we just leave notifications enabled | ||
512 | * (if we're using event_indices, we'll only get one | ||
513 | * notification anyway). */ | ||
514 | return avail == vrh->last_avail_idx; | ||
515 | } | ||
516 | |||
517 | static inline void __vringh_notify_disable(struct vringh *vrh, | ||
518 | int (*putu16)(u16 *p, u16 val)) | ||
519 | { | ||
520 | if (!vrh->event_indices) { | ||
521 | /* Old-school; update flags. */ | ||
522 | if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY)) { | ||
523 | vringh_bad("Setting used flags %p", | ||
524 | &vrh->vring.used->flags); | ||
525 | } | ||
526 | } | ||
527 | } | ||
528 | |||
529 | /* Userspace access helpers: in this case, addresses are really userspace. */ | ||
530 | static inline int getu16_user(u16 *val, const u16 *p) | ||
531 | { | ||
532 | return get_user(*val, (__force u16 __user *)p); | ||
533 | } | ||
534 | |||
535 | static inline int putu16_user(u16 *p, u16 val) | ||
536 | { | ||
537 | return put_user(val, (__force u16 __user *)p); | ||
538 | } | ||
539 | |||
540 | static inline int copydesc_user(void *dst, const void *src, size_t len) | ||
541 | { | ||
542 | return copy_from_user(dst, (__force void __user *)src, len) ? | ||
543 | -EFAULT : 0; | ||
544 | } | ||
545 | |||
546 | static inline int putused_user(struct vring_used_elem *dst, | ||
547 | const struct vring_used_elem *src, | ||
548 | unsigned int num) | ||
549 | { | ||
550 | return copy_to_user((__force void __user *)dst, src, | ||
551 | sizeof(*dst) * num) ? -EFAULT : 0; | ||
552 | } | ||
553 | |||
554 | static inline int xfer_from_user(void *src, void *dst, size_t len) | ||
555 | { | ||
556 | return copy_from_user(dst, (__force void __user *)src, len) ? | ||
557 | -EFAULT : 0; | ||
558 | } | ||
559 | |||
560 | static inline int xfer_to_user(void *dst, void *src, size_t len) | ||
561 | { | ||
562 | return copy_to_user((__force void __user *)dst, src, len) ? | ||
563 | -EFAULT : 0; | ||
564 | } | ||
565 | |||
566 | /** | ||
567 | * vringh_init_user - initialize a vringh for a userspace vring. | ||
568 | * @vrh: the vringh to initialize. | ||
569 | * @features: the feature bits for this ring. | ||
570 | * @num: the number of elements. | ||
571 | * @weak_barriers: true if we only need memory barriers, not I/O. | ||
572 | * @desc: the userspace descriptor pointer. | |
573 | * @avail: the userspace avail pointer. | |
574 | * @used: the userspace used pointer. | |
575 | * | ||
576 | * Returns an error if num is invalid: you should check pointers | ||
577 | * yourself! | ||
578 | */ | ||
579 | int vringh_init_user(struct vringh *vrh, u32 features, | ||
580 | unsigned int num, bool weak_barriers, | ||
581 | struct vring_desc __user *desc, | ||
582 | struct vring_avail __user *avail, | ||
583 | struct vring_used __user *used) | ||
584 | { | ||
585 | /* Sane power of 2 please! */ | ||
586 | if (!num || num > 0xffff || (num & (num - 1))) { | ||
587 | vringh_bad("Bad ring size %u", num); | ||
588 | return -EINVAL; | ||
589 | } | ||
590 | |||
591 | vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); | ||
592 | vrh->weak_barriers = weak_barriers; | ||
593 | vrh->completed = 0; | ||
594 | vrh->last_avail_idx = 0; | ||
595 | vrh->last_used_idx = 0; | ||
596 | vrh->vring.num = num; | ||
597 | /* vring expects kernel addresses, but only used via accessors. */ | ||
598 | vrh->vring.desc = (__force struct vring_desc *)desc; | ||
599 | vrh->vring.avail = (__force struct vring_avail *)avail; | ||
600 | vrh->vring.used = (__force struct vring_used *)used; | ||
601 | return 0; | ||
602 | } | ||
603 | EXPORT_SYMBOL(vringh_init_user); | ||
604 | |||
605 | /** | ||
606 | * vringh_getdesc_user - get next available descriptor from userspace ring. | ||
607 | * @vrh: the userspace vring. | ||
608 | * @riov: where to put the readable descriptors (or NULL) | ||
609 | * @wiov: where to put the writable descriptors (or NULL) | ||
610 | * @getrange: function to call to check ranges. | ||
611 | * @head: head index we received, for passing to vringh_complete_user(). | ||
612 | * | ||
613 | * Returns 0 if there was no descriptor, 1 if there was, or -errno. | ||
614 | * | ||
615 | * Note that on error return, you can tell the difference between an | ||
616 | * invalid ring and a single invalid descriptor: in the former case, | ||
617 | * *head will be vrh->vring.num. You may be able to ignore an invalid | ||
618 | * descriptor, but there's not much you can do with an invalid ring. | ||
619 | * | ||
620 | * Note that you may need to clean up riov and wiov, even on error! | ||
621 | */ | ||
622 | int vringh_getdesc_user(struct vringh *vrh, | ||
623 | struct vringh_iov *riov, | ||
624 | struct vringh_iov *wiov, | ||
625 | bool (*getrange)(struct vringh *vrh, | ||
626 | u64 addr, struct vringh_range *r), | ||
627 | u16 *head) | ||
628 | { | ||
629 | int err; | ||
630 | |||
631 | *head = vrh->vring.num; | ||
632 | err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx); | ||
633 | if (err < 0) | ||
634 | return err; | ||
635 | |||
636 | /* Empty... */ | ||
637 | if (err == vrh->vring.num) | ||
638 | return 0; | ||
639 | |||
640 | /* We need the layouts to be identical for this to work */ | |
641 | BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov)); | ||
642 | BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) != | ||
643 | offsetof(struct vringh_iov, iov)); | ||
644 | BUILD_BUG_ON(offsetof(struct vringh_kiov, i) != | ||
645 | offsetof(struct vringh_iov, i)); | ||
646 | BUILD_BUG_ON(offsetof(struct vringh_kiov, used) != | ||
647 | offsetof(struct vringh_iov, used)); | ||
648 | BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) != | ||
649 | offsetof(struct vringh_iov, max_num)); | ||
650 | BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); | ||
651 | BUILD_BUG_ON(offsetof(struct iovec, iov_base) != | ||
652 | offsetof(struct kvec, iov_base)); | ||
653 | BUILD_BUG_ON(offsetof(struct iovec, iov_len) != | ||
654 | offsetof(struct kvec, iov_len)); | ||
655 | BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base) | ||
656 | != sizeof(((struct kvec *)NULL)->iov_base)); | ||
657 | BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len) | ||
658 | != sizeof(((struct kvec *)NULL)->iov_len)); | ||
659 | |||
660 | *head = err; | ||
661 | err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov, | ||
662 | (struct vringh_kiov *)wiov, | ||
663 | range_check, getrange, GFP_KERNEL, copydesc_user); | ||
664 | if (err) | ||
665 | return err; | ||
666 | |||
667 | return 1; | ||
668 | } | ||
669 | EXPORT_SYMBOL(vringh_getdesc_user); | ||
670 | |||
671 | /** | ||
672 | * vringh_iov_pull_user - copy bytes from vringh_iov. | |
673 | * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume) | ||
674 | * @dst: the place to copy. | ||
675 | * @len: the maximum length to copy. | ||
676 | * | ||
677 | * Returns the bytes copied <= len or a negative errno. | ||
678 | */ | ||
679 | ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len) | ||
680 | { | ||
681 | return vringh_iov_xfer((struct vringh_kiov *)riov, | ||
682 | dst, len, xfer_from_user); | ||
683 | } | ||
684 | EXPORT_SYMBOL(vringh_iov_pull_user); | ||
685 | |||
686 | /** | ||
687 | * vringh_iov_push_user - copy bytes into vringh_iov. | |
688 | * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume) | ||
689 | * @src: the place to copy from. | |
690 | * @len: the maximum length to copy. | ||
691 | * | ||
692 | * Returns the bytes copied <= len or a negative errno. | ||
693 | */ | ||
694 | ssize_t vringh_iov_push_user(struct vringh_iov *wiov, | ||
695 | const void *src, size_t len) | ||
696 | { | ||
697 | return vringh_iov_xfer((struct vringh_kiov *)wiov, | ||
698 | (void *)src, len, xfer_to_user); | ||
699 | } | ||
700 | EXPORT_SYMBOL(vringh_iov_push_user); | ||
701 | |||
702 | /** | ||
703 | * vringh_abandon_user - we've decided not to handle the descriptor(s). | ||
704 | * @vrh: the vring. | ||
705 | * @num: the number of descriptors to put back (ie. num | |
706 | * vringh_getdesc_user() to undo). | |
707 | * | |
708 | * The next vringh_getdesc_user() will return the old descriptor(s) again. | |
709 | */ | ||
710 | void vringh_abandon_user(struct vringh *vrh, unsigned int num) | ||
711 | { | ||
712 | /* We only update vring_avail_event(vr) when we want to be notified, | ||
713 | * so we haven't changed that yet. */ | ||
714 | vrh->last_avail_idx -= num; | ||
715 | } | ||
716 | EXPORT_SYMBOL(vringh_abandon_user); | ||
717 | |||
718 | /** | ||
719 | * vringh_complete_user - we've finished with descriptor, publish it. | ||
720 | * @vrh: the vring. | ||
721 | * @head: the head as filled in by vringh_getdesc_user. | ||
722 | * @len: the length of data we have written. | ||
723 | * | ||
724 | * You should check vringh_need_notify_user() after one or more calls | ||
725 | * to this function. | ||
726 | */ | ||
727 | int vringh_complete_user(struct vringh *vrh, u16 head, u32 len) | ||
728 | { | ||
729 | struct vring_used_elem used; | ||
730 | |||
731 | used.id = head; | ||
732 | used.len = len; | ||
733 | return __vringh_complete(vrh, &used, 1, putu16_user, putused_user); | ||
734 | } | ||
735 | EXPORT_SYMBOL(vringh_complete_user); | ||
736 | |||
737 | /** | ||
738 | * vringh_complete_multi_user - we've finished with many descriptors. | ||
739 | * @vrh: the vring. | ||
740 | * @used: the head, length pairs. | ||
741 | * @num_used: the number of used elements. | ||
742 | * | ||
743 | * You should check vringh_need_notify_user() after one or more calls | ||
744 | * to this function. | ||
745 | */ | ||
746 | int vringh_complete_multi_user(struct vringh *vrh, | ||
747 | const struct vring_used_elem used[], | ||
748 | unsigned num_used) | ||
749 | { | ||
750 | return __vringh_complete(vrh, used, num_used, | ||
751 | putu16_user, putused_user); | ||
752 | } | ||
753 | EXPORT_SYMBOL(vringh_complete_multi_user); | ||
754 | |||
755 | /** | ||
756 | * vringh_notify_enable_user - we want to know if something changes. | ||
757 | * @vrh: the vring. | ||
758 | * | ||
759 | * This always enables notifications, but returns false if there are | ||
760 | * now more buffers available in the vring. | ||
761 | */ | ||
762 | bool vringh_notify_enable_user(struct vringh *vrh) | ||
763 | { | ||
764 | return __vringh_notify_enable(vrh, getu16_user, putu16_user); | ||
765 | } | ||
766 | EXPORT_SYMBOL(vringh_notify_enable_user); | ||
767 | |||
768 | /** | ||
769 | * vringh_notify_disable_user - don't tell us if something changes. | ||
770 | * @vrh: the vring. | ||
771 | * | ||
772 | * This is our normal running state: we disable and then only enable when | ||
773 | * we're going to sleep. | ||
774 | */ | ||
775 | void vringh_notify_disable_user(struct vringh *vrh) | ||
776 | { | ||
777 | __vringh_notify_disable(vrh, putu16_user); | ||
778 | } | ||
779 | EXPORT_SYMBOL(vringh_notify_disable_user); | ||
780 | |||
781 | /** | ||
782 | * vringh_need_notify_user - must we tell the other side about used buffers? | ||
783 | * @vrh: the vring we've called vringh_complete_user() on. | ||
784 | * | ||
785 | * Returns -errno or 0 if we don't need to tell the other side, 1 if we do. | ||
786 | */ | ||
787 | int vringh_need_notify_user(struct vringh *vrh) | ||
788 | { | ||
789 | return __vringh_need_notify(vrh, getu16_user); | ||
790 | } | ||
791 | EXPORT_SYMBOL(vringh_need_notify_user); | ||
792 | |||
793 | /* Kernelspace access helpers. */ | ||
794 | static inline int getu16_kern(u16 *val, const u16 *p) | ||
795 | { | ||
796 | *val = ACCESS_ONCE(*p); | ||
797 | return 0; | ||
798 | } | ||
799 | |||
800 | static inline int putu16_kern(u16 *p, u16 val) | ||
801 | { | ||
802 | ACCESS_ONCE(*p) = val; | ||
803 | return 0; | ||
804 | } | ||
805 | |||
/*
 * Copy @len bytes of descriptor data from @src to @dst with a plain
 * memcpy().  Always returns 0, so it fits callers that expect an
 * int-returning copy callback.
 */
static inline int copydesc_kern(void *dst, const void *src, size_t len)
{
	(void)memcpy(dst, src, len);

	return 0;
}
811 | |||
812 | static inline int putused_kern(struct vring_used_elem *dst, | ||
813 | const struct vring_used_elem *src, | ||
814 | unsigned int num) | ||
815 | { | ||
816 | memcpy(dst, src, num * sizeof(*dst)); | ||
817 | return 0; | ||
818 | } | ||
819 | |||
/*
 * Transfer callback: copy @len bytes from @src (first argument) to @dst
 * (second argument).  Note the argument order is the reverse of memcpy().
 * Always returns 0.
 */
static inline int xfer_kern(void *src, void *dst, size_t len)
{
	(void)memcpy(dst, src, len);

	return 0;
}
825 | |||
826 | /** | ||
827 | * vringh_init_kern - initialize a vringh for a kernelspace vring. | ||
828 | * @vrh: the vringh to initialize. | ||
829 | * @features: the feature bits for this ring. | ||
830 | * @num: the number of elements. | ||
831 | * @weak_barriers: true if we only need memory barriers, not I/O. | ||
832 | * @desc: the userpace descriptor pointer. | ||
833 | * @avail: the userpace avail pointer. | ||
834 | * @used: the userpace used pointer. | ||
835 | * | ||
836 | * Returns an error if num is invalid. | ||
837 | */ | ||
838 | int vringh_init_kern(struct vringh *vrh, u32 features, | ||
839 | unsigned int num, bool weak_barriers, | ||
840 | struct vring_desc *desc, | ||
841 | struct vring_avail *avail, | ||
842 | struct vring_used *used) | ||
843 | { | ||
844 | /* Sane power of 2 please! */ | ||
845 | if (!num || num > 0xffff || (num & (num - 1))) { | ||
846 | vringh_bad("Bad ring size %u", num); | ||
847 | return -EINVAL; | ||
848 | } | ||
849 | |||
850 | vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); | ||
851 | vrh->weak_barriers = weak_barriers; | ||
852 | vrh->completed = 0; | ||
853 | vrh->last_avail_idx = 0; | ||
854 | vrh->last_used_idx = 0; | ||
855 | vrh->vring.num = num; | ||
856 | vrh->vring.desc = desc; | ||
857 | vrh->vring.avail = avail; | ||
858 | vrh->vring.used = used; | ||
859 | return 0; | ||
860 | } | ||
861 | EXPORT_SYMBOL(vringh_init_kern); | ||
862 | |||
863 | /** | ||
864 | * vringh_getdesc_kern - get next available descriptor from kernelspace ring. | ||
865 | * @vrh: the kernelspace vring. | ||
866 | * @riov: where to put the readable descriptors (or NULL) | ||
867 | * @wiov: where to put the writable descriptors (or NULL) | ||
868 | * @head: head index we received, for passing to vringh_complete_kern(). | ||
869 | * @gfp: flags for allocating larger riov/wiov. | ||
870 | * | ||
871 | * Returns 0 if there was no descriptor, 1 if there was, or -errno. | ||
872 | * | ||
873 | * Note that on error return, you can tell the difference between an | ||
874 | * invalid ring and a single invalid descriptor: in the former case, | ||
875 | * *head will be vrh->vring.num. You may be able to ignore an invalid | ||
876 | * descriptor, but there's not much you can do with an invalid ring. | ||
877 | * | ||
878 | * Note that you may need to clean up riov and wiov, even on error! | ||
879 | */ | ||
880 | int vringh_getdesc_kern(struct vringh *vrh, | ||
881 | struct vringh_kiov *riov, | ||
882 | struct vringh_kiov *wiov, | ||
883 | u16 *head, | ||
884 | gfp_t gfp) | ||
885 | { | ||
886 | int err; | ||
887 | |||
888 | err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx); | ||
889 | if (err < 0) | ||
890 | return err; | ||
891 | |||
892 | /* Empty... */ | ||
893 | if (err == vrh->vring.num) | ||
894 | return 0; | ||
895 | |||
896 | *head = err; | ||
897 | err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL, | ||
898 | gfp, copydesc_kern); | ||
899 | if (err) | ||
900 | return err; | ||
901 | |||
902 | return 1; | ||
903 | } | ||
904 | EXPORT_SYMBOL(vringh_getdesc_kern); | ||
905 | |||
906 | /** | ||
907 | * vringh_iov_pull_kern - copy bytes from vring_iov. | ||
908 | * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume) | ||
909 | * @dst: the place to copy. | ||
910 | * @len: the maximum length to copy. | ||
911 | * | ||
912 | * Returns the bytes copied <= len or a negative errno. | ||
913 | */ | ||
914 | ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len) | ||
915 | { | ||
916 | return vringh_iov_xfer(riov, dst, len, xfer_kern); | ||
917 | } | ||
918 | EXPORT_SYMBOL(vringh_iov_pull_kern); | ||
919 | |||
/*
 * Push-direction transfer callback: copy @len bytes from @src (second
 * argument) into @dst (first argument).  Always returns 0.
 *
 * NOTE(review): this assumes vringh_iov_xfer() invokes its callback as
 * xfer(iov_address, caller_buffer, len) -- the order that makes
 * xfer_kern() correct for vringh_iov_pull_kern(); confirm against
 * vringh_iov_xfer().
 */
static inline int kern_xfer(void *dst, void *src, size_t len)
{
	memcpy(dst, src, len);
	return 0;
}

/**
 * vringh_iov_push_kern - copy bytes into vring_iov.
 * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
 * @src: the place to copy from.
 * @len: the maximum length to copy.
 *
 * Returns the bytes copied <= len or a negative errno.
 *
 * @src is only read.  The const is cast away solely because
 * vringh_iov_xfer() takes a plain void pointer; the kern_xfer callback
 * never writes through it.  Using xfer_kern here would copy in the pull
 * direction, i.e. out of the iov and into the caller's source buffer.
 */
ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
			     const void *src, size_t len)
{
	return vringh_iov_xfer(wiov, (void *)src, len, kern_xfer);
}
EXPORT_SYMBOL(vringh_iov_push_kern);
934 | |||
935 | /** | ||
936 | * vringh_abandon_kern - we've decided not to handle the descriptor(s). | ||
937 | * @vrh: the vring. | ||
938 | * @num: the number of descriptors to put back (ie. num | ||
939 | * vringh_get_kern() to undo). | ||
940 | * | ||
941 | * The next vringh_get_kern() will return the old descriptor(s) again. | ||
942 | */ | ||
943 | void vringh_abandon_kern(struct vringh *vrh, unsigned int num) | ||
944 | { | ||
945 | /* We only update vring_avail_event(vr) when we want to be notified, | ||
946 | * so we haven't changed that yet. */ | ||
947 | vrh->last_avail_idx -= num; | ||
948 | } | ||
949 | EXPORT_SYMBOL(vringh_abandon_kern); | ||
950 | |||
951 | /** | ||
952 | * vringh_complete_kern - we've finished with descriptor, publish it. | ||
953 | * @vrh: the vring. | ||
954 | * @head: the head as filled in by vringh_getdesc_kern. | ||
955 | * @len: the length of data we have written. | ||
956 | * | ||
957 | * You should check vringh_need_notify_kern() after one or more calls | ||
958 | * to this function. | ||
959 | */ | ||
960 | int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len) | ||
961 | { | ||
962 | struct vring_used_elem used; | ||
963 | |||
964 | used.id = head; | ||
965 | used.len = len; | ||
966 | |||
967 | return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern); | ||
968 | } | ||
969 | EXPORT_SYMBOL(vringh_complete_kern); | ||
970 | |||
971 | /** | ||
972 | * vringh_notify_enable_kern - we want to know if something changes. | ||
973 | * @vrh: the vring. | ||
974 | * | ||
975 | * This always enables notifications, but returns false if there are | ||
976 | * now more buffers available in the vring. | ||
977 | */ | ||
978 | bool vringh_notify_enable_kern(struct vringh *vrh) | ||
979 | { | ||
980 | return __vringh_notify_enable(vrh, getu16_kern, putu16_kern); | ||
981 | } | ||
982 | EXPORT_SYMBOL(vringh_notify_enable_kern); | ||
983 | |||
984 | /** | ||
985 | * vringh_notify_disable_kern - don't tell us if something changes. | ||
986 | * @vrh: the vring. | ||
987 | * | ||
988 | * This is our normal running state: we disable and then only enable when | ||
989 | * we're going to sleep. | ||
990 | */ | ||
991 | void vringh_notify_disable_kern(struct vringh *vrh) | ||
992 | { | ||
993 | __vringh_notify_disable(vrh, putu16_kern); | ||
994 | } | ||
995 | EXPORT_SYMBOL(vringh_notify_disable_kern); | ||
996 | |||
997 | /** | ||
998 | * vringh_need_notify_kern - must we tell the other side about used buffers? | ||
999 | * @vrh: the vring we've called vringh_complete_kern() on. | ||
1000 | * | ||
1001 | * Returns -errno or 0 if we don't need to tell the other side, 1 if we do. | ||
1002 | */ | ||
1003 | int vringh_need_notify_kern(struct vringh *vrh) | ||
1004 | { | ||
1005 | return __vringh_need_notify(vrh, getu16_kern); | ||
1006 | } | ||
1007 | EXPORT_SYMBOL(vringh_need_notify_kern); | ||