about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/vhost
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/vhost')
-rw-r--r-- drivers/vhost/net.c 237
-rw-r--r-- drivers/vhost/vhost.c 79
-rw-r--r-- drivers/vhost/vhost.h 17
3 files changed, 315 insertions, 18 deletions
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index d395b59289ae..f13e56babe4b 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -74,6 +74,22 @@ static int move_iovec_hdr(struct iovec *from, struct iovec *to,
74 } 74 }
75 return seg; 75 return seg;
76} 76}
/* Mirror the leading entries of @from into @to until @len bytes have been
 * described or @iovcount entries have been visited, whichever comes first.
 * Each copied entry keeps the source base pointer; its length is trimmed so
 * that the copied entries never describe more than @len bytes in total.
 * @from is left unmodified. */
static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
			   size_t len, int iovcount)
{
	int i;

	for (i = 0; i < iovcount && len; ++i) {
		/* Bytes this entry contributes: the smaller of the entry
		 * length and what is still wanted. */
		size_t chunk = from[i].iov_len < len ? from[i].iov_len : len;

		to[i].iov_base = from[i].iov_base;
		to[i].iov_len = chunk;
		len -= chunk;
	}
}
77 93
78/* Caller must have TX VQ lock */ 94/* Caller must have TX VQ lock */
79static void tx_poll_stop(struct vhost_net *net) 95static void tx_poll_stop(struct vhost_net *net)
@@ -129,7 +145,7 @@ static void handle_tx(struct vhost_net *net)
129 145
130 if (wmem < sock->sk->sk_sndbuf / 2) 146 if (wmem < sock->sk->sk_sndbuf / 2)
131 tx_poll_stop(net); 147 tx_poll_stop(net);
132 hdr_size = vq->hdr_size; 148 hdr_size = vq->vhost_hlen;
133 149
134 for (;;) { 150 for (;;) {
135 head = vhost_get_vq_desc(&net->dev, vq, vq->iov, 151 head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
@@ -172,7 +188,7 @@ static void handle_tx(struct vhost_net *net)
172 /* TODO: Check specific error and bomb out unless ENOBUFS? */ 188 /* TODO: Check specific error and bomb out unless ENOBUFS? */
173 err = sock->ops->sendmsg(NULL, sock, &msg, len); 189 err = sock->ops->sendmsg(NULL, sock, &msg, len);
174 if (unlikely(err < 0)) { 190 if (unlikely(err < 0)) {
175 vhost_discard_vq_desc(vq); 191 vhost_discard_vq_desc(vq, 1);
176 tx_poll_start(net, sock); 192 tx_poll_start(net, sock);
177 break; 193 break;
178 } 194 }
@@ -191,9 +207,82 @@ static void handle_tx(struct vhost_net *net)
191 unuse_mm(net->dev.mm); 207 unuse_mm(net->dev.mm);
192} 208}
193 209
210static int peek_head_len(struct sock *sk)
211{
212 struct sk_buff *head;
213 int len = 0;
214
215 lock_sock(sk);
216 head = skb_peek(&sk->sk_receive_queue);
217 if (head)
218 len = head->len;
219 release_sock(sk);
220 return len;
221}
222
223/* This is a multi-buffer version of vhost_get_desc, that works if
224 * vq has read descriptors only.
225 * @vq - the relevant virtqueue
226 * @datalen - data length we'll be reading
227 * @iovcount - returned count of io vectors we fill
228 * @log - vhost log
229 * @log_num - log offset
230 * returns number of buffer heads allocated, negative on error
231 */
232static int get_rx_bufs(struct vhost_virtqueue *vq,
233 struct vring_used_elem *heads,
234 int datalen,
235 unsigned *iovcount,
236 struct vhost_log *log,
237 unsigned *log_num)
238{
239 unsigned int out, in;
240 int seg = 0;
241 int headcount = 0;
242 unsigned d;
243 int r, nlogs = 0;
244
245 while (datalen > 0) {
246 if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
247 r = -ENOBUFS;
248 goto err;
249 }
250 d = vhost_get_vq_desc(vq->dev, vq, vq->iov + seg,
251 ARRAY_SIZE(vq->iov) - seg, &out,
252 &in, log, log_num);
253 if (d == vq->num) {
254 r = 0;
255 goto err;
256 }
257 if (unlikely(out || in <= 0)) {
258 vq_err(vq, "unexpected descriptor format for RX: "
259 "out %d, in %d\n", out, in);
260 r = -EINVAL;
261 goto err;
262 }
263 if (unlikely(log)) {
264 nlogs += *log_num;
265 log += *log_num;
266 }
267 heads[headcount].id = d;
268 heads[headcount].len = iov_length(vq->iov + seg, in);
269 datalen -= heads[headcount].len;
270 ++headcount;
271 seg += in;
272 }
273 heads[headcount - 1].len += datalen;
274 *iovcount = seg;
275 if (unlikely(log))
276 *log_num = nlogs;
277 return headcount;
278err:
279 vhost_discard_vq_desc(vq, headcount);
280 return r;
281}
282
194/* Expects to be always run from workqueue - which acts as 283/* Expects to be always run from workqueue - which acts as
195 * read-size critical section for our kind of RCU. */ 284 * read-size critical section for our kind of RCU. */
196static void handle_rx(struct vhost_net *net) 285static void handle_rx_big(struct vhost_net *net)
197{ 286{
198 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; 287 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
199 unsigned out, in, log, s; 288 unsigned out, in, log, s;
@@ -223,7 +312,7 @@ static void handle_rx(struct vhost_net *net)
223 use_mm(net->dev.mm); 312 use_mm(net->dev.mm);
224 mutex_lock(&vq->mutex); 313 mutex_lock(&vq->mutex);
225 vhost_disable_notify(vq); 314 vhost_disable_notify(vq);
226 hdr_size = vq->hdr_size; 315 hdr_size = vq->vhost_hlen;
227 316
228 vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? 317 vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
229 vq->log : NULL; 318 vq->log : NULL;
@@ -270,14 +359,14 @@ static void handle_rx(struct vhost_net *net)
270 len, MSG_DONTWAIT | MSG_TRUNC); 359 len, MSG_DONTWAIT | MSG_TRUNC);
271 /* TODO: Check specific error and bomb out unless EAGAIN? */ 360 /* TODO: Check specific error and bomb out unless EAGAIN? */
272 if (err < 0) { 361 if (err < 0) {
273 vhost_discard_vq_desc(vq); 362 vhost_discard_vq_desc(vq, 1);
274 break; 363 break;
275 } 364 }
276 /* TODO: Should check and handle checksum. */ 365 /* TODO: Should check and handle checksum. */
277 if (err > len) { 366 if (err > len) {
278 pr_debug("Discarded truncated rx packet: " 367 pr_debug("Discarded truncated rx packet: "
279 " len %d > %zd\n", err, len); 368 " len %d > %zd\n", err, len);
280 vhost_discard_vq_desc(vq); 369 vhost_discard_vq_desc(vq, 1);
281 continue; 370 continue;
282 } 371 }
283 len = err; 372 len = err;
@@ -302,6 +391,123 @@ static void handle_rx(struct vhost_net *net)
302 unuse_mm(net->dev.mm); 391 unuse_mm(net->dev.mm);
303} 392}
304 393
394/* Expects to be always run from workqueue - which acts as
395 * read-size critical section for our kind of RCU. */
396static void handle_rx_mergeable(struct vhost_net *net)
397{
398 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
399 unsigned uninitialized_var(in), log;
400 struct vhost_log *vq_log;
401 struct msghdr msg = {
402 .msg_name = NULL,
403 .msg_namelen = 0,
404 .msg_control = NULL, /* FIXME: get and handle RX aux data. */
405 .msg_controllen = 0,
406 .msg_iov = vq->iov,
407 .msg_flags = MSG_DONTWAIT,
408 };
409
410 struct virtio_net_hdr_mrg_rxbuf hdr = {
411 .hdr.flags = 0,
412 .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
413 };
414
415 size_t total_len = 0;
416 int err, headcount;
417 size_t vhost_hlen, sock_hlen;
418 size_t vhost_len, sock_len;
419 struct socket *sock = rcu_dereference(vq->private_data);
420 if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
421 return;
422
423 use_mm(net->dev.mm);
424 mutex_lock(&vq->mutex);
425 vhost_disable_notify(vq);
426 vhost_hlen = vq->vhost_hlen;
427 sock_hlen = vq->sock_hlen;
428
429 vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
430 vq->log : NULL;
431
432 while ((sock_len = peek_head_len(sock->sk))) {
433 sock_len += sock_hlen;
434 vhost_len = sock_len + vhost_hlen;
435 headcount = get_rx_bufs(vq, vq->heads, vhost_len,
436 &in, vq_log, &log);
437 /* On error, stop handling until the next kick. */
438 if (unlikely(headcount < 0))
439 break;
440 /* OK, now we need to know about added descriptors. */
441 if (!headcount) {
442 if (unlikely(vhost_enable_notify(vq))) {
443 /* They have slipped one in as we were
444 * doing that: check again. */
445 vhost_disable_notify(vq);
446 continue;
447 }
448 /* Nothing new? Wait for eventfd to tell us
449 * they refilled. */
450 break;
451 }
452 /* We don't need to be notified again. */
453 if (unlikely((vhost_hlen)))
454 /* Skip header. TODO: support TSO. */
455 move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
456 else
457 /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
458 * needed because sendmsg can modify msg_iov. */
459 copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
460 msg.msg_iovlen = in;
461 err = sock->ops->recvmsg(NULL, sock, &msg,
462 sock_len, MSG_DONTWAIT | MSG_TRUNC);
463 /* Userspace might have consumed the packet meanwhile:
464 * it's not supposed to do this usually, but might be hard
465 * to prevent. Discard data we got (if any) and keep going. */
466 if (unlikely(err != sock_len)) {
467 pr_debug("Discarded rx packet: "
468 " len %d, expected %zd\n", err, sock_len);
469 vhost_discard_vq_desc(vq, headcount);
470 continue;
471 }
472 if (unlikely(vhost_hlen) &&
473 memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
474 vhost_hlen)) {
475 vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
476 vq->iov->iov_base);
477 break;
478 }
479 /* TODO: Should check and handle checksum. */
480 if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
481 memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
482 offsetof(typeof(hdr), num_buffers),
483 sizeof hdr.num_buffers)) {
484 vq_err(vq, "Failed num_buffers write");
485 vhost_discard_vq_desc(vq, headcount);
486 break;
487 }
488 vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
489 headcount);
490 if (unlikely(vq_log))
491 vhost_log_write(vq, vq_log, log, vhost_len);
492 total_len += vhost_len;
493 if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
494 vhost_poll_queue(&vq->poll);
495 break;
496 }
497 }
498
499 mutex_unlock(&vq->mutex);
500 unuse_mm(net->dev.mm);
501}
502
503static void handle_rx(struct vhost_net *net)
504{
505 if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
506 handle_rx_mergeable(net);
507 else
508 handle_rx_big(net);
509}
510
305static void handle_tx_kick(struct vhost_work *work) 511static void handle_tx_kick(struct vhost_work *work)
306{ 512{
307 struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, 513 struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
@@ -577,9 +783,21 @@ done:
577 783
578static int vhost_net_set_features(struct vhost_net *n, u64 features) 784static int vhost_net_set_features(struct vhost_net *n, u64 features)
579{ 785{
580 size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ? 786 size_t vhost_hlen, sock_hlen, hdr_len;
581 sizeof(struct virtio_net_hdr) : 0;
582 int i; 787 int i;
788
789 hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ?
790 sizeof(struct virtio_net_hdr_mrg_rxbuf) :
791 sizeof(struct virtio_net_hdr);
792 if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
793 /* vhost provides vnet_hdr */
794 vhost_hlen = hdr_len;
795 sock_hlen = 0;
796 } else {
797 /* socket provides vnet_hdr */
798 vhost_hlen = 0;
799 sock_hlen = hdr_len;
800 }
583 mutex_lock(&n->dev.mutex); 801 mutex_lock(&n->dev.mutex);
584 if ((features & (1 << VHOST_F_LOG_ALL)) && 802 if ((features & (1 << VHOST_F_LOG_ALL)) &&
585 !vhost_log_access_ok(&n->dev)) { 803 !vhost_log_access_ok(&n->dev)) {
@@ -590,7 +808,8 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
590 smp_wmb(); 808 smp_wmb();
591 for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { 809 for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
592 mutex_lock(&n->vqs[i].mutex); 810 mutex_lock(&n->vqs[i].mutex);
593 n->vqs[i].hdr_size = hdr_size; 811 n->vqs[i].vhost_hlen = vhost_hlen;
812 n->vqs[i].sock_hlen = sock_hlen;
594 mutex_unlock(&n->vqs[i].mutex); 813 mutex_unlock(&n->vqs[i].mutex);
595 } 814 }
596 vhost_net_flush(n); 815 vhost_net_flush(n);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index dd2d019b889f..e05557d52999 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -149,7 +149,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
149 vq->used_flags = 0; 149 vq->used_flags = 0;
150 vq->log_used = false; 150 vq->log_used = false;
151 vq->log_addr = -1ull; 151 vq->log_addr = -1ull;
152 vq->hdr_size = 0; 152 vq->vhost_hlen = 0;
153 vq->sock_hlen = 0;
153 vq->private_data = NULL; 154 vq->private_data = NULL;
154 vq->log_base = NULL; 155 vq->log_base = NULL;
155 vq->error_ctx = NULL; 156 vq->error_ctx = NULL;
@@ -1101,9 +1102,9 @@ int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
1101} 1102}
1102 1103
1103/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ 1104/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
1104void vhost_discard_vq_desc(struct vhost_virtqueue *vq) 1105void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
1105{ 1106{
1106 vq->last_avail_idx--; 1107 vq->last_avail_idx -= n;
1107} 1108}
1108 1109
1109/* After we've used one of their buffers, we tell them about it. We'll then 1110/* After we've used one of their buffers, we tell them about it. We'll then
@@ -1148,6 +1149,67 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
1148 return 0; 1149 return 0;
1149} 1150}
1150 1151
1152static int __vhost_add_used_n(struct vhost_virtqueue *vq,
1153 struct vring_used_elem *heads,
1154 unsigned count)
1155{
1156 struct vring_used_elem __user *used;
1157 int start;
1158
1159 start = vq->last_used_idx % vq->num;
1160 used = vq->used->ring + start;
1161 if (copy_to_user(used, heads, count * sizeof *used)) {
1162 vq_err(vq, "Failed to write used");
1163 return -EFAULT;
1164 }
1165 if (unlikely(vq->log_used)) {
1166 /* Make sure data is seen before log. */
1167 smp_wmb();
1168 /* Log used ring entry write. */
1169 log_write(vq->log_base,
1170 vq->log_addr +
1171 ((void __user *)used - (void __user *)vq->used),
1172 count * sizeof *used);
1173 }
1174 vq->last_used_idx += count;
1175 return 0;
1176}
1177
1178/* After we've used one of their buffers, we tell them about it. We'll then
1179 * want to notify the guest, using eventfd. */
1180int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
1181 unsigned count)
1182{
1183 int start, n, r;
1184
1185 start = vq->last_used_idx % vq->num;
1186 n = vq->num - start;
1187 if (n < count) {
1188 r = __vhost_add_used_n(vq, heads, n);
1189 if (r < 0)
1190 return r;
1191 heads += n;
1192 count -= n;
1193 }
1194 r = __vhost_add_used_n(vq, heads, count);
1195
1196 /* Make sure buffer is written before we update index. */
1197 smp_wmb();
1198 if (put_user(vq->last_used_idx, &vq->used->idx)) {
1199 vq_err(vq, "Failed to increment used idx");
1200 return -EFAULT;
1201 }
1202 if (unlikely(vq->log_used)) {
1203 /* Log used index update. */
1204 log_write(vq->log_base,
1205 vq->log_addr + offsetof(struct vring_used, idx),
1206 sizeof vq->used->idx);
1207 if (vq->log_ctx)
1208 eventfd_signal(vq->log_ctx, 1);
1209 }
1210 return r;
1211}
1212
1151/* This actually signals the guest, using eventfd. */ 1213/* This actually signals the guest, using eventfd. */
1152void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) 1214void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
1153{ 1215{
@@ -1182,6 +1244,15 @@ void vhost_add_used_and_signal(struct vhost_dev *dev,
1182 vhost_signal(dev, vq); 1244 vhost_signal(dev, vq);
1183} 1245}
1184 1246
/* Multi-buffer counterpart of vhost_add_used_and_signal: pass @count
 * elements of @heads to vhost_add_used_n(), then call vhost_signal().
 * The status returned by vhost_add_used_n() is not inspected. */
void vhost_add_used_and_signal_n(struct vhost_dev *dev,
				 struct vhost_virtqueue *vq,
				 struct vring_used_elem *heads,
				 unsigned count)
{
	(void)vhost_add_used_n(vq, heads, count);
	vhost_signal(dev, vq);
}
1255
1185/* OK, now we need to know about added descriptors. */ 1256/* OK, now we need to know about added descriptors. */
1186bool vhost_enable_notify(struct vhost_virtqueue *vq) 1257bool vhost_enable_notify(struct vhost_virtqueue *vq)
1187{ 1258{
@@ -1206,7 +1277,7 @@ bool vhost_enable_notify(struct vhost_virtqueue *vq)
1206 return false; 1277 return false;
1207 } 1278 }
1208 1279
1209 return avail_idx != vq->last_avail_idx; 1280 return avail_idx != vq->avail_idx;
1210} 1281}
1211 1282
1212/* We don't need to be notified again. */ 1283/* We don't need to be notified again. */
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 3693327549b3..afd77295971c 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -96,7 +96,9 @@ struct vhost_virtqueue {
96 struct iovec indirect[VHOST_NET_MAX_SG]; 96 struct iovec indirect[VHOST_NET_MAX_SG];
97 struct iovec iov[VHOST_NET_MAX_SG]; 97 struct iovec iov[VHOST_NET_MAX_SG];
98 struct iovec hdr[VHOST_NET_MAX_SG]; 98 struct iovec hdr[VHOST_NET_MAX_SG];
99 size_t hdr_size; 99 size_t vhost_hlen;
100 size_t sock_hlen;
101 struct vring_used_elem heads[VHOST_NET_MAX_SG];
100 /* We use a kind of RCU to access private pointer. 102 /* We use a kind of RCU to access private pointer.
101 * All readers access it from worker, which makes it possible to 103 * All readers access it from worker, which makes it possible to
102 * flush the vhost_work instead of synchronize_rcu. Therefore readers do 104 * flush the vhost_work instead of synchronize_rcu. Therefore readers do
@@ -139,12 +141,16 @@ int vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
139 struct iovec iov[], unsigned int iov_count, 141 struct iovec iov[], unsigned int iov_count,
140 unsigned int *out_num, unsigned int *in_num, 142 unsigned int *out_num, unsigned int *in_num,
141 struct vhost_log *log, unsigned int *log_num); 143 struct vhost_log *log, unsigned int *log_num);
142void vhost_discard_vq_desc(struct vhost_virtqueue *); 144void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
143 145
144int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); 146int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
145void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *); 147int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
148 unsigned count);
146void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *, 149void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
147 unsigned int head, int len); 150 unsigned int id, int len);
151void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
152 struct vring_used_elem *heads, unsigned count);
153void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
148void vhost_disable_notify(struct vhost_virtqueue *); 154void vhost_disable_notify(struct vhost_virtqueue *);
149bool vhost_enable_notify(struct vhost_virtqueue *); 155bool vhost_enable_notify(struct vhost_virtqueue *);
150 156
@@ -161,7 +167,8 @@ enum {
161 VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) | 167 VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
162 (1 << VIRTIO_RING_F_INDIRECT_DESC) | 168 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
163 (1 << VHOST_F_LOG_ALL) | 169 (1 << VHOST_F_LOG_ALL) |
164 (1 << VHOST_NET_F_VIRTIO_NET_HDR), 170 (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
171 (1 << VIRTIO_NET_F_MRG_RXBUF),
165}; 172};
166 173
167static inline int vhost_has_feature(struct vhost_dev *dev, int bit) 174static inline int vhost_has_feature(struct vhost_dev *dev, int bit)