 drivers/vhost/net.c    | 293
 drivers/vhost/vhost.c  | 228
 drivers/vhost/vhost.h  |  55
 include/linux/cgroup.h |   7
 kernel/cgroup.c        |  23
 5 files changed, 515 insertions(+), 91 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f11e6bb5b036..f13e56babe4b 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
| @@ -74,6 +74,22 @@ static int move_iovec_hdr(struct iovec *from, struct iovec *to, | |||
| 74 | } | 74 | } |
| 75 | return seg; | 75 | return seg; |
| 76 | } | 76 | } |
| 77 | /* Copy iovec entries for len bytes from iovec. */ | ||
| 78 | static void copy_iovec_hdr(const struct iovec *from, struct iovec *to, | ||
| 79 | size_t len, int iovcount) | ||
| 80 | { | ||
| 81 | int seg = 0; | ||
| 82 | size_t size; | ||
| 83 | while (len && seg < iovcount) { | ||
| 84 | size = min(from->iov_len, len); | ||
| 85 | to->iov_base = from->iov_base; | ||
| 86 | to->iov_len = size; | ||
| 87 | len -= size; | ||
| 88 | ++from; | ||
| 89 | ++to; | ||
| 90 | ++seg; | ||
| 91 | } | ||
| 92 | } | ||
| 77 | 93 | ||
| 78 | /* Caller must have TX VQ lock */ | 94 | /* Caller must have TX VQ lock */ |
| 79 | static void tx_poll_stop(struct vhost_net *net) | 95 | static void tx_poll_stop(struct vhost_net *net) |
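The new copy_iovec_hdr() differs from move_iovec_hdr() just above it in that it leaves the source iovec untouched; handle_rx_mergeable() further down relies on that to keep a private copy of the header entries, because recvmsg() may modify msg_iov before the num_buffers field is written back into the header. A rough userspace sketch of the same "copy, don't consume" logic, with hypothetical names, for illustration only:

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

/* Hypothetical userspace rendering of copy_iovec_hdr(): fill 'to' with
 * the entries covering the first len bytes of 'from', leaving 'from'
 * untouched. */
static void copy_iov_prefix(const struct iovec *from, struct iovec *to,
                            size_t len, int iovcount)
{
        int seg = 0;

        while (len && seg < iovcount) {
                size_t size = from->iov_len < len ? from->iov_len : len;

                to->iov_base = from->iov_base;
                to->iov_len = size;
                len -= size;
                ++from;
                ++to;
                ++seg;
        }
}

int main(void)
{
        char a[4] = "abcd", b[4] = "efgh";
        struct iovec src[2] = { { a, sizeof a }, { b, sizeof b } };
        struct iovec dst[2];

        memset(dst, 0, sizeof dst);
        copy_iov_prefix(src, dst, 6, 2);  /* covers all of a, 2 bytes of b */
        printf("%zu %zu\n", dst[0].iov_len, dst[1].iov_len);   /* 4 2 */
        return 0;
}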
| @@ -129,7 +145,7 @@ static void handle_tx(struct vhost_net *net) | |||
| 129 | 145 | ||
| 130 | if (wmem < sock->sk->sk_sndbuf / 2) | 146 | if (wmem < sock->sk->sk_sndbuf / 2) |
| 131 | tx_poll_stop(net); | 147 | tx_poll_stop(net); |
| 132 | hdr_size = vq->hdr_size; | 148 | hdr_size = vq->vhost_hlen; |
| 133 | 149 | ||
| 134 | for (;;) { | 150 | for (;;) { |
| 135 | head = vhost_get_vq_desc(&net->dev, vq, vq->iov, | 151 | head = vhost_get_vq_desc(&net->dev, vq, vq->iov, |
| @@ -172,7 +188,7 @@ static void handle_tx(struct vhost_net *net) | |||
| 172 | /* TODO: Check specific error and bomb out unless ENOBUFS? */ | 188 | /* TODO: Check specific error and bomb out unless ENOBUFS? */ |
| 173 | err = sock->ops->sendmsg(NULL, sock, &msg, len); | 189 | err = sock->ops->sendmsg(NULL, sock, &msg, len); |
| 174 | if (unlikely(err < 0)) { | 190 | if (unlikely(err < 0)) { |
| 175 | vhost_discard_vq_desc(vq); | 191 | vhost_discard_vq_desc(vq, 1); |
| 176 | tx_poll_start(net, sock); | 192 | tx_poll_start(net, sock); |
| 177 | break; | 193 | break; |
| 178 | } | 194 | } |
| @@ -191,9 +207,82 @@ static void handle_tx(struct vhost_net *net) | |||
| 191 | unuse_mm(net->dev.mm); | 207 | unuse_mm(net->dev.mm); |
| 192 | } | 208 | } |
| 193 | 209 | ||
| 210 | static int peek_head_len(struct sock *sk) | ||
| 211 | { | ||
| 212 | struct sk_buff *head; | ||
| 213 | int len = 0; | ||
| 214 | |||
| 215 | lock_sock(sk); | ||
| 216 | head = skb_peek(&sk->sk_receive_queue); | ||
| 217 | if (head) | ||
| 218 | len = head->len; | ||
| 219 | release_sock(sk); | ||
| 220 | return len; | ||
| 221 | } | ||
| 222 | |||
| 223 | /* This is a multi-buffer version of vhost_get_desc, that works if | ||
| 224 | * vq has read descriptors only. | ||
| 225 | * @vq - the relevant virtqueue | ||
| 226 | * @datalen - data length we'll be reading | ||
| 227 | * @iovcount - returned count of io vectors we fill | ||
| 228 | * @log - vhost log | ||
| 229 | * @log_num - log offset | ||
| 230 | * returns number of buffer heads allocated, negative on error | ||
| 231 | */ | ||
| 232 | static int get_rx_bufs(struct vhost_virtqueue *vq, | ||
| 233 | struct vring_used_elem *heads, | ||
| 234 | int datalen, | ||
| 235 | unsigned *iovcount, | ||
| 236 | struct vhost_log *log, | ||
| 237 | unsigned *log_num) | ||
| 238 | { | ||
| 239 | unsigned int out, in; | ||
| 240 | int seg = 0; | ||
| 241 | int headcount = 0; | ||
| 242 | unsigned d; | ||
| 243 | int r, nlogs = 0; | ||
| 244 | |||
| 245 | while (datalen > 0) { | ||
| 246 | if (unlikely(headcount >= VHOST_NET_MAX_SG)) { | ||
| 247 | r = -ENOBUFS; | ||
| 248 | goto err; | ||
| 249 | } | ||
| 250 | d = vhost_get_vq_desc(vq->dev, vq, vq->iov + seg, | ||
| 251 | ARRAY_SIZE(vq->iov) - seg, &out, | ||
| 252 | &in, log, log_num); | ||
| 253 | if (d == vq->num) { | ||
| 254 | r = 0; | ||
| 255 | goto err; | ||
| 256 | } | ||
| 257 | if (unlikely(out || in <= 0)) { | ||
| 258 | vq_err(vq, "unexpected descriptor format for RX: " | ||
| 259 | "out %d, in %d\n", out, in); | ||
| 260 | r = -EINVAL; | ||
| 261 | goto err; | ||
| 262 | } | ||
| 263 | if (unlikely(log)) { | ||
| 264 | nlogs += *log_num; | ||
| 265 | log += *log_num; | ||
| 266 | } | ||
| 267 | heads[headcount].id = d; | ||
| 268 | heads[headcount].len = iov_length(vq->iov + seg, in); | ||
| 269 | datalen -= heads[headcount].len; | ||
| 270 | ++headcount; | ||
| 271 | seg += in; | ||
| 272 | } | ||
| 273 | heads[headcount - 1].len += datalen; | ||
| 274 | *iovcount = seg; | ||
| 275 | if (unlikely(log)) | ||
| 276 | *log_num = nlogs; | ||
| 277 | return headcount; | ||
| 278 | err: | ||
| 279 | vhost_discard_vq_desc(vq, headcount); | ||
| 280 | return r; | ||
| 281 | } | ||
| 282 | |||
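get_rx_bufs() keeps pulling guest descriptors until their combined length covers datalen, then trims the last entry so the recorded lengths sum exactly to the packet size: datalen is zero or negative after the loop, so the final += shrinks the last head. A small standalone sketch of just that accounting, with made-up buffer sizes:

#include <stdio.h>

int main(void)
{
        /* Pretend the guest posted RX buffers of 1500 bytes each and the
         * next packet (plus headers) needs 2000 bytes. */
        unsigned buf_len[8] = { 1500, 1500, 1500 };
        unsigned head_len[8];
        int datalen = 2000, headcount = 0;

        while (datalen > 0) {
                head_len[headcount] = buf_len[headcount];
                datalen -= head_len[headcount];
                ++headcount;
        }
        /* datalen is now -1000: trim the last head to the bytes it holds. */
        head_len[headcount - 1] += datalen;

        printf("%d heads: %u + %u\n", headcount, head_len[0], head_len[1]);
        /* prints: 2 heads: 1500 + 500 */
        return 0;
}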
| 194 | /* Expects to be always run from workqueue - which acts as | 283 | /* Expects to be always run from workqueue - which acts as |
| 195 | * read-size critical section for our kind of RCU. */ | 284 | * read-size critical section for our kind of RCU. */ |
| 196 | static void handle_rx(struct vhost_net *net) | 285 | static void handle_rx_big(struct vhost_net *net) |
| 197 | { | 286 | { |
| 198 | struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; | 287 | struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; |
| 199 | unsigned out, in, log, s; | 288 | unsigned out, in, log, s; |
| @@ -223,7 +312,7 @@ static void handle_rx(struct vhost_net *net) | |||
| 223 | use_mm(net->dev.mm); | 312 | use_mm(net->dev.mm); |
| 224 | mutex_lock(&vq->mutex); | 313 | mutex_lock(&vq->mutex); |
| 225 | vhost_disable_notify(vq); | 314 | vhost_disable_notify(vq); |
| 226 | hdr_size = vq->hdr_size; | 315 | hdr_size = vq->vhost_hlen; |
| 227 | 316 | ||
| 228 | vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? | 317 | vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? |
| 229 | vq->log : NULL; | 318 | vq->log : NULL; |
| @@ -270,14 +359,14 @@ static void handle_rx(struct vhost_net *net) | |||
| 270 | len, MSG_DONTWAIT | MSG_TRUNC); | 359 | len, MSG_DONTWAIT | MSG_TRUNC); |
| 271 | /* TODO: Check specific error and bomb out unless EAGAIN? */ | 360 | /* TODO: Check specific error and bomb out unless EAGAIN? */ |
| 272 | if (err < 0) { | 361 | if (err < 0) { |
| 273 | vhost_discard_vq_desc(vq); | 362 | vhost_discard_vq_desc(vq, 1); |
| 274 | break; | 363 | break; |
| 275 | } | 364 | } |
| 276 | /* TODO: Should check and handle checksum. */ | 365 | /* TODO: Should check and handle checksum. */ |
| 277 | if (err > len) { | 366 | if (err > len) { |
| 278 | pr_debug("Discarded truncated rx packet: " | 367 | pr_debug("Discarded truncated rx packet: " |
| 279 | " len %d > %zd\n", err, len); | 368 | " len %d > %zd\n", err, len); |
| 280 | vhost_discard_vq_desc(vq); | 369 | vhost_discard_vq_desc(vq, 1); |
| 281 | continue; | 370 | continue; |
| 282 | } | 371 | } |
| 283 | len = err; | 372 | len = err; |
| @@ -302,54 +391,175 @@ static void handle_rx(struct vhost_net *net) | |||
| 302 | unuse_mm(net->dev.mm); | 391 | unuse_mm(net->dev.mm); |
| 303 | } | 392 | } |
| 304 | 393 | ||
| 305 | static void handle_tx_kick(struct work_struct *work) | 394 | /* Expects to be always run from workqueue - which acts as |
| 395 | * read-size critical section for our kind of RCU. */ | ||
| 396 | static void handle_rx_mergeable(struct vhost_net *net) | ||
| 306 | { | 397 | { |
| 307 | struct vhost_virtqueue *vq; | 398 | struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; |
| 308 | struct vhost_net *net; | 399 | unsigned uninitialized_var(in), log; |
| 309 | vq = container_of(work, struct vhost_virtqueue, poll.work); | 400 | struct vhost_log *vq_log; |
| 310 | net = container_of(vq->dev, struct vhost_net, dev); | 401 | struct msghdr msg = { |
| 402 | .msg_name = NULL, | ||
| 403 | .msg_namelen = 0, | ||
| 404 | .msg_control = NULL, /* FIXME: get and handle RX aux data. */ | ||
| 405 | .msg_controllen = 0, | ||
| 406 | .msg_iov = vq->iov, | ||
| 407 | .msg_flags = MSG_DONTWAIT, | ||
| 408 | }; | ||
| 409 | |||
| 410 | struct virtio_net_hdr_mrg_rxbuf hdr = { | ||
| 411 | .hdr.flags = 0, | ||
| 412 | .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE | ||
| 413 | }; | ||
| 414 | |||
| 415 | size_t total_len = 0; | ||
| 416 | int err, headcount; | ||
| 417 | size_t vhost_hlen, sock_hlen; | ||
| 418 | size_t vhost_len, sock_len; | ||
| 419 | struct socket *sock = rcu_dereference(vq->private_data); | ||
| 420 | if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) | ||
| 421 | return; | ||
| 422 | |||
| 423 | use_mm(net->dev.mm); | ||
| 424 | mutex_lock(&vq->mutex); | ||
| 425 | vhost_disable_notify(vq); | ||
| 426 | vhost_hlen = vq->vhost_hlen; | ||
| 427 | sock_hlen = vq->sock_hlen; | ||
| 428 | |||
| 429 | vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? | ||
| 430 | vq->log : NULL; | ||
| 431 | |||
| 432 | while ((sock_len = peek_head_len(sock->sk))) { | ||
| 433 | sock_len += sock_hlen; | ||
| 434 | vhost_len = sock_len + vhost_hlen; | ||
| 435 | headcount = get_rx_bufs(vq, vq->heads, vhost_len, | ||
| 436 | &in, vq_log, &log); | ||
| 437 | /* On error, stop handling until the next kick. */ | ||
| 438 | if (unlikely(headcount < 0)) | ||
| 439 | break; | ||
| 440 | /* OK, now we need to know about added descriptors. */ | ||
| 441 | if (!headcount) { | ||
| 442 | if (unlikely(vhost_enable_notify(vq))) { | ||
| 443 | /* They have slipped one in as we were | ||
| 444 | * doing that: check again. */ | ||
| 445 | vhost_disable_notify(vq); | ||
| 446 | continue; | ||
| 447 | } | ||
| 448 | /* Nothing new? Wait for eventfd to tell us | ||
| 449 | * they refilled. */ | ||
| 450 | break; | ||
| 451 | } | ||
| 452 | /* We don't need to be notified again. */ | ||
| 453 | if (unlikely((vhost_hlen))) | ||
| 454 | /* Skip header. TODO: support TSO. */ | ||
| 455 | move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in); | ||
| 456 | else | ||
| 457 | /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF: | ||
| 458 | * needed because sendmsg can modify msg_iov. */ | ||
| 459 | copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in); | ||
| 460 | msg.msg_iovlen = in; | ||
| 461 | err = sock->ops->recvmsg(NULL, sock, &msg, | ||
| 462 | sock_len, MSG_DONTWAIT | MSG_TRUNC); | ||
| 463 | /* Userspace might have consumed the packet meanwhile: | ||
| 464 | * it's not supposed to do this usually, but might be hard | ||
| 465 | * to prevent. Discard data we got (if any) and keep going. */ | ||
| 466 | if (unlikely(err != sock_len)) { | ||
| 467 | pr_debug("Discarded rx packet: " | ||
| 468 | " len %d, expected %zd\n", err, sock_len); | ||
| 469 | vhost_discard_vq_desc(vq, headcount); | ||
| 470 | continue; | ||
| 471 | } | ||
| 472 | if (unlikely(vhost_hlen) && | ||
| 473 | memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0, | ||
| 474 | vhost_hlen)) { | ||
| 475 | vq_err(vq, "Unable to write vnet_hdr at addr %p\n", | ||
| 476 | vq->iov->iov_base); | ||
| 477 | break; | ||
| 478 | } | ||
| 479 | /* TODO: Should check and handle checksum. */ | ||
| 480 | if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) && | ||
| 481 | memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount, | ||
| 482 | offsetof(typeof(hdr), num_buffers), | ||
| 483 | sizeof hdr.num_buffers)) { | ||
| 484 | vq_err(vq, "Failed num_buffers write"); | ||
| 485 | vhost_discard_vq_desc(vq, headcount); | ||
| 486 | break; | ||
| 487 | } | ||
| 488 | vhost_add_used_and_signal_n(&net->dev, vq, vq->heads, | ||
| 489 | headcount); | ||
| 490 | if (unlikely(vq_log)) | ||
| 491 | vhost_log_write(vq, vq_log, log, vhost_len); | ||
| 492 | total_len += vhost_len; | ||
| 493 | if (unlikely(total_len >= VHOST_NET_WEIGHT)) { | ||
| 494 | vhost_poll_queue(&vq->poll); | ||
| 495 | break; | ||
| 496 | } | ||
| 497 | } | ||
| 498 | |||
| 499 | mutex_unlock(&vq->mutex); | ||
| 500 | unuse_mm(net->dev.mm); | ||
| 501 | } | ||
| 502 | |||
| 503 | static void handle_rx(struct vhost_net *net) | ||
| 504 | { | ||
| 505 | if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) | ||
| 506 | handle_rx_mergeable(net); | ||
| 507 | else | ||
| 508 | handle_rx_big(net); | ||
| 509 | } | ||
| 510 | |||
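With VIRTIO_NET_F_MRG_RXBUF the guest-visible header grows a num_buffers field, which handle_rx_mergeable() patches into the already-copied header via memcpy_toiovecend() at offsetof(typeof(hdr), num_buffers). A standalone sketch of that layout (struct fields abbreviated from the legacy virtio-net header; assumes the usual layout with no padding):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Abbreviated copies of the legacy virtio-net header structs (the real
 * definitions live in include/linux/virtio_net.h). */
struct virtio_net_hdr {
        uint8_t  flags;
        uint8_t  gso_type;
        uint16_t hdr_len;
        uint16_t gso_size;
        uint16_t csum_start;
        uint16_t csum_offset;
};

struct virtio_net_hdr_mrg_rxbuf {
        struct virtio_net_hdr hdr;
        uint16_t num_buffers;   /* how many descriptors this packet used */
};

int main(void)
{
        /* vhost writes headcount at exactly this offset inside the header
         * already copied into the first RX buffer. */
        printf("num_buffers at offset %zu, header is %zu bytes\n",
               offsetof(struct virtio_net_hdr_mrg_rxbuf, num_buffers),
               sizeof(struct virtio_net_hdr_mrg_rxbuf));
        return 0;
}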
| 511 | static void handle_tx_kick(struct vhost_work *work) | ||
| 512 | { | ||
| 513 | struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, | ||
| 514 | poll.work); | ||
| 515 | struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); | ||
| 516 | |||
| 311 | handle_tx(net); | 517 | handle_tx(net); |
| 312 | } | 518 | } |
| 313 | 519 | ||
| 314 | static void handle_rx_kick(struct work_struct *work) | 520 | static void handle_rx_kick(struct vhost_work *work) |
| 315 | { | 521 | { |
| 316 | struct vhost_virtqueue *vq; | 522 | struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, |
| 317 | struct vhost_net *net; | 523 | poll.work); |
| 318 | vq = container_of(work, struct vhost_virtqueue, poll.work); | 524 | struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); |
| 319 | net = container_of(vq->dev, struct vhost_net, dev); | 525 | |
| 320 | handle_rx(net); | 526 | handle_rx(net); |
| 321 | } | 527 | } |
| 322 | 528 | ||
| 323 | static void handle_tx_net(struct work_struct *work) | 529 | static void handle_tx_net(struct vhost_work *work) |
| 324 | { | 530 | { |
| 325 | struct vhost_net *net; | 531 | struct vhost_net *net = container_of(work, struct vhost_net, |
| 326 | net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); | 532 | poll[VHOST_NET_VQ_TX].work); |
| 327 | handle_tx(net); | 533 | handle_tx(net); |
| 328 | } | 534 | } |
| 329 | 535 | ||
| 330 | static void handle_rx_net(struct work_struct *work) | 536 | static void handle_rx_net(struct vhost_work *work) |
| 331 | { | 537 | { |
| 332 | struct vhost_net *net; | 538 | struct vhost_net *net = container_of(work, struct vhost_net, |
| 333 | net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); | 539 | poll[VHOST_NET_VQ_RX].work); |
| 334 | handle_rx(net); | 540 | handle_rx(net); |
| 335 | } | 541 | } |
| 336 | 542 | ||
| 337 | static int vhost_net_open(struct inode *inode, struct file *f) | 543 | static int vhost_net_open(struct inode *inode, struct file *f) |
| 338 | { | 544 | { |
| 339 | struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL); | 545 | struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL); |
| 546 | struct vhost_dev *dev; | ||
| 340 | int r; | 547 | int r; |
| 548 | |||
| 341 | if (!n) | 549 | if (!n) |
| 342 | return -ENOMEM; | 550 | return -ENOMEM; |
| 551 | |||
| 552 | dev = &n->dev; | ||
| 343 | n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; | 553 | n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; |
| 344 | n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; | 554 | n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; |
| 345 | r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX); | 555 | r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX); |
| 346 | if (r < 0) { | 556 | if (r < 0) { |
| 347 | kfree(n); | 557 | kfree(n); |
| 348 | return r; | 558 | return r; |
| 349 | } | 559 | } |
| 350 | 560 | ||
| 351 | vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); | 561 | vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); |
| 352 | vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN); | 562 | vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev); |
| 353 | n->tx_poll_state = VHOST_NET_POLL_DISABLED; | 563 | n->tx_poll_state = VHOST_NET_POLL_DISABLED; |
| 354 | 564 | ||
| 355 | f->private_data = n; | 565 | f->private_data = n; |
| @@ -573,9 +783,21 @@ done: | |||
| 573 | 783 | ||
| 574 | static int vhost_net_set_features(struct vhost_net *n, u64 features) | 784 | static int vhost_net_set_features(struct vhost_net *n, u64 features) |
| 575 | { | 785 | { |
| 576 | size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ? | 786 | size_t vhost_hlen, sock_hlen, hdr_len; |
| 577 | sizeof(struct virtio_net_hdr) : 0; | ||
| 578 | int i; | 787 | int i; |
| 788 | |||
| 789 | hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? | ||
| 790 | sizeof(struct virtio_net_hdr_mrg_rxbuf) : | ||
| 791 | sizeof(struct virtio_net_hdr); | ||
| 792 | if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) { | ||
| 793 | /* vhost provides vnet_hdr */ | ||
| 794 | vhost_hlen = hdr_len; | ||
| 795 | sock_hlen = 0; | ||
| 796 | } else { | ||
| 797 | /* socket provides vnet_hdr */ | ||
| 798 | vhost_hlen = 0; | ||
| 799 | sock_hlen = hdr_len; | ||
| 800 | } | ||
| 579 | mutex_lock(&n->dev.mutex); | 801 | mutex_lock(&n->dev.mutex); |
| 580 | if ((features & (1 << VHOST_F_LOG_ALL)) && | 802 | if ((features & (1 << VHOST_F_LOG_ALL)) && |
| 581 | !vhost_log_access_ok(&n->dev)) { | 803 | !vhost_log_access_ok(&n->dev)) { |
| @@ -586,7 +808,8 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) | |||
| 586 | smp_wmb(); | 808 | smp_wmb(); |
| 587 | for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { | 809 | for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { |
| 588 | mutex_lock(&n->vqs[i].mutex); | 810 | mutex_lock(&n->vqs[i].mutex); |
| 589 | n->vqs[i].hdr_size = hdr_size; | 811 | n->vqs[i].vhost_hlen = vhost_hlen; |
| 812 | n->vqs[i].sock_hlen = sock_hlen; | ||
| 590 | mutex_unlock(&n->vqs[i].mutex); | 813 | mutex_unlock(&n->vqs[i].mutex); |
| 591 | } | 814 | } |
| 592 | vhost_net_flush(n); | 815 | vhost_net_flush(n); |
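The vhost_hlen/sock_hlen values set above track what userspace negotiated: with VIRTIO_NET_F_MRG_RXBUF the per-packet header is struct virtio_net_hdr_mrg_rxbuf (12 bytes), otherwise plain struct virtio_net_hdr (10 bytes), and it is supplied either by vhost or by the underlying socket depending on VHOST_NET_F_VIRTIO_NET_HDR. A hedged sketch of the userspace side of that negotiation through the vhost ioctls (device path and the chosen feature mask are illustrative assumptions, error paths trimmed):

#include <stdint.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>

/* Negotiate features on a vhost-net device: ack only the bits we want
 * from what the backend offers. Returns the open fd or -1 on error. */
int vhost_net_negotiate(void)
{
        uint64_t offered, wanted;
        int fd = open("/dev/vhost-net", O_RDWR);        /* path assumed */

        if (fd < 0)
                return -1;
        if (ioctl(fd, VHOST_GET_FEATURES, &offered) < 0)
                return -1;
        /* Keep mergeable RX buffers and the vhost-supplied vnet header
         * only if the backend offers them. */
        wanted = offered & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
                            (1ULL << VHOST_NET_F_VIRTIO_NET_HDR));
        if (ioctl(fd, VHOST_SET_FEATURES, &wanted) < 0)
                return -1;
        return fd;
}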
| @@ -656,25 +879,13 @@ static struct miscdevice vhost_net_misc = { | |||
| 656 | 879 | ||
| 657 | static int vhost_net_init(void) | 880 | static int vhost_net_init(void) |
| 658 | { | 881 | { |
| 659 | int r = vhost_init(); | 882 | return misc_register(&vhost_net_misc); |
| 660 | if (r) | ||
| 661 | goto err_init; | ||
| 662 | r = misc_register(&vhost_net_misc); | ||
| 663 | if (r) | ||
| 664 | goto err_reg; | ||
| 665 | return 0; | ||
| 666 | err_reg: | ||
| 667 | vhost_cleanup(); | ||
| 668 | err_init: | ||
| 669 | return r; | ||
| 670 | |||
| 671 | } | 883 | } |
| 672 | module_init(vhost_net_init); | 884 | module_init(vhost_net_init); |
| 673 | 885 | ||
| 674 | static void vhost_net_exit(void) | 886 | static void vhost_net_exit(void) |
| 675 | { | 887 | { |
| 676 | misc_deregister(&vhost_net_misc); | 888 | misc_deregister(&vhost_net_misc); |
| 677 | vhost_cleanup(); | ||
| 678 | } | 889 | } |
| 679 | module_exit(vhost_net_exit); | 890 | module_exit(vhost_net_exit); |
| 680 | 891 | ||
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 248ed2db0711..e05557d52999 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
| @@ -17,12 +17,13 @@ | |||
| 17 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
| 18 | #include <linux/miscdevice.h> | 18 | #include <linux/miscdevice.h> |
| 19 | #include <linux/mutex.h> | 19 | #include <linux/mutex.h> |
| 20 | #include <linux/workqueue.h> | ||
| 21 | #include <linux/rcupdate.h> | 20 | #include <linux/rcupdate.h> |
| 22 | #include <linux/poll.h> | 21 | #include <linux/poll.h> |
| 23 | #include <linux/file.h> | 22 | #include <linux/file.h> |
| 24 | #include <linux/highmem.h> | 23 | #include <linux/highmem.h> |
| 25 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
| 25 | #include <linux/kthread.h> | ||
| 26 | #include <linux/cgroup.h> | ||
| 26 | 27 | ||
| 27 | #include <linux/net.h> | 28 | #include <linux/net.h> |
| 28 | #include <linux/if_packet.h> | 29 | #include <linux/if_packet.h> |
| @@ -37,8 +38,6 @@ enum { | |||
| 37 | VHOST_MEMORY_F_LOG = 0x1, | 38 | VHOST_MEMORY_F_LOG = 0x1, |
| 38 | }; | 39 | }; |
| 39 | 40 | ||
| 40 | static struct workqueue_struct *vhost_workqueue; | ||
| 41 | |||
| 42 | static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, | 41 | static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, |
| 43 | poll_table *pt) | 42 | poll_table *pt) |
| 44 | { | 43 | { |
| @@ -52,23 +51,31 @@ static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, | |||
| 52 | static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, | 51 | static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, |
| 53 | void *key) | 52 | void *key) |
| 54 | { | 53 | { |
| 55 | struct vhost_poll *poll; | 54 | struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait); |
| 56 | poll = container_of(wait, struct vhost_poll, wait); | 55 | |
| 57 | if (!((unsigned long)key & poll->mask)) | 56 | if (!((unsigned long)key & poll->mask)) |
| 58 | return 0; | 57 | return 0; |
| 59 | 58 | ||
| 60 | queue_work(vhost_workqueue, &poll->work); | 59 | vhost_poll_queue(poll); |
| 61 | return 0; | 60 | return 0; |
| 62 | } | 61 | } |
| 63 | 62 | ||
| 64 | /* Init poll structure */ | 63 | /* Init poll structure */ |
| 65 | void vhost_poll_init(struct vhost_poll *poll, work_func_t func, | 64 | void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, |
| 66 | unsigned long mask) | 65 | unsigned long mask, struct vhost_dev *dev) |
| 67 | { | 66 | { |
| 68 | INIT_WORK(&poll->work, func); | 67 | struct vhost_work *work = &poll->work; |
| 68 | |||
| 69 | init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); | 69 | init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); |
| 70 | init_poll_funcptr(&poll->table, vhost_poll_func); | 70 | init_poll_funcptr(&poll->table, vhost_poll_func); |
| 71 | poll->mask = mask; | 71 | poll->mask = mask; |
| 72 | poll->dev = dev; | ||
| 73 | |||
| 74 | INIT_LIST_HEAD(&work->node); | ||
| 75 | work->fn = fn; | ||
| 76 | init_waitqueue_head(&work->done); | ||
| 77 | work->flushing = 0; | ||
| 78 | work->queue_seq = work->done_seq = 0; | ||
| 72 | } | 79 | } |
| 73 | 80 | ||
| 74 | /* Start polling a file. We add ourselves to file's wait queue. The caller must | 81 | /* Start polling a file. We add ourselves to file's wait queue. The caller must |
| @@ -92,12 +99,40 @@ void vhost_poll_stop(struct vhost_poll *poll) | |||
| 92 | * locks that are also used by the callback. */ | 99 | * locks that are also used by the callback. */ |
| 93 | void vhost_poll_flush(struct vhost_poll *poll) | 100 | void vhost_poll_flush(struct vhost_poll *poll) |
| 94 | { | 101 | { |
| 95 | flush_work(&poll->work); | 102 | struct vhost_work *work = &poll->work; |
| 103 | unsigned seq; | ||
| 104 | int left; | ||
| 105 | int flushing; | ||
| 106 | |||
| 107 | spin_lock_irq(&poll->dev->work_lock); | ||
| 108 | seq = work->queue_seq; | ||
| 109 | work->flushing++; | ||
| 110 | spin_unlock_irq(&poll->dev->work_lock); | ||
| 111 | wait_event(work->done, ({ | ||
| 112 | spin_lock_irq(&poll->dev->work_lock); | ||
| 113 | left = seq - work->done_seq <= 0; | ||
| 114 | spin_unlock_irq(&poll->dev->work_lock); | ||
| 115 | left; | ||
| 116 | })); | ||
| 117 | spin_lock_irq(&poll->dev->work_lock); | ||
| 118 | flushing = --work->flushing; | ||
| 119 | spin_unlock_irq(&poll->dev->work_lock); | ||
| 120 | BUG_ON(flushing < 0); | ||
| 96 | } | 121 | } |
| 97 | 122 | ||
| 98 | void vhost_poll_queue(struct vhost_poll *poll) | 123 | void vhost_poll_queue(struct vhost_poll *poll) |
| 99 | { | 124 | { |
| 100 | queue_work(vhost_workqueue, &poll->work); | 125 | struct vhost_dev *dev = poll->dev; |
| 126 | struct vhost_work *work = &poll->work; | ||
| 127 | unsigned long flags; | ||
| 128 | |||
| 129 | spin_lock_irqsave(&dev->work_lock, flags); | ||
| 130 | if (list_empty(&work->node)) { | ||
| 131 | list_add_tail(&work->node, &dev->work_list); | ||
| 132 | work->queue_seq++; | ||
| 133 | wake_up_process(dev->worker); | ||
| 134 | } | ||
| 135 | spin_unlock_irqrestore(&dev->work_lock, flags); | ||
| 101 | } | 136 | } |
| 102 | 137 | ||
| 103 | static void vhost_vq_reset(struct vhost_dev *dev, | 138 | static void vhost_vq_reset(struct vhost_dev *dev, |
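The rewritten flush is what keeps the RCU-like convention from vhost.h working without a workqueue: vhost_poll_flush() snapshots queue_seq and waits until the worker's done_seq has caught up, so once it returns no handler queued before the flush can still be running. A writer therefore swaps a backend pointer under the vq mutex and then flushes, roughly like this (illustrative sketch in the style of vhost_net's backend-swap path, not the literal driver code):

/* Sketch: replace the backend socket for a virtqueue, then make sure no
 * worker can still be using the old one before the caller releases it. */
static void swap_backend(struct vhost_virtqueue *vq, struct vhost_poll *poll,
                         struct socket *newsock, struct socket **oldsock)
{
        mutex_lock(&vq->mutex);
        *oldsock = vq->private_data;
        rcu_assign_pointer(vq->private_data, newsock);
        mutex_unlock(&vq->mutex);

        /* The worker runs handlers one at a time, so once the flush
         * returns every handler queued before the swap has finished. */
        vhost_poll_flush(poll);
}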
| @@ -114,7 +149,8 @@ static void vhost_vq_reset(struct vhost_dev *dev, | |||
| 114 | vq->used_flags = 0; | 149 | vq->used_flags = 0; |
| 115 | vq->log_used = false; | 150 | vq->log_used = false; |
| 116 | vq->log_addr = -1ull; | 151 | vq->log_addr = -1ull; |
| 117 | vq->hdr_size = 0; | 152 | vq->vhost_hlen = 0; |
| 153 | vq->sock_hlen = 0; | ||
| 118 | vq->private_data = NULL; | 154 | vq->private_data = NULL; |
| 119 | vq->log_base = NULL; | 155 | vq->log_base = NULL; |
| 120 | vq->error_ctx = NULL; | 156 | vq->error_ctx = NULL; |
| @@ -125,10 +161,51 @@ static void vhost_vq_reset(struct vhost_dev *dev, | |||
| 125 | vq->log_ctx = NULL; | 161 | vq->log_ctx = NULL; |
| 126 | } | 162 | } |
| 127 | 163 | ||
| 164 | static int vhost_worker(void *data) | ||
| 165 | { | ||
| 166 | struct vhost_dev *dev = data; | ||
| 167 | struct vhost_work *work = NULL; | ||
| 168 | unsigned uninitialized_var(seq); | ||
| 169 | |||
| 170 | for (;;) { | ||
| 171 | /* mb paired w/ kthread_stop */ | ||
| 172 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 173 | |||
| 174 | spin_lock_irq(&dev->work_lock); | ||
| 175 | if (work) { | ||
| 176 | work->done_seq = seq; | ||
| 177 | if (work->flushing) | ||
| 178 | wake_up_all(&work->done); | ||
| 179 | } | ||
| 180 | |||
| 181 | if (kthread_should_stop()) { | ||
| 182 | spin_unlock_irq(&dev->work_lock); | ||
| 183 | __set_current_state(TASK_RUNNING); | ||
| 184 | return 0; | ||
| 185 | } | ||
| 186 | if (!list_empty(&dev->work_list)) { | ||
| 187 | work = list_first_entry(&dev->work_list, | ||
| 188 | struct vhost_work, node); | ||
| 189 | list_del_init(&work->node); | ||
| 190 | seq = work->queue_seq; | ||
| 191 | } else | ||
| 192 | work = NULL; | ||
| 193 | spin_unlock_irq(&dev->work_lock); | ||
| 194 | |||
| 195 | if (work) { | ||
| 196 | __set_current_state(TASK_RUNNING); | ||
| 197 | work->fn(work); | ||
| 198 | } else | ||
| 199 | schedule(); | ||
| 200 | |||
| 201 | } | ||
| 202 | } | ||
| 203 | |||
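vhost_worker() is a single-consumer kthread loop: report completion of the previous item (for flushers), pop the next item under the lock, run it in TASK_RUNNING, or schedule() when the list is empty; kthread_should_stop() is checked with the sleep state already set so a concurrent stop cannot be missed. The same skeleton in a simplified, self-contained form (names are hypothetical, flush bookkeeping omitted):

#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct simple_work {
        struct list_head node;
        void (*fn)(struct simple_work *work);
};

struct simple_worker {
        spinlock_t lock;
        struct list_head list;          /* queued simple_work items */
};

static int simple_worker_fn(void *data)
{
        struct simple_worker *w = data;
        struct simple_work *work;

        for (;;) {
                /* Set the state before checking, so a wake-up between the
                 * check and schedule() is not lost. */
                set_current_state(TASK_INTERRUPTIBLE);

                if (kthread_should_stop()) {
                        __set_current_state(TASK_RUNNING);
                        break;
                }

                spin_lock_irq(&w->lock);
                if (!list_empty(&w->list)) {
                        work = list_first_entry(&w->list,
                                                struct simple_work, node);
                        list_del_init(&work->node);
                } else
                        work = NULL;
                spin_unlock_irq(&w->lock);

                if (work) {
                        __set_current_state(TASK_RUNNING);
                        work->fn(work);         /* run while runnable */
                } else
                        schedule();             /* sleep until queued/stopped */
        }
        return 0;
}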
| 128 | long vhost_dev_init(struct vhost_dev *dev, | 204 | long vhost_dev_init(struct vhost_dev *dev, |
| 129 | struct vhost_virtqueue *vqs, int nvqs) | 205 | struct vhost_virtqueue *vqs, int nvqs) |
| 130 | { | 206 | { |
| 131 | int i; | 207 | int i; |
| 208 | |||
| 132 | dev->vqs = vqs; | 209 | dev->vqs = vqs; |
| 133 | dev->nvqs = nvqs; | 210 | dev->nvqs = nvqs; |
| 134 | mutex_init(&dev->mutex); | 211 | mutex_init(&dev->mutex); |
| @@ -136,6 +213,9 @@ long vhost_dev_init(struct vhost_dev *dev, | |||
| 136 | dev->log_file = NULL; | 213 | dev->log_file = NULL; |
| 137 | dev->memory = NULL; | 214 | dev->memory = NULL; |
| 138 | dev->mm = NULL; | 215 | dev->mm = NULL; |
| 216 | spin_lock_init(&dev->work_lock); | ||
| 217 | INIT_LIST_HEAD(&dev->work_list); | ||
| 218 | dev->worker = NULL; | ||
| 139 | 219 | ||
| 140 | for (i = 0; i < dev->nvqs; ++i) { | 220 | for (i = 0; i < dev->nvqs; ++i) { |
| 141 | dev->vqs[i].dev = dev; | 221 | dev->vqs[i].dev = dev; |
| @@ -143,9 +223,9 @@ long vhost_dev_init(struct vhost_dev *dev, | |||
| 143 | vhost_vq_reset(dev, dev->vqs + i); | 223 | vhost_vq_reset(dev, dev->vqs + i); |
| 144 | if (dev->vqs[i].handle_kick) | 224 | if (dev->vqs[i].handle_kick) |
| 145 | vhost_poll_init(&dev->vqs[i].poll, | 225 | vhost_poll_init(&dev->vqs[i].poll, |
| 146 | dev->vqs[i].handle_kick, | 226 | dev->vqs[i].handle_kick, POLLIN, dev); |
| 147 | POLLIN); | ||
| 148 | } | 227 | } |
| 228 | |||
| 149 | return 0; | 229 | return 0; |
| 150 | } | 230 | } |
| 151 | 231 | ||
| @@ -159,12 +239,36 @@ long vhost_dev_check_owner(struct vhost_dev *dev) | |||
| 159 | /* Caller should have device mutex */ | 239 | /* Caller should have device mutex */ |
| 160 | static long vhost_dev_set_owner(struct vhost_dev *dev) | 240 | static long vhost_dev_set_owner(struct vhost_dev *dev) |
| 161 | { | 241 | { |
| 242 | struct task_struct *worker; | ||
| 243 | int err; | ||
| 162 | /* Is there an owner already? */ | 244 | /* Is there an owner already? */ |
| 163 | if (dev->mm) | 245 | if (dev->mm) { |
| 164 | return -EBUSY; | 246 | err = -EBUSY; |
| 247 | goto err_mm; | ||
| 248 | } | ||
| 165 | /* No owner, become one */ | 249 | /* No owner, become one */ |
| 166 | dev->mm = get_task_mm(current); | 250 | dev->mm = get_task_mm(current); |
| 251 | worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid); | ||
| 252 | if (IS_ERR(worker)) { | ||
| 253 | err = PTR_ERR(worker); | ||
| 254 | goto err_worker; | ||
| 255 | } | ||
| 256 | |||
| 257 | dev->worker = worker; | ||
| 258 | err = cgroup_attach_task_current_cg(worker); | ||
| 259 | if (err) | ||
| 260 | goto err_cgroup; | ||
| 261 | wake_up_process(worker); /* avoid contributing to loadavg */ | ||
| 262 | |||
| 167 | return 0; | 263 | return 0; |
| 264 | err_cgroup: | ||
| 265 | kthread_stop(worker); | ||
| 266 | err_worker: | ||
| 267 | if (dev->mm) | ||
| 268 | mmput(dev->mm); | ||
| 269 | dev->mm = NULL; | ||
| 270 | err_mm: | ||
| 271 | return err; | ||
| 168 | } | 272 | } |
| 169 | 273 | ||
| 170 | /* Caller should have device mutex */ | 274 | /* Caller should have device mutex */ |
| @@ -217,6 +321,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev) | |||
| 217 | if (dev->mm) | 321 | if (dev->mm) |
| 218 | mmput(dev->mm); | 322 | mmput(dev->mm); |
| 219 | dev->mm = NULL; | 323 | dev->mm = NULL; |
| 324 | |||
| 325 | WARN_ON(!list_empty(&dev->work_list)); | ||
| 326 | kthread_stop(dev->worker); | ||
| 220 | } | 327 | } |
| 221 | 328 | ||
| 222 | static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) | 329 | static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) |
| @@ -995,9 +1102,9 @@ int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq, | |||
| 995 | } | 1102 | } |
| 996 | 1103 | ||
| 997 | /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ | 1104 | /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ |
| 998 | void vhost_discard_vq_desc(struct vhost_virtqueue *vq) | 1105 | void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n) |
| 999 | { | 1106 | { |
| 1000 | vq->last_avail_idx--; | 1107 | vq->last_avail_idx -= n; |
| 1001 | } | 1108 | } |
| 1002 | 1109 | ||
| 1003 | /* After we've used one of their buffers, we tell them about it. We'll then | 1110 | /* After we've used one of their buffers, we tell them about it. We'll then |
| @@ -1042,6 +1149,67 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) | |||
| 1042 | return 0; | 1149 | return 0; |
| 1043 | } | 1150 | } |
| 1044 | 1151 | ||
| 1152 | static int __vhost_add_used_n(struct vhost_virtqueue *vq, | ||
| 1153 | struct vring_used_elem *heads, | ||
| 1154 | unsigned count) | ||
| 1155 | { | ||
| 1156 | struct vring_used_elem __user *used; | ||
| 1157 | int start; | ||
| 1158 | |||
| 1159 | start = vq->last_used_idx % vq->num; | ||
| 1160 | used = vq->used->ring + start; | ||
| 1161 | if (copy_to_user(used, heads, count * sizeof *used)) { | ||
| 1162 | vq_err(vq, "Failed to write used"); | ||
| 1163 | return -EFAULT; | ||
| 1164 | } | ||
| 1165 | if (unlikely(vq->log_used)) { | ||
| 1166 | /* Make sure data is seen before log. */ | ||
| 1167 | smp_wmb(); | ||
| 1168 | /* Log used ring entry write. */ | ||
| 1169 | log_write(vq->log_base, | ||
| 1170 | vq->log_addr + | ||
| 1171 | ((void __user *)used - (void __user *)vq->used), | ||
| 1172 | count * sizeof *used); | ||
| 1173 | } | ||
| 1174 | vq->last_used_idx += count; | ||
| 1175 | return 0; | ||
| 1176 | } | ||
| 1177 | |||
| 1178 | /* After we've used one of their buffers, we tell them about it. We'll then | ||
| 1179 | * want to notify the guest, using eventfd. */ | ||
| 1180 | int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, | ||
| 1181 | unsigned count) | ||
| 1182 | { | ||
| 1183 | int start, n, r; | ||
| 1184 | |||
| 1185 | start = vq->last_used_idx % vq->num; | ||
| 1186 | n = vq->num - start; | ||
| 1187 | if (n < count) { | ||
| 1188 | r = __vhost_add_used_n(vq, heads, n); | ||
| 1189 | if (r < 0) | ||
| 1190 | return r; | ||
| 1191 | heads += n; | ||
| 1192 | count -= n; | ||
| 1193 | } | ||
| 1194 | r = __vhost_add_used_n(vq, heads, count); | ||
| 1195 | |||
| 1196 | /* Make sure buffer is written before we update index. */ | ||
| 1197 | smp_wmb(); | ||
| 1198 | if (put_user(vq->last_used_idx, &vq->used->idx)) { | ||
| 1199 | vq_err(vq, "Failed to increment used idx"); | ||
| 1200 | return -EFAULT; | ||
| 1201 | } | ||
| 1202 | if (unlikely(vq->log_used)) { | ||
| 1203 | /* Log used index update. */ | ||
| 1204 | log_write(vq->log_base, | ||
| 1205 | vq->log_addr + offsetof(struct vring_used, idx), | ||
| 1206 | sizeof vq->used->idx); | ||
| 1207 | if (vq->log_ctx) | ||
| 1208 | eventfd_signal(vq->log_ctx, 1); | ||
| 1209 | } | ||
| 1210 | return r; | ||
| 1211 | } | ||
| 1212 | |||
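vhost_add_used_n() has to cope with a batch of used elements that wraps past the end of the used ring: last_used_idx is a free-running counter, so the write is split into at most two copy_to_user() calls. A tiny sketch of the split arithmetic with made-up numbers:

#include <stdio.h>

int main(void)
{
        unsigned num = 256;             /* ring size */
        unsigned last_used_idx = 250;   /* free-running used counter */
        unsigned count = 10;            /* batch of used heads to publish */
        unsigned start = last_used_idx % num;
        unsigned first = num - start;   /* slots left before the ring end */

        if (first < count)
                printf("split: %u entries at slot %u, then %u at slot 0\n",
                       first, start, count - first);
        else
                printf("one copy of %u entries at slot %u\n", count, start);
        /* prints: split: 6 entries at slot 250, then 4 at slot 0 */
        return 0;
}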
| 1045 | /* This actually signals the guest, using eventfd. */ | 1213 | /* This actually signals the guest, using eventfd. */ |
| 1046 | void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) | 1214 | void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) |
| 1047 | { | 1215 | { |
| @@ -1076,6 +1244,15 @@ void vhost_add_used_and_signal(struct vhost_dev *dev, | |||
| 1076 | vhost_signal(dev, vq); | 1244 | vhost_signal(dev, vq); |
| 1077 | } | 1245 | } |
| 1078 | 1246 | ||
| 1247 | /* multi-buffer version of vhost_add_used_and_signal */ | ||
| 1248 | void vhost_add_used_and_signal_n(struct vhost_dev *dev, | ||
| 1249 | struct vhost_virtqueue *vq, | ||
| 1250 | struct vring_used_elem *heads, unsigned count) | ||
| 1251 | { | ||
| 1252 | vhost_add_used_n(vq, heads, count); | ||
| 1253 | vhost_signal(dev, vq); | ||
| 1254 | } | ||
| 1255 | |||
| 1079 | /* OK, now we need to know about added descriptors. */ | 1256 | /* OK, now we need to know about added descriptors. */ |
| 1080 | bool vhost_enable_notify(struct vhost_virtqueue *vq) | 1257 | bool vhost_enable_notify(struct vhost_virtqueue *vq) |
| 1081 | { | 1258 | { |
| @@ -1100,7 +1277,7 @@ bool vhost_enable_notify(struct vhost_virtqueue *vq) | |||
| 1100 | return false; | 1277 | return false; |
| 1101 | } | 1278 | } |
| 1102 | 1279 | ||
| 1103 | return avail_idx != vq->last_avail_idx; | 1280 | return avail_idx != vq->avail_idx; |
| 1104 | } | 1281 | } |
| 1105 | 1282 | ||
| 1106 | /* We don't need to be notified again. */ | 1283 | /* We don't need to be notified again. */ |
| @@ -1115,16 +1292,3 @@ void vhost_disable_notify(struct vhost_virtqueue *vq) | |||
| 1115 | vq_err(vq, "Failed to enable notification at %p: %d\n", | 1292 | vq_err(vq, "Failed to enable notification at %p: %d\n", |
| 1116 | &vq->used->flags, r); | 1293 | &vq->used->flags, r); |
| 1117 | } | 1294 | } |
| 1118 | |||
| 1119 | int vhost_init(void) | ||
| 1120 | { | ||
| 1121 | vhost_workqueue = create_singlethread_workqueue("vhost"); | ||
| 1122 | if (!vhost_workqueue) | ||
| 1123 | return -ENOMEM; | ||
| 1124 | return 0; | ||
| 1125 | } | ||
| 1126 | |||
| 1127 | void vhost_cleanup(void) | ||
| 1128 | { | ||
| 1129 | destroy_workqueue(vhost_workqueue); | ||
| 1130 | } | ||
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 11ee13dba0f7..afd77295971c 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
| @@ -5,13 +5,13 @@ | |||
| 5 | #include <linux/vhost.h> | 5 | #include <linux/vhost.h> |
| 6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
| 7 | #include <linux/mutex.h> | 7 | #include <linux/mutex.h> |
| 8 | #include <linux/workqueue.h> | ||
| 9 | #include <linux/poll.h> | 8 | #include <linux/poll.h> |
| 10 | #include <linux/file.h> | 9 | #include <linux/file.h> |
| 11 | #include <linux/skbuff.h> | 10 | #include <linux/skbuff.h> |
| 12 | #include <linux/uio.h> | 11 | #include <linux/uio.h> |
| 13 | #include <linux/virtio_config.h> | 12 | #include <linux/virtio_config.h> |
| 14 | #include <linux/virtio_ring.h> | 13 | #include <linux/virtio_ring.h> |
| 14 | #include <asm/atomic.h> | ||
| 15 | 15 | ||
| 16 | struct vhost_device; | 16 | struct vhost_device; |
| 17 | 17 | ||
| @@ -20,19 +20,31 @@ enum { | |||
| 20 | VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2, | 20 | VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2, |
| 21 | }; | 21 | }; |
| 22 | 22 | ||
| 23 | struct vhost_work; | ||
| 24 | typedef void (*vhost_work_fn_t)(struct vhost_work *work); | ||
| 25 | |||
| 26 | struct vhost_work { | ||
| 27 | struct list_head node; | ||
| 28 | vhost_work_fn_t fn; | ||
| 29 | wait_queue_head_t done; | ||
| 30 | int flushing; | ||
| 31 | unsigned queue_seq; | ||
| 32 | unsigned done_seq; | ||
| 33 | }; | ||
| 34 | |||
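Handlers get no context argument: the vhost_work is embedded in vhost_poll, which is in turn embedded in vhost_virtqueue (or vhost_net), and the handlers recover their device with container_of(), as in the reworked handle_tx_kick()/handle_rx_net() in net.c above. A minimal standalone illustration of that embedding pattern (hypothetical names):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct work_item {
        void (*fn)(struct work_item *w);
};

struct widget {
        int id;
        struct work_item work;          /* embedded, like vhost_poll.work */
};

static void widget_fn(struct work_item *w)
{
        /* Recover the containing object from the embedded member. */
        struct widget *wd = container_of(w, struct widget, work);

        printf("running work for widget %d\n", wd->id);
}

int main(void)
{
        struct widget wd = { .id = 7, .work = { .fn = widget_fn } };

        wd.work.fn(&wd.work);           /* prints: running work for widget 7 */
        return 0;
}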
| 23 | /* Poll a file (eventfd or socket) */ | 35 | /* Poll a file (eventfd or socket) */ |
| 24 | /* Note: there's nothing vhost specific about this structure. */ | 36 | /* Note: there's nothing vhost specific about this structure. */ |
| 25 | struct vhost_poll { | 37 | struct vhost_poll { |
| 26 | poll_table table; | 38 | poll_table table; |
| 27 | wait_queue_head_t *wqh; | 39 | wait_queue_head_t *wqh; |
| 28 | wait_queue_t wait; | 40 | wait_queue_t wait; |
| 29 | /* struct which will handle all actual work. */ | 41 | struct vhost_work work; |
| 30 | struct work_struct work; | ||
| 31 | unsigned long mask; | 42 | unsigned long mask; |
| 43 | struct vhost_dev *dev; | ||
| 32 | }; | 44 | }; |
| 33 | 45 | ||
| 34 | void vhost_poll_init(struct vhost_poll *poll, work_func_t func, | 46 | void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, |
| 35 | unsigned long mask); | 47 | unsigned long mask, struct vhost_dev *dev); |
| 36 | void vhost_poll_start(struct vhost_poll *poll, struct file *file); | 48 | void vhost_poll_start(struct vhost_poll *poll, struct file *file); |
| 37 | void vhost_poll_stop(struct vhost_poll *poll); | 49 | void vhost_poll_stop(struct vhost_poll *poll); |
| 38 | void vhost_poll_flush(struct vhost_poll *poll); | 50 | void vhost_poll_flush(struct vhost_poll *poll); |
| @@ -63,7 +75,7 @@ struct vhost_virtqueue { | |||
| 63 | struct vhost_poll poll; | 75 | struct vhost_poll poll; |
| 64 | 76 | ||
| 65 | /* The routine to call when the Guest pings us, or timeout. */ | 77 | /* The routine to call when the Guest pings us, or timeout. */ |
| 66 | work_func_t handle_kick; | 78 | vhost_work_fn_t handle_kick; |
| 67 | 79 | ||
| 68 | /* Last available index we saw. */ | 80 | /* Last available index we saw. */ |
| 69 | u16 last_avail_idx; | 81 | u16 last_avail_idx; |
| @@ -84,13 +96,15 @@ struct vhost_virtqueue { | |||
| 84 | struct iovec indirect[VHOST_NET_MAX_SG]; | 96 | struct iovec indirect[VHOST_NET_MAX_SG]; |
| 85 | struct iovec iov[VHOST_NET_MAX_SG]; | 97 | struct iovec iov[VHOST_NET_MAX_SG]; |
| 86 | struct iovec hdr[VHOST_NET_MAX_SG]; | 98 | struct iovec hdr[VHOST_NET_MAX_SG]; |
| 87 | size_t hdr_size; | 99 | size_t vhost_hlen; |
| 100 | size_t sock_hlen; | ||
| 101 | struct vring_used_elem heads[VHOST_NET_MAX_SG]; | ||
| 88 | /* We use a kind of RCU to access private pointer. | 102 | /* We use a kind of RCU to access private pointer. |
| 89 | * All readers access it from workqueue, which makes it possible to | 103 | * All readers access it from worker, which makes it possible to |
| 90 | * flush the workqueue instead of synchronize_rcu. Therefore readers do | 104 | * flush the vhost_work instead of synchronize_rcu. Therefore readers do |
| 91 | * not need to call rcu_read_lock/rcu_read_unlock: the beginning of | 105 | * not need to call rcu_read_lock/rcu_read_unlock: the beginning of |
| 92 | * work item execution acts instead of rcu_read_lock() and the end of | 106 | * vhost_work execution acts instead of rcu_read_lock() and the end of |
| 93 | * work item execution acts instead of rcu_read_lock(). | 107 | * vhost_work execution acts instead of rcu_read_lock(). |
| 94 | * Writers use virtqueue mutex. */ | 108 | * Writers use virtqueue mutex. */ |
| 95 | void *private_data; | 109 | void *private_data; |
| 96 | /* Log write descriptors */ | 110 | /* Log write descriptors */ |
| @@ -110,6 +124,9 @@ struct vhost_dev { | |||
| 110 | int nvqs; | 124 | int nvqs; |
| 111 | struct file *log_file; | 125 | struct file *log_file; |
| 112 | struct eventfd_ctx *log_ctx; | 126 | struct eventfd_ctx *log_ctx; |
| 127 | spinlock_t work_lock; | ||
| 128 | struct list_head work_list; | ||
| 129 | struct task_struct *worker; | ||
| 113 | }; | 130 | }; |
| 114 | 131 | ||
| 115 | long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); | 132 | long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); |
| @@ -124,21 +141,22 @@ int vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *, | |||
| 124 | struct iovec iov[], unsigned int iov_count, | 141 | struct iovec iov[], unsigned int iov_count, |
| 125 | unsigned int *out_num, unsigned int *in_num, | 142 | unsigned int *out_num, unsigned int *in_num, |
| 126 | struct vhost_log *log, unsigned int *log_num); | 143 | struct vhost_log *log, unsigned int *log_num); |
| 127 | void vhost_discard_vq_desc(struct vhost_virtqueue *); | 144 | void vhost_discard_vq_desc(struct vhost_virtqueue *, int n); |
| 128 | 145 | ||
| 129 | int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); | 146 | int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); |
| 130 | void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *); | 147 | int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads, |
| 148 | unsigned count); | ||
| 131 | void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *, | 149 | void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *, |
| 132 | unsigned int head, int len); | 150 | unsigned int id, int len); |
| 151 | void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *, | ||
| 152 | struct vring_used_elem *heads, unsigned count); | ||
| 153 | void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *); | ||
| 133 | void vhost_disable_notify(struct vhost_virtqueue *); | 154 | void vhost_disable_notify(struct vhost_virtqueue *); |
| 134 | bool vhost_enable_notify(struct vhost_virtqueue *); | 155 | bool vhost_enable_notify(struct vhost_virtqueue *); |
| 135 | 156 | ||
| 136 | int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, | 157 | int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, |
| 137 | unsigned int log_num, u64 len); | 158 | unsigned int log_num, u64 len); |
| 138 | 159 | ||
| 139 | int vhost_init(void); | ||
| 140 | void vhost_cleanup(void); | ||
| 141 | |||
| 142 | #define vq_err(vq, fmt, ...) do { \ | 160 | #define vq_err(vq, fmt, ...) do { \ |
| 143 | pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ | 161 | pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ |
| 144 | if ((vq)->error_ctx) \ | 162 | if ((vq)->error_ctx) \ |
| @@ -149,7 +167,8 @@ enum { | |||
| 149 | VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) | | 167 | VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) | |
| 150 | (1 << VIRTIO_RING_F_INDIRECT_DESC) | | 168 | (1 << VIRTIO_RING_F_INDIRECT_DESC) | |
| 151 | (1 << VHOST_F_LOG_ALL) | | 169 | (1 << VHOST_F_LOG_ALL) | |
| 152 | (1 << VHOST_NET_F_VIRTIO_NET_HDR), | 170 | (1 << VHOST_NET_F_VIRTIO_NET_HDR) | |
| 171 | (1 << VIRTIO_NET_F_MRG_RXBUF), | ||
| 153 | }; | 172 | }; |
| 154 | 173 | ||
| 155 | static inline int vhost_has_feature(struct vhost_dev *dev, int bit) | 174 | static inline int vhost_has_feature(struct vhost_dev *dev, int bit) |
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0c621604baa1..e0aa067d1b11 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
| @@ -570,6 +570,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | |||
| 570 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); | 570 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); |
| 571 | int cgroup_scan_tasks(struct cgroup_scanner *scan); | 571 | int cgroup_scan_tasks(struct cgroup_scanner *scan); |
| 572 | int cgroup_attach_task(struct cgroup *, struct task_struct *); | 572 | int cgroup_attach_task(struct cgroup *, struct task_struct *); |
| 573 | int cgroup_attach_task_current_cg(struct task_struct *); | ||
| 573 | 574 | ||
| 574 | /* | 575 | /* |
| 575 | * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works | 576 | * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works |
| @@ -626,6 +627,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats, | |||
| 626 | return -EINVAL; | 627 | return -EINVAL; |
| 627 | } | 628 | } |
| 628 | 629 | ||
| 630 | /* No cgroups - nothing to do */ | ||
| 631 | static inline int cgroup_attach_task_current_cg(struct task_struct *t) | ||
| 632 | { | ||
| 633 | return 0; | ||
| 634 | } | ||
| 635 | |||
| 629 | #endif /* !CONFIG_CGROUPS */ | 636 | #endif /* !CONFIG_CGROUPS */ |
| 630 | 637 | ||
| 631 | #endif /* _LINUX_CGROUP_H */ | 638 | #endif /* _LINUX_CGROUP_H */ |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 422cb19f156e..37642ad9cca8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
| @@ -1788,6 +1788,29 @@ out: | |||
| 1788 | return retval; | 1788 | return retval; |
| 1789 | } | 1789 | } |
| 1790 | 1790 | ||
| 1791 | /** | ||
| 1792 | * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup | ||
| 1793 | * @tsk: the task to be attached | ||
| 1794 | */ | ||
| 1795 | int cgroup_attach_task_current_cg(struct task_struct *tsk) | ||
| 1796 | { | ||
| 1797 | struct cgroupfs_root *root; | ||
| 1798 | struct cgroup *cur_cg; | ||
| 1799 | int retval = 0; | ||
| 1800 | |||
| 1801 | cgroup_lock(); | ||
| 1802 | for_each_active_root(root) { | ||
| 1803 | cur_cg = task_cgroup_from_root(current, root); | ||
| 1804 | retval = cgroup_attach_task(cur_cg, tsk); | ||
| 1805 | if (retval) | ||
| 1806 | break; | ||
| 1807 | } | ||
| 1808 | cgroup_unlock(); | ||
| 1809 | |||
| 1810 | return retval; | ||
| 1811 | } | ||
| 1812 | EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg); | ||
| 1813 | |||
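This helper is what lets a driver charge a freshly created kernel thread to the cgroups of the task that requested it, across every mounted hierarchy; vhost_dev_set_owner() above is its first user. A hedged caller sketch (struct my_dev and my_worker_fn are hypothetical placeholders, error handling follows the vhost pattern):

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* my_dev and my_worker_fn are placeholders for the caller's own types. */
static int start_accounted_worker(struct my_dev *dev)
{
        struct task_struct *worker;
        int err;

        worker = kthread_create(my_worker_fn, dev, "my-worker-%d",
                                current->pid);
        if (IS_ERR(worker))
                return PTR_ERR(worker);

        /* Move the new thread into every cgroup the caller belongs to, so
         * its CPU and memory use is charged to the caller, not to root. */
        err = cgroup_attach_task_current_cg(worker);
        if (err) {
                kthread_stop(worker);
                return err;
        }

        dev->worker = worker;
        wake_up_process(worker);
        return 0;
}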
| 1791 | /* | 1814 | /* |
| 1792 | * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex | 1815 | * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex |
| 1793 | * held. May take task_lock of task | 1816 | * held. May take task_lock of task |
