author		Andy Grover <andy.grover@oracle.com>	2010-10-28 11:40:58 -0400
committer	David S. Miller <davem@davemloft.net>	2010-10-30 19:34:17 -0400
commit		fc8162e3c034af743d8def435fda6396603d321f (patch)
tree		b003a652740eb0de1fe71c634618b6666e9bae3c /net
parent		f4a3fc03c1d73753879fb655b8cd628b29f6706b (diff)
RDS: Copy rds_iovecs into kernel memory instead of rereading from userspace
Change rds_rdma_pages to take a passed-in rds_iovec array instead
of doing copy_from_user itself.
Change rds_cmsg_rdma_args to copy the rds_iovec array only once. This
eliminates the possibility of userspace changing it after our
sanity checks.
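To make the hazard concrete, here is a schematic before/after of the flow in
rds_cmsg_rdma_args (a simplified sketch, not a compilable excerpt; error
handling dropped):

	/* Before: the user iovec array is fetched from userspace twice. */
	nr_pages = rds_rdma_pages(args);	/* read #1: sizes and validates each entry */
	...
	copy_from_user(&vec, &local_vec[i], sizeof(vec));	/* read #2, per entry, in the pin loop */
	/* A racing user thread can rewrite local_vec[i] between read #1 and
	 * read #2, so the values actually used were never the ones checked. */

	/* After: one copy into kernel memory; check and use the same snapshot. */
	copy_from_user(iovs, local_vec, args->nr_local * sizeof(*iovs));
	nr_pages = rds_rdma_pages(iovs, args->nr_local);	/* validates the kernel copy */
	/* ...the pin loop below walks the same iovs[], which userspace cannot touch. */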
Implement stack-based storage for small numbers of iovecs, based
on net/socket.c, to save an alloc in the extremely common case.
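The stack-or-heap buffer follows the same idiom net/socket.c uses for message
iovecs: a small on-stack array of UIO_FASTIOV entries covers typical requests,
and only larger vectors pay for a sock_kmalloc(). Condensed from the hunks
below (error paths trimmed):

	struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
	int iov_size = args->nr_local * sizeof(struct rds_iovec);

	/* heap allocation only when the on-stack array is too small */
	if (args->nr_local > UIO_FASTIOV)
		iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);

	copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long)
		       args->local_vec_addr, iov_size);
	...
out:
	if (iovs != iovstack)	/* free only what was sock_kmalloc'ed */
		sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);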
Although this patch reduces iovec copies in cmsg_rdma_args to 1,
we still do another one in rds_rdma_extra_size. Getting rid of
that one will be trickier, so it'll be a separate patch.
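For reference, the copy that remains is the sizing pass in rds_rdma_extra_size(),
whose loop body sits in the unchanged lines between the first two hunks and still
reads each entry from userspace on its own; roughly:

	for (i = 0; i < args->nr_local; i++) {
		if (copy_from_user(&vec, &local_vec[i], sizeof(struct rds_iovec)))
			return -EFAULT;
		...
	}

So after this patch the total is that per-entry read plus the single bulk copy in
rds_cmsg_rdma_args.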
Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--	net/rds/rdma.c	104
1 files changed, 65 insertions, 39 deletions
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 334acdd32ab6..caa4d9866d92 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -479,13 +479,38 @@ void rds_atomic_free_op(struct rm_atomic_op *ao)
 
 
 /*
- * Count the number of pages needed to describe an incoming iovec.
+ * Count the number of pages needed to describe an incoming iovec array.
  */
-static int rds_rdma_pages(struct rds_rdma_args *args)
+static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
+{
+	int tot_pages = 0;
+	unsigned int nr_pages;
+	unsigned int i;
+
+	/* figure out the number of pages in the vector */
+	for (i = 0; i < nr_iovecs; i++) {
+		nr_pages = rds_pages_in_vec(&iov[i]);
+		if (nr_pages == 0)
+			return -EINVAL;
+
+		tot_pages += nr_pages;
+
+		/*
+		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
+		 * so tot_pages cannot overflow without first going negative.
+		 */
+		if (tot_pages < 0)
+			return -EINVAL;
+	}
+
+	return tot_pages;
+}
+
+int rds_rdma_extra_size(struct rds_rdma_args *args)
 {
 	struct rds_iovec vec;
 	struct rds_iovec __user *local_vec;
-	unsigned int tot_pages = 0;
+	int tot_pages = 0;
 	unsigned int nr_pages;
 	unsigned int i;
 
@@ -507,16 +532,11 @@ static int rds_rdma_pages(struct rds_rdma_args *args)
 		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
 		 * so tot_pages cannot overflow without first going negative.
 		 */
-		if ((int)tot_pages < 0)
+		if (tot_pages < 0)
 			return -EINVAL;
 	}
 
-	return tot_pages;
-}
-
-int rds_rdma_extra_size(struct rds_rdma_args *args)
-{
-	return rds_rdma_pages(args) * sizeof(struct scatterlist);
+	return tot_pages * sizeof(struct scatterlist);
 }
 
 /*
@@ -527,13 +547,12 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 			  struct cmsghdr *cmsg)
 {
 	struct rds_rdma_args *args;
-	struct rds_iovec vec;
 	struct rm_rdma_op *op = &rm->rdma;
 	int nr_pages;
 	unsigned int nr_bytes;
 	struct page **pages = NULL;
-	struct rds_iovec __user *local_vec;
-	unsigned int nr;
+	struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
+	int iov_size;
 	unsigned int i, j;
 	int ret = 0;
 
@@ -553,7 +572,22 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		goto out;
 	}
 
-	nr_pages = rds_rdma_pages(args);
+	/* Check whether to allocate the iovec area */
+	iov_size = args->nr_local * sizeof(struct rds_iovec);
+	if (args->nr_local > UIO_FASTIOV) {
+		iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
+		if (!iovs) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	nr_pages = rds_rdma_pages(iovs, args->nr_local);
 	if (nr_pages < 0) {
 		ret = -EINVAL;
 		goto out;
@@ -606,50 +640,40 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		       (unsigned long long)args->remote_vec.addr,
 		       op->op_rkey);
 
-	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
-
 	for (i = 0; i < args->nr_local; i++) {
-		if (copy_from_user(&vec, &local_vec[i],
-				   sizeof(struct rds_iovec))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		nr = rds_pages_in_vec(&vec);
-		if (nr == 0) {
-			ret = -EINVAL;
-			goto out;
-		}
+		struct rds_iovec *iov = &iovs[i];
+		/* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
+		unsigned int nr = rds_pages_in_vec(iov);
 
-		rs->rs_user_addr = vec.addr;
-		rs->rs_user_bytes = vec.bytes;
+		rs->rs_user_addr = iov->addr;
+		rs->rs_user_bytes = iov->bytes;
 
 		/* If it's a WRITE operation, we want to pin the pages for reading.
 		 * If it's a READ operation, we need to pin the pages for writing.
 		 */
-		ret = rds_pin_pages(vec.addr, nr, pages, !op->op_write);
+		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
 		if (ret < 0)
 			goto out;
 
-		rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n",
-			 nr_bytes, nr, vec.bytes, vec.addr);
+		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
+			 nr_bytes, nr, iov->bytes, iov->addr);
 
-		nr_bytes += vec.bytes;
+		nr_bytes += iov->bytes;
 
 		for (j = 0; j < nr; j++) {
-			unsigned int offset = vec.addr & ~PAGE_MASK;
+			unsigned int offset = iov->addr & ~PAGE_MASK;
 			struct scatterlist *sg;
 
 			sg = &op->op_sg[op->op_nents + j];
 			sg_set_page(sg, pages[j],
-					min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
+					min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
 					offset);
 
-			rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n",
-				 sg->offset, sg->length, vec.addr, vec.bytes);
+			rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
+				 sg->offset, sg->length, iov->addr, iov->bytes);
 
-			vec.addr += sg->length;
-			vec.bytes -= sg->length;
+			iov->addr += sg->length;
+			iov->bytes -= sg->length;
 		}
 
 		op->op_nents += nr;
@@ -665,6 +689,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 	op->op_bytes = nr_bytes;
 
 out:
+	if (iovs != iovstack)
+		sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
 	kfree(pages);
 	if (ret)
 		rds_rdma_free_op(op);