Diffstat (limited to 'net/rds/rdma.c')
-rw-r--r--	net/rds/rdma.c	421
1 file changed, 288 insertions(+), 133 deletions(-)
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 75fd13bb631b..8920f2a83327 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -35,7 +35,7 @@
 #include <linux/rbtree.h>
 #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
 
-#include "rdma.h"
+#include "rds.h"
 
 /*
  * XXX
@@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
 {
	struct rds_mr *mr;
	struct rb_node *node;
+	unsigned long flags;
 
	/* Release any MRs associated with this socket */
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
	while ((node = rb_first(&rs->rs_rdma_keys))) {
		mr = container_of(node, struct rds_mr, r_rb_node);
		if (mr->r_trans == rs->rs_transport)
			mr->r_invalidate = 0;
+		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+		RB_CLEAR_NODE(&mr->r_rb_node);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		rds_destroy_mr(mr);
		rds_mr_put(mr);
+		spin_lock_irqsave(&rs->rs_rdma_lock, flags);
	}
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 
	if (rs->rs_transport && rs->rs_transport->flush_mrs)
		rs->rs_transport->flush_mrs();
@@ -181,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
		goto out;
	}
 
-	if (rs->rs_transport->get_mr == NULL) {
+	if (!rs->rs_transport->get_mr) {
		ret = -EOPNOTSUPP;
		goto out;
	}
@@ -197,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 
	/* XXX clamp nr_pages to limit the size of this alloc? */
	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
+	if (!pages) {
		ret = -ENOMEM;
		goto out;
	}
 
	mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
-	if (mr == NULL) {
+	if (!mr) {
		ret = -ENOMEM;
		goto out;
	}
@@ -230,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
	 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
	 * the zero page.
	 */
-	ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
+	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
	if (ret < 0)
		goto out;
 
	nents = ret;
	sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
-	if (sg == NULL) {
+	if (!sg) {
		ret = -ENOMEM;
		goto out;
	}
@@ -406,68 +414,153 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
 
	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
-	if (mr && (mr->r_use_once || force)) {
+	if (!mr) {
+		printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		return;
+	}
+
+	if (mr->r_use_once || force) {
		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
		RB_CLEAR_NODE(&mr->r_rb_node);
		zot_me = 1;
-	} else if (mr)
-		atomic_inc(&mr->r_refcount);
+	}
	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 
	/* May have to issue a dma_sync on this memory region.
	 * Note we could avoid this if the operation was a RDMA READ,
	 * but at this point we can't tell. */
-	if (mr != NULL) {
-		if (mr->r_trans->sync_mr)
-			mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
-
-		/* If the MR was marked as invalidate, this will
-		 * trigger an async flush. */
-		if (zot_me)
-			rds_destroy_mr(mr);
-		rds_mr_put(mr);
-	}
+	if (mr->r_trans->sync_mr)
+		mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+	/* If the MR was marked as invalidate, this will
+	 * trigger an async flush. */
+	if (zot_me)
+		rds_destroy_mr(mr);
+	rds_mr_put(mr);
 }
 
-void rds_rdma_free_op(struct rds_rdma_op *ro)
+void rds_rdma_free_op(struct rm_rdma_op *ro)
 {
	unsigned int i;
 
-	for (i = 0; i < ro->r_nents; i++) {
-		struct page *page = sg_page(&ro->r_sg[i]);
+	for (i = 0; i < ro->op_nents; i++) {
+		struct page *page = sg_page(&ro->op_sg[i]);
 
		/* Mark page dirty if it was possibly modified, which
		 * is the case for a RDMA_READ which copies from remote
		 * to local memory */
-		if (!ro->r_write) {
-			BUG_ON(in_interrupt());
+		if (!ro->op_write) {
+			BUG_ON(irqs_disabled());
			set_page_dirty(page);
		}
		put_page(page);
	}
 
-	kfree(ro->r_notifier);
-	kfree(ro);
+	kfree(ro->op_notifier);
+	ro->op_notifier = NULL;
+	ro->op_active = 0;
+}
+
+void rds_atomic_free_op(struct rm_atomic_op *ao)
+{
+	struct page *page = sg_page(ao->op_sg);
+
+	/* Mark page dirty if it was possibly modified, which
+	 * is the case for a RDMA_READ which copies from remote
+	 * to local memory */
+	set_page_dirty(page);
+	put_page(page);
+
+	kfree(ao->op_notifier);
+	ao->op_notifier = NULL;
+	ao->op_active = 0;
 }
 
+
 /*
- * args is a pointer to an in-kernel copy in the sendmsg cmsg.
+ * Count the number of pages needed to describe an incoming iovec array.
  */
-static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
-					    struct rds_rdma_args *args)
+static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
+{
+	int tot_pages = 0;
+	unsigned int nr_pages;
+	unsigned int i;
+
+	/* figure out the number of pages in the vector */
+	for (i = 0; i < nr_iovecs; i++) {
+		nr_pages = rds_pages_in_vec(&iov[i]);
+		if (nr_pages == 0)
+			return -EINVAL;
+
+		tot_pages += nr_pages;
+
+		/*
+		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
+		 * so tot_pages cannot overflow without first going negative.
+		 */
+		if (tot_pages < 0)
+			return -EINVAL;
+	}
+
+	return tot_pages;
+}
+
+int rds_rdma_extra_size(struct rds_rdma_args *args)
 {
	struct rds_iovec vec;
-	struct rds_rdma_op *op = NULL;
+	struct rds_iovec __user *local_vec;
+	int tot_pages = 0;
	unsigned int nr_pages;
-	unsigned int max_pages;
+	unsigned int i;
+
+	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+	/* figure out the number of pages in the vector */
+	for (i = 0; i < args->nr_local; i++) {
+		if (copy_from_user(&vec, &local_vec[i],
+				   sizeof(struct rds_iovec)))
+			return -EFAULT;
+
+		nr_pages = rds_pages_in_vec(&vec);
+		if (nr_pages == 0)
+			return -EINVAL;
+
+		tot_pages += nr_pages;
+
+		/*
+		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
+		 * so tot_pages cannot overflow without first going negative.
+		 */
+		if (tot_pages < 0)
+			return -EINVAL;
+	}
+
+	return tot_pages * sizeof(struct scatterlist);
+}
+
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ */
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+		       struct cmsghdr *cmsg)
+{
+	struct rds_rdma_args *args;
+	struct rm_rdma_op *op = &rm->rdma;
+	int nr_pages;
	unsigned int nr_bytes;
	struct page **pages = NULL;
-	struct rds_iovec __user *local_vec;
-	struct scatterlist *sg;
-	unsigned int nr;
+	struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
+	int iov_size;
	unsigned int i, j;
-	int ret;
+	int ret = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+	    || rm->rdma.op_active)
+		return -EINVAL;
 
+	args = CMSG_DATA(cmsg);
 
	if (rs->rs_bound_addr == 0) {
		ret = -ENOTCONN; /* XXX not a great errno */
@@ -479,61 +572,59 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
		goto out;
	}
 
-	nr_pages = 0;
-	max_pages = 0;
-
-	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
-
-	/* figure out the number of pages in the vector */
-	for (i = 0; i < args->nr_local; i++) {
-		if (copy_from_user(&vec, &local_vec[i],
-				   sizeof(struct rds_iovec))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		nr = rds_pages_in_vec(&vec);
-		if (nr == 0) {
-			ret = -EINVAL;
+	/* Check whether to allocate the iovec area */
+	iov_size = args->nr_local * sizeof(struct rds_iovec);
+	if (args->nr_local > UIO_FASTIOV) {
+		iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
+		if (!iovs) {
+			ret = -ENOMEM;
			goto out;
		}
+	}
 
-		max_pages = max(nr, max_pages);
-		nr_pages += nr;
+	if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
+		ret = -EFAULT;
+		goto out;
	}
 
-	pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
-		ret = -ENOMEM;
+	nr_pages = rds_rdma_pages(iovs, args->nr_local);
+	if (nr_pages < 0) {
+		ret = -EINVAL;
		goto out;
	}
 
-	op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
-	if (op == NULL) {
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
		ret = -ENOMEM;
		goto out;
	}
 
-	op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
-	op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
-	op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
-	op->r_recverr = rs->rs_recverr;
+	op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
+	op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
+	op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
+	op->op_active = 1;
+	op->op_recverr = rs->rs_recverr;
	WARN_ON(!nr_pages);
-	sg_init_table(op->r_sg, nr_pages);
+	op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
+	if (!op->op_sg) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
-	if (op->r_notify || op->r_recverr) {
+	if (op->op_notify || op->op_recverr) {
		/* We allocate an uninitialized notifier here, because
		 * we don't want to do that in the completion handler. We
		 * would have to use GFP_ATOMIC there, and don't want to deal
		 * with failed allocations.
		 */
-		op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
-		if (!op->r_notifier) {
+		op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+		if (!op->op_notifier) {
			ret = -ENOMEM;
			goto out;
		}
-		op->r_notifier->n_user_token = args->user_token;
-		op->r_notifier->n_status = RDS_RDMA_SUCCESS;
+		op->op_notifier->n_user_token = args->user_token;
+		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
	}
 
	/* The cookie contains the R_Key of the remote memory region, and
@@ -543,68 +634,55 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
	 * destination address (which is really an offset into the MR)
	 * FIXME: We may want to move this into ib_rdma.c
	 */
-	op->r_key = rds_rdma_cookie_key(args->cookie);
-	op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+	op->op_rkey = rds_rdma_cookie_key(args->cookie);
+	op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
 
	nr_bytes = 0;
 
	rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
	       (unsigned long long)args->nr_local,
	       (unsigned long long)args->remote_vec.addr,
-	       op->r_key);
+	       op->op_rkey);
 
	for (i = 0; i < args->nr_local; i++) {
-		if (copy_from_user(&vec, &local_vec[i],
-				   sizeof(struct rds_iovec))) {
-			ret = -EFAULT;
-			goto out;
-		}
+		struct rds_iovec *iov = &iovs[i];
+		/* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
+		unsigned int nr = rds_pages_in_vec(iov);
 
-		nr = rds_pages_in_vec(&vec);
-		if (nr == 0) {
-			ret = -EINVAL;
-			goto out;
-		}
+		rs->rs_user_addr = iov->addr;
+		rs->rs_user_bytes = iov->bytes;
 
-		rs->rs_user_addr = vec.addr;
-		rs->rs_user_bytes = vec.bytes;
-
-		/* did the user change the vec under us? */
-		if (nr > max_pages || op->r_nents + nr > nr_pages) {
-			ret = -EINVAL;
-			goto out;
-		}
		/* If it's a WRITE operation, we want to pin the pages for reading.
		 * If it's a READ operation, we need to pin the pages for writing.
		 */
-		ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
+		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
		if (ret < 0)
			goto out;
 
-		rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n",
-			nr_bytes, nr, vec.bytes, vec.addr);
+		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
+			nr_bytes, nr, iov->bytes, iov->addr);
 
-		nr_bytes += vec.bytes;
+		nr_bytes += iov->bytes;
 
		for (j = 0; j < nr; j++) {
-			unsigned int offset = vec.addr & ~PAGE_MASK;
+			unsigned int offset = iov->addr & ~PAGE_MASK;
+			struct scatterlist *sg;
 
-			sg = &op->r_sg[op->r_nents + j];
+			sg = &op->op_sg[op->op_nents + j];
			sg_set_page(sg, pages[j],
-					min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
+					min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
					offset);
 
-			rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n",
-				sg->offset, sg->length, vec.addr, vec.bytes);
+			rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
+				sg->offset, sg->length, iov->addr, iov->bytes);
 
-			vec.addr += sg->length;
-			vec.bytes -= sg->length;
+			iov->addr += sg->length;
+			iov->bytes -= sg->length;
		}
 
-		op->r_nents += nr;
+		op->op_nents += nr;
	}
 
-
	if (nr_bytes > args->remote_vec.bytes) {
		rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
			nr_bytes,
@@ -612,38 +690,18 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
		ret = -EINVAL;
		goto out;
	}
-	op->r_bytes = nr_bytes;
+	op->op_bytes = nr_bytes;
 
-	ret = 0;
 out:
+	if (iovs != iovstack)
+		sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
	kfree(pages);
-	if (ret) {
-		if (op)
-			rds_rdma_free_op(op);
-		op = ERR_PTR(ret);
-	}
-	return op;
-}
-
-/*
- * The application asks for a RDMA transfer.
- * Extract all arguments and set up the rdma_op
- */
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-		       struct cmsghdr *cmsg)
-{
-	struct rds_rdma_op *op;
-
-	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
-	    rm->m_rdma_op != NULL)
-		return -EINVAL;
+	if (ret)
+		rds_rdma_free_op(op);
+	else
+		rds_stats_inc(s_send_rdma);
 
-	op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
-	if (IS_ERR(op))
-		return PTR_ERR(op);
-	rds_stats_inc(s_send_rdma);
-	rm->m_rdma_op = op;
-	return 0;
+	return ret;
 }
 
 /*
@@ -673,7 +731,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 
	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
-	if (mr == NULL)
+	if (!mr)
		err = -EINVAL;	/* invalid r_key */
	else
		atomic_inc(&mr->r_refcount);
@@ -681,7 +739,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 
	if (mr) {
		mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
-		rm->m_rdma_mr = mr;
+		rm->rdma.op_rdma_mr = mr;
	}
	return err;
 }
@@ -699,5 +757,102 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
	    rm->m_rdma_cookie != 0)
		return -EINVAL;
 
-	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
+	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
+}
+
+/*
+ * Fill in rds_message for an atomic request.
+ */
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+		    struct cmsghdr *cmsg)
+{
+	struct page *page = NULL;
+	struct rds_atomic_args *args;
+	int ret = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
+	    || rm->atomic.op_active)
+		return -EINVAL;
+
+	args = CMSG_DATA(cmsg);
+
+	/* Nonmasked & masked cmsg ops converted to masked hw ops */
+	switch (cmsg->cmsg_type) {
+	case RDS_CMSG_ATOMIC_FADD:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_m_fadd.add = args->fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = 0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_FADD:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_m_fadd.add = args->m_fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
+		break;
+	case RDS_CMSG_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = ~0;
+		rm->atomic.op_m_cswp.swap_mask = ~0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
+		rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
+		break;
+	default:
+		BUG(); /* should never happen */
+	}
+
+	rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
+	rm->atomic.op_active = 1;
+	rm->atomic.op_recverr = rs->rs_recverr;
+	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
+	if (!rm->atomic.op_sg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/* verify 8 byte-aligned */
+	if (args->local_addr & 0x7) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	ret = rds_pin_pages(args->local_addr, 1, &page, 1);
+	if (ret != 1)
+		goto err;
+	ret = 0;
+
+	sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
+
+	if (rm->atomic.op_notify || rm->atomic.op_recverr) {
+		/* We allocate an uninitialized notifier here, because
+		 * we don't want to do that in the completion handler. We
+		 * would have to use GFP_ATOMIC there, and don't want to deal
+		 * with failed allocations.
+		 */
+		rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
+		if (!rm->atomic.op_notifier) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		rm->atomic.op_notifier->n_user_token = args->user_token;
+		rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
+	}
+
+	rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
+	rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
+
+	return ret;
+err:
+	if (page)
+		put_page(page);
+	kfree(rm->atomic.op_notifier);
+
+	return ret;
 }