Diffstat (limited to 'net/rds/rdma.c')
-rw-r--r--  net/rds/rdma.c  421
1 files changed, 288 insertions, 133 deletions
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 75fd13bb631b..8920f2a83327 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -35,7 +35,7 @@
 #include <linux/rbtree.h>
 #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
 
-#include "rdma.h"
+#include "rds.h"
 
 /*
  * XXX
@@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
 {
 	struct rds_mr *mr;
 	struct rb_node *node;
+	unsigned long flags;
 
 	/* Release any MRs associated with this socket */
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	while ((node = rb_first(&rs->rs_rdma_keys))) {
 		mr = container_of(node, struct rds_mr, r_rb_node);
 		if (mr->r_trans == rs->rs_transport)
 			mr->r_invalidate = 0;
+		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+		RB_CLEAR_NODE(&mr->r_rb_node);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		rds_destroy_mr(mr);
 		rds_mr_put(mr);
+		spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	}
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 
 	if (rs->rs_transport && rs->rs_transport->flush_mrs)
 		rs->rs_transport->flush_mrs();
@@ -181,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 		goto out;
 	}
 
-	if (rs->rs_transport->get_mr == NULL) {
+	if (!rs->rs_transport->get_mr) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
@@ -197,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 
 	/* XXX clamp nr_pages to limit the size of this alloc? */
 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
+	if (!pages) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
 	mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
-	if (mr == NULL) {
+	if (!mr) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -230,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 	 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
 	 * the zero page.
 	 */
-	ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
+	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
 	if (ret < 0)
 		goto out;
 
 	nents = ret;
 	sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
-	if (sg == NULL) {
+	if (!sg) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -406,68 +414,153 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
 
 	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
-	if (mr && (mr->r_use_once || force)) {
+	if (!mr) {
+		printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		return;
+	}
+
+	if (mr->r_use_once || force) {
 		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
 		RB_CLEAR_NODE(&mr->r_rb_node);
 		zot_me = 1;
-	} else if (mr)
-		atomic_inc(&mr->r_refcount);
+	}
 	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 
 	/* May have to issue a dma_sync on this memory region.
 	 * Note we could avoid this if the operation was a RDMA READ,
 	 * but at this point we can't tell. */
-	if (mr != NULL) {
-		if (mr->r_trans->sync_mr)
-			mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
-
-		/* If the MR was marked as invalidate, this will
-		 * trigger an async flush. */
-		if (zot_me)
-			rds_destroy_mr(mr);
-		rds_mr_put(mr);
-	}
+	if (mr->r_trans->sync_mr)
+		mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+	/* If the MR was marked as invalidate, this will
+	 * trigger an async flush. */
+	if (zot_me)
+		rds_destroy_mr(mr);
+	rds_mr_put(mr);
 }
 
-void rds_rdma_free_op(struct rds_rdma_op *ro)
+void rds_rdma_free_op(struct rm_rdma_op *ro)
 {
 	unsigned int i;
 
-	for (i = 0; i < ro->r_nents; i++) {
-		struct page *page = sg_page(&ro->r_sg[i]);
+	for (i = 0; i < ro->op_nents; i++) {
+		struct page *page = sg_page(&ro->op_sg[i]);
 
 		/* Mark page dirty if it was possibly modified, which
 		 * is the case for a RDMA_READ which copies from remote
 		 * to local memory */
-		if (!ro->r_write) {
-			BUG_ON(in_interrupt());
+		if (!ro->op_write) {
+			BUG_ON(irqs_disabled());
 			set_page_dirty(page);
 		}
 		put_page(page);
 	}
 
-	kfree(ro->r_notifier);
-	kfree(ro);
+	kfree(ro->op_notifier);
+	ro->op_notifier = NULL;
+	ro->op_active = 0;
+}
+
+void rds_atomic_free_op(struct rm_atomic_op *ao)
+{
+	struct page *page = sg_page(ao->op_sg);
+
+	/* Mark page dirty if it was possibly modified, which
+	 * is the case for a RDMA_READ which copies from remote
+	 * to local memory */
+	set_page_dirty(page);
+	put_page(page);
+
+	kfree(ao->op_notifier);
+	ao->op_notifier = NULL;
+	ao->op_active = 0;
 }
 
+
 /*
- * args is a pointer to an in-kernel copy in the sendmsg cmsg.
+ * Count the number of pages needed to describe an incoming iovec array.
  */
-static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
-					    struct rds_rdma_args *args)
+static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
+{
+	int tot_pages = 0;
+	unsigned int nr_pages;
+	unsigned int i;
+
+	/* figure out the number of pages in the vector */
+	for (i = 0; i < nr_iovecs; i++) {
+		nr_pages = rds_pages_in_vec(&iov[i]);
+		if (nr_pages == 0)
+			return -EINVAL;
+
+		tot_pages += nr_pages;
+
+		/*
+		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
+		 * so tot_pages cannot overflow without first going negative.
+		 */
+		if (tot_pages < 0)
+			return -EINVAL;
+	}
+
+	return tot_pages;
+}
+
+int rds_rdma_extra_size(struct rds_rdma_args *args)
 {
 	struct rds_iovec vec;
-	struct rds_rdma_op *op = NULL;
+	struct rds_iovec __user *local_vec;
+	int tot_pages = 0;
 	unsigned int nr_pages;
-	unsigned int max_pages;
+	unsigned int i;
+
+	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+	/* figure out the number of pages in the vector */
+	for (i = 0; i < args->nr_local; i++) {
+		if (copy_from_user(&vec, &local_vec[i],
+				   sizeof(struct rds_iovec)))
+			return -EFAULT;
+
+		nr_pages = rds_pages_in_vec(&vec);
+		if (nr_pages == 0)
+			return -EINVAL;
+
+		tot_pages += nr_pages;
+
+		/*
+		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
+		 * so tot_pages cannot overflow without first going negative.
+		 */
+		if (tot_pages < 0)
+			return -EINVAL;
+	}
+
+	return tot_pages * sizeof(struct scatterlist);
+}
+
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ */
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+		       struct cmsghdr *cmsg)
+{
+	struct rds_rdma_args *args;
+	struct rm_rdma_op *op = &rm->rdma;
+	int nr_pages;
 	unsigned int nr_bytes;
 	struct page **pages = NULL;
-	struct rds_iovec __user *local_vec;
-	struct scatterlist *sg;
-	unsigned int nr;
+	struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
+	int iov_size;
 	unsigned int i, j;
-	int ret;
+	int ret = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+	    || rm->rdma.op_active)
+		return -EINVAL;
 
+	args = CMSG_DATA(cmsg);
 
 	if (rs->rs_bound_addr == 0) {
 		ret = -ENOTCONN; /* XXX not a great errno */
@@ -479,61 +572,59 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 		goto out;
 	}
 
-	nr_pages = 0;
-	max_pages = 0;
-
-	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
-
-	/* figure out the number of pages in the vector */
-	for (i = 0; i < args->nr_local; i++) {
-		if (copy_from_user(&vec, &local_vec[i],
-				sizeof(struct rds_iovec))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		nr = rds_pages_in_vec(&vec);
-		if (nr == 0) {
-			ret = -EINVAL;
+	/* Check whether to allocate the iovec area */
+	iov_size = args->nr_local * sizeof(struct rds_iovec);
+	if (args->nr_local > UIO_FASTIOV) {
+		iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
+		if (!iovs) {
+			ret = -ENOMEM;
 			goto out;
 		}
+	}
 
-		max_pages = max(nr, max_pages);
-		nr_pages += nr;
+	if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
+		ret = -EFAULT;
+		goto out;
 	}
 
-	pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
-		ret = -ENOMEM;
+	nr_pages = rds_rdma_pages(iovs, args->nr_local);
+	if (nr_pages < 0) {
+		ret = -EINVAL;
 		goto out;
 	}
 
-	op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
-	if (op == NULL) {
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
-	op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
-	op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
-	op->r_recverr = rs->rs_recverr;
+	op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
+	op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
+	op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
+	op->op_active = 1;
+	op->op_recverr = rs->rs_recverr;
 	WARN_ON(!nr_pages);
-	sg_init_table(op->r_sg, nr_pages);
+	op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
+	if (!op->op_sg) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
-	if (op->r_notify || op->r_recverr) {
+	if (op->op_notify || op->op_recverr) {
 		/* We allocate an uninitialized notifier here, because
 		 * we don't want to do that in the completion handler. We
 		 * would have to use GFP_ATOMIC there, and don't want to deal
 		 * with failed allocations.
 		 */
-		op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
-		if (!op->r_notifier) {
+		op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+		if (!op->op_notifier) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		op->r_notifier->n_user_token = args->user_token;
-		op->r_notifier->n_status = RDS_RDMA_SUCCESS;
+		op->op_notifier->n_user_token = args->user_token;
+		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
 	}
 
 	/* The cookie contains the R_Key of the remote memory region, and
@@ -543,68 +634,55 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 	 * destination address (which is really an offset into the MR)
 	 * FIXME: We may want to move this into ib_rdma.c
 	 */
-	op->r_key = rds_rdma_cookie_key(args->cookie);
-	op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+	op->op_rkey = rds_rdma_cookie_key(args->cookie);
+	op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
 
 	nr_bytes = 0;
 
 	rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
 		 (unsigned long long)args->nr_local,
 		 (unsigned long long)args->remote_vec.addr,
-		 op->r_key);
+		 op->op_rkey);
 
 	for (i = 0; i < args->nr_local; i++) {
-		if (copy_from_user(&vec, &local_vec[i],
-				sizeof(struct rds_iovec))) {
-			ret = -EFAULT;
-			goto out;
-		}
+		struct rds_iovec *iov = &iovs[i];
+		/* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
+		unsigned int nr = rds_pages_in_vec(iov);
 
-		nr = rds_pages_in_vec(&vec);
-		if (nr == 0) {
-			ret = -EINVAL;
-			goto out;
-		}
+		rs->rs_user_addr = iov->addr;
+		rs->rs_user_bytes = iov->bytes;
 
-		rs->rs_user_addr = vec.addr;
-		rs->rs_user_bytes = vec.bytes;
-
-		/* did the user change the vec under us? */
-		if (nr > max_pages || op->r_nents + nr > nr_pages) {
-			ret = -EINVAL;
-			goto out;
-		}
 		/* If it's a WRITE operation, we want to pin the pages for reading.
 		 * If it's a READ operation, we need to pin the pages for writing.
 		 */
-		ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
+		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
 		if (ret < 0)
 			goto out;
 
-		rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n",
-			nr_bytes, nr, vec.bytes, vec.addr);
+		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
+			nr_bytes, nr, iov->bytes, iov->addr);
 
-		nr_bytes += vec.bytes;
+		nr_bytes += iov->bytes;
 
 		for (j = 0; j < nr; j++) {
-			unsigned int offset = vec.addr & ~PAGE_MASK;
+			unsigned int offset = iov->addr & ~PAGE_MASK;
+			struct scatterlist *sg;
 
-			sg = &op->r_sg[op->r_nents + j];
+			sg = &op->op_sg[op->op_nents + j];
 			sg_set_page(sg, pages[j],
-					min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
+					min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
 					offset);
 
-			rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n",
-				sg->offset, sg->length, vec.addr, vec.bytes);
+			rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
+				sg->offset, sg->length, iov->addr, iov->bytes);
 
-			vec.addr += sg->length;
-			vec.bytes -= sg->length;
+			iov->addr += sg->length;
+			iov->bytes -= sg->length;
 		}
 
-		op->r_nents += nr;
+		op->op_nents += nr;
 	}
 
-
 	if (nr_bytes > args->remote_vec.bytes) {
 		rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
 			nr_bytes,
@@ -612,38 +690,18 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 		ret = -EINVAL;
 		goto out;
 	}
-	op->r_bytes = nr_bytes;
+	op->op_bytes = nr_bytes;
 
-	ret = 0;
 out:
+	if (iovs != iovstack)
+		sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
 	kfree(pages);
-	if (ret) {
-		if (op)
-			rds_rdma_free_op(op);
-		op = ERR_PTR(ret);
-	}
-	return op;
-}
-
-/*
- * The application asks for a RDMA transfer.
- * Extract all arguments and set up the rdma_op
- */
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-			  struct cmsghdr *cmsg)
-{
-	struct rds_rdma_op *op;
-
-	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
-	    rm->m_rdma_op != NULL)
-		return -EINVAL;
+	if (ret)
+		rds_rdma_free_op(op);
+	else
+		rds_stats_inc(s_send_rdma);
 
-	op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
-	if (IS_ERR(op))
-		return PTR_ERR(op);
-	rds_stats_inc(s_send_rdma);
-	rm->m_rdma_op = op;
-	return 0;
+	return ret;
 }
 
 /*
@@ -673,7 +731,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 
 	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
-	if (mr == NULL)
+	if (!mr)
 		err = -EINVAL;	/* invalid r_key */
 	else
 		atomic_inc(&mr->r_refcount);
@@ -681,7 +739,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 
 	if (mr) {
 		mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
-		rm->m_rdma_mr = mr;
+		rm->rdma.op_rdma_mr = mr;
 	}
 	return err;
 }
@@ -699,5 +757,102 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
 	    rm->m_rdma_cookie != 0)
 		return -EINVAL;
 
-	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
+	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
+}
+
+/*
+ * Fill in rds_message for an atomic request.
+ */
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+		    struct cmsghdr *cmsg)
+{
+	struct page *page = NULL;
+	struct rds_atomic_args *args;
+	int ret = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
+	    || rm->atomic.op_active)
+		return -EINVAL;
+
+	args = CMSG_DATA(cmsg);
+
+	/* Nonmasked & masked cmsg ops converted to masked hw ops */
+	switch (cmsg->cmsg_type) {
+	case RDS_CMSG_ATOMIC_FADD:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_m_fadd.add = args->fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = 0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_FADD:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_m_fadd.add = args->m_fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
+		break;
+	case RDS_CMSG_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = ~0;
+		rm->atomic.op_m_cswp.swap_mask = ~0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
+		rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
+		break;
+	default:
+		BUG(); /* should never happen */
+	}
+
+	rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
+	rm->atomic.op_active = 1;
+	rm->atomic.op_recverr = rs->rs_recverr;
+	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
+	if (!rm->atomic.op_sg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/* verify 8 byte-aligned */
+	if (args->local_addr & 0x7) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	ret = rds_pin_pages(args->local_addr, 1, &page, 1);
+	if (ret != 1)
+		goto err;
+	ret = 0;
+
+	sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
+
+	if (rm->atomic.op_notify || rm->atomic.op_recverr) {
+		/* We allocate an uninitialized notifier here, because
+		 * we don't want to do that in the completion handler. We
+		 * would have to use GFP_ATOMIC there, and don't want to deal
+		 * with failed allocations.
+		 */
+		rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
+		if (!rm->atomic.op_notifier) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		rm->atomic.op_notifier->n_user_token = args->user_token;
+		rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
+	}
+
+	rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
+	rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
+
+	return ret;
+err:
+	if (page)
+		put_page(page);
+	kfree(rm->atomic.op_notifier);
+
+	return ret;
 }