author     Ingo Molnar <mingo@elte.hu>  2009-03-30 17:53:32 -0400
committer  Ingo Molnar <mingo@elte.hu>  2009-03-30 17:53:32 -0400
commit     65fb0d23fcddd8697c871047b700c78817bdaa43 (patch)
tree       119e6e5f276622c4c862f6c9b6d795264ba1603a /net/rds
parent     8c083f081d0014057901c68a0a3e0f8ca7ac8d23 (diff)
parent     dfbbe89e197a77f2c8046a51c74e33e35f878080 (diff)
Merge branch 'linus' into cpumask-for-linus
Conflicts: arch/x86/kernel/cpu/common.c
Diffstat (limited to 'net/rds')
-rw-r--r-- | net/rds/Kconfig | 14
-rw-r--r-- | net/rds/Makefile | 14
-rw-r--r-- | net/rds/af_rds.c | 586
-rw-r--r-- | net/rds/bind.c | 199
-rw-r--r-- | net/rds/cong.c | 404
-rw-r--r-- | net/rds/connection.c | 487
-rw-r--r-- | net/rds/ib.c | 323
-rw-r--r-- | net/rds/ib.h | 367
-rw-r--r-- | net/rds/ib_cm.c | 726
-rw-r--r-- | net/rds/ib_rdma.c | 641
-rw-r--r-- | net/rds/ib_recv.c | 869
-rw-r--r-- | net/rds/ib_ring.c | 168
-rw-r--r-- | net/rds/ib_send.c | 874
-rw-r--r-- | net/rds/ib_stats.c | 95
-rw-r--r-- | net/rds/ib_sysctl.c | 137
-rw-r--r-- | net/rds/info.c | 241
-rw-r--r-- | net/rds/info.h | 30
-rw-r--r-- | net/rds/iw.c | 333
-rw-r--r-- | net/rds/iw.h | 395
-rw-r--r-- | net/rds/iw_cm.c | 750
-rw-r--r-- | net/rds/iw_rdma.c | 888
-rw-r--r-- | net/rds/iw_recv.c | 869
-rw-r--r-- | net/rds/iw_ring.c | 169
-rw-r--r-- | net/rds/iw_send.c | 975
-rw-r--r-- | net/rds/iw_stats.c | 95
-rw-r--r-- | net/rds/iw_sysctl.c | 137
-rw-r--r-- | net/rds/loop.c | 188
-rw-r--r-- | net/rds/loop.h | 9
-rw-r--r-- | net/rds/message.c | 402
-rw-r--r-- | net/rds/page.c | 221
-rw-r--r-- | net/rds/rdma.c | 679
-rw-r--r-- | net/rds/rdma.h | 84
-rw-r--r-- | net/rds/rdma_transport.c | 214
-rw-r--r-- | net/rds/rdma_transport.h | 28
-rw-r--r-- | net/rds/rds.h | 686
-rw-r--r-- | net/rds/recv.c | 542
-rw-r--r-- | net/rds/send.c | 1003
-rw-r--r-- | net/rds/stats.c | 148
-rw-r--r-- | net/rds/sysctl.c | 122
-rw-r--r-- | net/rds/threads.c | 265
-rw-r--r-- | net/rds/transport.c | 117
41 files changed, 15494 insertions, 0 deletions
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
new file mode 100644
index 000000000000..796773b5df9b
--- /dev/null
+++ b/net/rds/Kconfig
@@ -0,0 +1,14 @@
1
2config RDS
3 tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)"
4 depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL
5 depends on INFINIBAND && INFINIBAND_ADDR_TRANS
6 ---help---
7 RDS provides reliable, sequenced delivery of datagrams
8 over Infiniband.
9
10config RDS_DEBUG
11 bool "Debugging messages"
12 depends on RDS
13 default n
14
diff --git a/net/rds/Makefile b/net/rds/Makefile
new file mode 100644
index 000000000000..51f27585fa08
--- /dev/null
+++ b/net/rds/Makefile
@@ -0,0 +1,14 @@
1obj-$(CONFIG_RDS) += rds.o
2rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
3 recv.o send.o stats.o sysctl.o threads.o transport.o \
4 loop.o page.o rdma.o \
5 rdma_transport.o \
6 ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
7 ib_sysctl.o ib_rdma.o \
8 iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
9 iw_sysctl.o iw_rdma.o
10
11ifeq ($(CONFIG_RDS_DEBUG), y)
12EXTRA_CFLAGS += -DDEBUG
13endif
14
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
new file mode 100644
index 000000000000..20cf16fc572f
--- /dev/null
+++ b/net/rds/af_rds.c
@@ -0,0 +1,586 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/module.h>
34#include <linux/errno.h>
35#include <linux/kernel.h>
36#include <linux/in.h>
37#include <linux/poll.h>
38#include <linux/version.h>
39#include <net/sock.h>
40
41#include "rds.h"
42#include "rdma.h"
43#include "rdma_transport.h"
44
45/* this is just used for stats gathering :/ */
46static DEFINE_SPINLOCK(rds_sock_lock);
47static unsigned long rds_sock_count;
48static LIST_HEAD(rds_sock_list);
49DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
50
51/*
52 * This is called as the final descriptor referencing this socket is closed.
53 * We have to unbind the socket so that another socket can be bound to the
54 * address it was using.
55 *
56 * We have to be careful about racing with the incoming path. sock_orphan()
57 * sets SOCK_DEAD and we use that as an indicator to the rx path that new
58 * messages shouldn't be queued.
59 */
60static int rds_release(struct socket *sock)
61{
62 struct sock *sk = sock->sk;
63 struct rds_sock *rs;
64 unsigned long flags;
65
66 if (sk == NULL)
67 goto out;
68
69 rs = rds_sk_to_rs(sk);
70
71 sock_orphan(sk);
72 /* Note - rds_clear_recv_queue grabs rs_recv_lock, so
73 * that ensures the recv path has completed messing
74 * with the socket. */
75 rds_clear_recv_queue(rs);
76 rds_cong_remove_socket(rs);
77 rds_remove_bound(rs);
78 rds_send_drop_to(rs, NULL);
79 rds_rdma_drop_keys(rs);
80 rds_notify_queue_get(rs, NULL);
81
82 spin_lock_irqsave(&rds_sock_lock, flags);
83 list_del_init(&rs->rs_item);
84 rds_sock_count--;
85 spin_unlock_irqrestore(&rds_sock_lock, flags);
86
87 sock->sk = NULL;
88 sock_put(sk);
89out:
90 return 0;
91}
92
93/*
94 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
95 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK
96 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
97 * this seems more conservative.
98 * NB - normally, one would use sk_callback_lock for this, but we can
99 * get here from interrupts, whereas the network code grabs sk_callback_lock
100 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
101 */
102void rds_wake_sk_sleep(struct rds_sock *rs)
103{
104 unsigned long flags;
105
106 read_lock_irqsave(&rs->rs_recv_lock, flags);
107 __rds_wake_sk_sleep(rds_rs_to_sk(rs));
108 read_unlock_irqrestore(&rs->rs_recv_lock, flags);
109}
110
111static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
112 int *uaddr_len, int peer)
113{
114 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
115 struct rds_sock *rs = rds_sk_to_rs(sock->sk);
116
117 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
118
119 /* racey, don't care */
120 if (peer) {
121 if (!rs->rs_conn_addr)
122 return -ENOTCONN;
123
124 sin->sin_port = rs->rs_conn_port;
125 sin->sin_addr.s_addr = rs->rs_conn_addr;
126 } else {
127 sin->sin_port = rs->rs_bound_port;
128 sin->sin_addr.s_addr = rs->rs_bound_addr;
129 }
130
131 sin->sin_family = AF_INET;
132
133 *uaddr_len = sizeof(*sin);
134 return 0;
135}
136
137/*
138 * RDS' poll is without a doubt the least intuitive part of the interface,
139 * as POLLIN and POLLOUT do not behave entirely as you would expect from
140 * a network protocol.
141 *
142 * POLLIN is asserted if
143 * - there is data on the receive queue.
144 * - to signal that a previously congested destination may have become
145 * uncongested
146 * - A notification has been queued to the socket (this can be a congestion
147 * update, or a RDMA completion).
148 *
149 * POLLOUT is asserted if there is room on the send queue. This does not mean
150 * however, that the next sendmsg() call will succeed. If the application tries
151 * to send to a congested destination, the system call may still fail (and
152 * return ENOBUFS).
153 */
154static unsigned int rds_poll(struct file *file, struct socket *sock,
155 poll_table *wait)
156{
157 struct sock *sk = sock->sk;
158 struct rds_sock *rs = rds_sk_to_rs(sk);
159 unsigned int mask = 0;
160 unsigned long flags;
161
162 poll_wait(file, sk->sk_sleep, wait);
163
164 poll_wait(file, &rds_poll_waitq, wait);
165
166 read_lock_irqsave(&rs->rs_recv_lock, flags);
167 if (!rs->rs_cong_monitor) {
168 /* When a congestion map was updated, we signal POLLIN for
169 * "historical" reasons. Applications can also poll for
170 * WRBAND instead. */
171 if (rds_cong_updated_since(&rs->rs_cong_track))
172 mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
173 } else {
174 spin_lock(&rs->rs_lock);
175 if (rs->rs_cong_notify)
176 mask |= (POLLIN | POLLRDNORM);
177 spin_unlock(&rs->rs_lock);
178 }
179 if (!list_empty(&rs->rs_recv_queue)
180 || !list_empty(&rs->rs_notify_queue))
181 mask |= (POLLIN | POLLRDNORM);
182 if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
183 mask |= (POLLOUT | POLLWRNORM);
184 read_unlock_irqrestore(&rs->rs_recv_lock, flags);
185
186 return mask;
187}
188
189static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
190{
191 return -ENOIOCTLCMD;
192}
193
194static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
195 int len)
196{
197 struct sockaddr_in sin;
198 int ret = 0;
199
200 /* racing with another thread binding seems ok here */
201 if (rs->rs_bound_addr == 0) {
202 ret = -ENOTCONN; /* XXX not a great errno */
203 goto out;
204 }
205
206 if (len < sizeof(struct sockaddr_in)) {
207 ret = -EINVAL;
208 goto out;
209 }
210
211 if (copy_from_user(&sin, optval, sizeof(sin))) {
212 ret = -EFAULT;
213 goto out;
214 }
215
216 rds_send_drop_to(rs, &sin);
217out:
218 return ret;
219}
220
221static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
222 int optlen)
223{
224 int value;
225
226 if (optlen < sizeof(int))
227 return -EINVAL;
228 if (get_user(value, (int __user *) optval))
229 return -EFAULT;
230 *optvar = !!value;
231 return 0;
232}
233
234static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
235 int optlen)
236{
237 int ret;
238
239 ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
240 if (ret == 0) {
241 if (rs->rs_cong_monitor) {
242 rds_cong_add_socket(rs);
243 } else {
244 rds_cong_remove_socket(rs);
245 rs->rs_cong_mask = 0;
246 rs->rs_cong_notify = 0;
247 }
248 }
249 return ret;
250}
251
252static int rds_setsockopt(struct socket *sock, int level, int optname,
253 char __user *optval, int optlen)
254{
255 struct rds_sock *rs = rds_sk_to_rs(sock->sk);
256 int ret;
257
258 if (level != SOL_RDS) {
259 ret = -ENOPROTOOPT;
260 goto out;
261 }
262
263 switch (optname) {
264 case RDS_CANCEL_SENT_TO:
265 ret = rds_cancel_sent_to(rs, optval, optlen);
266 break;
267 case RDS_GET_MR:
268 ret = rds_get_mr(rs, optval, optlen);
269 break;
270 case RDS_FREE_MR:
271 ret = rds_free_mr(rs, optval, optlen);
272 break;
273 case RDS_RECVERR:
274 ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
275 break;
276 case RDS_CONG_MONITOR:
277 ret = rds_cong_monitor(rs, optval, optlen);
278 break;
279 default:
280 ret = -ENOPROTOOPT;
281 }
282out:
283 return ret;
284}
285
286static int rds_getsockopt(struct socket *sock, int level, int optname,
287 char __user *optval, int __user *optlen)
288{
289 struct rds_sock *rs = rds_sk_to_rs(sock->sk);
290 int ret = -ENOPROTOOPT, len;
291
292 if (level != SOL_RDS)
293 goto out;
294
295 if (get_user(len, optlen)) {
296 ret = -EFAULT;
297 goto out;
298 }
299
300 switch (optname) {
301 case RDS_INFO_FIRST ... RDS_INFO_LAST:
302 ret = rds_info_getsockopt(sock, optname, optval,
303 optlen);
304 break;
305
306 case RDS_RECVERR:
307 if (len < sizeof(int))
308 ret = -EINVAL;
309 else
310 if (put_user(rs->rs_recverr, (int __user *) optval)
311 || put_user(sizeof(int), optlen))
312 ret = -EFAULT;
313 else
314 ret = 0;
315 break;
316 default:
317 break;
318 }
319
320out:
321 return ret;
322
323}
324
325static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
326 int addr_len, int flags)
327{
328 struct sock *sk = sock->sk;
329 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
330 struct rds_sock *rs = rds_sk_to_rs(sk);
331 int ret = 0;
332
333 lock_sock(sk);
334
335 if (addr_len != sizeof(struct sockaddr_in)) {
336 ret = -EINVAL;
337 goto out;
338 }
339
340 if (sin->sin_family != AF_INET) {
341 ret = -EAFNOSUPPORT;
342 goto out;
343 }
344
345 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
346 ret = -EDESTADDRREQ;
347 goto out;
348 }
349
350 rs->rs_conn_addr = sin->sin_addr.s_addr;
351 rs->rs_conn_port = sin->sin_port;
352
353out:
354 release_sock(sk);
355 return ret;
356}
357
358static struct proto rds_proto = {
359 .name = "RDS",
360 .owner = THIS_MODULE,
361 .obj_size = sizeof(struct rds_sock),
362};
363
364static struct proto_ops rds_proto_ops = {
365 .family = AF_RDS,
366 .owner = THIS_MODULE,
367 .release = rds_release,
368 .bind = rds_bind,
369 .connect = rds_connect,
370 .socketpair = sock_no_socketpair,
371 .accept = sock_no_accept,
372 .getname = rds_getname,
373 .poll = rds_poll,
374 .ioctl = rds_ioctl,
375 .listen = sock_no_listen,
376 .shutdown = sock_no_shutdown,
377 .setsockopt = rds_setsockopt,
378 .getsockopt = rds_getsockopt,
379 .sendmsg = rds_sendmsg,
380 .recvmsg = rds_recvmsg,
381 .mmap = sock_no_mmap,
382 .sendpage = sock_no_sendpage,
383};
384
385static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
386{
387 unsigned long flags;
388 struct rds_sock *rs;
389
390 sock_init_data(sock, sk);
391 sock->ops = &rds_proto_ops;
392 sk->sk_protocol = protocol;
393
394 rs = rds_sk_to_rs(sk);
395 spin_lock_init(&rs->rs_lock);
396 rwlock_init(&rs->rs_recv_lock);
397 INIT_LIST_HEAD(&rs->rs_send_queue);
398 INIT_LIST_HEAD(&rs->rs_recv_queue);
399 INIT_LIST_HEAD(&rs->rs_notify_queue);
400 INIT_LIST_HEAD(&rs->rs_cong_list);
401 spin_lock_init(&rs->rs_rdma_lock);
402 rs->rs_rdma_keys = RB_ROOT;
403
404 spin_lock_irqsave(&rds_sock_lock, flags);
405 list_add_tail(&rs->rs_item, &rds_sock_list);
406 rds_sock_count++;
407 spin_unlock_irqrestore(&rds_sock_lock, flags);
408
409 return 0;
410}
411
412static int rds_create(struct net *net, struct socket *sock, int protocol)
413{
414 struct sock *sk;
415
416 if (sock->type != SOCK_SEQPACKET || protocol)
417 return -ESOCKTNOSUPPORT;
418
419 sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
420 if (!sk)
421 return -ENOMEM;
422
423 return __rds_create(sock, sk, protocol);
424}
425
426void rds_sock_addref(struct rds_sock *rs)
427{
428 sock_hold(rds_rs_to_sk(rs));
429}
430
431void rds_sock_put(struct rds_sock *rs)
432{
433 sock_put(rds_rs_to_sk(rs));
434}
435
436static struct net_proto_family rds_family_ops = {
437 .family = AF_RDS,
438 .create = rds_create,
439 .owner = THIS_MODULE,
440};
441
442static void rds_sock_inc_info(struct socket *sock, unsigned int len,
443 struct rds_info_iterator *iter,
444 struct rds_info_lengths *lens)
445{
446 struct rds_sock *rs;
447 struct sock *sk;
448 struct rds_incoming *inc;
449 unsigned long flags;
450 unsigned int total = 0;
451
452 len /= sizeof(struct rds_info_message);
453
454 spin_lock_irqsave(&rds_sock_lock, flags);
455
456 list_for_each_entry(rs, &rds_sock_list, rs_item) {
457 sk = rds_rs_to_sk(rs);
458 read_lock(&rs->rs_recv_lock);
459
460 /* XXX too lazy to maintain counts.. */
461 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
462 total++;
463 if (total <= len)
464 rds_inc_info_copy(inc, iter, inc->i_saddr,
465 rs->rs_bound_addr, 1);
466 }
467
468 read_unlock(&rs->rs_recv_lock);
469 }
470
471 spin_unlock_irqrestore(&rds_sock_lock, flags);
472
473 lens->nr = total;
474 lens->each = sizeof(struct rds_info_message);
475}
476
477static void rds_sock_info(struct socket *sock, unsigned int len,
478 struct rds_info_iterator *iter,
479 struct rds_info_lengths *lens)
480{
481 struct rds_info_socket sinfo;
482 struct rds_sock *rs;
483 unsigned long flags;
484
485 len /= sizeof(struct rds_info_socket);
486
487 spin_lock_irqsave(&rds_sock_lock, flags);
488
489 if (len < rds_sock_count)
490 goto out;
491
492 list_for_each_entry(rs, &rds_sock_list, rs_item) {
493 sinfo.sndbuf = rds_sk_sndbuf(rs);
494 sinfo.rcvbuf = rds_sk_rcvbuf(rs);
495 sinfo.bound_addr = rs->rs_bound_addr;
496 sinfo.connected_addr = rs->rs_conn_addr;
497 sinfo.bound_port = rs->rs_bound_port;
498 sinfo.connected_port = rs->rs_conn_port;
499 sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
500
501 rds_info_copy(iter, &sinfo, sizeof(sinfo));
502 }
503
504out:
505 lens->nr = rds_sock_count;
506 lens->each = sizeof(struct rds_info_socket);
507
508 spin_unlock_irqrestore(&rds_sock_lock, flags);
509}
510
511static void __exit rds_exit(void)
512{
513 rds_rdma_exit();
514 sock_unregister(rds_family_ops.family);
515 proto_unregister(&rds_proto);
516 rds_conn_exit();
517 rds_cong_exit();
518 rds_sysctl_exit();
519 rds_threads_exit();
520 rds_stats_exit();
521 rds_page_exit();
522 rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
523 rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
524}
525module_exit(rds_exit);
526
527static int __init rds_init(void)
528{
529 int ret;
530
531 ret = rds_conn_init();
532 if (ret)
533 goto out;
534 ret = rds_threads_init();
535 if (ret)
536 goto out_conn;
537 ret = rds_sysctl_init();
538 if (ret)
539 goto out_threads;
540 ret = rds_stats_init();
541 if (ret)
542 goto out_sysctl;
543 ret = proto_register(&rds_proto, 1);
544 if (ret)
545 goto out_stats;
546 ret = sock_register(&rds_family_ops);
547 if (ret)
548 goto out_proto;
549
550 rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
551 rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
552
553 /* ib/iwarp transports currently compiled-in */
554 ret = rds_rdma_init();
555 if (ret)
556 goto out_sock;
557 goto out;
558
559out_sock:
560 sock_unregister(rds_family_ops.family);
561out_proto:
562 proto_unregister(&rds_proto);
563out_stats:
564 rds_stats_exit();
565out_sysctl:
566 rds_sysctl_exit();
567out_threads:
568 rds_threads_exit();
569out_conn:
570 rds_conn_exit();
571 rds_cong_exit();
572 rds_page_exit();
573out:
574 return ret;
575}
576module_init(rds_init);
577
578#define DRV_VERSION "4.0"
579#define DRV_RELDATE "Feb 12, 2009"
580
581MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
582MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
583 " v" DRV_VERSION " (" DRV_RELDATE ")");
584MODULE_VERSION(DRV_VERSION);
585MODULE_LICENSE("Dual BSD/GPL");
586MODULE_ALIAS_NETPROTO(PF_RDS);
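
As a rough illustration of the socket family this file registers (not part of the patch itself), a userspace program would drive AF_RDS along the lines of the sketch below, following the poll() semantics documented above rds_poll(). PF_RDS normally comes from the uapi headers added with this series; the fallback value here is the AF_RDS number reserved for it, and the bind address is a placeholder for a local IPoIB address.

/* Illustrative userspace sketch only; not part of this patch. */
#include <stdio.h>
#include <string.h>
#include <poll.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#ifndef PF_RDS
#define PF_RDS 21	/* assumed AF_RDS/PF_RDS value from the uapi headers */
#endif

int main(void)
{
	struct sockaddr_in sin;
	struct pollfd pfd;
	int fd;

	/* RDS sockets are SOCK_SEQPACKET with protocol 0 (see rds_create()) */
	fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
	if (fd < 0) {
		perror("socket(PF_RDS)");
		return 1;
	}

	/* Bind to a local address; sin_port == 0 lets rds_bind() pick a port */
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = inet_addr("192.168.0.1");	/* placeholder IPoIB addr */
	sin.sin_port = 0;
	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
		perror("bind");
		return 1;
	}

	/* POLLIN: queued data, notifications, or congestion updates.
	 * POLLOUT: room on the send queue (a send may still hit ENOBUFS). */
	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT;
	poll(&pfd, 1, 1000);
	return 0;
}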
diff --git a/net/rds/bind.c b/net/rds/bind.c
new file mode 100644
index 000000000000..c17cc39160ce
--- /dev/null
+++ b/net/rds/bind.c
@@ -0,0 +1,199 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <net/sock.h>
35#include <linux/in.h>
36#include <linux/if_arp.h>
37#include "rds.h"
38
39/*
40 * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
41 * particularly zippy.
42 *
43 * This is now called for every incoming frame so we arguably care much more
44 * about it than we used to.
45 */
46static DEFINE_SPINLOCK(rds_bind_lock);
47static struct rb_root rds_bind_tree = RB_ROOT;
48
49static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
50 struct rds_sock *insert)
51{
52 struct rb_node **p = &rds_bind_tree.rb_node;
53 struct rb_node *parent = NULL;
54 struct rds_sock *rs;
55 u64 cmp;
56 u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
57
58 while (*p) {
59 parent = *p;
60 rs = rb_entry(parent, struct rds_sock, rs_bound_node);
61
62 cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
63 be16_to_cpu(rs->rs_bound_port);
64
65 if (needle < cmp)
66 p = &(*p)->rb_left;
67 else if (needle > cmp)
68 p = &(*p)->rb_right;
69 else
70 return rs;
71 }
72
73 if (insert) {
74 rb_link_node(&insert->rs_bound_node, parent, p);
75 rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
76 }
77 return NULL;
78}
79
80/*
81 * Return the rds_sock bound at the given local address.
82 *
83 * The rx path can race with rds_release. We notice if rds_release() has
84 * marked this socket and don't return a rs ref to the rx path.
85 */
86struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
87{
88 struct rds_sock *rs;
89 unsigned long flags;
90
91 spin_lock_irqsave(&rds_bind_lock, flags);
92 rs = rds_bind_tree_walk(addr, port, NULL);
93 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
94 rds_sock_addref(rs);
95 else
96 rs = NULL;
97 spin_unlock_irqrestore(&rds_bind_lock, flags);
98
99 rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
100 ntohs(port));
101 return rs;
102}
103
104/* returns -ve errno or +ve port */
105static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
106{
107 unsigned long flags;
108 int ret = -EADDRINUSE;
109 u16 rover, last;
110
111 if (*port != 0) {
112 rover = be16_to_cpu(*port);
113 last = rover;
114 } else {
115 rover = max_t(u16, net_random(), 2);
116 last = rover - 1;
117 }
118
119 spin_lock_irqsave(&rds_bind_lock, flags);
120
121 do {
122 if (rover == 0)
123 rover++;
124 if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) {
125 *port = cpu_to_be16(rover);
126 ret = 0;
127 break;
128 }
129 } while (rover++ != last);
130
131 if (ret == 0) {
132 rs->rs_bound_addr = addr;
133 rs->rs_bound_port = *port;
134 rds_sock_addref(rs);
135
136 rdsdebug("rs %p binding to %pI4:%d\n",
137 rs, &addr, (int)ntohs(*port));
138 }
139
140 spin_unlock_irqrestore(&rds_bind_lock, flags);
141
142 return ret;
143}
144
145void rds_remove_bound(struct rds_sock *rs)
146{
147 unsigned long flags;
148
149 spin_lock_irqsave(&rds_bind_lock, flags);
150
151 if (rs->rs_bound_addr) {
152 rdsdebug("rs %p unbinding from %pI4:%d\n",
153 rs, &rs->rs_bound_addr,
154 ntohs(rs->rs_bound_port));
155
156 rb_erase(&rs->rs_bound_node, &rds_bind_tree);
157 rds_sock_put(rs);
158 rs->rs_bound_addr = 0;
159 }
160
161 spin_unlock_irqrestore(&rds_bind_lock, flags);
162}
163
164int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
165{
166 struct sock *sk = sock->sk;
167 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
168 struct rds_sock *rs = rds_sk_to_rs(sk);
169 struct rds_transport *trans;
170 int ret = 0;
171
172 lock_sock(sk);
173
174 if (addr_len != sizeof(struct sockaddr_in) ||
175 sin->sin_family != AF_INET ||
176 rs->rs_bound_addr ||
177 sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
178 ret = -EINVAL;
179 goto out;
180 }
181
182 ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
183 if (ret)
184 goto out;
185
186 trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
187 if (trans == NULL) {
188 ret = -EADDRNOTAVAIL;
189 rds_remove_bound(rs);
190 goto out;
191 }
192
193 rs->rs_transport = trans;
194 ret = 0;
195
196out:
197 release_sock(sk);
198 return ret;
199}
diff --git a/net/rds/cong.c b/net/rds/cong.c
new file mode 100644
index 000000000000..710e4599d76c
--- /dev/null
+++ b/net/rds/cong.c
@@ -0,0 +1,404 @@
1/*
2 * Copyright (c) 2007 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/types.h>
34#include <linux/rbtree.h>
35
36#include <asm-generic/bitops/le.h>
37
38#include "rds.h"
39
40/*
41 * This file implements the receive side of the unconventional congestion
42 * management in RDS.
43 *
44 * Messages waiting in the receive queue on the receiving socket are accounted
 45 * against the socket's SO_RCVBUF option value. Only the payload bytes in the
46 * message are accounted for. If the number of bytes queued equals or exceeds
47 * rcvbuf then the socket is congested. All sends attempted to this socket's
48 * address should return block or return -EWOULDBLOCK.
49 *
50 * Applications are expected to be reasonably tuned such that this situation
51 * very rarely occurs. An application encountering this "back-pressure" is
52 * considered a bug.
53 *
54 * This is implemented by having each node maintain bitmaps which indicate
55 * which ports on bound addresses are congested. As the bitmap changes it is
56 * sent through all the connections which terminate in the local address of the
57 * bitmap which changed.
58 *
59 * The bitmaps are allocated as connections are brought up. This avoids
60 * allocation in the interrupt handling path which queues messages on sockets.
61 * The dense bitmaps let transports send the entire bitmap on any bitmap change
62 * reasonably efficiently. This is much easier to implement than some
63 * finer-grained communication of per-port congestion. The sender does a very
64 * inexpensive bit test to test if the port it's about to send to is congested
65 * or not.
66 */
67
68/*
69 * Interaction with poll is a tad tricky. We want all processes stuck in
70 * poll to wake up and check whether a congested destination became uncongested.
71 * The really sad thing is we have no idea which destinations the application
72 * wants to send to - we don't even know which rds_connections are involved.
73 * So until we implement a more flexible rds poll interface, we have to make
74 * do with this:
75 * We maintain a global counter that is incremented each time a congestion map
76 * update is received. Each rds socket tracks this value, and if rds_poll
77 * finds that the saved generation number is smaller than the global generation
78 * number, it wakes up the process.
79 */
80static atomic_t rds_cong_generation = ATOMIC_INIT(0);
81
82/*
83 * Congestion monitoring
84 */
85static LIST_HEAD(rds_cong_monitor);
86static DEFINE_RWLOCK(rds_cong_monitor_lock);
87
88/*
89 * Yes, a global lock. It's used so infrequently that it's worth keeping it
90 * global to simplify the locking. It's only used in the following
91 * circumstances:
92 *
93 * - on connection buildup to associate a conn with its maps
94 * - on map changes to inform conns of a new map to send
95 *
96 * It's sadly ordered under the socket callback lock and the connection lock.
97 * Receive paths can mark ports congested from interrupt context so the
98 * lock masks interrupts.
99 */
100static DEFINE_SPINLOCK(rds_cong_lock);
101static struct rb_root rds_cong_tree = RB_ROOT;
102
103static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
104 struct rds_cong_map *insert)
105{
106 struct rb_node **p = &rds_cong_tree.rb_node;
107 struct rb_node *parent = NULL;
108 struct rds_cong_map *map;
109
110 while (*p) {
111 parent = *p;
112 map = rb_entry(parent, struct rds_cong_map, m_rb_node);
113
114 if (addr < map->m_addr)
115 p = &(*p)->rb_left;
116 else if (addr > map->m_addr)
117 p = &(*p)->rb_right;
118 else
119 return map;
120 }
121
122 if (insert) {
123 rb_link_node(&insert->m_rb_node, parent, p);
124 rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
125 }
126 return NULL;
127}
128
129/*
130 * There is only ever one bitmap for any address. Connections try and allocate
131 * these bitmaps in the process getting pointers to them. The bitmaps are only
132 * ever freed as the module is removed after all connections have been freed.
133 */
134static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
135{
136 struct rds_cong_map *map;
137 struct rds_cong_map *ret = NULL;
138 unsigned long zp;
139 unsigned long i;
140 unsigned long flags;
141
142 map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
143 if (map == NULL)
144 return NULL;
145
146 map->m_addr = addr;
147 init_waitqueue_head(&map->m_waitq);
148 INIT_LIST_HEAD(&map->m_conn_list);
149
150 for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
151 zp = get_zeroed_page(GFP_KERNEL);
152 if (zp == 0)
153 goto out;
154 map->m_page_addrs[i] = zp;
155 }
156
157 spin_lock_irqsave(&rds_cong_lock, flags);
158 ret = rds_cong_tree_walk(addr, map);
159 spin_unlock_irqrestore(&rds_cong_lock, flags);
160
161 if (ret == NULL) {
162 ret = map;
163 map = NULL;
164 }
165
166out:
167 if (map) {
168 for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
169 free_page(map->m_page_addrs[i]);
170 kfree(map);
171 }
172
173 rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
174
175 return ret;
176}
177
178/*
179 * Put the conn on its local map's list. This is called when the conn is
180 * really added to the hash. It's nested under the rds_conn_lock, sadly.
181 */
182void rds_cong_add_conn(struct rds_connection *conn)
183{
184 unsigned long flags;
185
186 rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
187 spin_lock_irqsave(&rds_cong_lock, flags);
188 list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
189 spin_unlock_irqrestore(&rds_cong_lock, flags);
190}
191
192void rds_cong_remove_conn(struct rds_connection *conn)
193{
194 unsigned long flags;
195
196 rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
197 spin_lock_irqsave(&rds_cong_lock, flags);
198 list_del_init(&conn->c_map_item);
199 spin_unlock_irqrestore(&rds_cong_lock, flags);
200}
201
202int rds_cong_get_maps(struct rds_connection *conn)
203{
204 conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
205 conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
206
207 if (conn->c_lcong == NULL || conn->c_fcong == NULL)
208 return -ENOMEM;
209
210 return 0;
211}
212
213void rds_cong_queue_updates(struct rds_cong_map *map)
214{
215 struct rds_connection *conn;
216 unsigned long flags;
217
218 spin_lock_irqsave(&rds_cong_lock, flags);
219
220 list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
221 if (!test_and_set_bit(0, &conn->c_map_queued)) {
222 rds_stats_inc(s_cong_update_queued);
223 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
224 }
225 }
226
227 spin_unlock_irqrestore(&rds_cong_lock, flags);
228}
229
230void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
231{
232 rdsdebug("waking map %p for %pI4\n",
233 map, &map->m_addr);
234 rds_stats_inc(s_cong_update_received);
235 atomic_inc(&rds_cong_generation);
236 if (waitqueue_active(&map->m_waitq))
237 wake_up(&map->m_waitq);
238 if (waitqueue_active(&rds_poll_waitq))
239 wake_up_all(&rds_poll_waitq);
240
241 if (portmask && !list_empty(&rds_cong_monitor)) {
242 unsigned long flags;
243 struct rds_sock *rs;
244
245 read_lock_irqsave(&rds_cong_monitor_lock, flags);
246 list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
247 spin_lock(&rs->rs_lock);
248 rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
249 rs->rs_cong_mask &= ~portmask;
250 spin_unlock(&rs->rs_lock);
251 if (rs->rs_cong_notify)
252 rds_wake_sk_sleep(rs);
253 }
254 read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
255 }
256}
257
258int rds_cong_updated_since(unsigned long *recent)
259{
260 unsigned long gen = atomic_read(&rds_cong_generation);
261
262 if (likely(*recent == gen))
263 return 0;
264 *recent = gen;
265 return 1;
266}
267
268/*
269 * We're called under the locking that protects the socket's receive buffer
270 * consumption. This makes it a lot easier for the caller to only call us
271 * when it knows that an existing set bit needs to be cleared, and vice versa.
272 * We can't block and we need to deal with concurrent sockets working against
273 * the same per-address map.
274 */
275void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
276{
277 unsigned long i;
278 unsigned long off;
279
280 rdsdebug("setting congestion for %pI4:%u in map %p\n",
281 &map->m_addr, ntohs(port), map);
282
283 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
284 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
285
286 generic___set_le_bit(off, (void *)map->m_page_addrs[i]);
287}
288
289void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
290{
291 unsigned long i;
292 unsigned long off;
293
294 rdsdebug("clearing congestion for %pI4:%u in map %p\n",
295 &map->m_addr, ntohs(port), map);
296
297 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
298 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
299
300 generic___clear_le_bit(off, (void *)map->m_page_addrs[i]);
301}
302
303static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
304{
305 unsigned long i;
306 unsigned long off;
307
308 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
309 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
310
311 return generic_test_le_bit(off, (void *)map->m_page_addrs[i]);
312}
313
314void rds_cong_add_socket(struct rds_sock *rs)
315{
316 unsigned long flags;
317
318 write_lock_irqsave(&rds_cong_monitor_lock, flags);
319 if (list_empty(&rs->rs_cong_list))
320 list_add(&rs->rs_cong_list, &rds_cong_monitor);
321 write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
322}
323
324void rds_cong_remove_socket(struct rds_sock *rs)
325{
326 unsigned long flags;
327 struct rds_cong_map *map;
328
329 write_lock_irqsave(&rds_cong_monitor_lock, flags);
330 list_del_init(&rs->rs_cong_list);
331 write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
332
333 /* update congestion map for now-closed port */
334 spin_lock_irqsave(&rds_cong_lock, flags);
335 map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
336 spin_unlock_irqrestore(&rds_cong_lock, flags);
337
338 if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
339 rds_cong_clear_bit(map, rs->rs_bound_port);
340 rds_cong_queue_updates(map);
341 }
342}
343
344int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
345 struct rds_sock *rs)
346{
347 if (!rds_cong_test_bit(map, port))
348 return 0;
349 if (nonblock) {
350 if (rs && rs->rs_cong_monitor) {
351 unsigned long flags;
352
353 /* It would have been nice to have an atomic set_bit on
354 * a uint64_t. */
355 spin_lock_irqsave(&rs->rs_lock, flags);
356 rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
357 spin_unlock_irqrestore(&rs->rs_lock, flags);
358
359 /* Test again - a congestion update may have arrived in
360 * the meantime. */
361 if (!rds_cong_test_bit(map, port))
362 return 0;
363 }
364 rds_stats_inc(s_cong_send_error);
365 return -ENOBUFS;
366 }
367
368 rds_stats_inc(s_cong_send_blocked);
369 rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));
370
371 return wait_event_interruptible(map->m_waitq,
372 !rds_cong_test_bit(map, port));
373}
374
375void rds_cong_exit(void)
376{
377 struct rb_node *node;
378 struct rds_cong_map *map;
379 unsigned long i;
380
381 while ((node = rb_first(&rds_cong_tree))) {
382 map = rb_entry(node, struct rds_cong_map, m_rb_node);
383 rdsdebug("freeing map %p\n", map);
384 rb_erase(&map->m_rb_node, &rds_cong_tree);
385 for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
386 free_page(map->m_page_addrs[i]);
387 kfree(map);
388 }
389}
390
391/*
392 * Allocate a RDS message containing a congestion update.
393 */
394struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
395{
396 struct rds_cong_map *map = conn->c_lcong;
397 struct rds_message *rm;
398
399 rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES);
400 if (!IS_ERR(rm))
401 rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;
402
403 return rm;
404}
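
The rds_cong_wait() path above returns -ENOBUFS to non-blocking senders and, when the socket has enabled congestion monitoring, arms a per-port mask so that a later congestion map update wakes the socket via POLLIN. A rough userspace sketch of that retry loop follows (not part of the patch); SOL_RDS and RDS_CONG_MONITOR are assumed to come from the uapi header added with this series, and the fallback values below are best-effort assumptions.

/* Illustrative userspace sketch only; not part of this patch. */
#include <errno.h>
#include <poll.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>

#ifndef SOL_RDS
#define SOL_RDS 276		/* assumed value from the uapi headers */
#endif
#ifndef RDS_CONG_MONITOR
#define RDS_CONG_MONITOR 6	/* assumed value from the uapi headers */
#endif

/* Send on an already-bound RDS socket without blocking.  On ENOBUFS the
 * destination port is congested; with RDS_CONG_MONITOR enabled a later
 * POLLIN signals a congestion map update, after which we retry. */
int rds_send_nonblock(int fd, const void *buf, size_t len,
		      const struct sockaddr *dest, socklen_t dest_len)
{
	int on = 1;

	if (setsockopt(fd, SOL_RDS, RDS_CONG_MONITOR, &on, sizeof(on)) < 0)
		perror("setsockopt(RDS_CONG_MONITOR)");

	for (;;) {
		ssize_t ret = sendto(fd, buf, len, MSG_DONTWAIT, dest, dest_len);

		if (ret >= 0)
			return 0;
		if (errno != ENOBUFS)
			return -1;

		/* Destination congested: wait for a congestion update
		 * (POLLIN may also mean ordinary incoming data). */
		struct pollfd pfd = { .fd = fd, .events = POLLIN };
		if (poll(&pfd, 1, -1) < 0)
			return -1;
	}
}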
diff --git a/net/rds/connection.c b/net/rds/connection.c
new file mode 100644
index 000000000000..273f064930a8
--- /dev/null
+++ b/net/rds/connection.c
@@ -0,0 +1,487 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/list.h>
35#include <net/inet_hashtables.h>
36
37#include "rds.h"
38#include "loop.h"
39#include "rdma.h"
40
41#define RDS_CONNECTION_HASH_BITS 12
42#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
43#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1)
44
45/* converting this to RCU is a chore for another day.. */
46static DEFINE_SPINLOCK(rds_conn_lock);
47static unsigned long rds_conn_count;
48static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
49static struct kmem_cache *rds_conn_slab;
50
51static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
52{
53 /* Pass NULL, don't need struct net for hash */
54 unsigned long hash = inet_ehashfn(NULL,
55 be32_to_cpu(laddr), 0,
56 be32_to_cpu(faddr), 0);
57 return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
58}
59
60#define rds_conn_info_set(var, test, suffix) do { \
61 if (test) \
62 var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
63} while (0)
64
65static inline int rds_conn_is_sending(struct rds_connection *conn)
66{
67 int ret = 0;
68
69 if (!mutex_trylock(&conn->c_send_lock))
70 ret = 1;
71 else
72 mutex_unlock(&conn->c_send_lock);
73
74 return ret;
75}
76
77static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
78 __be32 laddr, __be32 faddr,
79 struct rds_transport *trans)
80{
81 struct rds_connection *conn, *ret = NULL;
82 struct hlist_node *pos;
83
84 hlist_for_each_entry(conn, pos, head, c_hash_node) {
85 if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
86 conn->c_trans == trans) {
87 ret = conn;
88 break;
89 }
90 }
91 rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret,
92 &laddr, &faddr);
93 return ret;
94}
95
96/*
97 * This is called by transports as they're bringing down a connection.
98 * It clears partial message state so that the transport can start sending
99 * and receiving over this connection again in the future. It is up to
100 * the transport to have serialized this call with its send and recv.
101 */
102void rds_conn_reset(struct rds_connection *conn)
103{
104 rdsdebug("connection %pI4 to %pI4 reset\n",
105 &conn->c_laddr, &conn->c_faddr);
106
107 rds_stats_inc(s_conn_reset);
108 rds_send_reset(conn);
109 conn->c_flags = 0;
110
111 /* Do not clear next_rx_seq here, else we cannot distinguish
112 * retransmitted packets from new packets, and will hand all
113 * of them to the application. That is not consistent with the
114 * reliability guarantees of RDS. */
115}
116
117/*
118 * There is only ever one 'conn' for a given pair of addresses in the
119 * system at a time. They contain messages to be retransmitted and so
120 * span the lifetime of the actual underlying transport connections.
121 *
122 * For now they are not garbage collected once they're created. They
123 * are torn down as the module is removed, if ever.
124 */
125static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
126 struct rds_transport *trans, gfp_t gfp,
127 int is_outgoing)
128{
129 struct rds_connection *conn, *tmp, *parent = NULL;
130 struct hlist_head *head = rds_conn_bucket(laddr, faddr);
131 unsigned long flags;
132 int ret;
133
134 spin_lock_irqsave(&rds_conn_lock, flags);
135 conn = rds_conn_lookup(head, laddr, faddr, trans);
136 if (conn
137 && conn->c_loopback
138 && conn->c_trans != &rds_loop_transport
139 && !is_outgoing) {
140 /* This is a looped back IB connection, and we're
141 * called by the code handling the incoming connect.
142 * We need a second connection object into which we
143 * can stick the other QP. */
144 parent = conn;
145 conn = parent->c_passive;
146 }
147 spin_unlock_irqrestore(&rds_conn_lock, flags);
148 if (conn)
149 goto out;
150
151 conn = kmem_cache_alloc(rds_conn_slab, gfp);
152 if (conn == NULL) {
153 conn = ERR_PTR(-ENOMEM);
154 goto out;
155 }
156
157 memset(conn, 0, sizeof(*conn));
158
159 INIT_HLIST_NODE(&conn->c_hash_node);
160 conn->c_version = RDS_PROTOCOL_3_0;
161 conn->c_laddr = laddr;
162 conn->c_faddr = faddr;
163 spin_lock_init(&conn->c_lock);
164 conn->c_next_tx_seq = 1;
165
166 mutex_init(&conn->c_send_lock);
167 INIT_LIST_HEAD(&conn->c_send_queue);
168 INIT_LIST_HEAD(&conn->c_retrans);
169
170 ret = rds_cong_get_maps(conn);
171 if (ret) {
172 kmem_cache_free(rds_conn_slab, conn);
173 conn = ERR_PTR(ret);
174 goto out;
175 }
176
177 /*
178 * This is where a connection becomes loopback. If *any* RDS sockets
179 * can bind to the destination address then we'd rather the messages
180 * flow through loopback rather than either transport.
181 */
182 if (rds_trans_get_preferred(faddr)) {
183 conn->c_loopback = 1;
184 if (is_outgoing && trans->t_prefer_loopback) {
185 /* "outgoing" connection - and the transport
186 * says it wants the connection handled by the
187 * loopback transport. This is what TCP does.
188 */
189 trans = &rds_loop_transport;
190 }
191 }
192
193 conn->c_trans = trans;
194
195 ret = trans->conn_alloc(conn, gfp);
196 if (ret) {
197 kmem_cache_free(rds_conn_slab, conn);
198 conn = ERR_PTR(ret);
199 goto out;
200 }
201
202 atomic_set(&conn->c_state, RDS_CONN_DOWN);
203 conn->c_reconnect_jiffies = 0;
204 INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
205 INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
206 INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
207 INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
208 mutex_init(&conn->c_cm_lock);
209 conn->c_flags = 0;
210
211 rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
212 conn, &laddr, &faddr,
213 trans->t_name ? trans->t_name : "[unknown]",
214 is_outgoing ? "(outgoing)" : "");
215
216 spin_lock_irqsave(&rds_conn_lock, flags);
217 if (parent == NULL) {
218 tmp = rds_conn_lookup(head, laddr, faddr, trans);
219 if (tmp == NULL)
220 hlist_add_head(&conn->c_hash_node, head);
221 } else {
222 tmp = parent->c_passive;
223 if (!tmp)
224 parent->c_passive = conn;
225 }
226
227 if (tmp) {
228 trans->conn_free(conn->c_transport_data);
229 kmem_cache_free(rds_conn_slab, conn);
230 conn = tmp;
231 } else {
232 rds_cong_add_conn(conn);
233 rds_conn_count++;
234 }
235
236 spin_unlock_irqrestore(&rds_conn_lock, flags);
237
238out:
239 return conn;
240}
241
242struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
243 struct rds_transport *trans, gfp_t gfp)
244{
245 return __rds_conn_create(laddr, faddr, trans, gfp, 0);
246}
247
248struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
249 struct rds_transport *trans, gfp_t gfp)
250{
251 return __rds_conn_create(laddr, faddr, trans, gfp, 1);
252}
253
254void rds_conn_destroy(struct rds_connection *conn)
255{
256 struct rds_message *rm, *rtmp;
257
258 rdsdebug("freeing conn %p for %pI4 -> "
259 "%pI4\n", conn, &conn->c_laddr,
260 &conn->c_faddr);
261
262 hlist_del_init(&conn->c_hash_node);
263
264 /* wait for the rds thread to shut it down */
265 atomic_set(&conn->c_state, RDS_CONN_ERROR);
266 cancel_delayed_work(&conn->c_conn_w);
267 queue_work(rds_wq, &conn->c_down_w);
268 flush_workqueue(rds_wq);
269
270 /* tear down queued messages */
271 list_for_each_entry_safe(rm, rtmp,
272 &conn->c_send_queue,
273 m_conn_item) {
274 list_del_init(&rm->m_conn_item);
275 BUG_ON(!list_empty(&rm->m_sock_item));
276 rds_message_put(rm);
277 }
278 if (conn->c_xmit_rm)
279 rds_message_put(conn->c_xmit_rm);
280
281 conn->c_trans->conn_free(conn->c_transport_data);
282
283 /*
284 * The congestion maps aren't freed up here. They're
285 * freed by rds_cong_exit() after all the connections
286 * have been freed.
287 */
288 rds_cong_remove_conn(conn);
289
290 BUG_ON(!list_empty(&conn->c_retrans));
291 kmem_cache_free(rds_conn_slab, conn);
292
293 rds_conn_count--;
294}
295
296static void rds_conn_message_info(struct socket *sock, unsigned int len,
297 struct rds_info_iterator *iter,
298 struct rds_info_lengths *lens,
299 int want_send)
300{
301 struct hlist_head *head;
302 struct hlist_node *pos;
303 struct list_head *list;
304 struct rds_connection *conn;
305 struct rds_message *rm;
306 unsigned long flags;
307 unsigned int total = 0;
308 size_t i;
309
310 len /= sizeof(struct rds_info_message);
311
312 spin_lock_irqsave(&rds_conn_lock, flags);
313
314 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
315 i++, head++) {
316 hlist_for_each_entry(conn, pos, head, c_hash_node) {
317 if (want_send)
318 list = &conn->c_send_queue;
319 else
320 list = &conn->c_retrans;
321
322 spin_lock(&conn->c_lock);
323
324 /* XXX too lazy to maintain counts.. */
325 list_for_each_entry(rm, list, m_conn_item) {
326 total++;
327 if (total <= len)
328 rds_inc_info_copy(&rm->m_inc, iter,
329 conn->c_laddr,
330 conn->c_faddr, 0);
331 }
332
333 spin_unlock(&conn->c_lock);
334 }
335 }
336
337 spin_unlock_irqrestore(&rds_conn_lock, flags);
338
339 lens->nr = total;
340 lens->each = sizeof(struct rds_info_message);
341}
342
343static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
344 struct rds_info_iterator *iter,
345 struct rds_info_lengths *lens)
346{
347 rds_conn_message_info(sock, len, iter, lens, 1);
348}
349
350static void rds_conn_message_info_retrans(struct socket *sock,
351 unsigned int len,
352 struct rds_info_iterator *iter,
353 struct rds_info_lengths *lens)
354{
355 rds_conn_message_info(sock, len, iter, lens, 0);
356}
357
358void rds_for_each_conn_info(struct socket *sock, unsigned int len,
359 struct rds_info_iterator *iter,
360 struct rds_info_lengths *lens,
361 int (*visitor)(struct rds_connection *, void *),
362 size_t item_len)
363{
364 uint64_t buffer[(item_len + 7) / 8];
365 struct hlist_head *head;
366 struct hlist_node *pos;
367 struct hlist_node *tmp;
368 struct rds_connection *conn;
369 unsigned long flags;
370 size_t i;
371
372 spin_lock_irqsave(&rds_conn_lock, flags);
373
374 lens->nr = 0;
375 lens->each = item_len;
376
377 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
378 i++, head++) {
379 hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) {
380
381 /* XXX no c_lock usage.. */
382 if (!visitor(conn, buffer))
383 continue;
384
385 /* We copy as much as we can fit in the buffer,
386 * but we count all items so that the caller
387 * can resize the buffer. */
388 if (len >= item_len) {
389 rds_info_copy(iter, buffer, item_len);
390 len -= item_len;
391 }
392 lens->nr++;
393 }
394 }
395
396 spin_unlock_irqrestore(&rds_conn_lock, flags);
397}
398
399static int rds_conn_info_visitor(struct rds_connection *conn,
400 void *buffer)
401{
402 struct rds_info_connection *cinfo = buffer;
403
404 cinfo->next_tx_seq = conn->c_next_tx_seq;
405 cinfo->next_rx_seq = conn->c_next_rx_seq;
406 cinfo->laddr = conn->c_laddr;
407 cinfo->faddr = conn->c_faddr;
408 strncpy(cinfo->transport, conn->c_trans->t_name,
409 sizeof(cinfo->transport));
410 cinfo->flags = 0;
411
412 rds_conn_info_set(cinfo->flags,
413 rds_conn_is_sending(conn), SENDING);
414 /* XXX Future: return the state rather than these funky bits */
415 rds_conn_info_set(cinfo->flags,
416 atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
417 CONNECTING);
418 rds_conn_info_set(cinfo->flags,
419 atomic_read(&conn->c_state) == RDS_CONN_UP,
420 CONNECTED);
421 return 1;
422}
423
424static void rds_conn_info(struct socket *sock, unsigned int len,
425 struct rds_info_iterator *iter,
426 struct rds_info_lengths *lens)
427{
428 rds_for_each_conn_info(sock, len, iter, lens,
429 rds_conn_info_visitor,
430 sizeof(struct rds_info_connection));
431}
432
433int __init rds_conn_init(void)
434{
435 rds_conn_slab = kmem_cache_create("rds_connection",
436 sizeof(struct rds_connection),
437 0, 0, NULL);
438 if (rds_conn_slab == NULL)
439 return -ENOMEM;
440
441 rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
442 rds_info_register_func(RDS_INFO_SEND_MESSAGES,
443 rds_conn_message_info_send);
444 rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
445 rds_conn_message_info_retrans);
446
447 return 0;
448}
449
450void rds_conn_exit(void)
451{
452 rds_loop_exit();
453
454 WARN_ON(!hlist_empty(rds_conn_hash));
455
456 kmem_cache_destroy(rds_conn_slab);
457
458 rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
459 rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
460 rds_conn_message_info_send);
461 rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
462 rds_conn_message_info_retrans);
463}
464
465/*
466 * Force a disconnect
467 */
468void rds_conn_drop(struct rds_connection *conn)
469{
470 atomic_set(&conn->c_state, RDS_CONN_ERROR);
471 queue_work(rds_wq, &conn->c_down_w);
472}
473
474/*
475 * An error occurred on the connection
476 */
477void
478__rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
479{
480 va_list ap;
481
482 va_start(ap, fmt);
483 vprintk(fmt, ap);
484 va_end(ap);
485
486 rds_conn_drop(conn);
487}
diff --git a/net/rds/ib.c b/net/rds/ib.c
new file mode 100644
index 000000000000..06a7b798d9a7
--- /dev/null
+++ b/net/rds/ib.c
@@ -0,0 +1,323 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/if.h>
36#include <linux/netdevice.h>
37#include <linux/inetdevice.h>
38#include <linux/if_arp.h>
39#include <linux/delay.h>
40
41#include "rds.h"
42#include "ib.h"
43
44unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
45unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
46
47module_param(fmr_pool_size, int, 0444);
48MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
49module_param(fmr_message_size, int, 0444);
50MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
51
52struct list_head rds_ib_devices;
53
54DEFINE_SPINLOCK(ib_nodev_conns_lock);
55LIST_HEAD(ib_nodev_conns);
56
57void rds_ib_add_one(struct ib_device *device)
58{
59 struct rds_ib_device *rds_ibdev;
60 struct ib_device_attr *dev_attr;
61
62 /* Only handle IB (no iWARP) devices */
63 if (device->node_type != RDMA_NODE_IB_CA)
64 return;
65
66 dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
67 if (!dev_attr)
68 return;
69
70 if (ib_query_device(device, dev_attr)) {
71 rdsdebug("Query device failed for %s\n", device->name);
72 goto free_attr;
73 }
74
75 rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
76 if (!rds_ibdev)
77 goto free_attr;
78
79 spin_lock_init(&rds_ibdev->spinlock);
80
81 rds_ibdev->max_wrs = dev_attr->max_qp_wr;
82 rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
83
84 rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1);
85 rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift;
86 rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1);
87 rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
88 rds_ibdev->max_fmrs = dev_attr->max_fmr ?
89 min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
90 fmr_pool_size;
91
92 rds_ibdev->dev = device;
93 rds_ibdev->pd = ib_alloc_pd(device);
94 if (IS_ERR(rds_ibdev->pd))
95 goto free_dev;
96
97 rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
98 IB_ACCESS_LOCAL_WRITE);
99 if (IS_ERR(rds_ibdev->mr))
100 goto err_pd;
101
102 rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
103 if (IS_ERR(rds_ibdev->mr_pool)) {
104 rds_ibdev->mr_pool = NULL;
105 goto err_mr;
106 }
107
108 INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
109 INIT_LIST_HEAD(&rds_ibdev->conn_list);
110 list_add_tail(&rds_ibdev->list, &rds_ib_devices);
111
112 ib_set_client_data(device, &rds_ib_client, rds_ibdev);
113
114 goto free_attr;
115
116err_mr:
117 ib_dereg_mr(rds_ibdev->mr);
118err_pd:
119 ib_dealloc_pd(rds_ibdev->pd);
120free_dev:
121 kfree(rds_ibdev);
122free_attr:
123 kfree(dev_attr);
124}
125
126void rds_ib_remove_one(struct ib_device *device)
127{
128 struct rds_ib_device *rds_ibdev;
129 struct rds_ib_ipaddr *i_ipaddr, *i_next;
130
131 rds_ibdev = ib_get_client_data(device, &rds_ib_client);
132 if (!rds_ibdev)
133 return;
134
135 list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
136 list_del(&i_ipaddr->list);
137 kfree(i_ipaddr);
138 }
139
140 rds_ib_remove_conns(rds_ibdev);
141
142 if (rds_ibdev->mr_pool)
143 rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
144
145 ib_dereg_mr(rds_ibdev->mr);
146
147 while (ib_dealloc_pd(rds_ibdev->pd)) {
148 rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
149 msleep(1);
150 }
151
152 list_del(&rds_ibdev->list);
153 kfree(rds_ibdev);
154}
155
156struct ib_client rds_ib_client = {
157 .name = "rds_ib",
158 .add = rds_ib_add_one,
159 .remove = rds_ib_remove_one
160};
161
162static int rds_ib_conn_info_visitor(struct rds_connection *conn,
163 void *buffer)
164{
165 struct rds_info_rdma_connection *iinfo = buffer;
166 struct rds_ib_connection *ic;
167
168 /* We will only ever look at IB transports */
169 if (conn->c_trans != &rds_ib_transport)
170 return 0;
171
172 iinfo->src_addr = conn->c_laddr;
173 iinfo->dst_addr = conn->c_faddr;
174
175 memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
176 memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
177 if (rds_conn_state(conn) == RDS_CONN_UP) {
178 struct rds_ib_device *rds_ibdev;
179 struct rdma_dev_addr *dev_addr;
180
181 ic = conn->c_transport_data;
182 dev_addr = &ic->i_cm_id->route.addr.dev_addr;
183
184 ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
185 ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
186
187 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
188 iinfo->max_send_wr = ic->i_send_ring.w_nr;
189 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
190 iinfo->max_send_sge = rds_ibdev->max_sge;
191 rds_ib_get_mr_info(rds_ibdev, iinfo);
192 }
193 return 1;
194}
195
196static void rds_ib_ic_info(struct socket *sock, unsigned int len,
197 struct rds_info_iterator *iter,
198 struct rds_info_lengths *lens)
199{
200 rds_for_each_conn_info(sock, len, iter, lens,
201 rds_ib_conn_info_visitor,
202 sizeof(struct rds_info_rdma_connection));
203}
204
205
206/*
207 * Early RDS/IB was built to only bind to an address if there is an IPoIB
208 * device with that address set.
209 *
210 * If it were me, I'd advocate for something more flexible. Sending and
211 * receiving should be device-agnostic. Transports would try and maintain
212 * connections between peers who have messages queued. Userspace would be
213 * allowed to influence which paths have priority. We could call userspace
214 * asserting this policy "routing".
215 */
216static int rds_ib_laddr_check(__be32 addr)
217{
218 int ret;
219 struct rdma_cm_id *cm_id;
220 struct sockaddr_in sin;
221
222 /* Create a CMA ID and try to bind it. This catches both
223 * IB and iWARP capable NICs.
224 */
225 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
226 if (!cm_id)
227 return -EADDRNOTAVAIL;
228
229 memset(&sin, 0, sizeof(sin));
230 sin.sin_family = AF_INET;
231 sin.sin_addr.s_addr = addr;
232
233 /* rdma_bind_addr will only succeed for IB & iWARP devices */
234 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
235 /* due to this, we will claim to support iWARP devices unless we
236 check node_type. */
237 if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
238 ret = -EADDRNOTAVAIL;
239
240 rdsdebug("addr %pI4 ret %d node type %d\n",
241 &addr, ret,
242 cm_id->device ? cm_id->device->node_type : -1);
243
244 rdma_destroy_id(cm_id);
245
246 return ret;
247}
248
249void rds_ib_exit(void)
250{
251 rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
252 rds_ib_remove_nodev_conns();
253 ib_unregister_client(&rds_ib_client);
254 rds_ib_sysctl_exit();
255 rds_ib_recv_exit();
256 rds_trans_unregister(&rds_ib_transport);
257}
258
259struct rds_transport rds_ib_transport = {
260 .laddr_check = rds_ib_laddr_check,
261 .xmit_complete = rds_ib_xmit_complete,
262 .xmit = rds_ib_xmit,
263 .xmit_cong_map = NULL,
264 .xmit_rdma = rds_ib_xmit_rdma,
265 .recv = rds_ib_recv,
266 .conn_alloc = rds_ib_conn_alloc,
267 .conn_free = rds_ib_conn_free,
268 .conn_connect = rds_ib_conn_connect,
269 .conn_shutdown = rds_ib_conn_shutdown,
270 .inc_copy_to_user = rds_ib_inc_copy_to_user,
271 .inc_purge = rds_ib_inc_purge,
272 .inc_free = rds_ib_inc_free,
273 .cm_initiate_connect = rds_ib_cm_initiate_connect,
274 .cm_handle_connect = rds_ib_cm_handle_connect,
275 .cm_connect_complete = rds_ib_cm_connect_complete,
276 .stats_info_copy = rds_ib_stats_info_copy,
277 .exit = rds_ib_exit,
278 .get_mr = rds_ib_get_mr,
279 .sync_mr = rds_ib_sync_mr,
280 .free_mr = rds_ib_free_mr,
281 .flush_mrs = rds_ib_flush_mrs,
282 .t_owner = THIS_MODULE,
283 .t_name = "infiniband",
284};
285
286int __init rds_ib_init(void)
287{
288 int ret;
289
290 INIT_LIST_HEAD(&rds_ib_devices);
291
292 ret = ib_register_client(&rds_ib_client);
293 if (ret)
294 goto out;
295
296 ret = rds_ib_sysctl_init();
297 if (ret)
298 goto out_ibreg;
299
300 ret = rds_ib_recv_init();
301 if (ret)
302 goto out_sysctl;
303
304 ret = rds_trans_register(&rds_ib_transport);
305 if (ret)
306 goto out_recv;
307
308 rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
309
310 goto out;
311
312out_recv:
313 rds_ib_recv_exit();
314out_sysctl:
315 rds_ib_sysctl_exit();
316out_ibreg:
317 ib_unregister_client(&rds_ib_client);
318out:
319 return ret;
320}
321
322MODULE_LICENSE("GPL");
323
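
For reference, a standalone user-space sketch (not part of the patch) of the FMR page-geometry math in rds_ib_add_one() above: page_size_cap is a bitmask of the page sizes the HCA supports, so ffs() picks the smallest supported size, which is clamped to at least 512 bytes (1 << 9), and the page size and mask follow from the resulting shift. The sample cap value and the main() driver are illustrative assumptions; only the shift/size/mask formulas come from the code above.

#include <stdio.h>
#include <stdint.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	uint32_t page_size_cap = 0x1000;	/* assume: HCA reports only 4K page support */
	int shift = ffs(page_size_cap) - 1;	/* lowest set bit = smallest supported page */
	int fmr_page_shift = shift > 9 ? shift : 9;	/* never go below 512-byte pages */
	uint64_t fmr_page_size = 1ULL << fmr_page_shift;
	uint64_t fmr_page_mask = ~(fmr_page_size - 1);

	printf("shift %d size %llu mask 0x%llx\n", fmr_page_shift,
	       (unsigned long long)fmr_page_size,
	       (unsigned long long)fmr_page_mask);

	/* a DMA address is page aligned for FMR purposes iff the low bits are clear */
	uint64_t dma_addr = 0x7f001200;
	printf("0x%llx is %saligned\n", (unsigned long long)dma_addr,
	       (dma_addr & ~fmr_page_mask) ? "not " : "");
	return 0;
}
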
diff --git a/net/rds/ib.h b/net/rds/ib.h
new file mode 100644
index 000000000000..8be563a1363a
--- /dev/null
+++ b/net/rds/ib.h
@@ -0,0 +1,367 @@
1#ifndef _RDS_IB_H
2#define _RDS_IB_H
3
4#include <rdma/ib_verbs.h>
5#include <rdma/rdma_cm.h>
6#include "rds.h"
7#include "rdma_transport.h"
8
9#define RDS_FMR_SIZE 256
10#define RDS_FMR_POOL_SIZE 4096
11
12#define RDS_IB_MAX_SGE 8
13#define RDS_IB_RECV_SGE 2
14
15#define RDS_IB_DEFAULT_RECV_WR 1024
16#define RDS_IB_DEFAULT_SEND_WR 256
17
18#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
19
20extern struct list_head rds_ib_devices;
21
22/*
23 * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
24 * try and minimize the amount of memory tied up in both the device and
25 * socket receive queues.
26 */
27/* page offset of the final full frag that fits in the page */
28#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
29struct rds_page_frag {
30 struct list_head f_item;
31 struct page *f_page;
32 unsigned long f_offset;
33 dma_addr_t f_mapped;
34};
35
36struct rds_ib_incoming {
37 struct list_head ii_frags;
38 struct rds_incoming ii_inc;
39};
40
41struct rds_ib_connect_private {
42 /* Add new fields at the end, and don't permute existing fields. */
43 __be32 dp_saddr;
44 __be32 dp_daddr;
45 u8 dp_protocol_major;
46 u8 dp_protocol_minor;
47 __be16 dp_protocol_minor_mask; /* bitmask */
48 __be32 dp_reserved1;
49 __be64 dp_ack_seq;
50 __be32 dp_credit; /* non-zero enables flow ctl */
51};
52
53struct rds_ib_send_work {
54 struct rds_message *s_rm;
55 struct rds_rdma_op *s_op;
56 struct ib_send_wr s_wr;
57 struct ib_sge s_sge[RDS_IB_MAX_SGE];
58 unsigned long s_queued;
59};
60
61struct rds_ib_recv_work {
62 struct rds_ib_incoming *r_ibinc;
63 struct rds_page_frag *r_frag;
64 struct ib_recv_wr r_wr;
65 struct ib_sge r_sge[2];
66};
67
68struct rds_ib_work_ring {
69 u32 w_nr;
70 u32 w_alloc_ptr;
71 u32 w_alloc_ctr;
72 u32 w_free_ptr;
73 atomic_t w_free_ctr;
74};
75
76struct rds_ib_device;
77
78struct rds_ib_connection {
79
80 struct list_head ib_node;
81 struct rds_ib_device *rds_ibdev;
82 struct rds_connection *conn;
83
84 /* alphabet soup, IBTA style */
85 struct rdma_cm_id *i_cm_id;
86 struct ib_pd *i_pd;
87 struct ib_mr *i_mr;
88 struct ib_cq *i_send_cq;
89 struct ib_cq *i_recv_cq;
90
91 /* tx */
92 struct rds_ib_work_ring i_send_ring;
93 struct rds_message *i_rm;
94 struct rds_header *i_send_hdrs;
95 u64 i_send_hdrs_dma;
96 struct rds_ib_send_work *i_sends;
97
98 /* rx */
99 struct mutex i_recv_mutex;
100 struct rds_ib_work_ring i_recv_ring;
101 struct rds_ib_incoming *i_ibinc;
102 u32 i_recv_data_rem;
103 struct rds_header *i_recv_hdrs;
104 u64 i_recv_hdrs_dma;
105 struct rds_ib_recv_work *i_recvs;
106 struct rds_page_frag i_frag;
107 u64 i_ack_recv; /* last ACK received */
108
109 /* sending acks */
110 unsigned long i_ack_flags;
111 u64 i_ack_next; /* next ACK to send */
112 struct rds_header *i_ack;
113 struct ib_send_wr i_ack_wr;
114 struct ib_sge i_ack_sge;
115 u64 i_ack_dma;
116 unsigned long i_ack_queued;
117
118 /* Flow control related information
119 *
120	 * Our algorithm uses a pair of variables that we need to access
121	 * atomically - one for the send credits, and one for the posted
122	 * recv credits we need to transfer to the remote.
123 * Rather than protect them using a slow spinlock, we put both into
124 * a single atomic_t and update it using cmpxchg
125 */
126 atomic_t i_credits;
127
128 /* Protocol version specific information */
129 unsigned int i_flowctl:1; /* enable/disable flow ctl */
130
131 /* Batched completions */
132 unsigned int i_unsignaled_wrs;
133 long i_unsignaled_bytes;
134};
135
136/* This assumes that atomic_t is at least 32 bits */
137#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
138#define IB_GET_POST_CREDITS(v) ((v) >> 16)
139#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
140#define IB_SET_POST_CREDITS(v) ((v) << 16)
141
142struct rds_ib_ipaddr {
143 struct list_head list;
144 __be32 ipaddr;
145};
146
147struct rds_ib_device {
148 struct list_head list;
149 struct list_head ipaddr_list;
150 struct list_head conn_list;
151 struct ib_device *dev;
152 struct ib_pd *pd;
153 struct ib_mr *mr;
154 struct rds_ib_mr_pool *mr_pool;
155 int fmr_page_shift;
156 int fmr_page_size;
157 u64 fmr_page_mask;
158 unsigned int fmr_max_remaps;
159 unsigned int max_fmrs;
160 int max_sge;
161 unsigned int max_wrs;
162 spinlock_t spinlock; /* protect the above */
163};
164
165/* bits for i_ack_flags */
166#define IB_ACK_IN_FLIGHT 0
167#define IB_ACK_REQUESTED 1
168
169/* Magic WR_ID for ACKs */
170#define RDS_IB_ACK_WR_ID (~(u64) 0)
171
172struct rds_ib_statistics {
173 uint64_t s_ib_connect_raced;
174 uint64_t s_ib_listen_closed_stale;
175 uint64_t s_ib_tx_cq_call;
176 uint64_t s_ib_tx_cq_event;
177 uint64_t s_ib_tx_ring_full;
178 uint64_t s_ib_tx_throttle;
179 uint64_t s_ib_tx_sg_mapping_failure;
180 uint64_t s_ib_tx_stalled;
181 uint64_t s_ib_tx_credit_updates;
182 uint64_t s_ib_rx_cq_call;
183 uint64_t s_ib_rx_cq_event;
184 uint64_t s_ib_rx_ring_empty;
185 uint64_t s_ib_rx_refill_from_cq;
186 uint64_t s_ib_rx_refill_from_thread;
187 uint64_t s_ib_rx_alloc_limit;
188 uint64_t s_ib_rx_credit_updates;
189 uint64_t s_ib_ack_sent;
190 uint64_t s_ib_ack_send_failure;
191 uint64_t s_ib_ack_send_delayed;
192 uint64_t s_ib_ack_send_piggybacked;
193 uint64_t s_ib_ack_received;
194 uint64_t s_ib_rdma_mr_alloc;
195 uint64_t s_ib_rdma_mr_free;
196 uint64_t s_ib_rdma_mr_used;
197 uint64_t s_ib_rdma_mr_pool_flush;
198 uint64_t s_ib_rdma_mr_pool_wait;
199 uint64_t s_ib_rdma_mr_pool_depleted;
200};
201
202extern struct workqueue_struct *rds_ib_wq;
203
204/*
205 * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
206 * doesn't define it.
207 */
208static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
209 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
210{
211 unsigned int i;
212
213 for (i = 0; i < sg_dma_len; ++i) {
214 ib_dma_sync_single_for_cpu(dev,
215 ib_sg_dma_address(dev, &sg[i]),
216 ib_sg_dma_len(dev, &sg[i]),
217 direction);
218 }
219}
220#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu
221
222static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
223 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
224{
225 unsigned int i;
226
227 for (i = 0; i < sg_dma_len; ++i) {
228 ib_dma_sync_single_for_device(dev,
229 ib_sg_dma_address(dev, &sg[i]),
230 ib_sg_dma_len(dev, &sg[i]),
231 direction);
232 }
233}
234#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device
235
236
237/* ib.c */
238extern struct rds_transport rds_ib_transport;
239extern void rds_ib_add_one(struct ib_device *device);
240extern void rds_ib_remove_one(struct ib_device *device);
241extern struct ib_client rds_ib_client;
242
243extern unsigned int fmr_pool_size;
244extern unsigned int fmr_message_size;
245
246extern spinlock_t ib_nodev_conns_lock;
247extern struct list_head ib_nodev_conns;
248
249/* ib_cm.c */
250int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
251void rds_ib_conn_free(void *arg);
252int rds_ib_conn_connect(struct rds_connection *conn);
253void rds_ib_conn_shutdown(struct rds_connection *conn);
254void rds_ib_state_change(struct sock *sk);
255int __init rds_ib_listen_init(void);
256void rds_ib_listen_stop(void);
257void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
258int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
259 struct rdma_cm_event *event);
260int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
261void rds_ib_cm_connect_complete(struct rds_connection *conn,
262 struct rdma_cm_event *event);
263
264
265#define rds_ib_conn_error(conn, fmt...) \
266 __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
267
268/* ib_rdma.c */
269int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
270int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
271void rds_ib_remove_nodev_conns(void);
272void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev);
273struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
274void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
275void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
276void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
277 struct rds_sock *rs, u32 *key_ret);
278void rds_ib_sync_mr(void *trans_private, int dir);
279void rds_ib_free_mr(void *trans_private, int invalidate);
280void rds_ib_flush_mrs(void);
281
282/* ib_recv.c */
283int __init rds_ib_recv_init(void);
284void rds_ib_recv_exit(void);
285int rds_ib_recv(struct rds_connection *conn);
286int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
287 gfp_t page_gfp, int prefill);
288void rds_ib_inc_purge(struct rds_incoming *inc);
289void rds_ib_inc_free(struct rds_incoming *inc);
290int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
291 size_t size);
292void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
293void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
294void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
295void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
296void rds_ib_attempt_ack(struct rds_ib_connection *ic);
297void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
298u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
299
300/* ib_ring.c */
301void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
302void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr);
303u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos);
304void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val);
305void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val);
306int rds_ib_ring_empty(struct rds_ib_work_ring *ring);
307int rds_ib_ring_low(struct rds_ib_work_ring *ring);
308u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring);
309u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
310extern wait_queue_head_t rds_ib_ring_empty_wait;
311
312/* ib_send.c */
313void rds_ib_xmit_complete(struct rds_connection *conn);
314int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
315 unsigned int hdr_off, unsigned int sg, unsigned int off);
316void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
317void rds_ib_send_init_ring(struct rds_ib_connection *ic);
318void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
319int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
320void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
321void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
322int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
323 u32 *adv_credits, int need_posted);
324
325/* ib_stats.c */
326DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
327#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
328unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
329 unsigned int avail);
330
331/* ib_sysctl.c */
332int __init rds_ib_sysctl_init(void);
333void rds_ib_sysctl_exit(void);
334extern unsigned long rds_ib_sysctl_max_send_wr;
335extern unsigned long rds_ib_sysctl_max_recv_wr;
336extern unsigned long rds_ib_sysctl_max_unsig_wrs;
337extern unsigned long rds_ib_sysctl_max_unsig_bytes;
338extern unsigned long rds_ib_sysctl_max_recv_allocation;
339extern unsigned int rds_ib_sysctl_flow_control;
340extern ctl_table rds_ib_sysctl_table[];
341
342/*
343 * Helper functions for getting/setting the header and data SGEs in
344 * RDS packets (not RDMA)
345 */
346static inline struct ib_sge *
347rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
348{
349 return &sge[0];
350}
351
352static inline struct ib_sge *
353rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
354{
355 return &sge[1];
356}
357
358static inline void rds_ib_set_64bit(u64 *ptr, u64 val)
359{
360#if BITS_PER_LONG == 64
361 *ptr = val;
362#else
363 set_64bit(ptr, val);
364#endif
365}
366
367#endif
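
For reference, a standalone sketch (not part of the patch) of the credit encoding declared in ib.h above: send credits sit in the low 16 bits, posted-receive credits in the high 16 bits, and both are updated together with a single compare-and-swap, as described in the comment on i_credits. The GCC __sync builtin stands in for the kernel's atomic cmpxchg, and add_credits() plus the sample values are illustrative assumptions.

#include <stdio.h>

#define IB_GET_SEND_CREDITS(v)	((v) & 0xffff)
#define IB_GET_POST_CREDITS(v)	((v) >> 16)
#define IB_SET_SEND_CREDITS(v)	((v) & 0xffff)
#define IB_SET_POST_CREDITS(v)	((v) << 16)

static unsigned int i_credits;	/* stands in for ic->i_credits */

/* grant more send credits without losing a concurrent update to either half */
static void add_credits(unsigned int credits)
{
	unsigned int oldval, newval;

	do {
		oldval = i_credits;
		newval = oldval + IB_SET_SEND_CREDITS(credits);
	} while (__sync_val_compare_and_swap(&i_credits, oldval, newval) != oldval);
}

int main(void)
{
	i_credits = IB_SET_SEND_CREDITS(16) | IB_SET_POST_CREDITS(128);
	add_credits(8);

	printf("send %u post %u\n",
	       IB_GET_SEND_CREDITS(i_credits),
	       IB_GET_POST_CREDITS(i_credits));	/* prints: send 24 post 128 */
	return 0;
}
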
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
new file mode 100644
index 000000000000..0532237bd128
--- /dev/null
+++ b/net/rds/ib_cm.c
@@ -0,0 +1,726 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/vmalloc.h>
36
37#include "rds.h"
38#include "ib.h"
39
40/*
41 * Set the selected protocol version
42 */
43static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
44{
45 conn->c_version = version;
46}
47
48/*
49 * Set up flow control
50 */
51static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
52{
53 struct rds_ib_connection *ic = conn->c_transport_data;
54
55 if (rds_ib_sysctl_flow_control && credits != 0) {
56 /* We're doing flow control */
57 ic->i_flowctl = 1;
58 rds_ib_send_add_credits(conn, credits);
59 } else {
60 ic->i_flowctl = 0;
61 }
62}
63
64/*
65 * Tune RNR behavior. Without flow control, we use a rather
66 * low timeout, but not the absolute minimum - this should
67 * be tunable.
68 *
69 * We already set the RNR retry count to 7 (which is the
70 * smallest infinite number :-) above.
71 * If flow control is off, we want to change this back to 0
72 * so that we learn quickly when our credit accounting is
73 * buggy.
74 *
75 * Caller passes in a qp_attr pointer - don't waste stack space
76 * by allocating this twice.
77 */
78static void
79rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
80{
81 int ret;
82
83 attr->min_rnr_timer = IB_RNR_TIMER_000_32;
84 ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
85 if (ret)
86 printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
87}
88
89/*
90 * Connection established.
91 * We get here for both outgoing and incoming connections.
92 */
93void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
94{
95 const struct rds_ib_connect_private *dp = NULL;
96 struct rds_ib_connection *ic = conn->c_transport_data;
97 struct rds_ib_device *rds_ibdev;
98 struct ib_qp_attr qp_attr;
99 int err;
100
101 if (event->param.conn.private_data_len) {
102 dp = event->param.conn.private_data;
103
104 rds_ib_set_protocol(conn,
105 RDS_PROTOCOL(dp->dp_protocol_major,
106 dp->dp_protocol_minor));
107 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
108 }
109
110 printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
111 &conn->c_laddr,
112 RDS_PROTOCOL_MAJOR(conn->c_version),
113 RDS_PROTOCOL_MINOR(conn->c_version),
114 ic->i_flowctl ? ", flow control" : "");
115
116 /* Tune RNR behavior */
117 rds_ib_tune_rnr(ic, &qp_attr);
118
119 qp_attr.qp_state = IB_QPS_RTS;
120 err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
121 if (err)
122 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
123
124 /* update ib_device with this local ipaddr & conn */
125 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
126 err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
127 if (err)
128 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
129 err = rds_ib_add_conn(rds_ibdev, conn);
130 if (err)
131 printk(KERN_ERR "rds_ib_add_conn failed (%d)\n", err);
132
133 /* If the peer gave us the last packet it saw, process this as if
134 * we had received a regular ACK. */
135 if (dp && dp->dp_ack_seq)
136 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
137
138 rds_connect_complete(conn);
139}
140
141static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
142 struct rdma_conn_param *conn_param,
143 struct rds_ib_connect_private *dp,
144 u32 protocol_version)
145{
146 memset(conn_param, 0, sizeof(struct rdma_conn_param));
147 /* XXX tune these? */
148 conn_param->responder_resources = 1;
149 conn_param->initiator_depth = 1;
150 conn_param->retry_count = 7;
151 conn_param->rnr_retry_count = 7;
152
153 if (dp) {
154 struct rds_ib_connection *ic = conn->c_transport_data;
155
156 memset(dp, 0, sizeof(*dp));
157 dp->dp_saddr = conn->c_laddr;
158 dp->dp_daddr = conn->c_faddr;
159 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
160 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
161 dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
162 dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
163
164 /* Advertise flow control */
165 if (ic->i_flowctl) {
166 unsigned int credits;
167
168 credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
169 dp->dp_credit = cpu_to_be32(credits);
170 atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
171 }
172
173 conn_param->private_data = dp;
174 conn_param->private_data_len = sizeof(*dp);
175 }
176}
177
178static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
179{
180 rdsdebug("event %u data %p\n", event->event, data);
181}
182
183static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
184{
185 struct rds_connection *conn = data;
186 struct rds_ib_connection *ic = conn->c_transport_data;
187
188 rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
189
190 switch (event->event) {
191 case IB_EVENT_COMM_EST:
192 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
193 break;
194 default:
195 printk(KERN_WARNING "RDS/ib: unhandled QP event %u "
196 "on connection to %pI4\n", event->event,
197 &conn->c_faddr);
198 break;
199 }
200}
201
202/*
203 * This needs to be very careful to not leave IS_ERR pointers around for
204 * cleanup to trip over.
205 */
206static int rds_ib_setup_qp(struct rds_connection *conn)
207{
208 struct rds_ib_connection *ic = conn->c_transport_data;
209 struct ib_device *dev = ic->i_cm_id->device;
210 struct ib_qp_init_attr attr;
211 struct rds_ib_device *rds_ibdev;
212 int ret;
213
214 /* rds_ib_add_one creates a rds_ib_device object per IB device,
215 * and allocates a protection domain, memory range and FMR pool
216 * for each. If that fails for any reason, it will not register
217 * the rds_ibdev at all.
218 */
219 rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
220 if (rds_ibdev == NULL) {
221 if (printk_ratelimit())
222 printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
223 dev->name);
224 return -EOPNOTSUPP;
225 }
226
227 if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
228 rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
229 if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
230 rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
231
232 /* Protection domain and memory range */
233 ic->i_pd = rds_ibdev->pd;
234 ic->i_mr = rds_ibdev->mr;
235
236 ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
237 rds_ib_cq_event_handler, conn,
238 ic->i_send_ring.w_nr + 1, 0);
239 if (IS_ERR(ic->i_send_cq)) {
240 ret = PTR_ERR(ic->i_send_cq);
241 ic->i_send_cq = NULL;
242 rdsdebug("ib_create_cq send failed: %d\n", ret);
243 goto out;
244 }
245
246 ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler,
247 rds_ib_cq_event_handler, conn,
248 ic->i_recv_ring.w_nr, 0);
249 if (IS_ERR(ic->i_recv_cq)) {
250 ret = PTR_ERR(ic->i_recv_cq);
251 ic->i_recv_cq = NULL;
252 rdsdebug("ib_create_cq recv failed: %d\n", ret);
253 goto out;
254 }
255
256 ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
257 if (ret) {
258 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
259 goto out;
260 }
261
262 ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
263 if (ret) {
264 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
265 goto out;
266 }
267
268 /* XXX negotiate max send/recv with remote? */
269 memset(&attr, 0, sizeof(attr));
270 attr.event_handler = rds_ib_qp_event_handler;
271 attr.qp_context = conn;
272 /* + 1 to allow for the single ack message */
273 attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
274 attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
275 attr.cap.max_send_sge = rds_ibdev->max_sge;
276 attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
277 attr.sq_sig_type = IB_SIGNAL_REQ_WR;
278 attr.qp_type = IB_QPT_RC;
279 attr.send_cq = ic->i_send_cq;
280 attr.recv_cq = ic->i_recv_cq;
281
282 /*
283 * XXX this can fail if max_*_wr is too large? Are we supposed
284 * to back off until we get a value that the hardware can support?
285 */
286 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
287 if (ret) {
288 rdsdebug("rdma_create_qp failed: %d\n", ret);
289 goto out;
290 }
291
292 ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
293 ic->i_send_ring.w_nr *
294 sizeof(struct rds_header),
295 &ic->i_send_hdrs_dma, GFP_KERNEL);
296 if (ic->i_send_hdrs == NULL) {
297 ret = -ENOMEM;
298 rdsdebug("ib_dma_alloc_coherent send failed\n");
299 goto out;
300 }
301
302 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
303 ic->i_recv_ring.w_nr *
304 sizeof(struct rds_header),
305 &ic->i_recv_hdrs_dma, GFP_KERNEL);
306 if (ic->i_recv_hdrs == NULL) {
307 ret = -ENOMEM;
308 rdsdebug("ib_dma_alloc_coherent recv failed\n");
309 goto out;
310 }
311
312 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
313 &ic->i_ack_dma, GFP_KERNEL);
314 if (ic->i_ack == NULL) {
315 ret = -ENOMEM;
316 rdsdebug("ib_dma_alloc_coherent ack failed\n");
317 goto out;
318 }
319
320 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
321 if (ic->i_sends == NULL) {
322 ret = -ENOMEM;
323 rdsdebug("send allocation failed\n");
324 goto out;
325 }
326 rds_ib_send_init_ring(ic);
327
328 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
329 if (ic->i_recvs == NULL) {
330 ret = -ENOMEM;
331 rdsdebug("recv allocation failed\n");
332 goto out;
333 }
334
335 rds_ib_recv_init_ring(ic);
336 rds_ib_recv_init_ack(ic);
337
338 /* Post receive buffers - as a side effect, this will update
339 * the posted credit count. */
340 rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
341
342 rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
343 ic->i_send_cq, ic->i_recv_cq);
344
345out:
346 return ret;
347}
348
349static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp)
350{
351 u16 common;
352 u32 version = 0;
353
354 /* rdma_cm private data is odd - when there is any private data in the
355 * request, we will be given a pretty large buffer without being told the
356 * original size. The only way to tell the difference is by looking at
357 * the contents, which are initialized to zero.
358 * If the protocol version fields aren't set, this is a connection attempt
359 * from an older version. This could be 3.0 or 2.0 - we can't tell.
360 * We really should have changed this for OFED 1.3 :-( */
361 if (dp->dp_protocol_major == 0)
362 return RDS_PROTOCOL_3_0;
363
364 common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
365 if (dp->dp_protocol_major == 3 && common) {
366 version = RDS_PROTOCOL_3_0;
367 while ((common >>= 1) != 0)
368 version++;
369 } else if (printk_ratelimit()) {
370 printk(KERN_NOTICE "RDS: Connection from %pI4 using "
371 "incompatible protocol version %u.%u\n",
372 &dp->dp_saddr,
373 dp->dp_protocol_major,
374 dp->dp_protocol_minor);
375 }
376 return version;
377}
378
379int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
380 struct rdma_cm_event *event)
381{
382 __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
383 __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
384 const struct rds_ib_connect_private *dp = event->param.conn.private_data;
385 struct rds_ib_connect_private dp_rep;
386 struct rds_connection *conn = NULL;
387 struct rds_ib_connection *ic = NULL;
388 struct rdma_conn_param conn_param;
389 u32 version;
390 int err, destroy = 1;
391
392 /* Check whether the remote protocol version matches ours. */
393 version = rds_ib_protocol_compatible(dp);
394 if (!version)
395 goto out;
396
397 rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
398 "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
399 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
400 (unsigned long long)be64_to_cpu(lguid),
401 (unsigned long long)be64_to_cpu(fguid));
402
403 conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
404 GFP_KERNEL);
405 if (IS_ERR(conn)) {
406 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
407 conn = NULL;
408 goto out;
409 }
410
411 /*
412 * The connection request may occur while the
413	 * previous connection exists, e.g. in case of failover.
414 * But as connections may be initiated simultaneously
415 * by both hosts, we have a random backoff mechanism -
416 * see the comment above rds_queue_reconnect()
417 */
418 mutex_lock(&conn->c_cm_lock);
419 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
420 if (rds_conn_state(conn) == RDS_CONN_UP) {
421 rdsdebug("incoming connect while connecting\n");
422 rds_conn_drop(conn);
423 rds_ib_stats_inc(s_ib_listen_closed_stale);
424 } else
425 if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
426 /* Wait and see - our connect may still be succeeding */
427 rds_ib_stats_inc(s_ib_connect_raced);
428 }
429 mutex_unlock(&conn->c_cm_lock);
430 goto out;
431 }
432
433 ic = conn->c_transport_data;
434
435 rds_ib_set_protocol(conn, version);
436 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
437
438 /* If the peer gave us the last packet it saw, process this as if
439 * we had received a regular ACK. */
440 if (dp->dp_ack_seq)
441 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
442
443 BUG_ON(cm_id->context);
444 BUG_ON(ic->i_cm_id);
445
446 ic->i_cm_id = cm_id;
447 cm_id->context = conn;
448
449 /* We got halfway through setting up the ib_connection, if we
450 * fail now, we have to take the long route out of this mess. */
451 destroy = 0;
452
453 err = rds_ib_setup_qp(conn);
454 if (err) {
455 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
456 goto out;
457 }
458
459 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
460
461 /* rdma_accept() calls rdma_reject() internally if it fails */
462 err = rdma_accept(cm_id, &conn_param);
463 mutex_unlock(&conn->c_cm_lock);
464 if (err) {
465 rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
466 goto out;
467 }
468
469 return 0;
470
471out:
472 rdma_reject(cm_id, NULL, 0);
473 return destroy;
474}
475
476
477int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
478{
479 struct rds_connection *conn = cm_id->context;
480 struct rds_ib_connection *ic = conn->c_transport_data;
481 struct rdma_conn_param conn_param;
482 struct rds_ib_connect_private dp;
483 int ret;
484
485 /* If the peer doesn't do protocol negotiation, we must
486 * default to RDSv3.0 */
487 rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
488 ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */
489
490 ret = rds_ib_setup_qp(conn);
491 if (ret) {
492 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
493 goto out;
494 }
495
496 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
497
498 ret = rdma_connect(cm_id, &conn_param);
499 if (ret)
500 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
501
502out:
503 /* Beware - returning non-zero tells the rdma_cm to destroy
504 * the cm_id. We should certainly not do it as long as we still
505 * "own" the cm_id. */
506 if (ret) {
507 if (ic->i_cm_id == cm_id)
508 ret = 0;
509 }
510 return ret;
511}
512
513int rds_ib_conn_connect(struct rds_connection *conn)
514{
515 struct rds_ib_connection *ic = conn->c_transport_data;
516 struct sockaddr_in src, dest;
517 int ret;
518
519	/* XXX I wonder what effect the port space has */
520 /* delegate cm event handler to rdma_transport */
521 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
522 RDMA_PS_TCP);
523 if (IS_ERR(ic->i_cm_id)) {
524 ret = PTR_ERR(ic->i_cm_id);
525 ic->i_cm_id = NULL;
526 rdsdebug("rdma_create_id() failed: %d\n", ret);
527 goto out;
528 }
529
530 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
531
532 src.sin_family = AF_INET;
533 src.sin_addr.s_addr = (__force u32)conn->c_laddr;
534 src.sin_port = (__force u16)htons(0);
535
536 dest.sin_family = AF_INET;
537 dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
538 dest.sin_port = (__force u16)htons(RDS_PORT);
539
540 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
541 (struct sockaddr *)&dest,
542 RDS_RDMA_RESOLVE_TIMEOUT_MS);
543 if (ret) {
544 rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
545 ret);
546 rdma_destroy_id(ic->i_cm_id);
547 ic->i_cm_id = NULL;
548 }
549
550out:
551 return ret;
552}
553
554/*
555 * This is so careful about only cleaning up resources that were built up
556 * so that it can be called at any point during startup. In fact it
557 * can be called multiple times for a given connection.
558 */
559void rds_ib_conn_shutdown(struct rds_connection *conn)
560{
561 struct rds_ib_connection *ic = conn->c_transport_data;
562 int err = 0;
563
564 rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
565 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
566 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
567
568 if (ic->i_cm_id) {
569 struct ib_device *dev = ic->i_cm_id->device;
570
571 rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
572 err = rdma_disconnect(ic->i_cm_id);
573 if (err) {
574 /* Actually this may happen quite frequently, when
575 * an outgoing connect raced with an incoming connect.
576 */
577 rdsdebug("failed to disconnect, cm: %p err %d\n",
578 ic->i_cm_id, err);
579 }
580
581 wait_event(rds_ib_ring_empty_wait,
582 rds_ib_ring_empty(&ic->i_send_ring) &&
583 rds_ib_ring_empty(&ic->i_recv_ring));
584
585 if (ic->i_send_hdrs)
586 ib_dma_free_coherent(dev,
587 ic->i_send_ring.w_nr *
588 sizeof(struct rds_header),
589 ic->i_send_hdrs,
590 ic->i_send_hdrs_dma);
591
592 if (ic->i_recv_hdrs)
593 ib_dma_free_coherent(dev,
594 ic->i_recv_ring.w_nr *
595 sizeof(struct rds_header),
596 ic->i_recv_hdrs,
597 ic->i_recv_hdrs_dma);
598
599 if (ic->i_ack)
600 ib_dma_free_coherent(dev, sizeof(struct rds_header),
601 ic->i_ack, ic->i_ack_dma);
602
603 if (ic->i_sends)
604 rds_ib_send_clear_ring(ic);
605 if (ic->i_recvs)
606 rds_ib_recv_clear_ring(ic);
607
608 if (ic->i_cm_id->qp)
609 rdma_destroy_qp(ic->i_cm_id);
610 if (ic->i_send_cq)
611 ib_destroy_cq(ic->i_send_cq);
612 if (ic->i_recv_cq)
613 ib_destroy_cq(ic->i_recv_cq);
614 rdma_destroy_id(ic->i_cm_id);
615
616 /*
617 * Move connection back to the nodev list.
618 */
619 if (ic->rds_ibdev) {
620
621 spin_lock_irq(&ic->rds_ibdev->spinlock);
622 BUG_ON(list_empty(&ic->ib_node));
623 list_del(&ic->ib_node);
624 spin_unlock_irq(&ic->rds_ibdev->spinlock);
625
626 spin_lock_irq(&ib_nodev_conns_lock);
627 list_add_tail(&ic->ib_node, &ib_nodev_conns);
628 spin_unlock_irq(&ib_nodev_conns_lock);
629 ic->rds_ibdev = NULL;
630 }
631
632 ic->i_cm_id = NULL;
633 ic->i_pd = NULL;
634 ic->i_mr = NULL;
635 ic->i_send_cq = NULL;
636 ic->i_recv_cq = NULL;
637 ic->i_send_hdrs = NULL;
638 ic->i_recv_hdrs = NULL;
639 ic->i_ack = NULL;
640 }
641 BUG_ON(ic->rds_ibdev);
642
643 /* Clear pending transmit */
644 if (ic->i_rm) {
645 rds_message_put(ic->i_rm);
646 ic->i_rm = NULL;
647 }
648
649 /* Clear the ACK state */
650 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
651 rds_ib_set_64bit(&ic->i_ack_next, 0);
652 ic->i_ack_recv = 0;
653
654 /* Clear flow control state */
655 ic->i_flowctl = 0;
656 atomic_set(&ic->i_credits, 0);
657
658 rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
659 rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
660
661 if (ic->i_ibinc) {
662 rds_inc_put(&ic->i_ibinc->ii_inc);
663 ic->i_ibinc = NULL;
664 }
665
666 vfree(ic->i_sends);
667 ic->i_sends = NULL;
668 vfree(ic->i_recvs);
669 ic->i_recvs = NULL;
670}
671
672int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
673{
674 struct rds_ib_connection *ic;
675 unsigned long flags;
676
677 /* XXX too lazy? */
678 ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
679 if (ic == NULL)
680 return -ENOMEM;
681
682 INIT_LIST_HEAD(&ic->ib_node);
683 mutex_init(&ic->i_recv_mutex);
684
685 /*
686 * rds_ib_conn_shutdown() waits for these to be emptied so they
687 * must be initialized before it can be called.
688 */
689 rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
690 rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
691
692 ic->conn = conn;
693 conn->c_transport_data = ic;
694
695 spin_lock_irqsave(&ib_nodev_conns_lock, flags);
696 list_add_tail(&ic->ib_node, &ib_nodev_conns);
697 spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);
698
699
700 rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
701 return 0;
702}
703
704void rds_ib_conn_free(void *arg)
705{
706 struct rds_ib_connection *ic = arg;
707 rdsdebug("ic %p\n", ic);
708 list_del(&ic->ib_node);
709 kfree(ic);
710}
711
712
713/*
714 * An error occurred on the connection
715 */
716void
717__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
718{
719 va_list ap;
720
721 rds_conn_drop(conn);
722
723 va_start(ap, fmt);
724 vprintk(fmt, ap);
725 va_end(ap);
726}
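
For reference, a standalone sketch (not part of the patch) of the minor-version negotiation in rds_ib_protocol_compatible() above: the peer advertises a bitmask of the minor versions it speaks, that mask is ANDed with RDS_IB_SUPPORTED_PROTOCOLS, and the position of the highest common bit becomes the negotiated minor. negotiate_minor() and the sample masks are illustrative assumptions; the shift loop mirrors the one in the code above.

#include <stdio.h>
#include <stdint.h>

#define RDS_IB_SUPPORTED_PROTOCOLS	0x00000003	/* we speak minors 0 and 1 */

static unsigned int negotiate_minor(uint16_t peer_minor_mask)
{
	uint16_t common = peer_minor_mask & RDS_IB_SUPPORTED_PROTOCOLS;
	unsigned int minor = 0;

	/* count up to the highest bit set in the common mask */
	while ((common >>= 1) != 0)
		minor++;
	return minor;
}

int main(void)
{
	/* peer claims minors 0, 1 and 2; we only speak 0 and 1 -> pick 1 */
	printf("negotiated 3.%u\n", negotiate_minor(0x0007));
	/* peer only speaks minor 0 -> pick 0 */
	printf("negotiated 3.%u\n", negotiate_minor(0x0001));
	return 0;
}
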
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
new file mode 100644
index 000000000000..69a6289ed672
--- /dev/null
+++ b/net/rds/ib_rdma.c
@@ -0,0 +1,641 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "rdma.h"
37#include "ib.h"
38
39
40/*
41 * This is stored as mr->r_trans_private.
42 */
43struct rds_ib_mr {
44 struct rds_ib_device *device;
45 struct rds_ib_mr_pool *pool;
46 struct ib_fmr *fmr;
47 struct list_head list;
48 unsigned int remap_count;
49
50 struct scatterlist *sg;
51 unsigned int sg_len;
52 u64 *dma;
53 int sg_dma_len;
54};
55
56/*
57 * Our own little FMR pool
58 */
59struct rds_ib_mr_pool {
60 struct mutex flush_lock; /* serialize fmr invalidate */
61 struct work_struct flush_worker; /* flush worker */
62
63 spinlock_t list_lock; /* protect variables below */
64 atomic_t item_count; /* total # of MRs */
65	atomic_t		dirty_count;		/* # of dirty MRs */
66 struct list_head drop_list; /* MRs that have reached their max_maps limit */
67 struct list_head free_list; /* unused MRs */
68	struct list_head	clean_list;		/* unused & unmapped MRs */
69 atomic_t free_pinned; /* memory pinned by free MRs */
70 unsigned long max_items;
71 unsigned long max_items_soft;
72 unsigned long max_free_pinned;
73 struct ib_fmr_attr fmr_attr;
74};
75
76static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
77static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
78static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
79
80static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
81{
82 struct rds_ib_device *rds_ibdev;
83 struct rds_ib_ipaddr *i_ipaddr;
84
85 list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
86 spin_lock_irq(&rds_ibdev->spinlock);
87 list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
88 if (i_ipaddr->ipaddr == ipaddr) {
89 spin_unlock_irq(&rds_ibdev->spinlock);
90 return rds_ibdev;
91 }
92 }
93 spin_unlock_irq(&rds_ibdev->spinlock);
94 }
95
96 return NULL;
97}
98
99static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
100{
101 struct rds_ib_ipaddr *i_ipaddr;
102
103 i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL);
104 if (!i_ipaddr)
105 return -ENOMEM;
106
107 i_ipaddr->ipaddr = ipaddr;
108
109 spin_lock_irq(&rds_ibdev->spinlock);
110 list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
111 spin_unlock_irq(&rds_ibdev->spinlock);
112
113 return 0;
114}
115
116static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
117{
118 struct rds_ib_ipaddr *i_ipaddr, *next;
119
120 spin_lock_irq(&rds_ibdev->spinlock);
121 list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
122 if (i_ipaddr->ipaddr == ipaddr) {
123 list_del(&i_ipaddr->list);
124 kfree(i_ipaddr);
125 break;
126 }
127 }
128 spin_unlock_irq(&rds_ibdev->spinlock);
129}
130
131int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
132{
133 struct rds_ib_device *rds_ibdev_old;
134
135 rds_ibdev_old = rds_ib_get_device(ipaddr);
136 if (rds_ibdev_old)
137 rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
138
139 return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
140}
141
142int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
143{
144 struct rds_ib_connection *ic = conn->c_transport_data;
145
146 /* conn was previously on the nodev_conns_list */
147 spin_lock_irq(&ib_nodev_conns_lock);
148 BUG_ON(list_empty(&ib_nodev_conns));
149 BUG_ON(list_empty(&ic->ib_node));
150 list_del(&ic->ib_node);
151 spin_unlock_irq(&ib_nodev_conns_lock);
152
153 spin_lock_irq(&rds_ibdev->spinlock);
154 list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
155 spin_unlock_irq(&rds_ibdev->spinlock);
156
157 ic->rds_ibdev = rds_ibdev;
158
159 return 0;
160}
161
162void rds_ib_remove_nodev_conns(void)
163{
164 struct rds_ib_connection *ic, *_ic;
165 LIST_HEAD(tmp_list);
166
167 /* avoid calling conn_destroy with irqs off */
168 spin_lock_irq(&ib_nodev_conns_lock);
169 list_splice(&ib_nodev_conns, &tmp_list);
170 INIT_LIST_HEAD(&ib_nodev_conns);
171 spin_unlock_irq(&ib_nodev_conns_lock);
172
173 list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
174 if (ic->conn->c_passive)
175 rds_conn_destroy(ic->conn->c_passive);
176 rds_conn_destroy(ic->conn);
177 }
178}
179
180void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev)
181{
182 struct rds_ib_connection *ic, *_ic;
183 LIST_HEAD(tmp_list);
184
185 /* avoid calling conn_destroy with irqs off */
186 spin_lock_irq(&rds_ibdev->spinlock);
187 list_splice(&rds_ibdev->conn_list, &tmp_list);
188 INIT_LIST_HEAD(&rds_ibdev->conn_list);
189 spin_unlock_irq(&rds_ibdev->spinlock);
190
191 list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
192 if (ic->conn->c_passive)
193 rds_conn_destroy(ic->conn->c_passive);
194 rds_conn_destroy(ic->conn);
195 }
196}
197
198struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
199{
200 struct rds_ib_mr_pool *pool;
201
202 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
203 if (!pool)
204 return ERR_PTR(-ENOMEM);
205
206 INIT_LIST_HEAD(&pool->free_list);
207 INIT_LIST_HEAD(&pool->drop_list);
208 INIT_LIST_HEAD(&pool->clean_list);
209 mutex_init(&pool->flush_lock);
210 spin_lock_init(&pool->list_lock);
211 INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
212
213 pool->fmr_attr.max_pages = fmr_message_size;
214 pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
215 pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift;
216 pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
217
218 /* We never allow more than max_items MRs to be allocated.
219	 * When we exceed max_items_soft, we start freeing
220 * items more aggressively.
221 * Make sure that max_items > max_items_soft > max_items / 2
222 */
223 pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
224 pool->max_items = rds_ibdev->max_fmrs;
225
226 return pool;
227}
228
229void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
230{
231 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
232
233 iinfo->rdma_mr_max = pool->max_items;
234 iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
235}
236
237void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
238{
239 flush_workqueue(rds_wq);
240 rds_ib_flush_mr_pool(pool, 1);
241 BUG_ON(atomic_read(&pool->item_count));
242 BUG_ON(atomic_read(&pool->free_pinned));
243 kfree(pool);
244}
245
246static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
247{
248 struct rds_ib_mr *ibmr = NULL;
249 unsigned long flags;
250
251 spin_lock_irqsave(&pool->list_lock, flags);
252 if (!list_empty(&pool->clean_list)) {
253 ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
254 list_del_init(&ibmr->list);
255 }
256 spin_unlock_irqrestore(&pool->list_lock, flags);
257
258 return ibmr;
259}
260
261static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
262{
263 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
264 struct rds_ib_mr *ibmr = NULL;
265 int err = 0, iter = 0;
266
267 while (1) {
268 ibmr = rds_ib_reuse_fmr(pool);
269 if (ibmr)
270 return ibmr;
271
272 /* No clean MRs - now we have the choice of either
273 * allocating a fresh MR up to the limit imposed by the
274		 * driver, or flushing any dirty unused MRs.
275 * We try to avoid stalling in the send path if possible,
276 * so we allocate as long as we're allowed to.
277 *
278 * We're fussy with enforcing the FMR limit, though. If the driver
279 * tells us we can't use more than N fmrs, we shouldn't start
280 * arguing with it */
281 if (atomic_inc_return(&pool->item_count) <= pool->max_items)
282 break;
283
284 atomic_dec(&pool->item_count);
285
286 if (++iter > 2) {
287 rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
288 return ERR_PTR(-EAGAIN);
289 }
290
291 /* We do have some empty MRs. Flush them out. */
292 rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
293 rds_ib_flush_mr_pool(pool, 0);
294 }
295
296 ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
297 if (!ibmr) {
298 err = -ENOMEM;
299 goto out_no_cigar;
300 }
301
302 ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
303 (IB_ACCESS_LOCAL_WRITE |
304 IB_ACCESS_REMOTE_READ |
305 IB_ACCESS_REMOTE_WRITE),
306 &pool->fmr_attr);
307 if (IS_ERR(ibmr->fmr)) {
308 err = PTR_ERR(ibmr->fmr);
309 ibmr->fmr = NULL;
310 printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
311 goto out_no_cigar;
312 }
313
314 rds_ib_stats_inc(s_ib_rdma_mr_alloc);
315 return ibmr;
316
317out_no_cigar:
318 if (ibmr) {
319 if (ibmr->fmr)
320 ib_dealloc_fmr(ibmr->fmr);
321 kfree(ibmr);
322 }
323 atomic_dec(&pool->item_count);
324 return ERR_PTR(err);
325}
326
327static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
328 struct scatterlist *sg, unsigned int nents)
329{
330 struct ib_device *dev = rds_ibdev->dev;
331 struct scatterlist *scat = sg;
332 u64 io_addr = 0;
333 u64 *dma_pages;
334 u32 len;
335 int page_cnt, sg_dma_len;
336 int i, j;
337 int ret;
338
339 sg_dma_len = ib_dma_map_sg(dev, sg, nents,
340 DMA_BIDIRECTIONAL);
341 if (unlikely(!sg_dma_len)) {
342 printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
343 return -EBUSY;
344 }
345
346 len = 0;
347 page_cnt = 0;
348
349 for (i = 0; i < sg_dma_len; ++i) {
350 unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
351 u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
352
353 if (dma_addr & ~rds_ibdev->fmr_page_mask) {
354 if (i > 0)
355 return -EINVAL;
356 else
357 ++page_cnt;
358 }
359 if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) {
360 if (i < sg_dma_len - 1)
361 return -EINVAL;
362 else
363 ++page_cnt;
364 }
365
366 len += dma_len;
367 }
368
369 page_cnt += len >> rds_ibdev->fmr_page_shift;
370 if (page_cnt > fmr_message_size)
371 return -EINVAL;
372
373 dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
374 if (!dma_pages)
375 return -ENOMEM;
376
377 page_cnt = 0;
378 for (i = 0; i < sg_dma_len; ++i) {
379 unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
380 u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
381
382 for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size)
383 dma_pages[page_cnt++] =
384 (dma_addr & rds_ibdev->fmr_page_mask) + j;
385 }
386
387 ret = ib_map_phys_fmr(ibmr->fmr,
388 dma_pages, page_cnt, io_addr);
389 if (ret)
390 goto out;
391
392 /* Success - we successfully remapped the MR, so we can
393 * safely tear down the old mapping. */
394 rds_ib_teardown_mr(ibmr);
395
396 ibmr->sg = scat;
397 ibmr->sg_len = nents;
398 ibmr->sg_dma_len = sg_dma_len;
399 ibmr->remap_count++;
400
401 rds_ib_stats_inc(s_ib_rdma_mr_used);
402 ret = 0;
403
404out:
405 kfree(dma_pages);
406
407 return ret;
408}
409
410void rds_ib_sync_mr(void *trans_private, int direction)
411{
412 struct rds_ib_mr *ibmr = trans_private;
413 struct rds_ib_device *rds_ibdev = ibmr->device;
414
415 switch (direction) {
416 case DMA_FROM_DEVICE:
417 ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
418 ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
419 break;
420 case DMA_TO_DEVICE:
421 ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg,
422 ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
423 break;
424 }
425}
426
427static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
428{
429 struct rds_ib_device *rds_ibdev = ibmr->device;
430
431 if (ibmr->sg_dma_len) {
432 ib_dma_unmap_sg(rds_ibdev->dev,
433 ibmr->sg, ibmr->sg_len,
434 DMA_BIDIRECTIONAL);
435 ibmr->sg_dma_len = 0;
436 }
437
438 /* Release the s/g list */
439 if (ibmr->sg_len) {
440 unsigned int i;
441
442 for (i = 0; i < ibmr->sg_len; ++i) {
443 struct page *page = sg_page(&ibmr->sg[i]);
444
445 /* FIXME we need a way to tell a r/w MR
446 * from a r/o MR */
447 set_page_dirty(page);
448 put_page(page);
449 }
450 kfree(ibmr->sg);
451
452 ibmr->sg = NULL;
453 ibmr->sg_len = 0;
454 }
455}
456
457static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
458{
459 unsigned int pinned = ibmr->sg_len;
460
461 __rds_ib_teardown_mr(ibmr);
462 if (pinned) {
463 struct rds_ib_device *rds_ibdev = ibmr->device;
464 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
465
466 atomic_sub(pinned, &pool->free_pinned);
467 }
468}
469
470static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
471{
472 unsigned int item_count;
473
474 item_count = atomic_read(&pool->item_count);
475 if (free_all)
476 return item_count;
477
478 return 0;
479}
480
481/*
482 * Flush our pool of MRs.
483 * At a minimum, all currently unused MRs are unmapped.
484 * If the number of MRs allocated exceeds the limit, we also try
485 * to free as many MRs as needed to get back to this limit.
486 */
487static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
488{
489 struct rds_ib_mr *ibmr, *next;
490 LIST_HEAD(unmap_list);
491 LIST_HEAD(fmr_list);
492 unsigned long unpinned = 0;
493 unsigned long flags;
494 unsigned int nfreed = 0, ncleaned = 0, free_goal;
495 int ret = 0;
496
497 rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
498
499 mutex_lock(&pool->flush_lock);
500
501 spin_lock_irqsave(&pool->list_lock, flags);
502 /* Get the list of all MRs to be dropped. Ordering matters -
503 * we want to put drop_list ahead of free_list. */
504 list_splice_init(&pool->free_list, &unmap_list);
505 list_splice_init(&pool->drop_list, &unmap_list);
506 if (free_all)
507 list_splice_init(&pool->clean_list, &unmap_list);
508 spin_unlock_irqrestore(&pool->list_lock, flags);
509
510 free_goal = rds_ib_flush_goal(pool, free_all);
511
512 if (list_empty(&unmap_list))
513 goto out;
514
515 /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
516 list_for_each_entry(ibmr, &unmap_list, list)
517 list_add(&ibmr->fmr->list, &fmr_list);
518 ret = ib_unmap_fmr(&fmr_list);
519 if (ret)
520 printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
521
522 /* Now we can destroy the DMA mapping and unpin any pages */
523 list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
524 unpinned += ibmr->sg_len;
525 __rds_ib_teardown_mr(ibmr);
526 if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
527 rds_ib_stats_inc(s_ib_rdma_mr_free);
528 list_del(&ibmr->list);
529 ib_dealloc_fmr(ibmr->fmr);
530 kfree(ibmr);
531 nfreed++;
532 }
533 ncleaned++;
534 }
535
536 spin_lock_irqsave(&pool->list_lock, flags);
537 list_splice(&unmap_list, &pool->clean_list);
538 spin_unlock_irqrestore(&pool->list_lock, flags);
539
540 atomic_sub(unpinned, &pool->free_pinned);
541 atomic_sub(ncleaned, &pool->dirty_count);
542 atomic_sub(nfreed, &pool->item_count);
543
544out:
545 mutex_unlock(&pool->flush_lock);
546 return ret;
547}
548
549static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
550{
551 struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
552
553 rds_ib_flush_mr_pool(pool, 0);
554}
555
556void rds_ib_free_mr(void *trans_private, int invalidate)
557{
558 struct rds_ib_mr *ibmr = trans_private;
559 struct rds_ib_device *rds_ibdev = ibmr->device;
560 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
561 unsigned long flags;
562
563 rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
564
565 /* Return it to the pool's free list */
566 spin_lock_irqsave(&pool->list_lock, flags);
567 if (ibmr->remap_count >= pool->fmr_attr.max_maps)
568 list_add(&ibmr->list, &pool->drop_list);
569 else
570 list_add(&ibmr->list, &pool->free_list);
571
572 atomic_add(ibmr->sg_len, &pool->free_pinned);
573 atomic_inc(&pool->dirty_count);
574 spin_unlock_irqrestore(&pool->list_lock, flags);
575
576 /* If we've pinned too many pages, request a flush */
577 if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
578 || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
579 queue_work(rds_wq, &pool->flush_worker);
580
581 if (invalidate) {
582 if (likely(!in_interrupt())) {
583 rds_ib_flush_mr_pool(pool, 0);
584 } else {
585			/* We get here if the user created an MR marked
586 * as use_once and invalidate at the same time. */
587 queue_work(rds_wq, &pool->flush_worker);
588 }
589 }
590}
591
592void rds_ib_flush_mrs(void)
593{
594 struct rds_ib_device *rds_ibdev;
595
596 list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
597 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
598
599 if (pool)
600 rds_ib_flush_mr_pool(pool, 0);
601 }
602}
603
604void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
605 struct rds_sock *rs, u32 *key_ret)
606{
607 struct rds_ib_device *rds_ibdev;
608 struct rds_ib_mr *ibmr = NULL;
609 int ret;
610
611 rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
612 if (!rds_ibdev) {
613 ret = -ENODEV;
614 goto out;
615 }
616
617 if (!rds_ibdev->mr_pool) {
618 ret = -ENODEV;
619 goto out;
620 }
621
622 ibmr = rds_ib_alloc_fmr(rds_ibdev);
623 if (IS_ERR(ibmr))
624 return ibmr;
625
626 ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
627 if (ret == 0)
628 *key_ret = ibmr->fmr->rkey;
629 else
630 printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
631
632 ibmr->device = rds_ibdev;
633
634 out:
635 if (ret) {
636 if (ibmr)
637 rds_ib_free_mr(ibmr, 0);
638 ibmr = ERR_PTR(ret);
639 }
640 return ibmr;
641}
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
new file mode 100644
index 000000000000..5061b5502162
--- /dev/null
+++ b/net/rds/ib_recv.c
@@ -0,0 +1,869 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/pci.h>
35#include <linux/dma-mapping.h>
36#include <rdma/rdma_cm.h>
37
38#include "rds.h"
39#include "ib.h"
40
41static struct kmem_cache *rds_ib_incoming_slab;
42static struct kmem_cache *rds_ib_frag_slab;
43static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
44
45static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
46{
47 rdsdebug("frag %p page %p\n", frag, frag->f_page);
48 __free_page(frag->f_page);
49 frag->f_page = NULL;
50}
51
52static void rds_ib_frag_free(struct rds_page_frag *frag)
53{
54 rdsdebug("frag %p page %p\n", frag, frag->f_page);
55 BUG_ON(frag->f_page != NULL);
56 kmem_cache_free(rds_ib_frag_slab, frag);
57}
58
59/*
60 * We map a page at a time. Its fragments are posted in order. This
61 * is called in fragment order as the fragments get receive completion events.
62 * Only the last frag in the page performs the unmapping.
63 *
64 * It's OK for ring cleanup to call this in whatever order it likes because
65 * DMA is not in flight and so we can unmap while other ring entries still
66 * hold page references in their frags.
67 */
68static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
69 struct rds_ib_recv_work *recv)
70{
71 struct rds_page_frag *frag = recv->r_frag;
72
73 rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
74 if (frag->f_mapped)
75 ib_dma_unmap_page(ic->i_cm_id->device,
76 frag->f_mapped,
77 RDS_FRAG_SIZE, DMA_FROM_DEVICE);
78 frag->f_mapped = 0;
79}
80
81void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
82{
83 struct rds_ib_recv_work *recv;
84 u32 i;
85
86 for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
87 struct ib_sge *sge;
88
89 recv->r_ibinc = NULL;
90 recv->r_frag = NULL;
91
92 recv->r_wr.next = NULL;
93 recv->r_wr.wr_id = i;
94 recv->r_wr.sg_list = recv->r_sge;
95 recv->r_wr.num_sge = RDS_IB_RECV_SGE;
96
97 sge = rds_ib_data_sge(ic, recv->r_sge);
98 sge->addr = 0;
99 sge->length = RDS_FRAG_SIZE;
100 sge->lkey = ic->i_mr->lkey;
101
102 sge = rds_ib_header_sge(ic, recv->r_sge);
103 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
104 sge->length = sizeof(struct rds_header);
105 sge->lkey = ic->i_mr->lkey;
106 }
107}
108
109static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
110 struct rds_ib_recv_work *recv)
111{
112 if (recv->r_ibinc) {
113 rds_inc_put(&recv->r_ibinc->ii_inc);
114 recv->r_ibinc = NULL;
115 }
116 if (recv->r_frag) {
117 rds_ib_recv_unmap_page(ic, recv);
118 if (recv->r_frag->f_page)
119 rds_ib_frag_drop_page(recv->r_frag);
120 rds_ib_frag_free(recv->r_frag);
121 recv->r_frag = NULL;
122 }
123}
124
125void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
126{
127 u32 i;
128
129 for (i = 0; i < ic->i_recv_ring.w_nr; i++)
130 rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
131
132 if (ic->i_frag.f_page)
133 rds_ib_frag_drop_page(&ic->i_frag);
134}
135
136static int rds_ib_recv_refill_one(struct rds_connection *conn,
137 struct rds_ib_recv_work *recv,
138 gfp_t kptr_gfp, gfp_t page_gfp)
139{
140 struct rds_ib_connection *ic = conn->c_transport_data;
141 dma_addr_t dma_addr;
142 struct ib_sge *sge;
143 int ret = -ENOMEM;
144
145 if (recv->r_ibinc == NULL) {
146 if (atomic_read(&rds_ib_allocation) >= rds_ib_sysctl_max_recv_allocation) {
147 rds_ib_stats_inc(s_ib_rx_alloc_limit);
148 goto out;
149 }
150 recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
151 kptr_gfp);
152 if (recv->r_ibinc == NULL)
153 goto out;
154 atomic_inc(&rds_ib_allocation);
155 INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
156 rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
157 }
158
159 if (recv->r_frag == NULL) {
160 recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
161 if (recv->r_frag == NULL)
162 goto out;
163 INIT_LIST_HEAD(&recv->r_frag->f_item);
164 recv->r_frag->f_page = NULL;
165 }
166
167 if (ic->i_frag.f_page == NULL) {
168 ic->i_frag.f_page = alloc_page(page_gfp);
169 if (ic->i_frag.f_page == NULL)
170 goto out;
171 ic->i_frag.f_offset = 0;
172 }
173
174 dma_addr = ib_dma_map_page(ic->i_cm_id->device,
175 ic->i_frag.f_page,
176 ic->i_frag.f_offset,
177 RDS_FRAG_SIZE,
178 DMA_FROM_DEVICE);
179 if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
180 goto out;
181
182 /*
183 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
184 * must be called on this recv. This happens as completions hit
185 * in order or on connection shutdown.
186 */
187 recv->r_frag->f_page = ic->i_frag.f_page;
188 recv->r_frag->f_offset = ic->i_frag.f_offset;
189 recv->r_frag->f_mapped = dma_addr;
190
191 sge = rds_ib_data_sge(ic, recv->r_sge);
192 sge->addr = dma_addr;
193 sge->length = RDS_FRAG_SIZE;
194
195 sge = rds_ib_header_sge(ic, recv->r_sge);
196 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
197 sge->length = sizeof(struct rds_header);
198
199 get_page(recv->r_frag->f_page);
200
201 if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
202 ic->i_frag.f_offset += RDS_FRAG_SIZE;
203 } else {
204 put_page(ic->i_frag.f_page);
205 ic->i_frag.f_page = NULL;
206 ic->i_frag.f_offset = 0;
207 }
208
209 ret = 0;
210out:
211 return ret;
212}
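/*
 * A minimal walk-through of the rolling ic->i_frag above (RDS_FRAG_SIZE is
 * assumed to be 4K and PAGE_SIZE 16K here purely for illustration, giving
 * RDS_PAGE_LAST_OFF of 12K; on 4K-page systems every page holds exactly one
 * fragment and the staging reference is dropped on each refill):
 *
 *	refill #1: f_offset  0K -> map [ 0K,  4K), get_page(), offset ->  4K
 *	refill #2: f_offset  4K -> map [ 4K,  8K), get_page(), offset ->  8K
 *	refill #3: f_offset  8K -> map [ 8K, 12K), get_page(), offset -> 12K
 *	refill #4: f_offset 12K -> map [12K, 16K), get_page(), then put_page()
 *	           drops the connection's staging reference and clears
 *	           i_frag.f_page so the next refill allocates a fresh page.
 *
 * Each posted fragment therefore holds its own page reference until the
 * frag is freed, independent of the connection's staging reference.
 */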
213
214/*
215 * This tries to allocate and post unused work requests after making sure that
216 * they have all the allocations they need to queue received fragments into
217 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
218 * pairs don't go unmatched.
219 *
220 * -1 is returned if posting fails due to temporary resource exhaustion.
221 */
222int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
223 gfp_t page_gfp, int prefill)
224{
225 struct rds_ib_connection *ic = conn->c_transport_data;
226 struct rds_ib_recv_work *recv;
227 struct ib_recv_wr *failed_wr;
228 unsigned int posted = 0;
229 int ret = 0;
230 u32 pos;
231
232 while ((prefill || rds_conn_up(conn))
233 && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
234 if (pos >= ic->i_recv_ring.w_nr) {
235 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
236 pos);
237 ret = -EINVAL;
238 break;
239 }
240
241 recv = &ic->i_recvs[pos];
242 ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
243 if (ret) {
244 ret = -1;
245 break;
246 }
247
248 /* XXX when can this fail? */
249 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
250 rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
251 recv->r_ibinc, recv->r_frag->f_page,
252 (long) recv->r_frag->f_mapped, ret);
253 if (ret) {
254 rds_ib_conn_error(conn, "recv post on "
255 "%pI4 returned %d, disconnecting and "
256 "reconnecting\n", &conn->c_faddr,
257 ret);
258 ret = -1;
259 break;
260 }
261
262 posted++;
263 }
264
265 /* We're doing flow control - update the window. */
266 if (ic->i_flowctl && posted)
267 rds_ib_advertise_credits(conn, posted);
268
269 if (ret)
270 rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
271 return ret;
272}
273
274void rds_ib_inc_purge(struct rds_incoming *inc)
275{
276 struct rds_ib_incoming *ibinc;
277 struct rds_page_frag *frag;
278 struct rds_page_frag *pos;
279
280 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
281 rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
282
283 list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
284 list_del_init(&frag->f_item);
285 rds_ib_frag_drop_page(frag);
286 rds_ib_frag_free(frag);
287 }
288}
289
290void rds_ib_inc_free(struct rds_incoming *inc)
291{
292 struct rds_ib_incoming *ibinc;
293
294 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
295
296 rds_ib_inc_purge(inc);
297 rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
298 BUG_ON(!list_empty(&ibinc->ii_frags));
299 kmem_cache_free(rds_ib_incoming_slab, ibinc);
300 atomic_dec(&rds_ib_allocation);
301 BUG_ON(atomic_read(&rds_ib_allocation) < 0);
302}
303
304int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
305 size_t size)
306{
307 struct rds_ib_incoming *ibinc;
308 struct rds_page_frag *frag;
309 struct iovec *iov = first_iov;
310 unsigned long to_copy;
311 unsigned long frag_off = 0;
312 unsigned long iov_off = 0;
313 int copied = 0;
314 int ret;
315 u32 len;
316
317 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
318 frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
319 len = be32_to_cpu(inc->i_hdr.h_len);
320
321 while (copied < size && copied < len) {
322 if (frag_off == RDS_FRAG_SIZE) {
323 frag = list_entry(frag->f_item.next,
324 struct rds_page_frag, f_item);
325 frag_off = 0;
326 }
327 while (iov_off == iov->iov_len) {
328 iov_off = 0;
329 iov++;
330 }
331
332 to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
333 to_copy = min_t(size_t, to_copy, size - copied);
334 to_copy = min_t(unsigned long, to_copy, len - copied);
335
336 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
337 "[%p, %lu] + %lu\n",
338 to_copy, iov->iov_base, iov->iov_len, iov_off,
339 frag->f_page, frag->f_offset, frag_off);
340
341 /* XXX needs + offset for multiple recvs per page */
342 ret = rds_page_copy_to_user(frag->f_page,
343 frag->f_offset + frag_off,
344 iov->iov_base + iov_off,
345 to_copy);
346 if (ret) {
347 copied = ret;
348 break;
349 }
350
351 iov_off += to_copy;
352 frag_off += to_copy;
353 copied += to_copy;
354 }
355
356 return copied;
357}
358
359/* ic starts out kzalloc()ed */
360void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
361{
362 struct ib_send_wr *wr = &ic->i_ack_wr;
363 struct ib_sge *sge = &ic->i_ack_sge;
364
365 sge->addr = ic->i_ack_dma;
366 sge->length = sizeof(struct rds_header);
367 sge->lkey = ic->i_mr->lkey;
368
369 wr->sg_list = sge;
370 wr->num_sge = 1;
371 wr->opcode = IB_WR_SEND;
372 wr->wr_id = RDS_IB_ACK_WR_ID;
373 wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
374}
375
376/*
377 * You'd think that with reliable IB connections you wouldn't need to ack
378 * messages that have been received. The problem is that IB hardware generates
379 * an ack message before it has DMAed the message into memory. This creates a
380 * potential message loss if the HCA is disabled for any reason between when it
381 * sends the ack and before the message is DMAed and processed. This is only a
382 * potential issue if another HCA is available for fail-over.
383 *
384 * When the remote host receives our ack they'll free the sent message from
385 * their send queue. To decrease the latency of this we always send an ack
386 * immediately after we've received messages.
387 *
388 * For simplicity, we only have one ack in flight at a time. This puts
389 * pressure on senders to have deep enough send queues to absorb the latency of
390 * a single ack frame being in flight. This might not be good enough.
391 *
392 * This is implemented by having a long-lived send_wr and sge which point to a
393 * statically allocated ack frame. This ack wr does not fall under the ring
394 * accounting that the tx and rx wrs do. The QP attribute specifically makes
395 * room for it beyond the ring size. Send completion notices its special
396 * wr_id and avoids working with the ring in that case.
397 */
398static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
399 int ack_required)
400{
401 rds_ib_set_64bit(&ic->i_ack_next, seq);
402 if (ack_required) {
403 smp_mb__before_clear_bit();
404 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
405 }
406}
407
408static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
409{
410 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
411 smp_mb__after_clear_bit();
412
413 return ic->i_ack_next;
414}
415
416static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
417{
418 struct rds_header *hdr = ic->i_ack;
419 struct ib_send_wr *failed_wr;
420 u64 seq;
421 int ret;
422
423 seq = rds_ib_get_ack(ic);
424
425 rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
426 rds_message_populate_header(hdr, 0, 0, 0);
427 hdr->h_ack = cpu_to_be64(seq);
428 hdr->h_credit = adv_credits;
429 rds_message_make_checksum(hdr);
430 ic->i_ack_queued = jiffies;
431
432 ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
433 if (unlikely(ret)) {
434 /* Failed to send. Release the WR, and
435 * force another ACK.
436 */
437 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
438 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
439
440 rds_ib_stats_inc(s_ib_ack_send_failure);
441 /* Need to finesse this later. */
442 BUG();
443 } else
444 rds_ib_stats_inc(s_ib_ack_sent);
445}
446
447/*
448 * There are 3 ways of getting acknowledgements to the peer:
449 * 1. We call rds_ib_attempt_ack from the recv completion handler
450 * to send an ACK-only frame.
451 * However, there can be only one such frame in the send queue
452 * at any time, so we may have to postpone it.
453 * 2. When another (data) packet is transmitted while there's
454 * an ACK in the queue, we piggyback the ACK sequence number
455 * on the data packet.
456 * 3. If the ACK WR is done sending, we get called from the
457 * send queue completion handler, and check whether there's
458 * another ACK pending (postponed because the WR was on the
459 * queue). If so, we transmit it.
460 *
461 * We maintain 2 variables:
462 * - i_ack_flags, which keeps track of whether the ACK WR
463 * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
464 * - i_ack_next, which is the last sequence number we received
465 *
466 * Potentially, send queue and receive queue handlers can run concurrently.
467 *
468 * Reconnecting complicates this picture just slightly. When we
469 * reconnect, we may be seeing duplicate packets. The peer
470 * is retransmitting them, because it hasn't seen an ACK for
471 * them. It is important that we ACK these.
472 *
473 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
474 * this flag set *MUST* be acknowledged immediately.
475 */
476
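/*
 * A sketch of the common sequence in terms of the two flag bits used below
 * (illustrative only):
 *
 *	recv completion:   rds_ib_set_ack() records i_ack_next and sets
 *	                   IB_ACK_REQUESTED
 *	rds_ib_attempt_ack(): REQUESTED set, IN_FLIGHT clear -> grab one send
 *	                   credit, clear REQUESTED and post the ACK WR
 *	                   (IN_FLIGHT is now set)
 *	send completion:   wr_id == RDS_IB_ACK_WR_ID -> clear IN_FLIGHT and
 *	                   call rds_ib_attempt_ack() again in case another
 *	                   ACK was requested while the WR was in flight
 *
 * If a data packet goes out in the meantime, rds_ib_piggyb_ack() clears
 * IB_ACK_REQUESTED instead and the standalone ACK frame is not needed.
 */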
477/*
478 * When we get here, we're called from the recv queue handler.
479 * Check whether we ought to transmit an ACK.
480 */
481void rds_ib_attempt_ack(struct rds_ib_connection *ic)
482{
483 unsigned int adv_credits;
484
485 if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
486 return;
487
488 if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
489 rds_ib_stats_inc(s_ib_ack_send_delayed);
490 return;
491 }
492
493 /* Can we get a send credit? */
494 if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
495 rds_ib_stats_inc(s_ib_tx_throttle);
496 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
497 return;
498 }
499
500 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
501 rds_ib_send_ack(ic, adv_credits);
502}
503
504/*
505 * We get here from the send completion handler, when the
506 * adapter tells us the ACK frame was sent.
507 */
508void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
509{
510 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
511 rds_ib_attempt_ack(ic);
512}
513
514/*
515 * This is called by the regular xmit code when it wants to piggyback
516 * an ACK on an outgoing frame.
517 */
518u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
519{
520 if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
521 rds_ib_stats_inc(s_ib_ack_send_piggybacked);
522 return rds_ib_get_ack(ic);
523}
524
525/*
526 * It's kind of lame that we're copying from the posted receive pages into
527 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
528 * them. But receiving new congestion bitmaps should be a *rare* event, so
529 * hopefully we won't need to invest that complexity in making it more
530 * efficient. By copying we can share a simpler core with TCP which has to
531 * copy.
532 */
533static void rds_ib_cong_recv(struct rds_connection *conn,
534 struct rds_ib_incoming *ibinc)
535{
536 struct rds_cong_map *map;
537 unsigned int map_off;
538 unsigned int map_page;
539 struct rds_page_frag *frag;
540 unsigned long frag_off;
541 unsigned long to_copy;
542 unsigned long copied;
543 uint64_t uncongested = 0;
544 void *addr;
545
546 /* catch completely corrupt packets */
547 if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
548 return;
549
550 map = conn->c_fcong;
551 map_page = 0;
552 map_off = 0;
553
554 frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
555 frag_off = 0;
556
557 copied = 0;
558
559 while (copied < RDS_CONG_MAP_BYTES) {
560 uint64_t *src, *dst;
561 unsigned int k;
562
563 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
564 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
565
566 addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
567
568 src = addr + frag_off;
569 dst = (void *)map->m_page_addrs[map_page] + map_off;
570 for (k = 0; k < to_copy; k += 8) {
571 /* Record ports that became uncongested, ie
572 * bits that changed from 0 to 1. */
573 uncongested |= ~(*src) & *dst;
574 *dst++ = *src++;
575 }
576 kunmap_atomic(addr, KM_SOFTIRQ0);
577
578 copied += to_copy;
579
580 map_off += to_copy;
581 if (map_off == PAGE_SIZE) {
582 map_off = 0;
583 map_page++;
584 }
585
586 frag_off += to_copy;
587 if (frag_off == RDS_FRAG_SIZE) {
588 frag = list_entry(frag->f_item.next,
589 struct rds_page_frag, f_item);
590 frag_off = 0;
591 }
592 }
593
594 /* the congestion map is in little endian order */
595 uncongested = le64_to_cpu(uncongested);
596
597 rds_cong_map_updated(map, uncongested);
598}
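/*
 * The bit trick in the copy loop above, worked on a single word with
 * illustrative values: with the stored map word *dst == 0b1100 and the
 * incoming word *src == 0b0100,
 *
 *	~(*src) & *dst == 0b1011 & 0b1100 == 0b1000
 *
 * i.e. exactly the ports whose congestion bit was set in the old map and is
 * clear in the new one. Those bits accumulate in 'uncongested' and are
 * passed to rds_cong_map_updated() above.
 */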
599
600/*
601 * Rings are posted with all the allocations they'll need to queue the
602 * incoming message to the receiving socket so this can't fail.
603 * All fragments start with a header, so we can make sure we're not receiving
604 * garbage, and we can tell a small 8 byte fragment from an ACK frame.
605 */
606struct rds_ib_ack_state {
607 u64 ack_next;
608 u64 ack_recv;
609 unsigned int ack_required:1;
610 unsigned int ack_next_valid:1;
611 unsigned int ack_recv_valid:1;
612};
613
614static void rds_ib_process_recv(struct rds_connection *conn,
615 struct rds_ib_recv_work *recv, u32 byte_len,
616 struct rds_ib_ack_state *state)
617{
618 struct rds_ib_connection *ic = conn->c_transport_data;
619 struct rds_ib_incoming *ibinc = ic->i_ibinc;
620 struct rds_header *ihdr, *hdr;
621
622 /* XXX shut down the connection if port 0,0 are seen? */
623
624 rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
625 byte_len);
626
627 if (byte_len < sizeof(struct rds_header)) {
628 rds_ib_conn_error(conn, "incoming message "
629 "from %pI4 didn't inclue a "
630 "header, disconnecting and "
631 "reconnecting\n",
632 &conn->c_faddr);
633 return;
634 }
635 byte_len -= sizeof(struct rds_header);
636
637 ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
638
639 /* Validate the checksum. */
640 if (!rds_message_verify_checksum(ihdr)) {
641 rds_ib_conn_error(conn, "incoming message "
642 "from %pI4 has corrupted header - "
643 "forcing a reconnect\n",
644 &conn->c_faddr);
645 rds_stats_inc(s_recv_drop_bad_checksum);
646 return;
647 }
648
649 /* Process the ACK sequence which comes with every packet */
650 state->ack_recv = be64_to_cpu(ihdr->h_ack);
651 state->ack_recv_valid = 1;
652
653 /* Process the credits update if there was one */
654 if (ihdr->h_credit)
655 rds_ib_send_add_credits(conn, ihdr->h_credit);
656
657 if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
658 /* This is an ACK-only packet. The reason it gets
659 * special treatment here is that historically, ACKs
660 * were rather special beasts.
661 */
662 rds_ib_stats_inc(s_ib_ack_received);
663
664 /*
665 * Usually the frags make their way on to incs and are then freed as
666 * the inc is freed. We don't go that route, so we have to drop the
667 * page ref ourselves. We can't just leave the page on the recv
668 * because that confuses the dma mapping of pages and each recv's use
669 * of a partial page. We can leave the frag, though, it will be
670 * reused.
671 *
672 * FIXME: Fold this into the code path below.
673 */
674 rds_ib_frag_drop_page(recv->r_frag);
675 return;
676 }
677
678 /*
679 * If we don't already have an inc on the connection then this
680 * fragment has a header and starts a message; copy its header
681 * into the inc and save the inc so we can hang upcoming fragments
682 * off its list.
683 */
684 if (ibinc == NULL) {
685 ibinc = recv->r_ibinc;
686 recv->r_ibinc = NULL;
687 ic->i_ibinc = ibinc;
688
689 hdr = &ibinc->ii_inc.i_hdr;
690 memcpy(hdr, ihdr, sizeof(*hdr));
691 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
692
693 rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
694 ic->i_recv_data_rem, hdr->h_flags);
695 } else {
696 hdr = &ibinc->ii_inc.i_hdr;
697 /* We can't just use memcmp here; fragments of a
698 * single message may carry different ACKs */
699 if (hdr->h_sequence != ihdr->h_sequence
700 || hdr->h_len != ihdr->h_len
701 || hdr->h_sport != ihdr->h_sport
702 || hdr->h_dport != ihdr->h_dport) {
703 rds_ib_conn_error(conn,
704 "fragment header mismatch; forcing reconnect\n");
705 return;
706 }
707 }
708
709 list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
710 recv->r_frag = NULL;
711
712 if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
713 ic->i_recv_data_rem -= RDS_FRAG_SIZE;
714 else {
715 ic->i_recv_data_rem = 0;
716 ic->i_ibinc = NULL;
717
718 if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
719 rds_ib_cong_recv(conn, ibinc);
720 else {
721 rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
722 &ibinc->ii_inc, GFP_ATOMIC,
723 KM_SOFTIRQ0);
724 state->ack_next = be64_to_cpu(hdr->h_sequence);
725 state->ack_next_valid = 1;
726 }
727
728 /* Evaluate the ACK_REQUIRED flag *after* we received
729 * the complete frame, and after bumping the next_rx
730 * sequence. */
731 if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
732 rds_stats_inc(s_recv_ack_required);
733 state->ack_required = 1;
734 }
735
736 rds_inc_put(&ibinc->ii_inc);
737 }
738}
739
740/*
741 * Plucking the oldest entry from the ring can be done concurrently with
742 * the thread refilling the ring. Each ring operation is protected by
743 * spinlocks and the transient state of refilling doesn't change the
744 * recording of which entry is oldest.
745 *
746 * This relies on IB only calling one cq comp_handler for each cq so that
747 * there will only be one caller of rds_recv_incoming() per RDS connection.
748 */
749void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
750{
751 struct rds_connection *conn = context;
752 struct rds_ib_connection *ic = conn->c_transport_data;
753 struct ib_wc wc;
754 struct rds_ib_ack_state state = { 0, };
755 struct rds_ib_recv_work *recv;
756
757 rdsdebug("conn %p cq %p\n", conn, cq);
758
759 rds_ib_stats_inc(s_ib_rx_cq_call);
760
761 ib_req_notify_cq(cq, IB_CQ_SOLICITED);
762
763 while (ib_poll_cq(cq, 1, &wc) > 0) {
764 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
765 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
766 be32_to_cpu(wc.ex.imm_data));
767 rds_ib_stats_inc(s_ib_rx_cq_event);
768
769 recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
770
771 rds_ib_recv_unmap_page(ic, recv);
772
773 /*
774 * Also process recvs in connecting state because it is possible
775 * to get a recv completion _before_ the rdmacm ESTABLISHED
776 * event is processed.
777 */
778 if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
779 /* We expect errors as the qp is drained during shutdown */
780 if (wc.status == IB_WC_SUCCESS) {
781 rds_ib_process_recv(conn, recv, wc.byte_len, &state);
782 } else {
783 rds_ib_conn_error(conn, "recv completion on "
784 "%pI4 had status %u, disconnecting and "
785 "reconnecting\n", &conn->c_faddr,
786 wc.status);
787 }
788 }
789
790 rds_ib_ring_free(&ic->i_recv_ring, 1);
791 }
792
793 if (state.ack_next_valid)
794 rds_ib_set_ack(ic, state.ack_next, state.ack_required);
795 if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
796 rds_send_drop_acked(conn, state.ack_recv, NULL);
797 ic->i_ack_recv = state.ack_recv;
798 }
799 if (rds_conn_up(conn))
800 rds_ib_attempt_ack(ic);
801
802 /* If we ever end up with a really empty receive ring, we're
803 * in deep trouble, as the sender will definitely see RNR
804 * timeouts. */
805 if (rds_ib_ring_empty(&ic->i_recv_ring))
806 rds_ib_stats_inc(s_ib_rx_ring_empty);
807
808 /*
809 * If the ring is running low, then schedule the thread to refill.
810 */
811 if (rds_ib_ring_low(&ic->i_recv_ring))
812 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
813}
814
815int rds_ib_recv(struct rds_connection *conn)
816{
817 struct rds_ib_connection *ic = conn->c_transport_data;
818 int ret = 0;
819
820 rdsdebug("conn %p\n", conn);
821
822 /*
823 * If we get a temporary posting failure in this context then
824 * we're really low and we want the caller to back off for a bit.
825 */
826 mutex_lock(&ic->i_recv_mutex);
827 if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
828 ret = -ENOMEM;
829 else
830 rds_ib_stats_inc(s_ib_rx_refill_from_thread);
831 mutex_unlock(&ic->i_recv_mutex);
832
833 if (rds_conn_up(conn))
834 rds_ib_attempt_ack(ic);
835
836 return ret;
837}
838
839int __init rds_ib_recv_init(void)
840{
841 struct sysinfo si;
842 int ret = -ENOMEM;
843
844 /* Default to roughly a third of all available RAM for recv memory */
845 si_meminfo(&si);
846 rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
847
848 rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
849 sizeof(struct rds_ib_incoming),
850 0, 0, NULL);
851 if (rds_ib_incoming_slab == NULL)
852 goto out;
853
854 rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
855 sizeof(struct rds_page_frag),
856 0, 0, NULL);
857 if (rds_ib_frag_slab == NULL)
858 kmem_cache_destroy(rds_ib_incoming_slab);
859 else
860 ret = 0;
861out:
862 return ret;
863}
864
865void rds_ib_recv_exit(void)
866{
867 kmem_cache_destroy(rds_ib_incoming_slab);
868 kmem_cache_destroy(rds_ib_frag_slab);
869}
diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c
new file mode 100644
index 000000000000..99a6ccae964c
--- /dev/null
+++ b/net/rds/ib_ring.c
@@ -0,0 +1,168 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "ib.h"
37
38/*
39 * Locking for IB rings.
40 * We assume that allocation is always protected by a mutex
41 * in the caller (this is a valid assumption for the current
42 * implementation).
43 *
44 * Freeing always happens in an interrupt, and hence only
45 * races with allocations, but not with other free()s.
46 *
47 * The interaction between allocation and freeing is that
48 * the alloc code has to determine the number of free entries.
49 * To this end, we maintain two counters; an allocation counter
50 * and a free counter. Both are allowed to run freely, and wrap
51 * around.
52 * The number of used entries is always (alloc_ctr - free_ctr) % NR.
53 *
54 * The current implementation makes free_ctr atomic. When the
55 * caller finds an allocation fails, it should set an "alloc fail"
56 * bit and retry the allocation. The "alloc fail" bit essentially tells
57 * the CQ completion handlers to wake it up after freeing some
58 * more entries.
59 */
60
61/*
62 * This only happens on shutdown.
63 */
64DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait);
65
66void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr)
67{
68 memset(ring, 0, sizeof(*ring));
69 ring->w_nr = nr;
70 rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
71}
72
73static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring)
74{
75 u32 diff;
76
77 /* This assumes that atomic_t has at least as many bits as u32 */
78 diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
79 BUG_ON(diff > ring->w_nr);
80
81 return diff;
82}
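/*
 * Worked example of the free-running counters (numbers are illustrative):
 * with w_nr == 256, w_alloc_ctr == 0x00000010 and w_free_ctr == 0xfffffff8
 * (the alloc counter has already wrapped around zero, the free counter has
 * not), the u32 subtraction above yields
 *
 *	0x00000010 - 0xfffffff8 == 0x18 == 24
 *
 * i.e. 24 entries in use, which is the right answer despite the wrap.
 */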
83
84void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr)
85{
86 /* We only ever get called from the connection setup code,
87 * prior to creating the QP. */
88 BUG_ON(__rds_ib_ring_used(ring));
89 ring->w_nr = nr;
90}
91
92static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring)
93{
94 return __rds_ib_ring_used(ring) == 0;
95}
96
97u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos)
98{
99 u32 ret = 0, avail;
100
101 avail = ring->w_nr - __rds_ib_ring_used(ring);
102
103 rdsdebug("ring %p val %u next %u free %u\n", ring, val,
104 ring->w_alloc_ptr, avail);
105
106 if (val && avail) {
107 ret = min(val, avail);
108 *pos = ring->w_alloc_ptr;
109
110 ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
111 ring->w_alloc_ctr += ret;
112 }
113
114 return ret;
115}
116
117void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val)
118{
119 ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
120 atomic_add(val, &ring->w_free_ctr);
121
122 if (__rds_ib_ring_empty(ring) &&
123 waitqueue_active(&rds_ib_ring_empty_wait))
124 wake_up(&rds_ib_ring_empty_wait);
125}
126
127void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val)
128{
129 ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
130 ring->w_alloc_ctr -= val;
131}
132
133int rds_ib_ring_empty(struct rds_ib_work_ring *ring)
134{
135 return __rds_ib_ring_empty(ring);
136}
137
138int rds_ib_ring_low(struct rds_ib_work_ring *ring)
139{
140 return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2);
141}
142
143/*
144 * returns the oldest alloced ring entry. This will be the next one
145 * freed. This can't be called if there are none allocated.
146 */
147u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring)
148{
149 return ring->w_free_ptr;
150}
151
152/*
153 * returns the number of completed work requests.
154 */
155
156u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest)
157{
158 u32 ret;
159
160 if (oldest <= (unsigned long long)wr_id)
161 ret = (unsigned long long)wr_id - oldest + 1;
162 else
163 ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
164
165 rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
166 wr_id, oldest);
167 return ret;
168}
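/*
 * Worked example for the wrap case above (numbers are illustrative): with
 * w_nr == 256, oldest == 250 and a completion for wr_id == 3,
 *
 *	completed = 256 - 250 + 3 + 1 == 10
 *
 * i.e. this single completion retires ring entries 250..255 and 0..3.
 */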
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
new file mode 100644
index 000000000000..cb6c52cb1c4c
--- /dev/null
+++ b/net/rds/ib_send.c
@@ -0,0 +1,874 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/device.h>
36#include <linux/dmapool.h>
37
38#include "rds.h"
39#include "rdma.h"
40#include "ib.h"
41
42static void rds_ib_send_rdma_complete(struct rds_message *rm,
43 int wc_status)
44{
45 int notify_status;
46
47 switch (wc_status) {
48 case IB_WC_WR_FLUSH_ERR:
49 return;
50
51 case IB_WC_SUCCESS:
52 notify_status = RDS_RDMA_SUCCESS;
53 break;
54
55 case IB_WC_REM_ACCESS_ERR:
56 notify_status = RDS_RDMA_REMOTE_ERROR;
57 break;
58
59 default:
60 notify_status = RDS_RDMA_OTHER_ERROR;
61 break;
62 }
63 rds_rdma_send_complete(rm, notify_status);
64}
65
66static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
67 struct rds_rdma_op *op)
68{
69 if (op->r_mapped) {
70 ib_dma_unmap_sg(ic->i_cm_id->device,
71 op->r_sg, op->r_nents,
72 op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
73 op->r_mapped = 0;
74 }
75}
76
77static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
78 struct rds_ib_send_work *send,
79 int wc_status)
80{
81 struct rds_message *rm = send->s_rm;
82
83 rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
84
85 ib_dma_unmap_sg(ic->i_cm_id->device,
86 rm->m_sg, rm->m_nents,
87 DMA_TO_DEVICE);
88
89 if (rm->m_rdma_op != NULL) {
90 rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
91
92 /* If the user asked for a completion notification on this
93 * message, we can implement three different semantics:
94 * 1. Notify when we received the ACK on the RDS message
95 * that was queued with the RDMA. This provides reliable
96 * notification of RDMA status at the expense of a one-way
97 * packet delay.
98 * 2. Notify when the IB stack gives us the completion event for
99 * the RDMA operation.
100 * 3. Notify when the IB stack gives us the completion event for
101 * the accompanying RDS messages.
102 * Here, we implement approach #3. To implement approach #2,
103 * call rds_rdma_send_complete from the cq_handler. To implement #1,
104 * don't call rds_rdma_send_complete at all, and fall back to the notify
105 * handling in the ACK processing code.
106 *
107 * Note: There's no need to explicitly sync any RDMA buffers using
108 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
109 * operation itself unmapped the RDMA buffers, which takes care
110 * of synching.
111 */
112 rds_ib_send_rdma_complete(rm, wc_status);
113
114 if (rm->m_rdma_op->r_write)
115 rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
116 else
117 rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
118 }
119
120 /* If anyone waited for this message to get flushed out, wake
121 * them up now */
122 rds_message_unmapped(rm);
123
124 rds_message_put(rm);
125 send->s_rm = NULL;
126}
127
128void rds_ib_send_init_ring(struct rds_ib_connection *ic)
129{
130 struct rds_ib_send_work *send;
131 u32 i;
132
133 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
134 struct ib_sge *sge;
135
136 send->s_rm = NULL;
137 send->s_op = NULL;
138
139 send->s_wr.wr_id = i;
140 send->s_wr.sg_list = send->s_sge;
141 send->s_wr.num_sge = 1;
142 send->s_wr.opcode = IB_WR_SEND;
143 send->s_wr.send_flags = 0;
144 send->s_wr.ex.imm_data = 0;
145
146 sge = rds_ib_data_sge(ic, send->s_sge);
147 sge->lkey = ic->i_mr->lkey;
148
149 sge = rds_ib_header_sge(ic, send->s_sge);
150 sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
151 sge->length = sizeof(struct rds_header);
152 sge->lkey = ic->i_mr->lkey;
153 }
154}
155
156void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
157{
158 struct rds_ib_send_work *send;
159 u32 i;
160
161 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
162 if (send->s_wr.opcode == 0xdead)
163 continue;
164 if (send->s_rm)
165 rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
166 if (send->s_op)
167 rds_ib_send_unmap_rdma(ic, send->s_op);
168 }
169}
170
171/*
172 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
173 * operations performed in the send path. As the sender allocs and potentially
174 * unallocs the next free entry in the ring, it doesn't alter which entry is
175 * next to be freed, which is all this code is concerned with.
176 */
177void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
178{
179 struct rds_connection *conn = context;
180 struct rds_ib_connection *ic = conn->c_transport_data;
181 struct ib_wc wc;
182 struct rds_ib_send_work *send;
183 u32 completed;
184 u32 oldest;
185 u32 i = 0;
186 int ret;
187
188 rdsdebug("cq %p conn %p\n", cq, conn);
189 rds_ib_stats_inc(s_ib_tx_cq_call);
190 ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
191 if (ret)
192 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
193
194 while (ib_poll_cq(cq, 1, &wc) > 0) {
195 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
196 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
197 be32_to_cpu(wc.ex.imm_data));
198 rds_ib_stats_inc(s_ib_tx_cq_event);
199
200 if (wc.wr_id == RDS_IB_ACK_WR_ID) {
201 if (ic->i_ack_queued + HZ/2 < jiffies)
202 rds_ib_stats_inc(s_ib_tx_stalled);
203 rds_ib_ack_send_complete(ic);
204 continue;
205 }
206
207 oldest = rds_ib_ring_oldest(&ic->i_send_ring);
208
209 completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
210
211 for (i = 0; i < completed; i++) {
212 send = &ic->i_sends[oldest];
213
214 /* In the error case, wc.opcode sometimes contains garbage */
215 switch (send->s_wr.opcode) {
216 case IB_WR_SEND:
217 if (send->s_rm)
218 rds_ib_send_unmap_rm(ic, send, wc.status);
219 break;
220 case IB_WR_RDMA_WRITE:
221 case IB_WR_RDMA_READ:
222 /* Nothing to be done - the SG list will be unmapped
223 * when the SEND completes. */
224 break;
225 default:
226 if (printk_ratelimit())
227 printk(KERN_NOTICE
228 "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
229 __func__, send->s_wr.opcode);
230 break;
231 }
232
233 send->s_wr.opcode = 0xdead;
234 send->s_wr.num_sge = 1;
235 if (send->s_queued + HZ/2 < jiffies)
236 rds_ib_stats_inc(s_ib_tx_stalled);
237
238 /* If an RDMA operation produced an error, signal this right
239 * away. If we don't, the subsequent SEND that goes with this
240 * RDMA will be canceled with ERR_WFLUSH, and the application
241 * will never learn that the RDMA failed. */
242 if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
243 struct rds_message *rm;
244
245 rm = rds_send_get_message(conn, send->s_op);
246 if (rm)
247 rds_ib_send_rdma_complete(rm, wc.status);
248 }
249
250 oldest = (oldest + 1) % ic->i_send_ring.w_nr;
251 }
252
253 rds_ib_ring_free(&ic->i_send_ring, completed);
254
255 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
256 || test_bit(0, &conn->c_map_queued))
257 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
258
259 /* We expect errors as the qp is drained during shutdown */
260 if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
261 rds_ib_conn_error(conn,
262 "send completion on %pI4 "
263 "had status %u, disconnecting and reconnecting\n",
264 &conn->c_faddr, wc.status);
265 }
266 }
267}
268
269/*
270 * This is the main function for allocating credits when sending
271 * messages.
272 *
273 * Conceptually, we have two counters:
274 * - send credits: this tells us how many WRs we're allowed
275 * to submit without overrunning the receiver's queue. For
276 * each SEND WR we post, we decrement this by one.
277 *
278 * - posted credits: this tells us how many WRs we recently
279 * posted to the receive queue. This value is transferred
280 * to the peer as a "credit update" in a RDS header field.
281 * Every time we transmit credits to the peer, we subtract
282 * the amount of transferred credits from this counter.
283 *
284 * It is essential that we avoid situations where both sides have
285 * exhausted their send credits, and are unable to send new credits
286 * to the peer. We achieve this by requiring that we send at least
287 * one credit update to the peer before exhausting our credits.
288 * When new credits arrive, we subtract one credit that is withheld
289 * until we've posted new buffers and are ready to transmit these
290 * credits (see rds_ib_send_add_credits below).
291 *
292 * The RDS send code is essentially single-threaded; rds_send_xmit
293 * grabs c_send_lock to ensure exclusive access to the send ring.
294 * However, the ACK sending code is independent and can race with
295 * message SENDs.
296 *
297 * In the send path, we need to update the counters for send credits
298 * and the counter of posted buffers atomically - when we use the
299 * last available credit, we cannot allow another thread to race us
300 * and grab the posted credits counter. Hence, we have to use a
301 * spinlock to protect the credit counter, or use atomics.
302 *
303 * Spinlocks shared between the send and the receive path are bad,
304 * because they create unnecessary delays. An early implementation
305 * using a spinlock showed a 5% degradation in throughput at some
306 * loads.
307 *
308 * This implementation avoids spinlocks completely, putting both
309 * counters into a single atomic, and updating that atomic using
310 * atomic_add (in the receive path, when receiving fresh credits),
311 * and using atomic_cmpxchg when updating the two counters.
312 */
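/*
 * A minimal sketch of the packing this relies on. The IB_GET/IB_SET macros
 * themselves live in ib.h; the 16/16 bit split shown here is an assumption
 * made only for illustration:
 *
 *	val  = atomic_read(&ic->i_credits);
 *	send = IB_GET_SEND_CREDITS(val);	(assumed low 16 bits)
 *	post = IB_GET_POST_CREDITS(val);	(assumed high 16 bits)
 *
 *	newval = oldval - IB_SET_SEND_CREDITS(got)
 *			- IB_SET_POST_CREDITS(advertise);
 *	if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
 *		goto try_again;
 *
 * Billing both counters in a single cmpxchg is what lets the send path take
 * the last send credit and the pending posted credits atomically, without a
 * spinlock shared with the receive path.
 */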
313int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
314 u32 wanted, u32 *adv_credits, int need_posted)
315{
316 unsigned int avail, posted, got = 0, advertise;
317 long oldval, newval;
318
319 *adv_credits = 0;
320 if (!ic->i_flowctl)
321 return wanted;
322
323try_again:
324 advertise = 0;
325 oldval = newval = atomic_read(&ic->i_credits);
326 posted = IB_GET_POST_CREDITS(oldval);
327 avail = IB_GET_SEND_CREDITS(oldval);
328
329 rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n",
330 wanted, avail, posted);
331
332 /* The last credit must be used to send a credit update. */
333 if (avail && !posted)
334 avail--;
335
336 if (avail < wanted) {
337 struct rds_connection *conn = ic->i_cm_id->context;
338
339 /* Oops, there aren't that many credits left! */
340 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
341 got = avail;
342 } else {
343 /* Sometimes you get what you want, lalala. */
344 got = wanted;
345 }
346 newval -= IB_SET_SEND_CREDITS(got);
347
348 /*
349 * If need_posted is non-zero, then the caller wants the posted
350 * credits advertised regardless of whether any send credits are
351 * available.
352 */
353 if (posted && (got || need_posted)) {
354 advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
355 newval -= IB_SET_POST_CREDITS(advertise);
356 }
357
358 /* Finally bill everything */
359 if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
360 goto try_again;
361
362 *adv_credits = advertise;
363 return got;
364}
365
366void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
367{
368 struct rds_ib_connection *ic = conn->c_transport_data;
369
370 if (credits == 0)
371 return;
372
373 rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n",
374 credits,
375 IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
376 test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
377
378 atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
379 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
380 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
381
382 WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
383
384 rds_ib_stats_inc(s_ib_rx_credit_updates);
385}
386
387void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
388{
389 struct rds_ib_connection *ic = conn->c_transport_data;
390
391 if (posted == 0)
392 return;
393
394 atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
395
396 /* Decide whether to send an update to the peer now.
397 * If we would send a credit update for every single buffer we
398 * post, we would end up with an ACK storm (ACK arrives,
399 * consumes buffer, we refill the ring, send ACK to remote
400 * advertising the newly posted buffer... ad inf)
401 *
402 * Performance pretty much depends on how often we send
403 * credit updates - too frequent updates mean lots of ACKs.
404 * Too infrequent updates, and the peer will run out of
405 * credits and have to throttle.
406 * For the time being, 16 seems to be a good compromise.
407 */
408 if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
409 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
410}
411
412static inline void
413rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
414 struct rds_ib_send_work *send, unsigned int pos,
415 unsigned long buffer, unsigned int length,
416 int send_flags)
417{
418 struct ib_sge *sge;
419
420 WARN_ON(pos != send - ic->i_sends);
421
422 send->s_wr.send_flags = send_flags;
423 send->s_wr.opcode = IB_WR_SEND;
424 send->s_wr.num_sge = 2;
425 send->s_wr.next = NULL;
426 send->s_queued = jiffies;
427 send->s_op = NULL;
428
429 if (length != 0) {
430 sge = rds_ib_data_sge(ic, send->s_sge);
431 sge->addr = buffer;
432 sge->length = length;
433 sge->lkey = ic->i_mr->lkey;
434
435 sge = rds_ib_header_sge(ic, send->s_sge);
436 } else {
437 /* We're sending a packet with no payload. There is only
438 * one SGE */
439 send->s_wr.num_sge = 1;
440 sge = &send->s_sge[0];
441 }
442
443 sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
444 sge->length = sizeof(struct rds_header);
445 sge->lkey = ic->i_mr->lkey;
446}
447
448/*
449 * This can be called multiple times for a given message. The first time
450 * we see a message we map its scatterlist into the IB device so that
451 * we can provide that mapped address to the IB scatter gather entries
452 * in the IB work requests. We translate the scatterlist into a series
453 * of work requests that fragment the message. These work requests complete
454 * in order so we pass ownership of the message to the completion handler
455 * once we send the final fragment.
456 *
457 * The RDS core uses the c_send_lock to only enter this function once
458 * per connection. This makes sure that the tx ring alloc/unalloc pairs
459 * don't get out of sync and confuse the ring.
460 */
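/*
 * Sizing example for the work request allocation below (RDS_FRAG_SIZE is
 * assumed to be 4096 here purely for illustration): a message with
 * h_len == 10000 needs
 *
 *	ceil(10000, 4096) == 3
 *
 * send WRs, carrying 4096 + 4096 + 1808 bytes of payload respectively; each
 * WR also gets a second SGE pointing at the long-lived ring of mapped
 * headers, so the header travels after the data instead of being copied
 * into the data pages.
 */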
461int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
462 unsigned int hdr_off, unsigned int sg, unsigned int off)
463{
464 struct rds_ib_connection *ic = conn->c_transport_data;
465 struct ib_device *dev = ic->i_cm_id->device;
466 struct rds_ib_send_work *send = NULL;
467 struct rds_ib_send_work *first;
468 struct rds_ib_send_work *prev;
469 struct ib_send_wr *failed_wr;
470 struct scatterlist *scat;
471 u32 pos;
472 u32 i;
473 u32 work_alloc;
474 u32 credit_alloc;
475 u32 posted;
476 u32 adv_credits = 0;
477 int send_flags = 0;
478 int sent;
479 int ret;
480 int flow_controlled = 0;
481
482 BUG_ON(off % RDS_FRAG_SIZE);
483 BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
484
485 /* FIXME we may overallocate here */
486 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
487 i = 1;
488 else
489 i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
490
491 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
492 if (work_alloc == 0) {
493 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
494 rds_ib_stats_inc(s_ib_tx_ring_full);
495 ret = -ENOMEM;
496 goto out;
497 }
498
499 credit_alloc = work_alloc;
500 if (ic->i_flowctl) {
501 credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0);
502 adv_credits += posted;
503 if (credit_alloc < work_alloc) {
504 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
505 work_alloc = credit_alloc;
506 flow_controlled++;
507 }
508 if (work_alloc == 0) {
509 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
510 rds_ib_stats_inc(s_ib_tx_throttle);
511 ret = -ENOMEM;
512 goto out;
513 }
514 }
515
516 /* map the message the first time we see it */
517 if (ic->i_rm == NULL) {
518 /*
519 printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
520 be16_to_cpu(rm->m_inc.i_hdr.h_dport),
521 rm->m_inc.i_hdr.h_flags,
522 be32_to_cpu(rm->m_inc.i_hdr.h_len));
523 */
524 if (rm->m_nents) {
525 rm->m_count = ib_dma_map_sg(dev,
526 rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
527 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
528 if (rm->m_count == 0) {
529 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
530 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
531 ret = -ENOMEM; /* XXX ? */
532 goto out;
533 }
534 } else {
535 rm->m_count = 0;
536 }
537
538 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
539 ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
540 rds_message_addref(rm);
541 ic->i_rm = rm;
542
543 /* Finalize the header */
544 if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
545 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
546 if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
547 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
548
549 /* If it has a RDMA op, tell the peer we did it. This is
550 * used by the peer to release use-once RDMA MRs. */
551 if (rm->m_rdma_op) {
552 struct rds_ext_header_rdma ext_hdr;
553
554 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
555 rds_message_add_extension(&rm->m_inc.i_hdr,
556 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
557 }
558 if (rm->m_rdma_cookie) {
559 rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
560 rds_rdma_cookie_key(rm->m_rdma_cookie),
561 rds_rdma_cookie_offset(rm->m_rdma_cookie));
562 }
563
564 /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
565 * we should not do this unless we have a chance of at least
566 * sticking the header into the send ring. Which is why we
567 * should call rds_ib_ring_alloc first. */
568 rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
569 rds_message_make_checksum(&rm->m_inc.i_hdr);
570
571 /*
572 * Update adv_credits since we reset the ACK_REQUIRED bit.
573 */
574 rds_ib_send_grab_credits(ic, 0, &posted, 1);
575 adv_credits += posted;
576 BUG_ON(adv_credits > 255);
577 } else if (ic->i_rm != rm)
578 BUG();
579
580 send = &ic->i_sends[pos];
581 first = send;
582 prev = NULL;
583 scat = &rm->m_sg[sg];
584 sent = 0;
585 i = 0;
586
587 /* Sometimes you want to put a fence between an RDMA
588 * READ and the following SEND.
589 * We could either do this all the time
590 * or when requested by the user. Right now, we let
591 * the application choose.
592 */
593 if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
594 send_flags = IB_SEND_FENCE;
595
596 /*
597 * We could be copying the header into the unused tail of the page.
598 * That would need to be changed in the future when those pages might
599 * be mapped userspace pages or page cache pages. So instead we always
600 * use a second sge and our long-lived ring of mapped headers. We send
601 * the header after the data so that the data payload can be aligned on
602 * the receiver.
603 */
604
605 /* handle a 0-len message */
606 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
607 rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
608 goto add_header;
609 }
610
611 /* if there's data reference it with a chain of work reqs */
612 for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
613 unsigned int len;
614
615 send = &ic->i_sends[pos];
616
617 len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
618 rds_ib_xmit_populate_wr(ic, send, pos,
619 ib_sg_dma_address(dev, scat) + off, len,
620 send_flags);
621
622 /*
623 * We want to delay signaling completions just enough to get
624 * the batching benefits but not so much that we create dead time
625 * on the wire.
626 */
627 if (ic->i_unsignaled_wrs-- == 0) {
628 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
629 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
630 }
631
632 ic->i_unsignaled_bytes -= len;
633 if (ic->i_unsignaled_bytes <= 0) {
634 ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
635 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
636 }
637
638 /*
639 * Always signal the last one if we're stopping due to flow control.
640 */
641 if (flow_controlled && i == (work_alloc-1))
642 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
643
644 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
645 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
646
647 sent += len;
648 off += len;
649 if (off == ib_sg_dma_len(dev, scat)) {
650 scat++;
651 off = 0;
652 }
653
654add_header:
655 /* Tack on the header after the data. The header SGE should already
656 * have been set up to point to the right header buffer. */
657 memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
658
659 if (0) {
660 struct rds_header *hdr = &ic->i_send_hdrs[pos];
661
662 printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
663 be16_to_cpu(hdr->h_dport),
664 hdr->h_flags,
665 be32_to_cpu(hdr->h_len));
666 }
667 if (adv_credits) {
668 struct rds_header *hdr = &ic->i_send_hdrs[pos];
669
670 /* add credit and redo the header checksum */
671 hdr->h_credit = adv_credits;
672 rds_message_make_checksum(hdr);
673 adv_credits = 0;
674 rds_ib_stats_inc(s_ib_tx_credit_updates);
675 }
676
677 if (prev)
678 prev->s_wr.next = &send->s_wr;
679 prev = send;
680
681 pos = (pos + 1) % ic->i_send_ring.w_nr;
682 }
683
684 /* Account the RDS header in the number of bytes we sent, but just once.
685 * The caller has no concept of fragmentation. */
686 if (hdr_off == 0)
687 sent += sizeof(struct rds_header);
688
689 /* if we finished the message then send completion owns it */
690 if (scat == &rm->m_sg[rm->m_count]) {
691 prev->s_rm = ic->i_rm;
692 prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
693 ic->i_rm = NULL;
694 }
695
696 if (i < work_alloc) {
697 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
698 work_alloc = i;
699 }
700 if (ic->i_flowctl && i < credit_alloc)
701 rds_ib_send_add_credits(conn, credit_alloc - i);
702
703 /* XXX need to worry about failed_wr and partial sends. */
704 failed_wr = &first->s_wr;
705 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
706 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
707 first, &first->s_wr, ret, failed_wr);
708 BUG_ON(failed_wr != &first->s_wr);
709 if (ret) {
710 printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
711 "returned %d\n", &conn->c_faddr, ret);
712 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
713 if (prev->s_rm) {
714 ic->i_rm = prev->s_rm;
715 prev->s_rm = NULL;
716 }
717 /* Finesse this later */
718 BUG();
719 goto out;
720 }
721
722 ret = sent;
723out:
724 BUG_ON(adv_credits);
725 return ret;
726}
727
728int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
729{
730 struct rds_ib_connection *ic = conn->c_transport_data;
731 struct rds_ib_send_work *send = NULL;
732 struct rds_ib_send_work *first;
733 struct rds_ib_send_work *prev;
734 struct ib_send_wr *failed_wr;
735 struct rds_ib_device *rds_ibdev;
736 struct scatterlist *scat;
737 unsigned long len;
738 u64 remote_addr = op->r_remote_addr;
739 u32 pos;
740 u32 work_alloc;
741 u32 i;
742 u32 j;
743 int sent;
744 int ret;
745 int num_sge;
746
747 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
748
749 /* map the message the first time we see it */
750 if (!op->r_mapped) {
751 op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
752 op->r_sg, op->r_nents, (op->r_write) ?
753 DMA_TO_DEVICE : DMA_FROM_DEVICE);
754 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
755 if (op->r_count == 0) {
756 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
757 ret = -ENOMEM; /* XXX ? */
758 goto out;
759 }
760
761 op->r_mapped = 1;
762 }
763
764 /*
765 * Instead of knowing how to return a partial rdma read/write we insist that there
766 * be enough work requests to send the entire message.
767 */
768 i = ceil(op->r_count, rds_ibdev->max_sge);
769
770 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
771 if (work_alloc != i) {
772 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
773 rds_ib_stats_inc(s_ib_tx_ring_full);
774 ret = -ENOMEM;
775 goto out;
776 }
777
778 send = &ic->i_sends[pos];
779 first = send;
780 prev = NULL;
781 scat = &op->r_sg[0];
782 sent = 0;
783 num_sge = op->r_count;
784
785 for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
786 send->s_wr.send_flags = 0;
787 send->s_queued = jiffies;
788 /*
789 * We want to delay signaling completions just enough to get
790 * the batching benefits but not so much that we create dead time on the wire.
791 */
792 if (ic->i_unsignaled_wrs-- == 0) {
793 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
794 send->s_wr.send_flags = IB_SEND_SIGNALED;
795 }
796
797 send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
798 send->s_wr.wr.rdma.remote_addr = remote_addr;
799 send->s_wr.wr.rdma.rkey = op->r_key;
800 send->s_op = op;
801
802 if (num_sge > rds_ibdev->max_sge) {
803 send->s_wr.num_sge = rds_ibdev->max_sge;
804 num_sge -= rds_ibdev->max_sge;
805 } else {
806 send->s_wr.num_sge = num_sge;
807 }
808
809 send->s_wr.next = NULL;
810
811 if (prev)
812 prev->s_wr.next = &send->s_wr;
813
814 for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
815 len = ib_sg_dma_len(ic->i_cm_id->device, scat);
816 send->s_sge[j].addr =
817 ib_sg_dma_address(ic->i_cm_id->device, scat);
818 send->s_sge[j].length = len;
819 send->s_sge[j].lkey = ic->i_mr->lkey;
820
821 sent += len;
822 rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
823
824 remote_addr += len;
825 scat++;
826 }
827
828 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
829 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
830
831 prev = send;
832 if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
833 send = ic->i_sends;
834 }
835
836 /* if we finished the message then send completion owns it */
837 if (scat == &op->r_sg[op->r_count])
838 prev->s_wr.send_flags = IB_SEND_SIGNALED;
839
840 if (i < work_alloc) {
841 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
842 work_alloc = i;
843 }
844
845 failed_wr = &first->s_wr;
846 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
847 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
848 first, &first->s_wr, ret, failed_wr);
849 BUG_ON(failed_wr != &first->s_wr);
850 if (ret) {
851 printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
852 "returned %d\n", &conn->c_faddr, ret);
853 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
854 goto out;
855 }
856
857 if (unlikely(failed_wr != &first->s_wr)) {
858 printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
859 BUG_ON(failed_wr != &first->s_wr);
860 }
861
862
863out:
864 return ret;
865}
866
867void rds_ib_xmit_complete(struct rds_connection *conn)
868{
869 struct rds_ib_connection *ic = conn->c_transport_data;
870
871 /* We may have a pending ACK or window update we were unable
872 * to send previously (due to flow control). Try again. */
873 rds_ib_attempt_ack(ic);
874}
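
The send path above only asks for a completion every rds_ib_sysctl_max_unsig_wrs work requests or rds_ib_sysctl_max_unsig_bytes bytes, plus always on the last fragment, so most sends run unsignaled. A minimal stand-alone sketch of that batching counter logic follows; the names, thresholds and printf output are illustrative only, not the kernel code itself.

/* Sketch of the unsignaled-completion batching used in rds_ib_xmit();
 * names and thresholds here are hypothetical. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_UNSIG_WRS   16          /* mirrors rds_ib_sysctl_max_unsig_wrs */
#define MAX_UNSIG_BYTES (16 << 20)  /* mirrors rds_ib_sysctl_max_unsig_bytes */

struct tx_state {
	int unsignaled_wrs;
	long unsignaled_bytes;
};

/* Returns true when this work request should ask for a completion. */
static bool want_signal(struct tx_state *s, unsigned int len, bool last)
{
	bool signal_wr = last;       /* the final fragment is always signaled */

	if (s->unsignaled_wrs-- == 0) {
		s->unsignaled_wrs = MAX_UNSIG_WRS;
		signal_wr = true;
	}
	s->unsignaled_bytes -= len;
	if (s->unsignaled_bytes <= 0) {
		s->unsignaled_bytes = MAX_UNSIG_BYTES;
		signal_wr = true;
	}
	return signal_wr;
}

int main(void)
{
	struct tx_state s = { MAX_UNSIG_WRS, MAX_UNSIG_BYTES };
	for (int i = 0; i < 40; i++)
		if (want_signal(&s, 4096, i == 39))
			printf("WR %d requests a completion\n", i);
	return 0;
}
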
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
new file mode 100644
index 000000000000..02e3e3d50d4a
--- /dev/null
+++ b/net/rds/ib_stats.c
@@ -0,0 +1,95 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/percpu.h>
34#include <linux/seq_file.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38#include "ib.h"
39
40DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned;
41
42static char *rds_ib_stat_names[] = {
43 "ib_connect_raced",
44 "ib_listen_closed_stale",
45 "ib_tx_cq_call",
46 "ib_tx_cq_event",
47 "ib_tx_ring_full",
48 "ib_tx_throttle",
49 "ib_tx_sg_mapping_failure",
50 "ib_tx_stalled",
51 "ib_tx_credit_updates",
52 "ib_rx_cq_call",
53 "ib_rx_cq_event",
54 "ib_rx_ring_empty",
55 "ib_rx_refill_from_cq",
56 "ib_rx_refill_from_thread",
57 "ib_rx_alloc_limit",
58 "ib_rx_credit_updates",
59 "ib_ack_sent",
60 "ib_ack_send_failure",
61 "ib_ack_send_delayed",
62 "ib_ack_send_piggybacked",
63 "ib_ack_received",
64 "ib_rdma_mr_alloc",
65 "ib_rdma_mr_free",
66 "ib_rdma_mr_used",
67 "ib_rdma_mr_pool_flush",
68 "ib_rdma_mr_pool_wait",
69 "ib_rdma_mr_pool_depleted",
70};
71
72unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
73 unsigned int avail)
74{
75 struct rds_ib_statistics stats = {0, };
76 uint64_t *src;
77 uint64_t *sum;
78 size_t i;
79 int cpu;
80
81 if (avail < ARRAY_SIZE(rds_ib_stat_names))
82 goto out;
83
84 for_each_online_cpu(cpu) {
85 src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu));
86 sum = (uint64_t *)&stats;
87 for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
88 *(sum++) += *(src++);
89 }
90
91 rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names,
92 ARRAY_SIZE(rds_ib_stat_names));
93out:
94 return ARRAY_SIZE(rds_ib_stat_names);
95}
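
rds_ib_stats_info_copy() folds the per-CPU counters by treating each struct rds_ib_statistics as a flat array of 64-bit values and summing element by element. A small stand-alone sketch of the same summation pattern, with made-up counters:

/* Illustrative per-CPU counter folding; the struct and values are invented. */
#include <stdint.h>
#include <stdio.h>

struct stats { uint64_t tx; uint64_t rx; uint64_t acks; };

#define NR_CPUS 4

int main(void)
{
	struct stats percpu[NR_CPUS] = {
		{ 10, 7, 3 }, { 4, 9, 1 }, { 0, 2, 5 }, { 6, 6, 6 },
	};
	struct stats sum = { 0, 0, 0 };

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		uint64_t *src = (uint64_t *)&percpu[cpu];
		uint64_t *dst = (uint64_t *)&sum;
		for (size_t i = 0; i < sizeof(sum) / sizeof(uint64_t); i++)
			dst[i] += src[i];
	}
	printf("tx=%llu rx=%llu acks=%llu\n",
	       (unsigned long long)sum.tx, (unsigned long long)sum.rx,
	       (unsigned long long)sum.acks);
	return 0;
}
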
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
new file mode 100644
index 000000000000..d87830db93a0
--- /dev/null
+++ b/net/rds/ib_sysctl.c
@@ -0,0 +1,137 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/sysctl.h>
35#include <linux/proc_fs.h>
36
37#include "ib.h"
38
39static struct ctl_table_header *rds_ib_sysctl_hdr;
40
41unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR;
42unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR;
43unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
44static unsigned long rds_ib_sysctl_max_wr_min = 1;
45/* hardware will fail CQ creation long before this */
46static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0;
47
48unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
49static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
50static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
51
52unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
53static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
54static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
55
56unsigned int rds_ib_sysctl_flow_control = 1;
57
58ctl_table rds_ib_sysctl_table[] = {
59 {
60 .ctl_name = CTL_UNNUMBERED,
61 .procname = "max_send_wr",
62 .data = &rds_ib_sysctl_max_send_wr,
63 .maxlen = sizeof(unsigned long),
64 .mode = 0644,
65 .proc_handler = &proc_doulongvec_minmax,
66 .extra1 = &rds_ib_sysctl_max_wr_min,
67 .extra2 = &rds_ib_sysctl_max_wr_max,
68 },
69 {
70 .ctl_name = CTL_UNNUMBERED,
71 .procname = "max_recv_wr",
72 .data = &rds_ib_sysctl_max_recv_wr,
73 .maxlen = sizeof(unsigned long),
74 .mode = 0644,
75 .proc_handler = &proc_doulongvec_minmax,
76 .extra1 = &rds_ib_sysctl_max_wr_min,
77 .extra2 = &rds_ib_sysctl_max_wr_max,
78 },
79 {
80 .ctl_name = CTL_UNNUMBERED,
81 .procname = "max_unsignaled_wr",
82 .data = &rds_ib_sysctl_max_unsig_wrs,
83 .maxlen = sizeof(unsigned long),
84 .mode = 0644,
85 .proc_handler = &proc_doulongvec_minmax,
86 .extra1 = &rds_ib_sysctl_max_unsig_wr_min,
87 .extra2 = &rds_ib_sysctl_max_unsig_wr_max,
88 },
89 {
90 .ctl_name = CTL_UNNUMBERED,
91 .procname = "max_unsignaled_bytes",
92 .data = &rds_ib_sysctl_max_unsig_bytes,
93 .maxlen = sizeof(unsigned long),
94 .mode = 0644,
95 .proc_handler = &proc_doulongvec_minmax,
96 .extra1 = &rds_ib_sysctl_max_unsig_bytes_min,
97 .extra2 = &rds_ib_sysctl_max_unsig_bytes_max,
98 },
99 {
100 .ctl_name = CTL_UNNUMBERED,
101 .procname = "max_recv_allocation",
102 .data = &rds_ib_sysctl_max_recv_allocation,
103 .maxlen = sizeof(unsigned long),
104 .mode = 0644,
105 .proc_handler = &proc_doulongvec_minmax,
106 },
107 {
108 .ctl_name = CTL_UNNUMBERED,
109 .procname = "flow_control",
110 .data = &rds_ib_sysctl_flow_control,
111 .maxlen = sizeof(rds_ib_sysctl_flow_control),
112 .mode = 0644,
113 .proc_handler = &proc_dointvec,
114 },
115 { .ctl_name = 0}
116};
117
118static struct ctl_path rds_ib_sysctl_path[] = {
119 { .procname = "net", .ctl_name = CTL_NET, },
120 { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
121 { .procname = "ib", .ctl_name = CTL_UNNUMBERED, },
122 { }
123};
124
125void rds_ib_sysctl_exit(void)
126{
127 if (rds_ib_sysctl_hdr)
128 unregister_sysctl_table(rds_ib_sysctl_hdr);
129}
130
131int __init rds_ib_sysctl_init(void)
132{
133 rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
134 if (rds_ib_sysctl_hdr == NULL)
135 return -ENOMEM;
136 return 0;
137}
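
register_sysctl_paths() above exposes these tunables under /proc/sys/net/rds/ib/. A minimal user-space sketch that reads one of them back; it assumes the RDS module is loaded so the file exists.

/* Read the rds_ib flow_control tunable registered by the table above. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/rds/ib/flow_control", "r");
	char buf[32];

	if (!f) {
		perror("flow_control");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("rds_ib flow control: %s", buf);
	fclose(f);
	return 0;
}
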
diff --git a/net/rds/info.c b/net/rds/info.c
new file mode 100644
index 000000000000..1d885535214d
--- /dev/null
+++ b/net/rds/info.c
@@ -0,0 +1,241 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/percpu.h>
34#include <linux/seq_file.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38
39/*
40 * This file implements a getsockopt() call which copies a set of fixed
41 * sized structs into a user-specified buffer as a means of providing
42 * read-only information about RDS.
43 *
44 * For a given information source there are a given number of fixed sized
45 * structs at a given time. The structs are only copied if the user-specified
46 * buffer is big enough. The destination pages that make up the buffer
47 * are pinned for the duration of the copy.
48 *
49 * This gives us the following benefits:
50 *
51 * - simple implementation, no copy "position" across multiple calls
52 * - consistent snapshot of an info source
53 * - atomic copy works well with whatever locking info source has
54 * - one portable tool to get rds info across implementations
55 * - long-lived tool can get info without allocating
56 *
57 * at the following costs:
58 *
59 * - info source copy must be pinned, may be "large"
60 */
61
62struct rds_info_iterator {
63 struct page **pages;
64 void *addr;
65 unsigned long offset;
66};
67
68static DEFINE_SPINLOCK(rds_info_lock);
69static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1];
70
71void rds_info_register_func(int optname, rds_info_func func)
72{
73 int offset = optname - RDS_INFO_FIRST;
74
75 BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
76
77 spin_lock(&rds_info_lock);
78 BUG_ON(rds_info_funcs[offset] != NULL);
79 rds_info_funcs[offset] = func;
80 spin_unlock(&rds_info_lock);
81}
82
83void rds_info_deregister_func(int optname, rds_info_func func)
84{
85 int offset = optname - RDS_INFO_FIRST;
86
87 BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
88
89 spin_lock(&rds_info_lock);
90 BUG_ON(rds_info_funcs[offset] != func);
91 rds_info_funcs[offset] = NULL;
92 spin_unlock(&rds_info_lock);
93}
94
95/*
96 * Typically we hold an atomic kmap across multiple rds_info_copy() calls
97 * because the kmap is so expensive. This must be called before using blocking
98 * operations while holding the mapping and as the iterator is torn down.
99 */
100void rds_info_iter_unmap(struct rds_info_iterator *iter)
101{
102 if (iter->addr != NULL) {
103 kunmap_atomic(iter->addr, KM_USER0);
104 iter->addr = NULL;
105 }
106}
107
108/*
109 * get_user_pages() called flush_dcache_page() on the pages for us.
110 */
111void rds_info_copy(struct rds_info_iterator *iter, void *data,
112 unsigned long bytes)
113{
114 unsigned long this;
115
116 while (bytes) {
117 if (iter->addr == NULL)
118 iter->addr = kmap_atomic(*iter->pages, KM_USER0);
119
120 this = min(bytes, PAGE_SIZE - iter->offset);
121
122 rdsdebug("page %p addr %p offset %lu this %lu data %p "
123 "bytes %lu\n", *iter->pages, iter->addr,
124 iter->offset, this, data, bytes);
125
126 memcpy(iter->addr + iter->offset, data, this);
127
128 data += this;
129 bytes -= this;
130 iter->offset += this;
131
132 if (iter->offset == PAGE_SIZE) {
133 kunmap_atomic(iter->addr, KM_USER0);
134 iter->addr = NULL;
135 iter->offset = 0;
136 iter->pages++;
137 }
138 }
139}
140
141/*
142 * @optval points to the userspace buffer that the information snapshot
143 * will be copied into.
144 *
145 * @optlen on input is the size of the buffer in userspace. @optlen
146 * on output is the size of the requested snapshot in bytes.
147 *
148 * This function returns -errno if there is a failure, particularly -ENOSPC
149 * if the given userspace buffer was not large enough to fit the snapshot.
150 * On success it returns the positive number of bytes of each array element
151 * in the snapshot.
152 */
153int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
154 int __user *optlen)
155{
156 struct rds_info_iterator iter;
157 struct rds_info_lengths lens;
158 unsigned long nr_pages = 0;
159 unsigned long start;
160 unsigned long i;
161 rds_info_func func;
162 struct page **pages = NULL;
163 int ret;
164 int len;
165 int total;
166
167 if (get_user(len, optlen)) {
168 ret = -EFAULT;
169 goto out;
170 }
171
172 /* check for all kinds of wrapping and the like */
173 start = (unsigned long)optval;
174 if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) {
175 ret = -EINVAL;
176 goto out;
177 }
178
179 /* a 0 len call is just trying to probe its length */
180 if (len == 0)
181 goto call_func;
182
183 nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
184 >> PAGE_SHIFT;
185
186 pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
187 if (pages == NULL) {
188 ret = -ENOMEM;
189 goto out;
190 }
191 down_read(&current->mm->mmap_sem);
192 ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0,
193 pages, NULL);
194 up_read(&current->mm->mmap_sem);
195 if (ret != nr_pages) {
196 if (ret > 0)
197 nr_pages = ret;
198 else
199 nr_pages = 0;
200 ret = -EAGAIN; /* XXX ? */
201 goto out;
202 }
203
204 rdsdebug("len %d nr_pages %lu\n", len, nr_pages);
205
206call_func:
207 func = rds_info_funcs[optname - RDS_INFO_FIRST];
208 if (func == NULL) {
209 ret = -ENOPROTOOPT;
210 goto out;
211 }
212
213 iter.pages = pages;
214 iter.addr = NULL;
215 iter.offset = start & (PAGE_SIZE - 1);
216
217 func(sock, len, &iter, &lens);
218 BUG_ON(lens.each == 0);
219
220 total = lens.nr * lens.each;
221
222 rds_info_iter_unmap(&iter);
223
224 if (total > len) {
225 len = total;
226 ret = -ENOSPC;
227 } else {
228 len = total;
229 ret = lens.each;
230 }
231
232 if (put_user(len, optlen))
233 ret = -EFAULT;
234
235out:
236 for (i = 0; pages != NULL && i < nr_pages; i++)
237 put_page(pages[i]);
238 kfree(pages);
239
240 return ret;
241}
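
From user space, rds_info_getsockopt() is driven as a probe-then-fetch sequence: a zero-length getsockopt() fails with ENOSPC and reports the required size through optlen, and the retry with a big enough buffer returns the per-element size. The sketch below shows only that calling convention; the PF_RDS/SOL_RDS/RDS_INFO_* constants are placeholders for the values in the RDS headers and should be treated as assumptions.

/* Probe-then-fetch against an RDS info source; constants are guesses. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <unistd.h>

#define PF_RDS_GUESS            21     /* assumption: RDS address family   */
#define SOL_RDS_GUESS           276    /* assumption: RDS sockopt level    */
#define RDS_INFO_COUNTERS_GUESS 10000  /* assumption: one RDS_INFO_* value */

int main(void)
{
	int fd = socket(PF_RDS_GUESS, SOCK_SEQPACKET, 0);
	socklen_t len = 0;
	void *buf = NULL;
	int each;

	if (fd < 0) { perror("socket"); return 1; }

	/* First call probes the snapshot size; -ENOSPC is expected. */
	if (getsockopt(fd, SOL_RDS_GUESS, RDS_INFO_COUNTERS_GUESS, NULL, &len) < 0
	    && errno != ENOSPC) {
		perror("probe");
		goto out;
	}

	buf = malloc(len ? len : 1);
	if (!buf)
		goto out;

	/* Second call copies the snapshot; a positive return value is the
	 * size of each array element in the snapshot. */
	each = getsockopt(fd, SOL_RDS_GUESS, RDS_INFO_COUNTERS_GUESS, buf, &len);
	if (each < 0)
		perror("fetch");
	else
		printf("%u bytes total, %u bytes per element\n",
		       (unsigned)len, (unsigned)each);
out:
	free(buf);
	close(fd);
	return 0;
}
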
diff --git a/net/rds/info.h b/net/rds/info.h
new file mode 100644
index 000000000000..b6c052ca7d22
--- /dev/null
+++ b/net/rds/info.h
@@ -0,0 +1,30 @@
1#ifndef _RDS_INFO_H
2#define _RDS_INFO_H
3
4struct rds_info_lengths {
5 unsigned int nr;
6 unsigned int each;
7};
8
9struct rds_info_iterator;
10
11/*
12 * These functions must fill in the fields of @lens to reflect the size
13 * of the available info source. If the snapshot fits in @len then it
14 * should be copied using @iter. The caller will deduce if it was copied
15 * or not by comparing the lengths.
16 */
17typedef void (*rds_info_func)(struct socket *sock, unsigned int len,
18 struct rds_info_iterator *iter,
19 struct rds_info_lengths *lens);
20
21void rds_info_register_func(int optname, rds_info_func func);
22void rds_info_deregister_func(int optname, rds_info_func func);
23int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
24 int __user *optlen);
25void rds_info_copy(struct rds_info_iterator *iter, void *data,
26 unsigned long bytes);
27void rds_info_iter_unmap(struct rds_info_iterator *iter);
28
29
30#endif
diff --git a/net/rds/iw.c b/net/rds/iw.c
new file mode 100644
index 000000000000..1b56905c4c08
--- /dev/null
+++ b/net/rds/iw.c
@@ -0,0 +1,333 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/if.h>
36#include <linux/netdevice.h>
37#include <linux/inetdevice.h>
38#include <linux/if_arp.h>
39#include <linux/delay.h>
40
41#include "rds.h"
42#include "iw.h"
43
44unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE;
45unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */
46
47module_param(fastreg_pool_size, int, 0444);
48MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device");
49module_param(fastreg_message_size, int, 0444);
50MODULE_PARM_DESC(fastreg_message_size, " Max size of an RDMA transfer (fastreg MRs)");
51
52struct list_head rds_iw_devices;
53
54DEFINE_SPINLOCK(iw_nodev_conns_lock);
55LIST_HEAD(iw_nodev_conns);
56
57void rds_iw_add_one(struct ib_device *device)
58{
59 struct rds_iw_device *rds_iwdev;
60 struct ib_device_attr *dev_attr;
61
62 /* Only handle iwarp devices */
63 if (device->node_type != RDMA_NODE_RNIC)
64 return;
65
66 dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
67 if (!dev_attr)
68 return;
69
70 if (ib_query_device(device, dev_attr)) {
71 rdsdebug("Query device failed for %s\n", device->name);
72 goto free_attr;
73 }
74
75 rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
76 if (!rds_iwdev)
77 goto free_attr;
78
79 spin_lock_init(&rds_iwdev->spinlock);
80
81 rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
82 rds_iwdev->max_wrs = dev_attr->max_qp_wr;
83 rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
84
85 rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1);
86
87 rds_iwdev->dev = device;
88 rds_iwdev->pd = ib_alloc_pd(device);
89 if (IS_ERR(rds_iwdev->pd))
90 goto free_dev;
91
92 if (!rds_iwdev->dma_local_lkey) {
93 if (device->node_type != RDMA_NODE_RNIC) {
94 rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
95 IB_ACCESS_LOCAL_WRITE);
96 } else {
97 rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
98 IB_ACCESS_REMOTE_READ |
99 IB_ACCESS_REMOTE_WRITE |
100 IB_ACCESS_LOCAL_WRITE);
101 }
102 if (IS_ERR(rds_iwdev->mr))
103 goto err_pd;
104 } else
105 rds_iwdev->mr = NULL;
106
107 rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev);
108 if (IS_ERR(rds_iwdev->mr_pool)) {
109 rds_iwdev->mr_pool = NULL;
110 goto err_mr;
111 }
112
113 INIT_LIST_HEAD(&rds_iwdev->cm_id_list);
114 INIT_LIST_HEAD(&rds_iwdev->conn_list);
115 list_add_tail(&rds_iwdev->list, &rds_iw_devices);
116
117 ib_set_client_data(device, &rds_iw_client, rds_iwdev);
118
119 goto free_attr;
120
121err_mr:
122 if (rds_iwdev->mr)
123 ib_dereg_mr(rds_iwdev->mr);
124err_pd:
125 ib_dealloc_pd(rds_iwdev->pd);
126free_dev:
127 kfree(rds_iwdev);
128free_attr:
129 kfree(dev_attr);
130}
131
132void rds_iw_remove_one(struct ib_device *device)
133{
134 struct rds_iw_device *rds_iwdev;
135 struct rds_iw_cm_id *i_cm_id, *next;
136
137 rds_iwdev = ib_get_client_data(device, &rds_iw_client);
138 if (!rds_iwdev)
139 return;
140
141 spin_lock_irq(&rds_iwdev->spinlock);
142 list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) {
143 list_del(&i_cm_id->list);
144 kfree(i_cm_id);
145 }
146 spin_unlock_irq(&rds_iwdev->spinlock);
147
148 rds_iw_remove_conns(rds_iwdev);
149
150 if (rds_iwdev->mr_pool)
151 rds_iw_destroy_mr_pool(rds_iwdev->mr_pool);
152
153 if (rds_iwdev->mr)
154 ib_dereg_mr(rds_iwdev->mr);
155
156 while (ib_dealloc_pd(rds_iwdev->pd)) {
157 rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd);
158 msleep(1);
159 }
160
161 list_del(&rds_iwdev->list);
162 kfree(rds_iwdev);
163}
164
165struct ib_client rds_iw_client = {
166 .name = "rds_iw",
167 .add = rds_iw_add_one,
168 .remove = rds_iw_remove_one
169};
170
171static int rds_iw_conn_info_visitor(struct rds_connection *conn,
172 void *buffer)
173{
174 struct rds_info_rdma_connection *iinfo = buffer;
175 struct rds_iw_connection *ic;
176
177 /* We will only ever look at iWARP transports */
178 if (conn->c_trans != &rds_iw_transport)
179 return 0;
180
181 iinfo->src_addr = conn->c_laddr;
182 iinfo->dst_addr = conn->c_faddr;
183
184 memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
185 memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
186 if (rds_conn_state(conn) == RDS_CONN_UP) {
187 struct rds_iw_device *rds_iwdev;
188 struct rdma_dev_addr *dev_addr;
189
190 ic = conn->c_transport_data;
191 dev_addr = &ic->i_cm_id->route.addr.dev_addr;
192
193 ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
194 ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
195
196 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
197 iinfo->max_send_wr = ic->i_send_ring.w_nr;
198 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
199 iinfo->max_send_sge = rds_iwdev->max_sge;
200 rds_iw_get_mr_info(rds_iwdev, iinfo);
201 }
202 return 1;
203}
204
205static void rds_iw_ic_info(struct socket *sock, unsigned int len,
206 struct rds_info_iterator *iter,
207 struct rds_info_lengths *lens)
208{
209 rds_for_each_conn_info(sock, len, iter, lens,
210 rds_iw_conn_info_visitor,
211 sizeof(struct rds_info_rdma_connection));
212}
213
214
215/*
216 * Early RDS/IB was built to only bind to an address if there is an IPoIB
217 * device with that address set.
218 *
219 * If it were me, I'd advocate for something more flexible. Sending and
220 * receiving should be device-agnostic. Transports would try and maintain
221 * connections between peers who have messages queued. Userspace would be
222 * allowed to influence which paths have priority. We could call userspace
223 * asserting this policy "routing".
224 */
225static int rds_iw_laddr_check(__be32 addr)
226{
227 int ret;
228 struct rdma_cm_id *cm_id;
229 struct sockaddr_in sin;
230
231 /* Create a CMA ID and try to bind it. This catches both
232 * IB and iWARP capable NICs.
233 */
234 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
235 if (!cm_id)
236 return -EADDRNOTAVAIL;
237
238 memset(&sin, 0, sizeof(sin));
239 sin.sin_family = AF_INET;
240 sin.sin_addr.s_addr = addr;
241
242 /* rdma_bind_addr will only succeed for IB & iWARP devices */
243 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
244 /* due to this, we will claim to support IB devices unless we
245 check node_type. */
246 if (ret || cm_id->device->node_type != RDMA_NODE_RNIC)
247 ret = -EADDRNOTAVAIL;
248
249 rdsdebug("addr %pI4 ret %d node type %d\n",
250 &addr, ret,
251 cm_id->device ? cm_id->device->node_type : -1);
252
253 rdma_destroy_id(cm_id);
254
255 return ret;
256}
257
258void rds_iw_exit(void)
259{
260 rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
261 rds_iw_remove_nodev_conns();
262 ib_unregister_client(&rds_iw_client);
263 rds_iw_sysctl_exit();
264 rds_iw_recv_exit();
265 rds_trans_unregister(&rds_iw_transport);
266}
267
268struct rds_transport rds_iw_transport = {
269 .laddr_check = rds_iw_laddr_check,
270 .xmit_complete = rds_iw_xmit_complete,
271 .xmit = rds_iw_xmit,
272 .xmit_cong_map = NULL,
273 .xmit_rdma = rds_iw_xmit_rdma,
274 .recv = rds_iw_recv,
275 .conn_alloc = rds_iw_conn_alloc,
276 .conn_free = rds_iw_conn_free,
277 .conn_connect = rds_iw_conn_connect,
278 .conn_shutdown = rds_iw_conn_shutdown,
279 .inc_copy_to_user = rds_iw_inc_copy_to_user,
280 .inc_purge = rds_iw_inc_purge,
281 .inc_free = rds_iw_inc_free,
282 .cm_initiate_connect = rds_iw_cm_initiate_connect,
283 .cm_handle_connect = rds_iw_cm_handle_connect,
284 .cm_connect_complete = rds_iw_cm_connect_complete,
285 .stats_info_copy = rds_iw_stats_info_copy,
286 .exit = rds_iw_exit,
287 .get_mr = rds_iw_get_mr,
288 .sync_mr = rds_iw_sync_mr,
289 .free_mr = rds_iw_free_mr,
290 .flush_mrs = rds_iw_flush_mrs,
291 .t_owner = THIS_MODULE,
292 .t_name = "iwarp",
293 .t_prefer_loopback = 1,
294};
295
296int __init rds_iw_init(void)
297{
298 int ret;
299
300 INIT_LIST_HEAD(&rds_iw_devices);
301
302 ret = ib_register_client(&rds_iw_client);
303 if (ret)
304 goto out;
305
306 ret = rds_iw_sysctl_init();
307 if (ret)
308 goto out_ibreg;
309
310 ret = rds_iw_recv_init();
311 if (ret)
312 goto out_sysctl;
313
314 ret = rds_trans_register(&rds_iw_transport);
315 if (ret)
316 goto out_recv;
317
318 rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
319
320 goto out;
321
322out_recv:
323 rds_iw_recv_exit();
324out_sysctl:
325 rds_iw_sysctl_exit();
326out_ibreg:
327 ib_unregister_client(&rds_iw_client);
328out:
329 return ret;
330}
331
332MODULE_LICENSE("GPL");
333
diff --git a/net/rds/iw.h b/net/rds/iw.h
new file mode 100644
index 000000000000..0ddda34f2a1c
--- /dev/null
+++ b/net/rds/iw.h
@@ -0,0 +1,395 @@
1#ifndef _RDS_IW_H
2#define _RDS_IW_H
3
4#include <rdma/ib_verbs.h>
5#include <rdma/rdma_cm.h>
6#include "rds.h"
7#include "rdma_transport.h"
8
9#define RDS_FASTREG_SIZE 20
10#define RDS_FASTREG_POOL_SIZE 2048
11
12#define RDS_IW_MAX_SGE 8
13#define RDS_IW_RECV_SGE 2
14
15#define RDS_IW_DEFAULT_RECV_WR 1024
16#define RDS_IW_DEFAULT_SEND_WR 256
17
18#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
19
20extern struct list_head rds_iw_devices;
21
22/*
23 * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
24 * try to minimize the amount of memory tied up in both the device and
25 * socket receive queues.
26 */
27/* page offset of the final full frag that fits in the page */
28#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
29struct rds_page_frag {
30 struct list_head f_item;
31 struct page *f_page;
32 unsigned long f_offset;
33 dma_addr_t f_mapped;
34};
35
36struct rds_iw_incoming {
37 struct list_head ii_frags;
38 struct rds_incoming ii_inc;
39};
40
41struct rds_iw_connect_private {
42 /* Add new fields at the end, and don't permute existing fields. */
43 __be32 dp_saddr;
44 __be32 dp_daddr;
45 u8 dp_protocol_major;
46 u8 dp_protocol_minor;
47 __be16 dp_protocol_minor_mask; /* bitmask */
48 __be32 dp_reserved1;
49 __be64 dp_ack_seq;
50 __be32 dp_credit; /* non-zero enables flow ctl */
51};
52
53struct rds_iw_scatterlist {
54 struct scatterlist *list;
55 unsigned int len;
56 int dma_len;
57 unsigned int dma_npages;
58 unsigned int bytes;
59};
60
61struct rds_iw_mapping {
62 spinlock_t m_lock; /* protect the mapping struct */
63 struct list_head m_list;
64 struct rds_iw_mr *m_mr;
65 uint32_t m_rkey;
66 struct rds_iw_scatterlist m_sg;
67};
68
69struct rds_iw_send_work {
70 struct rds_message *s_rm;
71
72 /* We should really put these into a union: */
73 struct rds_rdma_op *s_op;
74 struct rds_iw_mapping *s_mapping;
75 struct ib_mr *s_mr;
76 struct ib_fast_reg_page_list *s_page_list;
77 unsigned char s_remap_count;
78
79 struct ib_send_wr s_wr;
80 struct ib_sge s_sge[RDS_IW_MAX_SGE];
81 unsigned long s_queued;
82};
83
84struct rds_iw_recv_work {
85 struct rds_iw_incoming *r_iwinc;
86 struct rds_page_frag *r_frag;
87 struct ib_recv_wr r_wr;
88 struct ib_sge r_sge[2];
89};
90
91struct rds_iw_work_ring {
92 u32 w_nr;
93 u32 w_alloc_ptr;
94 u32 w_alloc_ctr;
95 u32 w_free_ptr;
96 atomic_t w_free_ctr;
97};
98
99struct rds_iw_device;
100
101struct rds_iw_connection {
102
103 struct list_head iw_node;
104 struct rds_iw_device *rds_iwdev;
105 struct rds_connection *conn;
106
107 /* alphabet soup, IBTA style */
108 struct rdma_cm_id *i_cm_id;
109 struct ib_pd *i_pd;
110 struct ib_mr *i_mr;
111 struct ib_cq *i_send_cq;
112 struct ib_cq *i_recv_cq;
113
114 /* tx */
115 struct rds_iw_work_ring i_send_ring;
116 struct rds_message *i_rm;
117 struct rds_header *i_send_hdrs;
118 u64 i_send_hdrs_dma;
119 struct rds_iw_send_work *i_sends;
120
121 /* rx */
122 struct mutex i_recv_mutex;
123 struct rds_iw_work_ring i_recv_ring;
124 struct rds_iw_incoming *i_iwinc;
125 u32 i_recv_data_rem;
126 struct rds_header *i_recv_hdrs;
127 u64 i_recv_hdrs_dma;
128 struct rds_iw_recv_work *i_recvs;
129 struct rds_page_frag i_frag;
130 u64 i_ack_recv; /* last ACK received */
131
132 /* sending acks */
133 unsigned long i_ack_flags;
134 u64 i_ack_next; /* next ACK to send */
135 struct rds_header *i_ack;
136 struct ib_send_wr i_ack_wr;
137 struct ib_sge i_ack_sge;
138 u64 i_ack_dma;
139 unsigned long i_ack_queued;
140
141 /* Flow control related information
142 *
143 * Our algorithm uses a pair of variables that we need to access
144 * atomically - one for the send credits, and one for the posted
145 * recv credits we need to transfer to the remote.
146 * Rather than protect them using a slow spinlock, we put both into
147 * a single atomic_t and update it using cmpxchg
148 */
149 atomic_t i_credits;
150
151 /* Protocol version specific information */
152 unsigned int i_flowctl:1; /* enable/disable flow ctl */
153 unsigned int i_dma_local_lkey:1;
154 unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */
155 /* Batched completions */
156 unsigned int i_unsignaled_wrs;
157 long i_unsignaled_bytes;
158};
159
160/* This assumes that atomic_t is at least 32 bits */
161#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
162#define IB_GET_POST_CREDITS(v) ((v) >> 16)
163#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
164#define IB_SET_POST_CREDITS(v) ((v) << 16)
165
166struct rds_iw_cm_id {
167 struct list_head list;
168 struct rdma_cm_id *cm_id;
169};
170
171struct rds_iw_device {
172 struct list_head list;
173 struct list_head cm_id_list;
174 struct list_head conn_list;
175 struct ib_device *dev;
176 struct ib_pd *pd;
177 struct ib_mr *mr;
178 struct rds_iw_mr_pool *mr_pool;
179 int page_shift;
180 int max_sge;
181 unsigned int max_wrs;
182 unsigned int dma_local_lkey:1;
183 spinlock_t spinlock; /* protect the above */
184};
185
186/* bits for i_ack_flags */
187#define IB_ACK_IN_FLIGHT 0
188#define IB_ACK_REQUESTED 1
189
190/* Magic WR_ID for ACKs */
191#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL)
192#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL)
193#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL)
194
195struct rds_iw_statistics {
196 uint64_t s_iw_connect_raced;
197 uint64_t s_iw_listen_closed_stale;
198 uint64_t s_iw_tx_cq_call;
199 uint64_t s_iw_tx_cq_event;
200 uint64_t s_iw_tx_ring_full;
201 uint64_t s_iw_tx_throttle;
202 uint64_t s_iw_tx_sg_mapping_failure;
203 uint64_t s_iw_tx_stalled;
204 uint64_t s_iw_tx_credit_updates;
205 uint64_t s_iw_rx_cq_call;
206 uint64_t s_iw_rx_cq_event;
207 uint64_t s_iw_rx_ring_empty;
208 uint64_t s_iw_rx_refill_from_cq;
209 uint64_t s_iw_rx_refill_from_thread;
210 uint64_t s_iw_rx_alloc_limit;
211 uint64_t s_iw_rx_credit_updates;
212 uint64_t s_iw_ack_sent;
213 uint64_t s_iw_ack_send_failure;
214 uint64_t s_iw_ack_send_delayed;
215 uint64_t s_iw_ack_send_piggybacked;
216 uint64_t s_iw_ack_received;
217 uint64_t s_iw_rdma_mr_alloc;
218 uint64_t s_iw_rdma_mr_free;
219 uint64_t s_iw_rdma_mr_used;
220 uint64_t s_iw_rdma_mr_pool_flush;
221 uint64_t s_iw_rdma_mr_pool_wait;
222 uint64_t s_iw_rdma_mr_pool_depleted;
223};
224
225extern struct workqueue_struct *rds_iw_wq;
226
227/*
228 * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
229 * doesn't define it.
230 */
231static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev,
232 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
233{
234 unsigned int i;
235
236 for (i = 0; i < sg_dma_len; ++i) {
237 ib_dma_sync_single_for_cpu(dev,
238 ib_sg_dma_address(dev, &sg[i]),
239 ib_sg_dma_len(dev, &sg[i]),
240 direction);
241 }
242}
243#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu
244
245static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev,
246 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
247{
248 unsigned int i;
249
250 for (i = 0; i < sg_dma_len; ++i) {
251 ib_dma_sync_single_for_device(dev,
252 ib_sg_dma_address(dev, &sg[i]),
253 ib_sg_dma_len(dev, &sg[i]),
254 direction);
255 }
256}
257#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device
258
259static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
260{
261 return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey;
262}
263
264/* ib.c */
265extern struct rds_transport rds_iw_transport;
266extern void rds_iw_add_one(struct ib_device *device);
267extern void rds_iw_remove_one(struct ib_device *device);
268extern struct ib_client rds_iw_client;
269
270extern unsigned int fastreg_pool_size;
271extern unsigned int fastreg_message_size;
272
273extern spinlock_t iw_nodev_conns_lock;
274extern struct list_head iw_nodev_conns;
275
276/* ib_cm.c */
277int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp);
278void rds_iw_conn_free(void *arg);
279int rds_iw_conn_connect(struct rds_connection *conn);
280void rds_iw_conn_shutdown(struct rds_connection *conn);
281void rds_iw_state_change(struct sock *sk);
282int __init rds_iw_listen_init(void);
283void rds_iw_listen_stop(void);
284void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
285int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
286 struct rdma_cm_event *event);
287int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id);
288void rds_iw_cm_connect_complete(struct rds_connection *conn,
289 struct rdma_cm_event *event);
290
291
292#define rds_iw_conn_error(conn, fmt...) \
293 __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)
294
295/* ib_rdma.c */
296int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
297int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
298void rds_iw_remove_nodev_conns(void);
299void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev);
300struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
301void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
302void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
303void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
304 struct rds_sock *rs, u32 *key_ret);
305void rds_iw_sync_mr(void *trans_private, int dir);
306void rds_iw_free_mr(void *trans_private, int invalidate);
307void rds_iw_flush_mrs(void);
308void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
309
310/* ib_recv.c */
311int __init rds_iw_recv_init(void);
312void rds_iw_recv_exit(void);
313int rds_iw_recv(struct rds_connection *conn);
314int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
315 gfp_t page_gfp, int prefill);
316void rds_iw_inc_purge(struct rds_incoming *inc);
317void rds_iw_inc_free(struct rds_incoming *inc);
318int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
319 size_t size);
320void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
321void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
322void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
323void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
324void rds_iw_attempt_ack(struct rds_iw_connection *ic);
325void rds_iw_ack_send_complete(struct rds_iw_connection *ic);
326u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic);
327
328/* ib_ring.c */
329void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr);
330void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr);
331u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos);
332void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val);
333void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val);
334int rds_iw_ring_empty(struct rds_iw_work_ring *ring);
335int rds_iw_ring_low(struct rds_iw_work_ring *ring);
336u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring);
337u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest);
338extern wait_queue_head_t rds_iw_ring_empty_wait;
339
340/* ib_send.c */
341void rds_iw_xmit_complete(struct rds_connection *conn);
342int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
343 unsigned int hdr_off, unsigned int sg, unsigned int off);
344void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
345void rds_iw_send_init_ring(struct rds_iw_connection *ic);
346void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
347int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
348void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
349void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
350int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
351 u32 *adv_credits, int need_posted);
352
353/* ib_stats.c */
354DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
355#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member)
356unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
357 unsigned int avail);
358
359/* ib_sysctl.c */
360int __init rds_iw_sysctl_init(void);
361void rds_iw_sysctl_exit(void);
362extern unsigned long rds_iw_sysctl_max_send_wr;
363extern unsigned long rds_iw_sysctl_max_recv_wr;
364extern unsigned long rds_iw_sysctl_max_unsig_wrs;
365extern unsigned long rds_iw_sysctl_max_unsig_bytes;
366extern unsigned long rds_iw_sysctl_max_recv_allocation;
367extern unsigned int rds_iw_sysctl_flow_control;
368extern ctl_table rds_iw_sysctl_table[];
369
370/*
371 * Helper functions for getting/setting the header and data SGEs in
372 * RDS packets (not RDMA)
373 */
374static inline struct ib_sge *
375rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
376{
377 return &sge[0];
378}
379
380static inline struct ib_sge *
381rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
382{
383 return &sge[1];
384}
385
386static inline void rds_iw_set_64bit(u64 *ptr, u64 val)
387{
388#if BITS_PER_LONG == 64
389 *ptr = val;
390#else
391 set_64bit(ptr, val);
392#endif
393}
394
395#endif
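
The flow-control comment in struct rds_iw_connection and the IB_{GET,SET}_{SEND,POST}_CREDITS macros above pack the send credits and the posted-receive credits into one 32-bit value that is updated with cmpxchg. A stand-alone sketch of that encoding, using C11 atomics in place of the kernel's atomic_t:

/* Credit packing sketch: send credits in the low 16 bits, posted-receive
 * credits in the high 16 bits, both updated with a compare-and-swap. */
#include <stdatomic.h>
#include <stdio.h>

#define GET_SEND_CREDITS(v) ((v) & 0xffff)
#define GET_POST_CREDITS(v) ((v) >> 16)
#define SET_SEND_CREDITS(v) ((v) & 0xffff)
#define SET_POST_CREDITS(v) ((v) << 16)

static atomic_uint credits;

/* Atomically take one send credit; returns how many were left before. */
static unsigned int take_send_credit(void)
{
	unsigned int old = atomic_load(&credits), newval;

	do {
		if (GET_SEND_CREDITS(old) == 0)
			return 0;               /* throttled: no credits left */
		newval = old - SET_SEND_CREDITS(1);
	} while (!atomic_compare_exchange_weak(&credits, &old, newval));

	return GET_SEND_CREDITS(old);
}

int main(void)
{
	atomic_store(&credits, SET_SEND_CREDITS(3) | SET_POST_CREDITS(8));
	while (take_send_credit())
		printf("send credits now %u, post credits %u\n",
		       GET_SEND_CREDITS(atomic_load(&credits)),
		       GET_POST_CREDITS(atomic_load(&credits)));
	return 0;
}
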
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
new file mode 100644
index 000000000000..57ecb3d4b8a5
--- /dev/null
+++ b/net/rds/iw_cm.c
@@ -0,0 +1,750 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/vmalloc.h>
36
37#include "rds.h"
38#include "iw.h"
39
40/*
41 * Set the selected protocol version
42 */
43static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version)
44{
45 conn->c_version = version;
46}
47
48/*
49 * Set up flow control
50 */
51static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits)
52{
53 struct rds_iw_connection *ic = conn->c_transport_data;
54
55 if (rds_iw_sysctl_flow_control && credits != 0) {
56 /* We're doing flow control */
57 ic->i_flowctl = 1;
58 rds_iw_send_add_credits(conn, credits);
59 } else {
60 ic->i_flowctl = 0;
61 }
62}
63
64/*
65 * Connection established.
66 * We get here for both outgoing and incoming connection.
67 */
68void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
69{
70 const struct rds_iw_connect_private *dp = NULL;
71 struct rds_iw_connection *ic = conn->c_transport_data;
72 struct rds_iw_device *rds_iwdev;
73 int err;
74
75 if (event->param.conn.private_data_len) {
76 dp = event->param.conn.private_data;
77
78 rds_iw_set_protocol(conn,
79 RDS_PROTOCOL(dp->dp_protocol_major,
80 dp->dp_protocol_minor));
81 rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
82 }
83
84 /* update ib_device with this local ipaddr & conn */
85 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
86 err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
87 if (err)
88 printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err);
89 err = rds_iw_add_conn(rds_iwdev, conn);
90 if (err)
91 printk(KERN_ERR "rds_iw_add_conn failed (%d)\n", err);
92
93 /* If the peer gave us the last packet it saw, process this as if
94 * we had received a regular ACK. */
95 if (dp && dp->dp_ack_seq)
96 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
97
98 printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n",
99 &conn->c_laddr, &conn->c_faddr,
100 RDS_PROTOCOL_MAJOR(conn->c_version),
101 RDS_PROTOCOL_MINOR(conn->c_version),
102 ic->i_flowctl ? ", flow control" : "");
103
104 rds_connect_complete(conn);
105}
106
107static void rds_iw_cm_fill_conn_param(struct rds_connection *conn,
108 struct rdma_conn_param *conn_param,
109 struct rds_iw_connect_private *dp,
110 u32 protocol_version)
111{
112 struct rds_iw_connection *ic = conn->c_transport_data;
113
114 memset(conn_param, 0, sizeof(struct rdma_conn_param));
115 /* XXX tune these? */
116 conn_param->responder_resources = 1;
117 conn_param->initiator_depth = 1;
118
119 if (dp) {
120 memset(dp, 0, sizeof(*dp));
121 dp->dp_saddr = conn->c_laddr;
122 dp->dp_daddr = conn->c_faddr;
123 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
124 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
125 dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS);
126 dp->dp_ack_seq = rds_iw_piggyb_ack(ic);
127
128 /* Advertise flow control */
129 if (ic->i_flowctl) {
130 unsigned int credits;
131
132 credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
133 dp->dp_credit = cpu_to_be32(credits);
134 atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
135 }
136
137 conn_param->private_data = dp;
138 conn_param->private_data_len = sizeof(*dp);
139 }
140}
141
142static void rds_iw_cq_event_handler(struct ib_event *event, void *data)
143{
144 rdsdebug("event %u data %p\n", event->event, data);
145}
146
147static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
148{
149 struct rds_connection *conn = data;
150 struct rds_iw_connection *ic = conn->c_transport_data;
151
152 rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
153
154 switch (event->event) {
155 case IB_EVENT_COMM_EST:
156 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
157 break;
158 case IB_EVENT_QP_REQ_ERR:
159 case IB_EVENT_QP_FATAL:
160 default:
161 rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n",
162 event->event, &conn->c_laddr,
163 &conn->c_faddr);
164 break;
165 }
166}
167
168/*
169 * Create a QP
170 */
171static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
172 struct rds_iw_device *rds_iwdev,
173 struct rds_iw_work_ring *send_ring,
174 void (*send_cq_handler)(struct ib_cq *, void *),
175 struct rds_iw_work_ring *recv_ring,
176 void (*recv_cq_handler)(struct ib_cq *, void *),
177 void *context)
178{
179 struct ib_device *dev = rds_iwdev->dev;
180 unsigned int send_size, recv_size;
181 int ret;
182
183 /* The offset of 1 is to accommodate the additional ACK WR. */
184 send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
185 recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
186 rds_iw_ring_resize(send_ring, send_size - 1);
187 rds_iw_ring_resize(recv_ring, recv_size - 1);
188
189 memset(attr, 0, sizeof(*attr));
190 attr->event_handler = rds_iw_qp_event_handler;
191 attr->qp_context = context;
192 attr->cap.max_send_wr = send_size;
193 attr->cap.max_recv_wr = recv_size;
194 attr->cap.max_send_sge = rds_iwdev->max_sge;
195 attr->cap.max_recv_sge = RDS_IW_RECV_SGE;
196 attr->sq_sig_type = IB_SIGNAL_REQ_WR;
197 attr->qp_type = IB_QPT_RC;
198
199 attr->send_cq = ib_create_cq(dev, send_cq_handler,
200 rds_iw_cq_event_handler,
201 context, send_size, 0);
202 if (IS_ERR(attr->send_cq)) {
203 ret = PTR_ERR(attr->send_cq);
204 attr->send_cq = NULL;
205 rdsdebug("ib_create_cq send failed: %d\n", ret);
206 goto out;
207 }
208
209 attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
210 rds_iw_cq_event_handler,
211 context, recv_size, 0);
212 if (IS_ERR(attr->recv_cq)) {
213 ret = PTR_ERR(attr->recv_cq);
214 attr->recv_cq = NULL;
215 rdsdebug("ib_create_cq send failed: %d\n", ret);
216 goto out;
217 }
218
219 ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP);
220 if (ret) {
221 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
222 goto out;
223 }
224
225 ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED);
226 if (ret) {
227 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
228 goto out;
229 }
230
231out:
232 if (ret) {
233 if (attr->send_cq)
234 ib_destroy_cq(attr->send_cq);
235 if (attr->recv_cq)
236 ib_destroy_cq(attr->recv_cq);
237 }
238 return ret;
239}
240
241/*
242 * This needs to be very careful to not leave IS_ERR pointers around for
243 * cleanup to trip over.
244 */
245static int rds_iw_setup_qp(struct rds_connection *conn)
246{
247 struct rds_iw_connection *ic = conn->c_transport_data;
248 struct ib_device *dev = ic->i_cm_id->device;
249 struct ib_qp_init_attr attr;
250 struct rds_iw_device *rds_iwdev;
251 int ret;
252
253 /* rds_iw_add_one creates a rds_iw_device object per IB device,
254 * and allocates a protection domain, memory range and MR pool
255 * for each. If that fails for any reason, it will not register
256 * the rds_iwdev at all.
257 */
258 rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
259 if (rds_iwdev == NULL) {
260 if (printk_ratelimit())
261 printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
262 dev->name);
263 return -EOPNOTSUPP;
264 }
265
266 /* Protection domain and memory range */
267 ic->i_pd = rds_iwdev->pd;
268 ic->i_mr = rds_iwdev->mr;
269
270 ret = rds_iw_init_qp_attrs(&attr, rds_iwdev,
271 &ic->i_send_ring, rds_iw_send_cq_comp_handler,
272 &ic->i_recv_ring, rds_iw_recv_cq_comp_handler,
273 conn);
274 if (ret < 0)
275 goto out;
276
277 ic->i_send_cq = attr.send_cq;
278 ic->i_recv_cq = attr.recv_cq;
279
280 /*
281 * XXX this can fail if max_*_wr is too large? Are we supposed
282 * to back off until we get a value that the hardware can support?
283 */
284 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
285 if (ret) {
286 rdsdebug("rdma_create_qp failed: %d\n", ret);
287 goto out;
288 }
289
290 ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
291 ic->i_send_ring.w_nr *
292 sizeof(struct rds_header),
293 &ic->i_send_hdrs_dma, GFP_KERNEL);
294 if (ic->i_send_hdrs == NULL) {
295 ret = -ENOMEM;
296 rdsdebug("ib_dma_alloc_coherent send failed\n");
297 goto out;
298 }
299
300 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
301 ic->i_recv_ring.w_nr *
302 sizeof(struct rds_header),
303 &ic->i_recv_hdrs_dma, GFP_KERNEL);
304 if (ic->i_recv_hdrs == NULL) {
305 ret = -ENOMEM;
306 rdsdebug("ib_dma_alloc_coherent recv failed\n");
307 goto out;
308 }
309
310 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
311 &ic->i_ack_dma, GFP_KERNEL);
312 if (ic->i_ack == NULL) {
313 ret = -ENOMEM;
314 rdsdebug("ib_dma_alloc_coherent ack failed\n");
315 goto out;
316 }
317
318 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
319 if (ic->i_sends == NULL) {
320 ret = -ENOMEM;
321 rdsdebug("send allocation failed\n");
322 goto out;
323 }
324 rds_iw_send_init_ring(ic);
325
326 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
327 if (ic->i_recvs == NULL) {
328 ret = -ENOMEM;
329 rdsdebug("recv allocation failed\n");
330 goto out;
331 }
332
333 rds_iw_recv_init_ring(ic);
334 rds_iw_recv_init_ack(ic);
335
336 /* Post receive buffers - as a side effect, this will update
337 * the posted credit count. */
338 rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
339
340 rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
341 ic->i_send_cq, ic->i_recv_cq);
342
343out:
344 return ret;
345}
346
347static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
348{
349 u16 common;
350 u32 version = 0;
351
352 /* rdma_cm private data is odd - when there is any private data in the
353 * request, we will be given a pretty large buffer without telling us the
354 * original size. The only way to tell the difference is by looking at
355 * the contents, which are initialized to zero.
356 * If the protocol version fields aren't set, this is a connection attempt
357 * from an older version. This could be 3.0 or 2.0 - we can't tell.
358 * We really should have changed this for OFED 1.3 :-( */
359 if (dp->dp_protocol_major == 0)
360 return RDS_PROTOCOL_3_0;
361
362 common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
363 if (dp->dp_protocol_major == 3 && common) {
364 version = RDS_PROTOCOL_3_0;
365 while ((common >>= 1) != 0)
366 version++;
367 } else if (printk_ratelimit()) {
368 printk(KERN_NOTICE "RDS: Connection from %pI4 using "
369 "incompatible protocol version %u.%u\n",
370 &dp->dp_saddr,
371 dp->dp_protocol_major,
372 dp->dp_protocol_minor);
373 }
374 return version;
375}
376
377int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
378 struct rdma_cm_event *event)
379{
380 const struct rds_iw_connect_private *dp = event->param.conn.private_data;
381 struct rds_iw_connect_private dp_rep;
382 struct rds_connection *conn = NULL;
383 struct rds_iw_connection *ic = NULL;
384 struct rdma_conn_param conn_param;
385 struct rds_iw_device *rds_iwdev;
386 u32 version;
387 int err, destroy = 1;
388
389 /* Check whether the remote protocol version matches ours. */
390 version = rds_iw_protocol_compatible(dp);
391 if (!version)
392 goto out;
393
394 rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n",
395 &dp->dp_saddr, &dp->dp_daddr,
396 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
397
398 conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
399 GFP_KERNEL);
400 if (IS_ERR(conn)) {
401 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
402 conn = NULL;
403 goto out;
404 }
405
406 /*
407 * The connection request may occur while the
408 * previous connection exists, e.g. in case of failover.
409 * But as connections may be initiated simultaneously
410 * by both hosts, we have a random backoff mechanism -
411 * see the comment above rds_queue_reconnect()
412 */
413 mutex_lock(&conn->c_cm_lock);
414 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
415 if (rds_conn_state(conn) == RDS_CONN_UP) {
416 rdsdebug("incoming connect while connecting\n");
417 rds_conn_drop(conn);
418 rds_iw_stats_inc(s_iw_listen_closed_stale);
419 } else
420 if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
421 /* Wait and see - our connect may still be succeeding */
422 rds_iw_stats_inc(s_iw_connect_raced);
423 }
424 mutex_unlock(&conn->c_cm_lock);
425 goto out;
426 }
427
428 ic = conn->c_transport_data;
429
430 rds_iw_set_protocol(conn, version);
431 rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
432
433 /* If the peer gave us the last packet it saw, process this as if
434 * we had received a regular ACK. */
435 if (dp->dp_ack_seq)
436 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
437
438 BUG_ON(cm_id->context);
439 BUG_ON(ic->i_cm_id);
440
441 ic->i_cm_id = cm_id;
442 cm_id->context = conn;
443
444 rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client);
445 ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
446
447 /* We got halfway through setting up the ib_connection; if we
448 * fail now, we have to take the long route out of this mess. */
449 destroy = 0;
450
451 err = rds_iw_setup_qp(conn);
452 if (err) {
453 rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
454 goto out;
455 }
456
457 rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
458
459 /* rdma_accept() calls rdma_reject() internally if it fails */
460 err = rdma_accept(cm_id, &conn_param);
461 mutex_unlock(&conn->c_cm_lock);
462 if (err) {
463 rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
464 goto out;
465 }
466
467 return 0;
468
469out:
470 rdma_reject(cm_id, NULL, 0);
471 return destroy;
472}
473
474
475int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
476{
477 struct rds_connection *conn = cm_id->context;
478 struct rds_iw_connection *ic = conn->c_transport_data;
479 struct rdma_conn_param conn_param;
480 struct rds_iw_connect_private dp;
481 int ret;
482
483 /* If the peer doesn't do protocol negotiation, we must
484 * default to RDSv3.0 */
485 rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0);
486 ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */
487
488 ret = rds_iw_setup_qp(conn);
489 if (ret) {
490 rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
491 goto out;
492 }
493
494 rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
495
496 ret = rdma_connect(cm_id, &conn_param);
497 if (ret)
498 rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
499
500out:
501 /* Beware - returning non-zero tells the rdma_cm to destroy
502 * the cm_id. We should certainly not do it as long as we still
503 * "own" the cm_id. */
504 if (ret) {
505 struct rds_iw_connection *ic = conn->c_transport_data;
506
507 if (ic->i_cm_id == cm_id)
508 ret = 0;
509 }
510 return ret;
511}
512
513int rds_iw_conn_connect(struct rds_connection *conn)
514{
515 struct rds_iw_connection *ic = conn->c_transport_data;
516 struct rds_iw_device *rds_iwdev;
517 struct sockaddr_in src, dest;
518 int ret;
519
520 /* XXX I wonder what effect the port space has */
521 /* delegate cm event handler to rdma_transport */
522 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
523 RDMA_PS_TCP);
524 if (IS_ERR(ic->i_cm_id)) {
525 ret = PTR_ERR(ic->i_cm_id);
526 ic->i_cm_id = NULL;
527 rdsdebug("rdma_create_id() failed: %d\n", ret);
528 goto out;
529 }
530
531 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
532
533 src.sin_family = AF_INET;
534 src.sin_addr.s_addr = (__force u32)conn->c_laddr;
535 src.sin_port = (__force u16)htons(0);
536
537 /* First, bind to the local address and device. */
538 ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src);
539 if (ret) {
540 rdsdebug("rdma_bind_addr(%pI4) failed: %d\n",
541 &conn->c_laddr, ret);
542 rdma_destroy_id(ic->i_cm_id);
543 ic->i_cm_id = NULL;
544 goto out;
545 }
546
547 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
548 ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
549
550 dest.sin_family = AF_INET;
551 dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
552 dest.sin_port = (__force u16)htons(RDS_PORT);
553
554 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
555 (struct sockaddr *)&dest,
556 RDS_RDMA_RESOLVE_TIMEOUT_MS);
557 if (ret) {
558 rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
559 ret);
560 rdma_destroy_id(ic->i_cm_id);
561 ic->i_cm_id = NULL;
562 }
563
564out:
565 return ret;
566}
567
568/*
569 * This is so careful about only cleaning up resources that were built up
570 * so that it can be called at any point during startup. In fact it
571 * can be called multiple times for a given connection.
572 */
573void rds_iw_conn_shutdown(struct rds_connection *conn)
574{
575 struct rds_iw_connection *ic = conn->c_transport_data;
576 int err = 0;
577 struct ib_qp_attr qp_attr;
578
579 rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
580 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
581 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
582
583 if (ic->i_cm_id) {
584 struct ib_device *dev = ic->i_cm_id->device;
585
586 rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
587 err = rdma_disconnect(ic->i_cm_id);
588 if (err) {
589 /* Actually this may happen quite frequently, when
590 * an outgoing connect raced with an incoming connect.
591 */
592 rdsdebug("rds_iw_conn_shutdown: failed to disconnect,"
593 " cm: %p err %d\n", ic->i_cm_id, err);
594 }
595
596 if (ic->i_cm_id->qp) {
597 qp_attr.qp_state = IB_QPS_ERR;
598 ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
599 }
600
601 wait_event(rds_iw_ring_empty_wait,
602 rds_iw_ring_empty(&ic->i_send_ring) &&
603 rds_iw_ring_empty(&ic->i_recv_ring));
604
605 if (ic->i_send_hdrs)
606 ib_dma_free_coherent(dev,
607 ic->i_send_ring.w_nr *
608 sizeof(struct rds_header),
609 ic->i_send_hdrs,
610 ic->i_send_hdrs_dma);
611
612 if (ic->i_recv_hdrs)
613 ib_dma_free_coherent(dev,
614 ic->i_recv_ring.w_nr *
615 sizeof(struct rds_header),
616 ic->i_recv_hdrs,
617 ic->i_recv_hdrs_dma);
618
619 if (ic->i_ack)
620 ib_dma_free_coherent(dev, sizeof(struct rds_header),
621 ic->i_ack, ic->i_ack_dma);
622
623 if (ic->i_sends)
624 rds_iw_send_clear_ring(ic);
625 if (ic->i_recvs)
626 rds_iw_recv_clear_ring(ic);
627
628 if (ic->i_cm_id->qp)
629 rdma_destroy_qp(ic->i_cm_id);
630 if (ic->i_send_cq)
631 ib_destroy_cq(ic->i_send_cq);
632 if (ic->i_recv_cq)
633 ib_destroy_cq(ic->i_recv_cq);
634
635 /*
636 * If associated with an rds_iw_device:
637 * Move connection back to the nodev list.
638 * Remove cm_id from the device cm_id list.
639 */
640 if (ic->rds_iwdev) {
641
642 spin_lock_irq(&ic->rds_iwdev->spinlock);
643 BUG_ON(list_empty(&ic->iw_node));
644 list_del(&ic->iw_node);
645 spin_unlock_irq(&ic->rds_iwdev->spinlock);
646
647 spin_lock_irq(&iw_nodev_conns_lock);
648 list_add_tail(&ic->iw_node, &iw_nodev_conns);
649 spin_unlock_irq(&iw_nodev_conns_lock);
650 rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
651 ic->rds_iwdev = NULL;
652 }
653
654 rdma_destroy_id(ic->i_cm_id);
655
656 ic->i_cm_id = NULL;
657 ic->i_pd = NULL;
658 ic->i_mr = NULL;
659 ic->i_send_cq = NULL;
660 ic->i_recv_cq = NULL;
661 ic->i_send_hdrs = NULL;
662 ic->i_recv_hdrs = NULL;
663 ic->i_ack = NULL;
664 }
665 BUG_ON(ic->rds_iwdev);
666
667 /* Clear pending transmit */
668 if (ic->i_rm) {
669 rds_message_put(ic->i_rm);
670 ic->i_rm = NULL;
671 }
672
673 /* Clear the ACK state */
674 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
675 rds_iw_set_64bit(&ic->i_ack_next, 0);
676 ic->i_ack_recv = 0;
677
678 /* Clear flow control state */
679 ic->i_flowctl = 0;
680 atomic_set(&ic->i_credits, 0);
681
682 rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
683 rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
684
685 if (ic->i_iwinc) {
686 rds_inc_put(&ic->i_iwinc->ii_inc);
687 ic->i_iwinc = NULL;
688 }
689
690 vfree(ic->i_sends);
691 ic->i_sends = NULL;
692 vfree(ic->i_recvs);
693 ic->i_recvs = NULL;
694 rdsdebug("shutdown complete\n");
695}
696
697int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
698{
699 struct rds_iw_connection *ic;
700 unsigned long flags;
701
702 /* XXX too lazy? */
703 ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
704 if (ic == NULL)
705 return -ENOMEM;
706
707 INIT_LIST_HEAD(&ic->iw_node);
708 mutex_init(&ic->i_recv_mutex);
709
710 /*
711 * rds_iw_conn_shutdown() waits for these to be emptied so they
712 * must be initialized before it can be called.
713 */
714 rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
715 rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
716
717 ic->conn = conn;
718 conn->c_transport_data = ic;
719
720 spin_lock_irqsave(&iw_nodev_conns_lock, flags);
721 list_add_tail(&ic->iw_node, &iw_nodev_conns);
722 spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
723
724
725 rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
726 return 0;
727}
728
729void rds_iw_conn_free(void *arg)
730{
731 struct rds_iw_connection *ic = arg;
732 rdsdebug("ic %p\n", ic);
733 list_del(&ic->iw_node);
734 kfree(ic);
735}
736
737/*
738 * An error occurred on the connection
739 */
740void
741__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
742{
743 va_list ap;
744
745 rds_conn_drop(conn);
746
747 va_start(ap, fmt);
748 vprintk(fmt, ap);
749 va_end(ap);
750}
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
new file mode 100644
index 000000000000..1c02a8f952d0
--- /dev/null
+++ b/net/rds/iw_rdma.c
@@ -0,0 +1,888 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "rdma.h"
37#include "iw.h"
38
39
40/*
41 * This is stored as mr->r_trans_private.
42 */
43struct rds_iw_mr {
44 struct rds_iw_device *device;
45 struct rds_iw_mr_pool *pool;
46 struct rdma_cm_id *cm_id;
47
48 struct ib_mr *mr;
49 struct ib_fast_reg_page_list *page_list;
50
51 struct rds_iw_mapping mapping;
52 unsigned char remap_count;
53};
54
55/*
56 * Our own little MR pool
57 */
58struct rds_iw_mr_pool {
59 struct rds_iw_device *device; /* back ptr to the device that owns us */
60
61 struct mutex flush_lock; /* serialize fmr invalidate */
62 struct work_struct flush_worker; /* flush worker */
63
64 spinlock_t list_lock; /* protect variables below */
65 atomic_t item_count; /* total # of MRs */
66 atomic_t dirty_count; /* # of dirty MRs */
67 struct list_head dirty_list; /* dirty mappings */
68 struct list_head clean_list; /* unused & unmapped MRs */
69 atomic_t free_pinned; /* memory pinned by free MRs */
70 unsigned long max_message_size; /* in pages */
71 unsigned long max_items;
72 unsigned long max_items_soft;
73 unsigned long max_free_pinned;
74 int max_pages;
75};
76
77static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
78static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
79static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
80static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
81 struct rds_iw_mr *ibmr,
82 struct scatterlist *sg, unsigned int nents);
83static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
84static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
85 struct list_head *unmap_list,
86 struct list_head *kill_list);
87static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
88
89static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
90{
91 struct rds_iw_device *iwdev;
92 struct rds_iw_cm_id *i_cm_id;
93
94 *rds_iwdev = NULL;
95 *cm_id = NULL;
96
97 list_for_each_entry(iwdev, &rds_iw_devices, list) {
98 spin_lock_irq(&iwdev->spinlock);
99 list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
100 struct sockaddr_in *src_addr, *dst_addr;
101
102 src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
103 dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
104
105 rdsdebug("local ipaddr = %x port %d, "
106 "remote ipaddr = %x port %d"
107 "..looking for %x port %d, "
108 "remote ipaddr = %x port %d\n",
109 src_addr->sin_addr.s_addr,
110 src_addr->sin_port,
111 dst_addr->sin_addr.s_addr,
112 dst_addr->sin_port,
113 rs->rs_bound_addr,
114 rs->rs_bound_port,
115 rs->rs_conn_addr,
116 rs->rs_conn_port);
117#ifdef WORKING_TUPLE_DETECTION
118 if (src_addr->sin_addr.s_addr == rs->rs_bound_addr &&
119 src_addr->sin_port == rs->rs_bound_port &&
120 dst_addr->sin_addr.s_addr == rs->rs_conn_addr &&
121 dst_addr->sin_port == rs->rs_conn_port) {
122#else
123 /* FIXME - needs to compare the local and remote
124 * ipaddr/port tuple, but the ipaddr is the only
125 * available information in the rds_sock (as the rest are
126 * zeroed). It doesn't appear to be properly populated
127 * during connection setup...
128 */
129 if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) {
130#endif
131 spin_unlock_irq(&iwdev->spinlock);
132 *rds_iwdev = iwdev;
133 *cm_id = i_cm_id->cm_id;
134 return 0;
135 }
136 }
137 spin_unlock_irq(&iwdev->spinlock);
138 }
139
140 return 1;
141}
142
143static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
144{
145 struct rds_iw_cm_id *i_cm_id;
146
147 i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL);
148 if (!i_cm_id)
149 return -ENOMEM;
150
151 i_cm_id->cm_id = cm_id;
152
153 spin_lock_irq(&rds_iwdev->spinlock);
154 list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list);
155 spin_unlock_irq(&rds_iwdev->spinlock);
156
157 return 0;
158}
159
160void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
161{
162 struct rds_iw_cm_id *i_cm_id;
163
164 spin_lock_irq(&rds_iwdev->spinlock);
165 list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) {
166 if (i_cm_id->cm_id == cm_id) {
167 list_del(&i_cm_id->list);
168 kfree(i_cm_id);
169 break;
170 }
171 }
172 spin_unlock_irq(&rds_iwdev->spinlock);
173}
174
175
176int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
177{
178 struct sockaddr_in *src_addr, *dst_addr;
179 struct rds_iw_device *rds_iwdev_old;
180 struct rds_sock rs;
181 struct rdma_cm_id *pcm_id;
182 int rc;
183
184 src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
185 dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
186
187 rs.rs_bound_addr = src_addr->sin_addr.s_addr;
188 rs.rs_bound_port = src_addr->sin_port;
189 rs.rs_conn_addr = dst_addr->sin_addr.s_addr;
190 rs.rs_conn_port = dst_addr->sin_port;
191
192 rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id);
193 if (rc)
194 rds_iw_remove_cm_id(rds_iwdev, cm_id);
195
196 return rds_iw_add_cm_id(rds_iwdev, cm_id);
197}
198
199int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
200{
201 struct rds_iw_connection *ic = conn->c_transport_data;
202
203 /* conn was previously on the nodev_conns_list */
204 spin_lock_irq(&iw_nodev_conns_lock);
205 BUG_ON(list_empty(&iw_nodev_conns));
206 BUG_ON(list_empty(&ic->iw_node));
207 list_del(&ic->iw_node);
208 spin_unlock_irq(&iw_nodev_conns_lock);
209
210 spin_lock_irq(&rds_iwdev->spinlock);
211 list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
212 spin_unlock_irq(&rds_iwdev->spinlock);
213
214 ic->rds_iwdev = rds_iwdev;
215
216 return 0;
217}
218
219void rds_iw_remove_nodev_conns(void)
220{
221 struct rds_iw_connection *ic, *_ic;
222 LIST_HEAD(tmp_list);
223
224 /* avoid calling conn_destroy with irqs off */
225 spin_lock_irq(&iw_nodev_conns_lock);
226 list_splice(&iw_nodev_conns, &tmp_list);
227 INIT_LIST_HEAD(&iw_nodev_conns);
228 spin_unlock_irq(&iw_nodev_conns_lock);
229
230 list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
231 if (ic->conn->c_passive)
232 rds_conn_destroy(ic->conn->c_passive);
233 rds_conn_destroy(ic->conn);
234 }
235}
236
237void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev)
238{
239 struct rds_iw_connection *ic, *_ic;
240 LIST_HEAD(tmp_list);
241
242 /* avoid calling conn_destroy with irqs off */
243 spin_lock_irq(&rds_iwdev->spinlock);
244 list_splice(&rds_iwdev->conn_list, &tmp_list);
245 INIT_LIST_HEAD(&rds_iwdev->conn_list);
246 spin_unlock_irq(&rds_iwdev->spinlock);
247
248 list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
249 if (ic->conn->c_passive)
250 rds_conn_destroy(ic->conn->c_passive);
251 rds_conn_destroy(ic->conn);
252 }
253}
254
255static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
256 struct scatterlist *list, unsigned int sg_len)
257{
258 sg->list = list;
259 sg->len = sg_len;
260 sg->dma_len = 0;
261 sg->dma_npages = 0;
262 sg->bytes = 0;
263}
264
265static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
266 struct rds_iw_scatterlist *sg,
267 unsigned int dma_page_shift)
268{
269 struct ib_device *dev = rds_iwdev->dev;
270 u64 *dma_pages = NULL;
271 u64 dma_mask;
272 unsigned int dma_page_size;
273 int i, j, ret;
274
275 dma_page_size = 1 << dma_page_shift;
276 dma_mask = dma_page_size - 1;
277
278 WARN_ON(sg->dma_len);
279
280 sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
281 if (unlikely(!sg->dma_len)) {
282 printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
283 return ERR_PTR(-EBUSY);
284 }
285
286 sg->bytes = 0;
287 sg->dma_npages = 0;
288
289 ret = -EINVAL;
290 for (i = 0; i < sg->dma_len; ++i) {
291 unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
292 u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
293 u64 end_addr;
294
295 sg->bytes += dma_len;
296
297 end_addr = dma_addr + dma_len;
298 if (dma_addr & dma_mask) {
299 if (i > 0)
300 goto out_unmap;
301 dma_addr &= ~dma_mask;
302 }
303 if (end_addr & dma_mask) {
304 if (i < sg->dma_len - 1)
305 goto out_unmap;
306 end_addr = (end_addr + dma_mask) & ~dma_mask;
307 }
308
309 sg->dma_npages += (end_addr - dma_addr) >> dma_page_shift;
310 }
311
312 /* Now gather the dma addrs into one list */
313 if (sg->dma_npages > fastreg_message_size)
314 goto out_unmap;
315
316 dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC);
317 if (!dma_pages) {
318 ret = -ENOMEM;
319 goto out_unmap;
320 }
321
322 for (i = j = 0; i < sg->dma_len; ++i) {
323 unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
324 u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
325 u64 end_addr;
326
327 end_addr = dma_addr + dma_len;
328 dma_addr &= ~dma_mask;
329 for (; dma_addr < end_addr; dma_addr += dma_page_size)
330 dma_pages[j++] = dma_addr;
331 BUG_ON(j > sg->dma_npages);
332 }
333
334 return dma_pages;
335
336out_unmap:
337 ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
338 sg->dma_len = 0;
339 kfree(dma_pages);
340 return ERR_PTR(ret);
341}
342
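/*
 * Standalone sketch (not part of this file) of the page-rounding
 * arithmetic in rds_iw_map_scatterlist() above: only the first segment
 * may start unaligned and only the last may end unaligned, and both
 * ends are rounded out to the DMA page size before counting
 * dma_npages. The addresses, length and 4K page size are invented
 * illustration values.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        unsigned int dma_page_shift = 12;               /* assume 4K DMA pages */
        uint64_t dma_page_size = 1ULL << dma_page_shift;
        uint64_t dma_mask = dma_page_size - 1;

        /* one segment that starts 0x100 into a page and spans 3 pages + 0x200 */
        uint64_t dma_addr = 0x10000100ULL;
        uint64_t dma_len  = 3 * dma_page_size + 0x200;
        uint64_t end_addr = dma_addr + dma_len;

        dma_addr &= ~dma_mask;                          /* round the start down */
        end_addr = (end_addr + dma_mask) & ~dma_mask;   /* round the end up */

        printf("dma_npages = %llu\n",                   /* prints 4 */
               (unsigned long long)((end_addr - dma_addr) >> dma_page_shift));
        return 0;
}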
343
344struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev)
345{
346 struct rds_iw_mr_pool *pool;
347
348 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
349 if (!pool) {
350 printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n");
351 return ERR_PTR(-ENOMEM);
352 }
353
354 pool->device = rds_iwdev;
355 INIT_LIST_HEAD(&pool->dirty_list);
356 INIT_LIST_HEAD(&pool->clean_list);
357 mutex_init(&pool->flush_lock);
358 spin_lock_init(&pool->list_lock);
359 INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker);
360
361 pool->max_message_size = fastreg_message_size;
362 pool->max_items = fastreg_pool_size;
363 pool->max_free_pinned = pool->max_items * pool->max_message_size / 4;
364 pool->max_pages = fastreg_message_size;
365
366 /* We never allow more than max_items MRs to be allocated.
367 * Once we exceed max_items_soft, we start freeing
368 * items more aggressively.
369 * Make sure that max_items > max_items_soft > max_items / 2
370 */
371 pool->max_items_soft = pool->max_items * 3 / 4;
372
373 return pool;
374}
375
376void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo)
377{
378 struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
379
380 iinfo->rdma_mr_max = pool->max_items;
381 iinfo->rdma_mr_size = pool->max_pages;
382}
383
384void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool)
385{
386 flush_workqueue(rds_wq);
387 rds_iw_flush_mr_pool(pool, 1);
388 BUG_ON(atomic_read(&pool->item_count));
389 BUG_ON(atomic_read(&pool->free_pinned));
390 kfree(pool);
391}
392
393static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool)
394{
395 struct rds_iw_mr *ibmr = NULL;
396 unsigned long flags;
397
398 spin_lock_irqsave(&pool->list_lock, flags);
399 if (!list_empty(&pool->clean_list)) {
400 ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list);
401 list_del_init(&ibmr->mapping.m_list);
402 }
403 spin_unlock_irqrestore(&pool->list_lock, flags);
404
405 return ibmr;
406}
407
408static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
409{
410 struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
411 struct rds_iw_mr *ibmr = NULL;
412 int err = 0, iter = 0;
413
414 while (1) {
415 ibmr = rds_iw_reuse_fmr(pool);
416 if (ibmr)
417 return ibmr;
418
419 /* No clean MRs - now we have the choice of either
420 * allocating a fresh MR up to the limit imposed by the
421 * driver, or flush any dirty unused MRs.
422 * We try to avoid stalling in the send path if possible,
423 * so we allocate as long as we're allowed to.
424 *
425 * We're fussy with enforcing the FMR limit, though. If the driver
426 * tells us we can't use more than N fmrs, we shouldn't start
427 * arguing with it */
428 if (atomic_inc_return(&pool->item_count) <= pool->max_items)
429 break;
430
431 atomic_dec(&pool->item_count);
432
433 if (++iter > 2) {
434 rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted);
435 return ERR_PTR(-EAGAIN);
436 }
437
438 /* We do have some empty MRs. Flush them out. */
439 rds_iw_stats_inc(s_iw_rdma_mr_pool_wait);
440 rds_iw_flush_mr_pool(pool, 0);
441 }
442
443 ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
444 if (!ibmr) {
445 err = -ENOMEM;
446 goto out_no_cigar;
447 }
448
449 spin_lock_init(&ibmr->mapping.m_lock);
450 INIT_LIST_HEAD(&ibmr->mapping.m_list);
451 ibmr->mapping.m_mr = ibmr;
452
453 err = rds_iw_init_fastreg(pool, ibmr);
454 if (err)
455 goto out_no_cigar;
456
457 rds_iw_stats_inc(s_iw_rdma_mr_alloc);
458 return ibmr;
459
460out_no_cigar:
461 if (ibmr) {
462 rds_iw_destroy_fastreg(pool, ibmr);
463 kfree(ibmr);
464 }
465 atomic_dec(&pool->item_count);
466 return ERR_PTR(err);
467}
468
469void rds_iw_sync_mr(void *trans_private, int direction)
470{
471 struct rds_iw_mr *ibmr = trans_private;
472 struct rds_iw_device *rds_iwdev = ibmr->device;
473
474 switch (direction) {
475 case DMA_FROM_DEVICE:
476 ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list,
477 ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
478 break;
479 case DMA_TO_DEVICE:
480 ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list,
481 ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
482 break;
483 }
484}
485
486static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all)
487{
488 unsigned int item_count;
489
490 item_count = atomic_read(&pool->item_count);
491 if (free_all)
492 return item_count;
493
494 return 0;
495}
496
497/*
498 * Flush our pool of MRs.
499 * At a minimum, all currently unused MRs are unmapped.
500 * If the number of MRs allocated exceeds the limit, we also try
501 * to free as many MRs as needed to get back to this limit.
502 */
503static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
504{
505 struct rds_iw_mr *ibmr, *next;
506 LIST_HEAD(unmap_list);
507 LIST_HEAD(kill_list);
508 unsigned long flags;
509 unsigned int nfreed = 0, ncleaned = 0, free_goal;
510 int ret = 0;
511
512 rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
513
514 mutex_lock(&pool->flush_lock);
515
516 spin_lock_irqsave(&pool->list_lock, flags);
517 /* Get the list of all mappings to be destroyed */
518 list_splice_init(&pool->dirty_list, &unmap_list);
519 if (free_all)
520 list_splice_init(&pool->clean_list, &kill_list);
521 spin_unlock_irqrestore(&pool->list_lock, flags);
522
523 free_goal = rds_iw_flush_goal(pool, free_all);
524
525 /* Batched invalidate of dirty MRs.
526 * For FMR based MRs, the mappings on the unmap list are
527 * actually members of an ibmr (ibmr->mapping). They either
528 * migrate to the kill_list, or have been cleaned and should be
529 * moved to the clean_list.
530 * For fastregs, the mappings are dynamically allocated and
531 * will be destroyed by the unmap function.
532 */
533 if (!list_empty(&unmap_list)) {
534 ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
535 /* If we've been asked to destroy all MRs, move those
536 * that were simply cleaned to the kill list */
537 if (free_all)
538 list_splice_init(&unmap_list, &kill_list);
539 }
540
541 /* Destroy any MRs that are past their best before date */
542 list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) {
543 rds_iw_stats_inc(s_iw_rdma_mr_free);
544 list_del(&ibmr->mapping.m_list);
545 rds_iw_destroy_fastreg(pool, ibmr);
546 kfree(ibmr);
547 nfreed++;
548 }
549
550 /* Any mappings that remain are laundered ibmrs, which we can add
551 * back to the clean list. */
552 if (!list_empty(&unmap_list)) {
553 spin_lock_irqsave(&pool->list_lock, flags);
554 list_splice(&unmap_list, &pool->clean_list);
555 spin_unlock_irqrestore(&pool->list_lock, flags);
556 }
557
558 atomic_sub(ncleaned, &pool->dirty_count);
559 atomic_sub(nfreed, &pool->item_count);
560
561 mutex_unlock(&pool->flush_lock);
562 return ret;
563}
564
565static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
566{
567 struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
568
569 rds_iw_flush_mr_pool(pool, 0);
570}
571
572void rds_iw_free_mr(void *trans_private, int invalidate)
573{
574 struct rds_iw_mr *ibmr = trans_private;
575 struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
576
577 rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
578 if (!pool)
579 return;
580
581 /* Return it to the pool's free list */
582 rds_iw_free_fastreg(pool, ibmr);
583
584 /* If we've pinned too many pages, request a flush */
585 if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
586 || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
587 queue_work(rds_wq, &pool->flush_worker);
588
589 if (invalidate) {
590 if (likely(!in_interrupt())) {
591 rds_iw_flush_mr_pool(pool, 0);
592 } else {
593 /* We get here if the user created an MR marked
594 * as use_once and invalidate at the same time. */
595 queue_work(rds_wq, &pool->flush_worker);
596 }
597 }
598}
599
600void rds_iw_flush_mrs(void)
601{
602 struct rds_iw_device *rds_iwdev;
603
604 list_for_each_entry(rds_iwdev, &rds_iw_devices, list) {
605 struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
606
607 if (pool)
608 rds_iw_flush_mr_pool(pool, 0);
609 }
610}
611
612void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
613 struct rds_sock *rs, u32 *key_ret)
614{
615 struct rds_iw_device *rds_iwdev;
616 struct rds_iw_mr *ibmr = NULL;
617 struct rdma_cm_id *cm_id;
618 int ret;
619
620 ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id);
621 if (ret || !cm_id) {
622 ret = -ENODEV;
623 goto out;
624 }
625
626 if (!rds_iwdev->mr_pool) {
627 ret = -ENODEV;
628 goto out;
629 }
630
631 ibmr = rds_iw_alloc_mr(rds_iwdev);
632 if (IS_ERR(ibmr))
633 return ibmr;
634
635 ibmr->cm_id = cm_id;
636 ibmr->device = rds_iwdev;
637
638 ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents);
639 if (ret == 0)
640 *key_ret = ibmr->mr->rkey;
641 else
642 printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
643
644out:
645 if (ret) {
646 if (ibmr)
647 rds_iw_free_mr(ibmr, 0);
648 ibmr = ERR_PTR(ret);
649 }
650 return ibmr;
651}
652
653/*
654 * iWARP fastreg handling
655 *
656 * The life cycle of a fastreg registration is a bit different from
657 * FMRs.
658 * The idea behind fastreg is to have one MR, to which we bind different
659 * mappings over time. To avoid stalling on the expensive map and invalidate
660 * operations, these operations are pipelined on the same send queue on
661 * which we want to send the message containing the r_key.
662 *
663 * This creates a bit of a problem for us, as we do not have the destination
664 * IP in GET_MR, so the connection must be set up prior to the GET_MR call for
665 * RDMA to be correctly set up. If a fastreg request is present, rds_iw_xmit
666 * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request
667 * before queuing the SEND. When completions for these arrive, they are
668 * dispatched to the MR, and a bit is set showing that RDMA can be performed.
669 *
670 * There is another interesting aspect that's related to invalidation.
671 * The application can request that a mapping is invalidated in FREE_MR.
672 * The expectation there is that this invalidation step includes ALL
673 * PREVIOUSLY FREED MRs.
674 */
675static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
676 struct rds_iw_mr *ibmr)
677{
678 struct rds_iw_device *rds_iwdev = pool->device;
679 struct ib_fast_reg_page_list *page_list = NULL;
680 struct ib_mr *mr;
681 int err;
682
683 mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
684 if (IS_ERR(mr)) {
685 err = PTR_ERR(mr);
686
687 printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
688 return err;
689 }
690
691 /* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages
692 * is not filled in.
693 */
694 page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size);
695 if (IS_ERR(page_list)) {
696 err = PTR_ERR(page_list);
697
698 printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err);
699 ib_dereg_mr(mr);
700 return err;
701 }
702
703 ibmr->page_list = page_list;
704 ibmr->mr = mr;
705 return 0;
706}
707
708static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
709{
710 struct rds_iw_mr *ibmr = mapping->m_mr;
711 struct ib_send_wr f_wr, *failed_wr;
712 int ret;
713
714 /*
715 * Perform a WR for the fast_reg_mr. Each individual page
716 * in the sg list is added to the fast reg page list and placed
717 * inside the fast_reg_mr WR. The key used is a rolling 8bit
718 * counter, which should guarantee uniqueness.
719 */
720 ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
721 mapping->m_rkey = ibmr->mr->rkey;
722
723 memset(&f_wr, 0, sizeof(f_wr));
724 f_wr.wr_id = RDS_IW_FAST_REG_WR_ID;
725 f_wr.opcode = IB_WR_FAST_REG_MR;
726 f_wr.wr.fast_reg.length = mapping->m_sg.bytes;
727 f_wr.wr.fast_reg.rkey = mapping->m_rkey;
728 f_wr.wr.fast_reg.page_list = ibmr->page_list;
729 f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
730 f_wr.wr.fast_reg.page_shift = ibmr->device->page_shift;
731 f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
732 IB_ACCESS_REMOTE_READ |
733 IB_ACCESS_REMOTE_WRITE;
734 f_wr.wr.fast_reg.iova_start = 0;
735 f_wr.send_flags = IB_SEND_SIGNALED;
736
737 failed_wr = &f_wr;
738 ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
739 BUG_ON(failed_wr != &f_wr);
740 if (ret && printk_ratelimit())
741 printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
742 __func__, __LINE__, ret);
743 return ret;
744}
745
746static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
747{
748 struct ib_send_wr s_wr, *failed_wr;
749 int ret = 0;
750
751 if (!ibmr->cm_id->qp || !ibmr->mr)
752 goto out;
753
754 memset(&s_wr, 0, sizeof(s_wr));
755 s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID;
756 s_wr.opcode = IB_WR_LOCAL_INV;
757 s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
758 s_wr.send_flags = IB_SEND_SIGNALED;
759
760 failed_wr = &s_wr;
761 ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
762 if (ret && printk_ratelimit()) {
763 printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
764 __func__, __LINE__, ret);
765 goto out;
766 }
767out:
768 return ret;
769}
770
771static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
772 struct rds_iw_mr *ibmr,
773 struct scatterlist *sg,
774 unsigned int sg_len)
775{
776 struct rds_iw_device *rds_iwdev = pool->device;
777 struct rds_iw_mapping *mapping = &ibmr->mapping;
778 u64 *dma_pages;
779 int i, ret = 0;
780
781 rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
782
783 dma_pages = rds_iw_map_scatterlist(rds_iwdev,
784 &mapping->m_sg,
785 rds_iwdev->page_shift);
786 if (IS_ERR(dma_pages)) {
787 ret = PTR_ERR(dma_pages);
788 dma_pages = NULL;
789 goto out;
790 }
791
792 if (mapping->m_sg.dma_len > pool->max_message_size) {
793 ret = -EMSGSIZE;
794 goto out;
795 }
796
797 for (i = 0; i < mapping->m_sg.dma_npages; ++i)
798 ibmr->page_list->page_list[i] = dma_pages[i];
799
800 ret = rds_iw_rdma_build_fastreg(mapping);
801 if (ret)
802 goto out;
803
804 rds_iw_stats_inc(s_iw_rdma_mr_used);
805
806out:
807 kfree(dma_pages);
808
809 return ret;
810}
811
812/*
813 * "Free" a fastreg MR.
814 */
815static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
816 struct rds_iw_mr *ibmr)
817{
818 unsigned long flags;
819 int ret;
820
821 if (!ibmr->mapping.m_sg.dma_len)
822 return;
823
824 ret = rds_iw_rdma_fastreg_inv(ibmr);
825 if (ret)
826 return;
827
828 /* Try to post the LOCAL_INV WR to the queue. */
829 spin_lock_irqsave(&pool->list_lock, flags);
830
831 list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list);
832 atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned);
833 atomic_inc(&pool->dirty_count);
834
835 spin_unlock_irqrestore(&pool->list_lock, flags);
836}
837
838static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
839 struct list_head *unmap_list,
840 struct list_head *kill_list)
841{
842 struct rds_iw_mapping *mapping, *next;
843 unsigned int ncleaned = 0;
844 LIST_HEAD(laundered);
845
846 /* Batched invalidation of fastreg MRs.
847 * Why do we do it this way, even though we could pipeline unmap
848 * and remap? The reason is the application semantics - when the
849 * application requests an invalidation of MRs, it expects all
850 * previously released R_Keys to become invalid.
851 *
852 * If we implement MR reuse naively, we risk memory corruption
853 * (this has actually been observed). So the default behavior
854 * requires that an MR goes through an explicit unmap operation before
855 * we can reuse it.
856 *
857 * We could probably improve on this a little, by allowing immediate
858 * reuse of an MR on the same socket (e.g. you could add a small
859 * cache of unused MRs to struct rds_socket - GET_MR could grab one
860 * of these without requiring an explicit invalidate).
861 */
862 while (!list_empty(unmap_list)) {
863 unsigned long flags;
864
865 spin_lock_irqsave(&pool->list_lock, flags);
866 list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
867 list_move(&mapping->m_list, &laundered);
868 ncleaned++;
869 }
870 spin_unlock_irqrestore(&pool->list_lock, flags);
871 }
872
873 /* Move all laundered mappings back to the unmap list.
874 * We do not kill any WRs right now - it doesn't seem the
875 * fastreg API has a max_remap limit. */
876 list_splice_init(&laundered, unmap_list);
877
878 return ncleaned;
879}
880
881static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
882 struct rds_iw_mr *ibmr)
883{
884 if (ibmr->page_list)
885 ib_free_fast_reg_page_list(ibmr->page_list);
886 if (ibmr->mr)
887 ib_dereg_mr(ibmr->mr);
888}
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
new file mode 100644
index 000000000000..a1931f0027a2
--- /dev/null
+++ b/net/rds/iw_recv.c
@@ -0,0 +1,869 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/pci.h>
35#include <linux/dma-mapping.h>
36#include <rdma/rdma_cm.h>
37
38#include "rds.h"
39#include "iw.h"
40
41static struct kmem_cache *rds_iw_incoming_slab;
42static struct kmem_cache *rds_iw_frag_slab;
43static atomic_t rds_iw_allocation = ATOMIC_INIT(0);
44
45static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
46{
47 rdsdebug("frag %p page %p\n", frag, frag->f_page);
48 __free_page(frag->f_page);
49 frag->f_page = NULL;
50}
51
52static void rds_iw_frag_free(struct rds_page_frag *frag)
53{
54 rdsdebug("frag %p page %p\n", frag, frag->f_page);
55 BUG_ON(frag->f_page != NULL);
56 kmem_cache_free(rds_iw_frag_slab, frag);
57}
58
59/*
60 * We map a page at a time. Its fragments are posted in order. This
61 * is called in fragment order as the fragments get send completion events.
62 * Only the last frag in the page performs the unmapping.
63 *
64 * It's OK for ring cleanup to call this in whatever order it likes because
65 * DMA is not in flight and so we can unmap while other ring entries still
66 * hold page references in their frags.
67 */
68static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
69 struct rds_iw_recv_work *recv)
70{
71 struct rds_page_frag *frag = recv->r_frag;
72
73 rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
74 if (frag->f_mapped)
75 ib_dma_unmap_page(ic->i_cm_id->device,
76 frag->f_mapped,
77 RDS_FRAG_SIZE, DMA_FROM_DEVICE);
78 frag->f_mapped = 0;
79}
80
81void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
82{
83 struct rds_iw_recv_work *recv;
84 u32 i;
85
86 for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
87 struct ib_sge *sge;
88
89 recv->r_iwinc = NULL;
90 recv->r_frag = NULL;
91
92 recv->r_wr.next = NULL;
93 recv->r_wr.wr_id = i;
94 recv->r_wr.sg_list = recv->r_sge;
95 recv->r_wr.num_sge = RDS_IW_RECV_SGE;
96
97 sge = rds_iw_data_sge(ic, recv->r_sge);
98 sge->addr = 0;
99 sge->length = RDS_FRAG_SIZE;
100 sge->lkey = 0;
101
102 sge = rds_iw_header_sge(ic, recv->r_sge);
103 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
104 sge->length = sizeof(struct rds_header);
105 sge->lkey = 0;
106 }
107}
108
109static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
110 struct rds_iw_recv_work *recv)
111{
112 if (recv->r_iwinc) {
113 rds_inc_put(&recv->r_iwinc->ii_inc);
114 recv->r_iwinc = NULL;
115 }
116 if (recv->r_frag) {
117 rds_iw_recv_unmap_page(ic, recv);
118 if (recv->r_frag->f_page)
119 rds_iw_frag_drop_page(recv->r_frag);
120 rds_iw_frag_free(recv->r_frag);
121 recv->r_frag = NULL;
122 }
123}
124
125void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
126{
127 u32 i;
128
129 for (i = 0; i < ic->i_recv_ring.w_nr; i++)
130 rds_iw_recv_clear_one(ic, &ic->i_recvs[i]);
131
132 if (ic->i_frag.f_page)
133 rds_iw_frag_drop_page(&ic->i_frag);
134}
135
136static int rds_iw_recv_refill_one(struct rds_connection *conn,
137 struct rds_iw_recv_work *recv,
138 gfp_t kptr_gfp, gfp_t page_gfp)
139{
140 struct rds_iw_connection *ic = conn->c_transport_data;
141 dma_addr_t dma_addr;
142 struct ib_sge *sge;
143 int ret = -ENOMEM;
144
145 if (recv->r_iwinc == NULL) {
146 if (atomic_read(&rds_iw_allocation) >= rds_iw_sysctl_max_recv_allocation) {
147 rds_iw_stats_inc(s_iw_rx_alloc_limit);
148 goto out;
149 }
150 recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
151 kptr_gfp);
152 if (recv->r_iwinc == NULL)
153 goto out;
154 atomic_inc(&rds_iw_allocation);
155 INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
156 rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
157 }
158
159 if (recv->r_frag == NULL) {
160 recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
161 if (recv->r_frag == NULL)
162 goto out;
163 INIT_LIST_HEAD(&recv->r_frag->f_item);
164 recv->r_frag->f_page = NULL;
165 }
166
167 if (ic->i_frag.f_page == NULL) {
168 ic->i_frag.f_page = alloc_page(page_gfp);
169 if (ic->i_frag.f_page == NULL)
170 goto out;
171 ic->i_frag.f_offset = 0;
172 }
173
174 dma_addr = ib_dma_map_page(ic->i_cm_id->device,
175 ic->i_frag.f_page,
176 ic->i_frag.f_offset,
177 RDS_FRAG_SIZE,
178 DMA_FROM_DEVICE);
179 if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
180 goto out;
181
182 /*
183 * Once we get the RDS_PAGE_LAST_OFF frag, rds_iw_recv_unmap_page()
184 * must be called on this recv. This happens as completions hit
185 * in order or on connection shutdown.
186 */
187 recv->r_frag->f_page = ic->i_frag.f_page;
188 recv->r_frag->f_offset = ic->i_frag.f_offset;
189 recv->r_frag->f_mapped = dma_addr;
190
191 sge = rds_iw_data_sge(ic, recv->r_sge);
192 sge->addr = dma_addr;
193 sge->length = RDS_FRAG_SIZE;
194
195 sge = rds_iw_header_sge(ic, recv->r_sge);
196 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
197 sge->length = sizeof(struct rds_header);
198
199 get_page(recv->r_frag->f_page);
200
201 if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
202 ic->i_frag.f_offset += RDS_FRAG_SIZE;
203 } else {
204 put_page(ic->i_frag.f_page);
205 ic->i_frag.f_page = NULL;
206 ic->i_frag.f_offset = 0;
207 }
208
209 ret = 0;
210out:
211 return ret;
212}
213
214/*
215 * This tries to allocate and post unused work requests after making sure that
216 * they have all the allocations they need to queue received fragments into
217 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
218 * pairs don't go unmatched.
219 *
220 * -1 is returned if posting fails due to temporary resource exhaustion.
221 */
222int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
223 gfp_t page_gfp, int prefill)
224{
225 struct rds_iw_connection *ic = conn->c_transport_data;
226 struct rds_iw_recv_work *recv;
227 struct ib_recv_wr *failed_wr;
228 unsigned int posted = 0;
229 int ret = 0;
230 u32 pos;
231
232 while ((prefill || rds_conn_up(conn))
233 && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
234 if (pos >= ic->i_recv_ring.w_nr) {
235 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
236 pos);
237 ret = -EINVAL;
238 break;
239 }
240
241 recv = &ic->i_recvs[pos];
242 ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
243 if (ret) {
244 ret = -1;
245 break;
246 }
247
248 /* XXX when can this fail? */
249 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
250 rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv,
251 recv->r_iwinc, recv->r_frag->f_page,
252 (long) recv->r_frag->f_mapped, ret);
253 if (ret) {
254 rds_iw_conn_error(conn, "recv post on "
255 "%pI4 returned %d, disconnecting and "
256 "reconnecting\n", &conn->c_faddr,
257 ret);
258 ret = -1;
259 break;
260 }
261
262 posted++;
263 }
264
265 /* We're doing flow control - update the window. */
266 if (ic->i_flowctl && posted)
267 rds_iw_advertise_credits(conn, posted);
268
269 if (ret)
270 rds_iw_ring_unalloc(&ic->i_recv_ring, 1);
271 return ret;
272}
273
274void rds_iw_inc_purge(struct rds_incoming *inc)
275{
276 struct rds_iw_incoming *iwinc;
277 struct rds_page_frag *frag;
278 struct rds_page_frag *pos;
279
280 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
281 rdsdebug("purging iwinc %p inc %p\n", iwinc, inc);
282
283 list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) {
284 list_del_init(&frag->f_item);
285 rds_iw_frag_drop_page(frag);
286 rds_iw_frag_free(frag);
287 }
288}
289
290void rds_iw_inc_free(struct rds_incoming *inc)
291{
292 struct rds_iw_incoming *iwinc;
293
294 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
295
296 rds_iw_inc_purge(inc);
297 rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc);
298 BUG_ON(!list_empty(&iwinc->ii_frags));
299 kmem_cache_free(rds_iw_incoming_slab, iwinc);
300 atomic_dec(&rds_iw_allocation);
301 BUG_ON(atomic_read(&rds_iw_allocation) < 0);
302}
303
304int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
305 size_t size)
306{
307 struct rds_iw_incoming *iwinc;
308 struct rds_page_frag *frag;
309 struct iovec *iov = first_iov;
310 unsigned long to_copy;
311 unsigned long frag_off = 0;
312 unsigned long iov_off = 0;
313 int copied = 0;
314 int ret;
315 u32 len;
316
317 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
318 frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
319 len = be32_to_cpu(inc->i_hdr.h_len);
320
321 while (copied < size && copied < len) {
322 if (frag_off == RDS_FRAG_SIZE) {
323 frag = list_entry(frag->f_item.next,
324 struct rds_page_frag, f_item);
325 frag_off = 0;
326 }
327 while (iov_off == iov->iov_len) {
328 iov_off = 0;
329 iov++;
330 }
331
332 to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
333 to_copy = min_t(size_t, to_copy, size - copied);
334 to_copy = min_t(unsigned long, to_copy, len - copied);
335
336 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
337 "[%p, %lu] + %lu\n",
338 to_copy, iov->iov_base, iov->iov_len, iov_off,
339 frag->f_page, frag->f_offset, frag_off);
340
341 /* XXX needs + offset for multiple recvs per page */
342 ret = rds_page_copy_to_user(frag->f_page,
343 frag->f_offset + frag_off,
344 iov->iov_base + iov_off,
345 to_copy);
346 if (ret) {
347 copied = ret;
348 break;
349 }
350
351 iov_off += to_copy;
352 frag_off += to_copy;
353 copied += to_copy;
354 }
355
356 return copied;
357}
358
359/* ic starts out kzalloc()ed */
360void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
361{
362 struct ib_send_wr *wr = &ic->i_ack_wr;
363 struct ib_sge *sge = &ic->i_ack_sge;
364
365 sge->addr = ic->i_ack_dma;
366 sge->length = sizeof(struct rds_header);
367 sge->lkey = rds_iw_local_dma_lkey(ic);
368
369 wr->sg_list = sge;
370 wr->num_sge = 1;
371 wr->opcode = IB_WR_SEND;
372 wr->wr_id = RDS_IW_ACK_WR_ID;
373 wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
374}
375
376/*
377 * You'd think that with reliable IB connections you wouldn't need to ack
378 * messages that have been received. The problem is that IB hardware generates
379 * an ack message before it has DMAed the message into memory. This creates a
380 * potential message loss if the HCA is disabled for any reason between when it
381 * sends the ack and before the message is DMAed and processed. This is only a
382 * potential issue if another HCA is available for fail-over.
383 *
384 * When the remote host receives our ack they'll free the sent message from
385 * their send queue. To decrease the latency of this we always send an ack
386 * immediately after we've received messages.
387 *
388 * For simplicity, we only have one ack in flight at a time. This puts
389 * pressure on senders to have deep enough send queues to absorb the latency of
390 * a single ack frame being in flight. This might not be good enough.
391 *
392 * This is implemented by having a long-lived send_wr and sge which point to a
393 * statically allocated ack frame. This ack wr does not fall under the ring
394 * accounting that the tx and rx wrs do. The QP attribute specifically makes
395 * room for it beyond the ring size. Send completion notices its special
396 * wr_id and avoids working with the ring in that case.
397 */
398static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
399 int ack_required)
400{
401 rds_iw_set_64bit(&ic->i_ack_next, seq);
402 if (ack_required) {
403 smp_mb__before_clear_bit();
404 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
405 }
406}
407
408static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
409{
410 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
411 smp_mb__after_clear_bit();
412
413 return ic->i_ack_next;
414}
415
416static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
417{
418 struct rds_header *hdr = ic->i_ack;
419 struct ib_send_wr *failed_wr;
420 u64 seq;
421 int ret;
422
423 seq = rds_iw_get_ack(ic);
424
425 rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
426 rds_message_populate_header(hdr, 0, 0, 0);
427 hdr->h_ack = cpu_to_be64(seq);
428 hdr->h_credit = adv_credits;
429 rds_message_make_checksum(hdr);
430 ic->i_ack_queued = jiffies;
431
432 ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
433 if (unlikely(ret)) {
434 /* Failed to send. Release the WR, and
435 * force another ACK.
436 */
437 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
438 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
439
440 rds_iw_stats_inc(s_iw_ack_send_failure);
441 /* Need to finesse this later. */
442 BUG();
443 } else
444 rds_iw_stats_inc(s_iw_ack_sent);
445}
446
447/*
448 * There are 3 ways of getting acknowledgements to the peer:
449 * 1. We call rds_iw_attempt_ack from the recv completion handler
450 * to send an ACK-only frame.
451 * However, there can be only one such frame in the send queue
452 * at any time, so we may have to postpone it.
453 * 2. When another (data) packet is transmitted while there's
454 * an ACK in the queue, we piggyback the ACK sequence number
455 * on the data packet.
456 * 3. If the ACK WR is done sending, we get called from the
457 * send queue completion handler, and check whether there's
458 * another ACK pending (postponed because the WR was on the
459 * queue). If so, we transmit it.
460 *
461 * We maintain 2 variables:
462 * - i_ack_flags, which keeps track of whether the ACK WR
463 * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
464 * - i_ack_next, which is the last sequence number we received
465 *
466 * Potentially, send queue and receive queue handlers can run concurrently.
467 *
468 * Reconnecting complicates this picture just slightly. When we
469 * reconnect, we may be seeing duplicate packets. The peer
470 * is retransmitting them, because it hasn't seen an ACK for
471 * them. It is important that we ACK these.
472 *
473 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
474 * this flag set *MUST* be acknowledged immediately.
475 */
476
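/*
 * A toy, single-threaded model (not kernel code) of the two ACK flags
 * described above. It omits credits, sequence numbers and the atomic
 * bit operations, keeping only the "at most one ACK-only frame in
 * flight" rule; the names mirror the kernel flags but are plain booleans.
 */
#include <stdbool.h>
#include <stdio.h>

static bool ack_requested;      /* something still needs to be acked */
static bool ack_in_flight;      /* the single ACK WR is on the send queue */

static void attempt_ack(void)
{
        if (!ack_requested)
                return;
        if (ack_in_flight) {            /* postpone: only one ACK WR at a time */
                printf("ack delayed\n");
                return;
        }
        ack_in_flight = true;
        ack_requested = false;
        printf("ack posted\n");
}

static void ack_send_complete(void)
{
        ack_in_flight = false;          /* the ACK WR completed */
        attempt_ack();                  /* send any ACK postponed meanwhile */
}

int main(void)
{
        ack_requested = true;
        attempt_ack();                  /* "ack posted" */
        ack_requested = true;           /* more data arrived while the WR was out */
        attempt_ack();                  /* "ack delayed" */
        ack_send_complete();            /* "ack posted" again */
        return 0;
}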
477/*
478 * When we get here, we're called from the recv queue handler.
479 * Check whether we ought to transmit an ACK.
480 */
481void rds_iw_attempt_ack(struct rds_iw_connection *ic)
482{
483 unsigned int adv_credits;
484
485 if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
486 return;
487
488 if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
489 rds_iw_stats_inc(s_iw_ack_send_delayed);
490 return;
491 }
492
493 /* Can we get a send credit? */
494 if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) {
495 rds_iw_stats_inc(s_iw_tx_throttle);
496 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
497 return;
498 }
499
500 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
501 rds_iw_send_ack(ic, adv_credits);
502}
503
504/*
505 * We get here from the send completion handler, when the
506 * adapter tells us the ACK frame was sent.
507 */
508void rds_iw_ack_send_complete(struct rds_iw_connection *ic)
509{
510 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
511 rds_iw_attempt_ack(ic);
512}
513
514/*
515 * This is called by the regular xmit code when it wants to piggyback
516 * an ACK on an outgoing frame.
517 */
518u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic)
519{
520 if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
521 rds_iw_stats_inc(s_iw_ack_send_piggybacked);
522 return rds_iw_get_ack(ic);
523}
524
525/*
526 * It's kind of lame that we're copying from the posted receive pages into
527 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
528 * them. But receiving new congestion bitmaps should be a *rare* event, so
529 * hopefully we won't need to invest that complexity in making it more
530 * efficient. By copying we can share a simpler core with TCP which has to
531 * copy.
532 */
533static void rds_iw_cong_recv(struct rds_connection *conn,
534 struct rds_iw_incoming *iwinc)
535{
536 struct rds_cong_map *map;
537 unsigned int map_off;
538 unsigned int map_page;
539 struct rds_page_frag *frag;
540 unsigned long frag_off;
541 unsigned long to_copy;
542 unsigned long copied;
543 uint64_t uncongested = 0;
544 void *addr;
545
546 /* catch completely corrupt packets */
547 if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
548 return;
549
550 map = conn->c_fcong;
551 map_page = 0;
552 map_off = 0;
553
554 frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
555 frag_off = 0;
556
557 copied = 0;
558
559 while (copied < RDS_CONG_MAP_BYTES) {
560 uint64_t *src, *dst;
561 unsigned int k;
562
563 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
564 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
565
566 addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
567
568 src = addr + frag_off;
569 dst = (void *)map->m_page_addrs[map_page] + map_off;
570 for (k = 0; k < to_copy; k += 8) {
571 /* Record ports that became uncongested, ie
572 * bits that changed from 0 to 1. */
573 uncongested |= ~(*src) & *dst;
574 *dst++ = *src++;
575 }
576 kunmap_atomic(addr, KM_SOFTIRQ0);
577
578 copied += to_copy;
579
580 map_off += to_copy;
581 if (map_off == PAGE_SIZE) {
582 map_off = 0;
583 map_page++;
584 }
585
586 frag_off += to_copy;
587 if (frag_off == RDS_FRAG_SIZE) {
588 frag = list_entry(frag->f_item.next,
589 struct rds_page_frag, f_item);
590 frag_off = 0;
591 }
592 }
593
594 /* the congestion map is in little endian order */
595 uncongested = le64_to_cpu(uncongested);
596
597 rds_cong_map_updated(map, uncongested);
598}
599
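/*
 * Minimal standalone sketch (not part of this file) of the bit
 * arithmetic in rds_iw_cong_recv() above: for each 64-bit word,
 * "uncongested" accumulates the bits that are set in the stored map
 * word but clear in the incoming word, before the incoming word
 * overwrites the stored one. The two hex values are arbitrary examples.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t dst = 0xF0F0;          /* word from the stored congestion map */
        uint64_t src = 0x00F8;          /* corresponding word from the incoming map */
        uint64_t uncongested = 0;

        uncongested |= ~src & dst;      /* set before, clear now */
        dst = src;                      /* the incoming word replaces the stored one */

        printf("uncongested = 0x%llx, stored word now 0x%llx\n",
               (unsigned long long)uncongested,         /* prints 0xf000 */
               (unsigned long long)dst);                /* prints 0xf8 */
        return 0;
}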
600/*
601 * Rings are posted with all the allocations they'll need to queue the
602 * incoming message to the receiving socket so this can't fail.
603 * All fragments start with a header, so we can make sure we're not receiving
604 * garbage, and we can tell a small 8 byte fragment from an ACK frame.
605 */
606struct rds_iw_ack_state {
607 u64 ack_next;
608 u64 ack_recv;
609 unsigned int ack_required:1;
610 unsigned int ack_next_valid:1;
611 unsigned int ack_recv_valid:1;
612};
613
614static void rds_iw_process_recv(struct rds_connection *conn,
615 struct rds_iw_recv_work *recv, u32 byte_len,
616 struct rds_iw_ack_state *state)
617{
618 struct rds_iw_connection *ic = conn->c_transport_data;
619 struct rds_iw_incoming *iwinc = ic->i_iwinc;
620 struct rds_header *ihdr, *hdr;
621
622 /* XXX shut down the connection if port 0,0 are seen? */
623
624 rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv,
625 byte_len);
626
627 if (byte_len < sizeof(struct rds_header)) {
628 rds_iw_conn_error(conn, "incoming message "
629 "from %pI4 didn't inclue a "
630 "header, disconnecting and "
631 "reconnecting\n",
632 &conn->c_faddr);
633 return;
634 }
635 byte_len -= sizeof(struct rds_header);
636
637 ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
638
639 /* Validate the checksum. */
640 if (!rds_message_verify_checksum(ihdr)) {
641 rds_iw_conn_error(conn, "incoming message "
642 "from %pI4 has corrupted header - "
643 "forcing a reconnect\n",
644 &conn->c_faddr);
645 rds_stats_inc(s_recv_drop_bad_checksum);
646 return;
647 }
648
649 /* Process the ACK sequence which comes with every packet */
650 state->ack_recv = be64_to_cpu(ihdr->h_ack);
651 state->ack_recv_valid = 1;
652
653 /* Process the credits update if there was one */
654 if (ihdr->h_credit)
655 rds_iw_send_add_credits(conn, ihdr->h_credit);
656
657 if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
658 /* This is an ACK-only packet. It gets special
659 * treatment here because, historically, ACKs
660 * were rather special beasts.
661 */
662 rds_iw_stats_inc(s_iw_ack_received);
663
664 /*
665 * Usually the frags make their way on to incs and are then freed as
666 * the inc is freed. We don't go that route, so we have to drop the
667 * page ref ourselves. We can't just leave the page on the recv
668 * because that confuses the dma mapping of pages and each recv's use
669 * of a partial page. We can leave the frag, though; it will be
670 * reused.
671 *
672 * FIXME: Fold this into the code path below.
673 */
674 rds_iw_frag_drop_page(recv->r_frag);
675 return;
676 }
677
678 /*
679 * If we don't already have an inc on the connection then this
680 * fragment has a header and starts a message. Copy its header
681 * into the inc and save the inc so we can hang upcoming fragments
682 * off its list.
683 */
684 if (iwinc == NULL) {
685 iwinc = recv->r_iwinc;
686 recv->r_iwinc = NULL;
687 ic->i_iwinc = iwinc;
688
689 hdr = &iwinc->ii_inc.i_hdr;
690 memcpy(hdr, ihdr, sizeof(*hdr));
691 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
692
693 rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc,
694 ic->i_recv_data_rem, hdr->h_flags);
695 } else {
696 hdr = &iwinc->ii_inc.i_hdr;
697 /* We can't just use memcmp here; fragments of a
698 * single message may carry different ACKs */
699 if (hdr->h_sequence != ihdr->h_sequence
700 || hdr->h_len != ihdr->h_len
701 || hdr->h_sport != ihdr->h_sport
702 || hdr->h_dport != ihdr->h_dport) {
703 rds_iw_conn_error(conn,
704 "fragment header mismatch; forcing reconnect\n");
705 return;
706 }
707 }
708
709 list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags);
710 recv->r_frag = NULL;
711
712 if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
713 ic->i_recv_data_rem -= RDS_FRAG_SIZE;
714 else {
715 ic->i_recv_data_rem = 0;
716 ic->i_iwinc = NULL;
717
718 if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
719 rds_iw_cong_recv(conn, iwinc);
720 else {
721 rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
722 &iwinc->ii_inc, GFP_ATOMIC,
723 KM_SOFTIRQ0);
724 state->ack_next = be64_to_cpu(hdr->h_sequence);
725 state->ack_next_valid = 1;
726 }
727
728 /* Evaluate the ACK_REQUIRED flag *after* we received
729 * the complete frame, and after bumping the next_rx
730 * sequence. */
731 if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
732 rds_stats_inc(s_recv_ack_required);
733 state->ack_required = 1;
734 }
735
736 rds_inc_put(&iwinc->ii_inc);
737 }
738}
739
740/*
741 * Plucking the oldest entry from the ring can be done concurrently with
742 * the thread refilling the ring. Each ring operation is protected by
743 * spinlocks and the transient state of refilling doesn't change the
744 * recording of which entry is oldest.
745 *
746 * This relies on IB only calling one cq comp_handler for each cq so that
747 * there will only be one caller of rds_recv_incoming() per RDS connection.
748 */
749void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
750{
751 struct rds_connection *conn = context;
752 struct rds_iw_connection *ic = conn->c_transport_data;
753 struct ib_wc wc;
754 struct rds_iw_ack_state state = { 0, };
755 struct rds_iw_recv_work *recv;
756
757 rdsdebug("conn %p cq %p\n", conn, cq);
758
759 rds_iw_stats_inc(s_iw_rx_cq_call);
760
761 ib_req_notify_cq(cq, IB_CQ_SOLICITED);
762
763 while (ib_poll_cq(cq, 1, &wc) > 0) {
764 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
765 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
766 be32_to_cpu(wc.ex.imm_data));
767 rds_iw_stats_inc(s_iw_rx_cq_event);
768
769 recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)];
770
771 rds_iw_recv_unmap_page(ic, recv);
772
773 /*
774 * Also process recvs in connecting state because it is possible
775 * to get a recv completion _before_ the rdmacm ESTABLISHED
776 * event is processed.
777 */
778 if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
779 /* We expect errors as the qp is drained during shutdown */
780 if (wc.status == IB_WC_SUCCESS) {
781 rds_iw_process_recv(conn, recv, wc.byte_len, &state);
782 } else {
783 rds_iw_conn_error(conn, "recv completion on "
784 "%pI4 had status %u, disconnecting and "
785 "reconnecting\n", &conn->c_faddr,
786 wc.status);
787 }
788 }
789
790 rds_iw_ring_free(&ic->i_recv_ring, 1);
791 }
792
793 if (state.ack_next_valid)
794 rds_iw_set_ack(ic, state.ack_next, state.ack_required);
795 if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
796 rds_send_drop_acked(conn, state.ack_recv, NULL);
797 ic->i_ack_recv = state.ack_recv;
798 }
799 if (rds_conn_up(conn))
800 rds_iw_attempt_ack(ic);
801
802 /* If we ever end up with a really empty receive ring, we're
803 * in deep trouble, as the sender will definitely see RNR
804 * timeouts. */
805 if (rds_iw_ring_empty(&ic->i_recv_ring))
806 rds_iw_stats_inc(s_iw_rx_ring_empty);
807
808 /*
809 * If the ring is running low, then schedule the thread to refill.
810 */
811 if (rds_iw_ring_low(&ic->i_recv_ring))
812 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
813}
814
815int rds_iw_recv(struct rds_connection *conn)
816{
817 struct rds_iw_connection *ic = conn->c_transport_data;
818 int ret = 0;
819
820 rdsdebug("conn %p\n", conn);
821
822 /*
823 * If we get a temporary posting failure in this context then
824 * we're really low and we want the caller to back off for a bit.
825 */
826 mutex_lock(&ic->i_recv_mutex);
827 if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
828 ret = -ENOMEM;
829 else
830 rds_iw_stats_inc(s_iw_rx_refill_from_thread);
831 mutex_unlock(&ic->i_recv_mutex);
832
833 if (rds_conn_up(conn))
834 rds_iw_attempt_ack(ic);
835
836 return ret;
837}
838
839int __init rds_iw_recv_init(void)
840{
841 struct sysinfo si;
842 int ret = -ENOMEM;
843
844 /* Default to roughly a third of all available RAM for recv memory */
845 si_meminfo(&si);
846 rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
847
848 rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
849 sizeof(struct rds_iw_incoming),
850 0, 0, NULL);
851 if (rds_iw_incoming_slab == NULL)
852 goto out;
853
854 rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
855 sizeof(struct rds_page_frag),
856 0, 0, NULL);
857 if (rds_iw_frag_slab == NULL)
858 kmem_cache_destroy(rds_iw_incoming_slab);
859 else
860 ret = 0;
861out:
862 return ret;
863}
864
865void rds_iw_recv_exit(void)
866{
867 kmem_cache_destroy(rds_iw_incoming_slab);
868 kmem_cache_destroy(rds_iw_frag_slab);
869}
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c
new file mode 100644
index 000000000000..d422d4b5deef
--- /dev/null
+++ b/net/rds/iw_ring.c
@@ -0,0 +1,169 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "iw.h"
37
38/*
39 * Locking for IB rings.
40 * We assume that allocation is always protected by a mutex
41 * in the caller (this is a valid assumption for the current
42 * implementation).
43 *
44 * Freeing always happens in an interrupt, and hence only
45 * races with allocations, but not with other free()s.
46 *
47 * The interaction between allocation and freeing is that
48 * the alloc code has to determine the number of free entries.
49 * To this end, we maintain two counters: an allocation counter
50 * and a free counter. Both are allowed to run freely, and wrap
51 * around.
52 * The number of used entries is always (alloc_ctr - free_ctr), which never exceeds NR.
53 *
54 * The current implementation makes free_ctr atomic. When the
55 * caller finds an allocation fails, it should set an "alloc fail"
56 * bit and retry the allocation. The "alloc fail" bit essentially tells
57 * the CQ completion handlers to wake it up after freeing some
58 * more entries.
59 */
60
61/*
62 * This only happens on shutdown.
63 */
64DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait);
65
66void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr)
67{
68 memset(ring, 0, sizeof(*ring));
69 ring->w_nr = nr;
70 rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
71}
72
73static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring)
74{
75 u32 diff;
76
77 /* This assumes that atomic_t has at least as many bits as u32 */
78 diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
79 BUG_ON(diff > ring->w_nr);
80
81 return diff;
82}
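
The free-running counter scheme described in the comment above can be demonstrated in a few lines of standalone userspace C; the struct and names below are invented for the sketch. The point is only that unsigned 32-bit subtraction keeps yielding the in-use count even after both counters wrap, which is what __rds_iw_ring_used() relies on.

#include <assert.h>
#include <stdint.h>

/*
 * Standalone illustration, not kernel code: both counters run freely and
 * wrap, yet the unsigned 32-bit subtraction still gives the number of
 * in-use entries, because the ring never holds anywhere near UINT32_MAX
 * outstanding allocations.
 */
struct toy_ring {
	uint32_t nr;		/* ring size */
	uint32_t alloc_ctr;	/* bumped on every allocation */
	uint32_t free_ctr;	/* bumped on every completion */
};

static uint32_t toy_ring_used(const struct toy_ring *ring)
{
	return ring->alloc_ctr - ring->free_ctr;	/* wraps safely */
}

int main(void)
{
	struct toy_ring ring = { .nr = 8 };

	/* park both counters just below the 32-bit wrap point */
	ring.alloc_ctr = UINT32_MAX - 1;
	ring.free_ctr = UINT32_MAX - 1;

	ring.alloc_ctr += 5;	/* allocate 5 entries; the counter wraps past 0 */
	ring.free_ctr += 2;	/* complete 2 of them */

	assert(toy_ring_used(&ring) == 3);
	assert(toy_ring_used(&ring) <= ring.nr);
	return 0;
}
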
83
84void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr)
85{
86 /* We only ever get called from the connection setup code,
87 * prior to creating the QP. */
88 BUG_ON(__rds_iw_ring_used(ring));
89 ring->w_nr = nr;
90}
91
92static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring)
93{
94 return __rds_iw_ring_used(ring) == 0;
95}
96
97u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos)
98{
99 u32 ret = 0, avail;
100
101 avail = ring->w_nr - __rds_iw_ring_used(ring);
102
103 rdsdebug("ring %p val %u next %u free %u\n", ring, val,
104 ring->w_alloc_ptr, avail);
105
106 if (val && avail) {
107 ret = min(val, avail);
108 *pos = ring->w_alloc_ptr;
109
110 ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
111 ring->w_alloc_ctr += ret;
112 }
113
114 return ret;
115}
116
117void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val)
118{
119 ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
120 atomic_add(val, &ring->w_free_ctr);
121
122 if (__rds_iw_ring_empty(ring) &&
123 waitqueue_active(&rds_iw_ring_empty_wait))
124 wake_up(&rds_iw_ring_empty_wait);
125}
126
127void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val)
128{
129 ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
130 ring->w_alloc_ctr -= val;
131}
132
133int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
134{
135 return __rds_iw_ring_empty(ring);
136}
137
138int rds_iw_ring_low(struct rds_iw_work_ring *ring)
139{
140 return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2);
141}
142
143
144/*
145 * returns the oldest alloced ring entry. This will be the next one
146 * freed. This can't be called if there are none allocated.
147 */
148u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring)
149{
150 return ring->w_free_ptr;
151}
152
153/*
154 * returns the number of completed work requests.
155 */
156
157u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest)
158{
159 u32 ret;
160
161 if (oldest <= (unsigned long long)wr_id)
162 ret = (unsigned long long)wr_id - oldest + 1;
163 else
164 ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
165
166 rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
167 wr_id, oldest);
168 return ret;
169}
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
new file mode 100644
index 000000000000..22dd38ffd608
--- /dev/null
+++ b/net/rds/iw_send.c
@@ -0,0 +1,975 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/device.h>
36#include <linux/dmapool.h>
37
38#include "rds.h"
39#include "rdma.h"
40#include "iw.h"
41
42static void rds_iw_send_rdma_complete(struct rds_message *rm,
43 int wc_status)
44{
45 int notify_status;
46
47 switch (wc_status) {
48 case IB_WC_WR_FLUSH_ERR:
49 return;
50
51 case IB_WC_SUCCESS:
52 notify_status = RDS_RDMA_SUCCESS;
53 break;
54
55 case IB_WC_REM_ACCESS_ERR:
56 notify_status = RDS_RDMA_REMOTE_ERROR;
57 break;
58
59 default:
60 notify_status = RDS_RDMA_OTHER_ERROR;
61 break;
62 }
63 rds_rdma_send_complete(rm, notify_status);
64}
65
66static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
67 struct rds_rdma_op *op)
68{
69 if (op->r_mapped) {
70 ib_dma_unmap_sg(ic->i_cm_id->device,
71 op->r_sg, op->r_nents,
72 op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
73 op->r_mapped = 0;
74 }
75}
76
77static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
78 struct rds_iw_send_work *send,
79 int wc_status)
80{
81 struct rds_message *rm = send->s_rm;
82
83 rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
84
85 ib_dma_unmap_sg(ic->i_cm_id->device,
86 rm->m_sg, rm->m_nents,
87 DMA_TO_DEVICE);
88
89 if (rm->m_rdma_op != NULL) {
90 rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
91
92 /* If the user asked for a completion notification on this
93 * message, we can implement three different semantics:
94 * 1. Notify when we received the ACK on the RDS message
95 * that was queued with the RDMA. This provides reliable
96 * notification of RDMA status at the expense of a one-way
97 * packet delay.
98 * 2. Notify when the IB stack gives us the completion event for
99 * the RDMA operation.
100 * 3. Notify when the IB stack gives us the completion event for
101 * the accompanying RDS messages.
102 * Here, we implement approach #3. To implement approach #2,
103 * call rds_rdma_send_complete from the cq_handler. To implement #1,
104 * don't call rds_rdma_send_complete at all, and fall back to the notify
105 * handling in the ACK processing code.
106 *
107 * Note: There's no need to explicitly sync any RDMA buffers using
108 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
109 * operation itself unmapped the RDMA buffers, which takes care
110 * of synching.
111 */
112 rds_iw_send_rdma_complete(rm, wc_status);
113
114 if (rm->m_rdma_op->r_write)
115 rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
116 else
117 rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
118 }
119
120 /* If anyone waited for this message to get flushed out, wake
121 * them up now */
122 rds_message_unmapped(rm);
123
124 rds_message_put(rm);
125 send->s_rm = NULL;
126}
127
128void rds_iw_send_init_ring(struct rds_iw_connection *ic)
129{
130 struct rds_iw_send_work *send;
131 u32 i;
132
133 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
134 struct ib_sge *sge;
135
136 send->s_rm = NULL;
137 send->s_op = NULL;
138 send->s_mapping = NULL;
139
140 send->s_wr.next = NULL;
141 send->s_wr.wr_id = i;
142 send->s_wr.sg_list = send->s_sge;
143 send->s_wr.num_sge = 1;
144 send->s_wr.opcode = IB_WR_SEND;
145 send->s_wr.send_flags = 0;
146 send->s_wr.ex.imm_data = 0;
147
148 sge = rds_iw_data_sge(ic, send->s_sge);
149 sge->lkey = 0;
150
151 sge = rds_iw_header_sge(ic, send->s_sge);
152 sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
153 sge->length = sizeof(struct rds_header);
154 sge->lkey = 0;
155
156 send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size);
157 if (IS_ERR(send->s_mr)) {
158 printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n");
159 break;
160 }
161
162 send->s_page_list = ib_alloc_fast_reg_page_list(
163 ic->i_cm_id->device, fastreg_message_size);
164 if (IS_ERR(send->s_page_list)) {
165 printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n");
166 break;
167 }
168 }
169}
170
171void rds_iw_send_clear_ring(struct rds_iw_connection *ic)
172{
173 struct rds_iw_send_work *send;
174 u32 i;
175
176 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
177 BUG_ON(!send->s_mr);
178 ib_dereg_mr(send->s_mr);
179 BUG_ON(!send->s_page_list);
180 ib_free_fast_reg_page_list(send->s_page_list);
181 if (send->s_wr.opcode == 0xdead)
182 continue;
183 if (send->s_rm)
184 rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
185 if (send->s_op)
186 rds_iw_send_unmap_rdma(ic, send->s_op);
187 }
188}
189
190/*
191 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
192 * operations performed in the send path. As the sender allocs and potentially
193 * unallocs the next free entry in the ring it doesn't alter which is
194 * the next to be freed, which is what this is concerned with.
195 */
196void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
197{
198 struct rds_connection *conn = context;
199 struct rds_iw_connection *ic = conn->c_transport_data;
200 struct ib_wc wc;
201 struct rds_iw_send_work *send;
202 u32 completed;
203 u32 oldest;
204 u32 i;
205 int ret;
206
207 rdsdebug("cq %p conn %p\n", cq, conn);
208 rds_iw_stats_inc(s_iw_tx_cq_call);
209 ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
210 if (ret)
211 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
212
213 while (ib_poll_cq(cq, 1, &wc) > 0) {
214 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
215 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
216 be32_to_cpu(wc.ex.imm_data));
217 rds_iw_stats_inc(s_iw_tx_cq_event);
218
219 if (wc.status != IB_WC_SUCCESS) {
220 printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode);
221 break;
222 }
223
224 if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) {
225 ic->i_fastreg_posted = 0;
226 continue;
227 }
228
229 if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) {
230 ic->i_fastreg_posted = 1;
231 continue;
232 }
233
234 if (wc.wr_id == RDS_IW_ACK_WR_ID) {
235 if (ic->i_ack_queued + HZ/2 < jiffies)
236 rds_iw_stats_inc(s_iw_tx_stalled);
237 rds_iw_ack_send_complete(ic);
238 continue;
239 }
240
241 oldest = rds_iw_ring_oldest(&ic->i_send_ring);
242
243 completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
244
245 for (i = 0; i < completed; i++) {
246 send = &ic->i_sends[oldest];
247
248 /* In the error case, wc.opcode sometimes contains garbage */
249 switch (send->s_wr.opcode) {
250 case IB_WR_SEND:
251 if (send->s_rm)
252 rds_iw_send_unmap_rm(ic, send, wc.status);
253 break;
254 case IB_WR_FAST_REG_MR:
255 case IB_WR_RDMA_WRITE:
256 case IB_WR_RDMA_READ:
257 case IB_WR_RDMA_READ_WITH_INV:
258 /* Nothing to be done - the SG list will be unmapped
259 * when the SEND completes. */
260 break;
261 default:
262 if (printk_ratelimit())
263 printk(KERN_NOTICE
264 "RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
265 __func__, send->s_wr.opcode);
266 break;
267 }
268
269 send->s_wr.opcode = 0xdead;
270 send->s_wr.num_sge = 1;
271 if (send->s_queued + HZ/2 < jiffies)
272 rds_iw_stats_inc(s_iw_tx_stalled);
273
274 /* If an RDMA operation produced an error, signal this right
275 * away. If we don't, the subsequent SEND that goes with this
276 * RDMA will be canceled with ERR_WFLUSH, and the application
277 * will never learn that the RDMA failed. */
278 if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
279 struct rds_message *rm;
280
281 rm = rds_send_get_message(conn, send->s_op);
282 if (rm)
283 rds_iw_send_rdma_complete(rm, wc.status);
284 }
285
286 oldest = (oldest + 1) % ic->i_send_ring.w_nr;
287 }
288
289 rds_iw_ring_free(&ic->i_send_ring, completed);
290
291 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
292 || test_bit(0, &conn->c_map_queued))
293 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
294
295 /* We expect errors as the qp is drained during shutdown */
296 if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
297 rds_iw_conn_error(conn,
298 "send completion on %pI4 "
299 "had status %u, disconnecting and reconnecting\n",
300 &conn->c_faddr, wc.status);
301 }
302 }
303}
304
305/*
306 * This is the main function for allocating credits when sending
307 * messages.
308 *
309 * Conceptually, we have two counters:
310 * - send credits: this tells us how many WRs we're allowed
311 * to submit without overrunning the receiver's queue. For
312 * each SEND WR we post, we decrement this by one.
313 *
314 * - posted credits: this tells us how many WRs we recently
315 * posted to the receive queue. This value is transferred
316 * to the peer as a "credit update" in a RDS header field.
317 * Every time we transmit credits to the peer, we subtract
318 * the amount of transferred credits from this counter.
319 *
320 * It is essential that we avoid situations where both sides have
321 * exhausted their send credits, and are unable to send new credits
322 * to the peer. We achieve this by requiring that we send at least
323 * one credit update to the peer before exhausting our credits.
324 * When new credits arrive, we subtract one credit that is withheld
325 * until we've posted new buffers and are ready to transmit these
326 * credits (see rds_iw_send_add_credits below).
327 *
328 * The RDS send code is essentially single-threaded; rds_send_xmit
329 * grabs c_send_lock to ensure exclusive access to the send ring.
330 * However, the ACK sending code is independent and can race with
331 * message SENDs.
332 *
333 * In the send path, we need to update the counters for send credits
334 * and the counter of posted buffers atomically - when we use the
335 * last available credit, we cannot allow another thread to race us
336 * and grab the posted credits counter. Hence, we have to use a
337 * spinlock to protect the credit counter, or use atomics.
338 *
339 * Spinlocks shared between the send and the receive path are bad,
340 * because they create unnecessary delays. An early implementation
341 * using a spinlock showed a 5% degradation in throughput at some
342 * loads.
343 *
344 * This implementation avoids spinlocks completely, putting both
345 * counters into a single atomic, and updating that atomic using
346 * atomic_add (in the receive path, when receiving fresh credits),
347 * and using atomic_cmpxchg when updating the two counters.
348 */
349int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
350 u32 wanted, u32 *adv_credits, int need_posted)
351{
352 unsigned int avail, posted, got = 0, advertise;
353 long oldval, newval;
354
355 *adv_credits = 0;
356 if (!ic->i_flowctl)
357 return wanted;
358
359try_again:
360 advertise = 0;
361 oldval = newval = atomic_read(&ic->i_credits);
362 posted = IB_GET_POST_CREDITS(oldval);
363 avail = IB_GET_SEND_CREDITS(oldval);
364
365 rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n",
366 wanted, avail, posted);
367
368 /* The last credit must be used to send a credit update. */
369 if (avail && !posted)
370 avail--;
371
372 if (avail < wanted) {
373 struct rds_connection *conn = ic->i_cm_id->context;
374
375 /* Oops, there aren't that many credits left! */
376 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
377 got = avail;
378 } else {
379 /* Sometimes you get what you want, lalala. */
380 got = wanted;
381 }
382 newval -= IB_SET_SEND_CREDITS(got);
383
384 /*
385 * If need_posted is non-zero, then the caller wants the
386 * posted credits advertised regardless of whether any send
387 * credits are available.
388 */
389 if (posted && (got || need_posted)) {
390 advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
391 newval -= IB_SET_POST_CREDITS(advertise);
392 }
393
394 /* Finally bill everything */
395 if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
396 goto try_again;
397
398 *adv_credits = advertise;
399 return got;
400}
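
A standalone sketch (userspace C11, not kernel code) of the lock-free credit scheme described above: both counters live in one atomic word and are consumed with a compare-exchange retry loop, mirroring the try_again loop in rds_iw_send_grab_credits(). The 16/16-bit packing below is an assumption made for this sketch and need not match the driver's IB_SET_ and IB_GET_ helpers.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Standalone illustration, not kernel code: two credit counters packed
 * into one atomic word and consumed with a compare-exchange retry loop.
 * The 16/16-bit split is an assumption made for this sketch.
 */
#define TOY_SEND_CREDITS(v)	((uint32_t)(v) & 0xffff)
#define TOY_POST_CREDITS(v)	(((uint32_t)(v) & 0xffff) << 16)
#define TOY_GET_SEND(v)		((v) & 0xffff)
#define TOY_GET_POST(v)		(((v) >> 16) & 0xffff)

static uint32_t toy_grab_credits(_Atomic uint32_t *credits, uint32_t wanted)
{
	uint32_t oldval, newval, avail, got;

	do {
		oldval = atomic_load(credits);
		avail = TOY_GET_SEND(oldval);
		got = wanted < avail ? wanted : avail;
		newval = oldval - TOY_SEND_CREDITS(got);
	} while (!atomic_compare_exchange_weak(credits, &oldval, newval));

	return got;
}

int main(void)
{
	_Atomic uint32_t credits = TOY_SEND_CREDITS(10) | TOY_POST_CREDITS(3);
	uint32_t got = toy_grab_credits(&credits, 4);

	/* prints: got 4, send credits left 6, posted 3 */
	printf("got %u, send credits left %u, posted %u\n",
	       (unsigned)got,
	       (unsigned)TOY_GET_SEND(atomic_load(&credits)),
	       (unsigned)TOY_GET_POST(atomic_load(&credits)));
	return 0;
}
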
401
402void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits)
403{
404 struct rds_iw_connection *ic = conn->c_transport_data;
405
406 if (credits == 0)
407 return;
408
409 rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n",
410 credits,
411 IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
412 test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
413
414 atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
415 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
416 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
417
418 WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
419
420 rds_iw_stats_inc(s_iw_rx_credit_updates);
421}
422
423void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted)
424{
425 struct rds_iw_connection *ic = conn->c_transport_data;
426
427 if (posted == 0)
428 return;
429
430 atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
431
432 /* Decide whether to send an update to the peer now.
433 * If we would send a credit update for every single buffer we
434 * post, we would end up with an ACK storm (ACK arrives,
435 * consumes buffer, we refill the ring, send ACK to remote
436 * advertising the newly posted buffer... ad inf)
437 *
438 * Performance pretty much depends on how often we send
439 * credit updates - too frequent updates mean lots of ACKs.
440 * Too infrequent updates, and the peer will run out of
441 * credits and will have to throttle.
442 * For the time being, 16 seems to be a good compromise.
443 */
444 if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
445 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
446}
447
448static inline void
449rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
450 struct rds_iw_send_work *send, unsigned int pos,
451 unsigned long buffer, unsigned int length,
452 int send_flags)
453{
454 struct ib_sge *sge;
455
456 WARN_ON(pos != send - ic->i_sends);
457
458 send->s_wr.send_flags = send_flags;
459 send->s_wr.opcode = IB_WR_SEND;
460 send->s_wr.num_sge = 2;
461 send->s_wr.next = NULL;
462 send->s_queued = jiffies;
463 send->s_op = NULL;
464
465 if (length != 0) {
466 sge = rds_iw_data_sge(ic, send->s_sge);
467 sge->addr = buffer;
468 sge->length = length;
469 sge->lkey = rds_iw_local_dma_lkey(ic);
470
471 sge = rds_iw_header_sge(ic, send->s_sge);
472 } else {
473 /* We're sending a packet with no payload. There is only
474 * one SGE */
475 send->s_wr.num_sge = 1;
476 sge = &send->s_sge[0];
477 }
478
479 sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
480 sge->length = sizeof(struct rds_header);
481 sge->lkey = rds_iw_local_dma_lkey(ic);
482}
483
484/*
485 * This can be called multiple times for a given message. The first time
486 * we see a message we map its scatterlist into the IB device so that
487 * we can provide that mapped address to the IB scatter gather entries
488 * in the IB work requests. We translate the scatterlist into a series
489 * of work requests that fragment the message. These work requests complete
490 * in order so we pass ownership of the message to the completion handler
491 * once we send the final fragment.
492 *
493 * The RDS core uses the c_send_lock to only enter this function once
494 * per connection. This makes sure that the tx ring alloc/unalloc pairs
495 * don't get out of sync and confuse the ring.
496 */
497int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
498 unsigned int hdr_off, unsigned int sg, unsigned int off)
499{
500 struct rds_iw_connection *ic = conn->c_transport_data;
501 struct ib_device *dev = ic->i_cm_id->device;
502 struct rds_iw_send_work *send = NULL;
503 struct rds_iw_send_work *first;
504 struct rds_iw_send_work *prev;
505 struct ib_send_wr *failed_wr;
506 struct scatterlist *scat;
507 u32 pos;
508 u32 i;
509 u32 work_alloc;
510 u32 credit_alloc;
511 u32 posted;
512 u32 adv_credits = 0;
513 int send_flags = 0;
514 int sent;
515 int ret;
516 int flow_controlled = 0;
517
518 BUG_ON(off % RDS_FRAG_SIZE);
519 BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
520
521 /* Fastreg support */
522 if (rds_rdma_cookie_key(rm->m_rdma_cookie)
523 && !ic->i_fastreg_posted) {
524 ret = -EAGAIN;
525 goto out;
526 }
527
528 /* FIXME we may overallocate here */
529 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
530 i = 1;
531 else
532 i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
533
534 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
535 if (work_alloc == 0) {
536 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
537 rds_iw_stats_inc(s_iw_tx_ring_full);
538 ret = -ENOMEM;
539 goto out;
540 }
541
542 credit_alloc = work_alloc;
543 if (ic->i_flowctl) {
544 credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0);
545 adv_credits += posted;
546 if (credit_alloc < work_alloc) {
547 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
548 work_alloc = credit_alloc;
549 flow_controlled++;
550 }
551 if (work_alloc == 0) {
552 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
553 rds_iw_stats_inc(s_iw_tx_throttle);
554 ret = -ENOMEM;
555 goto out;
556 }
557 }
558
559 /* map the message the first time we see it */
560 if (ic->i_rm == NULL) {
561 /*
562 printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
563 be16_to_cpu(rm->m_inc.i_hdr.h_dport),
564 rm->m_inc.i_hdr.h_flags,
565 be32_to_cpu(rm->m_inc.i_hdr.h_len));
566 */
567 if (rm->m_nents) {
568 rm->m_count = ib_dma_map_sg(dev,
569 rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
570 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
571 if (rm->m_count == 0) {
572 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
573 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
574 ret = -ENOMEM; /* XXX ? */
575 goto out;
576 }
577 } else {
578 rm->m_count = 0;
579 }
580
581 ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
582 ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
583 rds_message_addref(rm);
584 ic->i_rm = rm;
585
586 /* Finalize the header */
587 if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
588 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
589 if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
590 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
591
592 /* If it has an RDMA op, tell the peer we did it. This is
593 * used by the peer to release use-once RDMA MRs. */
594 if (rm->m_rdma_op) {
595 struct rds_ext_header_rdma ext_hdr;
596
597 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
598 rds_message_add_extension(&rm->m_inc.i_hdr,
599 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
600 }
601 if (rm->m_rdma_cookie) {
602 rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
603 rds_rdma_cookie_key(rm->m_rdma_cookie),
604 rds_rdma_cookie_offset(rm->m_rdma_cookie));
605 }
606
607 /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so
608 * we should not do this unless we have a chance of at least
609 * sticking the header into the send ring. Which is why we
610 * should call rds_iw_ring_alloc first. */
611 rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
612 rds_message_make_checksum(&rm->m_inc.i_hdr);
613
614 /*
615 * Update adv_credits since we reset the ACK_REQUIRED bit.
616 */
617 rds_iw_send_grab_credits(ic, 0, &posted, 1);
618 adv_credits += posted;
619 BUG_ON(adv_credits > 255);
620 } else if (ic->i_rm != rm)
621 BUG();
622
623 send = &ic->i_sends[pos];
624 first = send;
625 prev = NULL;
626 scat = &rm->m_sg[sg];
627 sent = 0;
628 i = 0;
629
630 /* Sometimes you want to put a fence between an RDMA
631 * READ and the following SEND.
632 * We could either do this all the time
633 * or when requested by the user. Right now, we let
634 * the application choose.
635 */
636 if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
637 send_flags = IB_SEND_FENCE;
638
639 /*
640 * We could be copying the header into the unused tail of the page.
641 * That would need to be changed in the future when those pages might
642 * be mapped userspace pages or page cache pages. So instead we always
643 * use a second sge and our long-lived ring of mapped headers. We send
644 * the header after the data so that the data payload can be aligned on
645 * the receiver.
646 */
647
648 /* handle a 0-len message */
649 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
650 rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
651 goto add_header;
652 }
653
654 /* if there's data reference it with a chain of work reqs */
655 for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
656 unsigned int len;
657
658 send = &ic->i_sends[pos];
659
660 len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
661 rds_iw_xmit_populate_wr(ic, send, pos,
662 ib_sg_dma_address(dev, scat) + off, len,
663 send_flags);
664
665 /*
666 * We want to delay signaling completions just enough to get
667 * the batching benefits but not so much that we create dead time
668 * on the wire.
669 */
670 if (ic->i_unsignaled_wrs-- == 0) {
671 ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
672 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
673 }
674
675 ic->i_unsignaled_bytes -= len;
676 if (ic->i_unsignaled_bytes <= 0) {
677 ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
678 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
679 }
680
681 /*
682 * Always signal the last one if we're stopping due to flow control.
683 */
684 if (flow_controlled && i == (work_alloc-1))
685 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
686
687 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
688 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
689
690 sent += len;
691 off += len;
692 if (off == ib_sg_dma_len(dev, scat)) {
693 scat++;
694 off = 0;
695 }
696
697add_header:
698 /* Tack on the header after the data. The header SGE should already
699 * have been set up to point to the right header buffer. */
700 memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
701
702 if (0) {
703 struct rds_header *hdr = &ic->i_send_hdrs[pos];
704
705 printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
706 be16_to_cpu(hdr->h_dport),
707 hdr->h_flags,
708 be32_to_cpu(hdr->h_len));
709 }
710 if (adv_credits) {
711 struct rds_header *hdr = &ic->i_send_hdrs[pos];
712
713 /* add credit and redo the header checksum */
714 hdr->h_credit = adv_credits;
715 rds_message_make_checksum(hdr);
716 adv_credits = 0;
717 rds_iw_stats_inc(s_iw_tx_credit_updates);
718 }
719
720 if (prev)
721 prev->s_wr.next = &send->s_wr;
722 prev = send;
723
724 pos = (pos + 1) % ic->i_send_ring.w_nr;
725 }
726
727 /* Account the RDS header in the number of bytes we sent, but just once.
728 * The caller has no concept of fragmentation. */
729 if (hdr_off == 0)
730 sent += sizeof(struct rds_header);
731
732 /* if we finished the message then send completion owns it */
733 if (scat == &rm->m_sg[rm->m_count]) {
734 prev->s_rm = ic->i_rm;
735 prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
736 ic->i_rm = NULL;
737 }
738
739 if (i < work_alloc) {
740 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
741 work_alloc = i;
742 }
743 if (ic->i_flowctl && i < credit_alloc)
744 rds_iw_send_add_credits(conn, credit_alloc - i);
745
746 /* XXX need to worry about failed_wr and partial sends. */
747 failed_wr = &first->s_wr;
748 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
749 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
750 first, &first->s_wr, ret, failed_wr);
751 BUG_ON(failed_wr != &first->s_wr);
752 if (ret) {
753 printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 "
754 "returned %d\n", &conn->c_faddr, ret);
755 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
756 if (prev->s_rm) {
757 ic->i_rm = prev->s_rm;
758 prev->s_rm = NULL;
759 }
760 goto out;
761 }
762
763 ret = sent;
764out:
765 BUG_ON(adv_credits);
766 return ret;
767}
768
769static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr)
770{
771 BUG_ON(nent > send->s_page_list->max_page_list_len);
772 /*
773 * Perform a WR for the fast_reg_mr. Each individual page
774 * in the sg list is added to the fast reg page list and placed
775 * inside the fast_reg_mr WR.
776 */
777 send->s_wr.opcode = IB_WR_FAST_REG_MR;
778 send->s_wr.wr.fast_reg.length = len;
779 send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
780 send->s_wr.wr.fast_reg.page_list = send->s_page_list;
781 send->s_wr.wr.fast_reg.page_list_len = nent;
782 send->s_wr.wr.fast_reg.page_shift = rds_iwdev->page_shift;
783 send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
784 send->s_wr.wr.fast_reg.iova_start = sg_addr;
785
786 ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
787}
788
789int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
790{
791 struct rds_iw_connection *ic = conn->c_transport_data;
792 struct rds_iw_send_work *send = NULL;
793 struct rds_iw_send_work *first;
794 struct rds_iw_send_work *prev;
795 struct ib_send_wr *failed_wr;
796 struct rds_iw_device *rds_iwdev;
797 struct scatterlist *scat;
798 unsigned long len;
799 u64 remote_addr = op->r_remote_addr;
800 u32 pos, fr_pos;
801 u32 work_alloc;
802 u32 i;
803 u32 j;
804 int sent;
805 int ret;
806 int num_sge;
807
808 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
809
810 /* map the message the first time we see it */
811 if (!op->r_mapped) {
812 op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
813 op->r_sg, op->r_nents, (op->r_write) ?
814 DMA_TO_DEVICE : DMA_FROM_DEVICE);
815 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
816 if (op->r_count == 0) {
817 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
818 ret = -ENOMEM; /* XXX ? */
819 goto out;
820 }
821
822 op->r_mapped = 1;
823 }
824
825 if (!op->r_write) {
826 /* Alloc space on the send queue for the fastreg */
827 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
828 if (work_alloc != 1) {
829 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
830 rds_iw_stats_inc(s_iw_tx_ring_full);
831 ret = -ENOMEM;
832 goto out;
833 }
834 }
835
836 /*
837 * Instead of knowing how to return a partial rdma read/write we insist that there
838 * be enough work requests to send the entire message.
839 */
840 i = ceil(op->r_count, rds_iwdev->max_sge);
841
842 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
843 if (work_alloc != i) {
844 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
845 rds_iw_stats_inc(s_iw_tx_ring_full);
846 ret = -ENOMEM;
847 goto out;
848 }
849
850 send = &ic->i_sends[pos];
851 if (!op->r_write) {
852 first = prev = &ic->i_sends[fr_pos];
853 } else {
854 first = send;
855 prev = NULL;
856 }
857 scat = &op->r_sg[0];
858 sent = 0;
859 num_sge = op->r_count;
860
861 for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
862 send->s_wr.send_flags = 0;
863 send->s_queued = jiffies;
864
865 /*
866 * We want to delay signaling completions just enough to get
867 * the batching benefits but not so much that we create dead time on the wire.
868 */
869 if (ic->i_unsignaled_wrs-- == 0) {
870 ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
871 send->s_wr.send_flags = IB_SEND_SIGNALED;
872 }
873
874 /* To avoid needing plumbing to invalidate the fastreg_mr used for local
875 * access after RDS is finished with it, we use IB_WR_RDMA_READ_WITH_INV,
876 * which invalidates it after the read has completed.
877 */
878 if (op->r_write)
879 send->s_wr.opcode = IB_WR_RDMA_WRITE;
880 else
881 send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
882
883 send->s_wr.wr.rdma.remote_addr = remote_addr;
884 send->s_wr.wr.rdma.rkey = op->r_key;
885 send->s_op = op;
886
887 if (num_sge > rds_iwdev->max_sge) {
888 send->s_wr.num_sge = rds_iwdev->max_sge;
889 num_sge -= rds_iwdev->max_sge;
890 } else
891 send->s_wr.num_sge = num_sge;
892
893 send->s_wr.next = NULL;
894
895 if (prev)
896 prev->s_wr.next = &send->s_wr;
897
898 for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
899 len = ib_sg_dma_len(ic->i_cm_id->device, scat);
900
901 if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
902 send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat);
903 else {
904 send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
905 send->s_sge[j].length = len;
906 send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic);
907 }
908
909 sent += len;
910 rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
911 remote_addr += len;
912
913 scat++;
914 }
915
916 if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
917 send->s_wr.num_sge = 1;
918 send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr;
919 send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes;
920 send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey;
921 }
922
923 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
924 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
925
926 prev = send;
927 if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
928 send = ic->i_sends;
929 }
930
931 /* if we finished the message then send completion owns it */
932 if (scat == &op->r_sg[op->r_count])
933 first->s_wr.send_flags = IB_SEND_SIGNALED;
934
935 if (i < work_alloc) {
936 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
937 work_alloc = i;
938 }
939
940 /* On iWARP, local memory access by a remote system (i.e., an RDMA Read) is
941 * not recommended. Putting the lkey on the wire is a security hole, as it
942 * can allow access to all of the remote system's memory. Some adapters do
943 * not allow using the lkey for this at all. To avoid this, use a
944 * fastreg_mr (or possibly a dma_mr).
945 */
946 if (!op->r_write) {
947 rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
948 op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
949 work_alloc++;
950 }
951
952 failed_wr = &first->s_wr;
953 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
954 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
955 first, &first->s_wr, ret, failed_wr);
956 BUG_ON(failed_wr != &first->s_wr);
957 if (ret) {
958 printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 "
959 "returned %d\n", &conn->c_faddr, ret);
960 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
961 goto out;
962 }
963
964out:
965 return ret;
966}
967
968void rds_iw_xmit_complete(struct rds_connection *conn)
969{
970 struct rds_iw_connection *ic = conn->c_transport_data;
971
972 /* We may have a pending ACK or window update we were unable
973 * to send previously (due to flow control). Try again. */
974 rds_iw_attempt_ack(ic);
975}
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
new file mode 100644
index 000000000000..ccc7e8f0bf0e
--- /dev/null
+++ b/net/rds/iw_stats.c
@@ -0,0 +1,95 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/percpu.h>
34#include <linux/seq_file.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38#include "iw.h"
39
40DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned;
41
42static char *rds_iw_stat_names[] = {
43 "iw_connect_raced",
44 "iw_listen_closed_stale",
45 "iw_tx_cq_call",
46 "iw_tx_cq_event",
47 "iw_tx_ring_full",
48 "iw_tx_throttle",
49 "iw_tx_sg_mapping_failure",
50 "iw_tx_stalled",
51 "iw_tx_credit_updates",
52 "iw_rx_cq_call",
53 "iw_rx_cq_event",
54 "iw_rx_ring_empty",
55 "iw_rx_refill_from_cq",
56 "iw_rx_refill_from_thread",
57 "iw_rx_alloc_limit",
58 "iw_rx_credit_updates",
59 "iw_ack_sent",
60 "iw_ack_send_failure",
61 "iw_ack_send_delayed",
62 "iw_ack_send_piggybacked",
63 "iw_ack_received",
64 "iw_rdma_mr_alloc",
65 "iw_rdma_mr_free",
66 "iw_rdma_mr_used",
67 "iw_rdma_mr_pool_flush",
68 "iw_rdma_mr_pool_wait",
69 "iw_rdma_mr_pool_depleted",
70};
71
72unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
73 unsigned int avail)
74{
75 struct rds_iw_statistics stats = {0, };
76 uint64_t *src;
77 uint64_t *sum;
78 size_t i;
79 int cpu;
80
81 if (avail < ARRAY_SIZE(rds_iw_stat_names))
82 goto out;
83
84 for_each_online_cpu(cpu) {
85 src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu));
86 sum = (uint64_t *)&stats;
87 for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
88 *(sum++) += *(src++);
89 }
90
91 rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names,
92 ARRAY_SIZE(rds_iw_stat_names));
93out:
94 return ARRAY_SIZE(rds_iw_stat_names);
95}
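
The per-CPU summation idiom used in rds_iw_stats_info_copy(), treating a struct made up entirely of u64 counters as a flat array and folding each CPU's copy into a running sum, looks like this in standalone userspace C. The struct, field names, and CPU count below are invented for the sketch.

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone illustration, not kernel code: a counters struct composed
 * purely of 64-bit fields is summed across "per-CPU" copies by walking
 * it as a flat uint64_t array.
 */
struct toy_stats {
	uint64_t tx_frames;
	uint64_t rx_frames;
	uint64_t tx_errors;
};

#define TOY_NCPUS 4

int main(void)
{
	struct toy_stats percpu[TOY_NCPUS] = {
		{ 1, 2, 0 }, { 3, 4, 1 }, { 5, 6, 0 }, { 7, 8, 2 },
	};
	struct toy_stats sum = { 0, 0, 0 };
	size_t nfields = sizeof(sum) / sizeof(uint64_t);
	size_t i;
	int cpu;

	for (cpu = 0; cpu < TOY_NCPUS; cpu++) {
		uint64_t *src = (uint64_t *)&percpu[cpu];
		uint64_t *dst = (uint64_t *)&sum;

		for (i = 0; i < nfields; i++)
			dst[i] += src[i];
	}

	/* prints: tx 16 rx 20 err 3 */
	printf("tx %llu rx %llu err %llu\n",
	       (unsigned long long)sum.tx_frames,
	       (unsigned long long)sum.rx_frames,
	       (unsigned long long)sum.tx_errors);
	return 0;
}
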
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
new file mode 100644
index 000000000000..9590678cd616
--- /dev/null
+++ b/net/rds/iw_sysctl.c
@@ -0,0 +1,137 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/sysctl.h>
35#include <linux/proc_fs.h>
36
37#include "iw.h"
38
39static struct ctl_table_header *rds_iw_sysctl_hdr;
40
41unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR;
42unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR;
43unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
44static unsigned long rds_iw_sysctl_max_wr_min = 1;
45/* hardware will fail CQ creation long before this */
46static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0;
47
48unsigned long rds_iw_sysctl_max_unsig_wrs = 16;
49static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1;
50static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64;
51
52unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20);
53static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1;
54static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
55
56unsigned int rds_iw_sysctl_flow_control = 1;
57
58ctl_table rds_iw_sysctl_table[] = {
59 {
60 .ctl_name = CTL_UNNUMBERED,
61 .procname = "max_send_wr",
62 .data = &rds_iw_sysctl_max_send_wr,
63 .maxlen = sizeof(unsigned long),
64 .mode = 0644,
65 .proc_handler = &proc_doulongvec_minmax,
66 .extra1 = &rds_iw_sysctl_max_wr_min,
67 .extra2 = &rds_iw_sysctl_max_wr_max,
68 },
69 {
70 .ctl_name = CTL_UNNUMBERED,
71 .procname = "max_recv_wr",
72 .data = &rds_iw_sysctl_max_recv_wr,
73 .maxlen = sizeof(unsigned long),
74 .mode = 0644,
75 .proc_handler = &proc_doulongvec_minmax,
76 .extra1 = &rds_iw_sysctl_max_wr_min,
77 .extra2 = &rds_iw_sysctl_max_wr_max,
78 },
79 {
80 .ctl_name = CTL_UNNUMBERED,
81 .procname = "max_unsignaled_wr",
82 .data = &rds_iw_sysctl_max_unsig_wrs,
83 .maxlen = sizeof(unsigned long),
84 .mode = 0644,
85 .proc_handler = &proc_doulongvec_minmax,
86 .extra1 = &rds_iw_sysctl_max_unsig_wr_min,
87 .extra2 = &rds_iw_sysctl_max_unsig_wr_max,
88 },
89 {
90 .ctl_name = CTL_UNNUMBERED,
91 .procname = "max_unsignaled_bytes",
92 .data = &rds_iw_sysctl_max_unsig_bytes,
93 .maxlen = sizeof(unsigned long),
94 .mode = 0644,
95 .proc_handler = &proc_doulongvec_minmax,
96 .extra1 = &rds_iw_sysctl_max_unsig_bytes_min,
97 .extra2 = &rds_iw_sysctl_max_unsig_bytes_max,
98 },
99 {
100 .ctl_name = CTL_UNNUMBERED,
101 .procname = "max_recv_allocation",
102 .data = &rds_iw_sysctl_max_recv_allocation,
103 .maxlen = sizeof(unsigned long),
104 .mode = 0644,
105 .proc_handler = &proc_doulongvec_minmax,
106 },
107 {
108 .ctl_name = CTL_UNNUMBERED,
109 .procname = "flow_control",
110 .data = &rds_iw_sysctl_flow_control,
111 .maxlen = sizeof(rds_iw_sysctl_flow_control),
112 .mode = 0644,
113 .proc_handler = &proc_dointvec,
114 },
115 { .ctl_name = 0}
116};
117
118static struct ctl_path rds_iw_sysctl_path[] = {
119 { .procname = "net", .ctl_name = CTL_NET, },
120 { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
121 { .procname = "iw", .ctl_name = CTL_UNNUMBERED, },
122 { }
123};
124
125void rds_iw_sysctl_exit(void)
126{
127 if (rds_iw_sysctl_hdr)
128 unregister_sysctl_table(rds_iw_sysctl_hdr);
129}
130
131int __init rds_iw_sysctl_init(void)
132{
133 rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
134 if (rds_iw_sysctl_hdr == NULL)
135 return -ENOMEM;
136 return 0;
137}
diff --git a/net/rds/loop.c b/net/rds/loop.c
new file mode 100644
index 000000000000..4a61997f554d
--- /dev/null
+++ b/net/rds/loop.c
@@ -0,0 +1,188 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35
36#include "rds.h"
37#include "loop.h"
38
39static DEFINE_SPINLOCK(loop_conns_lock);
40static LIST_HEAD(loop_conns);
41
42/*
43 * This 'loopback' transport is a special case for flows that originate
44 * and terminate on the same machine.
45 *
46 * Connection build-up notices if the destination address is thought of
47 * as a local address by a transport. At that time it decides to use the
48 * loopback transport instead of the bound transport of the sending socket.
49 *
50 * The loopback transport's sending path just hands the sent rds_message
51 * straight to the receiving path via an embedded rds_incoming.
52 */
53
54/*
55 * Usually a message transits both the sender and receiver's conns as it
56 * flows to the receiver. In the loopback case, though, the receive path
57 * is handed the sending conn so the sense of the addresses is reversed.
58 */
59static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
60 unsigned int hdr_off, unsigned int sg,
61 unsigned int off)
62{
63 BUG_ON(hdr_off || sg || off);
64
65 rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
66 rds_message_addref(rm); /* for the inc */
67
68 rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
69 GFP_KERNEL, KM_USER0);
70
71 rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
72 NULL);
73
74 rds_inc_put(&rm->m_inc);
75
76 return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
77}
78
79static int rds_loop_xmit_cong_map(struct rds_connection *conn,
80 struct rds_cong_map *map,
81 unsigned long offset)
82{
83 unsigned long i;
84
85 BUG_ON(offset);
86 BUG_ON(map != conn->c_lcong);
87
88 for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
89 memcpy((void *)conn->c_fcong->m_page_addrs[i],
90 (void *)map->m_page_addrs[i], PAGE_SIZE);
91 }
92
93 rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
94
95 return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
96}
97
98/* we need to at least give the thread something to succeed */
99static int rds_loop_recv(struct rds_connection *conn)
100{
101 return 0;
102}
103
104struct rds_loop_connection {
105 struct list_head loop_node;
106 struct rds_connection *conn;
107};
108
109/*
110 * Even the loopback transport needs to keep track of its connections,
111 * so it can call rds_conn_destroy() on them on exit. N.B. there are
112 * 1+ loopback addresses (127.*.*.*) so it's not a bug to have
113 * multiple loopback conns allocated, although rather useless.
114 */
115static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
116{
117 struct rds_loop_connection *lc;
118 unsigned long flags;
119
120 lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
121 if (lc == NULL)
122 return -ENOMEM;
123
124 INIT_LIST_HEAD(&lc->loop_node);
125 lc->conn = conn;
126 conn->c_transport_data = lc;
127
128 spin_lock_irqsave(&loop_conns_lock, flags);
129 list_add_tail(&lc->loop_node, &loop_conns);
130 spin_unlock_irqrestore(&loop_conns_lock, flags);
131
132 return 0;
133}
134
135static void rds_loop_conn_free(void *arg)
136{
137 struct rds_loop_connection *lc = arg;
138 rdsdebug("lc %p\n", lc);
139 list_del(&lc->loop_node);
140 kfree(lc);
141}
142
143static int rds_loop_conn_connect(struct rds_connection *conn)
144{
145 rds_connect_complete(conn);
146 return 0;
147}
148
149static void rds_loop_conn_shutdown(struct rds_connection *conn)
150{
151}
152
153void rds_loop_exit(void)
154{
155 struct rds_loop_connection *lc, *_lc;
156 LIST_HEAD(tmp_list);
157
158 /* avoid calling conn_destroy with irqs off */
159 spin_lock_irq(&loop_conns_lock);
160 list_splice(&loop_conns, &tmp_list);
161 INIT_LIST_HEAD(&loop_conns);
162 spin_unlock_irq(&loop_conns_lock);
163
164 list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) {
165 WARN_ON(lc->conn->c_passive);
166 rds_conn_destroy(lc->conn);
167 }
168}
169
170/*
171 * This is missing .xmit_* because loop doesn't go through generic
172 * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and
173 * .laddr_check are missing because transport.c doesn't iterate over
174 * rds_loop_transport.
175 */
176struct rds_transport rds_loop_transport = {
177 .xmit = rds_loop_xmit,
178 .xmit_cong_map = rds_loop_xmit_cong_map,
179 .recv = rds_loop_recv,
180 .conn_alloc = rds_loop_conn_alloc,
181 .conn_free = rds_loop_conn_free,
182 .conn_connect = rds_loop_conn_connect,
183 .conn_shutdown = rds_loop_conn_shutdown,
184 .inc_copy_to_user = rds_message_inc_copy_to_user,
185 .inc_purge = rds_message_inc_purge,
186 .inc_free = rds_message_inc_free,
187 .t_name = "loopback",
188};
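
rds_loop_exit() above uses a common teardown shape: detach the whole connection list while the lock is held, then call the destructor on each entry with the lock dropped, so rds_conn_destroy() never runs with irqs off. A minimal userspace sketch of that shape, assuming nothing about RDS itself; fake_conn and the pthread mutex (link with -pthread) are illustrative stand-ins for loop_conns and its irq-disabling spinlock:

/* not kernel code: a pthread mutex stands in for the irq-disabling spinlock */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_conn {
    int id;
    struct fake_conn *next;
};

static pthread_mutex_t conns_lock = PTHREAD_MUTEX_INITIALIZER;
static struct fake_conn *conns;     /* global list, only touched under the lock */

static void fake_conn_destroy(struct fake_conn *c)
{
    /* may sleep or take other locks, so never called with conns_lock held */
    printf("destroying conn %d\n", c->id);
    free(c);
}

static void fake_exit(void)
{
    struct fake_conn *detached, *c;

    /* detach the whole list while holding the lock... */
    pthread_mutex_lock(&conns_lock);
    detached = conns;
    conns = NULL;
    pthread_mutex_unlock(&conns_lock);

    /* ...then destroy each entry with the lock dropped */
    while ((c = detached) != NULL) {
        detached = c->next;
        fake_conn_destroy(c);
    }
}

int main(void)
{
    for (int i = 0; i < 3; i++) {
        struct fake_conn *c = malloc(sizeof(*c));

        if (!c)
            break;
        c->id = i;
        pthread_mutex_lock(&conns_lock);
        c->next = conns;
        conns = c;
        pthread_mutex_unlock(&conns_lock);
    }
    fake_exit();
    return 0;
}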
diff --git a/net/rds/loop.h b/net/rds/loop.h
new file mode 100644
index 000000000000..f32b0939a04d
--- /dev/null
+++ b/net/rds/loop.h
@@ -0,0 +1,9 @@
1#ifndef _RDS_LOOP_H
2#define _RDS_LOOP_H
3
4/* loop.c */
5extern struct rds_transport rds_loop_transport;
6
7void rds_loop_exit(void);
8
9#endif
diff --git a/net/rds/message.c b/net/rds/message.c
new file mode 100644
index 000000000000..5a15dc8d0cd7
--- /dev/null
+++ b/net/rds/message.c
@@ -0,0 +1,402 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "rdma.h"
37
38static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
39
40static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
41[RDS_EXTHDR_NONE] = 0,
42[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
43[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
44[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
45};
46
47
48void rds_message_addref(struct rds_message *rm)
49{
50 rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
51 atomic_inc(&rm->m_refcount);
52}
53
54/*
55 * This relies on dma_map_sg() not touching sg[].page during merging.
56 */
57static void rds_message_purge(struct rds_message *rm)
58{
59 unsigned long i;
60
61 if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
62 return;
63
64 for (i = 0; i < rm->m_nents; i++) {
65 rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i]));
66 /* XXX will have to put_page for page refs */
67 __free_page(sg_page(&rm->m_sg[i]));
68 }
69 rm->m_nents = 0;
70
71 if (rm->m_rdma_op)
72 rds_rdma_free_op(rm->m_rdma_op);
73 if (rm->m_rdma_mr)
74 rds_mr_put(rm->m_rdma_mr);
75}
76
77void rds_message_inc_purge(struct rds_incoming *inc)
78{
79 struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
80 rds_message_purge(rm);
81}
82
83void rds_message_put(struct rds_message *rm)
84{
85 rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
86
87 if (atomic_dec_and_test(&rm->m_refcount)) {
88 BUG_ON(!list_empty(&rm->m_sock_item));
89 BUG_ON(!list_empty(&rm->m_conn_item));
90 rds_message_purge(rm);
91
92 kfree(rm);
93 }
94}
95
96void rds_message_inc_free(struct rds_incoming *inc)
97{
98 struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
99 rds_message_put(rm);
100}
101
102void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
103 __be16 dport, u64 seq)
104{
105 hdr->h_flags = 0;
106 hdr->h_sport = sport;
107 hdr->h_dport = dport;
108 hdr->h_sequence = cpu_to_be64(seq);
109 hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
110}
111
112int rds_message_add_extension(struct rds_header *hdr,
113 unsigned int type, const void *data, unsigned int len)
114{
115 unsigned int ext_len = sizeof(u8) + len;
116 unsigned char *dst;
117
118 /* For now, refuse to add more than one extension header */
119 if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
120 return 0;
121
122 if (type >= __RDS_EXTHDR_MAX
123 || len != rds_exthdr_size[type])
124 return 0;
125
126 if (ext_len >= RDS_HEADER_EXT_SPACE)
127 return 0;
128 dst = hdr->h_exthdr;
129
130 *dst++ = type;
131 memcpy(dst, data, len);
132
133 dst[len] = RDS_EXTHDR_NONE;
134 return 1;
135}
136
137/*
138 * If a message has extension headers, retrieve them here.
139 * Call like this:
140 *
141 * unsigned int pos = 0;
142 *
143 * while (1) {
144 * buflen = sizeof(buffer);
145 * type = rds_message_next_extension(hdr, &pos, buffer, &buflen);
146 * if (type == RDS_EXTHDR_NONE)
147 * break;
148 * ...
149 * }
150 */
151int rds_message_next_extension(struct rds_header *hdr,
152 unsigned int *pos, void *buf, unsigned int *buflen)
153{
154 unsigned int offset, ext_type, ext_len;
155 u8 *src = hdr->h_exthdr;
156
157 offset = *pos;
158 if (offset >= RDS_HEADER_EXT_SPACE)
159 goto none;
160
161 /* Get the extension type and length. For now, the
162 * length is implied by the extension type. */
163 ext_type = src[offset++];
164
165 if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX)
166 goto none;
167 ext_len = rds_exthdr_size[ext_type];
168 if (offset + ext_len > RDS_HEADER_EXT_SPACE)
169 goto none;
170
171 *pos = offset + ext_len;
172 if (ext_len < *buflen)
173 *buflen = ext_len;
174 memcpy(buf, src + offset, *buflen);
175 return ext_type;
176
177none:
178 *pos = RDS_HEADER_EXT_SPACE;
179 *buflen = 0;
180 return RDS_EXTHDR_NONE;
181}
182
183int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
184{
185 struct rds_ext_header_version ext_hdr;
186
187 ext_hdr.h_version = cpu_to_be32(version);
188 return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
189}
190
191int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
192{
193 struct rds_ext_header_version ext_hdr;
194 unsigned int pos = 0, len = sizeof(ext_hdr);
195
196 /* We assume the version extension is the only one present */
197 if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
198 return 0;
199 *version = be32_to_cpu(ext_hdr.h_version);
200 return 1;
201}
202
203int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
204{
205 struct rds_ext_header_rdma_dest ext_hdr;
206
207 ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
208 ext_hdr.h_rdma_offset = cpu_to_be32(offset);
209 return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
210}
211
212struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
213{
214 struct rds_message *rm;
215
216 rm = kzalloc(sizeof(struct rds_message) +
217 (nents * sizeof(struct scatterlist)), gfp);
218 if (!rm)
219 goto out;
220
221 if (nents)
222 sg_init_table(rm->m_sg, nents);
223 atomic_set(&rm->m_refcount, 1);
224 INIT_LIST_HEAD(&rm->m_sock_item);
225 INIT_LIST_HEAD(&rm->m_conn_item);
226 spin_lock_init(&rm->m_rs_lock);
227
228out:
229 return rm;
230}
231
232struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
233{
234 struct rds_message *rm;
235 unsigned int i;
236
237 rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
238 if (rm == NULL)
239 return ERR_PTR(-ENOMEM);
240
241 set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
242 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
243 rm->m_nents = ceil(total_len, PAGE_SIZE);
244
245 for (i = 0; i < rm->m_nents; ++i) {
246 sg_set_page(&rm->m_sg[i],
247 virt_to_page(page_addrs[i]),
248 PAGE_SIZE, 0);
249 }
250
251 return rm;
252}
253
254struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
255 size_t total_len)
256{
257 unsigned long to_copy;
258 unsigned long iov_off;
259 unsigned long sg_off;
260 struct rds_message *rm;
261 struct iovec *iov;
262 struct scatterlist *sg;
263 int ret;
264
265 rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
266 if (rm == NULL) {
267 ret = -ENOMEM;
268 goto out;
269 }
270
271 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
272
273 /*
274 * now allocate and copy in the data payload.
275 */
276 sg = rm->m_sg;
277 iov = first_iov;
278 iov_off = 0;
279 sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
280
281 while (total_len) {
282 if (sg_page(sg) == NULL) {
283 ret = rds_page_remainder_alloc(sg, total_len,
284 GFP_HIGHUSER);
285 if (ret)
286 goto out;
287 rm->m_nents++;
288 sg_off = 0;
289 }
290
291 while (iov_off == iov->iov_len) {
292 iov_off = 0;
293 iov++;
294 }
295
296 to_copy = min(iov->iov_len - iov_off, sg->length - sg_off);
297 to_copy = min_t(size_t, to_copy, total_len);
298
299 rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to "
300 "sg [%p, %u, %u] + %lu\n",
301 to_copy, iov->iov_base, iov->iov_len, iov_off,
302 (void *)sg_page(sg), sg->offset, sg->length, sg_off);
303
304 ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off,
305 iov->iov_base + iov_off,
306 to_copy);
307 if (ret)
308 goto out;
309
310 iov_off += to_copy;
311 total_len -= to_copy;
312 sg_off += to_copy;
313
314 if (sg_off == sg->length)
315 sg++;
316 }
317
318 ret = 0;
319out:
320 if (ret) {
321 if (rm)
322 rds_message_put(rm);
323 rm = ERR_PTR(ret);
324 }
325 return rm;
326}
327
328int rds_message_inc_copy_to_user(struct rds_incoming *inc,
329 struct iovec *first_iov, size_t size)
330{
331 struct rds_message *rm;
332 struct iovec *iov;
333 struct scatterlist *sg;
334 unsigned long to_copy;
335 unsigned long iov_off;
336 unsigned long vec_off;
337 int copied;
338 int ret;
339 u32 len;
340
341 rm = container_of(inc, struct rds_message, m_inc);
342 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
343
344 iov = first_iov;
345 iov_off = 0;
346 sg = rm->m_sg;
347 vec_off = 0;
348 copied = 0;
349
350 while (copied < size && copied < len) {
351 while (iov_off == iov->iov_len) {
352 iov_off = 0;
353 iov++;
354 }
355
356 to_copy = min(iov->iov_len - iov_off, sg->length - vec_off);
357 to_copy = min_t(size_t, to_copy, size - copied);
358 to_copy = min_t(unsigned long, to_copy, len - copied);
359
360 rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to "
361 "sg [%p, %u, %u] + %lu\n",
362 to_copy, iov->iov_base, iov->iov_len, iov_off,
363 sg_page(sg), sg->offset, sg->length, vec_off);
364
365 ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off,
366 iov->iov_base + iov_off,
367 to_copy);
368 if (ret) {
369 copied = ret;
370 break;
371 }
372
373 iov_off += to_copy;
374 vec_off += to_copy;
375 copied += to_copy;
376
377 if (vec_off == sg->length) {
378 vec_off = 0;
379 sg++;
380 }
381 }
382
383 return copied;
384}
385
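
Both copy routines above walk an iovec and the message's scatterlist with two cursors, each step moving the minimum of what is left in the current iovec entry and in the current fragment. A userspace sketch of that two-cursor walk over plain buffers (fragment sizes and contents are made up; there is no scatterlist or user/kernel boundary here):

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

#define NFRAGS    3
#define FRAG_SIZE 8

int main(void)
{
    /* 24 bytes of payload split over three 8-byte fragments */
    char frags[NFRAGS][FRAG_SIZE];
    char out1[10], out2[14];
    struct iovec iov[2] = {
        { .iov_base = out1, .iov_len = sizeof(out1) },
        { .iov_base = out2, .iov_len = sizeof(out2) },
    };
    struct iovec *v = iov;
    size_t frag = 0, frag_off = 0, iov_off = 0, copied = 0;
    size_t total = NFRAGS * FRAG_SIZE;

    memcpy(frags, "abcdefghijklmnopqrstuvwx", total);

    while (copied < total) {
        size_t left_in_iov = v->iov_len - iov_off;
        size_t left_in_frag = FRAG_SIZE - frag_off;
        size_t to_copy = left_in_iov < left_in_frag ? left_in_iov : left_in_frag;

        memcpy((char *)v->iov_base + iov_off, &frags[frag][frag_off], to_copy);

        iov_off += to_copy;
        frag_off += to_copy;
        copied += to_copy;

        if (iov_off == v->iov_len) {    /* advance to the next iovec entry */
            v++;
            iov_off = 0;
        }
        if (frag_off == FRAG_SIZE) {    /* advance to the next fragment */
            frag++;
            frag_off = 0;
        }
    }

    printf("%.10s|%.14s\n", out1, out2);
    return 0;
}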
386/*
387 * If the message is still on the send queue, wait until the transport
388 * is done with it. This is particularly important for RDMA operations.
389 */
390void rds_message_wait(struct rds_message *rm)
391{
392 wait_event(rds_message_flush_waitq,
393 !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
394}
395
396void rds_message_unmapped(struct rds_message *rm)
397{
398 clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
399 if (waitqueue_active(&rds_message_flush_waitq))
400 wake_up(&rds_message_flush_waitq);
401}
402
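The extension-header helpers above keep a simple layout inside the 16-byte h_exthdr area: a one-byte type, the fixed payload size implied by that type, then a terminating RDS_EXTHDR_NONE byte. A simplified userspace sketch of that layout (host byte order here, whereas the kernel stores the payload big-endian; the EXT_* names and the loop are local to the demo, not the kernel helpers):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define EXT_SPACE   16
#define EXT_NONE    0
#define EXT_VERSION 1

int main(void)
{
    uint8_t exthdr[EXT_SPACE] = { EXT_NONE };
    uint32_t version = 0x0301;          /* e.g. protocol 3.1, host order here */
    unsigned int pos = 0;

    /* add: type byte, fixed-size payload, terminating NONE byte */
    exthdr[0] = EXT_VERSION;
    memcpy(&exthdr[1], &version, sizeof(version));
    exthdr[1 + sizeof(version)] = EXT_NONE;

    /* walk: read the type, then the payload length it implies */
    while (pos < EXT_SPACE) {
        uint8_t type = exthdr[pos++];

        if (type == EXT_NONE)
            break;
        if (type == EXT_VERSION) {
            uint32_t v;

            memcpy(&v, &exthdr[pos], sizeof(v));
            pos += sizeof(v);
            printf("version extension: 0x%04x\n", v);
        }
    }
    return 0;
}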
diff --git a/net/rds/page.c b/net/rds/page.c
new file mode 100644
index 000000000000..c460743a89ad
--- /dev/null
+++ b/net/rds/page.c
@@ -0,0 +1,221 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/highmem.h>
34
35#include "rds.h"
36
37struct rds_page_remainder {
38 struct page *r_page;
39 unsigned long r_offset;
40};
41
42DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned;
43
44/*
45 * returns 0 on success or -errno on failure.
46 *
47 * We don't have to worry about flush_dcache_page() as this only works
48 * with private pages. If, say, we were to do directed receive to pinned
49 * user pages we'd have to worry more about cache coherence. (Though
50 * the flush_dcache_page() in get_user_pages() would probably be enough).
51 */
52int rds_page_copy_user(struct page *page, unsigned long offset,
53 void __user *ptr, unsigned long bytes,
54 int to_user)
55{
56 unsigned long ret;
57 void *addr;
58
59 if (to_user)
60 rds_stats_add(s_copy_to_user, bytes);
61 else
62 rds_stats_add(s_copy_from_user, bytes);
63
64 addr = kmap_atomic(page, KM_USER0);
65 if (to_user)
66 ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);
67 else
68 ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);
69 kunmap_atomic(addr, KM_USER0);
70
71 if (ret) {
72 addr = kmap(page);
73 if (to_user)
74 ret = copy_to_user(ptr, addr + offset, bytes);
75 else
76 ret = copy_from_user(addr + offset, ptr, bytes);
77 kunmap(page);
78 if (ret)
79 return -EFAULT;
80 }
81
82 return 0;
83}
84
85/*
86 * Message allocation uses this to build up regions of a message.
87 *
88 * @bytes - the number of bytes needed.
89 * @gfp - the waiting behaviour of the allocation
90 *
91 * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to
92 * kmap the pages, etc.
93 *
94 * If @bytes is at least a full page then this just returns a page from
95 * alloc_page().
96 *
97 * If @bytes is a partial page then this stores the unused region of the
98 * page in a per-cpu structure. Future partial-page allocations may be
99 * satisfied from that cached region. This lets us waste less memory on
100 * small allocations with minimal complexity. It works because the transmit
101 * path passes read-only page regions down to devices. They hold a page
102 * reference until they are done with the region.
103 */
104int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
105 gfp_t gfp)
106{
107 struct rds_page_remainder *rem;
108 unsigned long flags;
109 struct page *page;
110 int ret;
111
112 gfp |= __GFP_HIGHMEM;
113
114 /* jump straight to allocation if we need a whole page or more */
115 if (bytes >= PAGE_SIZE) {
116 page = alloc_page(gfp);
117 if (page == NULL) {
118 ret = -ENOMEM;
119 } else {
120 sg_set_page(scat, page, PAGE_SIZE, 0);
121 ret = 0;
122 }
123 goto out;
124 }
125
126 rem = &per_cpu(rds_page_remainders, get_cpu());
127 local_irq_save(flags);
128
129 while (1) {
130 /* avoid a tiny region getting stuck by tossing it */
131 if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) {
132 rds_stats_inc(s_page_remainder_miss);
133 __free_page(rem->r_page);
134 rem->r_page = NULL;
135 }
136
137 /* hand out a fragment from the cached page */
138 if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) {
139 sg_set_page(scat, rem->r_page, bytes, rem->r_offset);
140 get_page(sg_page(scat));
141
142 if (rem->r_offset != 0)
143 rds_stats_inc(s_page_remainder_hit);
144
145 rem->r_offset += bytes;
146 if (rem->r_offset == PAGE_SIZE) {
147 __free_page(rem->r_page);
148 rem->r_page = NULL;
149 }
150 ret = 0;
151 break;
152 }
153
154 /* alloc if there is nothing for us to use */
155 local_irq_restore(flags);
156 put_cpu();
157
158 page = alloc_page(gfp);
159
160 rem = &per_cpu(rds_page_remainders, get_cpu());
161 local_irq_save(flags);
162
163 if (page == NULL) {
164 ret = -ENOMEM;
165 break;
166 }
167
168 /* did someone race to fill the remainder before us? */
169 if (rem->r_page) {
170 __free_page(page);
171 continue;
172 }
173
174 /* otherwise install our page and loop around to alloc */
175 rem->r_page = page;
176 rem->r_offset = 0;
177 }
178
179 local_irq_restore(flags);
180 put_cpu();
181out:
182 rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
183 ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,
184 ret ? 0 : scat->length);
185 return ret;
186}
187
188static int rds_page_remainder_cpu_notify(struct notifier_block *self,
189 unsigned long action, void *hcpu)
190{
191 struct rds_page_remainder *rem;
192 long cpu = (long)hcpu;
193
194 rem = &per_cpu(rds_page_remainders, cpu);
195
196 rdsdebug("cpu %ld action 0x%lx\n", cpu, action);
197
198 switch (action) {
199 case CPU_DEAD:
200 if (rem->r_page)
201 __free_page(rem->r_page);
202 rem->r_page = NULL;
203 break;
204 }
205
206 return 0;
207}
208
209static struct notifier_block rds_page_remainder_nb = {
210 .notifier_call = rds_page_remainder_cpu_notify,
211};
212
213void rds_page_exit(void)
214{
215 int i;
216
217 for_each_possible_cpu(i)
218 rds_page_remainder_cpu_notify(&rds_page_remainder_nb,
219 (unsigned long)CPU_DEAD,
220 (void *)(long)i);
221}
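
The remainder allocator described above boils down to: whole-page requests go straight to the allocator, sub-page requests are carved out of one cached page until a request no longer fits, at which point the cached page is dropped and a fresh one is started. A single-threaded userspace sketch of just that logic, leaving out the per-cpu handling, page refcounting and irq protection of rds_page_remainder_alloc(); FAKE_PAGE_SIZE and frag_alloc are demo names:

#include <stdio.h>
#include <stdlib.h>

#define FAKE_PAGE_SIZE 4096UL

static char *r_page;            /* cached, partially used "page" */
static unsigned long r_offset;  /* first unused byte in r_page */

static char *frag_alloc(unsigned long bytes)
{
    char *p;

    if (bytes >= FAKE_PAGE_SIZE)        /* whole-page requests bypass the cache */
        return malloc(FAKE_PAGE_SIZE);

    /* a remainder too small for this request is abandoned; the kernel drops
     * its page ref here (outstanding fragments hold their own refs), while
     * this sketch simply leaks it */
    if (r_page && bytes > FAKE_PAGE_SIZE - r_offset)
        r_page = NULL;

    if (!r_page) {                      /* start a fresh page */
        r_page = malloc(FAKE_PAGE_SIZE);
        if (!r_page)
            return NULL;
        r_offset = 0;
    }

    p = r_page + r_offset;              /* hand out the next fragment */
    r_offset += bytes;
    if (r_offset == FAKE_PAGE_SIZE)     /* fully consumed, stop caching it */
        r_page = NULL;
    return p;
}

int main(void)
{
    printf("%p\n", (void *)frag_alloc(1000));   /* new page, offset 0 */
    printf("%p\n", (void *)frag_alloc(1000));   /* same page, offset 1000 */
    printf("%p\n", (void *)frag_alloc(3000));   /* 3000 > 2096 left: new page */
    return 0;
}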
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
new file mode 100644
index 000000000000..eaeeb91e1119
--- /dev/null
+++ b/net/rds/rdma.c
@@ -0,0 +1,679 @@
1/*
2 * Copyright (c) 2007 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/pagemap.h>
34#include <linux/rbtree.h>
35#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
36
37#include "rdma.h"
38
39/*
40 * XXX
41 * - build with sparse
42 * - should we limit the size of a mr region? let transport return failure?
43 * - should we detect duplicate keys on a socket? hmm.
44 * - an rdma is an mlock, apply rlimit?
45 */
46
47/*
48 * get the number of pages by looking at the page indices that the start and
49 * end addresses fall in.
50 *
51 * Returns 0 if the vec is invalid. It is invalid if the number of bytes
52 * causes the address to wrap or overflows an unsigned int. The latter
53 * limit comes from the 'length' member of 'struct scatterlist'.
54 */
55static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
56{
57 if ((vec->addr + vec->bytes <= vec->addr) ||
58 (vec->bytes > (u64)UINT_MAX))
59 return 0;
60
61 return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
62 (vec->addr >> PAGE_SHIFT);
63}
64
65static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
66 struct rds_mr *insert)
67{
68 struct rb_node **p = &root->rb_node;
69 struct rb_node *parent = NULL;
70 struct rds_mr *mr;
71
72 while (*p) {
73 parent = *p;
74 mr = rb_entry(parent, struct rds_mr, r_rb_node);
75
76 if (key < mr->r_key)
77 p = &(*p)->rb_left;
78 else if (key > mr->r_key)
79 p = &(*p)->rb_right;
80 else
81 return mr;
82 }
83
84 if (insert) {
85 rb_link_node(&insert->r_rb_node, parent, p);
86 rb_insert_color(&insert->r_rb_node, root);
87 atomic_inc(&insert->r_refcount);
88 }
89 return NULL;
90}
91
92/*
93 * Destroy the transport-specific part of a MR.
94 */
95static void rds_destroy_mr(struct rds_mr *mr)
96{
97 struct rds_sock *rs = mr->r_sock;
98 void *trans_private = NULL;
99 unsigned long flags;
100
101 rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
102 mr->r_key, atomic_read(&mr->r_refcount));
103
104 if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
105 return;
106
107 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
108 if (!RB_EMPTY_NODE(&mr->r_rb_node))
109 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
110 trans_private = mr->r_trans_private;
111 mr->r_trans_private = NULL;
112 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
113
114 if (trans_private)
115 mr->r_trans->free_mr(trans_private, mr->r_invalidate);
116}
117
118void __rds_put_mr_final(struct rds_mr *mr)
119{
120 rds_destroy_mr(mr);
121 kfree(mr);
122}
123
124/*
125 * By the time this is called we can't have any more ioctls called on
126 * the socket so we don't need to worry about racing with others.
127 */
128void rds_rdma_drop_keys(struct rds_sock *rs)
129{
130 struct rds_mr *mr;
131 struct rb_node *node;
132
133 /* Release any MRs associated with this socket */
134 while ((node = rb_first(&rs->rs_rdma_keys))) {
135 mr = container_of(node, struct rds_mr, r_rb_node);
136 if (mr->r_trans == rs->rs_transport)
137 mr->r_invalidate = 0;
138 rds_mr_put(mr);
139 }
140
141 if (rs->rs_transport && rs->rs_transport->flush_mrs)
142 rs->rs_transport->flush_mrs();
143}
144
145/*
146 * Helper function to pin user pages.
147 */
148static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
149 struct page **pages, int write)
150{
151 int ret;
152
153 down_read(&current->mm->mmap_sem);
154 ret = get_user_pages(current, current->mm, user_addr,
155 nr_pages, write, 0, pages, NULL);
156 up_read(&current->mm->mmap_sem);
157
158 if (0 <= ret && (unsigned) ret < nr_pages) {
159 while (ret--)
160 put_page(pages[ret]);
161 ret = -EFAULT;
162 }
163
164 return ret;
165}
166
167static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
168 u64 *cookie_ret, struct rds_mr **mr_ret)
169{
170 struct rds_mr *mr = NULL, *found;
171 unsigned int nr_pages;
172 struct page **pages = NULL;
173 struct scatterlist *sg;
174 void *trans_private;
175 unsigned long flags;
176 rds_rdma_cookie_t cookie;
177 unsigned int nents;
178 long i;
179 int ret;
180
181 if (rs->rs_bound_addr == 0) {
182 ret = -ENOTCONN; /* XXX not a great errno */
183 goto out;
184 }
185
186 if (rs->rs_transport->get_mr == NULL) {
187 ret = -EOPNOTSUPP;
188 goto out;
189 }
190
191 nr_pages = rds_pages_in_vec(&args->vec);
192 if (nr_pages == 0) {
193 ret = -EINVAL;
194 goto out;
195 }
196
197 rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
198 args->vec.addr, args->vec.bytes, nr_pages);
199
200 /* XXX clamp nr_pages to limit the size of this alloc? */
201 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
202 if (pages == NULL) {
203 ret = -ENOMEM;
204 goto out;
205 }
206
207 mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
208 if (mr == NULL) {
209 ret = -ENOMEM;
210 goto out;
211 }
212
213 atomic_set(&mr->r_refcount, 1);
214 RB_CLEAR_NODE(&mr->r_rb_node);
215 mr->r_trans = rs->rs_transport;
216 mr->r_sock = rs;
217
218 if (args->flags & RDS_RDMA_USE_ONCE)
219 mr->r_use_once = 1;
220 if (args->flags & RDS_RDMA_INVALIDATE)
221 mr->r_invalidate = 1;
222 if (args->flags & RDS_RDMA_READWRITE)
223 mr->r_write = 1;
224
225 /*
226 * Pin the pages that make up the user buffer and transfer the page
227 * pointers to the mr's sg array. We check to see if we've mapped
228 * the whole region after transferring the partial page references
229 * to the sg array so that we can have one page ref cleanup path.
230 *
231 * For now we have no flag that tells us whether the mapping is
232 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
233 * the zero page.
234 */
235 ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
236 if (ret < 0)
237 goto out;
238
239 nents = ret;
240 sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
241 if (sg == NULL) {
242 ret = -ENOMEM;
243 goto out;
244 }
245 WARN_ON(!nents);
246 sg_init_table(sg, nents);
247
248 /* Stick all pages into the scatterlist */
249 for (i = 0 ; i < nents; i++)
250 sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
251
252 rdsdebug("RDS: trans_private nents is %u\n", nents);
253
254 /* Obtain a transport specific MR. If this succeeds, the
255 * s/g list is now owned by the MR.
256 * Note that dma_map() implies that pending writes are
257 * flushed to RAM, so no dma_sync is needed here. */
258 trans_private = rs->rs_transport->get_mr(sg, nents, rs,
259 &mr->r_key);
260
261 if (IS_ERR(trans_private)) {
262 for (i = 0 ; i < nents; i++)
263 put_page(sg_page(&sg[i]));
264 kfree(sg);
265 ret = PTR_ERR(trans_private);
266 goto out;
267 }
268
269 mr->r_trans_private = trans_private;
270
271 rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n",
272 mr->r_key, (void *)(unsigned long) args->cookie_addr);
273
274 /* The user may pass us an unaligned address, but we can only
275 * map page aligned regions. So we keep the offset, and build
276 * a 64bit cookie containing <R_Key, offset> and pass that
277 * around. */
278 cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
279 if (cookie_ret)
280 *cookie_ret = cookie;
281
282 if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
283 ret = -EFAULT;
284 goto out;
285 }
286
287 /* Inserting the new MR into the rbtree bumps its
288 * reference count. */
289 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
290 found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
291 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
292
293 BUG_ON(found && found != mr);
294
295 rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
296 if (mr_ret) {
297 atomic_inc(&mr->r_refcount);
298 *mr_ret = mr;
299 }
300
301 ret = 0;
302out:
303 kfree(pages);
304 if (mr)
305 rds_mr_put(mr);
306 return ret;
307}
308
309int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
310{
311 struct rds_get_mr_args args;
312
313 if (optlen != sizeof(struct rds_get_mr_args))
314 return -EINVAL;
315
316 if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval,
317 sizeof(struct rds_get_mr_args)))
318 return -EFAULT;
319
320 return __rds_rdma_map(rs, &args, NULL, NULL);
321}
322
323/*
324 * Free the MR indicated by the given R_Key
325 */
326int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
327{
328 struct rds_free_mr_args args;
329 struct rds_mr *mr;
330 unsigned long flags;
331
332 if (optlen != sizeof(struct rds_free_mr_args))
333 return -EINVAL;
334
335 if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
336 sizeof(struct rds_free_mr_args)))
337 return -EFAULT;
338
339 /* Special case - a null cookie means flush all unused MRs */
340 if (args.cookie == 0) {
341 if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
342 return -EINVAL;
343 rs->rs_transport->flush_mrs();
344 return 0;
345 }
346
347 /* Look up the MR given its R_key and remove it from the rbtree
348 * so nobody else finds it.
349 * This should also prevent races with rds_rdma_unuse.
350 */
351 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
352 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL);
353 if (mr) {
354 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
355 RB_CLEAR_NODE(&mr->r_rb_node);
356 if (args.flags & RDS_RDMA_INVALIDATE)
357 mr->r_invalidate = 1;
358 }
359 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
360
361 if (!mr)
362 return -EINVAL;
363
364 /*
365 * call rds_destroy_mr() ourselves so that we're sure it's done by the time
366 * we return. If we let rds_mr_put() do it, it might not happen until
367 * someone else drops their ref.
368 */
369 rds_destroy_mr(mr);
370 rds_mr_put(mr);
371 return 0;
372}
373
374/*
375 * This is called when we receive an extension header that
376 * tells us this MR was used. It allows us to implement
377 * use_once semantics
378 */
379void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
380{
381 struct rds_mr *mr;
382 unsigned long flags;
383 int zot_me = 0;
384
385 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
386 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
387 if (mr && (mr->r_use_once || force)) {
388 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
389 RB_CLEAR_NODE(&mr->r_rb_node);
390 zot_me = 1;
391 } else if (mr)
392 atomic_inc(&mr->r_refcount);
393 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
394
395 /* May have to issue a dma_sync on this memory region.
396 * Note we could avoid this if the operation was a RDMA READ,
397 * but at this point we can't tell. */
398 if (mr != NULL) {
399 if (mr->r_trans->sync_mr)
400 mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
401
402 /* If the MR was marked as invalidate, this will
403 * trigger an async flush. */
404 if (zot_me)
405 rds_destroy_mr(mr);
406 rds_mr_put(mr);
407 }
408}
409
410void rds_rdma_free_op(struct rds_rdma_op *ro)
411{
412 unsigned int i;
413
414 for (i = 0; i < ro->r_nents; i++) {
415 struct page *page = sg_page(&ro->r_sg[i]);
416
417 /* Mark page dirty if it was possibly modified, which
418 * is the case for a RDMA_READ which copies from remote
419 * to local memory */
420 if (!ro->r_write)
421 set_page_dirty(page);
422 put_page(page);
423 }
424
425 kfree(ro->r_notifier);
426 kfree(ro);
427}
428
429/*
430 * args is a pointer to an in-kernel copy in the sendmsg cmsg.
431 */
432static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
433 struct rds_rdma_args *args)
434{
435 struct rds_iovec vec;
436 struct rds_rdma_op *op = NULL;
437 unsigned int nr_pages;
438 unsigned int max_pages;
439 unsigned int nr_bytes;
440 struct page **pages = NULL;
441 struct rds_iovec __user *local_vec;
442 struct scatterlist *sg;
443 unsigned int nr;
444 unsigned int i, j;
445 int ret;
446
447
448 if (rs->rs_bound_addr == 0) {
449 ret = -ENOTCONN; /* XXX not a great errno */
450 goto out;
451 }
452
453 if (args->nr_local > (u64)UINT_MAX) {
454 ret = -EMSGSIZE;
455 goto out;
456 }
457
458 nr_pages = 0;
459 max_pages = 0;
460
461 local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
462
463 /* figure out the number of pages in the vector */
464 for (i = 0; i < args->nr_local; i++) {
465 if (copy_from_user(&vec, &local_vec[i],
466 sizeof(struct rds_iovec))) {
467 ret = -EFAULT;
468 goto out;
469 }
470
471 nr = rds_pages_in_vec(&vec);
472 if (nr == 0) {
473 ret = -EINVAL;
474 goto out;
475 }
476
477 max_pages = max(nr, max_pages);
478 nr_pages += nr;
479 }
480
481 pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
482 if (pages == NULL) {
483 ret = -ENOMEM;
484 goto out;
485 }
486
487 op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
488 if (op == NULL) {
489 ret = -ENOMEM;
490 goto out;
491 }
492
493 op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
494 op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
495 op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
496 op->r_recverr = rs->rs_recverr;
497 WARN_ON(!nr_pages);
498 sg_init_table(op->r_sg, nr_pages);
499
500 if (op->r_notify || op->r_recverr) {
501 /* We allocate an uninitialized notifier here, because
502 * we don't want to do that in the completion handler. We
503 * would have to use GFP_ATOMIC there, and don't want to deal
504 * with failed allocations.
505 */
506 op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
507 if (!op->r_notifier) {
508 ret = -ENOMEM;
509 goto out;
510 }
511 op->r_notifier->n_user_token = args->user_token;
512 op->r_notifier->n_status = RDS_RDMA_SUCCESS;
513 }
514
515 /* The cookie contains the R_Key of the remote memory region, and
516 * optionally an offset into it. This is how we implement RDMA into
517 * unaligned memory.
518 * When setting up the RDMA, we need to add that offset to the
519 * destination address (which is really an offset into the MR)
520 * FIXME: We may want to move this into ib_rdma.c
521 */
522 op->r_key = rds_rdma_cookie_key(args->cookie);
523 op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
524
525 nr_bytes = 0;
526
527 rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
528 (unsigned long long)args->nr_local,
529 (unsigned long long)args->remote_vec.addr,
530 op->r_key);
531
532 for (i = 0; i < args->nr_local; i++) {
533 if (copy_from_user(&vec, &local_vec[i],
534 sizeof(struct rds_iovec))) {
535 ret = -EFAULT;
536 goto out;
537 }
538
539 nr = rds_pages_in_vec(&vec);
540 if (nr == 0) {
541 ret = -EINVAL;
542 goto out;
543 }
544
545 rs->rs_user_addr = vec.addr;
546 rs->rs_user_bytes = vec.bytes;
547
548 /* did the user change the vec under us? */
549 if (nr > max_pages || op->r_nents + nr > nr_pages) {
550 ret = -EINVAL;
551 goto out;
552 }
553 /* An RDMA WRITE reads from local memory, so pin the pages for reading.
554 * An RDMA READ writes into local memory, so pin the pages for writing.
555 */
556 ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
557 if (ret < 0)
558 goto out;
559
560 rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n",
561 nr_bytes, nr, vec.bytes, vec.addr);
562
563 nr_bytes += vec.bytes;
564
565 for (j = 0; j < nr; j++) {
566 unsigned int offset = vec.addr & ~PAGE_MASK;
567
568 sg = &op->r_sg[op->r_nents + j];
569 sg_set_page(sg, pages[j],
570 min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
571 offset);
572
573 rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n",
574 sg->offset, sg->length, vec.addr, vec.bytes);
575
576 vec.addr += sg->length;
577 vec.bytes -= sg->length;
578 }
579
580 op->r_nents += nr;
581 }
582
583
584 if (nr_bytes > args->remote_vec.bytes) {
585 rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
586 nr_bytes,
587 (unsigned int) args->remote_vec.bytes);
588 ret = -EINVAL;
589 goto out;
590 }
591 op->r_bytes = nr_bytes;
592
593 ret = 0;
594out:
595 kfree(pages);
596 if (ret) {
597 if (op)
598 rds_rdma_free_op(op);
599 op = ERR_PTR(ret);
600 }
601 return op;
602}
603
604/*
605 * The application asks for a RDMA transfer.
606 * Extract all arguments and set up the rdma_op
607 */
608int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
609 struct cmsghdr *cmsg)
610{
611 struct rds_rdma_op *op;
612
613 if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
614 || rm->m_rdma_op != NULL)
615 return -EINVAL;
616
617 op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
618 if (IS_ERR(op))
619 return PTR_ERR(op);
620 rds_stats_inc(s_send_rdma);
621 rm->m_rdma_op = op;
622 return 0;
623}
624
625/*
626 * The application wants us to pass an RDMA destination (aka MR)
627 * to the remote
628 */
629int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
630 struct cmsghdr *cmsg)
631{
632 unsigned long flags;
633 struct rds_mr *mr;
634 u32 r_key;
635 int err = 0;
636
637 if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t))
638 || rm->m_rdma_cookie != 0)
639 return -EINVAL;
640
641 memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
642
643 /* We are reusing a previously mapped MR here. Most likely, the
644 * application has written to the buffer, so we need to explicitly
645 * flush those writes to RAM. Otherwise the HCA may not see them
646 * when doing a DMA from that buffer.
647 */
648 r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
649
650 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
651 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
652 if (mr == NULL)
653 err = -EINVAL; /* invalid r_key */
654 else
655 atomic_inc(&mr->r_refcount);
656 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
657
658 if (mr) {
659 mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
660 rm->m_rdma_mr = mr;
661 }
662 return err;
663}
664
665/*
666 * The application passes us an address range it wants to enable RDMA
667 * to/from. We map the area, and save the <R_Key,offset> pair
668 * in rm->m_rdma_cookie. This causes it to be sent along to the peer
669 * in an extension header.
670 */
671int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
672 struct cmsghdr *cmsg)
673{
674 if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args))
675 || rm->m_rdma_cookie != 0)
676 return -EINVAL;
677
678 return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
679}
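
The page count computed by rds_pages_in_vec() is the index of the page just past the last byte minus the index of the first page. A quick userspace check of that arithmetic, assuming 4K pages; PAGE_SHIFT_4K and the example addresses are made up for the demo:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT_4K 12
#define PAGE_SIZE_4K  (1UL << PAGE_SHIFT_4K)

static unsigned int pages_in_vec(uint64_t addr, uint64_t bytes)
{
    if (addr + bytes <= addr || bytes > UINT32_MAX)     /* wrap / too large */
        return 0;

    return ((addr + bytes + PAGE_SIZE_4K - 1) >> PAGE_SHIFT_4K) -
            (addr >> PAGE_SHIFT_4K);
}

int main(void)
{
    /* 100 bytes inside one page */
    printf("%u\n", pages_in_vec(0x1000, 100));          /* -> 1 */
    /* 100 bytes straddling a page boundary */
    printf("%u\n", pages_in_vec(0x1fe0, 100));          /* -> 2 */
    /* exactly one page, page aligned */
    printf("%u\n", pages_in_vec(0x2000, 4096));         /* -> 1 */
    return 0;
}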
diff --git a/net/rds/rdma.h b/net/rds/rdma.h
new file mode 100644
index 000000000000..425512098b0b
--- /dev/null
+++ b/net/rds/rdma.h
@@ -0,0 +1,84 @@
1#ifndef _RDS_RDMA_H
2#define _RDS_RDMA_H
3
4#include <linux/rbtree.h>
5#include <linux/spinlock.h>
6#include <linux/scatterlist.h>
7
8#include "rds.h"
9
10struct rds_mr {
11 struct rb_node r_rb_node;
12 atomic_t r_refcount;
13 u32 r_key;
14
15 /* A copy of the creation flags */
16 unsigned int r_use_once:1;
17 unsigned int r_invalidate:1;
18 unsigned int r_write:1;
19
20 /* This is for RDS_MR_DEAD.
21 * It would be nice & consistent to make this part of the above
22 * bit field here, but we need to use test_and_set_bit.
23 */
24 unsigned long r_state;
25 struct rds_sock *r_sock; /* back pointer to the socket that owns us */
26 struct rds_transport *r_trans;
27 void *r_trans_private;
28};
29
30/* Flags for mr->r_state */
31#define RDS_MR_DEAD 0
32
33struct rds_rdma_op {
34 u32 r_key;
35 u64 r_remote_addr;
36 unsigned int r_write:1;
37 unsigned int r_fence:1;
38 unsigned int r_notify:1;
39 unsigned int r_recverr:1;
40 unsigned int r_mapped:1;
41 struct rds_notifier *r_notifier;
42 unsigned int r_bytes;
43 unsigned int r_nents;
44 unsigned int r_count;
45 struct scatterlist r_sg[0];
46};
47
48static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
49{
50 return r_key | (((u64) offset) << 32);
51}
52
53static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
54{
55 return cookie;
56}
57
58static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
59{
60 return cookie >> 32;
61}
62
63int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
64int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
65void rds_rdma_drop_keys(struct rds_sock *rs);
66int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
67 struct cmsghdr *cmsg);
68int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
69 struct cmsghdr *cmsg);
70int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
71 struct cmsghdr *cmsg);
72int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
73 struct cmsghdr *cmsg);
74void rds_rdma_free_op(struct rds_rdma_op *ro);
75void rds_rdma_send_complete(struct rds_message *rm, int);
76
77extern void __rds_put_mr_final(struct rds_mr *mr);
78static inline void rds_mr_put(struct rds_mr *mr)
79{
80 if (atomic_dec_and_test(&mr->r_refcount))
81 __rds_put_mr_final(mr);
82}
83
84#endif
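
The cookie helpers above pack the R_Key into the low 32 bits and the byte offset into the mapping into the high 32 bits of a u64. A userspace sketch of the same packing; demo_rdma_cookie_t and the example values are illustrative, not RDS symbols:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t demo_rdma_cookie_t;

static demo_rdma_cookie_t make_cookie(uint32_t r_key, uint32_t offset)
{
    return r_key | ((uint64_t)offset << 32);
}

static uint32_t cookie_key(demo_rdma_cookie_t c)    { return (uint32_t)c; }
static uint32_t cookie_offset(demo_rdma_cookie_t c) { return (uint32_t)(c >> 32); }

int main(void)
{
    /* e.g. a user buffer whose offset within its page is 0x1b0 */
    demo_rdma_cookie_t c = make_cookie(0xdeadbeef, 0x1b0);

    printf("cookie %#llx key %#x offset %#x\n",
           (unsigned long long)c, cookie_key(c), cookie_offset(c));
    return 0;
}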
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
new file mode 100644
index 000000000000..7b19024f9706
--- /dev/null
+++ b/net/rds/rdma_transport.c
@@ -0,0 +1,214 @@
1/*
2 * Copyright (c) 2009 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <rdma/rdma_cm.h>
34
35#include "rdma_transport.h"
36
37static struct rdma_cm_id *rds_iw_listen_id;
38
39int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
40 struct rdma_cm_event *event)
41{
42 /* this can be null in the listening path */
43 struct rds_connection *conn = cm_id->context;
44 struct rds_transport *trans;
45 int ret = 0;
46
47 rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
48 event->event);
49
50 if (cm_id->device->node_type == RDMA_NODE_RNIC)
51 trans = &rds_iw_transport;
52 else
53 trans = &rds_ib_transport;
54
55 /* Prevent shutdown from tearing down the connection
56 * while we're executing. */
57 if (conn) {
58 mutex_lock(&conn->c_cm_lock);
59
60 /* If the connection is being shut down, bail out
61 * right away. We return 0 so cm_id doesn't get
62 * destroyed prematurely */
63 if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) {
64 /* Reject incoming connections while we're tearing
65 * down an existing one. */
66 if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
67 ret = 1;
68 goto out;
69 }
70 }
71
72 switch (event->event) {
73 case RDMA_CM_EVENT_CONNECT_REQUEST:
74 ret = trans->cm_handle_connect(cm_id, event);
75 break;
76
77 case RDMA_CM_EVENT_ADDR_RESOLVED:
78 /* XXX do we need to clean up if this fails? */
79 ret = rdma_resolve_route(cm_id,
80 RDS_RDMA_RESOLVE_TIMEOUT_MS);
81 break;
82
83 case RDMA_CM_EVENT_ROUTE_RESOLVED:
84 /* XXX worry about racing with listen acceptance */
85 ret = trans->cm_initiate_connect(cm_id);
86 break;
87
88 case RDMA_CM_EVENT_ESTABLISHED:
89 trans->cm_connect_complete(conn, event);
90 break;
91
92 case RDMA_CM_EVENT_ADDR_ERROR:
93 case RDMA_CM_EVENT_ROUTE_ERROR:
94 case RDMA_CM_EVENT_CONNECT_ERROR:
95 case RDMA_CM_EVENT_UNREACHABLE:
96 case RDMA_CM_EVENT_REJECTED:
97 case RDMA_CM_EVENT_DEVICE_REMOVAL:
98 case RDMA_CM_EVENT_ADDR_CHANGE:
99 if (conn)
100 rds_conn_drop(conn);
101 break;
102
103 case RDMA_CM_EVENT_DISCONNECTED:
104 printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection "
105 "%pI4->%pI4\n", &conn->c_laddr,
106 &conn->c_faddr);
107 rds_conn_drop(conn);
108 break;
109
110 default:
111 /* things like device disconnect? */
112 printk(KERN_ERR "unknown event %u\n", event->event);
113 BUG();
114 break;
115 }
116
117out:
118 if (conn)
119 mutex_unlock(&conn->c_cm_lock);
120
121 rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
122
123 return ret;
124}
125
126static int __init rds_rdma_listen_init(void)
127{
128 struct sockaddr_in sin;
129 struct rdma_cm_id *cm_id;
130 int ret;
131
132 cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
133 if (IS_ERR(cm_id)) {
134 ret = PTR_ERR(cm_id);
135 printk(KERN_ERR "RDS/IW: failed to setup listener, "
136 "rdma_create_id() returned %d\n", ret);
137 goto out;
138 }
139
140 sin.sin_family = PF_INET;
141 sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
142 sin.sin_port = (__force u16)htons(RDS_PORT);
143
144 /*
145 * XXX I bet this binds the cm_id to a device. If we want to support
146 * fail-over we'll have to take this into consideration.
147 */
148 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
149 if (ret) {
150 printk(KERN_ERR "RDS/IW: failed to setup listener, "
151 "rdma_bind_addr() returned %d\n", ret);
152 goto out;
153 }
154
155 ret = rdma_listen(cm_id, 128);
156 if (ret) {
157 printk(KERN_ERR "RDS/IW: failed to setup listener, "
158 "rdma_listen() returned %d\n", ret);
159 goto out;
160 }
161
162 rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
163
164 rds_iw_listen_id = cm_id;
165 cm_id = NULL;
166out:
167 if (cm_id)
168 rdma_destroy_id(cm_id);
169 return ret;
170}
171
172static void rds_rdma_listen_stop(void)
173{
174 if (rds_iw_listen_id) {
175 rdsdebug("cm %p\n", rds_iw_listen_id);
176 rdma_destroy_id(rds_iw_listen_id);
177 rds_iw_listen_id = NULL;
178 }
179}
180
181int __init rds_rdma_init(void)
182{
183 int ret;
184
185 ret = rds_rdma_listen_init();
186 if (ret)
187 goto out;
188
189 ret = rds_iw_init();
190 if (ret)
191 goto err_iw_init;
192
193 ret = rds_ib_init();
194 if (ret)
195 goto err_ib_init;
196
197 goto out;
198
199err_ib_init:
200 rds_iw_exit();
201err_iw_init:
202 rds_rdma_listen_stop();
203out:
204 return ret;
205}
206
207void rds_rdma_exit(void)
208{
209 /* stop listening first to ensure no new connections are attempted */
210 rds_rdma_listen_stop();
211 rds_ib_exit();
212 rds_iw_exit();
213}
214
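rds_rdma_init() above uses the usual goto-based unwind: each step that fails jumps to a label that undoes only the steps that already succeeded, in reverse order. A userspace sketch of that shape with dummy step functions; all names here are placeholders, not RDS symbols:

#include <stdio.h>

static int  listen_init(void) { printf("listener up\n");    return 0; }
static void listen_stop(void) { printf("listener down\n"); }
static int  iw_init(void)     { printf("iw up\n");          return 0; }
static void iw_exit(void)     { printf("iw down\n"); }
static int  ib_init(void)     { printf("ib init fails\n");  return -1; }

static int demo_init(void)
{
    int ret;

    ret = listen_init();
    if (ret)
        goto out;

    ret = iw_init();
    if (ret)
        goto err_iw_init;

    ret = ib_init();
    if (ret)
        goto err_ib_init;

    goto out;               /* full success */

err_ib_init:
    iw_exit();              /* ib failed: undo iw ... */
err_iw_init:
    listen_stop();          /* ... and the listener */
out:
    return ret;
}

int main(void)
{
    return demo_init() ? 1 : 0;
}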
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
new file mode 100644
index 000000000000..2f2c7d976c21
--- /dev/null
+++ b/net/rds/rdma_transport.h
@@ -0,0 +1,28 @@
1#ifndef _RDMA_TRANSPORT_H
2#define _RDMA_TRANSPORT_H
3
4#include <rdma/ib_verbs.h>
5#include <rdma/rdma_cm.h>
6#include "rds.h"
7
8#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000
9
10int rds_rdma_conn_connect(struct rds_connection *conn);
11int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
12 struct rdma_cm_event *event);
13
14/* from rdma_transport.c */
15int rds_rdma_init(void);
16void rds_rdma_exit(void);
17
18/* from ib.c */
19extern struct rds_transport rds_ib_transport;
20int rds_ib_init(void);
21void rds_ib_exit(void);
22
23/* from iw.c */
24extern struct rds_transport rds_iw_transport;
25int rds_iw_init(void);
26void rds_iw_exit(void);
27
28#endif
diff --git a/net/rds/rds.h b/net/rds/rds.h
new file mode 100644
index 000000000000..060400704979
--- /dev/null
+++ b/net/rds/rds.h
@@ -0,0 +1,686 @@
1#ifndef _RDS_RDS_H
2#define _RDS_RDS_H
3
4#include <net/sock.h>
5#include <linux/scatterlist.h>
6#include <linux/highmem.h>
7#include <rdma/rdma_cm.h>
8#include <linux/mutex.h>
9#include <linux/rds.h>
10
11#include "info.h"
12
13/*
14 * RDS Network protocol version
15 */
16#define RDS_PROTOCOL_3_0 0x0300
17#define RDS_PROTOCOL_3_1 0x0301
18#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1
19#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8)
20#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
21#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
22
23/*
24 * XXX randomly chosen, but at least seems to be unused:
25 * # 18464-18768 Unassigned
26 * We should do better. We want a reserved port to discourage unpriv'ed
27 * userspace from listening.
28 */
29#define RDS_PORT 18634
30
31#ifdef DEBUG
32#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
33#else
34/* sigh, pr_debug() causes unused variable warnings */
35static inline void __attribute__ ((format (printf, 1, 2)))
36rdsdebug(char *fmt, ...)
37{
38}
39#endif
40
41/* XXX is there one of these somewhere? */
42#define ceil(x, y) \
43 ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; })
44
45#define RDS_FRAG_SHIFT 12
46#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
47
48#define RDS_CONG_MAP_BYTES (65536 / 8)
49#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long))
50#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
51#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
52
53struct rds_cong_map {
54 struct rb_node m_rb_node;
55 __be32 m_addr;
56 wait_queue_head_t m_waitq;
57 struct list_head m_conn_list;
58 unsigned long m_page_addrs[RDS_CONG_MAP_PAGES];
59};
60
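The sizing above gives the congestion map one bit per 16-bit RDS port: 65536 bits is 8192 bytes, which is two pages on a 4K-page machine. A quick check of that arithmetic, with the 4K page size assumed for the demo:

#include <stdio.h>

int main(void)
{
    unsigned long page_size = 4096;
    unsigned long map_bytes = 65536 / 8;                    /* one bit per port */
    unsigned long map_longs = map_bytes / sizeof(unsigned long);
    unsigned long map_pages = (map_bytes + page_size - 1) / page_size;

    printf("bytes %lu longs %lu pages %lu\n", map_bytes, map_longs, map_pages);
    return 0;
}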
61
62/*
63 * This is how we will track the connection state:
64 * A connection is always in one of the following
65 * states. Updates to the state are atomic and imply
66 * a memory barrier.
67 */
68enum {
69 RDS_CONN_DOWN = 0,
70 RDS_CONN_CONNECTING,
71 RDS_CONN_DISCONNECTING,
72 RDS_CONN_UP,
73 RDS_CONN_ERROR,
74};
75
76/* Bits for c_flags */
77#define RDS_LL_SEND_FULL 0
78#define RDS_RECONNECT_PENDING 1
79
80struct rds_connection {
81 struct hlist_node c_hash_node;
82 __be32 c_laddr;
83 __be32 c_faddr;
84 unsigned int c_loopback:1;
85 struct rds_connection *c_passive;
86
87 struct rds_cong_map *c_lcong;
88 struct rds_cong_map *c_fcong;
89
90 struct mutex c_send_lock; /* protect send ring */
91 struct rds_message *c_xmit_rm;
92 unsigned long c_xmit_sg;
93 unsigned int c_xmit_hdr_off;
94 unsigned int c_xmit_data_off;
95 unsigned int c_xmit_rdma_sent;
96
97 spinlock_t c_lock; /* protect msg queues */
98 u64 c_next_tx_seq;
99 struct list_head c_send_queue;
100 struct list_head c_retrans;
101
102 u64 c_next_rx_seq;
103
104 struct rds_transport *c_trans;
105 void *c_transport_data;
106
107 atomic_t c_state;
108 unsigned long c_flags;
109 unsigned long c_reconnect_jiffies;
110 struct delayed_work c_send_w;
111 struct delayed_work c_recv_w;
112 struct delayed_work c_conn_w;
113 struct work_struct c_down_w;
114 struct mutex c_cm_lock; /* protect conn state & cm */
115
116 struct list_head c_map_item;
117 unsigned long c_map_queued;
118 unsigned long c_map_offset;
119 unsigned long c_map_bytes;
120
121 unsigned int c_unacked_packets;
122 unsigned int c_unacked_bytes;
123
124 /* Protocol version */
125 unsigned int c_version;
126};
127
128#define RDS_FLAG_CONG_BITMAP 0x01
129#define RDS_FLAG_ACK_REQUIRED 0x02
130#define RDS_FLAG_RETRANSMITTED 0x04
131#define RDS_MAX_ADV_CREDIT 127
132
133/*
134 * Maximum space available for extension headers.
135 */
136#define RDS_HEADER_EXT_SPACE 16
137
138struct rds_header {
139 __be64 h_sequence;
140 __be64 h_ack;
141 __be32 h_len;
142 __be16 h_sport;
143 __be16 h_dport;
144 u8 h_flags;
145 u8 h_credit;
146 u8 h_padding[4];
147 __sum16 h_csum;
148
149 u8 h_exthdr[RDS_HEADER_EXT_SPACE];
150};
151
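With natural alignment every field of struct rds_header above lands on an aligned offset, so there is no hidden padding and the wire header is 48 bytes, which is the sizeof(struct rds_header) the transports add to the payload length. A userspace mirror of the layout using fixed-width types, purely to make that concrete; demo_rds_header is a demo type, not the kernel's:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define DEMO_HEADER_EXT_SPACE 16

struct demo_rds_header {
    uint64_t h_sequence;        /* big-endian on the wire */
    uint64_t h_ack;
    uint32_t h_len;
    uint16_t h_sport;
    uint16_t h_dport;
    uint8_t  h_flags;
    uint8_t  h_credit;
    uint8_t  h_padding[4];
    uint16_t h_csum;
    uint8_t  h_exthdr[DEMO_HEADER_EXT_SPACE];
};

_Static_assert(sizeof(struct demo_rds_header) == 48, "48-byte wire header");
_Static_assert(offsetof(struct demo_rds_header, h_exthdr) == 32, "exthdr at 32");

int main(void)
{
    printf("header is %zu bytes\n", sizeof(struct demo_rds_header));
    return 0;
}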
152/*
153 * Reserved - indicates end of extensions
154 */
155#define RDS_EXTHDR_NONE 0
156
157/*
158 * This extension header is included in the very
159 * first message that is sent on a new connection,
160 * and identifies the protocol level. This will help
161 * rolling updates if a future change requires breaking
162 * the protocol.
163 * NB: This is no longer true for IB, where we do a version
164 * negotiation during the connection setup phase (protocol
165 * version information is included in the RDMA CM private data).
166 */
167#define RDS_EXTHDR_VERSION 1
168struct rds_ext_header_version {
169 __be32 h_version;
170};
171
172/*
173 * This extension header is included in the RDS message
174 * chasing an RDMA operation.
175 */
176#define RDS_EXTHDR_RDMA 2
177struct rds_ext_header_rdma {
178 __be32 h_rdma_rkey;
179};
180
181/*
182 * This extension header tells the peer about the
183 * destination <R_Key,offset> of the requested RDMA
184 * operation.
185 */
186#define RDS_EXTHDR_RDMA_DEST 3
187struct rds_ext_header_rdma_dest {
188 __be32 h_rdma_rkey;
189 __be32 h_rdma_offset;
190};
191
192#define __RDS_EXTHDR_MAX 16 /* for now */
193
194struct rds_incoming {
195 atomic_t i_refcount;
196 struct list_head i_item;
197 struct rds_connection *i_conn;
198 struct rds_header i_hdr;
199 unsigned long i_rx_jiffies;
200 __be32 i_saddr;
201
202 rds_rdma_cookie_t i_rdma_cookie;
203};
204
205/*
206 * m_sock_item and m_conn_item are on lists that are serialized under
207 * conn->c_lock. m_sock_item has additional meaning in that once it is empty
208 * the message will not be put back on the retransmit list after being sent.
209 * messages that are canceled while being sent rely on this.
210 *
211 * m_inc is used by loopback so that it can pass an incoming message straight
212 * back up into the rx path. It embeds a wire header which is also used by
213 * the send path, which is kind of awkward.
214 *
215 * m_sock_item indicates the message's presence on a socket's send or receive
216 * queue. m_rs will point to that socket.
217 *
218 * m_daddr is used by cancellation to prune messages to a given destination.
219 *
220 * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
221 * nesting. As paths iterate over messages on a sock, or conn, they must
222 * also lock the conn, or sock, to remove the message from those lists too.
223 * Testing the flag to determine if the message is still on the lists lets
224 * us avoid testing the list_head directly. That means each path can use
225 * the message's list_head to keep it on a local list while juggling locks
226 * without confusing the other path.
227 *
228 * m_ack_seq is an optional field set by transports that need a different
229 * sequence number range to invalidate. They can use this in a callback
230 * that they pass to rds_send_drop_acked() to see if each message has been
231 * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't
232 * had ack_seq set yet.
233 */
234#define RDS_MSG_ON_SOCK 1
235#define RDS_MSG_ON_CONN 2
236#define RDS_MSG_HAS_ACK_SEQ 3
237#define RDS_MSG_ACK_REQUIRED 4
238#define RDS_MSG_RETRANSMITTED 5
239#define RDS_MSG_MAPPED 6
240#define RDS_MSG_PAGEVEC 7
241
242struct rds_message {
243 atomic_t m_refcount;
244 struct list_head m_sock_item;
245 struct list_head m_conn_item;
246 struct rds_incoming m_inc;
247 u64 m_ack_seq;
248 __be32 m_daddr;
249 unsigned long m_flags;
250
251 /* Never access m_rs without holding m_rs_lock.
252 * Lock nesting is
253 * rm->m_rs_lock
254 * -> rs->rs_lock
255 */
256 spinlock_t m_rs_lock;
257 struct rds_sock *m_rs;
258 struct rds_rdma_op *m_rdma_op;
259 rds_rdma_cookie_t m_rdma_cookie;
260 struct rds_mr *m_rdma_mr;
261 unsigned int m_nents;
262 unsigned int m_count;
263 struct scatterlist m_sg[0];
264};
265
266/*
267 * The RDS notifier is used (optionally) to tell the application about
268 * completed RDMA operations. Rather than keeping the whole rds message
269 * around on the queue, we allocate a small notifier that is put on the
270 * socket's notifier_list. Notifications are delivered to the application
271 * through control messages.
272 */
273struct rds_notifier {
274 struct list_head n_list;
275 uint64_t n_user_token;
276 int n_status;
277};
278
279/**
280 * struct rds_transport - transport specific behavioural hooks
281 *
282 * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
283 * part of a message. The caller serializes on the send_sem so this
284 * doesn't need to be reentrant for a given conn. The header must be
285 * sent before the data payload. .xmit must be prepared to send a
286 * message with no data payload. .xmit should return the number of
287 * bytes that were sent down the connection, including header bytes.
288 * Returning 0 tells the caller that it doesn't need to perform any
289 * additional work now. This is usually the case when the transport has
290 * filled the sending queue for its connection and will handle
291 * triggering the rds thread to continue the send when space becomes
292 * available. Returning -EAGAIN tells the caller to retry the send
293 * immediately. Returning -ENOMEM tells the caller to retry the send at
294 * some point in the future.
295 *
296 * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once
297 * it returns the connection can not call rds_recv_incoming().
298 * This will only be called once after conn_connect returns
299 * non-zero success. The caller serializes this with
300 * the send and connecting paths (xmit_* and conn_*). The
301 * transport is responsible for other serialization, including
302 * rds_recv_incoming(). This is called in process context but
303 * should try hard not to block.
304 *
305 * @xmit_cong_map: This asks the transport to send the local bitmap down the
306 * given connection. XXX get a better story about the bitmap
307 * flag and header.
308 */
309
310struct rds_transport {
311 char t_name[TRANSNAMSIZ];
312 struct list_head t_item;
313 struct module *t_owner;
314 unsigned int t_prefer_loopback:1;
315
316 int (*laddr_check)(__be32 addr);
317 int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
318 void (*conn_free)(void *data);
319 int (*conn_connect)(struct rds_connection *conn);
320 void (*conn_shutdown)(struct rds_connection *conn);
321 void (*xmit_prepare)(struct rds_connection *conn);
322 void (*xmit_complete)(struct rds_connection *conn);
323 int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
324 unsigned int hdr_off, unsigned int sg, unsigned int off);
325 int (*xmit_cong_map)(struct rds_connection *conn,
326 struct rds_cong_map *map, unsigned long offset);
327 int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
328 int (*recv)(struct rds_connection *conn);
329 int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
330 size_t size);
331 void (*inc_purge)(struct rds_incoming *inc);
332 void (*inc_free)(struct rds_incoming *inc);
333
334 int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
335 struct rdma_cm_event *event);
336 int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
337 void (*cm_connect_complete)(struct rds_connection *conn,
338 struct rdma_cm_event *event);
339
340 unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
341 unsigned int avail);
342 void (*exit)(void);
343 void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
344 struct rds_sock *rs, u32 *key_ret);
345 void (*sync_mr)(void *trans_private, int direction);
346 void (*free_mr)(void *trans_private, int invalidate);
347 void (*flush_mrs)(void);
348};
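As an illustration of how this ops table is meant to be used (not something this patch adds), a transport module would fill in the hooks and hand the struct to rds_trans_register() from its init function, honouring the .xmit return contract documented above: the number of bytes consumed (header bytes first), 0 when its queue is full and it will reschedule the send worker itself, or -ENOMEM to ask the core to retry later.

/* Hypothetical skeleton -- "example" is not a transport in this patch. */
static int rds_example_xmit(struct rds_connection *conn, struct rds_message *rm,
			    unsigned int hdr_off, unsigned int sg,
			    unsigned int off)
{
	/* A real transport would queue header bytes starting at hdr_off and
	 * then payload starting at sg/off, returning how many bytes it
	 * accepted.  This stub just asks the core to retry later. */
	return -ENOMEM;
}

static struct rds_transport rds_example_transport = {
	.t_name		= "example",
	.t_owner	= THIS_MODULE,
	.xmit		= rds_example_xmit,
	/* conn_alloc, conn_connect, recv, inc_copy_to_user, etc. omitted */
};

The module's init function would then call rds_trans_register(&rds_example_transport) and undo it with rds_trans_unregister() on exit.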
349
350struct rds_sock {
351 struct sock rs_sk;
352
353 u64 rs_user_addr;
354 u64 rs_user_bytes;
355
356 /*
357 * bound_addr used for both incoming and outgoing, no INADDR_ANY
358 * support.
359 */
360 struct rb_node rs_bound_node;
361 __be32 rs_bound_addr;
362 __be32 rs_conn_addr;
363 __be16 rs_bound_port;
364 __be16 rs_conn_port;
365
366 /*
367 * This is only used to communicate the transport between bind and
368 * initiating connections. All other transport use is referenced through
369 * the connection.
370 */
371 struct rds_transport *rs_transport;
372
373 /*
374 * rds_sendmsg caches the conn it used the last time around.
375 * This helps avoid costly lookups.
376 */
377 struct rds_connection *rs_conn;
378
379 /* flag indicating we were congested or not */
380 int rs_congested;
381
382 /* rs_lock protects all these adjacent members before the newline */
383 spinlock_t rs_lock;
384 struct list_head rs_send_queue;
385 u32 rs_snd_bytes;
386 int rs_rcv_bytes;
387 struct list_head rs_notify_queue; /* currently used for failed RDMAs */
388
389 /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
390 * to decide whether the application should be woken up.
391 * If not set, we use rs_cong_track to find out whether a cong map
392 * update arrived.
393 */
394 uint64_t rs_cong_mask;
395 uint64_t rs_cong_notify;
396 struct list_head rs_cong_list;
397 unsigned long rs_cong_track;
398
399 /*
400 * rs_recv_lock protects the receive queue, and is
401 * used to serialize with rds_release.
402 */
403 rwlock_t rs_recv_lock;
404 struct list_head rs_recv_queue;
405
406 /* just for stats reporting */
407 struct list_head rs_item;
408
409 /* these have their own lock */
410 spinlock_t rs_rdma_lock;
411 struct rb_root rs_rdma_keys;
412
413 /* Socket options - in case there will be more */
414 unsigned char rs_recverr,
415 rs_cong_monitor;
416};
417
418static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
419{
420 return container_of(sk, struct rds_sock, rs_sk);
421}
422static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
423{
424 return &rs->rs_sk;
425}
426
427/*
428 * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
429 * to account for overhead. We don't account for overhead; we simply count
430 * payload bytes against the specified value.
431 */
432static inline int rds_sk_sndbuf(struct rds_sock *rs)
433{
434 return rds_rs_to_sk(rs)->sk_sndbuf / 2;
435}
436static inline int rds_sk_rcvbuf(struct rds_sock *rs)
437{
438 return rds_rs_to_sk(rs)->sk_rcvbuf / 2;
439}
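A quick worked example of the halving above, assuming the usual sock_setsockopt() behaviour of doubling the requested buffer size (fd is assumed to be an already-created RDS socket):

	int val = 128 * 1024;	/* ask for 128 KiB of send buffer */

	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
	/* The core stack stores sk_sndbuf = 2 * 128 KiB = 256 KiB to cover
	 * its overhead; rds_sk_sndbuf() halves that again, so
	 * rds_send_queue_rm() in send.c queues at most 128 KiB of payload. */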
440
441struct rds_statistics {
442 uint64_t s_conn_reset;
443 uint64_t s_recv_drop_bad_checksum;
444 uint64_t s_recv_drop_old_seq;
445 uint64_t s_recv_drop_no_sock;
446 uint64_t s_recv_drop_dead_sock;
447 uint64_t s_recv_deliver_raced;
448 uint64_t s_recv_delivered;
449 uint64_t s_recv_queued;
450 uint64_t s_recv_immediate_retry;
451 uint64_t s_recv_delayed_retry;
452 uint64_t s_recv_ack_required;
453 uint64_t s_recv_rdma_bytes;
454 uint64_t s_recv_ping;
455 uint64_t s_send_queue_empty;
456 uint64_t s_send_queue_full;
457 uint64_t s_send_sem_contention;
458 uint64_t s_send_sem_queue_raced;
459 uint64_t s_send_immediate_retry;
460 uint64_t s_send_delayed_retry;
461 uint64_t s_send_drop_acked;
462 uint64_t s_send_ack_required;
463 uint64_t s_send_queued;
464 uint64_t s_send_rdma;
465 uint64_t s_send_rdma_bytes;
466 uint64_t s_send_pong;
467 uint64_t s_page_remainder_hit;
468 uint64_t s_page_remainder_miss;
469 uint64_t s_copy_to_user;
470 uint64_t s_copy_from_user;
471 uint64_t s_cong_update_queued;
472 uint64_t s_cong_update_received;
473 uint64_t s_cong_send_error;
474 uint64_t s_cong_send_blocked;
475};
476
477/* af_rds.c */
478void rds_sock_addref(struct rds_sock *rs);
479void rds_sock_put(struct rds_sock *rs);
480void rds_wake_sk_sleep(struct rds_sock *rs);
481static inline void __rds_wake_sk_sleep(struct sock *sk)
482{
483 wait_queue_head_t *waitq = sk->sk_sleep;
484
485 if (!sock_flag(sk, SOCK_DEAD) && waitq)
486 wake_up(waitq);
487}
488extern wait_queue_head_t rds_poll_waitq;
489
490
491/* bind.c */
492int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
493void rds_remove_bound(struct rds_sock *rs);
494struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
495
496/* cong.c */
497int rds_cong_get_maps(struct rds_connection *conn);
498void rds_cong_add_conn(struct rds_connection *conn);
499void rds_cong_remove_conn(struct rds_connection *conn);
500void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
501void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
502int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
503void rds_cong_queue_updates(struct rds_cong_map *map);
504void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
505int rds_cong_updated_since(unsigned long *recent);
506void rds_cong_add_socket(struct rds_sock *);
507void rds_cong_remove_socket(struct rds_sock *);
508void rds_cong_exit(void);
509struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
510
511/* conn.c */
512int __init rds_conn_init(void);
513void rds_conn_exit(void);
514struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
515 struct rds_transport *trans, gfp_t gfp);
516struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
517 struct rds_transport *trans, gfp_t gfp);
518void rds_conn_destroy(struct rds_connection *conn);
519void rds_conn_reset(struct rds_connection *conn);
520void rds_conn_drop(struct rds_connection *conn);
521void rds_for_each_conn_info(struct socket *sock, unsigned int len,
522 struct rds_info_iterator *iter,
523 struct rds_info_lengths *lens,
524 int (*visitor)(struct rds_connection *, void *),
525 size_t item_len);
526void __rds_conn_error(struct rds_connection *conn, const char *, ...)
527 __attribute__ ((format (printf, 2, 3)));
528#define rds_conn_error(conn, fmt...) \
529 __rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
530
531static inline int
532rds_conn_transition(struct rds_connection *conn, int old, int new)
533{
534 return atomic_cmpxchg(&conn->c_state, old, new) == old;
535}
536
537static inline int
538rds_conn_state(struct rds_connection *conn)
539{
540 return atomic_read(&conn->c_state);
541}
542
543static inline int
544rds_conn_up(struct rds_connection *conn)
545{
546 return atomic_read(&conn->c_state) == RDS_CONN_UP;
547}
548
549static inline int
550rds_conn_connecting(struct rds_connection *conn)
551{
552 return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING;
553}
554
555/* message.c */
556struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
557struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
558 size_t total_len);
559struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
560void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
561 __be16 dport, u64 seq);
562int rds_message_add_extension(struct rds_header *hdr,
563 unsigned int type, const void *data, unsigned int len);
564int rds_message_next_extension(struct rds_header *hdr,
565 unsigned int *pos, void *buf, unsigned int *buflen);
566int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
567int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
568int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
569int rds_message_inc_copy_to_user(struct rds_incoming *inc,
570 struct iovec *first_iov, size_t size);
571void rds_message_inc_purge(struct rds_incoming *inc);
572void rds_message_inc_free(struct rds_incoming *inc);
573void rds_message_addref(struct rds_message *rm);
574void rds_message_put(struct rds_message *rm);
575void rds_message_wait(struct rds_message *rm);
576void rds_message_unmapped(struct rds_message *rm);
577
578static inline void rds_message_make_checksum(struct rds_header *hdr)
579{
580 hdr->h_csum = 0;
581 hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2);
582}
583
584static inline int rds_message_verify_checksum(const struct rds_header *hdr)
585{
586 return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0;
587}
588
589
590/* page.c */
591int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
592 gfp_t gfp);
593int rds_page_copy_user(struct page *page, unsigned long offset,
594 void __user *ptr, unsigned long bytes,
595 int to_user);
596#define rds_page_copy_to_user(page, offset, ptr, bytes) \
597 rds_page_copy_user(page, offset, ptr, bytes, 1)
598#define rds_page_copy_from_user(page, offset, ptr, bytes) \
599 rds_page_copy_user(page, offset, ptr, bytes, 0)
600void rds_page_exit(void);
601
602/* recv.c */
603void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
604 __be32 saddr);
605void rds_inc_addref(struct rds_incoming *inc);
606void rds_inc_put(struct rds_incoming *inc);
607void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
608 struct rds_incoming *inc, gfp_t gfp, enum km_type km);
609int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
610 size_t size, int msg_flags);
611void rds_clear_recv_queue(struct rds_sock *rs);
612int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
613void rds_inc_info_copy(struct rds_incoming *inc,
614 struct rds_info_iterator *iter,
615 __be32 saddr, __be32 daddr, int flip);
616
617/* send.c */
618int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
619 size_t payload_len);
620void rds_send_reset(struct rds_connection *conn);
621int rds_send_xmit(struct rds_connection *conn);
622struct sockaddr_in;
623void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
624typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
625void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
626 is_acked_func is_acked);
627int rds_send_acked_before(struct rds_connection *conn, u64 seq);
628void rds_send_remove_from_sock(struct list_head *messages, int status);
629int rds_send_pong(struct rds_connection *conn, __be16 dport);
630struct rds_message *rds_send_get_message(struct rds_connection *,
631 struct rds_rdma_op *);
632
633/* rdma.c */
634void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
635
636/* stats.c */
637DECLARE_PER_CPU(struct rds_statistics, rds_stats);
638#define rds_stats_inc_which(which, member) do { \
639 per_cpu(which, get_cpu()).member++; \
640 put_cpu(); \
641} while (0)
642#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
643#define rds_stats_add_which(which, member, count) do { \
644 per_cpu(which, get_cpu()).member += count; \
645 put_cpu(); \
646} while (0)
647#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
648int __init rds_stats_init(void);
649void rds_stats_exit(void);
650void rds_stats_info_copy(struct rds_info_iterator *iter,
651 uint64_t *values, char **names, size_t nr);
652
653/* sysctl.c */
654int __init rds_sysctl_init(void);
655void rds_sysctl_exit(void);
656extern unsigned long rds_sysctl_sndbuf_min;
657extern unsigned long rds_sysctl_sndbuf_default;
658extern unsigned long rds_sysctl_sndbuf_max;
659extern unsigned long rds_sysctl_reconnect_min_jiffies;
660extern unsigned long rds_sysctl_reconnect_max_jiffies;
661extern unsigned int rds_sysctl_max_unacked_packets;
662extern unsigned int rds_sysctl_max_unacked_bytes;
663extern unsigned int rds_sysctl_ping_enable;
664extern unsigned long rds_sysctl_trace_flags;
665extern unsigned int rds_sysctl_trace_level;
666
667/* threads.c */
668int __init rds_threads_init(void);
669void rds_threads_exit(void);
670extern struct workqueue_struct *rds_wq;
671void rds_connect_worker(struct work_struct *);
672void rds_shutdown_worker(struct work_struct *);
673void rds_send_worker(struct work_struct *);
674void rds_recv_worker(struct work_struct *);
675void rds_connect_complete(struct rds_connection *conn);
676
677/* transport.c */
678int rds_trans_register(struct rds_transport *trans);
679void rds_trans_unregister(struct rds_transport *trans);
680struct rds_transport *rds_trans_get_preferred(__be32 addr);
681unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
682 unsigned int avail);
683int __init rds_trans_init(void);
684void rds_trans_exit(void);
685
686#endif
diff --git a/net/rds/recv.c b/net/rds/recv.c
new file mode 100644
index 000000000000..f2118c51cfa3
--- /dev/null
+++ b/net/rds/recv.c
@@ -0,0 +1,542 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <net/sock.h>
35#include <linux/in.h>
36
37#include "rds.h"
38#include "rdma.h"
39
40void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
41 __be32 saddr)
42{
43 atomic_set(&inc->i_refcount, 1);
44 INIT_LIST_HEAD(&inc->i_item);
45 inc->i_conn = conn;
46 inc->i_saddr = saddr;
47 inc->i_rdma_cookie = 0;
48}
49
50void rds_inc_addref(struct rds_incoming *inc)
51{
52 rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
53 atomic_inc(&inc->i_refcount);
54}
55
56void rds_inc_put(struct rds_incoming *inc)
57{
58 rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
59 if (atomic_dec_and_test(&inc->i_refcount)) {
60 BUG_ON(!list_empty(&inc->i_item));
61
62 inc->i_conn->c_trans->inc_free(inc);
63 }
64}
65
66static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
67 struct rds_cong_map *map,
68 int delta, __be16 port)
69{
70 int now_congested;
71
72 if (delta == 0)
73 return;
74
75 rs->rs_rcv_bytes += delta;
76 now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
77
78 rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
79 "now_cong %d delta %d\n",
80 rs, &rs->rs_bound_addr,
81 ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
82 rds_sk_rcvbuf(rs), now_congested, delta);
83
84 /* wasn't -> am congested */
85 if (!rs->rs_congested && now_congested) {
86 rs->rs_congested = 1;
87 rds_cong_set_bit(map, port);
88 rds_cong_queue_updates(map);
89 }
90 /* was -> aren't congested */
91 /* Require more free space before reporting uncongested to prevent
92 bouncing cong/uncong state too often */
93 else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
94 rs->rs_congested = 0;
95 rds_cong_clear_bit(map, port);
96 rds_cong_queue_updates(map);
97 }
98
99 /* do nothing if no change in cong state */
100}
101
102/*
103 * Process all extension headers that come with this message.
104 */
105static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
106{
107 struct rds_header *hdr = &inc->i_hdr;
108 unsigned int pos = 0, type, len;
109 union {
110 struct rds_ext_header_version version;
111 struct rds_ext_header_rdma rdma;
112 struct rds_ext_header_rdma_dest rdma_dest;
113 } buffer;
114
115 while (1) {
116 len = sizeof(buffer);
117 type = rds_message_next_extension(hdr, &pos, &buffer, &len);
118 if (type == RDS_EXTHDR_NONE)
119 break;
120 /* Process extension header here */
121 switch (type) {
122 case RDS_EXTHDR_RDMA:
123 rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
124 break;
125
126 case RDS_EXTHDR_RDMA_DEST:
127 /* We ignore the size for now. We could stash it
128 * somewhere and use it for error checking. */
129 inc->i_rdma_cookie = rds_rdma_make_cookie(
130 be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
131 be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
132
133 break;
134 }
135 }
136}
137
138/*
139 * The transport must make sure that this is serialized against other
140 * rx and conn reset on this specific conn.
141 *
142 * We currently assert that only one fragmented message will be sent
143 * down a connection at a time. This lets us reassemble in the conn
144 * instead of per-flow which means that we don't have to go digging through
145 * flows to tear down partial reassembly progress on conn failure and
146 * we save flow lookup and locking for each frag arrival. It does mean
147 * that small messages will wait behind large ones. Fragmenting at all
148 * is only to reduce the memory consumption of pre-posted buffers.
149 *
150 * The caller passes in saddr and daddr instead of us getting it from the
151 * conn. This lets loopback, who only has one conn for both directions,
152 * tell us which roles the addrs in the conn are playing for this message.
153 */
154void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
155 struct rds_incoming *inc, gfp_t gfp, enum km_type km)
156{
157 struct rds_sock *rs = NULL;
158 struct sock *sk;
159 unsigned long flags;
160
161 inc->i_conn = conn;
162 inc->i_rx_jiffies = jiffies;
163
164 rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
165 "flags 0x%x rx_jiffies %lu\n", conn,
166 (unsigned long long)conn->c_next_rx_seq,
167 inc,
168 (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
169 be32_to_cpu(inc->i_hdr.h_len),
170 be16_to_cpu(inc->i_hdr.h_sport),
171 be16_to_cpu(inc->i_hdr.h_dport),
172 inc->i_hdr.h_flags,
173 inc->i_rx_jiffies);
174
175 /*
176 * Sequence numbers should only increase. Messages get their
177 * sequence number as they're queued in a sending conn. They
178 * can be dropped, though, if the sending socket is closed before
179 * they hit the wire. So sequence numbers can skip forward
180 * under normal operation. They can also drop back in the conn
181 * failover case as previously sent messages are resent down the
182 * new instance of a conn. We drop those, otherwise we have
183 * to assume that the next valid seq does not come after a
184 * hole in the fragment stream.
185 *
186 * The headers don't give us a way to realize if fragments of
187 * a message have been dropped. We assume that frags that arrive
188 * to a flow are part of the current message on the flow that is
189 * being reassembled. This means that senders can't drop messages
190 * from the sending conn until all their frags are sent.
191 *
192 * XXX we could spend more on the wire to get more robust failure
193 * detection, arguably worth it to avoid data corruption.
194 */
195 if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq
196 && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
197 rds_stats_inc(s_recv_drop_old_seq);
198 goto out;
199 }
200 conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
201
202 if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
203 rds_stats_inc(s_recv_ping);
204 rds_send_pong(conn, inc->i_hdr.h_sport);
205 goto out;
206 }
207
208 rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
209 if (rs == NULL) {
210 rds_stats_inc(s_recv_drop_no_sock);
211 goto out;
212 }
213
214 /* Process extension headers */
215 rds_recv_incoming_exthdrs(inc, rs);
216
217 /* We can be racing with rds_release() which marks the socket dead. */
218 sk = rds_rs_to_sk(rs);
219
220 /* serialize with rds_release -> sock_orphan */
221 write_lock_irqsave(&rs->rs_recv_lock, flags);
222 if (!sock_flag(sk, SOCK_DEAD)) {
223 rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
224 rds_stats_inc(s_recv_queued);
225 rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
226 be32_to_cpu(inc->i_hdr.h_len),
227 inc->i_hdr.h_dport);
228 rds_inc_addref(inc);
229 list_add_tail(&inc->i_item, &rs->rs_recv_queue);
230 __rds_wake_sk_sleep(sk);
231 } else {
232 rds_stats_inc(s_recv_drop_dead_sock);
233 }
234 write_unlock_irqrestore(&rs->rs_recv_lock, flags);
235
236out:
237 if (rs)
238 rds_sock_put(rs);
239}
240
241/*
242 * be very careful here. This is being called as the condition in
243 * wait_event_*() needs to cope with being called many times.
244 */
245static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
246{
247 unsigned long flags;
248
249 if (*inc == NULL) {
250 read_lock_irqsave(&rs->rs_recv_lock, flags);
251 if (!list_empty(&rs->rs_recv_queue)) {
252 *inc = list_entry(rs->rs_recv_queue.next,
253 struct rds_incoming,
254 i_item);
255 rds_inc_addref(*inc);
256 }
257 read_unlock_irqrestore(&rs->rs_recv_lock, flags);
258 }
259
260 return *inc != NULL;
261}
262
263static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
264 int drop)
265{
266 struct sock *sk = rds_rs_to_sk(rs);
267 int ret = 0;
268 unsigned long flags;
269
270 write_lock_irqsave(&rs->rs_recv_lock, flags);
271 if (!list_empty(&inc->i_item)) {
272 ret = 1;
273 if (drop) {
274 /* XXX make sure this i_conn is reliable */
275 rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
276 -be32_to_cpu(inc->i_hdr.h_len),
277 inc->i_hdr.h_dport);
278 list_del_init(&inc->i_item);
279 rds_inc_put(inc);
280 }
281 }
282 write_unlock_irqrestore(&rs->rs_recv_lock, flags);
283
284 rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
285 return ret;
286}
287
288/*
289 * Pull errors off the error queue.
290 * If msghdr is NULL, we will just purge the error queue.
291 */
292int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
293{
294 struct rds_notifier *notifier;
295 struct rds_rdma_notify cmsg;
296 unsigned int count = 0, max_messages = ~0U;
297 unsigned long flags;
298 LIST_HEAD(copy);
299 int err = 0;
300
301
302 /* put_cmsg copies to user space and thus may sleep. We can't do this
303 * with rs_lock held, so first grab as many notifications as we can stuff
304 * in the user provided cmsg buffer. We don't try to copy more, to avoid
305 * losing notifications - except when the buffer is so small that it wouldn't
306 * even hold a single notification. Then we give the caller as much of this single
307 * msg as we can squeeze in, and set MSG_CTRUNC.
308 */
309 if (msghdr) {
310 max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
311 if (!max_messages)
312 max_messages = 1;
313 }
314
315 spin_lock_irqsave(&rs->rs_lock, flags);
316 while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
317 notifier = list_entry(rs->rs_notify_queue.next,
318 struct rds_notifier, n_list);
319 list_move(&notifier->n_list, &copy);
320 count++;
321 }
322 spin_unlock_irqrestore(&rs->rs_lock, flags);
323
324 if (!count)
325 return 0;
326
327 while (!list_empty(&copy)) {
328 notifier = list_entry(copy.next, struct rds_notifier, n_list);
329
330 if (msghdr) {
331 cmsg.user_token = notifier->n_user_token;
332 cmsg.status = notifier->n_status;
333
334 err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
335 sizeof(cmsg), &cmsg);
336 if (err)
337 break;
338 }
339
340 list_del_init(&notifier->n_list);
341 kfree(notifier);
342 }
343
344 /* If we bailed out because of an error in put_cmsg,
345 * we may be left with one or more notifications that we
346 * didn't process. Return them to the head of the list. */
347 if (!list_empty(&copy)) {
348 spin_lock_irqsave(&rs->rs_lock, flags);
349 list_splice(&copy, &rs->rs_notify_queue);
350 spin_unlock_irqrestore(&rs->rs_lock, flags);
351 }
352
353 return err;
354}
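For reference, this is roughly what the consumer of these notifications could look like in userspace: the entries queued above come back as SOL_RDS / RDS_CMSG_RDMA_STATUS control messages on the next recvmsg(). The sketch assumes the userspace definitions shipped alongside this patch set (SOL_RDS, RDS_CMSG_RDMA_STATUS and struct rds_rdma_notify with its user_token/status fields) are available through <linux/rds.h>.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/rds.h>		/* assumed: userspace header from this patch set */

static void drain_rdma_notifications(int fd)
{
	/* Room for up to 8 notifications; anything beyond that stays queued
	 * for the next call (or is truncated with MSG_CTRUNC if even one
	 * doesn't fit, as described in rds_notify_queue_get() above). */
	char ctl[CMSG_SPACE(sizeof(struct rds_rdma_notify)) * 8];
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = ctl;
	msg.msg_controllen = sizeof(ctl);

	if (recvmsg(fd, &msg, MSG_DONTWAIT) < 0)
		return;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		struct rds_rdma_notify notify;

		if (cmsg->cmsg_level != SOL_RDS ||
		    cmsg->cmsg_type != RDS_CMSG_RDMA_STATUS)
			continue;
		memcpy(&notify, CMSG_DATA(cmsg), sizeof(notify));
		printf("RDMA op %llu completed, status %d\n",
		       (unsigned long long)notify.user_token,
		       (int)notify.status);
	}
}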
355
356/*
357 * Queue a congestion notification
358 */
359static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
360{
361 uint64_t notify = rs->rs_cong_notify;
362 unsigned long flags;
363 int err;
364
365 err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
366 sizeof(notify), &notify);
367 if (err)
368 return err;
369
370 spin_lock_irqsave(&rs->rs_lock, flags);
371 rs->rs_cong_notify &= ~notify;
372 spin_unlock_irqrestore(&rs->rs_lock, flags);
373
374 return 0;
375}
376
377/*
378 * Receive any control messages.
379 */
380static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
381{
382 int ret = 0;
383
384 if (inc->i_rdma_cookie) {
385 ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
386 sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
387 if (ret)
388 return ret;
389 }
390
391 return 0;
392}
393
394int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
395 size_t size, int msg_flags)
396{
397 struct sock *sk = sock->sk;
398 struct rds_sock *rs = rds_sk_to_rs(sk);
399 long timeo;
400 int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
401 struct sockaddr_in *sin;
402 struct rds_incoming *inc = NULL;
403
404 /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
405 timeo = sock_rcvtimeo(sk, nonblock);
406
407 rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
408
409 if (msg_flags & MSG_OOB)
410 goto out;
411
412 /* If there are pending notifications, do those - and nothing else */
413 if (!list_empty(&rs->rs_notify_queue)) {
414 ret = rds_notify_queue_get(rs, msg);
415 goto out;
416 }
417
418 if (rs->rs_cong_notify) {
419 ret = rds_notify_cong(rs, msg);
420 goto out;
421 }
422
423 while (1) {
424 if (!rds_next_incoming(rs, &inc)) {
425 if (nonblock) {
426 ret = -EAGAIN;
427 break;
428 }
429
430 timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
431 rds_next_incoming(rs, &inc),
432 timeo);
433 rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
434 timeo);
435 if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
436 continue;
437
438 ret = timeo;
439 if (ret == 0)
440 ret = -ETIMEDOUT;
441 break;
442 }
443
444 rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
445 &inc->i_conn->c_faddr,
446 ntohs(inc->i_hdr.h_sport));
447 ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
448 size);
449 if (ret < 0)
450 break;
451
452 /*
453 * if the message we just copied isn't at the head of the
454 * recv queue then someone else raced us to return it, try
455 * to get the next message.
456 */
457 if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
458 rds_inc_put(inc);
459 inc = NULL;
460 rds_stats_inc(s_recv_deliver_raced);
461 continue;
462 }
463
464 if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
465 if (msg_flags & MSG_TRUNC)
466 ret = be32_to_cpu(inc->i_hdr.h_len);
467 msg->msg_flags |= MSG_TRUNC;
468 }
469
470 if (rds_cmsg_recv(inc, msg)) {
471 ret = -EFAULT;
472 goto out;
473 }
474
475 rds_stats_inc(s_recv_delivered);
476
477 sin = (struct sockaddr_in *)msg->msg_name;
478 if (sin) {
479 sin->sin_family = AF_INET;
480 sin->sin_port = inc->i_hdr.h_sport;
481 sin->sin_addr.s_addr = inc->i_saddr;
482 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
483 }
484 break;
485 }
486
487 if (inc)
488 rds_inc_put(inc);
489
490out:
491 return ret;
492}
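Seen from userspace, the receive path above is driven through the ordinary socket calls: RDS sockets are created as SOCK_SEQPACKET, addressed with sockaddr_in, and the source address filled in at the end of rds_recvmsg() lands in msg_name. A minimal sketch follows; AF_RDS is assumed to come from the headers added alongside this patch set (with 21 as a fallback guess), and the bound address and port are placeholders for a local interface that one of the registered transports accepts.

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef AF_RDS
#define AF_RDS 21		/* assumption: value used by this patch set */
#endif

int main(void)
{
	struct sockaddr_in sin, from;
	char payload[4096];
	struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
	struct msghdr msg;
	ssize_t len;
	int fd;

	fd = socket(AF_RDS, SOCK_SEQPACKET, 0);
	if (fd < 0)
		return 1;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(18634);			/* arbitrary example port */
	sin.sin_addr.s_addr = inet_addr("192.0.2.10");	/* placeholder local addr */
	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
		return 1;

	memset(&msg, 0, sizeof(msg));
	msg.msg_name = &from;
	msg.msg_namelen = sizeof(from);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	len = recvmsg(fd, &msg, 0);		/* blocks until a datagram arrives */
	if (len >= 0)
		printf("%zd bytes from %s:%u\n", len,
		       inet_ntoa(from.sin_addr), ntohs(from.sin_port));
	return 0;
}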
493
494/*
495 * The socket is being shut down and we're asked to drop messages that were
496 * queued for recvmsg. The caller has unbound the socket so the receive path
497 * won't queue any more incoming fragments or messages on the socket.
498 */
499void rds_clear_recv_queue(struct rds_sock *rs)
500{
501 struct sock *sk = rds_rs_to_sk(rs);
502 struct rds_incoming *inc, *tmp;
503 unsigned long flags;
504
505 write_lock_irqsave(&rs->rs_recv_lock, flags);
506 list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
507 rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
508 -be32_to_cpu(inc->i_hdr.h_len),
509 inc->i_hdr.h_dport);
510 list_del_init(&inc->i_item);
511 rds_inc_put(inc);
512 }
513 write_unlock_irqrestore(&rs->rs_recv_lock, flags);
514}
515
516/*
517 * inc->i_saddr isn't used here because it is only set in the receive
518 * path.
519 */
520void rds_inc_info_copy(struct rds_incoming *inc,
521 struct rds_info_iterator *iter,
522 __be32 saddr, __be32 daddr, int flip)
523{
524 struct rds_info_message minfo;
525
526 minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
527 minfo.len = be32_to_cpu(inc->i_hdr.h_len);
528
529 if (flip) {
530 minfo.laddr = daddr;
531 minfo.faddr = saddr;
532 minfo.lport = inc->i_hdr.h_dport;
533 minfo.fport = inc->i_hdr.h_sport;
534 } else {
535 minfo.laddr = saddr;
536 minfo.faddr = daddr;
537 minfo.lport = inc->i_hdr.h_sport;
538 minfo.fport = inc->i_hdr.h_dport;
539 }
540
541 rds_info_copy(iter, &minfo, sizeof(minfo));
542}
diff --git a/net/rds/send.c b/net/rds/send.c
new file mode 100644
index 000000000000..1b37364656f0
--- /dev/null
+++ b/net/rds/send.c
@@ -0,0 +1,1003 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <net/sock.h>
35#include <linux/in.h>
36#include <linux/list.h>
37
38#include "rds.h"
39#include "rdma.h"
40
41/* When transmitting messages in rds_send_xmit, we need to emerge from
42 * time to time and briefly release the CPU. Otherwise the softlockup watchdog
43 * will kick our shin.
44 * Also, it seems fairer to not let one busy connection stall all the
45 * others.
46 *
47 * send_batch_count is the number of times we'll loop in send_xmit. Setting
48 * it to 0 will restore the old behavior (where we looped until we had
49 * drained the queue).
50 */
51static int send_batch_count = 64;
52module_param(send_batch_count, int, 0444);
53MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
54
55/*
56 * Reset the send state. Caller must hold c_send_lock when calling here.
57 */
58void rds_send_reset(struct rds_connection *conn)
59{
60 struct rds_message *rm, *tmp;
61 unsigned long flags;
62
63 if (conn->c_xmit_rm) {
64 /* Tell the user the RDMA op is no longer mapped by the
65 * transport. This isn't entirely true (it's flushed out
66 * independently) but as the connection is down, there's
67 * no ongoing RDMA to/from that memory */
68 rds_message_unmapped(conn->c_xmit_rm);
69 rds_message_put(conn->c_xmit_rm);
70 conn->c_xmit_rm = NULL;
71 }
72 conn->c_xmit_sg = 0;
73 conn->c_xmit_hdr_off = 0;
74 conn->c_xmit_data_off = 0;
75 conn->c_xmit_rdma_sent = 0;
76
77 conn->c_map_queued = 0;
78
79 conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
80 conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
81
82 /* Mark messages as retransmissions, and move them to the send q */
83 spin_lock_irqsave(&conn->c_lock, flags);
84 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
85 set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
86 set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
87 }
88 list_splice_init(&conn->c_retrans, &conn->c_send_queue);
89 spin_unlock_irqrestore(&conn->c_lock, flags);
90}
91
92/*
93 * We're making the conscious trade-off here to only send one message
94 * down the connection at a time.
95 * Pro:
96 * - tx queueing is a simple fifo list
97 * - reassembly is optional and easily done by transports per conn
98 * - no per flow rx lookup at all, straight to the socket
99 * - less per-frag memory and wire overhead
100 * Con:
101 * - queued acks can be delayed behind large messages
102 * Depends:
103 * - small message latency is higher behind queued large messages
104 * - large message latency isn't starved by intervening small sends
105 */
106int rds_send_xmit(struct rds_connection *conn)
107{
108 struct rds_message *rm;
109 unsigned long flags;
110 unsigned int tmp;
111 unsigned int send_quota = send_batch_count;
112 struct scatterlist *sg;
113 int ret = 0;
114 int was_empty = 0;
115 LIST_HEAD(to_be_dropped);
116
117 /*
118 * sendmsg calls here after having queued its message on the send
119 * queue. We only have one task feeding the connection at a time. If
120 * another thread is already feeding the queue then we back off. This
121 * avoids blocking the caller and trading per-connection data between
122 * caches per message.
123 *
124 * The sem holder will issue a retry if they notice that someone queued
125 * a message after they stopped walking the send queue but before they
126 * dropped the sem.
127 */
128 if (!mutex_trylock(&conn->c_send_lock)) {
129 rds_stats_inc(s_send_sem_contention);
130 ret = -ENOMEM;
131 goto out;
132 }
133
134 if (conn->c_trans->xmit_prepare)
135 conn->c_trans->xmit_prepare(conn);
136
137 /*
138 * spin trying to push headers and data down the connection until
139 * the connection doesn't make forward progress.
140 */
141 while (--send_quota) {
142 /*
143 * See if need to send a congestion map update if we're
144 * between sending messages. The send_sem protects our sole
145 * use of c_map_offset and _bytes.
146 * Note this is used only by transports that define a special
147 * xmit_cong_map function. For all others, we allocate
148 * a cong_map message and treat it just like any other send.
149 */
150 if (conn->c_map_bytes) {
151 ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
152 conn->c_map_offset);
153 if (ret <= 0)
154 break;
155
156 conn->c_map_offset += ret;
157 conn->c_map_bytes -= ret;
158 if (conn->c_map_bytes)
159 continue;
160 }
161
162 /* If we're done sending the current message, clear the
163 * offset and S/G temporaries.
164 */
165 rm = conn->c_xmit_rm;
166 if (rm != NULL &&
167 conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
168 conn->c_xmit_sg == rm->m_nents) {
169 conn->c_xmit_rm = NULL;
170 conn->c_xmit_sg = 0;
171 conn->c_xmit_hdr_off = 0;
172 conn->c_xmit_data_off = 0;
173 conn->c_xmit_rdma_sent = 0;
174
175 /* Release the reference to the previous message. */
176 rds_message_put(rm);
177 rm = NULL;
178 }
179
180 /* If we're asked to send a cong map update, do so.
181 */
182 if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
183 if (conn->c_trans->xmit_cong_map != NULL) {
184 conn->c_map_offset = 0;
185 conn->c_map_bytes = sizeof(struct rds_header) +
186 RDS_CONG_MAP_BYTES;
187 continue;
188 }
189
190 rm = rds_cong_update_alloc(conn);
191 if (IS_ERR(rm)) {
192 ret = PTR_ERR(rm);
193 break;
194 }
195
196 conn->c_xmit_rm = rm;
197 }
198
199 /*
200 * Grab the next message from the send queue, if there is one.
201 *
202 * c_xmit_rm holds a ref while we're sending this message down
203 * the connection. We can use this ref while holding the
204 * send_sem.. rds_send_reset() is serialized with it.
205 */
206 if (rm == NULL) {
207 unsigned int len;
208
209 spin_lock_irqsave(&conn->c_lock, flags);
210
211 if (!list_empty(&conn->c_send_queue)) {
212 rm = list_entry(conn->c_send_queue.next,
213 struct rds_message,
214 m_conn_item);
215 rds_message_addref(rm);
216
217 /*
218 * Move the message from the send queue to the retransmit
219 * list right away.
220 */
221 list_move_tail(&rm->m_conn_item, &conn->c_retrans);
222 }
223
224 spin_unlock_irqrestore(&conn->c_lock, flags);
225
226 if (rm == NULL) {
227 was_empty = 1;
228 break;
229 }
230
231 /* Unfortunately, the way Infiniband deals with
232 * RDMA to a bad MR key is by moving the entire
233 * queue pair to error state. We could possibly
234 * recover from that, but right now we drop the
235 * connection.
236 * Therefore, we never retransmit messages with RDMA ops.
237 */
238 if (rm->m_rdma_op
239 && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
240 spin_lock_irqsave(&conn->c_lock, flags);
241 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
242 list_move(&rm->m_conn_item, &to_be_dropped);
243 spin_unlock_irqrestore(&conn->c_lock, flags);
244 rds_message_put(rm);
245 continue;
246 }
247
248 /* Require an ACK every once in a while */
249 len = ntohl(rm->m_inc.i_hdr.h_len);
250 if (conn->c_unacked_packets == 0
251 || conn->c_unacked_bytes < len) {
252 __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
253
254 conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
255 conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
256 rds_stats_inc(s_send_ack_required);
257 } else {
258 conn->c_unacked_bytes -= len;
259 conn->c_unacked_packets--;
260 }
261
262 conn->c_xmit_rm = rm;
263 }
264
265 /*
266 * Try and send an rdma message. Let's see if we can
267 * keep this simple and require that the transport either
268 * send the whole rdma or none of it.
269 */
270 if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
271 ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
272 if (ret)
273 break;
274 conn->c_xmit_rdma_sent = 1;
275 /* The transport owns the mapped memory for now.
276 * You can't unmap it while it's on the send queue */
277 set_bit(RDS_MSG_MAPPED, &rm->m_flags);
278 }
279
280 if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
281 conn->c_xmit_sg < rm->m_nents) {
282 ret = conn->c_trans->xmit(conn, rm,
283 conn->c_xmit_hdr_off,
284 conn->c_xmit_sg,
285 conn->c_xmit_data_off);
286 if (ret <= 0)
287 break;
288
289 if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
290 tmp = min_t(int, ret,
291 sizeof(struct rds_header) -
292 conn->c_xmit_hdr_off);
293 conn->c_xmit_hdr_off += tmp;
294 ret -= tmp;
295 }
296
297 sg = &rm->m_sg[conn->c_xmit_sg];
298 while (ret) {
299 tmp = min_t(int, ret, sg->length -
300 conn->c_xmit_data_off);
301 conn->c_xmit_data_off += tmp;
302 ret -= tmp;
303 if (conn->c_xmit_data_off == sg->length) {
304 conn->c_xmit_data_off = 0;
305 sg++;
306 conn->c_xmit_sg++;
307 BUG_ON(ret != 0 &&
308 conn->c_xmit_sg == rm->m_nents);
309 }
310 }
311 }
312 }
313
314 /* Nuke any messages we decided not to retransmit. */
315 if (!list_empty(&to_be_dropped))
316 rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
317
318 if (conn->c_trans->xmit_complete)
319 conn->c_trans->xmit_complete(conn);
320
321 /*
322 * We might be racing with another sender who queued a message but
323 * backed off on noticing that we held the c_send_lock. If we check
324 * for queued messages after dropping the sem then either we'll
325 * see the queued message or the queuer will get the sem. If we
326 * notice the queued message then we trigger an immediate retry.
327 *
328 * We need to be careful only to do this when we stopped processing
329 * the send queue because it was empty. It's the only way we
330 * stop processing the loop when the transport hasn't taken
331 * responsibility for forward progress.
332 */
333 mutex_unlock(&conn->c_send_lock);
334
335 if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
336 /* We exhausted the send quota, but there's work left to
337 * do. Return and (re-)schedule the send worker.
338 */
339 ret = -EAGAIN;
340 }
341
342 if (ret == 0 && was_empty) {
343 /* A simple bit test would be way faster than taking the
344 * spin lock */
345 spin_lock_irqsave(&conn->c_lock, flags);
346 if (!list_empty(&conn->c_send_queue)) {
347 rds_stats_inc(s_send_sem_queue_raced);
348 ret = -EAGAIN;
349 }
350 spin_unlock_irqrestore(&conn->c_lock, flags);
351 }
352out:
353 return ret;
354}
355
356static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
357{
358 u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
359
360 assert_spin_locked(&rs->rs_lock);
361
362 BUG_ON(rs->rs_snd_bytes < len);
363 rs->rs_snd_bytes -= len;
364
365 if (rs->rs_snd_bytes == 0)
366 rds_stats_inc(s_send_queue_empty);
367}
368
369static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
370 is_acked_func is_acked)
371{
372 if (is_acked)
373 return is_acked(rm, ack);
374 return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
375}
376
377/*
378 * Returns true if there are no messages on the send and retransmit queues
379 * which have a sequence number greater than or equal to the given sequence
380 * number.
381 */
382int rds_send_acked_before(struct rds_connection *conn, u64 seq)
383{
384 struct rds_message *rm, *tmp;
385 int ret = 1;
386
387 spin_lock(&conn->c_lock);
388
389 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
390 if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
391 ret = 0;
392 break;
393 }
394
395 list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
396 if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
397 ret = 0;
398 break;
399 }
400
401 spin_unlock(&conn->c_lock);
402
403 return ret;
404}
405
406/*
407 * This is pretty similar to what happens below in the ACK
408 * handling code - except that we call here as soon as we get
409 * the IB send completion on the RDMA op and the accompanying
410 * message.
411 */
412void rds_rdma_send_complete(struct rds_message *rm, int status)
413{
414 struct rds_sock *rs = NULL;
415 struct rds_rdma_op *ro;
416 struct rds_notifier *notifier;
417
418 spin_lock(&rm->m_rs_lock);
419
420 ro = rm->m_rdma_op;
421 if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
422 && ro && ro->r_notify && ro->r_notifier) {
423 notifier = ro->r_notifier;
424 rs = rm->m_rs;
425 sock_hold(rds_rs_to_sk(rs));
426
427 notifier->n_status = status;
428 spin_lock(&rs->rs_lock);
429 list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
430 spin_unlock(&rs->rs_lock);
431
432 ro->r_notifier = NULL;
433 }
434
435 spin_unlock(&rm->m_rs_lock);
436
437 if (rs) {
438 rds_wake_sk_sleep(rs);
439 sock_put(rds_rs_to_sk(rs));
440 }
441}
442
443/*
444 * This is the same as rds_rdma_send_complete except we
445 * don't do any locking - we have all the ingredients (message,
446 * socket, socket lock) and can just move the notifier.
447 */
448static inline void
449__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
450{
451 struct rds_rdma_op *ro;
452
453 ro = rm->m_rdma_op;
454 if (ro && ro->r_notify && ro->r_notifier) {
455 ro->r_notifier->n_status = status;
456 list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
457 ro->r_notifier = NULL;
458 }
459
460 /* No need to wake the app - caller does this */
461}
462
463/*
464 * This is called from the IB send completion when we detect
465 * a RDMA operation that failed with remote access error.
466 * So speed is not an issue here.
467 */
468struct rds_message *rds_send_get_message(struct rds_connection *conn,
469 struct rds_rdma_op *op)
470{
471 struct rds_message *rm, *tmp, *found = NULL;
472 unsigned long flags;
473
474 spin_lock_irqsave(&conn->c_lock, flags);
475
476 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
477 if (rm->m_rdma_op == op) {
478 atomic_inc(&rm->m_refcount);
479 found = rm;
480 goto out;
481 }
482 }
483
484 list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
485 if (rm->m_rdma_op == op) {
486 atomic_inc(&rm->m_refcount);
487 found = rm;
488 break;
489 }
490 }
491
492out:
493 spin_unlock_irqrestore(&conn->c_lock, flags);
494
495 return found;
496}
497
498/*
499 * This removes messages from the socket's list if they're on it. The list
500 * argument must be private to the caller, we must be able to modify it
501 * without locks. The messages must have a reference held for their
502 * position on the list. This function will drop that reference after
503 * removing the messages from the 'messages' list regardless of if it found
504 * the messages on the socket list or not.
505 */
506void rds_send_remove_from_sock(struct list_head *messages, int status)
507{
508 unsigned long flags = 0; /* silence gcc :P */
509 struct rds_sock *rs = NULL;
510 struct rds_message *rm;
511
512 local_irq_save(flags);
513 while (!list_empty(messages)) {
514 rm = list_entry(messages->next, struct rds_message,
515 m_conn_item);
516 list_del_init(&rm->m_conn_item);
517
518 /*
519 * If we see this flag cleared then we're *sure* that someone
520 * else beat us to removing it from the sock. If we race
521 * with their flag update we'll get the lock and then really
522 * see that the flag has been cleared.
523 *
524 * The message spinlock makes sure nobody clears rm->m_rs
525 * while we're messing with it. It does not prevent the
526 * message from being removed from the socket, though.
527 */
528 spin_lock(&rm->m_rs_lock);
529 if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
530 goto unlock_and_drop;
531
532 if (rs != rm->m_rs) {
533 if (rs) {
534 spin_unlock(&rs->rs_lock);
535 rds_wake_sk_sleep(rs);
536 sock_put(rds_rs_to_sk(rs));
537 }
538 rs = rm->m_rs;
539 spin_lock(&rs->rs_lock);
540 sock_hold(rds_rs_to_sk(rs));
541 }
542
543 if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
544 struct rds_rdma_op *ro = rm->m_rdma_op;
545 struct rds_notifier *notifier;
546
547 list_del_init(&rm->m_sock_item);
548 rds_send_sndbuf_remove(rs, rm);
549
550 if (ro && ro->r_notifier
551 && (status || ro->r_notify)) {
552 notifier = ro->r_notifier;
553 list_add_tail(&notifier->n_list,
554 &rs->rs_notify_queue);
555 if (!notifier->n_status)
556 notifier->n_status = status;
557 rm->m_rdma_op->r_notifier = NULL;
558 }
559 rds_message_put(rm);
560 rm->m_rs = NULL;
561 }
562
563unlock_and_drop:
564 spin_unlock(&rm->m_rs_lock);
565 rds_message_put(rm);
566 }
567
568 if (rs) {
569 spin_unlock(&rs->rs_lock);
570 rds_wake_sk_sleep(rs);
571 sock_put(rds_rs_to_sk(rs));
572 }
573 local_irq_restore(flags);
574}
575
576/*
577 * Transports call here when they've determined that the receiver queued
578 * messages up to, and including, the given sequence number. Messages are
579 * moved to the retrans queue when rds_send_xmit picks them off the send
580 * queue. This means that in the TCP case, the message may not have been
581 * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
582 * checks the RDS_MSG_HAS_ACK_SEQ bit.
583 *
584 * XXX It's not clear to me how this is safely serialized with socket
585 * destruction. Maybe it should bail if it sees SOCK_DEAD.
586 */
587void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
588 is_acked_func is_acked)
589{
590 struct rds_message *rm, *tmp;
591 unsigned long flags;
592 LIST_HEAD(list);
593
594 spin_lock_irqsave(&conn->c_lock, flags);
595
596 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
597 if (!rds_send_is_acked(rm, ack, is_acked))
598 break;
599
600 list_move(&rm->m_conn_item, &list);
601 clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
602 }
603
604 /* order flag updates with spin locks */
605 if (!list_empty(&list))
606 smp_mb__after_clear_bit();
607
608 spin_unlock_irqrestore(&conn->c_lock, flags);
609
610 /* now remove the messages from the sock list as needed */
611 rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
612}
613
614void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
615{
616 struct rds_message *rm, *tmp;
617 struct rds_connection *conn;
618 unsigned long flags;
619 LIST_HEAD(list);
620 int wake = 0;
621
622 /* get all the messages we're dropping under the rs lock */
623 spin_lock_irqsave(&rs->rs_lock, flags);
624
625 list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
626 if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
627 dest->sin_port != rm->m_inc.i_hdr.h_dport))
628 continue;
629
630 wake = 1;
631 list_move(&rm->m_sock_item, &list);
632 rds_send_sndbuf_remove(rs, rm);
633 clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
634
635 /* If this is a RDMA operation, notify the app. */
636 __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
637 }
638
639 /* order flag updates with the rs lock */
640 if (wake)
641 smp_mb__after_clear_bit();
642
643 spin_unlock_irqrestore(&rs->rs_lock, flags);
644
645 if (wake)
646 rds_wake_sk_sleep(rs);
647
648 conn = NULL;
649
650 /* now remove the messages from the conn list as needed */
651 list_for_each_entry(rm, &list, m_sock_item) {
652 /* We do this here rather than in the loop above, so that
653 * we don't have to nest m_rs_lock under rs->rs_lock */
654 spin_lock(&rm->m_rs_lock);
655 rm->m_rs = NULL;
656 spin_unlock(&rm->m_rs_lock);
657
658 /*
659 * If we see this flag cleared then we're *sure* that someone
660 * else beat us to removing it from the conn. If we race
661 * with their flag update we'll get the lock and then really
662 * see that the flag has been cleared.
663 */
664 if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
665 continue;
666
667 if (conn != rm->m_inc.i_conn) {
668 if (conn)
669 spin_unlock_irqrestore(&conn->c_lock, flags);
670 conn = rm->m_inc.i_conn;
671 spin_lock_irqsave(&conn->c_lock, flags);
672 }
673
674 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
675 list_del_init(&rm->m_conn_item);
676 rds_message_put(rm);
677 }
678 }
679
680 if (conn)
681 spin_unlock_irqrestore(&conn->c_lock, flags);
682
683 while (!list_empty(&list)) {
684 rm = list_entry(list.next, struct rds_message, m_sock_item);
685 list_del_init(&rm->m_sock_item);
686
687 rds_message_wait(rm);
688 rds_message_put(rm);
689 }
690}
691
692/*
693 * we only want this to fire once so we use the caller's 'queued'. It's
694 * possible that another thread can race with us and remove the
695 * message from the flow with RDS_CANCEL_SENT_TO.
696 */
697static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
698 struct rds_message *rm, __be16 sport,
699 __be16 dport, int *queued)
700{
701 unsigned long flags;
702 u32 len;
703
704 if (*queued)
705 goto out;
706
707 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
708
709 /* this is the only place which holds both the socket's rs_lock
710 * and the connection's c_lock */
711 spin_lock_irqsave(&rs->rs_lock, flags);
712
713 /*
714 * If there is a little space in sndbuf, we don't queue anything,
715 * and userspace gets -EAGAIN. But poll() indicates there's send
716 * room. This can lead to bad behavior (spinning) if snd_bytes isn't
717 * freed up by incoming acks. So we check the *old* value of
718 * rs_snd_bytes here to allow the last msg to exceed the buffer,
719 * and poll() now knows no more data can be sent.
720 */
721 if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
722 rs->rs_snd_bytes += len;
723
724 /* let recv side know we are close to send space exhaustion.
725 * This is probably not the optimal way to do it, as this
726 * means we set the flag on *all* messages as soon as our
727 * throughput hits a certain threshold.
728 */
729 if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
730 __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
731
732 list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
733 set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
734 rds_message_addref(rm);
735 rm->m_rs = rs;
736
737 /* The code ordering is a little weird, but we're
738 trying to minimize the time we hold c_lock */
739 rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
740 rm->m_inc.i_conn = conn;
741 rds_message_addref(rm);
742
743 spin_lock(&conn->c_lock);
744 rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
745 list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
746 set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
747 spin_unlock(&conn->c_lock);
748
749 rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
750 rm, len, rs, rs->rs_snd_bytes,
751 (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
752
753 *queued = 1;
754 }
755
756 spin_unlock_irqrestore(&rs->rs_lock, flags);
757out:
758 return *queued;
759}
760
761static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
762 struct msghdr *msg, int *allocated_mr)
763{
764 struct cmsghdr *cmsg;
765 int ret = 0;
766
767 for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
768 if (!CMSG_OK(msg, cmsg))
769 return -EINVAL;
770
771 if (cmsg->cmsg_level != SOL_RDS)
772 continue;
773
774 /* As a side effect, RDMA_DEST and RDMA_MAP will set
775 * rm->m_rdma_cookie and rm->m_rdma_mr.
776 */
777 switch (cmsg->cmsg_type) {
778 case RDS_CMSG_RDMA_ARGS:
779 ret = rds_cmsg_rdma_args(rs, rm, cmsg);
780 break;
781
782 case RDS_CMSG_RDMA_DEST:
783 ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
784 break;
785
786 case RDS_CMSG_RDMA_MAP:
787 ret = rds_cmsg_rdma_map(rs, rm, cmsg);
788 if (!ret)
789 *allocated_mr = 1;
790 break;
791
792 default:
793 return -EINVAL;
794 }
795
796 if (ret)
797 break;
798 }
799
800 return ret;
801}
802
803int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
804 size_t payload_len)
805{
806 struct sock *sk = sock->sk;
807 struct rds_sock *rs = rds_sk_to_rs(sk);
808 struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
809 __be32 daddr;
810 __be16 dport;
811 struct rds_message *rm = NULL;
812 struct rds_connection *conn;
813 int ret = 0;
814 int queued = 0, allocated_mr = 0;
815 int nonblock = msg->msg_flags & MSG_DONTWAIT;
816 long timeo = sock_sndtimeo(sk, nonblock);
817
818 /* Mirror Linux UDP's handling of BSD error message compatibility */
819 /* XXX: Perhaps MSG_MORE someday */
820 if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
821 printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
822 ret = -EOPNOTSUPP;
823 goto out;
824 }
825
826 if (msg->msg_namelen) {
827 /* XXX fail non-unicast destination IPs? */
828 if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
829 ret = -EINVAL;
830 goto out;
831 }
832 daddr = usin->sin_addr.s_addr;
833 dport = usin->sin_port;
834 } else {
835 /* We only care about consistency with ->connect() */
836 lock_sock(sk);
837 daddr = rs->rs_conn_addr;
838 dport = rs->rs_conn_port;
839 release_sock(sk);
840 }
841
842 /* racing with another thread binding seems ok here */
843 if (daddr == 0 || rs->rs_bound_addr == 0) {
844 ret = -ENOTCONN; /* XXX not a great errno */
845 goto out;
846 }
847
848 rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
849 if (IS_ERR(rm)) {
850 ret = PTR_ERR(rm);
851 rm = NULL;
852 goto out;
853 }
854
855 rm->m_daddr = daddr;
856
857 /* Parse any control messages the user may have included. */
858 ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
859 if (ret)
860 goto out;
861
862 /* rds_conn_create has a spinlock that runs with IRQ off.
863 * Caching the conn in the socket helps a lot. */
864 if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
865 conn = rs->rs_conn;
866 else {
867 conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
868 rs->rs_transport,
869 sock->sk->sk_allocation);
870 if (IS_ERR(conn)) {
871 ret = PTR_ERR(conn);
872 goto out;
873 }
874 rs->rs_conn = conn;
875 }
876
877 if ((rm->m_rdma_cookie || rm->m_rdma_op)
878 && conn->c_trans->xmit_rdma == NULL) {
879 if (printk_ratelimit())
880 printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
881 rm->m_rdma_op, conn->c_trans->xmit_rdma);
882 ret = -EOPNOTSUPP;
883 goto out;
884 }
885
886 /* If the connection is down, trigger a connect. We may
887 * have scheduled a delayed reconnect however - in this case
888 * we should not interfere.
889 */
890 if (rds_conn_state(conn) == RDS_CONN_DOWN
891 && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
892 queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
893
894 ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
895 if (ret)
896 goto out;
897
898 while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
899 dport, &queued)) {
900 rds_stats_inc(s_send_queue_full);
901 /* XXX make sure this is reasonable */
902 if (payload_len > rds_sk_sndbuf(rs)) {
903 ret = -EMSGSIZE;
904 goto out;
905 }
906 if (nonblock) {
907 ret = -EAGAIN;
908 goto out;
909 }
910
911 timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
912 rds_send_queue_rm(rs, conn, rm,
913 rs->rs_bound_port,
914 dport,
915 &queued),
916 timeo);
917 rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
918 if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
919 continue;
920
921 ret = timeo;
922 if (ret == 0)
923 ret = -ETIMEDOUT;
924 goto out;
925 }
926
927 /*
928 * By now we've committed to the send. We reuse rds_send_worker()
929 * to retry sends in the rds thread if the transport asks us to.
930 */
931 rds_stats_inc(s_send_queued);
932
933 if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
934 rds_send_worker(&conn->c_send_w.work);
935
936 rds_message_put(rm);
937 return payload_len;
938
939out:
940 /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
941 * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
942 * or in any other way, we need to destroy the MR again */
943 if (allocated_mr)
944 rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
945
946 if (rm)
947 rds_message_put(rm);
948 return ret;
949}
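/*
 * Editor's sketch, not part of the patch: how userspace reaches the
 * rds_sendmsg() path above.  The PF_RDS value and the SOCK_SEQPACKET
 * socket type are assumptions taken from the RDS userspace headers of
 * the era; error handling is omitted for brevity.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

#ifndef PF_RDS
#define PF_RDS 21		/* assumed value; normally from the RDS headers */
#endif

static ssize_t rds_send_example(const char *src_ip, const char *dst_ip,
				unsigned short port, const void *buf, size_t len)
{
	int fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
	struct sockaddr_in sin;

	/* bind() supplies the local address checked via rs_bound_addr above */
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = inet_addr(src_ip);
	sin.sin_port = htons(port);
	bind(fd, (struct sockaddr *)&sin, sizeof(sin));

	/* the destination is an AF_INET sockaddr, exactly as parsed above */
	sin.sin_addr.s_addr = inet_addr(dst_ip);
	return sendto(fd, buf, len, 0, (struct sockaddr *)&sin, sizeof(sin));
}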
950
951/*
952 * Reply to a ping packet.
953 */
954int
955rds_send_pong(struct rds_connection *conn, __be16 dport)
956{
957 struct rds_message *rm;
958 unsigned long flags;
959 int ret = 0;
960
961 rm = rds_message_alloc(0, GFP_ATOMIC);
962 if (rm == NULL) {
963 ret = -ENOMEM;
964 goto out;
965 }
966
967 rm->m_daddr = conn->c_faddr;
968
969 /* If the connection is down, trigger a connect. We may
970 * have scheduled a delayed reconnect however - in this case
971 * we should not interfere.
972 */
973 if (rds_conn_state(conn) == RDS_CONN_DOWN
974 && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
975 queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
976
977 ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
978 if (ret)
979 goto out;
980
981 spin_lock_irqsave(&conn->c_lock, flags);
982 list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
983 set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
984 rds_message_addref(rm);
985 rm->m_inc.i_conn = conn;
986
987 rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
988 conn->c_next_tx_seq);
989 conn->c_next_tx_seq++;
990 spin_unlock_irqrestore(&conn->c_lock, flags);
991
992 rds_stats_inc(s_send_queued);
993 rds_stats_inc(s_send_pong);
994
995 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
996 rds_message_put(rm);
997 return 0;
998
999out:
1000 if (rm)
1001 rds_message_put(rm);
1002 return ret;
1003}
diff --git a/net/rds/stats.c b/net/rds/stats.c
new file mode 100644
index 000000000000..637146893cf3
--- /dev/null
+++ b/net/rds/stats.c
@@ -0,0 +1,148 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/percpu.h>
34#include <linux/seq_file.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38
39DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
40
41/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
42
43static char *rds_stat_names[] = {
44 "conn_reset",
45 "recv_drop_bad_checksum",
46 "recv_drop_old_seq",
47 "recv_drop_no_sock",
48 "recv_drop_dead_sock",
49 "recv_deliver_raced",
50 "recv_delivered",
51 "recv_queued",
52 "recv_immediate_retry",
53 "recv_delayed_retry",
54 "recv_ack_required",
55 "recv_rdma_bytes",
56 "recv_ping",
57 "send_queue_empty",
58 "send_queue_full",
59 "send_sem_contention",
60 "send_sem_queue_raced",
61 "send_immediate_retry",
62 "send_delayed_retry",
63 "send_drop_acked",
64 "send_ack_required",
65 "send_queued",
66 "send_rdma",
67 "send_rdma_bytes",
68 "send_pong",
69 "page_remainder_hit",
70 "page_remainder_miss",
71 "copy_to_user",
72 "copy_from_user",
73 "cong_update_queued",
74 "cong_update_received",
75 "cong_send_error",
76 "cong_send_blocked",
77};
78
79void rds_stats_info_copy(struct rds_info_iterator *iter,
80 uint64_t *values, char **names, size_t nr)
81{
82 struct rds_info_counter ctr;
83 size_t i;
84
85 for (i = 0; i < nr; i++) {
86 BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
87 strncpy(ctr.name, names[i], sizeof(ctr.name) - 1);
88 ctr.value = values[i];
89
90 rds_info_copy(iter, &ctr, sizeof(ctr));
91 }
92}
93
94/*
95 * This gives global counters across all the transports. The strings
96 * are copied in so that the tool doesn't need knowledge of the specific
97 * stats that we're exporting. Some are pretty implementation dependent
98 * and may change over time. That doesn't stop them from being useful.
99 *
100 * This is the only function in the chain that knows about the byte granular
101 * length in userspace. It converts it to number of stat entries that the
102 * rest of the functions operate in.
103 */
104static void rds_stats_info(struct socket *sock, unsigned int len,
105 struct rds_info_iterator *iter,
106 struct rds_info_lengths *lens)
107{
108 struct rds_statistics stats = {0, };
109 uint64_t *src;
110 uint64_t *sum;
111 size_t i;
112 int cpu;
113 unsigned int avail;
114
115 avail = len / sizeof(struct rds_info_counter);
116
117 if (avail < ARRAY_SIZE(rds_stat_names)) {
118 avail = 0;
119 goto trans;
120 }
121
122 for_each_online_cpu(cpu) {
123 src = (uint64_t *)&(per_cpu(rds_stats, cpu));
124 sum = (uint64_t *)&stats;
125 for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
126 *(sum++) += *(src++);
127 }
128
129 rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names,
130 ARRAY_SIZE(rds_stat_names));
131 avail -= ARRAY_SIZE(rds_stat_names);
132
133trans:
134 lens->each = sizeof(struct rds_info_counter);
135 lens->nr = rds_trans_stats_info_copy(iter, avail) +
136 ARRAY_SIZE(rds_stat_names);
137}
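/*
 * Editor's illustration, not part of the patch: the byte-to-entry
 * conversion described above, using a hypothetical counter size and a
 * hypothetical 4096-byte buffer from userspace.  The real code uses
 * sizeof(struct rds_info_counter) and the 33 entries of rds_stat_names.
 */
#include <stdio.h>

int main(void)
{
	unsigned int counter_size = 40;	/* assumed sizeof(struct rds_info_counter) */
	unsigned int len = 4096;	/* bytes supplied by userspace */
	unsigned int nr_names = 33;	/* ARRAY_SIZE(rds_stat_names) above */
	unsigned int avail = len / counter_size;	/* 102 entries */

	printf("avail=%u entries; global counters %s copied\n",
	       avail, avail >= nr_names ? "are" : "are not");
	return 0;
}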
138
139void rds_stats_exit(void)
140{
141 rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
142}
143
144int __init rds_stats_init(void)
145{
146 rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
147 return 0;
148}
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
new file mode 100644
index 000000000000..307dc5c1be15
--- /dev/null
+++ b/net/rds/sysctl.c
@@ -0,0 +1,122 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/sysctl.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38
39static struct ctl_table_header *rds_sysctl_reg_table;
40
41static unsigned long rds_sysctl_reconnect_min = 1;
42static unsigned long rds_sysctl_reconnect_max = ~0UL;
43
44unsigned long rds_sysctl_reconnect_min_jiffies;
45unsigned long rds_sysctl_reconnect_max_jiffies = HZ;
46
47unsigned int rds_sysctl_max_unacked_packets = 8;
48unsigned int rds_sysctl_max_unacked_bytes = (16 << 20);
49
50unsigned int rds_sysctl_ping_enable = 1;
51
52static ctl_table rds_sysctl_rds_table[] = {
53 {
54 .ctl_name = CTL_UNNUMBERED,
55 .procname = "reconnect_min_delay_ms",
56 .data = &rds_sysctl_reconnect_min_jiffies,
57 .maxlen = sizeof(unsigned long),
58 .mode = 0644,
59 .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
60 .extra1 = &rds_sysctl_reconnect_min,
61 .extra2 = &rds_sysctl_reconnect_max_jiffies,
62 },
63 {
64 .ctl_name = CTL_UNNUMBERED,
65 .procname = "reconnect_max_delay_ms",
66 .data = &rds_sysctl_reconnect_max_jiffies,
67 .maxlen = sizeof(unsigned long),
68 .mode = 0644,
69 .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
70 .extra1 = &rds_sysctl_reconnect_min_jiffies,
71 .extra2 = &rds_sysctl_reconnect_max,
72 },
73 {
74 .ctl_name = CTL_UNNUMBERED,
75 .procname = "max_unacked_packets",
76 .data = &rds_sysctl_max_unacked_packets,
77 .maxlen = sizeof(int),
78 .mode = 0644,
79 .proc_handler = &proc_dointvec,
80 },
81 {
82 .ctl_name = CTL_UNNUMBERED,
83 .procname = "max_unacked_bytes",
84 .data = &rds_sysctl_max_unacked_bytes,
85 .maxlen = sizeof(int),
86 .mode = 0644,
87 .proc_handler = &proc_dointvec,
88 },
89 {
90 .ctl_name = CTL_UNNUMBERED,
91 .procname = "ping_enable",
92 .data = &rds_sysctl_ping_enable,
93 .maxlen = sizeof(int),
94 .mode = 0644,
95 .proc_handler = &proc_dointvec,
96 },
97 { .ctl_name = 0}
98};
99
100static struct ctl_path rds_sysctl_path[] = {
101 { .procname = "net", .ctl_name = CTL_NET, },
102 { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
103 { }
104};
105
106
107void rds_sysctl_exit(void)
108{
109 if (rds_sysctl_reg_table)
110 unregister_sysctl_table(rds_sysctl_reg_table);
111}
112
113int __init rds_sysctl_init(void)
114{
115 rds_sysctl_reconnect_min = msecs_to_jiffies(1);
116 rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
117
118 rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
119 if (rds_sysctl_reg_table == NULL)
120 return -ENOMEM;
121 return 0;
122}
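/*
 * Editor's sketch, not part of the patch: the table above exposes these
 * knobs under /proc/sys/net/rds/ once the module is loaded.  A minimal
 * userspace reader (procfs mount point assumed to be /proc):
 */
#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/proc/sys/net/rds/reconnect_min_delay_ms", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("reconnect_min_delay_ms = %s", buf);
	if (f)
		fclose(f);
	return 0;
}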
diff --git a/net/rds/threads.c b/net/rds/threads.c
new file mode 100644
index 000000000000..828a1bf9ea92
--- /dev/null
+++ b/net/rds/threads.c
@@ -0,0 +1,265 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/random.h>
35
36#include "rds.h"
37
38/*
39 * All of connection management is simplified by serializing it through
40 * work queues that execute in a connection managing thread.
41 *
42 * TCP wants to send acks through sendpage() in response to data_ready(),
43 * but it needs a process context to do so.
44 *
45 * The receive paths need to allocate but can't drop packets (!) so we have
46 * a thread around to block allocating if the receive fast path sees an
47 * allocation failure.
48 */
49
50/* Grand Unified Theory of connection life cycle:
51 * At any point in time, the connection can be in one of these states:
52 * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
53 *
54 * The following transitions are possible:
55 * ANY -> ERROR
56 * UP -> DISCONNECTING
57 * ERROR -> DISCONNECTING
58 * DISCONNECTING -> DOWN
59 * DOWN -> CONNECTING
60 * CONNECTING -> UP
61 *
62 * Transition to state DISCONNECTING/DOWN:
63 * - Inside the shutdown worker; synchronizes with xmit path
64 * through c_send_lock, and with connection management callbacks
65 * via c_cm_lock.
66 *
67 * For receive callbacks, we rely on the underlying transport
68 * (TCP, IB/RDMA) to provide the necessary synchronisation.
69 */
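/*
 * Editor's sketch, not part of the patch: the legal transitions listed in
 * the comment above, written out as a standalone check.  The enum names
 * are illustrative; the kernel code drives this through
 * rds_conn_transition() on conn->c_state.
 */
enum example_conn_state { S_DOWN, S_CONNECTING, S_UP, S_DISCONNECTING, S_ERROR };

static int example_transition_ok(enum example_conn_state from,
				 enum example_conn_state to)
{
	if (to == S_ERROR)		/* ANY -> ERROR */
		return 1;
	return (from == S_UP            && to == S_DISCONNECTING) ||
	       (from == S_ERROR         && to == S_DISCONNECTING) ||
	       (from == S_DISCONNECTING && to == S_DOWN)          ||
	       (from == S_DOWN          && to == S_CONNECTING)    ||
	       (from == S_CONNECTING    && to == S_UP);
}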
70struct workqueue_struct *rds_wq;
71
72void rds_connect_complete(struct rds_connection *conn)
73{
74 if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
75 printk(KERN_WARNING "%s: Cannot transition to state UP, "
76 "current state is %d\n",
77 __func__,
78 atomic_read(&conn->c_state));
79 atomic_set(&conn->c_state, RDS_CONN_ERROR);
80 queue_work(rds_wq, &conn->c_down_w);
81 return;
82 }
83
84 rdsdebug("conn %p for %pI4 to %pI4 complete\n",
85 conn, &conn->c_laddr, &conn->c_faddr);
86
87 conn->c_reconnect_jiffies = 0;
88 set_bit(0, &conn->c_map_queued);
89 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
90 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
91}
92
93/*
94 * This random exponential backoff is relied on to eventually resolve racing
95 * connects.
96 *
97 * If connect attempts race then both parties drop both connections and come
98 * here to wait for a random amount of time before trying again. Eventually
99 * the backoff range will be so much greater than the time it takes to
100 * establish a connection that one of the pair will establish the connection
101 * before the other's random delay fires.
102 *
103 * Connection attempts that arrive while a connection is already established
104 * are also considered to be racing connects. This lets a connection from
105 * a rebooted machine replace an existing stale connection before the transport
106 * notices that the connection has failed.
107 *
108 * We should *always* start with a random backoff; otherwise a broken connection
109 * will always take several iterations to be re-established.
110 */
111static void rds_queue_reconnect(struct rds_connection *conn)
112{
113 unsigned long rand;
114
115 rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
116 conn, &conn->c_laddr, &conn->c_faddr,
117 conn->c_reconnect_jiffies);
118
119 set_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
120 if (conn->c_reconnect_jiffies == 0) {
121 conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
122 queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
123 return;
124 }
125
126 get_random_bytes(&rand, sizeof(rand));
127 rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
128 rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
129 conn, &conn->c_laddr, &conn->c_faddr);
130 queue_delayed_work(rds_wq, &conn->c_conn_w,
131 rand % conn->c_reconnect_jiffies);
132
133 conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
134 rds_sysctl_reconnect_max_jiffies);
135}
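/*
 * Editor's sketch, not part of the patch: the delay sequence produced by
 * rds_queue_reconnect() above, with hypothetical min/max ceilings of
 * 100 ms and 1000 ms (in the kernel these come from the reconnect
 * sysctls, in jiffies).
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned long min_ms = 100, max_ms = 1000;	/* assumed sysctl values */
	unsigned long backoff = 0;			/* c_reconnect_jiffies analogue */
	int attempt;

	for (attempt = 1; attempt <= 5; attempt++) {
		unsigned long delay;

		if (backoff == 0) {
			backoff = min_ms;
			delay = 0;			/* first retry fires immediately */
		} else {
			delay = rand() % backoff;	/* random delay below the ceiling */
			backoff = backoff * 2 > max_ms ? max_ms : backoff * 2;
		}
		printf("attempt %d: delay %lu ms (next ceiling %lu ms)\n",
		       attempt, delay, backoff);
	}
	return 0;
}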
136
137void rds_connect_worker(struct work_struct *work)
138{
139 struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work);
140 int ret;
141
142 clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
143 if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
144 ret = conn->c_trans->conn_connect(conn);
145 rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
146 conn, &conn->c_laddr, &conn->c_faddr, ret);
147
148 if (ret) {
149 if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN))
150 rds_queue_reconnect(conn);
151 else
152 rds_conn_error(conn, "RDS: connect failed\n");
153 }
154 }
155}
156
157void rds_shutdown_worker(struct work_struct *work)
158{
159 struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
160
161 /* shut it down unless it's down already */
162 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
163 /*
164 * Quiesce the connection mgmt handlers before we start tearing
165 * things down. We don't hold the mutex for the entire
166 * duration of the shutdown operation, else we may be
167 * deadlocking with the CM handler. Instead, the CM event
168 * handler is supposed to check for state DISCONNECTING
169 */
170 mutex_lock(&conn->c_cm_lock);
171 if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
172 && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
173 rds_conn_error(conn, "shutdown called in state %d\n",
174 atomic_read(&conn->c_state));
175 mutex_unlock(&conn->c_cm_lock);
176 return;
177 }
178 mutex_unlock(&conn->c_cm_lock);
179
180 mutex_lock(&conn->c_send_lock);
181 conn->c_trans->conn_shutdown(conn);
182 rds_conn_reset(conn);
183 mutex_unlock(&conn->c_send_lock);
184
185 if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
186 /* This can happen - e.g. when we're in the middle of tearing
187 * down the connection, and someone unloads the rds module.
188 * Quite reproducible with loopback connections.
189 * Mostly harmless.
190 */
191 rds_conn_error(conn,
192 "%s: failed to transition to state DOWN, "
193 "current state is %d\n",
194 __func__,
195 atomic_read(&conn->c_state));
196 return;
197 }
198 }
199
200 /* Then reconnect if it's still live.
201 * The passive side of an IB loopback connection is never added
202 * to the conn hash, so we never trigger a reconnect on this
203 * conn - the reconnect is always triggered by the active peer. */
204 cancel_delayed_work(&conn->c_conn_w);
205 if (!hlist_unhashed(&conn->c_hash_node))
206 rds_queue_reconnect(conn);
207}
208
209void rds_send_worker(struct work_struct *work)
210{
211 struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
212 int ret;
213
214 if (rds_conn_state(conn) == RDS_CONN_UP) {
215 ret = rds_send_xmit(conn);
216 rdsdebug("conn %p ret %d\n", conn, ret);
217 switch (ret) {
218 case -EAGAIN:
219 rds_stats_inc(s_send_immediate_retry);
220 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
221 break;
222 case -ENOMEM:
223 rds_stats_inc(s_send_delayed_retry);
224 queue_delayed_work(rds_wq, &conn->c_send_w, 2);
225 default:
226 break;
227 }
228 }
229}
230
231void rds_recv_worker(struct work_struct *work)
232{
233 struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work);
234 int ret;
235
236 if (rds_conn_state(conn) == RDS_CONN_UP) {
237 ret = conn->c_trans->recv(conn);
238 rdsdebug("conn %p ret %d\n", conn, ret);
239 switch (ret) {
240 case -EAGAIN:
241 rds_stats_inc(s_recv_immediate_retry);
242 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
243 break;
244 case -ENOMEM:
245 rds_stats_inc(s_recv_delayed_retry);
246 queue_delayed_work(rds_wq, &conn->c_recv_w, 2);
247 default:
248 break;
249 }
250 }
251}
252
253void rds_threads_exit(void)
254{
255 destroy_workqueue(rds_wq);
256}
257
258int __init rds_threads_init(void)
259{
260 rds_wq = create_singlethread_workqueue("krdsd");
261 if (rds_wq == NULL)
262 return -ENOMEM;
263
264 return 0;
265}
diff --git a/net/rds/transport.c b/net/rds/transport.c
new file mode 100644
index 000000000000..767da61ad2f3
--- /dev/null
+++ b/net/rds/transport.c
@@ -0,0 +1,117 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/module.h>
35#include <linux/in.h>
36
37#include "rds.h"
38#include "loop.h"
39
40static LIST_HEAD(rds_transports);
41static DECLARE_RWSEM(rds_trans_sem);
42
43int rds_trans_register(struct rds_transport *trans)
44{
45 BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ);
46
47 down_write(&rds_trans_sem);
48
49 list_add_tail(&trans->t_item, &rds_transports);
50 printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
51
52 up_write(&rds_trans_sem);
53
54 return 0;
55}
56
57void rds_trans_unregister(struct rds_transport *trans)
58{
59 down_write(&rds_trans_sem);
60
61 list_del_init(&trans->t_item);
62 printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name);
63
64 up_write(&rds_trans_sem);
65}
66
67struct rds_transport *rds_trans_get_preferred(__be32 addr)
68{
69 struct rds_transport *trans;
70 struct rds_transport *ret = NULL;
71
72 if (IN_LOOPBACK(ntohl(addr)))
73 return &rds_loop_transport;
74
75 down_read(&rds_trans_sem);
76 list_for_each_entry(trans, &rds_transports, t_item) {
77 if (trans->laddr_check(addr) == 0) {
78 ret = trans;
79 break;
80 }
81 }
82 up_read(&rds_trans_sem);
83
84 return ret;
85}
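/*
 * Editor's sketch, not part of the patch: how a transport hooks into the
 * selection loop above.  Only fields referenced in this file are shown;
 * the full struct rds_transport lives in rds.h, so this assumes it is
 * compiled inside net/rds/.  The specific errno is an assumption -- the
 * loop above only tests laddr_check() for zero.
 */
static int example_laddr_check(__be32 addr)
{
	/* claim only 127.0.0.0/8 for this toy transport; 0 means "mine" */
	return IN_LOOPBACK(ntohl(addr)) ? 0 : -EADDRNOTAVAIL;
}

static struct rds_transport example_transport = {
	.t_name		= "example",
	.laddr_check	= example_laddr_check,
};

/* A transport module would then call rds_trans_register(&example_transport)
 * from its init function and rds_trans_unregister() on exit. */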
86
87/*
88 * This returns the number of stats entries in the snapshot and only
89 * copies them using the iter if there is enough space for them. The
90 * caller passes in the global stats so that we can size and copy while
91 * holding the lock.
92 */
93unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
94 unsigned int avail)
95
96{
97 struct rds_transport *trans;
98 unsigned int total = 0;
99 unsigned int part;
100
101 rds_info_iter_unmap(iter);
102 down_read(&rds_trans_sem);
103
104 list_for_each_entry(trans, &rds_transports, t_item) {
105 if (trans->stats_info_copy == NULL)
106 continue;
107
108 part = trans->stats_info_copy(iter, avail);
109 avail -= min(avail, part);
110 total += part;
111 }
112
113 up_read(&rds_trans_sem);
114
115 return total;
116}
117