RDS: recv.c

Upon receiving a datagram from the transport, RDS parses the headers and potentially queues an ACK. Signed-off-by: Andy Grover <andy.grover@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Andy Grover <andy.grover@oracle.com> 2009-02-24 10:30:28 -0500
committer: David S. Miller <davem@davemloft.net> 2009-02-27 02:39:29 -0500
commit: bdbe6fbc6a2f2ccfb384b141b257677d2a8d36fb (patch)
tree: d95159804fa3c11576fc00d2ef2eebba909638d8 /net/rds
parent: 5c11559046c4b3498d1977a029de8a312eacce35 (diff)
1 files changed, 542 insertions, 0 deletions
diff --git a/net/rds/recv.c b/net/rds/recv.c
new file mode 100644
index 000000000000..f2118c51cfa3
--- /dev/null
+++ b/net/rds/recv.c
@@ -0,0 +1,542 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include "rds.h"
+#include "rdma.h"
+/*
+ * Prepare a message descriptor for use: record the connection and source
+ * address it belongs to, clear the RDMA cookie, leave it on no list and
+ * start it off with a reference count of one.
+ */
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+                  __be32 saddr)
+{
+        inc->i_conn = conn;
+        inc->i_saddr = saddr;
+        inc->i_rdma_cookie = 0;
+        INIT_LIST_HEAD(&inc->i_item);
+        atomic_set(&inc->i_refcount, 1);
+}
+/*
+ * Take one more reference on inc.  The debug line reports the count as it
+ * was before the increment.
+ */
+void rds_inc_addref(struct rds_incoming *inc)
+{
+        atomic_t *refs = &inc->i_refcount;
+        rdsdebug("addref inc %p ref %d\n", inc, atomic_read(refs));
+        atomic_inc(refs);
+}
+/*
+ * Drop one reference on inc.  When the count reaches zero the message must
+ * no longer be linked on any list (enforced with BUG_ON) and it is handed
+ * to the inc_free() hook of the connection's transport.
+ */
+void rds_inc_put(struct rds_incoming *inc)
+{
+        rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
+        /* somebody else still holds a reference: nothing more to do */
+        if (!atomic_dec_and_test(&inc->i_refcount))
+                return;
+        BUG_ON(!list_empty(&inc->i_item));
+        inc->i_conn->c_trans->inc_free(inc);
+}
+/*
+ * Adjust the number of bytes queued on the socket by delta and update the
+ * congestion bit for port in map when the state flips.
+ *
+ * The socket turns congested as soon as the queued bytes exceed the receive
+ * buffer size; it only turns uncongested again once they fall below half of
+ * it, so the state does not bounce on every message.  A zero delta is a
+ * no-op.  Every caller in this file holds rs_recv_lock for writing.
+ */
+static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
+                                  struct rds_cong_map *map,
+                                  int delta, __be16 port)
+{
+        int over_limit;
+        if (!delta)
+                return;
+        rs->rs_rcv_bytes += delta;
+        over_limit = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
+        rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
+          "now_cong %d delta %d\n",
+          rs, &rs->rs_bound_addr,
+          ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
+          rds_sk_rcvbuf(rs), over_limit, delta);
+        if (!rs->rs_congested) {
+                /* wasn't -> am congested */
+                if (over_limit) {
+                        rs->rs_congested = 1;
+                        rds_cong_set_bit(map, port);
+                        rds_cong_queue_updates(map);
+                }
+        } else if (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs) / 2)) {
+                /* was -> aren't congested; the half-buffer threshold keeps
+                 * us from flapping between the two states */
+                rs->rs_congested = 0;
+                rds_cong_clear_bit(map, port);
+                rds_cong_queue_updates(map);
+        }
+        /* any other combination leaves the congestion state alone */
+}
+/*
+ * Walk the extension headers attached to this message and act on the
+ * types handled here (RDMA and RDMA_DEST); any other type is skipped.
+ */
+static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
+{
+        struct rds_header *hdr = &inc->i_hdr;
+        unsigned int offset = 0, ext_type, ext_len;
+        union {
+                struct rds_ext_header_version version;
+                struct rds_ext_header_rdma rdma;
+                struct rds_ext_header_rdma_dest rdma_dest;
+        } ext;
+        for (;;) {
+                /* offer the whole scratch union on every lookup */
+                ext_len = sizeof(ext);
+                ext_type = rds_message_next_extension(hdr, &offset, &ext, &ext_len);
+                if (ext_type == RDS_EXTHDR_NONE)
+                        return;
+                if (ext_type == RDS_EXTHDR_RDMA) {
+                        rds_rdma_unuse(rs, be32_to_cpu(ext.rdma.h_rdma_rkey), 0);
+                } else if (ext_type == RDS_EXTHDR_RDMA_DEST) {
+                        /* The size is ignored for now.  It could be stashed
+                         * somewhere and used for error checking. */
+                        inc->i_rdma_cookie = rds_rdma_make_cookie(
+                                        be32_to_cpu(ext.rdma_dest.h_rdma_rkey),
+                                        be32_to_cpu(ext.rdma_dest.h_rdma_offset));
+                }
+        }
+}
+/*
+ * Hand a message that a transport has just received to the socket bound
+ * to its destination address and port, or drop it.
+ *
+ * The transport must make sure that this is serialized against other
+ * rx and conn reset on this specific conn.
+ *
+ * We currently assert that only one fragmented message will be sent
+ * down a connection at a time.  This lets us reassemble in the conn
+ * instead of per-flow which means that we don't have to go digging through
+ * flows to tear down partial reassembly progress on conn failure and
+ * we save flow lookup and locking for each frag arrival.  It does mean
+ * that small messages will wait behind large ones.  Fragmenting at all
+ * is only to reduce the memory consumption of pre-posted buffers.
+ *
+ * The caller passes in saddr and daddr instead of us getting it from the
+ * conn.  This lets loopback, who only has one conn for both directions,
+ * tell us which roles the addrs in the conn are playing for this message.
+ *
+ * When the message is queued on a socket, the receive queue takes its own
+ * reference through rds_inc_addref(); this function never drops a
+ * reference on inc itself.  The socket reference obtained from
+ * rds_find_bound() is released with rds_sock_put() before returning.
+ * The saddr, gfp and km arguments are not used by this function body.
+ */
+void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+                       struct rds_incoming *inc, gfp_t gfp, enum km_type km)
+{
+        struct rds_sock *rs = NULL;
+        struct sock *sk;
+        unsigned long flags;
+        inc->i_conn = conn;
+        inc->i_rx_jiffies = jiffies;
+        rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
+                 "flags 0x%x rx_jiffies %lu\n", conn,
+                 (unsigned long long)conn->c_next_rx_seq,
+                 inc,
+                 (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
+                 be32_to_cpu(inc->i_hdr.h_len),
+                 be16_to_cpu(inc->i_hdr.h_sport),
+                 be16_to_cpu(inc->i_hdr.h_dport),
+                 inc->i_hdr.h_flags,
+                 inc->i_rx_jiffies);
+        /*
+         * Sequence numbers should only increase.  Messages get their
+         * sequence number as they're queued in a sending conn.  They
+         * can be dropped, though, if the sending socket is closed before
+         * they hit the wire.  So sequence numbers can skip forward
+         * under normal operation.  They can also drop back in the conn
+         * failover case as previously sent messages are resent down the
+         * new instance of a conn.  We drop those, otherwise we have
+         * to assume that the next valid seq does not come after a
+         * hole in the fragment stream.
+         *
+         * The headers don't give us a way to realize if fragments of
+         * a message have been dropped.  We assume that frags that arrive
+         * to a flow are part of the current message on the flow that is
+         * being reassembled.  This means that senders can't drop messages
+         * from the sending conn until all their frags are sent.
+         *
+         * XXX we could spend more on the wire to get more robust failure
+         * detection, arguably worth it to avoid data corruption.
+         *
+         * Only a message that is both older than the expected sequence
+         * and flagged RDS_FLAG_RETRANSMITTED is dropped here; any other
+         * message moves the expected sequence to its own sequence + 1.
+         */
+        if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq
+         && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
+                rds_stats_inc(s_recv_drop_old_seq);
+                goto out;
+        }
+        conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
+        if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
+                rds_stats_inc(s_recv_ping);
+                rds_send_pong(conn, inc->i_hdr.h_sport);
+                goto out;
+        }
+        rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
+        if (rs == NULL) {
+                rds_stats_inc(s_recv_drop_no_sock);
+                goto out;
+        }
+        /* Process extension headers */
+        rds_recv_incoming_exthdrs(inc, rs);
+        /* We can be racing with rds_release() which marks the socket dead. */
+        sk = rds_rs_to_sk(rs);
+        /* serialize with rds_release -> sock_orphan */
+        write_lock_irqsave(&rs->rs_recv_lock, flags);
+        if (!sock_flag(sk, SOCK_DEAD)) {
+                rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
+                rds_stats_inc(s_recv_queued);
+                rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+                                      be32_to_cpu(inc->i_hdr.h_len),
+                                      inc->i_hdr.h_dport);
+                rds_inc_addref(inc);
+                list_add_tail(&inc->i_item, &rs->rs_recv_queue);
+                __rds_wake_sk_sleep(sk);
+        } else {
+                rds_stats_inc(s_recv_drop_dead_sock);
+        }
+        write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+out:
+        if (rs)
+                rds_sock_put(rs);
+}
+/*
+ * Fetch the message at the head of the receive queue, taking a reference
+ * on it, unless *inc already points at one.  Returns nonzero when *inc is
+ * set on exit.
+ *
+ * Tread carefully: this doubles as the wait_event_*() condition, so it is
+ * evaluated repeatedly and must not pick up a second message once it has
+ * one.
+ */
+static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
+{
+        unsigned long flags;
+        /* already holding a message from an earlier evaluation */
+        if (*inc != NULL)
+                return 1;
+        read_lock_irqsave(&rs->rs_recv_lock, flags);
+        if (!list_empty(&rs->rs_recv_queue)) {
+                *inc = list_entry(rs->rs_recv_queue.next,
+                                  struct rds_incoming, i_item);
+                rds_inc_addref(*inc);
+        }
+        read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+        return *inc != NULL;
+}
+/*
+ * Check, under rs_recv_lock, whether inc is still linked on a receive
+ * queue.  When drop is set and the message is still queued, it is also
+ * unlinked: its length is subtracted from the socket's receive accounting
+ * and one reference is released with rds_inc_put().
+ *
+ * Returns 1 if the message was queued when checked, 0 otherwise.
+ */
+static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
+                            int drop)
+{
+        struct sock *sk = rds_rs_to_sk(rs);
+        unsigned long flags;
+        int queued;
+        write_lock_irqsave(&rs->rs_recv_lock, flags);
+        queued = !list_empty(&inc->i_item);
+        if (queued && drop) {
+                /* XXX make sure this i_conn is reliable */
+                rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+                                      -be32_to_cpu(inc->i_hdr.h_len),
+                                      inc->i_hdr.h_dport);
+                list_del_init(&inc->i_item);
+                rds_inc_put(inc);
+        }
+        write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+        rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, queued, drop);
+        return queued;
+}
+/*
+ * Pull errors off the error queue.
+ * If msghdr is NULL, we will just purge the error queue.
+ *
+ * Each notifier taken off the queue is reported as an
+ * RDS_CMSG_RDMA_STATUS control message (when msghdr is given) and then
+ * freed with kfree().
+ *
+ * Returns 0 when the queue was empty or everything taken was handled,
+ * otherwise the error returned by put_cmsg().
+ */
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
+{
+        struct rds_notifier *notifier;
+        struct rds_rdma_notify cmsg;
+        unsigned int count = 0, max_messages = ~0U;
+        unsigned long flags;
+        LIST_HEAD(copy);
+        int err = 0;
+        /* put_cmsg copies to user space and thus may sleep. We can't do this
+         * with rs_lock held, so first grab as many notifications as we can stuff
+         * in the user provided cmsg buffer. We don't try to copy more, to avoid
+         * losing notifications - except when the buffer is so small that it wouldn't
+         * even hold a single notification. Then we give him as much of this single
+         * msg as we can squeeze in, and set MSG_CTRUNC.
+         *
+         * NOTE(review): list_move() inserts right after the list head, so
+         * the local list appears to end up in the reverse of queue order
+         * when more than one notifier is taken - confirm whether the
+         * delivery order matters to consumers.
+         */
+        if (msghdr) {
+                max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
+                if (!max_messages)
+                        max_messages = 1;
+        }
+        spin_lock_irqsave(&rs->rs_lock, flags);
+        while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
+                notifier = list_entry(rs->rs_notify_queue.next,
+                                struct rds_notifier, n_list);
+                list_move(&notifier->n_list, &copy);
+                count++;
+        }
+        spin_unlock_irqrestore(&rs->rs_lock, flags);
+        if (!count)
+                return 0;
+        while (!list_empty(&copy)) {
+                notifier = list_entry(copy.next, struct rds_notifier, n_list);
+                if (msghdr) {
+                        cmsg.user_token = notifier->n_user_token;
+                        cmsg.status  = notifier->n_status;
+                        err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
+                                        sizeof(cmsg), &cmsg);
+                        if (err)
+                                break;
+                }
+                list_del_init(&notifier->n_list);
+                kfree(notifier);
+        }
+        /* If we bailed out because of an error in put_cmsg,
+         * we may be left with one or more notifications that we
+         * didn't process. Return them to the head of the list. */
+        if (!list_empty(&copy)) {
+                spin_lock_irqsave(&rs->rs_lock, flags);
+                list_splice(&copy, &rs->rs_notify_queue);
+                spin_unlock_irqrestore(&rs->rs_lock, flags);
+        }
+        return err;
+}
+/*
+ * Deliver the socket's pending congestion notification bits as an
+ * RDS_CMSG_CONG_UPDATE control message.  Only the bits that were actually
+ * reported are cleared afterwards, under rs_lock.  Returns 0 on success or
+ * the error from put_cmsg(), in which case nothing is cleared.
+ */
+static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
+{
+        uint64_t pending = rs->rs_cong_notify;
+        unsigned long flags;
+        int rc;
+        rc = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
+                      sizeof(pending), &pending);
+        if (!rc) {
+                spin_lock_irqsave(&rs->rs_lock, flags);
+                rs->rs_cong_notify &= ~pending;
+                spin_unlock_irqrestore(&rs->rs_lock, flags);
+        }
+        return rc;
+}
+/*
+ * Attach the control messages that accompany a received message.  The only
+ * one produced here is RDS_CMSG_RDMA_DEST, emitted when the message carries
+ * a nonzero RDMA cookie.  Returns 0 on success or the put_cmsg() error.
+ */
+static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
+{
+        /* no cookie, no control data to report */
+        if (!inc->i_rdma_cookie)
+                return 0;
+        return put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
+                        sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
+}
+/*
+ * Receive one item from an RDS socket.
+ *
+ * Pending RDMA notifications are delivered first, then a pending
+ * congestion update; either is returned on its own, without payload.
+ * Otherwise the message at the head of the receive queue is copied into
+ * msg->msg_iov, waiting for one to arrive unless MSG_DONTWAIT is set.
+ * With MSG_PEEK the message is left on the queue.
+ *
+ * msg->msg_namelen is set to 0 on entry and to sizeof(struct sockaddr_in)
+ * only once the sender's address has been written to msg->msg_name, so the
+ * caller never copies out an address that was not filled in.
+ *
+ * Returns the number of bytes copied (or the full message length when the
+ * copy was short and MSG_TRUNC was requested), 0 for MSG_OOB, -EAGAIN when
+ * nonblocking and nothing is queued, -ETIMEDOUT when the receive timeout
+ * expires, the negative value returned by the wait or by the transport's
+ * copy routine, or -EFAULT when the control message cannot be stored.
+ */
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+                size_t size, int msg_flags)
+{
+        struct sock *sk = sock->sk;
+        struct rds_sock *rs = rds_sk_to_rs(sk);
+        long timeo;
+        int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
+        struct sockaddr_in *sin;
+        struct rds_incoming *inc = NULL;
+        /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
+        timeo = sock_rcvtimeo(sk, nonblock);
+        rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
+        /* no source address has been produced yet */
+        msg->msg_namelen = 0;
+        if (msg_flags & MSG_OOB)
+                goto out;
+        /* If there are pending notifications, do those - and nothing else */
+        if (!list_empty(&rs->rs_notify_queue)) {
+                ret = rds_notify_queue_get(rs, msg);
+                goto out;
+        }
+        if (rs->rs_cong_notify) {
+                ret = rds_notify_cong(rs, msg);
+                goto out;
+        }
+        while (1) {
+                if (!rds_next_incoming(rs, &inc)) {
+                        if (nonblock) {
+                                ret = -EAGAIN;
+                                break;
+                        }
+                        timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
+                                                rds_next_incoming(rs, &inc),
+                                                timeo);
+                        rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
+                                 timeo);
+                        if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+                                continue;
+                        ret = timeo;
+                        if (ret == 0)
+                                ret = -ETIMEDOUT;
+                        break;
+                }
+                rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
+                         &inc->i_conn->c_faddr,
+                         ntohs(inc->i_hdr.h_sport));
+                ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
+                                                             size);
+                if (ret < 0)
+                        break;
+                /*
+                 * if the message we just copied isn't at the head of the
+                 * recv queue then someone else raced us to return it, try
+                 * to get the next message.
+                 */
+                if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
+                        rds_inc_put(inc);
+                        inc = NULL;
+                        rds_stats_inc(s_recv_deliver_raced);
+                        continue;
+                }
+                if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
+                        if (msg_flags & MSG_TRUNC)
+                                ret = be32_to_cpu(inc->i_hdr.h_len);
+                        msg->msg_flags |= MSG_TRUNC;
+                }
+                if (rds_cmsg_recv(inc, msg)) {
+                        ret = -EFAULT;
+                        /*
+                         * Leave through the common exit below so that the
+                         * reference taken by rds_next_incoming() is dropped;
+                         * jumping straight to out would leak it.
+                         */
+                        break;
+                }
+                rds_stats_inc(s_recv_delivered);
+                sin = (struct sockaddr_in *)msg->msg_name;
+                if (sin) {
+                        sin->sin_family = AF_INET;
+                        sin->sin_port = inc->i_hdr.h_sport;
+                        sin->sin_addr.s_addr = inc->i_saddr;
+                        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+                        /* tell the socket layer how much of msg_name is valid */
+                        msg->msg_namelen = sizeof(*sin);
+                }
+                break;
+        }
+        if (inc)
+                rds_inc_put(inc);
+out:
+        return ret;
+}
+/*
+ * Called while the socket is being shut down: discard every message that
+ * is still waiting for recvmsg.  The caller has already unbound the socket,
+ * so the receive path will not queue further fragments or messages on it.
+ * Each discarded message is taken out of the receive accounting, unlinked,
+ * and has one reference released.
+ */
+void rds_clear_recv_queue(struct rds_sock *rs)
+{
+        struct sock *sk = rds_rs_to_sk(rs);
+        struct rds_incoming *cur;
+        unsigned long flags;
+        write_lock_irqsave(&rs->rs_recv_lock, flags);
+        /* keep peeling the head off until the queue is empty */
+        while (!list_empty(&rs->rs_recv_queue)) {
+                cur = list_entry(rs->rs_recv_queue.next,
+                                 struct rds_incoming, i_item);
+                rds_recv_rcvbuf_delta(rs, sk, cur->i_conn->c_lcong,
+                                      -be32_to_cpu(cur->i_hdr.h_len),
+                                      cur->i_hdr.h_dport);
+                list_del_init(&cur->i_item);
+                rds_inc_put(cur);
+        }
+        write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+}
+/*
+ * Describe one message in an rds_info_message record and pass it to
+ * rds_info_copy().
+ *
+ * inc->i_saddr isn't used here because it is only set in the receive
+ * path; the caller supplies saddr and daddr instead.  When flip is set the
+ * message's destination side is reported as local and its source side as
+ * foreign; otherwise the source side is local.
+ *
+ * The record lives on the stack and is copied out as a whole, so it is
+ * zeroed first: any field not assigned below, and any padding, must not
+ * carry stale stack contents out through the info interface.
+ */
+void rds_inc_info_copy(struct rds_incoming *inc,
+                       struct rds_info_iterator *iter,
+                       __be32 saddr, __be32 daddr, int flip)
+{
+        struct rds_info_message minfo;
+        memset(&minfo, 0, sizeof(minfo));
+        minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+        minfo.len = be32_to_cpu(inc->i_hdr.h_len);
+        if (flip) {
+                minfo.laddr = daddr;
+                minfo.faddr = saddr;
+                minfo.lport = inc->i_hdr.h_dport;
+                minfo.fport = inc->i_hdr.h_sport;
+        } else {
+                minfo.laddr = saddr;
+                minfo.faddr = daddr;
+                minfo.lport = inc->i_hdr.h_sport;
+                minfo.fport = inc->i_hdr.h_dport;
+        }
+        rds_info_copy(iter, &minfo, sizeof(minfo));
+}
author	Andy Grover <andy.grover@oracle.com>	2009-02-24 10:30:28 -0500
committer	David S. Miller <davem@davemloft.net>	2009-02-27 02:39:29 -0500
commit	bdbe6fbc6a2f2ccfb384b141b257677d2a8d36fb (patch)
tree	d95159804fa3c11576fc00d2ef2eebba909638d8 /net/rds
parent	5c11559046c4b3498d1977a029de8a312eacce35 (diff)

diff --git a/net/rds/recv.c b/net/rds/recv.c new file mode 100644 index 000000000000..f2118c51cfa3 --- /dev/null +++ b/net/rds/recv.c
@@ -0,0 +1,542 @@
	1	/*
	2	* Copyright (c) 2006 Oracle. All rights reserved.
	3	*
	4	* This software is available to you under a choice of one of two
	5	* licenses. You may choose to be licensed under the terms of the GNU
	6	* General Public License (GPL) Version 2, available from the file
	7	* COPYING in the main directory of this source tree, or the
	8	* OpenIB.org BSD license below:
	9	*
	10	* Redistribution and use in source and binary forms, with or
	11	* without modification, are permitted provided that the following
	12	* conditions are met:
	13	*
	14	* - Redistributions of source code must retain the above
	15	* copyright notice, this list of conditions and the following
	16	* disclaimer.
	17	*
	18	* - Redistributions in binary form must reproduce the above
	19	* copyright notice, this list of conditions and the following
	20	* disclaimer in the documentation and/or other materials
	21	* provided with the distribution.
	22	*
	23	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	24	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	25	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	26	* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
	27	* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
	28	* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	29	* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	30	* SOFTWARE.
	31	*
	32	*/
	33	#include <linux/kernel.h>
	34	#include <net/sock.h>
	35	#include <linux/in.h>
	36
	37	#include "rds.h"
	38	#include "rdma.h"
	39
	40	void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
	41	__be32 saddr)
	42	{
	43	atomic_set(&inc->i_refcount, 1);
	44	INIT_LIST_HEAD(&inc->i_item);
	45	inc->i_conn = conn;
	46	inc->i_saddr = saddr;
	47	inc->i_rdma_cookie = 0;
	48	}
	49
	50	void rds_inc_addref(struct rds_incoming *inc)
	51	{
	52	rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
	53	atomic_inc(&inc->i_refcount);
	54	}
	55
	56	void rds_inc_put(struct rds_incoming *inc)
	57	{
	58	rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
	59	if (atomic_dec_and_test(&inc->i_refcount)) {
	60	BUG_ON(!list_empty(&inc->i_item));
	61
	62	inc->i_conn->c_trans->inc_free(inc);
	63	}
	64	}
	65
	66	static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
	67	struct rds_cong_map *map,
	68	int delta, __be16 port)
	69	{
	70	int now_congested;
	71
	72	if (delta == 0)
	73	return;
	74
	75	rs->rs_rcv_bytes += delta;
	76	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
	77
	78	rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
	79	"now_cong %d delta %d\n",
	80	rs, &rs->rs_bound_addr,
	81	ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
	82	rds_sk_rcvbuf(rs), now_congested, delta);
	83
	84	/* wasn't -> am congested */
	85	if (!rs->rs_congested && now_congested) {
	86	rs->rs_congested = 1;
	87	rds_cong_set_bit(map, port);
	88	rds_cong_queue_updates(map);
	89	}
	90	/* was -> aren't congested */
	91	/* Require more free space before reporting uncongested to prevent
	92	bouncing cong/uncong state too often */
	93	else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
	94	rs->rs_congested = 0;
	95	rds_cong_clear_bit(map, port);
	96	rds_cong_queue_updates(map);
	97	}
	98
	99	/* do nothing if no change in cong state */
	100	}
	101
	102	/*
	103	* Process all extension headers that come with this message.
	104	*/
	105	static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
	106	{
	107	struct rds_header *hdr = &inc->i_hdr;
	108	unsigned int pos = 0, type, len;
	109	union {
	110	struct rds_ext_header_version version;
	111	struct rds_ext_header_rdma rdma;
	112	struct rds_ext_header_rdma_dest rdma_dest;
	113	} buffer;
	114
	115	while (1) {
	116	len = sizeof(buffer);
	117	type = rds_message_next_extension(hdr, &pos, &buffer, &len);
	118	if (type == RDS_EXTHDR_NONE)
	119	break;
	120	/* Process extension header here */
	121	switch (type) {
	122	case RDS_EXTHDR_RDMA:
	123	rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
	124	break;
	125
	126	case RDS_EXTHDR_RDMA_DEST:
	127	/* We ignore the size for now. We could stash it
	128	* somewhere and use it for error checking. */
	129	inc->i_rdma_cookie = rds_rdma_make_cookie(
	130	be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
	131	be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
	132
	133	break;
	134	}
	135	}
	136	}
	137
	138	/*
	139	* The transport must make sure that this is serialized against other
	140	* rx and conn reset on this specific conn.
	141	*
	142	* We currently assert that only one fragmented message will be sent
	143	* down a connection at a time. This lets us reassemble in the conn
	144	* instead of per-flow which means that we don't have to go digging through
	145	* flows to tear down partial reassembly progress on conn failure and
	146	* we save flow lookup and locking for each frag arrival. It does mean
	147	* that small messages will wait behind large ones. Fragmenting at all
	148	* is only to reduce the memory consumption of pre-posted buffers.
	149	*
	150	* The caller passes in saddr and daddr instead of us getting it from the
	151	* conn. This lets loopback, who only has one conn for both directions,
	152	* tell us which roles the addrs in the conn are playing for this message.
	153	*/
	154	void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
	155	struct rds_incoming *inc, gfp_t gfp, enum km_type km)
	156	{
	157	struct rds_sock *rs = NULL;
	158	struct sock *sk;
	159	unsigned long flags;
	160
	161	inc->i_conn = conn;
	162	inc->i_rx_jiffies = jiffies;
	163
	164	rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
	165	"flags 0x%x rx_jiffies %lu\n", conn,
	166	(unsigned long long)conn->c_next_rx_seq,
	167	inc,
	168	(unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
	169	be32_to_cpu(inc->i_hdr.h_len),
	170	be16_to_cpu(inc->i_hdr.h_sport),
	171	be16_to_cpu(inc->i_hdr.h_dport),
	172	inc->i_hdr.h_flags,
	173	inc->i_rx_jiffies);
	174
	175	/*
	176	* Sequence numbers should only increase. Messages get their
	177	* sequence number as they're queued in a sending conn. They
	178	* can be dropped, though, if the sending socket is closed before
	179	* they hit the wire. So sequence numbers can skip forward
	180	* under normal operation. They can also drop back in the conn
	181	* failover case as previously sent messages are resent down the
	182	* new instance of a conn. We drop those, otherwise we have
	183	* to assume that the next valid seq does not come after a
	184	* hole in the fragment stream.
	185	*
	186	* The headers don't give us a way to realize if fragments of
	187	* a message have been dropped. We assume that frags that arrive
	188	* to a flow are part of the current message on the flow that is
	189	* being reassembled. This means that senders can't drop messages
	190	* from the sending conn until all their frags are sent.
	191	*
	192	* XXX we could spend more on the wire to get more robust failure
	193	* detection, arguably worth it to avoid data corruption.
	194	*/
	195	if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq
	196	&& (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
	197	rds_stats_inc(s_recv_drop_old_seq);
	198	goto out;
	199	}
	200	conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
	201
	202	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
	203	rds_stats_inc(s_recv_ping);
	204	rds_send_pong(conn, inc->i_hdr.h_sport);
	205	goto out;
	206	}
	207
	208	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
	209	if (rs == NULL) {
	210	rds_stats_inc(s_recv_drop_no_sock);
	211	goto out;
	212	}
	213
	214	/* Process extension headers */
	215	rds_recv_incoming_exthdrs(inc, rs);
	216
	217	/* We can be racing with rds_release() which marks the socket dead. */
	218	sk = rds_rs_to_sk(rs);
	219
	220	/* serialize with rds_release -> sock_orphan */
	221	write_lock_irqsave(&rs->rs_recv_lock, flags);
	222	if (!sock_flag(sk, SOCK_DEAD)) {
	223	rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
	224	rds_stats_inc(s_recv_queued);
	225	rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
	226	be32_to_cpu(inc->i_hdr.h_len),
	227	inc->i_hdr.h_dport);
	228	rds_inc_addref(inc);
	229	list_add_tail(&inc->i_item, &rs->rs_recv_queue);
	230	__rds_wake_sk_sleep(sk);
	231	} else {
	232	rds_stats_inc(s_recv_drop_dead_sock);
	233	}
	234	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
	235
	236	out:
	237	if (rs)
	238	rds_sock_put(rs);
	239	}
	240
	241	/*
	242	* be very careful here. This is being called as the condition in
	243	* wait_event_*() needs to cope with being called many times.
	244	*/
	245	static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
	246	{
	247	unsigned long flags;
	248
	249	if (*inc == NULL) {
	250	read_lock_irqsave(&rs->rs_recv_lock, flags);
	251	if (!list_empty(&rs->rs_recv_queue)) {
	252	*inc = list_entry(rs->rs_recv_queue.next,
	253	struct rds_incoming,
	254	i_item);
	255	rds_inc_addref(*inc);
	256	}
	257	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
	258	}
	259
	260	return *inc != NULL;
	261	}
	262
	263	static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
	264	int drop)
	265	{
	266	struct sock *sk = rds_rs_to_sk(rs);
	267	int ret = 0;
	268	unsigned long flags;
	269
	270	write_lock_irqsave(&rs->rs_recv_lock, flags);
	271	if (!list_empty(&inc->i_item)) {
	272	ret = 1;
	273	if (drop) {
	274	/* XXX make sure this i_conn is reliable */
	275	rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
	276	-be32_to_cpu(inc->i_hdr.h_len),
	277	inc->i_hdr.h_dport);
	278	list_del_init(&inc->i_item);
	279	rds_inc_put(inc);
	280	}
	281	}
	282	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
	283
	284	rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
	285	return ret;
	286	}
	287
	288	/*
	289	* Pull errors off the error queue.
	290	* If msghdr is NULL, we will just purge the error queue.
	291	*/
	292	int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
	293	{
	294	struct rds_notifier *notifier;
	295	struct rds_rdma_notify cmsg;
	296	unsigned int count = 0, max_messages = ~0U;
	297	unsigned long flags;
	298	LIST_HEAD(copy);
	299	int err = 0;
	300
	301
	302	/* put_cmsg copies to user space and thus may sleep. We can't do this
	303	* with rs_lock held, so first grab as many notifications as we can stuff
	304	* in the user provided cmsg buffer. We don't try to copy more, to avoid
	305	* losing notifications - except when the buffer is so small that it wouldn't
	306	* even hold a single notification. Then we give him as much of this single
	307	* msg as we can squeeze in, and set MSG_CTRUNC.
	308	*/
	309	if (msghdr) {
	310	max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
	311	if (!max_messages)
	312	max_messages = 1;
	313	}
	314
	315	spin_lock_irqsave(&rs->rs_lock, flags);
	316	while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
	317	notifier = list_entry(rs->rs_notify_queue.next,
	318	struct rds_notifier, n_list);
	319	list_move(&notifier->n_list, &copy);
	320	count++;
	321	}
	322	spin_unlock_irqrestore(&rs->rs_lock, flags);
	323
	324	if (!count)
	325	return 0;
	326
	327	while (!list_empty(&copy)) {
	328	notifier = list_entry(copy.next, struct rds_notifier, n_list);
	329
	330	if (msghdr) {
	331	cmsg.user_token = notifier->n_user_token;
	332	cmsg.status = notifier->n_status;
	333
	334	err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
	335	sizeof(cmsg), &cmsg);
	336	if (err)
	337	break;
	338	}
	339
	340	list_del_init(&notifier->n_list);
	341	kfree(notifier);
	342	}
	343
	344	/* If we bailed out because of an error in put_cmsg,
	345	* we may be left with one or more notifications that we
	346	* didn't process. Return them to the head of the list. */
	347	if (!list_empty(&copy)) {
	348	spin_lock_irqsave(&rs->rs_lock, flags);
	349	list_splice(&copy, &rs->rs_notify_queue);
	350	spin_unlock_irqrestore(&rs->rs_lock, flags);
	351	}
	352
	353	return err;
	354	}
	355
	356	/*
	357	* Queue a congestion notification
	358	*/
	359	static int rds_notify_cong(struct rds_sock rs, struct msghdr msghdr)
	360	{
	361	uint64_t notify = rs->rs_cong_notify;
	362	unsigned long flags;
	363	int err;
	364
	365	err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
	366	sizeof(notify), &notify);
	367	if (err)
	368	return err;
	369
	370	spin_lock_irqsave(&rs->rs_lock, flags);
	371	rs->rs_cong_notify &= ~notify;
	372	spin_unlock_irqrestore(&rs->rs_lock, flags);
	373
	374	return 0;
	375	}
	376
	377	/*
	378	* Receive any control messages.
	379	*/
	380	static int rds_cmsg_recv(struct rds_incoming inc, struct msghdr msg)
	381	{
	382	int ret = 0;
	383
	384	if (inc->i_rdma_cookie) {
	385	ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
	386	sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
	387	if (ret)
	388	return ret;
	389	}
	390
	391	return 0;
	392	}
	393
	394	int rds_recvmsg(struct kiocb iocb, struct socket sock, struct msghdr *msg,
	395	size_t size, int msg_flags)
	396	{
	397	struct sock *sk = sock->sk;
	398	struct rds_sock *rs = rds_sk_to_rs(sk);
	399	long timeo;
	400	int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
	401	struct sockaddr_in *sin;
	402	struct rds_incoming *inc = NULL;
	403
	404	/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
	405	timeo = sock_rcvtimeo(sk, nonblock);
	406
	407	rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
	408
	409	if (msg_flags & MSG_OOB)
	410	goto out;
	411
	412	/* If there are pending notifications, do those - and nothing else */
	413	if (!list_empty(&rs->rs_notify_queue)) {
	414	ret = rds_notify_queue_get(rs, msg);
	415	goto out;
	416	}
	417
	418	if (rs->rs_cong_notify) {
	419	ret = rds_notify_cong(rs, msg);
	420	goto out;
	421	}
	422
	423	while (1) {
	424	if (!rds_next_incoming(rs, &inc)) {
	425	if (nonblock) {
	426	ret = -EAGAIN;
	427	break;
	428	}
	429
	430	timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
	431	rds_next_incoming(rs, &inc),
	432	timeo);
	433	rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
	434	timeo);
	435	if (timeo > 0 \|\| timeo == MAX_SCHEDULE_TIMEOUT)
	436	continue;
	437
	438	ret = timeo;
	439	if (ret == 0)
	440	ret = -ETIMEDOUT;
	441	break;
	442	}
	443
	444	rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
	445	&inc->i_conn->c_faddr,
	446	ntohs(inc->i_hdr.h_sport));
	447	ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
	448	size);
	449	if (ret < 0)
	450	break;
	451
	452	/*
	453	* if the message we just copied isn't at the head of the
	454	* recv queue then someone else raced us to return it, try
	455	* to get the next message.
	456	*/
	457	if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
	458	rds_inc_put(inc);
	459	inc = NULL;
	460	rds_stats_inc(s_recv_deliver_raced);
	461	continue;
	462	}
	463
	464	if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
	465	if (msg_flags & MSG_TRUNC)
	466	ret = be32_to_cpu(inc->i_hdr.h_len);
	467	msg->msg_flags \|= MSG_TRUNC;
	468	}
	469
	470	if (rds_cmsg_recv(inc, msg)) {
	471	ret = -EFAULT;
	472	goto out;
	473	}
	474
	475	rds_stats_inc(s_recv_delivered);
	476
	477	sin = (struct sockaddr_in *)msg->msg_name;
	478	if (sin) {
	479	sin->sin_family = AF_INET;
	480	sin->sin_port = inc->i_hdr.h_sport;
	481	sin->sin_addr.s_addr = inc->i_saddr;
	482	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
	483	}
	484	break;
	485	}
	486
	487	if (inc)
	488	rds_inc_put(inc);
	489
	490	out:
	491	return ret;
	492	}
	493
	494	/*
	495	* The socket is being shut down and we're asked to drop messages that were
	496	* queued for recvmsg. The caller has unbound the socket so the receive path
	497	* won't queue any more incoming fragments or messages on the socket.
	498	*/
	499	void rds_clear_recv_queue(struct rds_sock *rs)
	500	{
	501	struct sock *sk = rds_rs_to_sk(rs);
	502	struct rds_incoming inc, tmp;
	503	unsigned long flags;
	504
	505	write_lock_irqsave(&rs->rs_recv_lock, flags);
	506	list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
	507	rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
	508	-be32_to_cpu(inc->i_hdr.h_len),
	509	inc->i_hdr.h_dport);
	510	list_del_init(&inc->i_item);
	511	rds_inc_put(inc);
	512	}
	513	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
	514	}
	515
	516	/*
	517	* inc->i_saddr isn't used here because it is only set in the receive
	518	* path.
	519	*/
	520	void rds_inc_info_copy(struct rds_incoming *inc,
	521	struct rds_info_iterator *iter,
	522	__be32 saddr, __be32 daddr, int flip)
	523	{
	524	struct rds_info_message minfo;
	525
	526	minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
	527	minfo.len = be32_to_cpu(inc->i_hdr.h_len);
	528
	529	if (flip) {
	530	minfo.laddr = daddr;
	531	minfo.faddr = saddr;
	532	minfo.lport = inc->i_hdr.h_dport;
	533	minfo.fport = inc->i_hdr.h_sport;
	534	} else {
	535	minfo.laddr = saddr;
	536	minfo.faddr = daddr;
	537	minfo.lport = inc->i_hdr.h_sport;
	538	minfo.fport = inc->i_hdr.h_dport;
	539	}
	540
	541	rds_info_copy(iter, &minfo, sizeof(minfo));
	542	}