IPoIB/cm: Fix SRQ WR leak

SRQ WR leakage has been observed with IPoIB/CM: e.g. flipping ports on and off will, with time, leak out all WRs and then all connections will start getting RNR NAKs. Fix this in the way suggested by spec: move the QP being destroyed to the error state, wait for "Last WQE Reached" event and then post WR on a "drain QP" connected to the same CQ. Once we observe a completion on the drain QP, it's safe to call ib_destroy_qp. Signed-off-by: Michael S. Tsirkin <mst@dev.mellanox.co.il> Signed-off-by: Roland Dreier <rolandd@cisco.com>
author: Michael S. Tsirkin <mst@dev.mellanox.co.il> 2007-05-21 08:04:59 -0400
committer: Roland Dreier <rolandd@cisco.com> 2007-05-21 16:35:40 -0400
commit: 518b1646f8a31904ca637b8df0c1e31c34a7a3c2 (patch)
tree: b72e7d9b6b3e5338d636746e77d326bd42aa4e29 /drivers/infiniband
parent: 24bd1e4e32e88cd3d0675482d15bea498a922ca8 (diff)
3 files changed, 211 insertions, 36 deletions
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 93d4a9a1e1dd..a0b3782c7625 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -132,12 +132,46 @@ struct ipoib_cm_data {
        __be32 mtu;
 };
+/*
+ * Quoting 10.3.1 Queue Pair and EE Context States:
+ *
+ * Note, for QPs that are associated with an SRQ, the Consumer should take the
+ * QP through the Error State before invoking a Destroy QP or a Modify QP to the
+ * Reset State.  The Consumer may invoke the Destroy QP without first performing
+ * a Modify QP to the Error State and waiting for the Affiliated Asynchronous
+ * Last WQE Reached Event. However, if the Consumer does not wait for the
+ * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
+ * leakage may occur. Therefore, it is good programming practice to tear down a
+ * QP that is associated with an SRQ by using the following process:
+ *
+ * - Put the QP in the Error State
+ * - Wait for the Affiliated Asynchronous Last WQE Reached Event;
+ * - either:
+ *       drain the CQ by invoking the Poll CQ verb and either wait for CQ
+ *       to be empty or the number of Poll CQ operations has exceeded
+ *       CQ capacity size;
+ * - or
+ *       post another WR that completes on the same CQ and wait for this
+ *       WR to return as a WC;
+ * - and then invoke a Destroy QP or Reset QP.
+ *
+ * We use the second option and wait for a completion on the
+ * rx_drain_qp before destroying QPs attached to our SRQ.
+ */
+enum ipoib_cm_state { /* Teardown stage of a passive (RX) connection */
+        IPOIB_CM_RX_LIVE,  /* Assigned in ipoib_cm_req_handler() for a new connection */
+        IPOIB_CM_RX_ERROR, /* Ignored by stale task */
+        IPOIB_CM_RX_FLUSH  /* Last WQE Reached event observed */
+};
 struct ipoib_cm_rx { /* Bookkeeping for one passive (RX) connection */
         struct ib_cm_id     *id;
         struct ib_qp        *qp;
         struct list_head     list;     /* links the entry into one of the priv->cm RX lists */
         struct net_device   *dev;
         unsigned long        jiffies;  /* last-use timestamp; the stale task compares it against IPOIB_CM_RX_TIMEOUT */
+        enum ipoib_cm_state  state;    /* teardown stage, see enum ipoib_cm_state */
 };
 struct ipoib_cm_tx {
@@ -165,10 +199,16 @@ struct ipoib_cm_dev_priv {
        struct ib_srq          *srq;
        struct ipoib_cm_rx_buf *srq_ring;
        struct ib_cm_id        *id;
-        struct list_head        passive_ids;
+        struct ib_qp           *rx_drain_qp;   /* generates WR described in 10.3.1 */
+        struct list_head        passive_ids;   /* state: LIVE */
+        struct list_head        rx_error_list; /* state: ERROR */
+        struct list_head        rx_flush_list; /* state: FLUSH, drain not started */
+        struct list_head        rx_drain_list; /* state: FLUSH, drain started */
+        struct list_head        rx_reap_list;  /* state: FLUSH, drain done */
        struct work_struct      start_task;
        struct work_struct      reap_task;
        struct work_struct      skb_task;
+        struct work_struct      rx_reap_task;
        struct delayed_work     stale_task;
        struct sk_buff_head     skb_queue;
        struct list_head        start_list;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index eec833b81e9b..ffec794b7913 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -37,6 +37,7 @@
 #include <net/dst.h>
 #include <net/icmp.h>
 #include <linux/icmpv6.h>
+#include <linux/delay.h>
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
 static int data_debug_level;
@@ -62,6 +63,16 @@ struct ipoib_cm_id {
        u32 remote_mtu;
 };
+static struct ib_qp_attr ipoib_cm_err_attr = { /* passed to ib_modify_qp() with IB_QP_STATE to move a QP to the error state */
+        .qp_state = IB_QPS_ERR
+};
+#define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff /* wr_id of the drain WR; recognized in ipoib_cm_handle_rx_wc() */
+static struct ib_recv_wr ipoib_cm_rx_drain_wr = { /* the receive WR posted on rx_drain_qp by ipoib_cm_start_rx_drain() */
+        .wr_id = IPOIB_CM_RX_DRAIN_WRID
+};
 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
                               struct ib_cm_event *event);
@@ -150,11 +161,44 @@ partial_error:
        return NULL;
 }
+static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
+{ /* Post the drain WR for the connections waiting on rx_flush_list, if none is outstanding */
+        struct ib_recv_wr *bad_wr;
+        /* The drain WR is posted on the rx_drain_qp receive queue, whose
+         * depth is 1, so make sure we have at most 1 outstanding WR:
+         * do nothing if no connection awaits a drain or if a drain is
+         * already in flight.  Both visible callers hold priv->lock
+         * around this call. */
+        if (list_empty(&priv->cm.rx_flush_list) ||
+            !list_empty(&priv->cm.rx_drain_list))
+                return;
+        if (ib_post_recv(priv->cm.rx_drain_qp, &ipoib_cm_rx_drain_wr, &bad_wr))
+                ipoib_warn(priv, "failed to post rx_drain wr\n"); /* only logged; the splice below still runs */
+        list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list); /* flush -> drain: now waiting for the drain completion */
+}
+static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
+{ /* QP event handler for RX QPs: on Last WQE Reached, queue the connection for draining */
+        struct ipoib_cm_rx *p = ctx; /* ctx is treated as the connection's ipoib_cm_rx */
+        struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+        unsigned long flags;
+        if (event->event != IB_EVENT_QP_LAST_WQE_REACHED) /* every other event is ignored */
+                return;
+        spin_lock_irqsave(&priv->lock, flags);
+        list_move(&p->list, &priv->cm.rx_flush_list); /* off whichever list it was on, onto the flush list */
+        p->state = IPOIB_CM_RX_FLUSH;
+        ipoib_cm_start_rx_drain(priv); /* posts the drain WR unless one is already outstanding */
+        spin_unlock_irqrestore(&priv->lock, flags);
+}
 static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
                                           struct ipoib_cm_rx *p)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ib_qp_init_attr attr = {
+                .event_handler = ipoib_cm_rx_event_handler,
                .send_cq = priv->cq, /* does not matter, we never send anything */
                .recv_cq = priv->cq,
                .srq = priv->cm.srq,
@@ -256,6 +300,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
        cm_id->context = p;
        p->jiffies = jiffies;
+        p->state = IPOIB_CM_RX_LIVE;
        spin_lock_irq(&priv->lock);
        if (list_empty(&priv->cm.passive_ids))
                queue_delayed_work(ipoib_workqueue,
@@ -277,7 +322,6 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
 {
        struct ipoib_cm_rx *p;
        struct ipoib_dev_priv *priv;
-        int ret;
        switch (event->event) {
        case IB_CM_REQ_RECEIVED:
@@ -289,20 +333,9 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
        case IB_CM_REJ_RECEIVED:
                p = cm_id->context;
                priv = netdev_priv(p->dev);
-                spin_lock_irq(&priv->lock);
+                if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
-                if (list_empty(&p->list))
+                        ipoib_warn(priv, "unable to move qp to error state\n");
-                        ret = 0; /* Connection is going away already. */
+                /* Fall through */
-                else {
-                        list_del_init(&p->list);
-                        ret = -ECONNRESET;
-                }
-                spin_unlock_irq(&priv->lock);
-                if (ret) {
-                        ib_destroy_qp(p->qp);
-                        kfree(p);
-                        return ret;
-                }
-                return 0;
        default:
                return 0;
        }
@@ -354,8 +387,15 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
                       wr_id, wc->status);
        if (unlikely(wr_id >= ipoib_recvq_size)) {
-                ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
+                if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
-                           wr_id, ipoib_recvq_size);
+                        spin_lock_irqsave(&priv->lock, flags);
+                        list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
+                        ipoib_cm_start_rx_drain(priv);
+                        queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+                        spin_unlock_irqrestore(&priv->lock, flags);
+                } else
+                        ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
+                                   wr_id, ipoib_recvq_size);
                return;
        }
@@ -374,9 +414,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
                if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
                        spin_lock_irqsave(&priv->lock, flags);
                        p->jiffies = jiffies;
-                        /* Move this entry to list head, but do
+                        /* Move this entry to list head, but do not re-add it
-                         * not re-add it if it has been removed. */
+                         * if it has been moved out of list. */
-                        if (!list_empty(&p->list))
+                        if (p->state == IPOIB_CM_RX_LIVE)
                                list_move(&p->list, &priv->cm.passive_ids);
                        spin_unlock_irqrestore(&priv->lock, flags);
                }
@@ -583,17 +623,43 @@ static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
 int ipoib_cm_dev_open(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
+        struct ib_qp_init_attr qp_init_attr = {
+                .send_cq = priv->cq,   /* does not matter, we never send anything */
+                .recv_cq = priv->cq,
+                .cap.max_send_wr = 1,  /* FIXME: 0 Seems not to work */
+                .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
+                .cap.max_recv_wr = 1,
+                .cap.max_recv_sge = 1, /* FIXME: 0 Seems not to work */
+                .sq_sig_type = IB_SIGNAL_ALL_WR,
+                .qp_type = IB_QPT_UC,
+        };
        int ret;
        if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
                return 0;
+        priv->cm.rx_drain_qp = ib_create_qp(priv->pd, &qp_init_attr);
+        if (IS_ERR(priv->cm.rx_drain_qp)) {
+                printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
+                ret = PTR_ERR(priv->cm.rx_drain_qp);
+                return ret;
+        }
+        /*
+         * We put the QP in error state directly.  This way, a "flush
+         * error" WC will be immediately generated for each WR we post.
+         */
+        ret = ib_modify_qp(priv->cm.rx_drain_qp, &ipoib_cm_err_attr, IB_QP_STATE);
+        if (ret) {
+                ipoib_warn(priv, "failed to modify drain QP to error: %d\n", ret);
+                goto err_qp;
+        }
        priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
        if (IS_ERR(priv->cm.id)) {
                printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
                ret = PTR_ERR(priv->cm.id);
-                priv->cm.id = NULL;
+                goto err_cm;
-                return ret;
        }
        ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
@@ -601,35 +667,79 @@ int ipoib_cm_dev_open(struct net_device *dev)
        if (ret) {
                printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
                       IPOIB_CM_IETF_ID | priv->qp->qp_num);
-                ib_destroy_cm_id(priv->cm.id);
+                goto err_listen;
-                priv->cm.id = NULL;
-                return ret;
        }
        return 0;
+err_listen:
+        ib_destroy_cm_id(priv->cm.id);
+err_cm:
+        priv->cm.id = NULL;
+err_qp:
+        ib_destroy_qp(priv->cm.rx_drain_qp);
+        return ret;
 }
 void ipoib_cm_dev_stop(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
-        struct ipoib_cm_rx *p;
+        struct ipoib_cm_rx *p, *n;
+        unsigned long begin;
+        LIST_HEAD(list);
+        int ret;
        if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
                return;
        ib_destroy_cm_id(priv->cm.id);
        priv->cm.id = NULL;
        spin_lock_irq(&priv->lock);
        while (!list_empty(&priv->cm.passive_ids)) {
                p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
-                list_del_init(&p->list);
+                list_move(&p->list, &priv->cm.rx_error_list);
+                p->state = IPOIB_CM_RX_ERROR;
                spin_unlock_irq(&priv->lock);
+                ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+                if (ret)
+                        ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+                spin_lock_irq(&priv->lock);
+        }
+        /* Wait for all RX to be drained */
+        begin = jiffies;
+        while (!list_empty(&priv->cm.rx_error_list) ||
+               !list_empty(&priv->cm.rx_flush_list) ||
+               !list_empty(&priv->cm.rx_drain_list)) {
+                if (!time_after(jiffies, begin + 5 * HZ)) {
+                        ipoib_warn(priv, "RX drain timing out\n");
+                        /*
+                         * assume the HW is wedged and just free up everything.
+                         */
+                        list_splice_init(&priv->cm.rx_flush_list, &list);
+                        list_splice_init(&priv->cm.rx_error_list, &list);
+                        list_splice_init(&priv->cm.rx_drain_list, &list);
+                        break;
+                }
+                spin_unlock_irq(&priv->lock);
+                msleep(1);
+                spin_lock_irq(&priv->lock);
+        }
+        list_splice_init(&priv->cm.rx_reap_list, &list);
+        spin_unlock_irq(&priv->lock);
+        list_for_each_entry_safe(p, n, &list, list) {
                ib_destroy_cm_id(p->id);
                ib_destroy_qp(p->qp);
                kfree(p);
-                spin_lock_irq(&priv->lock);
        }
-        spin_unlock_irq(&priv->lock);
+        ib_destroy_qp(priv->cm.rx_drain_qp);
        cancel_delayed_work(&priv->cm.stale_task);
 }
@@ -1079,24 +1189,44 @@ void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
                queue_work(ipoib_workqueue, &priv->cm.skb_task);
 }
+static void ipoib_cm_rx_reap(struct work_struct *work)
+{ /* Work handler for cm.rx_reap_task: destroy every connection found on rx_reap_list */
+        struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+                                                   cm.rx_reap_task);
+        struct ipoib_cm_rx *p, *n;
+        LIST_HEAD(list);
+        spin_lock_irq(&priv->lock);
+        list_splice_init(&priv->cm.rx_reap_list, &list); /* detach the whole list while holding the lock */
+        spin_unlock_irq(&priv->lock);
+        list_for_each_entry_safe(p, n, &list, list) { /* destruction happens with the lock released */
+                ib_destroy_cm_id(p->id);
+                ib_destroy_qp(p->qp);
+                kfree(p);
+        }
+}
 static void ipoib_cm_stale_task(struct work_struct *work)
 {
        struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
                                                   cm.stale_task.work);
        struct ipoib_cm_rx *p;
+        int ret;
        spin_lock_irq(&priv->lock);
        while (!list_empty(&priv->cm.passive_ids)) {
-                /* List if sorted by LRU, start from tail,
+                /* List is sorted by LRU, start from tail,
                 * stop when we see a recently used entry */
                p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
                if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
                        break;
-                list_del_init(&p->list);
+                list_move(&p->list, &priv->cm.rx_error_list);
+                p->state = IPOIB_CM_RX_ERROR;
                spin_unlock_irq(&priv->lock);
-                ib_destroy_cm_id(p->id);
+                ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
-                ib_destroy_qp(p->qp);
+                if (ret)
-                kfree(p);
+                        ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
                spin_lock_irq(&priv->lock);
        }
@@ -1164,9 +1294,14 @@ int ipoib_cm_dev_init(struct net_device *dev)
        INIT_LIST_HEAD(&priv->cm.passive_ids);
        INIT_LIST_HEAD(&priv->cm.reap_list);
        INIT_LIST_HEAD(&priv->cm.start_list);
+        INIT_LIST_HEAD(&priv->cm.rx_error_list);
+        INIT_LIST_HEAD(&priv->cm.rx_flush_list);
+        INIT_LIST_HEAD(&priv->cm.rx_drain_list);
+        INIT_LIST_HEAD(&priv->cm.rx_reap_list);
        INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
        INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
        INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
+        INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
        INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
        skb_queue_head_init(&priv->cm.skb_queue);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 791252621b26..982eb88e27ec 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -173,7 +173,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
        size = ipoib_sendq_size + ipoib_recvq_size + 1;
        ret = ipoib_cm_dev_init(dev);
        if (!ret)
-                size += ipoib_recvq_size;
+                size += ipoib_recvq_size + 1 /* 1 extra for rx_drain_qp */;
        priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
        if (IS_ERR(priv->cq)) {
author	Michael S. Tsirkin <mst@dev.mellanox.co.il>	2007-05-21 08:04:59 -0400
committer	Roland Dreier <rolandd@cisco.com>	2007-05-21 16:35:40 -0400
commit	518b1646f8a31904ca637b8df0c1e31c34a7a3c2 (patch)
tree	b72e7d9b6b3e5338d636746e77d326bd42aa4e29 /drivers/infiniband
parent	24bd1e4e32e88cd3d0675482d15bea498a922ca8 (diff)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 93d4a9a1e1dd..a0b3782c7625 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -132,12 +132,46 @@ struct ipoib_cm_data {
132	__be32 mtu;	132	__be32 mtu;
133	};	133	};
134		134
		135	/*
		136	* Quoting 10.3.1 Queue Pair and EE Context States:
		137	*
		138	* Note, for QPs that are associated with an SRQ, the Consumer should take the
		139	* QP through the Error State before invoking a Destroy QP or a Modify QP to the
		140	* Reset State. The Consumer may invoke the Destroy QP without first performing
		141	* a Modify QP to the Error State and waiting for the Affiliated Asynchronous
		142	* Last WQE Reached Event. However, if the Consumer does not wait for the
		143	* Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
		144	* leakage may occur. Therefore, it is good programming practice to tear down a
		145	* QP that is associated with an SRQ by using the following process:
		146	*
		147	* - Put the QP in the Error State
		148	* - Wait for the Affiliated Asynchronous Last WQE Reached Event;
		149	* - either:
		150	* drain the CQ by invoking the Poll CQ verb and either wait for CQ
		151	* to be empty or the number of Poll CQ operations has exceeded
		152	* CQ capacity size;
		153	* - or
		154	* post another WR that completes on the same CQ and wait for this
		155	* WR to return as a WC;
		156	* - and then invoke a Destroy QP or Reset QP.
		157	*
		158	* We use the second option and wait for a completion on the
		159	* rx_drain_qp before destroying QPs attached to our SRQ.
		160	*/
		161
		162	enum ipoib_cm_state {
		163	IPOIB_CM_RX_LIVE,
		164	IPOIB_CM_RX_ERROR, /* Ignored by stale task */
		165	IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */
		166	};
		167
135	struct ipoib_cm_rx {	168	struct ipoib_cm_rx {
136	struct ib_cm_id *id;	169	struct ib_cm_id *id;
137	struct ib_qp *qp;	170	struct ib_qp *qp;
138	struct list_head list;	171	struct list_head list;
139	struct net_device *dev;	172	struct net_device *dev;
140	unsigned long jiffies;	173	unsigned long jiffies;
		174	enum ipoib_cm_state state;
141	};	175	};
142		176
143	struct ipoib_cm_tx {	177	struct ipoib_cm_tx {
@@ -165,10 +199,16 @@ struct ipoib_cm_dev_priv {
165	struct ib_srq *srq;	199	struct ib_srq *srq;
166	struct ipoib_cm_rx_buf *srq_ring;	200	struct ipoib_cm_rx_buf *srq_ring;
167	struct ib_cm_id *id;	201	struct ib_cm_id *id;
168	struct list_head passive_ids;	202	struct ib_qp *rx_drain_qp; /* generates WR described in 10.3.1 */
		203	struct list_head passive_ids; /* state: LIVE */
		204	struct list_head rx_error_list; /* state: ERROR */
		205	struct list_head rx_flush_list; /* state: FLUSH, drain not started */
		206	struct list_head rx_drain_list; /* state: FLUSH, drain started */
		207	struct list_head rx_reap_list; /* state: FLUSH, drain done */
169	struct work_struct start_task;	208	struct work_struct start_task;
170	struct work_struct reap_task;	209	struct work_struct reap_task;
171	struct work_struct skb_task;	210	struct work_struct skb_task;
		211	struct work_struct rx_reap_task;
172	struct delayed_work stale_task;	212	struct delayed_work stale_task;
173	struct sk_buff_head skb_queue;	213	struct sk_buff_head skb_queue;
174	struct list_head start_list;	214	struct list_head start_list;


diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index eec833b81e9b..ffec794b7913 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -37,6 +37,7 @@
37	#include <net/dst.h>	37	#include <net/dst.h>
38	#include <net/icmp.h>	38	#include <net/icmp.h>
39	#include <linux/icmpv6.h>	39	#include <linux/icmpv6.h>
		40	#include <linux/delay.h>
40		41
41	#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA	42	#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
42	static int data_debug_level;	43	static int data_debug_level;
@@ -62,6 +63,16 @@ struct ipoib_cm_id {
62	u32 remote_mtu;	63	u32 remote_mtu;
63	};	64	};
64		65
		66	static struct ib_qp_attr ipoib_cm_err_attr = {
		67	.qp_state = IB_QPS_ERR
		68	};
		69
		70	#define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff
		71
		72	static struct ib_recv_wr ipoib_cm_rx_drain_wr = {
		73	.wr_id = IPOIB_CM_RX_DRAIN_WRID
		74	};
		75
65	static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,	76	static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
66	struct ib_cm_event *event);	77	struct ib_cm_event *event);
67		78
@@ -150,11 +161,44 @@ partial_error:
150	return NULL;	161	return NULL;
151	}	162	}
152		163
		164	static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
		165	{
		166	struct ib_recv_wr *bad_wr;
		167
		168	/* rx_drain_qp send queue depth is 1, so
		169	* make sure we have at most 1 outstanding WR. */
		170	if (list_empty(&priv->cm.rx_flush_list) ||
		171	!list_empty(&priv->cm.rx_drain_list))
		172	return;
		173
		174	if (ib_post_recv(priv->cm.rx_drain_qp, &ipoib_cm_rx_drain_wr, &bad_wr))
		175	ipoib_warn(priv, "failed to post rx_drain wr\n");
		176
		177	list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
		178	}
		179
		180	static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
		181	{
		182	struct ipoib_cm_rx *p = ctx;
		183	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
		184	unsigned long flags;
		185
		186	if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
		187	return;
		188
		189	spin_lock_irqsave(&priv->lock, flags);
		190	list_move(&p->list, &priv->cm.rx_flush_list);
		191	p->state = IPOIB_CM_RX_FLUSH;
		192	ipoib_cm_start_rx_drain(priv);
		193	spin_unlock_irqrestore(&priv->lock, flags);
		194	}
		195
153	static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,	196	static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
154	struct ipoib_cm_rx *p)	197	struct ipoib_cm_rx *p)
155	{	198	{
156	struct ipoib_dev_priv *priv = netdev_priv(dev);	199	struct ipoib_dev_priv *priv = netdev_priv(dev);
157	struct ib_qp_init_attr attr = {	200	struct ib_qp_init_attr attr = {
		201	.event_handler = ipoib_cm_rx_event_handler,
158	.send_cq = priv->cq, /* does not matter, we never send anything */	202	.send_cq = priv->cq, /* does not matter, we never send anything */
159	.recv_cq = priv->cq,	203	.recv_cq = priv->cq,
160	.srq = priv->cm.srq,	204	.srq = priv->cm.srq,
@@ -256,6 +300,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
256		300
257	cm_id->context = p;	301	cm_id->context = p;
258	p->jiffies = jiffies;	302	p->jiffies = jiffies;
		303	p->state = IPOIB_CM_RX_LIVE;
259	spin_lock_irq(&priv->lock);	304	spin_lock_irq(&priv->lock);
260	if (list_empty(&priv->cm.passive_ids))	305	if (list_empty(&priv->cm.passive_ids))
261	queue_delayed_work(ipoib_workqueue,	306	queue_delayed_work(ipoib_workqueue,
@@ -277,7 +322,6 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
277	{	322	{
278	struct ipoib_cm_rx *p;	323	struct ipoib_cm_rx *p;
279	struct ipoib_dev_priv *priv;	324	struct ipoib_dev_priv *priv;
280	int ret;
281		325
282	switch (event->event) {	326	switch (event->event) {
283	case IB_CM_REQ_RECEIVED:	327	case IB_CM_REQ_RECEIVED:
@@ -289,20 +333,9 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
289	case IB_CM_REJ_RECEIVED:	333	case IB_CM_REJ_RECEIVED:
290	p = cm_id->context;	334	p = cm_id->context;
291	priv = netdev_priv(p->dev);	335	priv = netdev_priv(p->dev);
292	spin_lock_irq(&priv->lock);	336	if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
293	if (list_empty(&p->list))	337	ipoib_warn(priv, "unable to move qp to error state\n");
294	ret = 0; /* Connection is going away already. */	338	/* Fall through */
295	else {
296	list_del_init(&p->list);
297	ret = -ECONNRESET;
298	}
299	spin_unlock_irq(&priv->lock);
300	if (ret) {
301	ib_destroy_qp(p->qp);
302	kfree(p);
303	return ret;
304	}
305	return 0;
306	default:	339	default:
307	return 0;	340	return 0;
308	}	341	}
@@ -354,8 +387,15 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
354	wr_id, wc->status);	387	wr_id, wc->status);
355		388
356	if (unlikely(wr_id >= ipoib_recvq_size)) {	389	if (unlikely(wr_id >= ipoib_recvq_size)) {
357	ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",	390	if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
358	wr_id, ipoib_recvq_size);	391	spin_lock_irqsave(&priv->lock, flags);
		392	list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
		393	ipoib_cm_start_rx_drain(priv);
		394	queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
		395	spin_unlock_irqrestore(&priv->lock, flags);
		396	} else
		397	ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
		398	wr_id, ipoib_recvq_size);
359	return;	399	return;
360	}	400	}
361		401
@@ -374,9 +414,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
374	if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {	414	if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
375	spin_lock_irqsave(&priv->lock, flags);	415	spin_lock_irqsave(&priv->lock, flags);
376	p->jiffies = jiffies;	416	p->jiffies = jiffies;
377	/* Move this entry to list head, but do	417	/* Move this entry to list head, but do not re-add it
378	* not re-add it if it has been removed. */	418	* if it has been moved out of list. */
379	if (!list_empty(&p->list))	419	if (p->state == IPOIB_CM_RX_LIVE)
380	list_move(&p->list, &priv->cm.passive_ids);	420	list_move(&p->list, &priv->cm.passive_ids);
381	spin_unlock_irqrestore(&priv->lock, flags);	421	spin_unlock_irqrestore(&priv->lock, flags);
382	}	422	}
@@ -583,17 +623,43 @@ static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
583	int ipoib_cm_dev_open(struct net_device *dev)	623	int ipoib_cm_dev_open(struct net_device *dev)
584	{	624	{
585	struct ipoib_dev_priv *priv = netdev_priv(dev);	625	struct ipoib_dev_priv *priv = netdev_priv(dev);
		626	struct ib_qp_init_attr qp_init_attr = {
		627	.send_cq = priv->cq, /* does not matter, we never send anything */
		628	.recv_cq = priv->cq,
		629	.cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */
		630	.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
		631	.cap.max_recv_wr = 1,
		632	.cap.max_recv_sge = 1, /* FIXME: 0 Seems not to work */
		633	.sq_sig_type = IB_SIGNAL_ALL_WR,
		634	.qp_type = IB_QPT_UC,
		635	};
586	int ret;	636	int ret;
587		637
588	if (!IPOIB_CM_SUPPORTED(dev->dev_addr))	638	if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
589	return 0;	639	return 0;
590		640
		641	priv->cm.rx_drain_qp = ib_create_qp(priv->pd, &qp_init_attr);
		642	if (IS_ERR(priv->cm.rx_drain_qp)) {
		643	printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
		644	ret = PTR_ERR(priv->cm.rx_drain_qp);
		645	return ret;
		646	}
		647
		648	/*
		649	* We put the QP in error state directly. This way, a "flush
		650	* error" WC will be immediately generated for each WR we post.
		651	*/
		652	ret = ib_modify_qp(priv->cm.rx_drain_qp, &ipoib_cm_err_attr, IB_QP_STATE);
		653	if (ret) {
		654	ipoib_warn(priv, "failed to modify drain QP to error: %d\n", ret);
		655	goto err_qp;
		656	}
		657
591	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);	658	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
592	if (IS_ERR(priv->cm.id)) {	659	if (IS_ERR(priv->cm.id)) {
593	printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);	660	printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
594	ret = PTR_ERR(priv->cm.id);	661	ret = PTR_ERR(priv->cm.id);
595	priv->cm.id = NULL;	662	goto err_cm;
596	return ret;
597	}	663	}
598		664
599	ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),	665	ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
@@ -601,35 +667,79 @@ int ipoib_cm_dev_open(struct net_device *dev)
601	if (ret) {	667	if (ret) {
602	printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,	668	printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
603	IPOIB_CM_IETF_ID \| priv->qp->qp_num);	669	IPOIB_CM_IETF_ID \| priv->qp->qp_num);
604	ib_destroy_cm_id(priv->cm.id);	670	goto err_listen;
605	priv->cm.id = NULL;
606	return ret;
607	}	671	}
		672
608	return 0;	673	return 0;
		674
		675	err_listen:
		676	ib_destroy_cm_id(priv->cm.id);
		677	err_cm:
		678	priv->cm.id = NULL;
		679	err_qp:
		680	ib_destroy_qp(priv->cm.rx_drain_qp);
		681	return ret;
609	}	682	}
610		683
611	void ipoib_cm_dev_stop(struct net_device *dev)	684	void ipoib_cm_dev_stop(struct net_device *dev)
612	{	685	{
613	struct ipoib_dev_priv *priv = netdev_priv(dev);	686	struct ipoib_dev_priv *priv = netdev_priv(dev);
614	struct ipoib_cm_rx *p;	687	struct ipoib_cm_rx p, n;
		688	unsigned long begin;
		689	LIST_HEAD(list);
		690	int ret;
615		691
616	if (!IPOIB_CM_SUPPORTED(dev->dev_addr) \|\| !priv->cm.id)	692	if (!IPOIB_CM_SUPPORTED(dev->dev_addr) \|\| !priv->cm.id)
617	return;	693	return;
618		694
619	ib_destroy_cm_id(priv->cm.id);	695	ib_destroy_cm_id(priv->cm.id);
620	priv->cm.id = NULL;	696	priv->cm.id = NULL;
		697
621	spin_lock_irq(&priv->lock);	698	spin_lock_irq(&priv->lock);
622	while (!list_empty(&priv->cm.passive_ids)) {	699	while (!list_empty(&priv->cm.passive_ids)) {
623	p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);	700	p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
624	list_del_init(&p->list);	701	list_move(&p->list, &priv->cm.rx_error_list);
		702	p->state = IPOIB_CM_RX_ERROR;
625	spin_unlock_irq(&priv->lock);	703	spin_unlock_irq(&priv->lock);
		704	ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
		705	if (ret)
		706	ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
		707	spin_lock_irq(&priv->lock);
		708	}
		709
		710	/* Wait for all RX to be drained */
		711	begin = jiffies;
		712
		713	while (!list_empty(&priv->cm.rx_error_list) \|\|
		714	!list_empty(&priv->cm.rx_flush_list) \|\|
		715	!list_empty(&priv->cm.rx_drain_list)) {
		716	if (!time_after(jiffies, begin + 5 * HZ)) {
		717	ipoib_warn(priv, "RX drain timing out\n");
		718
		719	/*
		720	* assume the HW is wedged and just free up everything.
		721	*/
		722	list_splice_init(&priv->cm.rx_flush_list, &list);
		723	list_splice_init(&priv->cm.rx_error_list, &list);
		724	list_splice_init(&priv->cm.rx_drain_list, &list);
		725	break;
		726	}
		727	spin_unlock_irq(&priv->lock);
		728	msleep(1);
		729	spin_lock_irq(&priv->lock);
		730	}
		731
		732	list_splice_init(&priv->cm.rx_reap_list, &list);
		733
		734	spin_unlock_irq(&priv->lock);
		735
		736	list_for_each_entry_safe(p, n, &list, list) {
626	ib_destroy_cm_id(p->id);	737	ib_destroy_cm_id(p->id);
627	ib_destroy_qp(p->qp);	738	ib_destroy_qp(p->qp);
628	kfree(p);	739	kfree(p);
629	spin_lock_irq(&priv->lock);
630	}	740	}
631	spin_unlock_irq(&priv->lock);
632		741
		742	ib_destroy_qp(priv->cm.rx_drain_qp);
633	cancel_delayed_work(&priv->cm.stale_task);	743	cancel_delayed_work(&priv->cm.stale_task);
634	}	744	}
635		745
@@ -1079,24 +1189,44 @@ void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
1079	queue_work(ipoib_workqueue, &priv->cm.skb_task);	1189	queue_work(ipoib_workqueue, &priv->cm.skb_task);
1080	}	1190	}
1081		1191
		1192	static void ipoib_cm_rx_reap(struct work_struct *work)
		1193	{
		1194	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
		1195	cm.rx_reap_task);
		1196	struct ipoib_cm_rx p, n;
		1197	LIST_HEAD(list);
		1198
		1199	spin_lock_irq(&priv->lock);
		1200	list_splice_init(&priv->cm.rx_reap_list, &list);
		1201	spin_unlock_irq(&priv->lock);
		1202
		1203	list_for_each_entry_safe(p, n, &list, list) {
		1204	ib_destroy_cm_id(p->id);
		1205	ib_destroy_qp(p->qp);
		1206	kfree(p);
		1207	}
		1208	}
		1209
1082	static void ipoib_cm_stale_task(struct work_struct *work)	1210	static void ipoib_cm_stale_task(struct work_struct *work)
1083	{	1211	{
1084	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,	1212	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1085	cm.stale_task.work);	1213	cm.stale_task.work);
1086	struct ipoib_cm_rx *p;	1214	struct ipoib_cm_rx *p;
		1215	int ret;
1087		1216
1088	spin_lock_irq(&priv->lock);	1217	spin_lock_irq(&priv->lock);
1089	while (!list_empty(&priv->cm.passive_ids)) {	1218	while (!list_empty(&priv->cm.passive_ids)) {
1090	/* List if sorted by LRU, start from tail,	1219	/* List is sorted by LRU, start from tail,
1091	* stop when we see a recently used entry */	1220	* stop when we see a recently used entry */
1092	p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);	1221	p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
1093	if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))	1222	if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
1094	break;	1223	break;
1095	list_del_init(&p->list);	1224	list_move(&p->list, &priv->cm.rx_error_list);
		1225	p->state = IPOIB_CM_RX_ERROR;
1096	spin_unlock_irq(&priv->lock);	1226	spin_unlock_irq(&priv->lock);
1097	ib_destroy_cm_id(p->id);	1227	ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
1098	ib_destroy_qp(p->qp);	1228	if (ret)
1099	kfree(p);	1229	ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
1100	spin_lock_irq(&priv->lock);	1230	spin_lock_irq(&priv->lock);
1101	}	1231	}
1102		1232
@@ -1164,9 +1294,14 @@ int ipoib_cm_dev_init(struct net_device *dev)
1164	INIT_LIST_HEAD(&priv->cm.passive_ids);	1294	INIT_LIST_HEAD(&priv->cm.passive_ids);
1165	INIT_LIST_HEAD(&priv->cm.reap_list);	1295	INIT_LIST_HEAD(&priv->cm.reap_list);
1166	INIT_LIST_HEAD(&priv->cm.start_list);	1296	INIT_LIST_HEAD(&priv->cm.start_list);
		1297	INIT_LIST_HEAD(&priv->cm.rx_error_list);
		1298	INIT_LIST_HEAD(&priv->cm.rx_flush_list);
		1299	INIT_LIST_HEAD(&priv->cm.rx_drain_list);
		1300	INIT_LIST_HEAD(&priv->cm.rx_reap_list);
1167	INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);	1301	INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
1168	INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);	1302	INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
1169	INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);	1303	INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
		1304	INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
1170	INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);	1305	INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
1171		1306
1172	skb_queue_head_init(&priv->cm.skb_queue);	1307	skb_queue_head_init(&priv->cm.skb_queue);


diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 791252621b26..982eb88e27ec 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -173,7 +173,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
173	size = ipoib_sendq_size + ipoib_recvq_size + 1;	173	size = ipoib_sendq_size + ipoib_recvq_size + 1;
174	ret = ipoib_cm_dev_init(dev);	174	ret = ipoib_cm_dev_init(dev);
175	if (!ret)	175	if (!ret)
176	size += ipoib_recvq_size;	176	size += ipoib_recvq_size + 1 /* 1 extra for rx_drain_qp */;
177		177
178	priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);	178	priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
179	if (IS_ERR(priv->cq)) {	179	if (IS_ERR(priv->cq)) {