author     Joachim Fenkes <fenkes@de.ibm.com>    2008-01-17 09:07:24 -0500
committer  Roland Dreier <rolandd@cisco.com>     2008-01-25 17:15:44 -0500
commit     2ec8e662416cc9a171cdfe3d75e1ff00ba757859 (patch)
tree       dbc881205c3177db9a7816385b4fdda762bdb04c /drivers/infiniband
parent     bbdd267ef2796e96b461b8447b2026ce06e6ec4b (diff)
IB/ehca: Prevent RDMA-related connection failures on some eHCA2 hardware
Some HW revisions of eHCA2 may cause an RC connection to break if they
received RDMA Reads over that connection before. This can be prevented
by assuring that, after the first RDMA Read, the QP receives a new RDMA
Read every few million link packets.

Include code into the driver that inserts an empty (size 0) RDMA Read
into the message stream every now and then if the consumer doesn't post
them frequently enough.

Signed-off-by: Joachim Fenkes <fenkes@de.ibm.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
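For illustration only (this sketch is not part of the patch): the gist of the check the patch adds at the top of ehca_post_send(). The struct below is a hypothetical stand-in for the new ehca_qp fields; the authoritative code is in the diff that follows.

/*
 * Minimal, self-contained sketch of the ack-circumvention heuristic,
 * using stand-in types; the real fields live in struct ehca_qp.
 */
#include <stdbool.h>

#define ACK_CIRC_THRESHOLD 2000000	/* link packets between forced RDMA Reads */

struct circ_state {			/* hypothetical stand-in for ehca_qp fields */
	bool unsol_ack_circ;		/* an RDMA Read was seen on this QP before */
	unsigned long packet_count;	/* estimated link packets since that Read */
	unsigned long message_count;	/* send WQEs posted since that Read */
	unsigned int max_send_wr;	/* SQ depth, from init_attr.cap.max_send_wr */
};

/* True if an empty RDMA Read should be slipped into the send queue. */
static bool need_empty_rdma_read(const struct circ_state *s)
{
	return s->unsol_ack_circ &&
	       s->packet_count > ACK_CIRC_THRESHOLD &&
	       s->message_count > s->max_send_wr;
}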
Diffstat (limited to 'drivers/infiniband')
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_classes.h |   5
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_qp.c      |  14
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_reqs.c    | 112
3 files changed, 95 insertions(+), 36 deletions(-)
diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h
index 2502366e845f..f281d16040f5 100644
--- a/drivers/infiniband/hw/ehca/ehca_classes.h
+++ b/drivers/infiniband/hw/ehca/ehca_classes.h
@@ -183,6 +183,11 @@ struct ehca_qp {
 	u32 mm_count_squeue;
 	u32 mm_count_rqueue;
 	u32 mm_count_galpa;
+	/* unsolicited ack circumvention */
+	int unsol_ack_circ;
+	int mtu_shift;
+	u32 message_count;
+	u32 packet_count;
 };
 
 #define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
index 8d3c35fa051b..1012f15a7140 100644
--- a/drivers/infiniband/hw/ehca/ehca_qp.c
+++ b/drivers/infiniband/hw/ehca/ehca_qp.c
@@ -592,10 +592,8 @@ static struct ehca_qp *internal_create_qp(
 		goto create_qp_exit1;
 	}
 
-	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
-		parms.sigtype = HCALL_SIGT_EVERY;
-	else
-		parms.sigtype = HCALL_SIGT_BY_WQE;
+	/* Always signal by WQE so we can hide circ. WQEs */
+	parms.sigtype = HCALL_SIGT_BY_WQE;
 
 	/* UD_AV CIRCUMVENTION */
 	max_send_sge = init_attr->cap.max_send_sge;
@@ -618,6 +616,10 @@ static struct ehca_qp *internal_create_qp(
 	parms.squeue.max_sge = max_send_sge;
 	parms.rqueue.max_sge = max_recv_sge;
 
+	/* RC QPs need one more SWQE for unsolicited ack circumvention */
+	if (qp_type == IB_QPT_RC)
+		parms.squeue.max_wr++;
+
 	if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) {
 		if (HAS_SQ(my_qp))
 			ehca_determine_small_queue(
@@ -650,6 +652,8 @@ static struct ehca_qp *internal_create_qp(
 			parms.squeue.act_nr_sges = 1;
 			parms.rqueue.act_nr_sges = 1;
 		}
+		/* hide the extra WQE */
+		parms.squeue.act_nr_wqes--;
 		break;
 	case IB_QPT_UD:
 	case IB_QPT_GSI:
@@ -1294,6 +1298,8 @@ static int internal_modify_qp(struct ib_qp *ibqp,
 	}
 
 	if (attr_mask & IB_QP_PATH_MTU) {
+		/* store ld(MTU) */
+		my_qp->mtu_shift = attr->path_mtu + 7;
 		mqpcb->path_mtu = attr->path_mtu;
 		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1);
 	}
diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c
index ea91360835d3..3aacc8cf1e44 100644
--- a/drivers/infiniband/hw/ehca/ehca_reqs.c
+++ b/drivers/infiniband/hw/ehca/ehca_reqs.c
@@ -50,6 +50,9 @@
 #include "hcp_if.h"
 #include "hipz_fns.h"
 
+/* in RC traffic, insert an empty RDMA READ every this many packets */
+#define ACK_CIRC_THRESHOLD 2000000
+
 static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
 				  struct ehca_wqe *wqe_p,
 				  struct ib_recv_wr *recv_wr)
@@ -81,7 +84,7 @@ static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
 	if (ehca_debug_level) {
 		ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p",
 			     ipz_rqueue);
-		ehca_dmp( wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
+		ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
 	}
 
 	return 0;
@@ -135,7 +138,8 @@ static void trace_send_wr_ud(const struct ib_send_wr *send_wr)
 
 static inline int ehca_write_swqe(struct ehca_qp *qp,
 				  struct ehca_wqe *wqe_p,
-				  const struct ib_send_wr *send_wr)
+				  const struct ib_send_wr *send_wr,
+				  int hidden)
 {
 	u32 idx;
 	u64 dma_length;
@@ -176,7 +180,9 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
 
 	wqe_p->wr_flag = 0;
 
-	if (send_wr->send_flags & IB_SEND_SIGNALED)
+	if ((send_wr->send_flags & IB_SEND_SIGNALED ||
+	    qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
+	    && !hidden)
 		wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;
 
 	if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
@@ -199,7 +205,7 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
 
 		wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8;
 		wqe_p->local_ee_context_qkey = remote_qkey;
-		if (!send_wr->wr.ud.ah) {
+		if (unlikely(!send_wr->wr.ud.ah)) {
 			ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp);
 			return -EINVAL;
 		}
@@ -255,6 +261,15 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
 		} /* eof idx */
 		wqe_p->u.nud.atomic_1st_op_dma_len = dma_length;
 
+		/* unsolicited ack circumvention */
+		if (send_wr->opcode == IB_WR_RDMA_READ) {
+			/* on RDMA read, switch on and reset counters */
+			qp->message_count = qp->packet_count = 0;
+			qp->unsol_ack_circ = 1;
+		} else
+			/* else estimate #packets */
+			qp->packet_count += (dma_length >> qp->mtu_shift) + 1;
+
 		break;
 
 	default:
@@ -355,13 +370,49 @@ static inline void map_ib_wc_status(u32 cqe_status,
 	*wc_status = IB_WC_SUCCESS;
 }
 
+static inline int post_one_send(struct ehca_qp *my_qp,
+				struct ib_send_wr *cur_send_wr,
+				struct ib_send_wr **bad_send_wr,
+				int hidden)
+{
+	struct ehca_wqe *wqe_p;
+	int ret;
+	u64 start_offset = my_qp->ipz_squeue.current_q_offset;
+
+	/* get pointer next to free WQE */
+	wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
+	if (unlikely(!wqe_p)) {
+		/* too many posted work requests: queue overflow */
+		if (bad_send_wr)
+			*bad_send_wr = cur_send_wr;
+		ehca_err(my_qp->ib_qp.device, "Too many posted WQEs "
+			 "qp_num=%x", my_qp->ib_qp.qp_num);
+		return -ENOMEM;
+	}
+	/* write a SEND WQE into the QUEUE */
+	ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, hidden);
+	/*
+	 * if something failed,
+	 * reset the free entry pointer to the start value
+	 */
+	if (unlikely(ret)) {
+		my_qp->ipz_squeue.current_q_offset = start_offset;
+		if (bad_send_wr)
+			*bad_send_wr = cur_send_wr;
+		ehca_err(my_qp->ib_qp.device, "Could not write WQE "
+			 "qp_num=%x", my_qp->ib_qp.qp_num);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 int ehca_post_send(struct ib_qp *qp,
 		   struct ib_send_wr *send_wr,
 		   struct ib_send_wr **bad_send_wr)
 {
 	struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
 	struct ib_send_wr *cur_send_wr;
-	struct ehca_wqe *wqe_p;
 	int wqe_cnt = 0;
 	int ret = 0;
 	unsigned long flags;
@@ -369,37 +420,33 @@ int ehca_post_send(struct ib_qp *qp,
 	/* LOCK the QUEUE */
 	spin_lock_irqsave(&my_qp->spinlock_s, flags);
 
+	/* Send an empty extra RDMA read if:
+	 *  1) there has been an RDMA read on this connection before
+	 *  2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets
+	 *  3) we can be sure that any previous extra RDMA read has been
+	 *     processed so we don't overflow the SQ
+	 */
+	if (unlikely(my_qp->unsol_ack_circ &&
+		     my_qp->packet_count > ACK_CIRC_THRESHOLD &&
+		     my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) {
+		/* insert an empty RDMA READ to fix up the remote QP state */
+		struct ib_send_wr circ_wr;
+		memset(&circ_wr, 0, sizeof(circ_wr));
+		circ_wr.opcode = IB_WR_RDMA_READ;
+		post_one_send(my_qp, &circ_wr, NULL, 1); /* ignore retcode */
+		wqe_cnt++;
+		ehca_dbg(qp->device, "posted circ wr qp_num=%x", qp->qp_num);
+		my_qp->message_count = my_qp->packet_count = 0;
+	}
+
 	/* loop processes list of send reqs */
 	for (cur_send_wr = send_wr; cur_send_wr != NULL;
 	     cur_send_wr = cur_send_wr->next) {
-		u64 start_offset = my_qp->ipz_squeue.current_q_offset;
-		/* get pointer next to free WQE */
-		wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
-		if (unlikely(!wqe_p)) {
-			/* too many posted work requests: queue overflow */
-			if (bad_send_wr)
-				*bad_send_wr = cur_send_wr;
-			if (wqe_cnt == 0) {
-				ret = -ENOMEM;
-				ehca_err(qp->device, "Too many posted WQEs "
-					 "qp_num=%x", qp->qp_num);
-			}
-			goto post_send_exit0;
-		}
-		/* write a SEND WQE into the QUEUE */
-		ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr);
-		/*
-		 * if something failed,
-		 * reset the free entry pointer to the start value
-		 */
+		ret = post_one_send(my_qp, cur_send_wr, bad_send_wr, 0);
 		if (unlikely(ret)) {
-			my_qp->ipz_squeue.current_q_offset = start_offset;
-			*bad_send_wr = cur_send_wr;
-			if (wqe_cnt == 0) {
-				ret = -EINVAL;
-				ehca_err(qp->device, "Could not write WQE "
-					 "qp_num=%x", qp->qp_num);
-			}
+			/* if one or more WQEs were successful, don't fail */
+			if (wqe_cnt)
+				ret = 0;
 			goto post_send_exit0;
 		}
 		wqe_cnt++;
@@ -410,6 +457,7 @@ int ehca_post_send(struct ib_qp *qp,
 post_send_exit0:
 	iosync(); /* serialize GAL register access */
 	hipz_update_sqa(my_qp, wqe_cnt);
+	my_qp->message_count += wqe_cnt;
 	spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
 	return ret;
 }
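Side note on the new packet estimate, as a sketch under the assumption of the standard IB path_mtu encoding (IB_MTU_256 = 1 through IB_MTU_4096 = 5): mtu_shift = path_mtu + 7 is ld(MTU in bytes), so each WQE contributes roughly dma_length / MTU link packets, rounded up by one. The helper below is hypothetical and only illustrates the arithmetic used in ehca_write_swqe().

/* Hypothetical helper, not driver code: estimate link packets per WQE. */
#include <stdint.h>

static uint32_t estimate_packets(uint64_t dma_length, int path_mtu)
{
	/* path_mtu: 1 = 256 B ... 5 = 4096 B, so path_mtu + 7 == ld(MTU) */
	int mtu_shift = path_mtu + 7;

	return (uint32_t)(dma_length >> mtu_shift) + 1;
}

/*
 * Example: a 1 MiB transfer at 2048-byte MTU (path_mtu == 4) gives
 * (1048576 >> 11) + 1 = 513 estimated packets.
 */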