author     Bryan O'Sullivan <bos@pathscale.com>   2006-07-01 07:36:10 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>  2006-07-01 12:56:01 -0400
commit     12eef41f8b72b6e11e36b48c78849c17e49781c8
tree       3a8bd77d77c7542e19d200d3abf25fc07f0d8f51
parent     fba75200ad92892bf32d8d6f1443c6f1e4f48676
[PATCH] IB/ipath: RC receive interrupt performance changes
This patch separates QP state used for sending and receiving RC packets so the
processing in the receive interrupt handler can be done mostly without locks
being held.  ACK packets are now sent without requiring synchronization with
the send tasklet.

Signed-off-by: Ralph Campbell <ralph.campbell@qlogic.com>
Signed-off-by: Bryan O'Sullivan <bryan.osullivan@qlogic.com>
Cc: "Michael S. Tsirkin" <mst@mellanox.co.il>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_keys.c  |  50
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_qp.c    |  43
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_rc.c    | 383
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_ruc.c   |  37
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_uc.c    |   9
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_verbs.h |  12
6 files changed, 265 insertions(+), 269 deletions(-)
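
To make the locking change easier to follow, here is a small, self-contained
userspace sketch of the ACK-state split the commit message describes.  The
names (sketch_qp, rc_queue_ack, try_send_ack_now, resend_scheduled) are
hypothetical stand-ins patterned on the send_rc_ack() hunk in ipath_rc.c
below; the real driver uses qp->s_lock with interrupts disabled and
tasklet_hi_schedule(&qp->s_task) rather than a pthread mutex and a flag, so
treat this as an illustration of the pattern, not the driver's actual code.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Subset of InfiniBand RC opcode values used by the comparison below. */
enum rc_op {
	OP_RDMA_READ_REQUEST = 0x0c,
	OP_ACKNOWLEDGE       = 0x11,
	OP_COMPARE_SWAP      = 0x13,
};

struct sketch_qp {
	pthread_mutex_t s_lock;    /* protects the send-side (s_*) fields */
	uint8_t  s_ack_state;      /* opcode of an ACK the send path still owes */
	uint8_t  s_nak_state;
	uint32_t s_ack_psn;
	uint8_t  r_ack_state;      /* receive-side ACK state, written lock-free */
	uint8_t  r_nak_state;
	uint32_t r_ack_psn;
	bool     resend_scheduled; /* stand-in for tasklet_hi_schedule(&qp->s_task) */
};

/* Always fail here so the example exercises the coalescing fallback. */
static bool try_send_ack_now(struct sketch_qp *qp)
{
	(void)qp;
	return false;
}

/* Called from the receive path with no locks held. */
static void rc_queue_ack(struct sketch_qp *qp, uint8_t opcode, uint32_t psn)
{
	/* Record what to ACK using receive-side state only. */
	qp->r_ack_state = opcode;
	qp->r_nak_state = 0;
	qp->r_ack_psn = psn;

	if (try_send_ack_now(qp)) {
		qp->r_ack_state = OP_ACKNOWLEDGE;  /* ACK went out, nothing pending */
		return;
	}

	/* Out of send buffers: hand the ACK to the send path, briefly locked. */
	pthread_mutex_lock(&qp->s_lock);
	/* Don't clobber a pending RDMA read or atomic response. */
	if (qp->s_ack_state == OP_ACKNOWLEDGE ||
	    qp->s_ack_state < OP_RDMA_READ_REQUEST) {
		qp->s_ack_state = qp->r_ack_state;
		qp->s_nak_state = qp->r_nak_state;
		qp->s_ack_psn   = qp->r_ack_psn;
		qp->r_ack_state = OP_ACKNOWLEDGE;
	}
	pthread_mutex_unlock(&qp->s_lock);
	qp->resend_scheduled = true;
}

int main(void)
{
	struct sketch_qp qp = {
		.s_lock = PTHREAD_MUTEX_INITIALIZER,
		.s_ack_state = OP_ACKNOWLEDGE,
		.r_ack_state = OP_ACKNOWLEDGE,
	};

	rc_queue_ack(&qp, OP_COMPARE_SWAP, 42);
	printf("coalesced: s_ack_state=0x%x s_ack_psn=%u resend=%d\n",
	       (unsigned)qp.s_ack_state, (unsigned)qp.s_ack_psn,
	       (int)qp.resend_scheduled);
	return 0;
}

The point of the split is that the common case (an ACK that can be sent
immediately) touches only r_* fields and never contends with the send tasklet;
s_lock is taken only on the slow path, when the ACK has to be coalesced into
the send-side state and handed off.
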
diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c
index e9d5d7e983bb..46773c673a1a 100644
--- a/drivers/infiniband/hw/ipath/ipath_keys.c
+++ b/drivers/infiniband/hw/ipath/ipath_keys.c
@@ -121,6 +121,7 @@ int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge,
121 struct ib_sge *sge, int acc) 121 struct ib_sge *sge, int acc)
122{ 122{
123 struct ipath_mregion *mr; 123 struct ipath_mregion *mr;
124 unsigned n, m;
124 size_t off; 125 size_t off;
125 int ret; 126 int ret;
126 127
@@ -152,20 +153,22 @@ int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge,
152 } 153 }
153 154
154 off += mr->offset; 155 off += mr->offset;
155 isge->mr = mr; 156 m = 0;
156 isge->m = 0; 157 n = 0;
157 isge->n = 0; 158 while (off >= mr->map[m]->segs[n].length) {
158 while (off >= mr->map[isge->m]->segs[isge->n].length) { 159 off -= mr->map[m]->segs[n].length;
159 off -= mr->map[isge->m]->segs[isge->n].length; 160 n++;
160 isge->n++; 161 if (n >= IPATH_SEGSZ) {
161 if (isge->n >= IPATH_SEGSZ) { 162 m++;
162 isge->m++; 163 n = 0;
163 isge->n = 0;
164 } 164 }
165 } 165 }
166 isge->vaddr = mr->map[isge->m]->segs[isge->n].vaddr + off; 166 isge->mr = mr;
167 isge->length = mr->map[isge->m]->segs[isge->n].length - off; 167 isge->vaddr = mr->map[m]->segs[n].vaddr + off;
168 isge->length = mr->map[m]->segs[n].length - off;
168 isge->sge_length = sge->length; 169 isge->sge_length = sge->length;
170 isge->m = m;
171 isge->n = n;
169 172
170 ret = 1; 173 ret = 1;
171 174
@@ -190,6 +193,7 @@ int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss,
190 struct ipath_lkey_table *rkt = &dev->lk_table; 193 struct ipath_lkey_table *rkt = &dev->lk_table;
191 struct ipath_sge *sge = &ss->sge; 194 struct ipath_sge *sge = &ss->sge;
192 struct ipath_mregion *mr; 195 struct ipath_mregion *mr;
196 unsigned n, m;
193 size_t off; 197 size_t off;
194 int ret; 198 int ret;
195 199
@@ -207,20 +211,22 @@ int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss,
207 } 211 }
208 212
209 off += mr->offset; 213 off += mr->offset;
210 sge->mr = mr; 214 m = 0;
211 sge->m = 0; 215 n = 0;
212 sge->n = 0; 216 while (off >= mr->map[m]->segs[n].length) {
213 while (off >= mr->map[sge->m]->segs[sge->n].length) { 217 off -= mr->map[m]->segs[n].length;
214 off -= mr->map[sge->m]->segs[sge->n].length; 218 n++;
215 sge->n++; 219 if (n >= IPATH_SEGSZ) {
216 if (sge->n >= IPATH_SEGSZ) { 220 m++;
217 sge->m++; 221 n = 0;
218 sge->n = 0;
219 } 222 }
220 } 223 }
221 sge->vaddr = mr->map[sge->m]->segs[sge->n].vaddr + off; 224 sge->mr = mr;
222 sge->length = mr->map[sge->m]->segs[sge->n].length - off; 225 sge->vaddr = mr->map[m]->segs[n].vaddr + off;
226 sge->length = mr->map[m]->segs[n].length - off;
223 sge->sge_length = len; 227 sge->sge_length = len;
228 sge->m = m;
229 sge->n = n;
224 ss->sg_list = NULL; 230 ss->sg_list = NULL;
225 ss->num_sge = 1; 231 ss->num_sge = 1;
226 232
diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c
index 83b9a6a5e2c6..68f01513214f 100644
--- a/drivers/infiniband/hw/ipath/ipath_qp.c
+++ b/drivers/infiniband/hw/ipath/ipath_qp.c
@@ -333,10 +333,11 @@ static void ipath_reset_qp(struct ipath_qp *qp)
333 qp->remote_qpn = 0; 333 qp->remote_qpn = 0;
334 qp->qkey = 0; 334 qp->qkey = 0;
335 qp->qp_access_flags = 0; 335 qp->qp_access_flags = 0;
336 clear_bit(IPATH_S_BUSY, &qp->s_flags);
336 qp->s_hdrwords = 0; 337 qp->s_hdrwords = 0;
337 qp->s_psn = 0; 338 qp->s_psn = 0;
338 qp->r_psn = 0; 339 qp->r_psn = 0;
339 atomic_set(&qp->msn, 0); 340 qp->r_msn = 0;
340 if (qp->ibqp.qp_type == IB_QPT_RC) { 341 if (qp->ibqp.qp_type == IB_QPT_RC) {
341 qp->s_state = IB_OPCODE_RC_SEND_LAST; 342 qp->s_state = IB_OPCODE_RC_SEND_LAST;
342 qp->r_state = IB_OPCODE_RC_SEND_LAST; 343 qp->r_state = IB_OPCODE_RC_SEND_LAST;
@@ -345,7 +346,8 @@ static void ipath_reset_qp(struct ipath_qp *qp)
345 qp->r_state = IB_OPCODE_UC_SEND_LAST; 346 qp->r_state = IB_OPCODE_UC_SEND_LAST;
346 } 347 }
347 qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; 348 qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
348 qp->s_nak_state = 0; 349 qp->r_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
350 qp->r_nak_state = 0;
349 qp->s_rnr_timeout = 0; 351 qp->s_rnr_timeout = 0;
350 qp->s_head = 0; 352 qp->s_head = 0;
351 qp->s_tail = 0; 353 qp->s_tail = 0;
@@ -363,10 +365,10 @@ static void ipath_reset_qp(struct ipath_qp *qp)
363 * @qp: the QP to put into an error state 365 * @qp: the QP to put into an error state
364 * 366 *
365 * Flushes both send and receive work queues. 367 * Flushes both send and receive work queues.
366 * QP r_rq.lock and s_lock should be held. 368 * QP s_lock should be held and interrupts disabled.
367 */ 369 */
368 370
369static void ipath_error_qp(struct ipath_qp *qp) 371void ipath_error_qp(struct ipath_qp *qp)
370{ 372{
371 struct ipath_ibdev *dev = to_idev(qp->ibqp.device); 373 struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
372 struct ib_wc wc; 374 struct ib_wc wc;
@@ -409,12 +411,14 @@ static void ipath_error_qp(struct ipath_qp *qp)
409 qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; 411 qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
410 412
411 wc.opcode = IB_WC_RECV; 413 wc.opcode = IB_WC_RECV;
414 spin_lock(&qp->r_rq.lock);
412 while (qp->r_rq.tail != qp->r_rq.head) { 415 while (qp->r_rq.tail != qp->r_rq.head) {
413 wc.wr_id = get_rwqe_ptr(&qp->r_rq, qp->r_rq.tail)->wr_id; 416 wc.wr_id = get_rwqe_ptr(&qp->r_rq, qp->r_rq.tail)->wr_id;
414 if (++qp->r_rq.tail >= qp->r_rq.size) 417 if (++qp->r_rq.tail >= qp->r_rq.size)
415 qp->r_rq.tail = 0; 418 qp->r_rq.tail = 0;
416 ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); 419 ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
417 } 420 }
421 spin_unlock(&qp->r_rq.lock);
418} 422}
419 423
420/** 424/**
@@ -434,8 +438,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
434 unsigned long flags; 438 unsigned long flags;
435 int ret; 439 int ret;
436 440
437 spin_lock_irqsave(&qp->r_rq.lock, flags); 441 spin_lock_irqsave(&qp->s_lock, flags);
438 spin_lock(&qp->s_lock);
439 442
440 cur_state = attr_mask & IB_QP_CUR_STATE ? 443 cur_state = attr_mask & IB_QP_CUR_STATE ?
441 attr->cur_qp_state : qp->state; 444 attr->cur_qp_state : qp->state;
@@ -506,31 +509,19 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
506 } 509 }
507 510
508 if (attr_mask & IB_QP_MIN_RNR_TIMER) 511 if (attr_mask & IB_QP_MIN_RNR_TIMER)
509 qp->s_min_rnr_timer = attr->min_rnr_timer; 512 qp->r_min_rnr_timer = attr->min_rnr_timer;
510 513
511 if (attr_mask & IB_QP_QKEY) 514 if (attr_mask & IB_QP_QKEY)
512 qp->qkey = attr->qkey; 515 qp->qkey = attr->qkey;
513 516
514 qp->state = new_state; 517 qp->state = new_state;
515 spin_unlock(&qp->s_lock); 518 spin_unlock_irqrestore(&qp->s_lock, flags);
516 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
517
518 /*
519 * If QP1 changed to the RTS state, try to move to the link to INIT
520 * even if it was ACTIVE so the SM will reinitialize the SMA's
521 * state.
522 */
523 if (qp->ibqp.qp_num == 1 && new_state == IB_QPS_RTS) {
524 struct ipath_ibdev *dev = to_idev(ibqp->device);
525 519
526 ipath_layer_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
527 }
528 ret = 0; 520 ret = 0;
529 goto bail; 521 goto bail;
530 522
531inval: 523inval:
532 spin_unlock(&qp->s_lock); 524 spin_unlock_irqrestore(&qp->s_lock, flags);
533 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
534 ret = -EINVAL; 525 ret = -EINVAL;
535 526
536bail: 527bail:
@@ -564,7 +555,7 @@ int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
564 attr->sq_draining = 0; 555 attr->sq_draining = 0;
565 attr->max_rd_atomic = 1; 556 attr->max_rd_atomic = 1;
566 attr->max_dest_rd_atomic = 1; 557 attr->max_dest_rd_atomic = 1;
567 attr->min_rnr_timer = qp->s_min_rnr_timer; 558 attr->min_rnr_timer = qp->r_min_rnr_timer;
568 attr->port_num = 1; 559 attr->port_num = 1;
569 attr->timeout = 0; 560 attr->timeout = 0;
570 attr->retry_cnt = qp->s_retry_cnt; 561 attr->retry_cnt = qp->s_retry_cnt;
@@ -591,16 +582,12 @@ int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
591 * @qp: the queue pair to compute the AETH for 582 * @qp: the queue pair to compute the AETH for
592 * 583 *
593 * Returns the AETH. 584 * Returns the AETH.
594 *
595 * The QP s_lock should be held.
596 */ 585 */
597__be32 ipath_compute_aeth(struct ipath_qp *qp) 586__be32 ipath_compute_aeth(struct ipath_qp *qp)
598{ 587{
599 u32 aeth = atomic_read(&qp->msn) & IPS_MSN_MASK; 588 u32 aeth = qp->r_msn & IPS_MSN_MASK;
600 589
601 if (qp->s_nak_state) { 590 if (qp->ibqp.srq) {
602 aeth |= qp->s_nak_state << IPS_AETH_CREDIT_SHIFT;
603 } else if (qp->ibqp.srq) {
604 /* 591 /*
605 * Shared receive queues don't generate credits. 592 * Shared receive queues don't generate credits.
606 * Set the credit field to the invalid value. 593 * Set the credit field to the invalid value.
diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c
index bd2c405c4bf0..8568dd0538cf 100644
--- a/drivers/infiniband/hw/ipath/ipath_rc.c
+++ b/drivers/infiniband/hw/ipath/ipath_rc.c
@@ -42,7 +42,7 @@
42 * @qp: the QP who's SGE we're restarting 42 * @qp: the QP who's SGE we're restarting
43 * @wqe: the work queue to initialize the QP's SGE from 43 * @wqe: the work queue to initialize the QP's SGE from
44 * 44 *
45 * The QP s_lock should be held. 45 * The QP s_lock should be held and interrupts disabled.
46 */ 46 */
47static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe) 47static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
48{ 48{
@@ -77,7 +77,6 @@ u32 ipath_make_rc_ack(struct ipath_qp *qp,
77 struct ipath_other_headers *ohdr, 77 struct ipath_other_headers *ohdr,
78 u32 pmtu) 78 u32 pmtu)
79{ 79{
80 struct ipath_sge_state *ss;
81 u32 hwords; 80 u32 hwords;
82 u32 len; 81 u32 len;
83 u32 bth0; 82 u32 bth0;
@@ -91,7 +90,7 @@ u32 ipath_make_rc_ack(struct ipath_qp *qp,
91 */ 90 */
92 switch (qp->s_ack_state) { 91 switch (qp->s_ack_state) {
93 case OP(RDMA_READ_REQUEST): 92 case OP(RDMA_READ_REQUEST):
94 ss = &qp->s_rdma_sge; 93 qp->s_cur_sge = &qp->s_rdma_sge;
95 len = qp->s_rdma_len; 94 len = qp->s_rdma_len;
96 if (len > pmtu) { 95 if (len > pmtu) {
97 len = pmtu; 96 len = pmtu;
@@ -108,7 +107,7 @@ u32 ipath_make_rc_ack(struct ipath_qp *qp,
108 qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); 107 qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
109 /* FALLTHROUGH */ 108 /* FALLTHROUGH */
110 case OP(RDMA_READ_RESPONSE_MIDDLE): 109 case OP(RDMA_READ_RESPONSE_MIDDLE):
111 ss = &qp->s_rdma_sge; 110 qp->s_cur_sge = &qp->s_rdma_sge;
112 len = qp->s_rdma_len; 111 len = qp->s_rdma_len;
113 if (len > pmtu) 112 if (len > pmtu)
114 len = pmtu; 113 len = pmtu;
@@ -127,41 +126,50 @@ u32 ipath_make_rc_ack(struct ipath_qp *qp,
127 * We have to prevent new requests from changing 126 * We have to prevent new requests from changing
128 * the r_sge state while a ipath_verbs_send() 127 * the r_sge state while a ipath_verbs_send()
129 * is in progress. 128 * is in progress.
130 * Changing r_state allows the receiver
131 * to continue processing new packets.
132 * We do it here now instead of above so
133 * that we are sure the packet was sent before
134 * changing the state.
135 */ 129 */
136 qp->r_state = OP(RDMA_READ_RESPONSE_LAST);
137 qp->s_ack_state = OP(ACKNOWLEDGE); 130 qp->s_ack_state = OP(ACKNOWLEDGE);
138 return 0; 131 bth0 = 0;
132 goto bail;
139 133
140 case OP(COMPARE_SWAP): 134 case OP(COMPARE_SWAP):
141 case OP(FETCH_ADD): 135 case OP(FETCH_ADD):
142 ss = NULL; 136 qp->s_cur_sge = NULL;
143 len = 0; 137 len = 0;
144 qp->r_state = OP(SEND_LAST); 138 /*
145 qp->s_ack_state = OP(ACKNOWLEDGE); 139 * Set the s_ack_state so the receive interrupt handler
146 bth0 = IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24; 140 * won't try to send an ACK (out of order) until this one
141 * is actually sent.
142 */
143 qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
144 bth0 = OP(ATOMIC_ACKNOWLEDGE) << 24;
147 ohdr->u.at.aeth = ipath_compute_aeth(qp); 145 ohdr->u.at.aeth = ipath_compute_aeth(qp);
148 ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic); 146 ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->r_atomic_data);
149 hwords += sizeof(ohdr->u.at) / 4; 147 hwords += sizeof(ohdr->u.at) / 4;
150 break; 148 break;
151 149
152 default: 150 default:
153 /* Send a regular ACK. */ 151 /* Send a regular ACK. */
154 ss = NULL; 152 qp->s_cur_sge = NULL;
155 len = 0; 153 len = 0;
156 qp->s_ack_state = OP(ACKNOWLEDGE); 154 /*
157 bth0 = qp->s_ack_state << 24; 155 * Set the s_ack_state so the receive interrupt handler
158 ohdr->u.aeth = ipath_compute_aeth(qp); 156 * won't try to send an ACK (out of order) until this one
157 * is actually sent.
158 */
159 qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
160 bth0 = OP(ACKNOWLEDGE) << 24;
161 if (qp->s_nak_state)
162 ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPS_MSN_MASK) |
163 (qp->s_nak_state <<
164 IPS_AETH_CREDIT_SHIFT));
165 else
166 ohdr->u.aeth = ipath_compute_aeth(qp);
159 hwords++; 167 hwords++;
160 } 168 }
161 qp->s_hdrwords = hwords; 169 qp->s_hdrwords = hwords;
162 qp->s_cur_sge = ss;
163 qp->s_cur_size = len; 170 qp->s_cur_size = len;
164 171
172bail:
165 return bth0; 173 return bth0;
166} 174}
167 175
@@ -174,7 +182,7 @@ u32 ipath_make_rc_ack(struct ipath_qp *qp,
174 * @bth2p: pointer to the BTH PSN word 182 * @bth2p: pointer to the BTH PSN word
175 * 183 *
176 * Return 1 if constructed; otherwise, return 0. 184 * Return 1 if constructed; otherwise, return 0.
177 * Note the QP s_lock must be held. 185 * Note the QP s_lock must be held and interrupts disabled.
178 */ 186 */
179int ipath_make_rc_req(struct ipath_qp *qp, 187int ipath_make_rc_req(struct ipath_qp *qp,
180 struct ipath_other_headers *ohdr, 188 struct ipath_other_headers *ohdr,
@@ -356,6 +364,11 @@ int ipath_make_rc_req(struct ipath_qp *qp,
356 bth2 |= qp->s_psn++ & IPS_PSN_MASK; 364 bth2 |= qp->s_psn++ & IPS_PSN_MASK;
357 if ((int)(qp->s_psn - qp->s_next_psn) > 0) 365 if ((int)(qp->s_psn - qp->s_next_psn) > 0)
358 qp->s_next_psn = qp->s_psn; 366 qp->s_next_psn = qp->s_psn;
367 /*
368 * Put the QP on the pending list so lost ACKs will cause
369 * a retry. More than one request can be pending so the
370 * QP may already be on the dev->pending list.
371 */
359 spin_lock(&dev->pending_lock); 372 spin_lock(&dev->pending_lock);
360 if (list_empty(&qp->timerwait)) 373 if (list_empty(&qp->timerwait))
361 list_add_tail(&qp->timerwait, 374 list_add_tail(&qp->timerwait,
@@ -365,8 +378,8 @@ int ipath_make_rc_req(struct ipath_qp *qp,
365 378
366 case OP(RDMA_READ_RESPONSE_FIRST): 379 case OP(RDMA_READ_RESPONSE_FIRST):
367 /* 380 /*
368 * This case can only happen if a send is restarted. See 381 * This case can only happen if a send is restarted.
369 * ipath_restart_rc(). 382 * See ipath_restart_rc().
370 */ 383 */
371 ipath_init_restart(qp, wqe); 384 ipath_init_restart(qp, wqe);
372 /* FALLTHROUGH */ 385 /* FALLTHROUGH */
@@ -526,11 +539,17 @@ static void send_rc_ack(struct ipath_qp *qp)
526 ohdr = &hdr.u.l.oth; 539 ohdr = &hdr.u.l.oth;
527 lrh0 = IPS_LRH_GRH; 540 lrh0 = IPS_LRH_GRH;
528 } 541 }
542 /* read pkey_index w/o lock (its atomic) */
529 bth0 = ipath_layer_get_pkey(dev->dd, qp->s_pkey_index); 543 bth0 = ipath_layer_get_pkey(dev->dd, qp->s_pkey_index);
530 ohdr->u.aeth = ipath_compute_aeth(qp); 544 if (qp->r_nak_state)
531 if (qp->s_ack_state >= OP(COMPARE_SWAP)) { 545 ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPS_MSN_MASK) |
532 bth0 |= IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24; 546 (qp->r_nak_state <<
533 ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic); 547 IPS_AETH_CREDIT_SHIFT));
548 else
549 ohdr->u.aeth = ipath_compute_aeth(qp);
550 if (qp->r_ack_state >= OP(COMPARE_SWAP)) {
551 bth0 |= OP(ATOMIC_ACKNOWLEDGE) << 24;
552 ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->r_atomic_data);
534 hwords += sizeof(ohdr->u.at.atomic_ack_eth) / 4; 553 hwords += sizeof(ohdr->u.at.atomic_ack_eth) / 4;
535 } else 554 } else
536 bth0 |= OP(ACKNOWLEDGE) << 24; 555 bth0 |= OP(ACKNOWLEDGE) << 24;
@@ -541,15 +560,36 @@ static void send_rc_ack(struct ipath_qp *qp)
541 hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd)); 560 hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd));
542 ohdr->bth[0] = cpu_to_be32(bth0); 561 ohdr->bth[0] = cpu_to_be32(bth0);
543 ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); 562 ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
544 ohdr->bth[2] = cpu_to_be32(qp->s_ack_psn & IPS_PSN_MASK); 563 ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPS_PSN_MASK);
545 564
546 /* 565 /*
547 * If we can send the ACK, clear the ACK state. 566 * If we can send the ACK, clear the ACK state.
548 */ 567 */
549 if (ipath_verbs_send(dev->dd, hwords, (u32 *) &hdr, 0, NULL) == 0) { 568 if (ipath_verbs_send(dev->dd, hwords, (u32 *) &hdr, 0, NULL) == 0) {
550 qp->s_ack_state = OP(ACKNOWLEDGE); 569 qp->r_ack_state = OP(ACKNOWLEDGE);
551 dev->n_rc_qacks++;
552 dev->n_unicast_xmit++; 570 dev->n_unicast_xmit++;
571 } else {
572 /*
573 * We are out of PIO buffers at the moment.
574 * Pass responsibility for sending the ACK to the
575 * send tasklet so that when a PIO buffer becomes
576 * available, the ACK is sent ahead of other outgoing
577 * packets.
578 */
579 dev->n_rc_qacks++;
580 spin_lock_irq(&qp->s_lock);
581 /* Don't coalesce if a RDMA read or atomic is pending. */
582 if (qp->s_ack_state == OP(ACKNOWLEDGE) ||
583 qp->s_ack_state < OP(RDMA_READ_REQUEST)) {
584 qp->s_ack_state = qp->r_ack_state;
585 qp->s_nak_state = qp->r_nak_state;
586 qp->s_ack_psn = qp->r_ack_psn;
587 qp->r_ack_state = OP(ACKNOWLEDGE);
588 }
589 spin_unlock_irq(&qp->s_lock);
590
591 /* Call ipath_do_rc_send() in another thread. */
592 tasklet_hi_schedule(&qp->s_task);
553 } 593 }
554} 594}
555 595
@@ -641,7 +681,7 @@ done:
641 * @psn: packet sequence number for the request 681 * @psn: packet sequence number for the request
642 * @wc: the work completion request 682 * @wc: the work completion request
643 * 683 *
644 * The QP s_lock should be held. 684 * The QP s_lock should be held and interrupts disabled.
645 */ 685 */
646void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc) 686void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
647{ 687{
@@ -705,7 +745,7 @@ bail:
705 * 745 *
706 * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK 746 * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
707 * for the given QP. 747 * for the given QP.
708 * Called at interrupt level with the QP s_lock held. 748 * Called at interrupt level with the QP s_lock held and interrupts disabled.
709 * Returns 1 if OK, 0 if current operation should be aborted (NAK). 749 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
710 */ 750 */
711static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) 751static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
@@ -1126,18 +1166,16 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
1126 * Don't queue the NAK if a RDMA read, atomic, or 1166 * Don't queue the NAK if a RDMA read, atomic, or
1127 * NAK is pending though. 1167 * NAK is pending though.
1128 */ 1168 */
1129 spin_lock(&qp->s_lock); 1169 if (qp->s_ack_state != OP(ACKNOWLEDGE) ||
1130 if ((qp->s_ack_state >= OP(RDMA_READ_REQUEST) && 1170 qp->r_nak_state != 0)
1131 qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) ||
1132 qp->s_nak_state != 0) {
1133 spin_unlock(&qp->s_lock);
1134 goto done; 1171 goto done;
1172 if (qp->r_ack_state < OP(COMPARE_SWAP)) {
1173 qp->r_ack_state = OP(SEND_ONLY);
1174 qp->r_nak_state = IB_NAK_PSN_ERROR;
1175 /* Use the expected PSN. */
1176 qp->r_ack_psn = qp->r_psn;
1135 } 1177 }
1136 qp->s_ack_state = OP(SEND_ONLY); 1178 goto send_ack;
1137 qp->s_nak_state = IB_NAK_PSN_ERROR;
1138 /* Use the expected PSN. */
1139 qp->s_ack_psn = qp->r_psn;
1140 goto resched;
1141 } 1179 }
1142 1180
1143 /* 1181 /*
@@ -1151,27 +1189,7 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
1151 * send the earliest so that RDMA reads can be restarted at 1189 * send the earliest so that RDMA reads can be restarted at
1152 * the requester's expected PSN. 1190 * the requester's expected PSN.
1153 */ 1191 */
1154 spin_lock(&qp->s_lock); 1192 if (opcode == OP(RDMA_READ_REQUEST)) {
1155 if (qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE &&
1156 ipath_cmp24(psn, qp->s_ack_psn) >= 0) {
1157 if (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST)
1158 qp->s_ack_psn = psn;
1159 spin_unlock(&qp->s_lock);
1160 goto done;
1161 }
1162 switch (opcode) {
1163 case OP(RDMA_READ_REQUEST):
1164 /*
1165 * We have to be careful to not change s_rdma_sge
1166 * while ipath_do_rc_send() is using it and not
1167 * holding the s_lock.
1168 */
1169 if (qp->s_ack_state != OP(ACKNOWLEDGE) &&
1170 qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) {
1171 spin_unlock(&qp->s_lock);
1172 dev->n_rdma_dup_busy++;
1173 goto done;
1174 }
1175 /* RETH comes after BTH */ 1193 /* RETH comes after BTH */
1176 if (!header_in_data) 1194 if (!header_in_data)
1177 reth = &ohdr->u.rc.reth; 1195 reth = &ohdr->u.rc.reth;
@@ -1179,6 +1197,22 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
1179 reth = (struct ib_reth *)data; 1197 reth = (struct ib_reth *)data;
1180 data += sizeof(*reth); 1198 data += sizeof(*reth);
1181 } 1199 }
1200 /*
1201 * If we receive a duplicate RDMA request, it means the
1202 * requester saw a sequence error and needs to restart
1203 * from an earlier point. We can abort the current
1204 * RDMA read send in that case.
1205 */
1206 spin_lock_irq(&qp->s_lock);
1207 if (qp->s_ack_state != OP(ACKNOWLEDGE) &&
1208 (qp->s_hdrwords || ipath_cmp24(psn, qp->s_ack_psn) >= 0)) {
1209 /*
1210 * We are already sending earlier requested data.
1211 * Don't abort it to send later out of sequence data.
1212 */
1213 spin_unlock_irq(&qp->s_lock);
1214 goto done;
1215 }
1182 qp->s_rdma_len = be32_to_cpu(reth->length); 1216 qp->s_rdma_len = be32_to_cpu(reth->length);
1183 if (qp->s_rdma_len != 0) { 1217 if (qp->s_rdma_len != 0) {
1184 u32 rkey = be32_to_cpu(reth->rkey); 1218 u32 rkey = be32_to_cpu(reth->rkey);
@@ -1192,8 +1226,10 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
1192 ok = ipath_rkey_ok(dev, &qp->s_rdma_sge, 1226 ok = ipath_rkey_ok(dev, &qp->s_rdma_sge,
1193 qp->s_rdma_len, vaddr, rkey, 1227 qp->s_rdma_len, vaddr, rkey,
1194 IB_ACCESS_REMOTE_READ); 1228 IB_ACCESS_REMOTE_READ);
1195 if (unlikely(!ok)) 1229 if (unlikely(!ok)) {
1230 spin_unlock_irq(&qp->s_lock);
1196 goto done; 1231 goto done;
1232 }
1197 } else { 1233 } else {
1198 qp->s_rdma_sge.sg_list = NULL; 1234 qp->s_rdma_sge.sg_list = NULL;
1199 qp->s_rdma_sge.num_sge = 0; 1235 qp->s_rdma_sge.num_sge = 0;
@@ -1202,25 +1238,44 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
1202 qp->s_rdma_sge.sge.length = 0; 1238 qp->s_rdma_sge.sge.length = 0;
1203 qp->s_rdma_sge.sge.sge_length = 0; 1239 qp->s_rdma_sge.sge.sge_length = 0;
1204 } 1240 }
1205 break; 1241 qp->s_ack_state = opcode;
1242 qp->s_ack_psn = psn;
1243 spin_unlock_irq(&qp->s_lock);
1244 tasklet_hi_schedule(&qp->s_task);
1245 goto send_ack;
1246 }
1247
1248 /*
1249 * A pending RDMA read will ACK anything before it so
1250 * ignore earlier duplicate requests.
1251 */
1252 if (qp->s_ack_state != OP(ACKNOWLEDGE))
1253 goto done;
1206 1254
1255 /*
1256 * If an ACK is pending, don't replace the pending ACK
1257 * with an earlier one since the later one will ACK the earlier.
1258 * Also, if we already have a pending atomic, send it.
1259 */
1260 if (qp->r_ack_state != OP(ACKNOWLEDGE) &&
1261 (ipath_cmp24(psn, qp->r_ack_psn) <= 0 ||
1262 qp->r_ack_state >= OP(COMPARE_SWAP)))
1263 goto send_ack;
1264 switch (opcode) {
1207 case OP(COMPARE_SWAP): 1265 case OP(COMPARE_SWAP):
1208 case OP(FETCH_ADD): 1266 case OP(FETCH_ADD):
1209 /* 1267 /*
1210 * Check for the PSN of the last atomic operation 1268 * Check for the PSN of the last atomic operation
1211 * performed and resend the result if found. 1269 * performed and resend the result if found.
1212 */ 1270 */
1213 if ((psn & IPS_PSN_MASK) != qp->r_atomic_psn) { 1271 if ((psn & IPS_PSN_MASK) != qp->r_atomic_psn)
1214 spin_unlock(&qp->s_lock);
1215 goto done; 1272 goto done;
1216 }
1217 qp->s_ack_atomic = qp->r_atomic_data;
1218 break; 1273 break;
1219 } 1274 }
1220 qp->s_ack_state = opcode; 1275 qp->r_ack_state = opcode;
1221 qp->s_nak_state = 0; 1276 qp->r_nak_state = 0;
1222 qp->s_ack_psn = psn; 1277 qp->r_ack_psn = psn;
1223resched: 1278send_ack:
1224 return 0; 1279 return 0;
1225 1280
1226done: 1281done:
@@ -1248,7 +1303,6 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1248 u32 hdrsize; 1303 u32 hdrsize;
1249 u32 psn; 1304 u32 psn;
1250 u32 pad; 1305 u32 pad;
1251 unsigned long flags;
1252 struct ib_wc wc; 1306 struct ib_wc wc;
1253 u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); 1307 u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
1254 int diff; 1308 int diff;
@@ -1289,18 +1343,16 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1289 opcode <= OP(ATOMIC_ACKNOWLEDGE)) { 1343 opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1290 ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn, 1344 ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
1291 hdrsize, pmtu, header_in_data); 1345 hdrsize, pmtu, header_in_data);
1292 goto bail; 1346 goto done;
1293 } 1347 }
1294 1348
1295 spin_lock_irqsave(&qp->r_rq.lock, flags);
1296
1297 /* Compute 24 bits worth of difference. */ 1349 /* Compute 24 bits worth of difference. */
1298 diff = ipath_cmp24(psn, qp->r_psn); 1350 diff = ipath_cmp24(psn, qp->r_psn);
1299 if (unlikely(diff)) { 1351 if (unlikely(diff)) {
1300 if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode, 1352 if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
1301 psn, diff, header_in_data)) 1353 psn, diff, header_in_data))
1302 goto done; 1354 goto done;
1303 goto resched; 1355 goto send_ack;
1304 } 1356 }
1305 1357
1306 /* Check for opcode sequence errors. */ 1358 /* Check for opcode sequence errors. */
@@ -1312,22 +1364,19 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1312 opcode == OP(SEND_LAST_WITH_IMMEDIATE)) 1364 opcode == OP(SEND_LAST_WITH_IMMEDIATE))
1313 break; 1365 break;
1314 nack_inv: 1366 nack_inv:
1315 /* 1367 /*
1316 * A NAK will ACK earlier sends and RDMA writes. Don't queue the 1368 * A NAK will ACK earlier sends and RDMA writes.
1317 * NAK if a RDMA read, atomic, or NAK is pending though. 1369 * Don't queue the NAK if a RDMA read, atomic, or NAK
1318 */ 1370 * is pending though.
1319 spin_lock(&qp->s_lock); 1371 */
1320 if (qp->s_ack_state >= OP(RDMA_READ_REQUEST) && 1372 if (qp->r_ack_state >= OP(COMPARE_SWAP))
1321 qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) { 1373 goto send_ack;
1322 spin_unlock(&qp->s_lock); 1374 /* XXX Flush WQEs */
1323 goto done; 1375 qp->state = IB_QPS_ERR;
1324 } 1376 qp->r_ack_state = OP(SEND_ONLY);
1325 /* XXX Flush WQEs */ 1377 qp->r_nak_state = IB_NAK_INVALID_REQUEST;
1326 qp->state = IB_QPS_ERR; 1378 qp->r_ack_psn = qp->r_psn;
1327 qp->s_ack_state = OP(SEND_ONLY); 1379 goto send_ack;
1328 qp->s_nak_state = IB_NAK_INVALID_REQUEST;
1329 qp->s_ack_psn = qp->r_psn;
1330 goto resched;
1331 1380
1332 case OP(RDMA_WRITE_FIRST): 1381 case OP(RDMA_WRITE_FIRST):
1333 case OP(RDMA_WRITE_MIDDLE): 1382 case OP(RDMA_WRITE_MIDDLE):
@@ -1337,20 +1386,6 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1337 break; 1386 break;
1338 goto nack_inv; 1387 goto nack_inv;
1339 1388
1340 case OP(RDMA_READ_REQUEST):
1341 case OP(COMPARE_SWAP):
1342 case OP(FETCH_ADD):
1343 /*
1344 * Drop all new requests until a response has been sent. A
1345 * new request then ACKs the RDMA response we sent. Relaxed
1346 * ordering would allow new requests to be processed but we
1347 * would need to keep a queue of rwqe's for all that are in
1348 * progress. Note that we can't RNR NAK this request since
1349 * the RDMA READ or atomic response is already queued to be
1350 * sent (unless we implement a response send queue).
1351 */
1352 goto done;
1353
1354 default: 1389 default:
1355 if (opcode == OP(SEND_MIDDLE) || 1390 if (opcode == OP(SEND_MIDDLE) ||
1356 opcode == OP(SEND_LAST) || 1391 opcode == OP(SEND_LAST) ||
@@ -1359,6 +1394,11 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1359 opcode == OP(RDMA_WRITE_LAST) || 1394 opcode == OP(RDMA_WRITE_LAST) ||
1360 opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) 1395 opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1361 goto nack_inv; 1396 goto nack_inv;
1397 /*
1398 * Note that it is up to the requester to not send a new
1399 * RDMA read or atomic operation before receiving an ACK
1400 * for the previous operation.
1401 */
1362 break; 1402 break;
1363 } 1403 }
1364 1404
@@ -1375,17 +1415,12 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1375 * Don't queue the NAK if a RDMA read or atomic 1415 * Don't queue the NAK if a RDMA read or atomic
1376 * is pending though. 1416 * is pending though.
1377 */ 1417 */
1378 spin_lock(&qp->s_lock); 1418 if (qp->r_ack_state >= OP(COMPARE_SWAP))
1379 if (qp->s_ack_state >= 1419 goto send_ack;
1380 OP(RDMA_READ_REQUEST) && 1420 qp->r_ack_state = OP(SEND_ONLY);
1381 qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) { 1421 qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
1382 spin_unlock(&qp->s_lock); 1422 qp->r_ack_psn = qp->r_psn;
1383 goto done; 1423 goto send_ack;
1384 }
1385 qp->s_ack_state = OP(SEND_ONLY);
1386 qp->s_nak_state = IB_RNR_NAK | qp->s_min_rnr_timer;
1387 qp->s_ack_psn = qp->r_psn;
1388 goto resched;
1389 } 1424 }
1390 qp->r_rcv_len = 0; 1425 qp->r_rcv_len = 0;
1391 /* FALLTHROUGH */ 1426 /* FALLTHROUGH */
@@ -1442,7 +1477,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1442 if (unlikely(wc.byte_len > qp->r_len)) 1477 if (unlikely(wc.byte_len > qp->r_len))
1443 goto nack_inv; 1478 goto nack_inv;
1444 ipath_copy_sge(&qp->r_sge, data, tlen); 1479 ipath_copy_sge(&qp->r_sge, data, tlen);
1445 atomic_inc(&qp->msn); 1480 qp->r_msn++;
1446 if (opcode == OP(RDMA_WRITE_LAST) || 1481 if (opcode == OP(RDMA_WRITE_LAST) ||
1447 opcode == OP(RDMA_WRITE_ONLY)) 1482 opcode == OP(RDMA_WRITE_ONLY))
1448 break; 1483 break;
@@ -1486,29 +1521,8 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1486 ok = ipath_rkey_ok(dev, &qp->r_sge, 1521 ok = ipath_rkey_ok(dev, &qp->r_sge,
1487 qp->r_len, vaddr, rkey, 1522 qp->r_len, vaddr, rkey,
1488 IB_ACCESS_REMOTE_WRITE); 1523 IB_ACCESS_REMOTE_WRITE);
1489 if (unlikely(!ok)) { 1524 if (unlikely(!ok))
1490 nack_acc: 1525 goto nack_acc;
1491 /*
1492 * A NAK will ACK earlier sends and RDMA
1493 * writes. Don't queue the NAK if a RDMA
1494 * read, atomic, or NAK is pending though.
1495 */
1496 spin_lock(&qp->s_lock);
1497 if (qp->s_ack_state >=
1498 OP(RDMA_READ_REQUEST) &&
1499 qp->s_ack_state !=
1500 IB_OPCODE_ACKNOWLEDGE) {
1501 spin_unlock(&qp->s_lock);
1502 goto done;
1503 }
1504 /* XXX Flush WQEs */
1505 qp->state = IB_QPS_ERR;
1506 qp->s_ack_state = OP(RDMA_WRITE_ONLY);
1507 qp->s_nak_state =
1508 IB_NAK_REMOTE_ACCESS_ERROR;
1509 qp->s_ack_psn = qp->r_psn;
1510 goto resched;
1511 }
1512 } else { 1526 } else {
1513 qp->r_sge.sg_list = NULL; 1527 qp->r_sge.sg_list = NULL;
1514 qp->r_sge.sge.mr = NULL; 1528 qp->r_sge.sge.mr = NULL;
@@ -1535,12 +1549,10 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1535 reth = (struct ib_reth *)data; 1549 reth = (struct ib_reth *)data;
1536 data += sizeof(*reth); 1550 data += sizeof(*reth);
1537 } 1551 }
1538 spin_lock(&qp->s_lock); 1552 if (unlikely(!(qp->qp_access_flags &
1539 if (qp->s_ack_state != OP(ACKNOWLEDGE) && 1553 IB_ACCESS_REMOTE_READ)))
1540 qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) { 1554 goto nack_acc;
1541 spin_unlock(&qp->s_lock); 1555 spin_lock_irq(&qp->s_lock);
1542 goto done;
1543 }
1544 qp->s_rdma_len = be32_to_cpu(reth->length); 1556 qp->s_rdma_len = be32_to_cpu(reth->length);
1545 if (qp->s_rdma_len != 0) { 1557 if (qp->s_rdma_len != 0) {
1546 u32 rkey = be32_to_cpu(reth->rkey); 1558 u32 rkey = be32_to_cpu(reth->rkey);
@@ -1552,7 +1564,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1552 qp->s_rdma_len, vaddr, rkey, 1564 qp->s_rdma_len, vaddr, rkey,
1553 IB_ACCESS_REMOTE_READ); 1565 IB_ACCESS_REMOTE_READ);
1554 if (unlikely(!ok)) { 1566 if (unlikely(!ok)) {
1555 spin_unlock(&qp->s_lock); 1567 spin_unlock_irq(&qp->s_lock);
1556 goto nack_acc; 1568 goto nack_acc;
1557 } 1569 }
1558 /* 1570 /*
@@ -1569,21 +1581,25 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1569 qp->s_rdma_sge.sge.length = 0; 1581 qp->s_rdma_sge.sge.length = 0;
1570 qp->s_rdma_sge.sge.sge_length = 0; 1582 qp->s_rdma_sge.sge.sge_length = 0;
1571 } 1583 }
1572 if (unlikely(!(qp->qp_access_flags &
1573 IB_ACCESS_REMOTE_READ)))
1574 goto nack_acc;
1575 /* 1584 /*
1576 * We need to increment the MSN here instead of when we 1585 * We need to increment the MSN here instead of when we
1577 * finish sending the result since a duplicate request would 1586 * finish sending the result since a duplicate request would
1578 * increment it more than once. 1587 * increment it more than once.
1579 */ 1588 */
1580 atomic_inc(&qp->msn); 1589 qp->r_msn++;
1590
1581 qp->s_ack_state = opcode; 1591 qp->s_ack_state = opcode;
1582 qp->s_nak_state = 0;
1583 qp->s_ack_psn = psn; 1592 qp->s_ack_psn = psn;
1593 spin_unlock_irq(&qp->s_lock);
1594
1584 qp->r_psn++; 1595 qp->r_psn++;
1585 qp->r_state = opcode; 1596 qp->r_state = opcode;
1586 goto rdmadone; 1597 qp->r_nak_state = 0;
1598
1599 /* Call ipath_do_rc_send() in another thread. */
1600 tasklet_hi_schedule(&qp->s_task);
1601
1602 goto done;
1587 1603
1588 case OP(COMPARE_SWAP): 1604 case OP(COMPARE_SWAP):
1589 case OP(FETCH_ADD): { 1605 case OP(FETCH_ADD): {
@@ -1612,7 +1628,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1612 goto nack_acc; 1628 goto nack_acc;
1613 /* Perform atomic OP and save result. */ 1629 /* Perform atomic OP and save result. */
1614 sdata = be64_to_cpu(ateth->swap_data); 1630 sdata = be64_to_cpu(ateth->swap_data);
1615 spin_lock(&dev->pending_lock); 1631 spin_lock_irq(&dev->pending_lock);
1616 qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr; 1632 qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr;
1617 if (opcode == OP(FETCH_ADD)) 1633 if (opcode == OP(FETCH_ADD))
1618 *(u64 *) qp->r_sge.sge.vaddr = 1634 *(u64 *) qp->r_sge.sge.vaddr =
@@ -1620,8 +1636,8 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1620 else if (qp->r_atomic_data == 1636 else if (qp->r_atomic_data ==
1621 be64_to_cpu(ateth->compare_data)) 1637 be64_to_cpu(ateth->compare_data))
1622 *(u64 *) qp->r_sge.sge.vaddr = sdata; 1638 *(u64 *) qp->r_sge.sge.vaddr = sdata;
1623 spin_unlock(&dev->pending_lock); 1639 spin_unlock_irq(&dev->pending_lock);
1624 atomic_inc(&qp->msn); 1640 qp->r_msn++;
1625 qp->r_atomic_psn = psn & IPS_PSN_MASK; 1641 qp->r_atomic_psn = psn & IPS_PSN_MASK;
1626 psn |= 1 << 31; 1642 psn |= 1 << 31;
1627 break; 1643 break;
@@ -1633,44 +1649,39 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1633 } 1649 }
1634 qp->r_psn++; 1650 qp->r_psn++;
1635 qp->r_state = opcode; 1651 qp->r_state = opcode;
1652 qp->r_nak_state = 0;
1636 /* Send an ACK if requested or required. */ 1653 /* Send an ACK if requested or required. */
1637 if (psn & (1 << 31)) { 1654 if (psn & (1 << 31)) {
1638 /* 1655 /*
1639 * Coalesce ACKs unless there is a RDMA READ or 1656 * Coalesce ACKs unless there is a RDMA READ or
1640 * ATOMIC pending. 1657 * ATOMIC pending.
1641 */ 1658 */
1642 spin_lock(&qp->s_lock); 1659 if (qp->r_ack_state < OP(COMPARE_SWAP)) {
1643 if (qp->s_ack_state == OP(ACKNOWLEDGE) || 1660 qp->r_ack_state = opcode;
1644 qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST) { 1661 qp->r_ack_psn = psn;
1645 qp->s_ack_state = opcode;
1646 qp->s_nak_state = 0;
1647 qp->s_ack_psn = psn;
1648 qp->s_ack_atomic = qp->r_atomic_data;
1649 goto resched;
1650 } 1662 }
1651 spin_unlock(&qp->s_lock); 1663 goto send_ack;
1652 } 1664 }
1653done: 1665 goto done;
1654 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1655 goto bail;
1656 1666
1657resched: 1667nack_acc:
1658 /* 1668 /*
1659 * Try to send ACK right away but not if ipath_do_rc_send() is 1669 * A NAK will ACK earlier sends and RDMA writes.
1660 * active. 1670 * Don't queue the NAK if a RDMA read, atomic, or NAK
1671 * is pending though.
1661 */ 1672 */
1662 if (qp->s_hdrwords == 0 && 1673 if (qp->r_ack_state < OP(COMPARE_SWAP)) {
1663 (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST || 1674 /* XXX Flush WQEs */
1664 qp->s_ack_state >= IB_OPCODE_COMPARE_SWAP)) 1675 qp->state = IB_QPS_ERR;
1676 qp->r_ack_state = OP(RDMA_WRITE_ONLY);
1677 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
1678 qp->r_ack_psn = qp->r_psn;
1679 }
1680send_ack:
1681 /* Send ACK right away unless the send tasklet has a pending ACK. */
1682 if (qp->s_ack_state == OP(ACKNOWLEDGE))
1665 send_rc_ack(qp); 1683 send_rc_ack(qp);
1666 1684
1667rdmadone: 1685done:
1668 spin_unlock(&qp->s_lock);
1669 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1670
1671 /* Call ipath_do_rc_send() in another thread. */
1672 tasklet_hi_schedule(&qp->s_task);
1673
1674bail:
1675 return; 1686 return;
1676} 1687}
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
index 9a456a7ce352..99c0652d49dc 100644
--- a/drivers/infiniband/hw/ipath/ipath_ruc.c
+++ b/drivers/infiniband/hw/ipath/ipath_ruc.c
@@ -113,20 +113,23 @@ void ipath_insert_rnr_queue(struct ipath_qp *qp)
113 * 113 *
114 * Return 0 if no RWQE is available, otherwise return 1. 114 * Return 0 if no RWQE is available, otherwise return 1.
115 * 115 *
116 * Called at interrupt level with the QP r_rq.lock held. 116 * Can be called from interrupt level.
117 */ 117 */
118int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) 118int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
119{ 119{
120 unsigned long flags;
120 struct ipath_rq *rq; 121 struct ipath_rq *rq;
121 struct ipath_srq *srq; 122 struct ipath_srq *srq;
122 struct ipath_rwqe *wqe; 123 struct ipath_rwqe *wqe;
123 int ret; 124 int ret = 1;
124 125
125 if (!qp->ibqp.srq) { 126 if (!qp->ibqp.srq) {
126 rq = &qp->r_rq; 127 rq = &qp->r_rq;
128 spin_lock_irqsave(&rq->lock, flags);
129
127 if (unlikely(rq->tail == rq->head)) { 130 if (unlikely(rq->tail == rq->head)) {
128 ret = 0; 131 ret = 0;
129 goto bail; 132 goto done;
130 } 133 }
131 wqe = get_rwqe_ptr(rq, rq->tail); 134 wqe = get_rwqe_ptr(rq, rq->tail);
132 qp->r_wr_id = wqe->wr_id; 135 qp->r_wr_id = wqe->wr_id;
@@ -138,17 +141,16 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
138 } 141 }
139 if (++rq->tail >= rq->size) 142 if (++rq->tail >= rq->size)
140 rq->tail = 0; 143 rq->tail = 0;
141 ret = 1; 144 goto done;
142 goto bail;
143 } 145 }
144 146
145 srq = to_isrq(qp->ibqp.srq); 147 srq = to_isrq(qp->ibqp.srq);
146 rq = &srq->rq; 148 rq = &srq->rq;
147 spin_lock(&rq->lock); 149 spin_lock_irqsave(&rq->lock, flags);
150
148 if (unlikely(rq->tail == rq->head)) { 151 if (unlikely(rq->tail == rq->head)) {
149 spin_unlock(&rq->lock);
150 ret = 0; 152 ret = 0;
151 goto bail; 153 goto done;
152 } 154 }
153 wqe = get_rwqe_ptr(rq, rq->tail); 155 wqe = get_rwqe_ptr(rq, rq->tail);
154 qp->r_wr_id = wqe->wr_id; 156 qp->r_wr_id = wqe->wr_id;
@@ -170,18 +172,18 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
170 n = rq->head - rq->tail; 172 n = rq->head - rq->tail;
171 if (n < srq->limit) { 173 if (n < srq->limit) {
172 srq->limit = 0; 174 srq->limit = 0;
173 spin_unlock(&rq->lock); 175 spin_unlock_irqrestore(&rq->lock, flags);
174 ev.device = qp->ibqp.device; 176 ev.device = qp->ibqp.device;
175 ev.element.srq = qp->ibqp.srq; 177 ev.element.srq = qp->ibqp.srq;
176 ev.event = IB_EVENT_SRQ_LIMIT_REACHED; 178 ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
177 srq->ibsrq.event_handler(&ev, 179 srq->ibsrq.event_handler(&ev,
178 srq->ibsrq.srq_context); 180 srq->ibsrq.srq_context);
179 } else 181 goto bail;
180 spin_unlock(&rq->lock); 182 }
181 } else 183 }
182 spin_unlock(&rq->lock);
183 ret = 1;
184 184
185done:
186 spin_unlock_irqrestore(&rq->lock, flags);
185bail: 187bail:
186 return ret; 188 return ret;
187} 189}
@@ -248,10 +250,8 @@ again:
248 wc.imm_data = wqe->wr.imm_data; 250 wc.imm_data = wqe->wr.imm_data;
249 /* FALLTHROUGH */ 251 /* FALLTHROUGH */
250 case IB_WR_SEND: 252 case IB_WR_SEND:
251 spin_lock_irqsave(&qp->r_rq.lock, flags);
252 if (!ipath_get_rwqe(qp, 0)) { 253 if (!ipath_get_rwqe(qp, 0)) {
253 rnr_nak: 254 rnr_nak:
254 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
255 /* Handle RNR NAK */ 255 /* Handle RNR NAK */
256 if (qp->ibqp.qp_type == IB_QPT_UC) 256 if (qp->ibqp.qp_type == IB_QPT_UC)
257 goto send_comp; 257 goto send_comp;
@@ -263,20 +263,17 @@ again:
263 sqp->s_rnr_retry--; 263 sqp->s_rnr_retry--;
264 dev->n_rnr_naks++; 264 dev->n_rnr_naks++;
265 sqp->s_rnr_timeout = 265 sqp->s_rnr_timeout =
266 ib_ipath_rnr_table[sqp->s_min_rnr_timer]; 266 ib_ipath_rnr_table[sqp->r_min_rnr_timer];
267 ipath_insert_rnr_queue(sqp); 267 ipath_insert_rnr_queue(sqp);
268 goto done; 268 goto done;
269 } 269 }
270 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
271 break; 270 break;
272 271
273 case IB_WR_RDMA_WRITE_WITH_IMM: 272 case IB_WR_RDMA_WRITE_WITH_IMM:
274 wc.wc_flags = IB_WC_WITH_IMM; 273 wc.wc_flags = IB_WC_WITH_IMM;
275 wc.imm_data = wqe->wr.imm_data; 274 wc.imm_data = wqe->wr.imm_data;
276 spin_lock_irqsave(&qp->r_rq.lock, flags);
277 if (!ipath_get_rwqe(qp, 1)) 275 if (!ipath_get_rwqe(qp, 1))
278 goto rnr_nak; 276 goto rnr_nak;
279 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
280 /* FALLTHROUGH */ 277 /* FALLTHROUGH */
281 case IB_WR_RDMA_WRITE: 278 case IB_WR_RDMA_WRITE:
282 if (wqe->length == 0) 279 if (wqe->length == 0)
diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c
index 89b3e1a5e3e3..10516842bb82 100644
--- a/drivers/infiniband/hw/ipath/ipath_uc.c
+++ b/drivers/infiniband/hw/ipath/ipath_uc.c
@@ -241,7 +241,6 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
241 u32 hdrsize; 241 u32 hdrsize;
242 u32 psn; 242 u32 psn;
243 u32 pad; 243 u32 pad;
244 unsigned long flags;
245 struct ib_wc wc; 244 struct ib_wc wc;
246 u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); 245 u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
247 struct ib_reth *reth; 246 struct ib_reth *reth;
@@ -279,8 +278,6 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
279 wc.imm_data = 0; 278 wc.imm_data = 0;
280 wc.wc_flags = 0; 279 wc.wc_flags = 0;
281 280
282 spin_lock_irqsave(&qp->r_rq.lock, flags);
283
284 /* Compare the PSN verses the expected PSN. */ 281 /* Compare the PSN verses the expected PSN. */
285 if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) { 282 if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) {
286 /* 283 /*
@@ -537,15 +534,11 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
537 534
538 default: 535 default:
539 /* Drop packet for unknown opcodes. */ 536 /* Drop packet for unknown opcodes. */
540 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
541 dev->n_pkt_drops++; 537 dev->n_pkt_drops++;
542 goto bail; 538 goto done;
543 } 539 }
544 qp->r_psn++; 540 qp->r_psn++;
545 qp->r_state = opcode; 541 qp->r_state = opcode;
546done: 542done:
547 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
548
549bail:
550 return; 543 return;
551} 544}
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h
index 1cb797086679..2df684727dc1 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.h
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.h
@@ -307,32 +307,34 @@ struct ipath_qp {
307 u32 s_next_psn; /* PSN for next request */ 307 u32 s_next_psn; /* PSN for next request */
308 u32 s_last_psn; /* last response PSN processed */ 308 u32 s_last_psn; /* last response PSN processed */
309 u32 s_psn; /* current packet sequence number */ 309 u32 s_psn; /* current packet sequence number */
310 u32 s_ack_psn; /* PSN for RDMA_READ */
310 u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ 311 u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */
311 u32 s_ack_psn; /* PSN for next ACK or RDMA_READ */ 312 u32 r_ack_psn; /* PSN for next ACK or atomic ACK */
312 u64 s_ack_atomic; /* data for atomic ACK */
313 u64 r_wr_id; /* ID for current receive WQE */ 313 u64 r_wr_id; /* ID for current receive WQE */
314 u64 r_atomic_data; /* data for last atomic op */ 314 u64 r_atomic_data; /* data for last atomic op */
315 u32 r_atomic_psn; /* PSN of last atomic op */ 315 u32 r_atomic_psn; /* PSN of last atomic op */
316 u32 r_len; /* total length of r_sge */ 316 u32 r_len; /* total length of r_sge */
317 u32 r_rcv_len; /* receive data len processed */ 317 u32 r_rcv_len; /* receive data len processed */
318 u32 r_psn; /* expected rcv packet sequence number */ 318 u32 r_psn; /* expected rcv packet sequence number */
319 u32 r_msn; /* message sequence number */
319 u8 state; /* QP state */ 320 u8 state; /* QP state */
320 u8 s_state; /* opcode of last packet sent */ 321 u8 s_state; /* opcode of last packet sent */
321 u8 s_ack_state; /* opcode of packet to ACK */ 322 u8 s_ack_state; /* opcode of packet to ACK */
322 u8 s_nak_state; /* non-zero if NAK is pending */ 323 u8 s_nak_state; /* non-zero if NAK is pending */
323 u8 r_state; /* opcode of last packet received */ 324 u8 r_state; /* opcode of last packet received */
325 u8 r_ack_state; /* opcode of packet to ACK */
326 u8 r_nak_state; /* non-zero if NAK is pending */
327 u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */
324 u8 r_reuse_sge; /* for UC receive errors */ 328 u8 r_reuse_sge; /* for UC receive errors */
325 u8 r_sge_inx; /* current index into sg_list */ 329 u8 r_sge_inx; /* current index into sg_list */
326 u8 s_max_sge; /* size of s_wq->sg_list */
327 u8 qp_access_flags; 330 u8 qp_access_flags;
331 u8 s_max_sge; /* size of s_wq->sg_list */
328 u8 s_retry_cnt; /* number of times to retry */ 332 u8 s_retry_cnt; /* number of times to retry */
329 u8 s_rnr_retry_cnt; 333 u8 s_rnr_retry_cnt;
330 u8 s_min_rnr_timer;
331 u8 s_retry; /* requester retry counter */ 334 u8 s_retry; /* requester retry counter */
332 u8 s_rnr_retry; /* requester RNR retry counter */ 335 u8 s_rnr_retry; /* requester RNR retry counter */
333 u8 s_pkey_index; /* PKEY index to use */ 336 u8 s_pkey_index; /* PKEY index to use */
334 enum ib_mtu path_mtu; 337 enum ib_mtu path_mtu;
335 atomic_t msn; /* message sequence number */
336 u32 remote_qpn; 338 u32 remote_qpn;
337 u32 qkey; /* QKEY for this QP (for UD or RD) */ 339 u32 qkey; /* QKEY for this QP (for UD or RD) */
338 u32 s_size; /* send work queue size */ 340 u32 s_size; /* send work queue size */