author     Steve Wise <swise@opengridcomputing.com>      2014-04-09 10:38:25 -0400
committer  Roland Dreier <roland@purestorage.com>        2014-04-11 14:36:07 -0400
commit     b33bd0cbfa102b8f87702338aa72742fe3c7f220 (patch)
tree       0a57d3b48b92f125c028b26b1b922c96f2c30c05 /drivers/infiniband/hw
parent     fa658a98a2d08352c514758b3394caf91360aa44 (diff)
RDMA/cxgb4: Endpoint timeout fixes
1) Timed-out endpoint processing can be starved. If there are continual
CPL messages flowing into the driver, the endpoint timeout processing
can be starved. This condition exposed the other bugs below.

Solution: In process_work(), call process_timedout_eps() after each CPL
is processed.

2) Connection events can be processed even though the endpoint is on
the timeout list. If the endpoint is scheduled for timeout processing,
then we must ignore MPA Start Requests and Replies.

Solution: Change stop_ep_timer() to return 1 if the ep has already been
queued for timeout processing. All the callers of stop_ep_timer() need
to check this and act accordingly. There are just a few cases where the
caller needs to do something different if stop_ep_timer() returns 1:

  1) in process_mpa_reply(), ignore the reply; process_timeout() will
     abort the connection.

  2) in process_mpa_request(), ignore the request; process_timeout()
     will abort the connection.

It is OK for callers of stop_ep_timer() to abort the connection, since
that will leave the state in ABORTING or DEAD, and process_timeout()
now ignores timeouts when the ep is in these states.

3) Double insertion on the timeout list. Since the endpoint timers are
used for connection setup and teardown, we need to guard against the
possibility that an endpoint is already on the timeout list. This is a
rare condition and only seen under heavy load and in the presence of
the above two bugs.

Solution: In ep_timeout(), don't queue the endpoint if it is already
on the queue.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
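
Note on fix 3): the guard in ep_timeout() treats a NULL ep->entry.next as
"not currently on the timeout list", which is why process_timedout_eps()
now clears entry.next/entry.prev after list_del(). The short userspace
sketch below only illustrates that pattern; it is not driver code, and the
list helpers and names in it are stand-ins rather than the kernel's
<linux/list.h> API.

/* Illustrative userspace sketch of the "NULL next pointer means not
 * queued" guard against double insertion; all names are hypothetical. */
#include <stdio.h>
#include <stddef.h>

struct list_node {
        struct list_node *next, *prev;
};

/* Insert n at the tail of the circular list headed by head. */
static void list_add_tail_node(struct list_node *n, struct list_node *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

/* Unlink n and NULL its pointers so a later "already queued?" test works. */
static void list_del_node(struct list_node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        n->next = NULL;
        n->prev = NULL;
}

int main(void)
{
        struct list_node timeout_list = { &timeout_list, &timeout_list };
        struct list_node ep_entry = { NULL, NULL };     /* starts off-list */
        int i;

        /* Two racing "timer fired" events: only the first may queue the entry. */
        for (i = 0; i < 2; i++) {
                if (!ep_entry.next) {
                        list_add_tail_node(&ep_entry, &timeout_list);
                        printf("event %d: queued\n", i);
                } else {
                        printf("event %d: already queued, skipped\n", i);
                }
        }

        /* Timeout processing removes the entry and re-arms the guard. */
        list_del_node(&ep_entry);
        printf("after dequeue, next is %s\n", ep_entry.next ? "set" : "NULL");
        return 0;
}

The kernel's real list_del() poisons the pointers rather than clearing
them, which is why the patch NULLs them explicitly before ep_timeout()
can inspect them again.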
Diffstat (limited to 'drivers/infiniband/hw')
-rw-r--r--  drivers/infiniband/hw/cxgb4/cm.c  89
1 file changed, 56 insertions, 33 deletions
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 02436d5d0dab..185452abf32c 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -173,12 +173,15 @@ static void start_ep_timer(struct c4iw_ep *ep)
         add_timer(&ep->timer);
 }
 
-static void stop_ep_timer(struct c4iw_ep *ep)
+static int stop_ep_timer(struct c4iw_ep *ep)
 {
         PDBG("%s ep %p stopping\n", __func__, ep);
         del_timer_sync(&ep->timer);
-        if (!test_and_set_bit(TIMEOUT, &ep->com.flags))
+        if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) {
                 c4iw_put_ep(&ep->com);
+                return 0;
+        }
+        return 1;
 }
 
 static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb,
@@ -1165,12 +1168,11 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
         PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
 
         /*
-         * Stop mpa timer.  If it expired, then the state has
-         * changed and we bail since ep_timeout already aborted
-         * the connection.
+         * Stop mpa timer.  If it expired, then
+         * we ignore the MPA reply.  process_timeout()
+         * will abort the connection.
          */
-        stop_ep_timer(ep);
-        if (ep->com.state != MPA_REQ_SENT)
+        if (stop_ep_timer(ep))
                 return;
 
         /*
@@ -1375,15 +1377,12 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 
         PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
 
-        if (ep->com.state != MPA_REQ_WAIT)
-                return;
-
         /*
          * If we get more than the supported amount of private data
          * then we must fail this connection.
          */
         if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
-                stop_ep_timer(ep);
+                (void)stop_ep_timer(ep);
                 abort_connection(ep, skb, GFP_KERNEL);
                 return;
         }
@@ -1413,13 +1412,13 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
         if (mpa->revision > mpa_rev) {
                 printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
                        " Received = %d\n", __func__, mpa_rev, mpa->revision);
-                stop_ep_timer(ep);
+                (void)stop_ep_timer(ep);
                 abort_connection(ep, skb, GFP_KERNEL);
                 return;
         }
 
         if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
-                stop_ep_timer(ep);
+                (void)stop_ep_timer(ep);
                 abort_connection(ep, skb, GFP_KERNEL);
                 return;
         }
@@ -1430,7 +1429,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
          * Fail if there's too much private data.
          */
         if (plen > MPA_MAX_PRIVATE_DATA) {
-                stop_ep_timer(ep);
+                (void)stop_ep_timer(ep);
                 abort_connection(ep, skb, GFP_KERNEL);
                 return;
         }
@@ -1439,7 +1438,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
          * If plen does not account for pkt size
          */
         if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
-                stop_ep_timer(ep);
+                (void)stop_ep_timer(ep);
                 abort_connection(ep, skb, GFP_KERNEL);
                 return;
         }
@@ -1496,18 +1495,24 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
              ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version,
              ep->mpa_attr.p2p_type);
 
-        __state_set(&ep->com, MPA_REQ_RCVD);
-        stop_ep_timer(ep);
-
-        /* drive upcall */
-        mutex_lock(&ep->parent_ep->com.mutex);
-        if (ep->parent_ep->com.state != DEAD) {
-                if (connect_request_upcall(ep))
+        /*
+         * If the endpoint timer already expired, then we ignore
+         * the start request. process_timeout() will abort
+         * the connection.
+         */
+        if (!stop_ep_timer(ep)) {
+                __state_set(&ep->com, MPA_REQ_RCVD);
+
+                /* drive upcall */
+                mutex_lock(&ep->parent_ep->com.mutex);
+                if (ep->parent_ep->com.state != DEAD) {
+                        if (connect_request_upcall(ep))
+                                abort_connection(ep, skb, GFP_KERNEL);
+                } else {
                         abort_connection(ep, skb, GFP_KERNEL);
-        } else {
-                abort_connection(ep, skb, GFP_KERNEL);
+                }
+                mutex_unlock(&ep->parent_ep->com.mutex);
         }
-        mutex_unlock(&ep->parent_ep->com.mutex);
         return;
 }
 
@@ -2265,7 +2270,7 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)
                 disconnect = 0;
                 break;
         case MORIBUND:
-                stop_ep_timer(ep);
+                (void)stop_ep_timer(ep);
                 if (ep->com.cm_id && ep->com.qp) {
                         attrs.next_state = C4IW_QP_STATE_IDLE;
                         c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
@@ -2325,10 +2330,10 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
         case CONNECTING:
                 break;
         case MPA_REQ_WAIT:
-                stop_ep_timer(ep);
+                (void)stop_ep_timer(ep);
                 break;
         case MPA_REQ_SENT:
-                stop_ep_timer(ep);
+                (void)stop_ep_timer(ep);
                 if (mpa_rev == 1 || (mpa_rev == 2 && ep->tried_with_mpa_v1))
                         connect_reply_upcall(ep, -ECONNRESET);
                 else {
@@ -2433,7 +2438,7 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
                 __state_set(&ep->com, MORIBUND);
                 break;
         case MORIBUND:
-                stop_ep_timer(ep);
+                (void)stop_ep_timer(ep);
                 if ((ep->com.cm_id) && (ep->com.qp)) {
                         attrs.next_state = C4IW_QP_STATE_IDLE;
                         c4iw_modify_qp(ep->com.qp->rhp,
@@ -3028,7 +3033,7 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
         if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) {
                 close = 1;
                 if (abrupt) {
-                        stop_ep_timer(ep);
+                        (void)stop_ep_timer(ep);
                         ep->com.state = ABORTING;
                 } else
                         ep->com.state = MORIBUND;
@@ -3462,6 +3467,16 @@ static void process_timeout(struct c4iw_ep *ep)
                 __state_set(&ep->com, ABORTING);
                 close_complete_upcall(ep, -ETIMEDOUT);
                 break;
+        case ABORTING:
+        case DEAD:
+
+                /*
+                 * These states are expected if the ep timed out at the same
+                 * time as another thread was calling stop_ep_timer().
+                 * So we silently do nothing for these states.
+                 */
+                abort = 0;
+                break;
         default:
                 WARN(1, "%s unexpected state ep %p tid %u state %u\n",
                         __func__, ep, ep->hwtid, ep->com.state);
@@ -3483,6 +3498,8 @@ static void process_timedout_eps(void)
 
                 tmp = timeout_list.next;
                 list_del(tmp);
+                tmp->next = NULL;
+                tmp->prev = NULL;
                 spin_unlock_irq(&timeout_lock);
                 ep = list_entry(tmp, struct c4iw_ep, entry);
                 process_timeout(ep);
@@ -3499,6 +3516,7 @@ static void process_work(struct work_struct *work)
         unsigned int opcode;
         int ret;
 
+        process_timedout_eps();
         while ((skb = skb_dequeue(&rxq))) {
                 rpl = cplhdr(skb);
                 dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *)));
@@ -3508,8 +3526,8 @@ static void process_work(struct work_struct *work)
                 ret = work_handlers[opcode](dev, skb);
                 if (!ret)
                         kfree_skb(skb);
+                process_timedout_eps();
         }
-        process_timedout_eps();
 }
 
 static DECLARE_WORK(skb_work, process_work);
@@ -3521,8 +3539,13 @@ static void ep_timeout(unsigned long arg)
 
         spin_lock(&timeout_lock);
         if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) {
-                list_add_tail(&ep->entry, &timeout_list);
-                kickit = 1;
+                /*
+                 * Only insert if it is not already on the list.
+                 */
+                if (!ep->entry.next) {
+                        list_add_tail(&ep->entry, &timeout_list);
+                        kickit = 1;
+                }
         }
         spin_unlock(&timeout_lock);
         if (kickit)