aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Chad Dupuis <chad.dupuis@cavium.com> 2017-11-15 10:06:06 -0500
committer Martin K. Petersen <martin.petersen@oracle.com> 2017-11-15 18:44:56 -0500
commit faae19be80be7c39c9ce8b04bcc9cc10da82c29e (patch)
tree d7f44d02686ae3fcd740dc6d5a618efd56f4bc78
parent 6363b3f3ac5be096d08c8c504128befa0c033529 (diff)
scsi: bnx2fc: Fix hung task messages when a cleanup response is not received during abort
If a cleanup task is not responded to while we are in bnx2fc_abts_cleanup,
it will hang the SCSI error handler since we use wait_for_completion instead
of wait_for_completion_timeout. So, use wait_for_completion_timeout so that
we don't hang the SCSI error handler thread forever.

Fixes the call trace:

[183373.131468] INFO: task scsi_eh_16:110146 blocked for more than 120 seconds.
[183373.131469] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[183373.131470] scsi_eh_16 D ffff88103f2fca14 0 110146 2 0x00000080
[183373.131472] ffff880855e77cb0 0000000000000046 ffff881050654e70 ffff880855e77fd8
[183373.131474] ffff880855e77fd8 ffff880855e77fd8 ffff881050654e70 ffff88103f2fcb48
[183373.131475] ffff88103f2fcb50 7fffffffffffffff ffff881050654e70 ffff88103f2fca14
[183373.131477] Call Trace:
[183373.131479] [<ffffffff8168b579>] schedule+0x29/0x70
[183373.131481] [<ffffffff81688fc9>] schedule_timeout+0x239/0x2d0
[183373.131486] [<ffffffff8142821e>] ? __dev_printk+0x3e/0x90
[183373.131487] [<ffffffff814282cd>] ? dev_printk+0x5d/0x80
[183373.131490] [<ffffffff8168b956>] wait_for_completion+0x116/0x170
[183373.131492] [<ffffffff810c4ec0>] ? wake_up_state+0x20/0x20
[183373.131494] [<ffffffffa048c234>] bnx2fc_abts_cleanup+0x3d/0x62 [bnx2fc]
[183373.131497] [<ffffffffa0483a80>] bnx2fc_eh_abort+0x470/0x580 [bnx2fc]
[183373.131500] [<ffffffff814570af>] scsi_error_handler+0x59f/0x8b0
[183373.131501] [<ffffffff81456b10>] ? scsi_eh_get_sense+0x250/0x250
[183373.131503] [<ffffffff810b052f>] kthread+0xcf/0xe0
[183373.131505] [<ffffffff810b0460>] ? kthread_create_on_node+0x140/0x140
[183373.131507] [<ffffffff81696418>] ret_from_fork+0x58/0x90
[183373.131509] [<ffffffff810b0460>] ? kthread_create_on_node+0x140/0x140

Signed-off-by: Chad Dupuis <chad.dupuis@cavium.com>
Reviewed-by: Laurence Oberman <loberman@redhat.com>
Tested-by: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
-rw-r--r-- drivers/scsi/bnx2fc/bnx2fc_io.c 40
1 files changed, 32 insertions, 8 deletions
diff --git a/drivers/scsi/bnx2fc/bnx2fc_io.c b/drivers/scsi/bnx2fc/bnx2fc_io.c
index 5b6153f23f01..8e2f767147cb 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_io.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_io.c
@@ -1084,24 +1084,35 @@ static int bnx2fc_abts_cleanup(struct bnx2fc_cmd *io_req)
1084{ 1084{
1085 struct bnx2fc_rport *tgt = io_req->tgt; 1085 struct bnx2fc_rport *tgt = io_req->tgt;
1086 int rc = SUCCESS; 1086 int rc = SUCCESS;
1087 unsigned int time_left;
1087 1088
1088 io_req->wait_for_comp = 1; 1089 io_req->wait_for_comp = 1;
1089 bnx2fc_initiate_cleanup(io_req); 1090 bnx2fc_initiate_cleanup(io_req);
1090 1091
1091 spin_unlock_bh(&tgt->tgt_lock); 1092 spin_unlock_bh(&tgt->tgt_lock);
1092 1093
1093 wait_for_completion(&io_req->tm_done); 1094 /*
1094 1095 * Can't wait forever on cleanup response lest we let the SCSI error
1096 * handler wait forever
1097 */
1098 time_left = wait_for_completion_timeout(&io_req->tm_done,
1099 BNX2FC_FW_TIMEOUT);
1095 io_req->wait_for_comp = 0; 1100 io_req->wait_for_comp = 0;
1101 if (!time_left)
1102 BNX2FC_IO_DBG(io_req, "%s(): Wait for cleanup timed out.\n",
1103 __func__);
1104
1096 /* 1105 /*
1097 * release the reference taken in eh_abort to allow the 1106 * Release the reference held by the SCSI command: the cleanup completion
1098 * target to re-login after flushing IOs 1107 * hits the BNX2FC_CLEANUP case in bnx2fc_process_cq_compl() and
1108 * thus the SCSI command is not returned by bnx2fc_scsi_done().
1099 */ 1109 */
1100 kref_put(&io_req->refcount, bnx2fc_cmd_release); 1110 kref_put(&io_req->refcount, bnx2fc_cmd_release);
1101 1111
1102 spin_lock_bh(&tgt->tgt_lock); 1112 spin_lock_bh(&tgt->tgt_lock);
1103 return rc; 1113 return rc;
1104} 1114}
1115
1105/** 1116/**
1106 * bnx2fc_eh_abort - eh_abort_handler api to abort an outstanding 1117 * bnx2fc_eh_abort - eh_abort_handler api to abort an outstanding
1107 * SCSI command 1118 * SCSI command
@@ -1118,6 +1129,7 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
1118 struct fc_lport *lport; 1129 struct fc_lport *lport;
1119 struct bnx2fc_rport *tgt; 1130 struct bnx2fc_rport *tgt;
1120 int rc; 1131 int rc;
1132 unsigned int time_left;
1121 1133
1122 rc = fc_block_scsi_eh(sc_cmd); 1134 rc = fc_block_scsi_eh(sc_cmd);
1123 if (rc) 1135 if (rc)
@@ -1194,6 +1206,11 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
1194 if (cancel_delayed_work(&io_req->timeout_work)) 1206 if (cancel_delayed_work(&io_req->timeout_work))
1195 kref_put(&io_req->refcount, 1207 kref_put(&io_req->refcount,
1196 bnx2fc_cmd_release); /* drop timer hold */ 1208 bnx2fc_cmd_release); /* drop timer hold */
1209 /*
1210 * We don't want to hold off the upper layer timer so simply
1211 * cleanup the command and return that I/O was successfully
1212 * aborted.
1213 */
1197 rc = bnx2fc_abts_cleanup(io_req); 1214 rc = bnx2fc_abts_cleanup(io_req);
1198 /* This only occurs when an task abort was requested while ABTS 1215 /* This only occurs when an task abort was requested while ABTS
1199 is in progress. Setting the IO_CLEANUP flag will skip the 1216 is in progress. Setting the IO_CLEANUP flag will skip the
@@ -1201,7 +1218,7 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
1201 was a result from the ABTS request rather than the CLEANUP 1218 was a result from the ABTS request rather than the CLEANUP
1202 request */ 1219 request */
1203 set_bit(BNX2FC_FLAG_IO_CLEANUP, &io_req->req_flags); 1220 set_bit(BNX2FC_FLAG_IO_CLEANUP, &io_req->req_flags);
1204 goto out; 1221 goto done;
1205 } 1222 }
1206 1223
1207 /* Cancel the current timer running on this io_req */ 1224 /* Cancel the current timer running on this io_req */
@@ -1221,7 +1238,11 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
1221 } 1238 }
1222 spin_unlock_bh(&tgt->tgt_lock); 1239 spin_unlock_bh(&tgt->tgt_lock);
1223 1240
1224 wait_for_completion(&io_req->tm_done); 1241 /* Wait 2 * RA_TOV + 1 to be sure timeout function hasn't fired */
1242 time_left = wait_for_completion_timeout(&io_req->tm_done,
1243 (2 * rp->r_a_tov + 1) * HZ);
1244 if (time_left)
1245 BNX2FC_IO_DBG(io_req, "Timed out in eh_abort waiting for tm_done");
1225 1246
1226 spin_lock_bh(&tgt->tgt_lock); 1247 spin_lock_bh(&tgt->tgt_lock);
1227 io_req->wait_for_comp = 0; 1248 io_req->wait_for_comp = 0;
@@ -1233,8 +1254,12 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
1233 /* Let the scsi-ml try to recover this command */ 1254 /* Let the scsi-ml try to recover this command */
1234 printk(KERN_ERR PFX "abort failed, xid = 0x%x\n", 1255 printk(KERN_ERR PFX "abort failed, xid = 0x%x\n",
1235 io_req->xid); 1256 io_req->xid);
1257 /*
1258 * Cleanup firmware residuals before returning control back
1259 * to SCSI ML.
1260 */
1236 rc = bnx2fc_abts_cleanup(io_req); 1261 rc = bnx2fc_abts_cleanup(io_req);
1237 goto out; 1262 goto done;
1238 } else { 1263 } else {
1239 /* 1264 /*
1240 * We come here even when there was a race condition 1265 * We come here even when there was a race condition
@@ -1249,7 +1274,6 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
1249done: 1274done:
1250 /* release the reference taken in eh_abort */ 1275 /* release the reference taken in eh_abort */
1251 kref_put(&io_req->refcount, bnx2fc_cmd_release); 1276 kref_put(&io_req->refcount, bnx2fc_cmd_release);
1252out:
1253 spin_unlock_bh(&tgt->tgt_lock); 1277 spin_unlock_bh(&tgt->tgt_lock);
1254 return rc; 1278 return rc;
1255} 1279}