scsi: bnx2fc: Fix hung task messages when a cleanup response is not received during abort

If a cleanup task is not responded to while we are in bnx2fc_abts_cleanup, it will hang the SCSI error handler since we use wait_for_completion instead of wait_for_completion_timeout. So, use wait_for_completion_timeout so that we don't hang the SCSI error handler thread forever.

Fixes the call trace:

[183373.131468] INFO: task scsi_eh_16:110146 blocked for more than 120 seconds.
[183373.131469] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[183373.131470] scsi_eh_16 D ffff88103f2fca14 0 110146 2 0x00000080
[183373.131472] ffff880855e77cb0 0000000000000046 ffff881050654e70 ffff880855e77fd8
[183373.131474] ffff880855e77fd8 ffff880855e77fd8 ffff881050654e70 ffff88103f2fcb48
[183373.131475] ffff88103f2fcb50 7fffffffffffffff ffff881050654e70 ffff88103f2fca14
[183373.131477] Call Trace:
[183373.131479] [<ffffffff8168b579>] schedule+0x29/0x70
[183373.131481] [<ffffffff81688fc9>] schedule_timeout+0x239/0x2d0
[183373.131486] [<ffffffff8142821e>] ? __dev_printk+0x3e/0x90
[183373.131487] [<ffffffff814282cd>] ? dev_printk+0x5d/0x80
[183373.131490] [<ffffffff8168b956>] wait_for_completion+0x116/0x170
[183373.131492] [<ffffffff810c4ec0>] ? wake_up_state+0x20/0x20
[183373.131494] [<ffffffffa048c234>] bnx2fc_abts_cleanup+0x3d/0x62 [bnx2fc]
[183373.131497] [<ffffffffa0483a80>] bnx2fc_eh_abort+0x470/0x580 [bnx2fc]
[183373.131500] [<ffffffff814570af>] scsi_error_handler+0x59f/0x8b0
[183373.131501] [<ffffffff81456b10>] ? scsi_eh_get_sense+0x250/0x250
[183373.131503] [<ffffffff810b052f>] kthread+0xcf/0xe0
[183373.131505] [<ffffffff810b0460>] ? kthread_create_on_node+0x140/0x140
[183373.131507] [<ffffffff81696418>] ret_from_fork+0x58/0x90
[183373.131509] [<ffffffff810b0460>] ? kthread_create_on_node+0x140/0x140

Signed-off-by: Chad Dupuis <chad.dupuis@cavium.com>
Reviewed-by: Laurence Oberman <loberman@redhat.com>
Tested-by: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
author: Chad Dupuis <chad.dupuis@cavium.com> 2017-11-15 10:06:06 -0500
committer: Martin K. Petersen <martin.petersen@oracle.com> 2017-11-15 18:44:56 -0500
commit: faae19be80be7c39c9ce8b04bcc9cc10da82c29e (patch)
tree: d7f44d02686ae3fcd740dc6d5a618efd56f4bc78
parent: 6363b3f3ac5be096d08c8c504128befa0c033529 (diff)
1 files changed, 32 insertions, 8 deletions
diff --git a/drivers/scsi/bnx2fc/bnx2fc_io.c b/drivers/scsi/bnx2fc/bnx2fc_io.c
index 5b6153f23f01..8e2f767147cb 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_io.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_io.c
@@ -1084,24 +1084,35 @@ static int bnx2fc_abts_cleanup(struct bnx2fc_cmd *io_req)
 {
        struct bnx2fc_rport *tgt = io_req->tgt;
        int rc = SUCCESS;
+        unsigned int time_left;
        io_req->wait_for_comp = 1;
        bnx2fc_initiate_cleanup(io_req);
        spin_unlock_bh(&tgt->tgt_lock);
-        wait_for_completion(&io_req->tm_done);
+        /*
+         * Can't wait forever on cleanup response lest we let the SCSI error
+         * handler wait forever
+         */
+        time_left = wait_for_completion_timeout(&io_req->tm_done,
+                                                BNX2FC_FW_TIMEOUT);
        io_req->wait_for_comp = 0;
+        if (!time_left)
+                BNX2FC_IO_DBG(io_req, "%s(): Wait for cleanup timed out.\n",
+                              __func__);
        /*
-         * release the reference taken in eh_abort to allow the
+         * Release reference held by SCSI command the cleanup completion
-         * target to re-login after flushing IOs
+         * hits the BNX2FC_CLEANUP case in bnx2fc_process_cq_compl() and
+         * thus the SCSI command is not returned by bnx2fc_scsi_done().
         */
        kref_put(&io_req->refcount, bnx2fc_cmd_release);
        spin_lock_bh(&tgt->tgt_lock);
        return rc;
 }
 /**
 * bnx2fc_eh_abort - eh_abort_handler api to abort an outstanding
 *                      SCSI command
@@ -1118,6 +1129,7 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
        struct fc_lport *lport;
        struct bnx2fc_rport *tgt;
        int rc;
+        unsigned int time_left;
        rc = fc_block_scsi_eh(sc_cmd);
        if (rc)
@@ -1194,6 +1206,11 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
                if (cancel_delayed_work(&io_req->timeout_work))
                        kref_put(&io_req->refcount,
                                 bnx2fc_cmd_release); /* drop timer hold */
+                /*
+                 * We don't want to hold off the upper layer timer so simply
+                 * cleanup the command and return that I/O was successfully
+                 * aborted.
+                 */
                rc = bnx2fc_abts_cleanup(io_req);
                /* This only occurs when an task abort was requested while ABTS
                   is in progress.  Setting the IO_CLEANUP flag will skip the
@@ -1201,7 +1218,7 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
                   was a result from the ABTS request rather than the CLEANUP
                   request */
                set_bit(BNX2FC_FLAG_IO_CLEANUP, &io_req->req_flags);
-                goto out;
+                goto done;
        }
        /* Cancel the current timer running on this io_req */
@@ -1221,7 +1238,11 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
        }
        spin_unlock_bh(&tgt->tgt_lock);
-        wait_for_completion(&io_req->tm_done);
+        /* Wait 2 * RA_TOV + 1 to be sure timeout function hasn't fired */
+        time_left = wait_for_completion_timeout(&io_req->tm_done,
+            (2 * rp->r_a_tov + 1) * HZ);
+        if (time_left)
+                BNX2FC_IO_DBG(io_req, "Timed out in eh_abort waiting for tm_done");
        spin_lock_bh(&tgt->tgt_lock);
        io_req->wait_for_comp = 0;
@@ -1233,8 +1254,12 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
                /* Let the scsi-ml try to recover this command */
                printk(KERN_ERR PFX "abort failed, xid = 0x%x\n",
                       io_req->xid);
+                /*
+                 * Cleanup firmware residuals before returning control back
+                 * to SCSI ML.
+                 */
                rc = bnx2fc_abts_cleanup(io_req);
-                goto out;
+                goto done;
        } else {
                /*
                 * We come here even when there was a race condition
@@ -1249,7 +1274,6 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
 done:
        /* release the reference taken in eh_abort */
        kref_put(&io_req->refcount, bnx2fc_cmd_release);
-out:
        spin_unlock_bh(&tgt->tgt_lock);
        return rc;
 }
author	Chad Dupuis <chad.dupuis@cavium.com>	2017-11-15 10:06:06 -0500
committer	Martin K. Petersen <martin.petersen@oracle.com>	2017-11-15 18:44:56 -0500
commit	faae19be80be7c39c9ce8b04bcc9cc10da82c29e (patch)
tree	d7f44d02686ae3fcd740dc6d5a618efd56f4bc78
parent	6363b3f3ac5be096d08c8c504128befa0c033529 (diff)

diff --git a/drivers/scsi/bnx2fc/bnx2fc_io.c b/drivers/scsi/bnx2fc/bnx2fc_io.c
index 5b6153f23f01..8e2f767147cb 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_io.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_io.c
@@ -1084,24 +1084,35 @@ static int bnx2fc_abts_cleanup(struct bnx2fc_cmd *io_req)
1084	{	1084	{
1085	struct bnx2fc_rport *tgt = io_req->tgt;	1085	struct bnx2fc_rport *tgt = io_req->tgt;
1086	int rc = SUCCESS;	1086	int rc = SUCCESS;
		1087	unsigned int time_left;
1087		1088
1088	io_req->wait_for_comp = 1;	1089	io_req->wait_for_comp = 1;
1089	bnx2fc_initiate_cleanup(io_req);	1090	bnx2fc_initiate_cleanup(io_req);
1090		1091
1091	spin_unlock_bh(&tgt->tgt_lock);	1092	spin_unlock_bh(&tgt->tgt_lock);
1092		1093
1093	wait_for_completion(&io_req->tm_done);	1094	/*
1094		1095	* Can't wait forever on cleanup response lest we let the SCSI error
		1096	* handler wait forever
		1097	*/
		1098	time_left = wait_for_completion_timeout(&io_req->tm_done,
		1099	BNX2FC_FW_TIMEOUT);
1095	io_req->wait_for_comp = 0;	1100	io_req->wait_for_comp = 0;
		1101	if (!time_left)
		1102	BNX2FC_IO_DBG(io_req, "%s(): Wait for cleanup timed out.\n",
		1103	__func__);
		1104
1096	/*	1105	/*
1097	* release the reference taken in eh_abort to allow the	1106	* Release reference held by SCSI command the cleanup completion
1098	* target to re-login after flushing IOs	1107	* hits the BNX2FC_CLEANUP case in bnx2fc_process_cq_compl() and
		1108	* thus the SCSI command is not returned by bnx2fc_scsi_done().
1099	*/	1109	*/
1100	kref_put(&io_req->refcount, bnx2fc_cmd_release);	1110	kref_put(&io_req->refcount, bnx2fc_cmd_release);
1101		1111
1102	spin_lock_bh(&tgt->tgt_lock);	1112	spin_lock_bh(&tgt->tgt_lock);
1103	return rc;	1113	return rc;
1104	}	1114	}
		1115
1105	/**	1116	/**
1106	* bnx2fc_eh_abort - eh_abort_handler api to abort an outstanding	1117	* bnx2fc_eh_abort - eh_abort_handler api to abort an outstanding
1107	* SCSI command	1118	* SCSI command
@@ -1118,6 +1129,7 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
1118	struct fc_lport *lport;	1129	struct fc_lport *lport;
1119	struct bnx2fc_rport *tgt;	1130	struct bnx2fc_rport *tgt;
1120	int rc;	1131	int rc;
		1132	unsigned int time_left;
1121		1133
1122	rc = fc_block_scsi_eh(sc_cmd);	1134	rc = fc_block_scsi_eh(sc_cmd);
1123	if (rc)	1135	if (rc)
@@ -1194,6 +1206,11 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
1194	if (cancel_delayed_work(&io_req->timeout_work))	1206	if (cancel_delayed_work(&io_req->timeout_work))
1195	kref_put(&io_req->refcount,	1207	kref_put(&io_req->refcount,
1196	bnx2fc_cmd_release); /* drop timer hold */	1208	bnx2fc_cmd_release); /* drop timer hold */
		1209	/*
		1210	* We don't want to hold off the upper layer timer so simply
		1211	* cleanup the command and return that I/O was successfully
		1212	* aborted.
		1213	*/
1197	rc = bnx2fc_abts_cleanup(io_req);	1214	rc = bnx2fc_abts_cleanup(io_req);
1198	/* This only occurs when an task abort was requested while ABTS	1215	/* This only occurs when an task abort was requested while ABTS
1199	is in progress. Setting the IO_CLEANUP flag will skip the	1216	is in progress. Setting the IO_CLEANUP flag will skip the
@@ -1201,7 +1218,7 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
1201	was a result from the ABTS request rather than the CLEANUP	1218	was a result from the ABTS request rather than the CLEANUP
1202	request */	1219	request */
1203	set_bit(BNX2FC_FLAG_IO_CLEANUP, &io_req->req_flags);	1220	set_bit(BNX2FC_FLAG_IO_CLEANUP, &io_req->req_flags);
1204	goto out;	1221	goto done;
1205	}	1222	}
1206		1223
1207	/* Cancel the current timer running on this io_req */	1224	/* Cancel the current timer running on this io_req */
@@ -1221,7 +1238,11 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
1221	}	1238	}
1222	spin_unlock_bh(&tgt->tgt_lock);	1239	spin_unlock_bh(&tgt->tgt_lock);
1223		1240
1224	wait_for_completion(&io_req->tm_done);	1241	/* Wait 2 * RA_TOV + 1 to be sure timeout function hasn't fired */
		1242	time_left = wait_for_completion_timeout(&io_req->tm_done,
		1243	(2 * rp->r_a_tov + 1) * HZ);
		1244	if (time_left)
		1245	BNX2FC_IO_DBG(io_req, "Timed out in eh_abort waiting for tm_done");
1225		1246
1226	spin_lock_bh(&tgt->tgt_lock);	1247	spin_lock_bh(&tgt->tgt_lock);
1227	io_req->wait_for_comp = 0;	1248	io_req->wait_for_comp = 0;
@@ -1233,8 +1254,12 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
1233	/* Let the scsi-ml try to recover this command */	1254	/* Let the scsi-ml try to recover this command */
1234	printk(KERN_ERR PFX "abort failed, xid = 0x%x\n",	1255	printk(KERN_ERR PFX "abort failed, xid = 0x%x\n",
1235	io_req->xid);	1256	io_req->xid);
		1257	/*
		1258	* Cleanup firmware residuals before returning control back
		1259	* to SCSI ML.
		1260	*/
1236	rc = bnx2fc_abts_cleanup(io_req);	1261	rc = bnx2fc_abts_cleanup(io_req);
1237	goto out;	1262	goto done;
1238	} else {	1263	} else {
1239	/*	1264	/*
1240	* We come here even when there was a race condition	1265	* We come here even when there was a race condition
@@ -1249,7 +1274,6 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
1249	done:	1274	done:
1250	/* release the reference taken in eh_abort */	1275	/* release the reference taken in eh_abort */
1251	kref_put(&io_req->refcount, bnx2fc_cmd_release);	1276	kref_put(&io_req->refcount, bnx2fc_cmd_release);
1252	out:
1253	spin_unlock_bh(&tgt->tgt_lock);	1277	spin_unlock_bh(&tgt->tgt_lock);
1254	return rc;	1278	return rc;
1255	}	1279	}