aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMike Christie <michaelc@cs.wisc.edu>2008-08-19 19:45:25 -0400
committerJames Bottomley <James.Bottomley@HansenPartnership.com>2008-10-13 09:28:48 -0400
commita4dfaa6f2e55b736adf2719133996f7e7dc309bc (patch)
tree770132bb154b0e4c395b45485580563c0b660286
parent9cc328f502eacfcc52ab1c1bf9a7729cf12f14be (diff)
[SCSI] scsi: add transport host byte errors (v3)
Currently, if there is a transport problem the iscsi drivers will return outstanding commands (commands being exeucted by the driver/fw/hw) with DID_BUS_BUSY and block the session so no new commands can be queued. Commands that are caught between the failure handling and blocking are failed with DID_IMM_RETRY or one of the scsi ml queuecommand return values. When the recovery_timeout fires, the iscsi drivers then fail IO with DID_NO_CONNECT. For fcp, some drivers will fail some outstanding IO (disk but possibly not tape) with DID_BUS_BUSY or DID_ERROR or some other value that causes a retry and hits the scsi_error.c failfast check, block the rport, and commands caught in the race are failed with DID_IMM_RETRY. Other drivers, may hold onto all IO and wait for the terminate_rport_io or dev_loss_tmo_callbk to be called. The following patches attempt to unify what upper layers will see drivers like multipath can make a good guess. This relies on drivers being hooked into their transport class. This first patch just defines two new host byte errors so drivers can return the same value for when a rport/session is blocked and for when the fast_io_fail_tmo fires. The idea is that if the LLD/class detects a problem and is going to block a rport/session, then if the LLD wants or must return the command to scsi-ml, then it can return it with DID_TRANSPORT_DISRUPTED. This will requeue the IO into the same scsi queue it came from, until the fast io fail timer fires and the class decides what to do. When using multipath and the fast_io_fail_tmo fires then the class can fail commands with DID_TRANSPORT_FAILFAST or drivers can use DID_TRANSPORT_FAILFAST in their terminate_rport_io callbacks or the equivlent in iscsi if we ever implement more advanced recovery methods. A LLD, like lpfc, could continue to return DID_ERROR and then it will hit the normal failfast path, so drivers do not have fully be ported to work better. The point of the patches is that upper layers will not see a failure that could be recovered from while the rport/session is blocked until fast_io_fail_tmo/recovery_timeout fires. V3 Remove some comments. V2 Fixed patch/diff errors and renamed DID_TRANSPORT_BLOCKED to DID_TRANSPORT_DISRUPTED. V1 initial patch. Signed-off-by: Mike Christie <michaelc@cs.wisc.edu> Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
-rw-r--r--drivers/scsi/constants.c3
-rw-r--r--drivers/scsi/scsi_error.c15
-rw-r--r--include/scsi/scsi.h5
3 files changed, 21 insertions, 2 deletions
diff --git a/drivers/scsi/constants.c b/drivers/scsi/constants.c
index 9785d7384199..4003deefb7d8 100644
--- a/drivers/scsi/constants.c
+++ b/drivers/scsi/constants.c
@@ -1364,7 +1364,8 @@ EXPORT_SYMBOL(scsi_print_sense);
1364static const char * const hostbyte_table[]={ 1364static const char * const hostbyte_table[]={
1365"DID_OK", "DID_NO_CONNECT", "DID_BUS_BUSY", "DID_TIME_OUT", "DID_BAD_TARGET", 1365"DID_OK", "DID_NO_CONNECT", "DID_BUS_BUSY", "DID_TIME_OUT", "DID_BAD_TARGET",
1366"DID_ABORT", "DID_PARITY", "DID_ERROR", "DID_RESET", "DID_BAD_INTR", 1366"DID_ABORT", "DID_PARITY", "DID_ERROR", "DID_RESET", "DID_BAD_INTR",
1367"DID_PASSTHROUGH", "DID_SOFT_ERROR", "DID_IMM_RETRY", "DID_REQUEUE"}; 1367"DID_PASSTHROUGH", "DID_SOFT_ERROR", "DID_IMM_RETRY", "DID_REQUEUE",
1368"DID_TRANSPORT_DISRUPTED", "DID_TRANSPORT_FAILFAST" };
1368#define NUM_HOSTBYTE_STRS ARRAY_SIZE(hostbyte_table) 1369#define NUM_HOSTBYTE_STRS ARRAY_SIZE(hostbyte_table)
1369 1370
1370static const char * const driverbyte_table[]={ 1371static const char * const driverbyte_table[]={
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index fecefa05cb62..5bf8be21a165 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1290,7 +1290,20 @@ int scsi_decide_disposition(struct scsi_cmnd *scmd)
1290 1290
1291 case DID_REQUEUE: 1291 case DID_REQUEUE:
1292 return ADD_TO_MLQUEUE; 1292 return ADD_TO_MLQUEUE;
1293 1293 case DID_TRANSPORT_DISRUPTED:
1294 /*
1295 * LLD/transport was disrupted during processing of the IO.
1296 * The transport class is now blocked/blocking,
1297 * and the transport will decide what to do with the IO
1298 * based on its timers and recovery capablilities.
1299 */
1300 return ADD_TO_MLQUEUE;
1301 case DID_TRANSPORT_FAILFAST:
1302 /*
1303 * The transport decided to failfast the IO (most likely
1304 * the fast io fail tmo fired), so send IO directly upwards.
1305 */
1306 return SUCCESS;
1294 case DID_ERROR: 1307 case DID_ERROR:
1295 if (msg_byte(scmd->result) == COMMAND_COMPLETE && 1308 if (msg_byte(scmd->result) == COMMAND_COMPLETE &&
1296 status_byte(scmd->result) == RESERVATION_CONFLICT) 1309 status_byte(scmd->result) == RESERVATION_CONFLICT)
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 3a5662b2817e..a109165714d6 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -381,6 +381,11 @@ static inline int scsi_is_wlun(unsigned int lun)
381#define DID_IMM_RETRY 0x0c /* Retry without decrementing retry count */ 381#define DID_IMM_RETRY 0x0c /* Retry without decrementing retry count */
382#define DID_REQUEUE 0x0d /* Requeue command (no immediate retry) also 382#define DID_REQUEUE 0x0d /* Requeue command (no immediate retry) also
383 * without decrementing the retry count */ 383 * without decrementing the retry count */
384#define DID_TRANSPORT_DISRUPTED 0x0e /* Transport error disrupted execution
385 * and the driver blocked the port to
386 * recover the link. Transport class will
387 * retry or fail IO */
388#define DID_TRANSPORT_FAILFAST 0x0f /* Transport class fastfailed the io */
384#define DRIVER_OK 0x00 /* Driver status */ 389#define DRIVER_OK 0x00 /* Driver status */
385 390
386/* 391/*