author     David Jeffery <dhjeffery@gmail.com>  2011-05-19 14:41:12 -0400
committer  James Bottomley <jbottomley@parallels.com>  2011-05-24 12:51:53 -0400
commit     3eef6257de48ff84a5d98ca533685df8a3beaeb8 (patch)
tree       087372685b6d3069faf1a174c4b70778c3a51504
parent     0bcaa11154f07502e68375617e5650173eea8e50 (diff)
[SCSI] Reduce error recovery time by reducing use of TURs
In error recovery, most SCSI error recovery stages will send a TUR command for every bad command when a driver's error handler reports success. When there are several bad commands to the same device, this results in the device being probed multiple times. This becomes very problematic if the device or connection is in a state where the device still doesn't respond to commands even after a recovery function returns success: the error handler must wait for each of these test commands to time out, and the time spent waiting on the redundant commands can drastically lengthen error recovery.

This patch alters the SCSI mid-layer's error routines to send test commands once per device instead of once per bad command. This can drastically lower error recovery time.

[jejb: fixed up whitespace and formatting]
Signed-off-by: David Jeffery <djeffery@redhat.com>
Signed-off-by: James Bottomley <jbottomley@parallels.com>
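The idea is to probe each device once per recovery pass and apply the single result to every failed command queued against that device, rather than issuing one TUR per command. As a rough illustration of that grouping (a standalone userspace sketch, not the scsi_error.c code; the fake_device/fake_cmd structs and probe_device() helper are invented for the example), the pattern looks like this:

	/*
	 * Simplified userspace model of "test once per device".
	 * Names here are illustrative only, not the kernel API.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	struct fake_device {
		int id;
		bool responding;	/* what a TUR would report */
		bool tested;		/* already probed this pass? */
	};

	struct fake_cmd {
		struct fake_device *dev;
	};

	/* Stand-in for sending a single TEST UNIT READY to a device. */
	static bool probe_device(struct fake_device *dev)
	{
		printf("probing device %d\n", dev->id);
		return dev->responding;
	}

	int main(void)
	{
		struct fake_device d0 = { .id = 0, .responding = true };
		struct fake_device d1 = { .id = 1, .responding = false };
		/* Five failed commands, but only two distinct devices. */
		struct fake_cmd cmds[] = {
			{ &d0 }, { &d0 }, { &d1 }, { &d1 }, { &d1 },
		};
		bool dev_ok[2];
		unsigned int i;

		for (i = 0; i < sizeof(cmds) / sizeof(cmds[0]); i++) {
			struct fake_device *dev = cmds[i].dev;

			/* Probe each device at most once, not once per command. */
			if (!dev->tested) {
				dev_ok[dev->id] = probe_device(dev);
				dev->tested = true;
			}
			printf("cmd %u: device %d %s\n", i, dev->id,
			       dev_ok[dev->id] ? "recovered" : "still failing");
		}
		return 0;
	}

In the patch itself this grouping is done by the new scsi_eh_test_devices() helper below: callers collect commands whose recovery step returned SUCCESS onto a check_list, and the helper sends at most one TUR (and optionally a start-unit) per device, then finishes or requeues all of that device's commands based on the single result.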
-rw-r--r--  drivers/scsi/scsi_error.c | 87
1 file changed, 67 insertions(+), 20 deletions(-)
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index abea2cf05c2e..a4b9cdbaaa0b 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -50,6 +50,8 @@
 #define BUS_RESET_SETTLE_TIME (10)
 #define HOST_RESET_SETTLE_TIME (10)
 
+static int scsi_eh_try_stu(struct scsi_cmnd *scmd);
+
 /* called with shost->host_lock held */
 void scsi_eh_wakeup(struct Scsi_Host *shost)
 {
@@ -947,6 +949,48 @@ retry_tur:
 }
 
 /**
+ * scsi_eh_test_devices - check if devices are responding from error recovery.
+ * @cmd_list: scsi commands in error recovery.
+ * @work_q: queue for commands which still need more error recovery
+ * @done_q: queue for commands which are finished
+ * @try_stu: boolean on if a STU command should be tried in addition to TUR.
+ *
+ * Decription:
+ *    Tests if devices are in a working state.  Commands to devices now in
+ *    a working state are sent to the done_q while commands to devices which
+ *    are still failing to respond are returned to the work_q for more
+ *    processing.
+ **/
+static int scsi_eh_test_devices(struct list_head *cmd_list,
+				struct list_head *work_q,
+				struct list_head *done_q, int try_stu)
+{
+	struct scsi_cmnd *scmd, *next;
+	struct scsi_device *sdev;
+	int finish_cmds;
+
+	while (!list_empty(cmd_list)) {
+		scmd = list_entry(cmd_list->next, struct scsi_cmnd, eh_entry);
+		sdev = scmd->device;
+
+		finish_cmds = !scsi_device_online(scmd->device) ||
+			(try_stu && !scsi_eh_try_stu(scmd) &&
+			 !scsi_eh_tur(scmd)) ||
+			!scsi_eh_tur(scmd);
+
+		list_for_each_entry_safe(scmd, next, cmd_list, eh_entry)
+			if (scmd->device == sdev) {
+				if (finish_cmds)
+					scsi_eh_finish_cmd(scmd, done_q);
+				else
+					list_move_tail(&scmd->eh_entry, work_q);
+			}
+	}
+	return list_empty(work_q);
+}
+
+
+/**
  * scsi_eh_abort_cmds - abort pending commands.
  * @work_q: &list_head for pending commands.
  * @done_q: &list_head for processed commands.
@@ -962,6 +1006,7 @@ static int scsi_eh_abort_cmds(struct list_head *work_q,
 			      struct list_head *done_q)
 {
 	struct scsi_cmnd *scmd, *next;
+	LIST_HEAD(check_list);
 	int rtn;
 
 	list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
@@ -973,11 +1018,10 @@ static int scsi_eh_abort_cmds(struct list_head *work_q,
 		rtn = scsi_try_to_abort_cmd(scmd->device->host->hostt, scmd);
 		if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
 			scmd->eh_eflags &= ~SCSI_EH_CANCEL_CMD;
-			if (!scsi_device_online(scmd->device) ||
-			    rtn == FAST_IO_FAIL ||
-			    !scsi_eh_tur(scmd)) {
+			if (rtn == FAST_IO_FAIL)
 				scsi_eh_finish_cmd(scmd, done_q);
-			}
+			else
+				list_move_tail(&scmd->eh_entry, &check_list);
 		} else
 			SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting"
 							  " cmd failed:"
@@ -986,7 +1030,7 @@ static int scsi_eh_abort_cmds(struct list_head *work_q,
 							  scmd));
 	}
 
-	return list_empty(work_q);
+	return scsi_eh_test_devices(&check_list, work_q, done_q, 0);
 }
 
 /**
@@ -1137,6 +1181,7 @@ static int scsi_eh_target_reset(struct Scsi_Host *shost,
 				struct list_head *done_q)
 {
 	LIST_HEAD(tmp_list);
+	LIST_HEAD(check_list);
 
 	list_splice_init(work_q, &tmp_list);
 
@@ -1161,9 +1206,9 @@ static int scsi_eh_target_reset(struct Scsi_Host *shost,
 			if (scmd_id(scmd) != id)
 				continue;
 
-			if ((rtn == SUCCESS || rtn == FAST_IO_FAIL)
-			    && (!scsi_device_online(scmd->device) ||
-				rtn == FAST_IO_FAIL || !scsi_eh_tur(scmd)))
+			if (rtn == SUCCESS)
+				list_move_tail(&scmd->eh_entry, &check_list);
+			else if (rtn == FAST_IO_FAIL)
 				scsi_eh_finish_cmd(scmd, done_q);
 			else
 				/* push back on work queue for further processing */
@@ -1171,7 +1216,7 @@ static int scsi_eh_target_reset(struct Scsi_Host *shost,
 		}
 	}
 
-	return list_empty(work_q);
+	return scsi_eh_test_devices(&check_list, work_q, done_q, 0);
 }
 
 /**
@@ -1185,6 +1230,7 @@ static int scsi_eh_bus_reset(struct Scsi_Host *shost,
 			     struct list_head *done_q)
 {
 	struct scsi_cmnd *scmd, *chan_scmd, *next;
+	LIST_HEAD(check_list);
 	unsigned int channel;
 	int rtn;
 
@@ -1216,12 +1262,14 @@ static int scsi_eh_bus_reset(struct Scsi_Host *shost,
 		rtn = scsi_try_bus_reset(chan_scmd);
 		if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
 			list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
-				if (channel == scmd_channel(scmd))
-					if (!scsi_device_online(scmd->device) ||
-					    rtn == FAST_IO_FAIL ||
-					    !scsi_eh_tur(scmd))
+				if (channel == scmd_channel(scmd)) {
+					if (rtn == FAST_IO_FAIL)
 						scsi_eh_finish_cmd(scmd,
 								   done_q);
+					else
+						list_move_tail(&scmd->eh_entry,
+							       &check_list);
+				}
 			}
 		} else {
 			SCSI_LOG_ERROR_RECOVERY(3, printk("%s: BRST"
@@ -1230,7 +1278,7 @@ static int scsi_eh_bus_reset(struct Scsi_Host *shost,
 						  channel));
 		}
 	}
-	return list_empty(work_q);
+	return scsi_eh_test_devices(&check_list, work_q, done_q, 0);
 }
 
 /**
@@ -1242,6 +1290,7 @@ static int scsi_eh_host_reset(struct list_head *work_q,
 			      struct list_head *done_q)
 {
 	struct scsi_cmnd *scmd, *next;
+	LIST_HEAD(check_list);
 	int rtn;
 
 	if (!list_empty(work_q)) {
@@ -1252,12 +1301,10 @@ static int scsi_eh_host_reset(struct list_head *work_q,
 						  , current->comm));
 
 		rtn = scsi_try_host_reset(scmd);
-		if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
+		if (rtn == SUCCESS) {
+			list_splice_init(work_q, &check_list);
+		} else if (rtn == FAST_IO_FAIL) {
 			list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
-				if (!scsi_device_online(scmd->device) ||
-				    rtn == FAST_IO_FAIL ||
-				    (!scsi_eh_try_stu(scmd) && !scsi_eh_tur(scmd)) ||
-				    !scsi_eh_tur(scmd))
 					scsi_eh_finish_cmd(scmd, done_q);
 			}
 		} else {
@@ -1266,7 +1313,7 @@ static int scsi_eh_host_reset(struct list_head *work_q,
 					  current->comm));
 		}
 	}
-	return list_empty(work_q);
+	return scsi_eh_test_devices(&check_list, work_q, done_q, 1);
 }
 
 /**