aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author James Bottomley <James.Bottomley@HansenPartnership.com> 2008-02-19 22:48:42 -0500
committer James Bottomley <James.Bottomley@HansenPartnership.com> 2008-02-22 17:57:14 -0500
commit a8e14fec164cc01d8dfb18760ee9bddd91e127c2 (patch)
tree 8e2c2e5933d2f41ecc0e37be448e9407029508c8
parent 69e562c234440fb7410877b5b24f4b29ef8521d1 (diff)
[SCSI] libsas: fix error handling
The libsas error handler has two fairly fatal bugs:

1. sas_scsi_task_done calls scsi_eh_finish_cmd() too early. This happens if the task completes after it has been aborted but before the error handler starts up. Because scsi_eh_finish_cmd() decrements host_failed and adds the task to the done list, the error handler start check (host_failed == host_busy) never passes and the eh never starts.

2. The multiple task completion paths sas_scsi_clear_queue_... all simply delete the task from the error queue. This causes it to disappear into the ether, since a command must be placed on the done queue to be finished off by the error handler. This behaviour causes the HBA to hang on pending commands.

Fix 1. by moving the SAS_TASK_STATE_ABORTED check to an exit clause at the top of the routine and calling ->scsi_done() unconditionally (it is a no-op if the timer has fired). This keeps the task in the error handling queue until the eh starts.

Fix 2. by making sure every task goes through task complete followed by scsi_eh_finish_cmd().

Tested this by firing resets across a disk running a hammer test (now it actually survives without hanging the system).

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
-rw-r--r-- drivers/scsi/libsas/sas_scsi_host.c 65
1 files changed, 41 insertions, 24 deletions
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index f869fba86807..9c96d1bd36e2 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -51,10 +51,14 @@ static void sas_scsi_task_done(struct sas_task *task)
51{ 51{
52 struct task_status_struct *ts = &task->task_status; 52 struct task_status_struct *ts = &task->task_status;
53 struct scsi_cmnd *sc = task->uldd_task; 53 struct scsi_cmnd *sc = task->uldd_task;
54 struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(sc->device->host);
55 unsigned ts_flags = task->task_state_flags;
56 int hs = 0, stat = 0; 54 int hs = 0, stat = 0;
57 55
56 if (unlikely(task->task_state_flags & SAS_TASK_STATE_ABORTED)) {
57 /* Aborted tasks will be completed by the error handler */
58 SAS_DPRINTK("task done but aborted\n");
59 return;
60 }
61
58 if (unlikely(!sc)) { 62 if (unlikely(!sc)) {
59 SAS_DPRINTK("task_done called with non existing SCSI cmnd!\n"); 63 SAS_DPRINTK("task_done called with non existing SCSI cmnd!\n");
60 list_del_init(&task->list); 64 list_del_init(&task->list);
@@ -120,11 +124,7 @@ static void sas_scsi_task_done(struct sas_task *task)
120 sc->result = (hs << 16) | stat; 124 sc->result = (hs << 16) | stat;
121 list_del_init(&task->list); 125 list_del_init(&task->list);
122 sas_free_task(task); 126 sas_free_task(task);
123 /* This is very ugly but this is how SCSI Core works. */ 127 sc->scsi_done(sc);
124 if (ts_flags & SAS_TASK_STATE_ABORTED)
125 scsi_eh_finish_cmd(sc, &sas_ha->eh_done_q);
126 else
127 sc->scsi_done(sc);
128} 128}
129 129
130static enum task_attribute sas_scsi_get_task_attr(struct scsi_cmnd *cmd) 130static enum task_attribute sas_scsi_get_task_attr(struct scsi_cmnd *cmd)
@@ -255,13 +255,33 @@ out:
255 return res; 255 return res;
256} 256}
257 257
258static void sas_eh_finish_cmd(struct scsi_cmnd *cmd)
259{
260 struct sas_task *task = TO_SAS_TASK(cmd);
261 struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(cmd->device->host);
262
263 /* remove the aborted task flag to allow the task to be
264 * completed now. At this point, we only get called following
265 * an actual abort of the task, so we should be guaranteed not
266 * to be racing with any completions from the LLD (hence we
267 * don't need the task state lock to clear the flag) */
268 task->task_state_flags &= ~SAS_TASK_STATE_ABORTED;
269 /* Now call task_done. However, task will be free'd after
270 * this */
271 task->task_done(task);
272 /* now finish the command and move it on to the error
273 * handler done list, this also takes it off the
274 * error handler pending list */
275 scsi_eh_finish_cmd(cmd, &sas_ha->eh_done_q);
276}
277
258static void sas_scsi_clear_queue_lu(struct list_head *error_q, struct scsi_cmnd *my_cmd) 278static void sas_scsi_clear_queue_lu(struct list_head *error_q, struct scsi_cmnd *my_cmd)
259{ 279{
260 struct scsi_cmnd *cmd, *n; 280 struct scsi_cmnd *cmd, *n;
261 281
262 list_for_each_entry_safe(cmd, n, error_q, eh_entry) { 282 list_for_each_entry_safe(cmd, n, error_q, eh_entry) {
263 if (cmd == my_cmd) 283 if (cmd == my_cmd)
264 list_del_init(&cmd->eh_entry); 284 sas_eh_finish_cmd(cmd);
265 } 285 }
266} 286}
267 287
@@ -274,7 +294,7 @@ static void sas_scsi_clear_queue_I_T(struct list_head *error_q,
274 struct domain_device *x = cmd_to_domain_dev(cmd); 294 struct domain_device *x = cmd_to_domain_dev(cmd);
275 295
276 if (x == dev) 296 if (x == dev)
277 list_del_init(&cmd->eh_entry); 297 sas_eh_finish_cmd(cmd);
278 } 298 }
279} 299}
280 300
@@ -288,7 +308,7 @@ static void sas_scsi_clear_queue_port(struct list_head *error_q,
288 struct asd_sas_port *x = dev->port; 308 struct asd_sas_port *x = dev->port;
289 309
290 if (x == port) 310 if (x == port)
291 list_del_init(&cmd->eh_entry); 311 sas_eh_finish_cmd(cmd);
292 } 312 }
293} 313}
294 314
@@ -528,14 +548,14 @@ Again:
528 case TASK_IS_DONE: 548 case TASK_IS_DONE:
529 SAS_DPRINTK("%s: task 0x%p is done\n", __FUNCTION__, 549 SAS_DPRINTK("%s: task 0x%p is done\n", __FUNCTION__,
530 task); 550 task);
531 task->task_done(task); 551 sas_eh_finish_cmd(cmd);
532 if (need_reset) 552 if (need_reset)
533 try_to_reset_cmd_device(shost, cmd); 553 try_to_reset_cmd_device(shost, cmd);
534 continue; 554 continue;
535 case TASK_IS_ABORTED: 555 case TASK_IS_ABORTED:
536 SAS_DPRINTK("%s: task 0x%p is aborted\n", 556 SAS_DPRINTK("%s: task 0x%p is aborted\n",
537 __FUNCTION__, task); 557 __FUNCTION__, task);
538 task->task_done(task); 558 sas_eh_finish_cmd(cmd);
539 if (need_reset) 559 if (need_reset)
540 try_to_reset_cmd_device(shost, cmd); 560 try_to_reset_cmd_device(shost, cmd);
541 continue; 561 continue;
@@ -547,7 +567,7 @@ Again:
547 "recovered\n", 567 "recovered\n",
548 SAS_ADDR(task->dev), 568 SAS_ADDR(task->dev),
549 cmd->device->lun); 569 cmd->device->lun);
550 task->task_done(task); 570 sas_eh_finish_cmd(cmd);
551 if (need_reset) 571 if (need_reset)
552 try_to_reset_cmd_device(shost, cmd); 572 try_to_reset_cmd_device(shost, cmd);
553 sas_scsi_clear_queue_lu(work_q, cmd); 573 sas_scsi_clear_queue_lu(work_q, cmd);
@@ -562,7 +582,7 @@ Again:
562 if (tmf_resp == TMF_RESP_FUNC_COMPLETE) { 582 if (tmf_resp == TMF_RESP_FUNC_COMPLETE) {
563 SAS_DPRINTK("I_T %016llx recovered\n", 583 SAS_DPRINTK("I_T %016llx recovered\n",
564 SAS_ADDR(task->dev->sas_addr)); 584 SAS_ADDR(task->dev->sas_addr));
565 task->task_done(task); 585 sas_eh_finish_cmd(cmd);
566 if (need_reset) 586 if (need_reset)
567 try_to_reset_cmd_device(shost, cmd); 587 try_to_reset_cmd_device(shost, cmd);
568 sas_scsi_clear_queue_I_T(work_q, task->dev); 588 sas_scsi_clear_queue_I_T(work_q, task->dev);
@@ -577,7 +597,7 @@ Again:
577 if (res == TMF_RESP_FUNC_COMPLETE) { 597 if (res == TMF_RESP_FUNC_COMPLETE) {
578 SAS_DPRINTK("clear nexus port:%d " 598 SAS_DPRINTK("clear nexus port:%d "
579 "succeeded\n", port->id); 599 "succeeded\n", port->id);
580 task->task_done(task); 600 sas_eh_finish_cmd(cmd);
581 if (need_reset) 601 if (need_reset)
582 try_to_reset_cmd_device(shost, cmd); 602 try_to_reset_cmd_device(shost, cmd);
583 sas_scsi_clear_queue_port(work_q, 603 sas_scsi_clear_queue_port(work_q,
@@ -591,10 +611,10 @@ Again:
591 if (res == TMF_RESP_FUNC_COMPLETE) { 611 if (res == TMF_RESP_FUNC_COMPLETE) {
592 SAS_DPRINTK("clear nexus ha " 612 SAS_DPRINTK("clear nexus ha "
593 "succeeded\n"); 613 "succeeded\n");
594 task->task_done(task); 614 sas_eh_finish_cmd(cmd);
595 if (need_reset) 615 if (need_reset)
596 try_to_reset_cmd_device(shost, cmd); 616 try_to_reset_cmd_device(shost, cmd);
597 goto out; 617 goto clear_q;
598 } 618 }
599 } 619 }
600 /* If we are here -- this means that no amount 620 /* If we are here -- this means that no amount
@@ -606,21 +626,18 @@ Again:
606 SAS_ADDR(task->dev->sas_addr), 626 SAS_ADDR(task->dev->sas_addr),
607 cmd->device->lun); 627 cmd->device->lun);
608 628
609 task->task_done(task); 629 sas_eh_finish_cmd(cmd);
610 if (need_reset) 630 if (need_reset)
611 try_to_reset_cmd_device(shost, cmd); 631 try_to_reset_cmd_device(shost, cmd);
612 goto clear_q; 632 goto clear_q;
613 } 633 }
614 } 634 }
615out:
616 return list_empty(work_q); 635 return list_empty(work_q);
617clear_q: 636clear_q:
618 SAS_DPRINTK("--- Exit %s -- clear_q\n", __FUNCTION__); 637 SAS_DPRINTK("--- Exit %s -- clear_q\n", __FUNCTION__);
619 list_for_each_entry_safe(cmd, n, work_q, eh_entry) { 638 list_for_each_entry_safe(cmd, n, work_q, eh_entry)
620 struct sas_task *task = TO_SAS_TASK(cmd); 639 sas_eh_finish_cmd(cmd);
621 list_del_init(&cmd->eh_entry); 640
622 task->task_done(task);
623 }
624 return list_empty(work_q); 641 return list_empty(work_q);
625} 642}
626 643